updated documentation

This commit is contained in:
Uldis Locans
2017-08-10 14:57:48 +02:00
parent 7ca93a3a49
commit ccc4329bef
38 changed files with 939 additions and 673 deletions

View File

@ -15,6 +15,9 @@
class DKSBaseMuSR; class DKSBaseMuSR;
/**
* Interface to implement ChiSquareRuntime class for musrfit.
*/
class ChiSquareRuntime { class ChiSquareRuntime {
friend class DKSBaseMuSR; friend class DKSBaseMuSR;
@ -63,23 +66,54 @@ public:
/** Default constructor */ /** Default constructor */
//ChiSquareRuntime(); //ChiSquareRuntime();
/** Default destructor */ /** Default destructor. */
virtual ~ChiSquareRuntime() { }; virtual ~ChiSquareRuntime() { };
/**
* Compile GPU programm generated at runtime.
*/
virtual int compileProgram(std::string function, bool mlh = false) = 0; virtual int compileProgram(std::string function, bool mlh = false) = 0;
/**
* Launche the compiled chiSquare kernel.
*/
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap, int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double timeStart, double timeStep,
double &result) = 0; double &result) = 0;
/**
* Write the parameter values to the GPU.
*/
virtual int writeParams(const double *params, int numparams) = 0; virtual int writeParams(const double *params, int numparams) = 0;
/**
* Write the function values to the GPU.
*/
virtual int writeFunc(const double *func, int numfunc) = 0; virtual int writeFunc(const double *func, int numfunc) = 0;
/**
* Write map values to the GPU.
*/
virtual int writeMap(const int *map, int nummap) = 0; virtual int writeMap(const int *map, int nummap) = 0;
/**
* Allocate temporary memory needed for the chi square calucaltios on the device.
*/
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0; virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
/**
* Free device memory allocated for chi square calculations.
*/
virtual int freeChiSquare() = 0; virtual int freeChiSquare() = 0;
/**
* Check if available device can run the chi square GPU code.
*/
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0; virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
/** Set N0, tau and bgk values to use for the kernel. /**
* Set N0, tau and bgk values to use for the kernel.
* If values changes between data sets this needs to be called before * If values changes between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS. * every kernel call. Returns DKS_SUCCESS.
*/ */
@ -91,7 +125,8 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Set alpha and beta values to use for the kernel. /**
* Set alpha and beta values to use for the kernel.
* If values changes between data sets this needs to be called before * If values changes between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS. * every kernel call. Returns DKS_SUCCESS.
*/ */
@ -101,8 +136,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Set number of blocks and threads. /**
* Used to set parameters obtained from auto-tuning * Set number of blocks and threads.
* Used to set parameters obtained from auto-tuning
*/ */
int setKernelParams(int numBlocks, int blockSize) { int setKernelParams(int numBlocks, int blockSize) {
int ierr = DKS_ERROR; int ierr = DKS_ERROR;
@ -118,8 +154,9 @@ public:
return ierr; return ierr;
} }
/** Get the number of operations in compiled kernel. /**
* Count the number of operation in the ptx file for the compiled program. * Get the number of operations in compiled kernel.
* Count the number of operation in the ptx file for the compiled program.
*/ */
int getOperations(int &oper) { int getOperations(int &oper) {

View File

@ -5,6 +5,9 @@
#include <string> #include <string>
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/**
* Interface to impelment particle matter interaction for OPAL.
*/
class DKSCollimatorPhysics { class DKSCollimatorPhysics {
protected: protected:
@ -15,29 +18,61 @@ protected:
public: public:
virtual ~DKSCollimatorPhysics() { } virtual ~DKSCollimatorPhysics() { }
/**
* Execute collimator physics kernel.
*
*/
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
bool enableRutherforScattering = true) = 0; bool enableRutherforScattering = true) = 0;
/**
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles) = 0; void *par_ptr, int numparticles) = 0;
/**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array
*/
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0; virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
/**
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback) = 0; void *par_ptr, int numparticles, int &numaddback) = 0;
/**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0; double dt, double c, bool usedt = false, int streamId = -1) = 0;
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0; double mass, int npart, double c, int streamId = -1) = 0;
/**
* BorisPusher push function with transformto function form OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false, double dt, double c, bool usedt = false,

View File

@ -6,12 +6,21 @@
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/**
* Abstract class defining methods for DKS FFT class.
* Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
*/
class BaseFFT { class BaseFFT {
protected: protected:
int defaultN[3]; int defaultN[3];
int defaultNdim; int defaultNdim;
/**
* Check if FFT plan is created for the needed dimension and FFT size.
* Returns true if the plan has been created and false if no plan for specified dimension
* and size exists.
*/
bool useDefaultPlan(int ndim, int N[3]) { bool useDefaultPlan(int ndim, int N[3]) {
if (ndim != defaultNdim) if (ndim != defaultNdim)
return false; return false;
@ -24,18 +33,57 @@ public:
virtual ~BaseFFT() { } virtual ~BaseFFT() { }
/** Setup FFT - init FFT library used by chosen device. */
virtual int setupFFT(int ndim, int N[3]) = 0; virtual int setupFFT(int ndim, int N[3]) = 0;
/** Setup real to complex FFT - init FFT library used by chosen device. */
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
/** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0; virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
/** Clean up. */
virtual int destroyFFT() = 0; virtual int destroyFFT() = 0;
/**
* Exectute C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeFFT(void * mem_ptr, int ndim, int N[3], virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
int streamId = -1, bool forward = true) = 0; int streamId = -1, bool forward = true) = 0;
/**
* Exectute inverse C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Normalize the FFT or IFFT.
* mem_ptr - memory to complex data.
*/
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Exectute R2C FFT.
* real_ptr - real input data for FFT, comp_ptr - memory on the device where
* results for the FFT are stored as complex numbers.
*/
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0; int streamId = -1) = 0;
/**
* Exectute C2R FFT.
* real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
*/
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0; int streamId = -1) = 0;
/**
* Normalize CR FFT.
*/
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
}; };

View File

@ -4,24 +4,27 @@
#include <iostream> #include <iostream>
#include <cmath> #include <cmath>
/**
* Interface to implement Greens function calculations for OPAL.
*/
class GreensFunction { class GreensFunction {
public: public:
virtual ~GreensFunction() { } virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */ /** calc greens integral, as defined in OPAL. */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0; double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */ /** integration if rho2_m, see OPAL for more details. */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0; int streamId = -1) = 0;
/** mirror rho2_m field */ /** mirror rho2_m field. */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0; virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */ /** multiply two complex fields from device memory. */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0; virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
}; };

View File

@ -5,17 +5,22 @@
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
/** Struct to hold voxel position for PET image. */
struct VoxelPosition { struct VoxelPosition {
float x; float x;
float y; float y;
float z; float z;
}; };
/** Struct that holds pair of detectors that registered an envent. */
struct ListEvent { struct ListEvent {
unsigned detA : 16; unsigned detA : 16;
unsigned detB : 16; unsigned detB : 16;
}; };
/**
* Interface to implement PET image reconstruction.
*/
class ImageReconstruction { class ImageReconstruction {
protected: protected:
@ -25,7 +30,8 @@ public:
virtual ~ImageReconstruction() { } virtual ~ImageReconstruction() { }
/** Caluclate source. /**
* Caluclate source.
* Places a sphere at each voxel position and calculate the avg value and std value of pixels * Places a sphere at each voxel position and calculate the avg value and std value of pixels
* that are inside this sphere. All the sphere used have the same diameter. * that are inside this sphere. All the sphere used have the same diameter.
*/ */
@ -33,7 +39,8 @@ public:
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Calculate background. /**
* Calculate background.
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter. * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Caluclate source using differente sources. /**
* Caluclate source using differente sources.
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* each sphere is given by *diameter array. * each sphere is given by *diameter array.
@ -52,7 +60,7 @@ public:
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** /**
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
* smaller sphere. * smaller sphere.
@ -61,7 +69,8 @@ public:
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Generate normalization. /**
* Generate normalization.
* Goes trough detectors pairs and if detector pair crosses image launches seperate kernel * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
* that updates voxel values in the image on the slope between these two detectors. * that updates voxel values in the image on the slope between these two detectors.
*/ */
@ -69,14 +78,16 @@ public:
void *det_position, int total_det) = 0; void *det_position, int total_det) = 0;
/** Calculate forward projection. /**
* Calculate forward projection.
* For image reconstruction calculates forward projections. * For image reconstruction calculates forward projections.
* see recon.cpp for details * see recon.cpp for details
*/ */
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events) = 0; void *image_position, int num_events) = 0;
/** Calculate backward projection. /**
* Calculate backward projection.
* For image reconstruction calculates backward projections. * For image reconstruction calculates backward projections.
* see recon.cpp for details * see recon.cpp for details
*/ */
@ -84,29 +95,29 @@ public:
void *det_position, void *image_position, void *det_position, void *image_position,
int num_events, int num_voxels) = 0; int num_events, int num_voxels) = 0;
/** Set the voxel dimensins on device. /**
* *Set the voxel dimensins on device.
*/ */
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0; virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
/** Set the image edge variables on the device. /**
* * Set the image edge variables on the device.
*/ */
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0; virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
/** Set the image edge1 on the device. /**
* * Set the image edge1 on the device.
*/ */
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0; virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
/** Set the minimum crystan in one ring values on the device. /**
* * Set the minimum crystan in one ring values on the device.
*/ */
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
float min_CrystalDist_InOneRing1) = 0; float min_CrystalDist_InOneRing1) = 0;
/** Set all other required parameters for reconstruction. /**
* * Set all other required parameters for reconstruction.
*/ */
virtual int setParams(float matrix_distance_factor, float phantom_diameter, virtual int setParams(float matrix_distance_factor, float phantom_diameter,
float atten_per_mm, float ring_diameter) = 0; float atten_per_mm, float ring_diameter) = 0;

View File

@ -18,6 +18,17 @@
typedef std::vector<Parameter> Parameters; typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States; typedef std::vector<State> States;
/**
* DKS autotuning class, allows to auto-tune the defince function.
* Executes the defined function for auto-tuning and searches for optimal parameters to improve
* the function execution time. The function that is auto-tuned, parameters and the ranges
* need to be set. Includes multiple search methods, that searches the parameter space to finde
* the optimal solution.
* 1) exaustive search
* 2) line search
* 3) hill climbimg
* 4) simulated annealing
*/
class DKSAutoTuning { class DKSAutoTuning {
private: private:
@ -36,12 +47,13 @@ private:
int loops_m; int loops_m;
/** Update parameters from a state */ /** Update parameters from a state. */
int setParameterValues(States states); int setParameterValues(States states);
/** Evaluate the function and set execution time /**
* Returns DKS_ERROR if errors occured during function execution. * Evaluate the function and set execution time
* Returns DKS_SUCCESS if function executed as planned. * Returns DKS_ERROR if errors occured during function execution.
* Returns DKS_SUCCESS if function executed as planned.
*/ */
int evaluateFunction(double &value); int evaluateFunction(double &value);
@ -50,12 +62,13 @@ public:
/** Constructor */ /** Constructor */
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100); DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
/** Destructor */ /** Destructor. */
~DKSAutoTuning(); ~DKSAutoTuning();
/** Set function to auto tune. /**
* Caller of setFunction is responsible to bind the correct parameters * Set function to auto tune.
* to the function with std::bind. * Caller of setFunction is responsible to bind the correct parameters
* to the function with std::bind.
*/ */
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) { void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
f_m = f; f_m = f;
@ -63,15 +76,21 @@ public:
evaluate_time_m = evaluate_time; evaluate_time_m = evaluate_time;
} }
/**
* Set function to auto tune.
* Caller of setFunction is responsible to bind the correct parameters
* to the function with std::bind.
*/
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) { void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
fd_m = f; fd_m = f;
function_name_m = name; function_name_m = name;
evaluate_time_m = evaluate_time; evaluate_time_m = evaluate_time;
} }
/** Set parameter for auto tuning. /**
* Provide a pointer to a parameter that will be changed during auto-tuning * Set parameter for auto tuning.
* and a min-max value for this element * Provide a pointer to a parameter that will be changed during auto-tuning
* and a min-max value for this element
*/ */
template <typename T1> template <typename T1>
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) { void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@ -85,9 +104,9 @@ public:
/** Perform exaustive search evaluating all the parameter configurations */ /** Perform exaustive search evaluating all the parameter configurations */
void exaustiveSearch(); void exaustiveSearch();
/** Perform auto-tuning. /**
* Perform line-search auto-tuning by variying parameters one at a time and keeping other * Perform line-search auto-tuning by variying parameters one at a time.
* parameters constant. * After one parameter is auto-tuned the next on is varied
*/ */
void lineSearch(); void lineSearch();

View File

@ -4,6 +4,7 @@
#include <iostream> #include <iostream>
#include <cmath> #include <cmath>
/** Tester class for auto-tuning search algorithms. */
class DKSAutoTuningTester { class DKSAutoTuningTester {
friend class DKSBaseMuSR; friend class DKSBaseMuSR;

View File

@ -1,9 +1,3 @@
/** Class to save and load DKS autotunning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
*/
#ifndef DKS_CONFIG #ifndef DKS_CONFIG
#define DKS_CONFIG #define DKS_CONFIG
@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
const std::string config_dir = "/.config/DKS"; const std::string config_dir = "/.config/DKS";
const std::string config_file = "/autotuning.xml"; const std::string config_file = "/autotuning.xml";
/** Class to save and load DKS autotunning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
* TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
* so the auto-tuning has no effect.
*/
class DKSConfig { class DKSConfig {
private: private:

View File

@ -9,6 +9,9 @@
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE }; enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
/**
* Parameter class allows to change the searchable parameters during the auto-tuning.
*/
class Parameter { class Parameter {
private: private:
@ -64,6 +67,10 @@ public:
}; };
/**
* Struct to hold a auto-tuning state.
* Holds the current value, min, max and a step to witch a state can change.
*/
struct State { struct State {
double value; double value;
double min; double min;
@ -74,6 +81,12 @@ struct State {
typedef std::vector<Parameter> Parameters; typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States; typedef std::vector<State> States;
/**
* Used by auto-tuning search algorithms to move between parameter configurations.
* Allows to move from one parameter stat to another, get neighboring states,
* move to neighboring states and save state information. Print functions are available
* for debugging purposes, to follow how algorithm muves between sates.
*/
class DKSSearchStates { class DKSSearchStates {
private: private:

View File

@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/*
Info: allcate memory and write data (push)
Return: pointer to memory object
*/
/*
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
void * mem_ptr;
mem_ptr = cuda_allocateMemory(size, ierr);
if (ierr == DKS_SUCCESS)
ierr = cuda_writeData(mem_ptr, in_data, size);
return mem_ptr;
}
*/
/*
Info: read data and free memory (pull)
Return: success or error code
*/
/*
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
ierr = cuda_readData(mem_ptr, out_data, size);
if (ierr == DKS_SUCCESS)
ierr = cuda_freeMemory(mem_ptr);
else
return DKS_ERROR;
if (ierr == DKS_SUCCESS)
return DKS_SUCCESS;
else
return DKS_ERROR;
}
*/
/*
Info: execute function
Return: success or error code
*/
int CudaBase::cuda_executeFunction() {
std::cout << "Execute function" << std::endl;
return DKS_SUCCESS;
}
/*
Info: clean up
Return: success or error code
*/
int CudaBase::cuda_cleanUp() {
std::cout << "clean up" << std::endl;
return DKS_SUCCESS;
}

View File

@ -16,6 +16,11 @@
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
/**
* CUDA base class handles device setup and basic communication with the device.
* Handles devicew setup, memory manegement, data transfers and stream setup for
* asynchronous data transfers and kernel executions.
*/
class CudaBase { class CudaBase {
private: private:
@ -48,17 +53,17 @@ public:
/** /**
* Delete curandState. * Delete curandState.
* Delete curandState array on the GPU and free memory. * Delete curandState array on the GPU and free memory.
* Return success or error code * Return success or error code
*/ */
int cuda_deleteCurandStates(); int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array /**
* * Create 'size' random numbers on the device and save in mem_ptr array.
*/ */
int cuda_createRandomNumbers(void *mem_ptr, int size); int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states /**
* * Get a pointer to curand states.
*/ */
curandState* cuda_getCurandStates(); curandState* cuda_getCurandStates();
@ -75,93 +80,98 @@ public:
int cuda_addStream(cudaStream_t tmpStream, int &streamId); int cuda_addStream(cudaStream_t tmpStream, int &streamId);
/** /**
* delete cuda stream * delete cuda stream.
* success or error code * success or error code
*/ */
int cuda_deleteStream(int id); int cuda_deleteStream(int id);
/** /**
* delete all streams * delete all streams.
* success or error code * success or error code
*/ */
int cuda_deleteStreams(); int cuda_deleteStreams();
/** /**
* set stream to use * set stream to use.
* success or error code * success or error code
*/ */
int cuda_setStream(int id); int cuda_setStream(int id);
/** /**
* Info: get stream that is used * get stream that is used.
* Return: return id of curretn stream * Return: return id of curretn stream
*/ */
int cuda_getStreamId(); int cuda_getStreamId();
/** /**
* Info: reset to default stream * reset to default stream.
* Return: success or error code * Return: success or error code
*/ */
int cuda_defaultStream(); int cuda_defaultStream();
/** /**
* Info: get number of streams * get number of streams.
* Return: success or error code * Return: success or error code
*/ */
int cuda_numberOfStreams(); int cuda_numberOfStreams();
/** /**
* Info: get stream * get stream.
* Return: stream * Return: stream
*/ */
cudaStream_t cuda_getStream(int id); cudaStream_t cuda_getStream(int id);
/** /**
* Get default cublass handle * Get default cublass handle.
*/ */
cublasHandle_t cuda_getCublas(); cublasHandle_t cuda_getCublas();
/** /**
* Info: get information on cuda devices * get information on cuda devices.
* Return: success or error code * Return: success or error code
*/ */
int cuda_getDevices(); int cuda_getDevices();
/** Get CUDA device count. /**
* Sets the number of devices on the platform that can use CUDA. * Get CUDA device count.
* Returns DKS_SUCCESS * Sets the number of devices on the platform that can use CUDA.
*/ */
int cuda_getDeviceCount(int &ndev); int cuda_getDeviceCount(int &ndev);
/** Get the name of the device. /**
* QUery the device properties of the used device and set the string device_name * Get the name of the device.
* QUery the device properties of the used device and set the string device_name
*/ */
int cuda_getDeviceName(std::string &device_name); int cuda_getDeviceName(std::string &device_name);
/** Set CUDA device to use. /**
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR * Set CUDA device to use.
* If device passed in is larger than the number of devices use
* the default:0 and return DKS_ERROR
*/ */
int cuda_setDevice(int device); int cuda_setDevice(int device);
/** Get unique devices /**
* Get array of indeces with the unique CUDA devices available on the paltform * Get unique devices.
* Get array of indeces with the unique CUDA devices available on the paltform
*/ */
int cuda_getUniqueDevices(std::vector<int> &devices); int cuda_getUniqueDevices(std::vector<int> &devices);
/** /**
* Info: init device * Initialize connection to the device.
* Only needed when runtime compilation is used.
* Return: success or error code * Return: success or error code
*/ */
int cuda_setUp(); int cuda_setUp();
/** /**
* Info: allocate memory on cuda device * Allocate memory on cuda device.
* Return: pointer to memory object * Return: pointer to memory object
*/ */
void * cuda_allocateMemory(size_t size, int &ierr); void * cuda_allocateMemory(size_t size, int &ierr);
/** /**
* Info: allocate host memory in pinned memory * Allocate host memory in pinned memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -174,8 +184,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory. /**
* Set all the elements of the array on the device to zero. * Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/ */
template<typename T> template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) { int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
@ -189,8 +200,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory. /**
* Set all the elements of the array on the device to zero. * Zero CUDA memory async.
* Set all the elements of the array on the device to zero.
*/ */
template<typename T> template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) { int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
@ -209,7 +221,7 @@ public:
} }
/** /**
* Info: write data to memory * Write data to memory
* Retrun: success or error code * Retrun: success or error code
*/ */
template<typename T> template<typename T>
@ -226,7 +238,7 @@ public:
} }
/** /**
* Info: write data assynchonuously * Write data assynchonuously
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -258,7 +270,7 @@ public:
} }
/** /**
* Info: read data from memory * Read data from memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -275,7 +287,7 @@ public:
} }
/** /**
* Info: read data async from device memory * Read data async from device memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -307,19 +319,19 @@ public:
} }
/** /**
* Info: free memory on device * Free memory on device
* Return: success or error code * Return: success or error code
*/ */
int cuda_freeMemory(void * mem_ptr); int cuda_freeMemory(void * mem_ptr);
/** /**
* Info: free page locked memory on host * Free page locked memory on host
* Return: success or erro code * Return: success or erro code
*/ */
int cuda_freeHostMemory(void * mem_ptr); int cuda_freeHostMemory(void * mem_ptr);
/** /**
* Info: allcate memory and write data (push) * Allcate memory and write data (push)
* Return: pointer to memory object * Return: pointer to memory object
*/ */
template<typename T> template<typename T>
@ -335,7 +347,7 @@ public:
} }
/** /**
* Info: read data and free memory (pull) * Read data and free memory (pull)
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -353,21 +365,10 @@ public:
else else
return DKS_ERROR; return DKS_ERROR;
} }
/**
* Info: execute function
* Return: success or error code
*/
int cuda_executeFunction();
/**
* Info: clean up
* Return: success or error code
*/
int cuda_cleanUp();
/** /**
* Info: sync cuda device * Sync cuda device.
* Waits till all the tasks on the GPU are finished.
* Return: success or error code * Return: success or error code
*/ */
int cuda_syncDevice() { int cuda_syncDevice() {
@ -376,7 +377,7 @@ public:
} }
/** /**
* Page-lock host memory * Page-lock host memory.
*/ */
template<typename T> template<typename T>
int cuda_hostRegister(T *ptr, int size) { int cuda_hostRegister(T *ptr, int size) {
@ -390,7 +391,7 @@ public:
} }
/** /**
* Release page locked memory * Release page locked memory.
*/ */
template<typename T> template<typename T>
int cuda_hostUnregister(T *ptr) { int cuda_hostUnregister(T *ptr) {
@ -403,7 +404,7 @@ public:
} }
/** /**
* Info: print device memory info (total, used, avail) * Print device memory info (total, used, avail)
* Return: success or error code * Return: success or error code
*/ */
int cuda_memInfo() { int cuda_memInfo() {

View File

@ -8,6 +8,7 @@
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
class CudaChiSquare { class CudaChiSquare {
private: private:

View File

@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
const std::string cudaFunctFooter = "}\n"; const std::string cudaFunctFooter = "}\n";
/**
* CUDA implementation of ChiSquareRuntime class.
* Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
*/
class CudaChiSquareRuntime : public ChiSquareRuntime{ class CudaChiSquareRuntime : public ChiSquareRuntime{
private: private:
@ -29,65 +33,72 @@ private:
cublasHandle_t defaultCublasRT; cublasHandle_t defaultCublasRT;
/** Setup to init device /**
* Create context and init device for RT compilation * Setup to init device.
* Create context and init device for RT compilation
*/ */
void setUpContext(); void setUpContext();
/** Private function to add function to kernel string /**
* * Private function to add function to kernel string.
*/ */
std::string buildProgram(std::string function); std::string buildProgram(std::string function);
public: public:
/** Constructor with CudaBase argument /**
* * Constructor with CudaBase argument
*/ */
CudaChiSquareRuntime(CudaBase *base); CudaChiSquareRuntime(CudaBase *base);
/** Default constructor init cuda device /**
* * Default constructor init cuda device
*/ */
CudaChiSquareRuntime(); CudaChiSquareRuntime();
/** Default destructor /**
* * Default destructor.
*/ */
~CudaChiSquareRuntime(); ~CudaChiSquareRuntime();
/** Compile program and save ptx. /**
* Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program * Add function string to the calcFunction kernel and compile the program
* Function must be valid C math expression. Parameters can be addressed in * Function must be valid C math expression. Parameters can be addressed in
* a form par[map[idx]] * a form par[map[idx]]
*/ */
int compileProgram(std::string function, bool mlh = false); int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel /**
* Launch selected kernel.
* Launched the selected kernel from the compiled code. * Launched the selected kernel from the compiled code.
* Result is put in &result variable * Result is put in &result variable.
*/ */
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap, int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double timeStart, double timeStep,
double &result); double &result);
/** Write params to device. /**
* Write params to device.
* Write params from double array to mem_param_m memory on the device. * Write params from double array to mem_param_m memory on the device.
*/ */
int writeParams(const double *params, int numparams); int writeParams(const double *params, int numparams);
/** Write functions to device. /**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device. * Write function values from double array to mem_func_m memory on the device.
*/ */
int writeFunc(const double *func, int numfunc); int writeFunc(const double *func, int numfunc);
/** Write maps to device. /**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device. * Write map values from int array to mem_map_m memory on the device.
*/ */
int writeMap(const int *map, int nummap); int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square. /**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
* the maximum number of elements in any datasets that will be used for calculations. Size_param, * the maximum number of elements in any datasets that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in * size_func and size_map are the maximum number of parameters, functions and maps used in
@ -96,14 +107,16 @@ public:
int initChiSquare(int size_data, int size_param, int size_func, int size_map); int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square. /**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps * Frees the chisq temporary memory and memory for params, functions and maps
*/ */
int freeChiSquare(); int freeChiSquare();
/** Check if CUDA device is able to run the chi square kernel. /**
* Redundant - all new CUDA devices that support RT compilation will also support * Check if CUDA device is able to run the chi square kernel.
* double precision, there are no other requirements to run chi square on GPU * Redundant - all new CUDA devices that support RT compilation will also support
* double precision, there are no other requirements to run chi square on GPU
*/ */
int checkChiSquareKernels(int fitType, int &threadsPerBlock) { int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
return DKS_SUCCESS; return DKS_SUCCESS;

View File

@ -1,5 +1,6 @@
#include "CudaCollimatorPhysics.cuh" #include "CudaCollimatorPhysics.cuh"
//constants used in OPAL
//#define M_P 0.93827231e+00 //#define M_P 0.93827231e+00
#define M_P 0.93827204e+00 #define M_P 0.93827204e+00
#define C 299792458.0 #define C 299792458.0
@ -11,6 +12,7 @@
#define Z_P 1 #define Z_P 1
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
//parameter array indexes
#define POSITION 0 #define POSITION 0
#define ZSIZE 1 #define ZSIZE 1
#define RHO_M 2 #define RHO_M 2
@ -28,12 +30,18 @@
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
#define NUMPAR 13 #define NUMPAR 13
/**
* CUDA device function for calculating dot product.
*/
__device__ inline double dot(double3 &d1, double3 &d2) { __device__ inline double dot(double3 &d1, double3 &d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
} }
/**
* CUDA devce function to calculate cross product.
*/
__device__ inline double3 cross(double3 &lhs, double3 &rhs) { __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp; double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y; tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
return tmp; return tmp;
} }
/**
* CUDA device function to calculate arbitrary rotation.
*/
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) { __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
double c=cos(Theta); double c=cos(Theta);
double s=sin(Theta); double s=sin(Theta);
@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
return tmp; return tmp;
} }
/**
* CUDA device function to check if particle is still in material.
* z - particle position, par - parameter array. Particle is considered inside the
* material if z is > material starting position and z < material starting position - mat size.
*/
__device__ inline bool checkHit(double &z, double *par) { __device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */ /* check if particle is in the degrader material */
@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
} }
/**
* CUDA device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{ {
@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
} }
/**
* CUDA device function for rotation in 2 dimensions.
* For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),
* "Passage of particles through matter"
*/
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
double &normP, double &thetacou, double &deltas, int coord, double &normP, double &thetacou, double &deltas, int coord,
double *par) double *par)
@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
/**
* CUDA device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* For details on the algorithm see OPAL user guide.
*/
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par, __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
bool enableRutherfordScattering) bool enableRutherfordScattering)
{ {
@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
} }
/**
* CUDA kernel that performs one step in particle movement trough mater.
* One thread is launched for each particle in the simulation. The kernel checks if the particle
* is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
* particles that are exiting the material.
* @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
template <typename T> template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering) int numparticles, bool enableRutherfordScattering)
@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
volatile int tid = threadIdx.x; volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid; volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory //transfer params and particle positions to shared memory
//R is kept in shared memory in order to reduce register pressure for the kernel
extern __shared__ double smem[]; extern __shared__ double smem[];
double *p = (double*)smem; double *p = (double*)smem;
double3 *R = (double3*)&smem[NUMPAR]; double3 *R = (double3*)&smem[NUMPAR];
curandState s; curandState s; //each tread gets its own cuRand state for random number generation
double3 P; double3 P;
//load parameters to shared memory
for (int tt = tid; tt < NUMPAR; tt += blockDim.x) for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
p[tt] = par[tt]; p[tt] = par[tt];
//sync threads to ensure that parameters are finished loading
__syncthreads(); __syncthreads();
//there might be some empty threads that do no work
if (idx < numparticles) { if (idx < numparticles) {
s = state[idx]; s = state[idx]; //load cuRand state to local memory
R[tid] = data[idx].Rincol; R[tid] = data[idx].Rincol; //load position to shared memory
P = data[idx].Pincol; P = data[idx].Pincol; //load momentum to local memory
bool pdead = false; bool pdead = false;
volatile double sq = sqrt(1.0 + dot(P, P)); volatile double sq = sqrt(1.0 + dot(P, P));
double Eng; double Eng;
//check if particle is still in the material
if (checkHit(R[tid].z, p)) { if (checkHit(R[tid].z, p)) {
//calculate enery loss
Eng = (sq - 1) * M_P; Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p); energyLoss(Eng, pdead, s, p);
//check if particle is not dead
if (!pdead) { if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P)); sq = sqrt(dot(P, P));
//caluclate Coulomb scattering
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering); coulombScat(R[tid], P, s, p, enableRutherfordScattering);
//update particle momentum
data[idx].Pincol = P; data[idx].Pincol = P;
} else { } else {
//mark particle as dead (-1)
data[idx].label = -1; data[idx].label = -1;
} }
//update cuRand state
state[idx] = s; state[idx] = s;
} else { } else {
//particle exits material - drift and mark as exiting (-2)
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
} }
//update particle position
data[idx].Rincol = R[tid]; data[idx].Rincol = R[tid];
} }
} }
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, /**
curandState *state, int numparticles, * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
bool enableRutherfordScattering) * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
* Deprecated - GPU version does not use SoA.
* @param[in] data structure of arrays containing particle data
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles,
bool enableRutherfordScattering)
{ {
//get global id and thread id //get global id and thread id
@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
} }
/**
* Device function to swich off unitless positions.
*/
inline __device__ void unitlessOff(double3 &a, const double &c) { inline __device__ void unitlessOff(double3 &a, const double &c) {
a.x *= c; a.x *= c;
a.y *= c; a.y *= c;
a.z *= c; a.z *= c;
} }
/**
* Device function to swich on unitless positions.
*/
inline __device__ void unitlessOn(double3 &a, const double &c) { inline __device__ void unitlessOn(double3 &a, const double &c) {
a.x /= c; a.x /= c;
a.y /= c; a.y /= c;
a.z /= c; a.z /= c;
} }
//swithch to unitless positions with dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOn(R, dtc);
unitlessOn(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//swithc to unitless positions with dt*c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
//swithc off unitless positions with dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOff(R, dtc);
unitlessOff(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitelss positions with dt*c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] npart number of particles
* @param[in] dtc dt*c
*/
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
//get global id and thread id //get global id and thread id
@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
} }
} }
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) { __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id //get global id and thread id
@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
} }
} }
/**
* CUDA kernel to perform particle kick.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gEf
* @param[in] *gBf
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge, double3 *gBf, double *gdt, double charge,
double mass, int npart, double c) double mass, int npart, double c)
@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
} }
struct compare_particle
{
int threshold;
compare_particle() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART p1, CUDA_PART p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART p1) {
return p1.label < threshold;
}
};
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
struct less_then
{
__host__ __device__
bool operator()(int x)
{
return x < 0;
}
};
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering) bool enableRutherfordScattering)
{ {

View File

@ -20,7 +20,8 @@
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU or MIC as AoS.
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
*/ */
typedef struct __align__(16) { typedef struct __align__(16) {
int label; int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
} CUDA_PART; } CUDA_PART;
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU as AoS
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/ */
typedef struct { typedef struct {
int label; int label;
@ -47,7 +51,8 @@ typedef struct {
} CUDA_PART_SMALL; } CUDA_PART_SMALL;
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU as SoA.
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
*/ */
typedef struct { typedef struct {
int *label; int *label;
@ -65,6 +70,9 @@ typedef struct {
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/ */
typedef struct { typedef struct {
int *label; int *label;
@ -73,11 +81,39 @@ typedef struct {
double3 *Pincol; double3 *Pincol;
} CUDA_PART2_SMALL; } CUDA_PART2_SMALL;
/** CudaCollimatorPhysics class. /**
* Contains kerenls that execute CollimatorPhysics functions form OPAL. * Operator used in thrust sort to compare particles by label.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation * Used to move dead particles to the end of array, since they have label -1 or -2.
*/ */
class CudaCollimatorPhysics : public DKSCollimatorPhysics{ struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
/**
* CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Contains kerenls that execute CollimatorPhysics functions form OPAL.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
*/
class CudaCollimatorPhysics : public DKSCollimatorPhysics {
private: private:
@ -86,32 +122,44 @@ private:
public: public:
/** Constructor with CudaBase argument /**
* * Constructor with CudaBase as argument.
* Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
*/ */
CudaCollimatorPhysics(CudaBase *base) { CudaCollimatorPhysics(CudaBase *base) {
m_base = base; m_base = base;
base_create = false; base_create = false;
} }
/** Constructor - empty. */ /**
* Empty constructor.
* Create a new instance of CudaCollimatorPhysics with its own CudaBase.
*/
CudaCollimatorPhysics() { CudaCollimatorPhysics() {
m_base = new CudaBase(); m_base = new CudaBase();
base_create = true; base_create = true;
} }
/** Destructor - empty */ /**
* Destructor.
* Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
*/
~CudaCollimatorPhysics() { ~CudaCollimatorPhysics() {
if (base_create) if (base_create)
delete m_base; delete m_base;
}; };
/** Execute collimator physics kernel. /**
* Execute collimator physics kernel.
* *
*/ */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numpartices, bool enableRutherforScattering = true); int numpartices, bool enableRutherforScattering = true);
/**
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
return DKS_ERROR; return DKS_ERROR;
} }
/** Sort particle array on GPU. /**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array * array so these particles are at the end of array
*/ */
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,18 +187,25 @@ public:
return DKS_ERROR; return DKS_ERROR;
} }
/** BorisPusher push function for integration from OPAL. /**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal
*/ */
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1); double dt, double c, bool usedt = false, int streamId = -1);
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass, void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1); int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL /**
* BorisPusher push function with transformto function form OPAL.
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal
*/ */

View File

@ -10,6 +10,10 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
/**
* Cuda FFT class based on BaseFFT interface.
* Uses cuFFT library to perform FFTs on nvidias GPUs.
*/
class CudaFFT : public BaseFFT { class CudaFFT : public BaseFFT {
private: private:
@ -34,7 +38,7 @@ public:
~CudaFFT(); ~CudaFFT();
/** /**
* Info: init cufftPlans witch can be reused for all FFTs of the same size and type * Init cufftPlans witch can be reused for all FFTs of the same size and type
* Return: success or error code * Return: success or error code
*/ */
int setupFFT(int ndim, int N[3]); int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
/** /**
* Info: destroy default FFT plans * Destroy default FFT plans
* Return: success or error code * Return: success or error code
*/ */
int destroyFFT(); int destroyFFT();
/*
Info: execute complex to complex double precision fft using cufft library
Return: success or error code
*/
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
/*
Info: execute ifft
Return: success or error code
*/
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize using cuda kernel for complex to complex iFFT
Return: success or error code
*/
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute real to complex double precision FFT
Return: success or error code
*/
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: exectue complex to real double precision FFT
Return: success or error code
*/
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1); int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
}; };

View File

@ -12,6 +12,7 @@
#include "../Algorithms/GreensFunction.h" #include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
class CudaGreensFunction : public GreensFunction{ class CudaGreensFunction : public GreensFunction{
private: private:

View File

@ -10,6 +10,7 @@
#include "../Algorithms/ImageReconstruction.h" #include "../Algorithms/ImageReconstruction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** CUDA implementation of ImageReconstruction interface. */
class CudaImageReconstruction : public ImageReconstruction { class CudaImageReconstruction : public ImageReconstruction {
private: private:

View File

@ -1,11 +1,3 @@
/** DKSBase class.
* DKSBase.h
* Author: Uldis Locans
* Date: 15.09.2014
* Base class of Dynamic Kernel Scheduler that handles the function calls
* from host application to DKS
*/
#ifndef H_DKS_BASE #ifndef H_DKS_BASE
#define H_DKS_BASE #define H_DKS_BASE
@ -41,7 +33,12 @@
#include "AutoTuning/DKSConfig.h" #include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */ /**
* API for handling communication function calls to DKS library.
* DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
* memory manegement, data transfer and other basic communication functions between
* the host and device.
*/
class DKSBase { class DKSBase {
private: private:
@ -74,7 +71,7 @@ protected:
DKSConfig dksconfig; DKSConfig dksconfig;
/** /**
* Check if current API is set to OpenCL * Check if current API is set to OpenCL.
* Return true/false wether current api is opencl * Return true/false wether current api is opencl
*/ */
bool apiOpenCL(); bool apiOpenCL();
@ -91,11 +88,11 @@ protected:
*/ */
bool apiOpenMP(); bool apiOpenMP();
/** Check if device is GPU */ /** Check if device is GPU. */
bool deviceGPU(); bool deviceGPU();
/** Check if device is CPU */ /** Check if device is CPU. */
bool deviceCPU(); bool deviceCPU();
/** Check if device is MIC */ /** Check if device is MIC. */
bool deviceMIC(); bool deviceMIC();
/** /**

View File

@ -20,6 +20,11 @@
#include "OpenCL/OpenCLChiSquareRuntime.h" #include "OpenCL/OpenCLChiSquareRuntime.h"
#endif #endif
/**
* API to handle musrfit calls to DKS library.
* Using ChiSquareRuntime interface allows to call chi square functions on the
* GPU or CPU using CUDA or OpenCL.
*/
class DKSBaseMuSR : public DKSFFT { class DKSBaseMuSR : public DKSFFT {
private: private:

View File

@ -24,6 +24,10 @@
#include "MIC/MICFFT.h" #include "MIC/MICFFT.h"
#endif #endif
/**
* API to handel calls to DKSFFT.
* Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
*/
class DKSFFT : public DKSBase { class DKSFFT : public DKSBase {
private: private:

View File

@ -10,6 +10,9 @@
#include "CUDA/CudaImageReconstruction.cuh" #include "CUDA/CudaImageReconstruction.cuh"
#endif #endif
/**
* API to handle PET image reconstruction calls.
*/
class DKSImageRecon : public DKSBase { class DKSImageRecon : public DKSBase {
private: private:
@ -22,87 +25,88 @@ public:
~DKSImageRecon(); ~DKSImageRecon();
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateSource(void *image_space, void *image_position, void *source_position, int callCalculateSource(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateBackground(void *image_space, void *image_position, void *source_position, int callCalculateBackground(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateSources(void *image_space, void *image_position, void *source_position, int callCalculateSources(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction - generate normalization. /**
* * Image reconstruction - generate normalization.
*/ */
int callGenerateNormalization(void *recon, void *image_position, int callGenerateNormalization(void *recon, void *image_position,
void *det_position, int total_det); void *det_position, int total_det);
/** Image reconstruction - forward correction. /**
* * Image reconstruction - forward correction.
*/ */
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events); void *image_position, int num_events);
/** Image reconstruction - backward projection. /**
* * Image reconstruction - backward projection.
*/ */
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
void *det_position, void *image_position, void *det_position, void *image_position,
int num_events, int num_voxels); int num_events, int num_voxels);
/** Set the voxel dimensins on device. /**
* Set the voxel dimensins on device.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
/** Set the image edge. /**
* Set the image edge.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setEdge(float x_edge, float y_edge, float z_edge); int setEdge(float x_edge, float y_edge, float z_edge);
/** Set the image edge1. /**
* Set the image edge1.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
/** Set the minimum crystan in one ring values. /**
* Set the minimum crystal in one ring values.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
/** Set all other required parameters for reconstruction. /**
* Set all other required parameters for reconstruction.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.

View File

@ -33,6 +33,11 @@
#include "MIC/MICCollimatorPhysics.h" #include "MIC/MICCollimatorPhysics.h"
#endif #endif
/**
* API to handle OPAL calls to DKS library.
* Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
* functions.
*/
class DKSOPAL : public DKSFFT { class DKSOPAL : public DKSFFT {
private: private:

View File

@ -26,6 +26,9 @@
#define MIC_WIDTH 128 #define MIC_WIDTH 128
/** MIC Base class handles device setup and basic communication with the device.
* Handles devicew setup, memory manegement and data transfers.
*/
class MICBase { class MICBase {
private: private:
@ -45,58 +48,60 @@ public:
int m_device_id; int m_device_id;
/* constructor */ /** constructor */
MICBase(); MICBase();
/* destructor */ /** destructor */
~MICBase(); ~MICBase();
/* /**
Info: create MKL rand streams for each thread * Create MKL rand streams for each thread
Return: success or error code * Return: success or error code
*/ */
int mic_createRandStreams(int size); int mic_createRandStreams(int size);
/* /**
Info: delete MKL rand streams * Delete MKL rand streams
Return: succes or error code * Return: succes or error code
*/ */
int mic_deleteRandStreams(); int mic_deleteRandStreams();
/* /**
Info: create a new signal for the mic * Create a new signal for the mic.
Return: success or error code * Signals can be used for assynchronous data transfers.
*/ * Return: success or error code
*/
int mic_createStream(int & streamId); int mic_createStream(int & streamId);
/* /**
Info: get the signal from the vector * Info: get the signal from the vector.
Return: mic signal * Return: mic signal
*/ */
int& mic_getStream(int id); int& mic_getStream(int id);
/* /**
Info: delete streams * Info: delete streams.
Return: success or error code * Return: success or error code
*/ */
int mic_deleteStreams(); int mic_deleteStreams();
/* /**
Info: set device id * Info: set device id.
Return: success or error code * Return: success or error code
*/ */
int mic_setDeviceId(int id); int mic_setDeviceId(int id);
/* /**
Info: get mic devices * Info: get mic devices.
Return: success or error code * Prints information about mic devices.
*/ * Return: success or error code
*/
int mic_getDevices(); int mic_getDevices();
/* /**
Info: allocate memory on MIC device * Allocate memory on MIC device.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
void * mic_allocateMemory(int size) { void * mic_allocateMemory(int size) {
@ -109,10 +114,10 @@ public:
return tmp; return tmp;
} }
/* /**
Info: transfer data to device * Transfer data to device.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) { int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
T* tmp_ptr = (T*)data_ptr; T* tmp_ptr = (T*)data_ptr;
@ -123,10 +128,10 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: write data to device, non-blocking * Write data to device, non-blocking.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
{ {
@ -139,10 +144,10 @@ public:
} }
/* /**
Info: read data from device * Read data from device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) { int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
T* tmp_ptr = (T*)data_ptr; T* tmp_ptr = (T*)data_ptr;
@ -154,10 +159,10 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: read data from device waiting for signal * Read data from device waiting for signal
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_readDataAsync(const void * data_ptr, void * result, int size, int mic_readDataAsync(const void * data_ptr, void * result, int size,
int streamId = -1, int offset = 0) { int streamId = -1, int offset = 0) {
@ -172,10 +177,10 @@ public:
} }
/* /**
Info: wait till all the signals are complete * Wait till all the signals are complete
Return siccess or error code * Return siccess or error code
*/ */
int mic_syncDevice() { int mic_syncDevice() {
//empty offload to wait for all the signals to finish and launch a new empy signal //empty offload to wait for all the signals to finish and launch a new empy signal
@ -193,10 +198,10 @@ public:
} }
/* /**
Info: free memory on device * Free memory on device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_freeMemory(void * data_ptr, int size) { int mic_freeMemory(void * data_ptr, int size) {
@ -210,10 +215,10 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: allocate memory and write data to device * Allocate memory and write data to device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
void * mic_pushData(const void * data, int size) { void * mic_pushData(const void * data, int size) {
T* tmp_ptr = new T[size]; T* tmp_ptr = new T[size];
@ -227,10 +232,10 @@ public:
return tmp_ptr; return tmp_ptr;
} }
/* /**
Info: read data and free memory on device * Read data and free memory on device
Return: success or erro code * Return: success or erro code
*/ */
template<typename T> template<typename T>
int mic_pullData(void * data_ptr, void * result, int size) { int mic_pullData(void * data_ptr, void * result, int size) {
T* tmp_ptr = (T*)data_ptr; T* tmp_ptr = (T*)data_ptr;

View File

@ -14,6 +14,9 @@
#include <offload.h> #include <offload.h>
#include "MICBase.h" #include "MICBase.h"
/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices.
* Not complete and untested because of the poor performance of first MIC devices.
*/
class MICChiSquare { class MICChiSquare {
MICBase *m_micbase; MICBase *m_micbase;

View File

@ -22,22 +22,34 @@
#define I_M 10 #define I_M 10
#define DT_M 11 #define DT_M 11
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic)) __declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) { double dot(mic_double3 d1, mic_double3 d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
} }
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic)) __declspec(target(mic))
double dot(double dx, double dy, double dz) { double dot(double dx, double dy, double dz) {
return (dx * dx + dy * dy + dz * dz); return (dx * dx + dy * dy + dz * dz);
} }
/**
* MIC device function to check if particle is still in material.
*/
__declspec(target(mic)) __declspec(target(mic))
bool checkHit(double &z, double *par) { bool checkHit(double &z, double *par) {
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
} }
/**
* MIC device function to calculate arbitrary rotation.
*/
__declspec(target(mic)) __declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane, void Rot(double &px, double &pz, double &x, double &z, double xplane,
double normP, double thetacou, double deltas, int coord) double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses AoS to store particle positions and momentum, paralelized using OpenMP.
* For details on the algorithm see OPAL user guide.
* Deprecated on favor of SoA data layout.
*/
__declspec(target(mic)) __declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) { void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
} }
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses SoA to store particle positions and momentum, paralelized using OpenMP.
* For details on the algorithm see OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label, void coulombScat(double *rx, double *ry, double *rz,
double *px, double *py, double *pz, int *label,
double *par, VSLStreamStatePtr &stream, int ii, int size) double *par, VSLStreamStatePtr &stream, int ii, int size)
{ {
//arrays for temporary storage, each core proceses MIC_WIDTH particles
double normP[MIC_WIDTH] __attribute__((aligned(64))); double normP[MIC_WIDTH] __attribute__((aligned(64)));
double deltas[MIC_WIDTH] __attribute__((aligned(64))); double deltas[MIC_WIDTH] __attribute__((aligned(64)));
double theta0[MIC_WIDTH] __attribute__((aligned(64))); double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
double z2[MIC_WIDTH] __attribute__((aligned(64))); double z2[MIC_WIDTH] __attribute__((aligned(64)));
double thetacou[MIC_WIDTH] __attribute__((aligned(64))); double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
//simd instruction tells the compiler its safe to vectorize the loop
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) { for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
} }
//vectorize the loop
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int i = ii; i < ii + size; i++) { for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
} }
//generate array of random numbers //generate array of random numbers
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) { void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
pdead = 1; pdead = 1;
} }
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) { void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
double *par = (double*) par_ptr; double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
the memory allocation and data trasnfer need to be handled before */
#pragma offload target(mic:m_micbase->m_device_id) \ #pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \ inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()]; VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x //for loop trough particles if not checkhit set label to -2 and update R.x
#pragma omp for simd #pragma omp for simd
for (int i = 0; i < numparticles; i++) { for (int i = 0; i < numparticles; i++) {
if ( !checkHit(data[i].Rincol.z, par) ) { if ( !checkHit(data[i].Rincol.z, par) ) {
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
{ {
//cast device memory pointers to appropriate types
int *label = (int*)label_ptr; int *label = (int*)label_ptr;
unsigned *localID = (unsigned*)localID_ptr; unsigned *localID = (unsigned*)localID_ptr;
double *rx = (double*)rx_ptr; double *rx = (double*)rx_ptr;
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
the memory allocation and data trasnfer need to be handled before */
#pragma offload target (mic:0) \ #pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \ in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \ in(localID:length(0) DKS_REUSE DKS_RETAIN) \

View File

@ -26,6 +26,12 @@ typedef struct {
} MIC_PART_SMALL; } MIC_PART_SMALL;
/**
* MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implementes OPALs collimator physics class for particle matter interactions using OpenMP
* and offload mode targetomg Intel Xeon Phi processors.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
*/
class MICCollimatorPhysics : public DKSCollimatorPhysics { class MICCollimatorPhysics : public DKSCollimatorPhysics {
private: private:

View File

@ -10,6 +10,10 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "MICBase.h" #include "MICBase.h"
/**
* MIC FFT based on BaseFFT interface.
* uses MKL library to offload FFT on Intel Xeon Phi devices.
*/
class MICFFT : public BaseFFT { class MICFFT : public BaseFFT {
private: private:

View File

@ -15,6 +15,7 @@
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
class MICGreensFunction : public GreensFunction { class MICGreensFunction : public GreensFunction {
private: private:

View File

@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
return p; return p;
} }
/**
* Merge sort implementation for intel MIC.
* Paralellized over all the MIC cores using OpenMP tasks.
*/
template <typename T> template <typename T>
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) { void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
} }
} }
/**
* Quicksort algorithm, developed for use on Intel MIC devices.
*/
template <typename T> template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) { void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
} }
/**
* Insertion sort of @p list, developed for use on Intel MIC.
* Used by quick_sort to sort small lists.
*/
template <typename T> template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) { void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

View File

@ -1,16 +1,3 @@
/*
Name: OpenCLBase
Author: Uldis Locans
Info: OpenCL base class to handle all the common details associated
with kernel launch on OpenCL device
Date: 2014.09.18
*/
#ifndef H_OPENCL_BASE #ifndef H_OPENCL_BASE
#define H_OPENCL_BASE #define H_OPENCL_BASE
@ -32,7 +19,7 @@
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/* struct for random number state */ /** struct for random number state. */
typedef struct { typedef struct {
double s10; double s10;
double s11; double s11;
@ -44,168 +31,194 @@ typedef struct {
bool gen; bool gen;
} RNDState; } RNDState;
/**
* OpenCL base class to handle device setup and basic communication wiht the device.
* Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
* The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
* class contains methods to read the kernel files, compile the kernel codes and launch kernels
* from the compiled codes. Which kernel file needs to be loaded for the specif functin is
* handled by the base class that is launching the kernel.
*/
class OpenCLBase { class OpenCLBase {
private: private:
//variables containig OpenCL device and platform ids
static cl_platform_id m_platform_id; static cl_platform_id m_platform_id;
static cl_device_id m_device_id; static cl_device_id m_device_id;
//variables containit compiled OpenCL program and kernel
cl_context_properties m_context_properties[3]; cl_context_properties m_context_properties[3];
cl_program m_program; cl_program m_program;
cl_kernel m_kernel; cl_kernel m_kernel;
//variables for tracking OpenCL events
static cl_event m_last_event; static cl_event m_last_event;
cl_int m_num_events; cl_int m_num_events;
std::vector<cl_event> m_events; std::vector<cl_event> m_events;
//currently load kernel file
char * m_kernel_file; char * m_kernel_file;
//type of device used by OpenCL
cl_device_type m_device_type; cl_device_type m_device_type;
/* /**
Name: getPlatforms * Get all available OpenCL platforms.
Info: get all avaialble platforms and save in m_platform_ids, save number of platforms * Get all avaialble platforms and save in m_platform_ids, save number of platforms
Return: success or error code * Return: success or error code
*/ */
int ocl_getPlatforms(); int ocl_getPlatforms();
/* /**
Name: getDevice * Get first available OpenCL device of specified type.
Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu) * Get first avaialble devices and save device id and platform id for this device,
ReturnL success or error code * device name: (-gpu, -mic, -cpu)
*/ * ReturnL success or error code
*/
int ocl_getDevice(const char* device_name); int ocl_getDevice(const char* device_name);
/* /**
Name getDeviceType * Get cl_device_type from the specified device name.
Info: get device type from device name (-gpu, -cpu, -mic) * get device type from device name (-gpu, -cpu, -mic)
Return: success or error code * Return: success or error code
*/ */
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type); int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
/* /**
Name: createContext * Create OpenCL context with specified device.
Info: create context with specified device * Return: success or error code
Return: success or error code */
*/
int ocl_createContext(); int ocl_createContext();
/* /**
Name: buildProgram * Build program from specified kernel file.
Info: build program from specified kernel file * Return: success or error code.
Return: success or error code
*/ */
int ocl_buildProgram(const char* kernel_file); int ocl_buildProgram(const char* kernel_file);
/** Compile program from kernel source string /**
* * Compile program from kernel source string.
* Takes a string read from OpenCL kernel file saved in kernel_source and compiles the
* OpenCL program, that can be then executed on the device.
* opts is a string specifiend additional compiler flags.
*/ */
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL); int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
protected: protected:
//memory for random number states
int defaultRndSet; int defaultRndSet;
cl_mem defaultRndState; cl_mem defaultRndState;
public: public:
//OpenCL context and commad queue
static cl_context m_context; static cl_context m_context;
static cl_command_queue m_command_queue; static cl_command_queue m_command_queue;
/* /**
constructor * constructor
*/ */
OpenCLBase(); OpenCLBase();
/* /**
destructor * destructor
*/ */
~OpenCLBase(); ~OpenCLBase();
/* /**
Create RND states * Allocate memory for size random number states and init the rnd states.
Return: success or error code * Uses AMD clRng library for random numbers.
*/ * This library is only compatible with AMD devices.
*/
int ocl_createRndStates(int size); int ocl_createRndStates(int size);
/* Create an array of random numbers on the device /**
* * Create an array of random numbers on the device.
* Filles hte mem_ptr with random numbers.
*/ */
int ocl_createRandomNumbers(void *mem_ptr, int size); int ocl_createRandomNumbers(void *mem_ptr, int size);
/* /**
Destroy rnd states * Destroy rnd states and free device memory.
Return: success or error code * Return: success or error code
*/ */
int ocl_deleteRndStates(); int ocl_deleteRndStates();
/* /**
Name: getAllDevices * Prints info about all the available platforms and devices.
Info: get all available devices * Can be used for information purposes to see what devices are available on the system.
ReturnL success or error code * ReturnL success or error code.
*/ */
int ocl_getAllDevices(); int ocl_getAllDevices();
/** Get the OpenCL device count for the set type of device /**
* * Get the OpenCL device count for the set type of device.
* Device count is set in ndev parameter, returns success or error code.
*/ */
int ocl_getDeviceCount(int &ndev); int ocl_getDeviceCount(int &ndev);
/** Get the name of the device used /**
* Get the name of the device currently us use.
*/ */
int ocl_getDeviceName(std::string &device_name); int ocl_getDeviceName(std::string &device_name);
/** Set the device to use for OpenCL kernels. /**
* device id to use is passed as integer. * Set the device to use for OpenCL kernels.
* Device id to use is passed as integer.
*/ */
int ocl_setDevice(int device); int ocl_setDevice(int device);
/** Get a list of all the unique devices of the same type that can run OpenCL kernels /**
* Used when GPUs of different types might be pressent on the system. * Get a list of all the unique devices of the same type that can run OpenCL kernels.
* Used when GPUs of different types might be pressent on the system.
*/ */
int ocl_getUniqueDevices(std::vector<int> &devices); int ocl_getUniqueDevices(std::vector<int> &devices);
/* /**
Name: setUp * Initialize OpenCL connection with a device of specified type.
Info: set up opencl resources * Find if specified device is avaialble, creates a contex and command queue.
Return: success or error code * Returns success or error code.
*/ */
int ocl_setUp(const char* device_name); int ocl_setUp(const char* device_name);
/* /**
Name: loadKernel * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
Info: load and compile opencl kernel file if it has changed * Load and compile opencl kernel file if it has changed.
Return: success or error code * Return: success or error code
*/ */
int ocl_loadKernel(const char* kernel_file); int ocl_loadKernel(const char* kernel_file);
/** Build program from kernel source. /**
* Build program from kernel source.
* Builds a program from source code provided in kernel_source. * Builds a program from source code provided in kernel_source.
* If compilation fails will return DKS_ERROR * If compilation fails will return DKS_ERROR
*/ */
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL); int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
/* /**
Name: allocateMemory * Allocate memory on the device.
Info: allocate memory on device * Return: return pointer to memory
Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int &ierr); cl_mem ocl_allocateMemory(size_t size, int &ierr);
/* /**
Name: allocateMemory * Allocate memory of specific type on device.
Info: allocate memory on device * The availabel types are cl_mem_flags type listed in OpenCL documentation:
Return: return pointer to memory * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR,
* CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
* Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer /**
* Set all the elemetns in the device array to zero * Zero OpenCL memory buffer.
* Set all the elemetns in the device array to zero.
*/ */
template <typename T> template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) { int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@ -218,92 +231,90 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Name: writeData * Write data to device memory (needs ptr to mem object)
Info: write data to device memory (needs ptr to mem object) * Return: success or error code
Return: success or error code */
*/
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/* /**
Name: copyData * Copy data from one buffer on the device to another
Info: copy data from one buffer on the device to another * Return: success or error code
Return: success or error code */
*/
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size); int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
/* /**
Name: createKernel * Create kernel from compiled OpenCL program.
Info: create kernel from program * Return: success or error code
Return: success or error code */
*/
int ocl_createKernel(const char* kernel_name); int ocl_createKernel(const char* kernel_name);
/* /**
Name: setKernelArgs * Set argiments for the kernel that will be launched.
Info: set opencl kernel arguments * Return: success or error code
Return: success or error code */
*/
int ocl_setKernelArg(int idx, size_t size, const void *arg_value); int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
/* /**
Name: executeKernel * Execute selected kernel.
Info: execute selected kernel (needs kernel parameters) * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
Return: success or error code * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
* kernel arguments must be set.
* Return: success or error code
*/ */
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL); int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
/* /**
Name: readData * Read data from device (needs pointer to mem object).
Info: read data from device (needs pointer to mem object) * Return: success or error code
Return: success or error code */
*/
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/* /**
Name: freeMemory * Free device memory (needs ptr to mem object).
Info: free device memory (needs ptr to mem object) * Return: success or error code
Return: success or error code */
*/
int ocl_freeMemory(cl_mem mem_ptr); int ocl_freeMemory(cl_mem mem_ptr);
/* /**
Name: cleanUp * Free opencl resources.
Info: free opencl resources * Deletes the kernel, compiled program, command queue and colese the connection
Return: success or error code * to device by releasing the context.
*/ * Return: success or error code
*/
int ocl_cleanUp(); int ocl_cleanUp();
/* /**
Name: deviceInfo * Print info of currently selected device.
Info: print device info (mostly for debugging purposes) * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
Return: success or error code * Return: success or error code
*/ */
int ocl_deviceInfo(bool verbose = true); int ocl_deviceInfo(bool verbose = true);
/* Check OpenCL kernel. /*
* Query device and check if it can run the kernel with required parameters * Check OpenCL kernel.
* Query device and check if it can run the kernel with required parameters.
* Also check the available OpenCL extensions - usefull for checking the supported device
* features, like double precission.
*/ */
int ocl_checkKernel(const char* kernel_name, int work_group_size, int ocl_checkKernel(const char* kernel_name, int work_group_size,
bool double_precision, int &threadsPerBlock); bool double_precision, int &threadsPerBlock);
/* /**
Name: clearEvents * Clear the event list.
Info: clear saved events (for debuging purposes) * Events can be used for timing and synchronization purposes.
Return: nothing */
*/
void ocl_clearEvents(); void ocl_clearEvents();
/* /**
Name: eventInfo * print information about kernel timings from event list.
Info: print information about kernel timings (for debuging purposes) * for debuging purposes
Return: nothing */
*/
void ocl_eventInfo(); void ocl_eventInfo();
/* /**
Return current command queue * Return current command queue.
*/ */
cl_command_queue ocl_getQueue() { return m_command_queue; } cl_command_queue ocl_getQueue() { return m_command_queue; }
}; };

View File

@ -14,7 +14,7 @@
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
/** Deprecated, SimpleFit implementation of ChiSquare. */
class OpenCLChiSquare { class OpenCLChiSquare {
private: private:

View File

@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
} }
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) { int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
//write params to gpu
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
return ierr; return ierr;
} }
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
if (numfunc == 0) if (numfunc == 0)
return DKS_SUCCESS; return DKS_SUCCESS;
//write function values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
return ierr; return ierr;
} }
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
if (nummap == 0) if (nummap == 0)
return DKS_SUCCESS; return DKS_SUCCESS;
//wrtie map values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
return ierr; return ierr;
} }
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
freeChiSquare(); freeChiSquare();
} }
//allocate temporary memory //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr); mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr); mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
if (size_func == 0) if (size_func == 0)
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
int ierr = DKS_ERROR; int ierr = DKS_ERROR;
if (initDone_m) { if (initDone_m) {
//free memory //free GPU memory
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
return DKS_ERROR; return DKS_ERROR;
} }
//check the GPU kernel
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
return ierr; return ierr;

View File

@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
const std::string openclFunctFooter = "}\n"; const std::string openclFunctFooter = "}\n";
/**
* OpenCL implementation of ChiSquareRuntime class.
* Implements ChiSquareRuntime interface to allow musrfit to target devices that
* support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
*/
class OpenCLChiSquareRuntime : public ChiSquareRuntime { class OpenCLChiSquareRuntime : public ChiSquareRuntime {
private: private:
OpenCLBase *m_oclbase; OpenCLBase *m_oclbase;
/** Private function to add user defined function to kernel string /**
* * Private function to add user defined function to kernel string.
*/ */
std::string buildProgram(std::string function); std::string buildProgram(std::string function);
/**
* Launch parallel reduction kernel to calculate the sum of data array
*/
double calculateSum(cl_mem data, int length); double calculateSum(cl_mem data, int length);
public: public:
/** Constructor wiht openclbase argument /**
* * Constructor wiht openclbase argument.
*/ */
OpenCLChiSquareRuntime(OpenCLBase *base); OpenCLChiSquareRuntime(OpenCLBase *base);
/** Default constructor /**
* * Default constructor
*/ */
OpenCLChiSquareRuntime(); OpenCLChiSquareRuntime();
/** Default destructor /**
* * Default destructor
*/ */
~OpenCLChiSquareRuntime(); ~OpenCLChiSquareRuntime();
/** Compile program and save ptx. /**
* Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program * Add function string to the calcFunction kernel and compile the program
* Function must be valid C math expression. Parameters can be addressed in * Function must be valid C math expression. Parameters can be addressed in
* a form par[map[idx]] * a form par[map[idx]]
*/ */
int compileProgram(std::string function, bool mlh = false); int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel /**
* Launch selected kernel.
* Launched the selected kernel from the compiled code. * Launched the selected kernel from the compiled code.
* Result is put in &result variable * Result is put in &result variable
*/ */
@ -64,22 +74,26 @@ public:
double timeStart, double timeStep, double timeStart, double timeStep,
double &result); double &result);
/** Write params to device. /**
* Write params to device.
* Write params from double array to mem_param_m memory on the device. * Write params from double array to mem_param_m memory on the device.
*/ */
int writeParams(const double *params, int numparams); int writeParams(const double *params, int numparams);
/** Write functions to device. /**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device. * Write function values from double array to mem_func_m memory on the device.
*/ */
int writeFunc(const double *func, int numfunc); int writeFunc(const double *func, int numfunc);
/** Write maps to device. /**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device. * Write map values from int array to mem_map_m memory on the device.
*/ */
int writeMap(const int *map, int nummap); int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square. /**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
* the maximum number of elements in any datasets that will be used for calculations. Size_param, * the maximum number of elements in any datasets that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in * size_func and size_map are the maximum number of parameters, functions and maps used in
@ -87,14 +101,16 @@ public:
*/ */
int initChiSquare(int size_data, int size_param, int size_func, int size_map); int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square. /**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps * Frees the chisq temporary memory and memory for params, functions and maps
*/ */
int freeChiSquare(); int freeChiSquare();
/** Check MuSR kernels for necessary resources. /**
* Check MuSR kernels for necessary resources.
* Query device properties to get if sufficient resources are * Query device properties to get if sufficient resources are
* available to run the kernels * available to run the kernels. Also checks if double precission is enabled on the device.
*/ */
int checkChiSquareKernels(int fitType, int &threadsPerBlock); int checkChiSquareKernels(int fitType, int &threadsPerBlock);

View File

@ -17,12 +17,16 @@
#include "boost/compute/core.hpp" #include "boost/compute/core.hpp"
*/ */
/** Double3 structure for use in OpenCL code. */
typedef struct { typedef struct {
double x; double x;
double y; double y;
double z; double z;
} Double3; } Double3;
/**
* Structure for stroing particles in OpenCL code.
*/
typedef struct { typedef struct {
int label; int label;
unsigned localID; unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z)); //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol)); //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
/**
* OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
*/
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics { class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
private: private:
@ -42,16 +50,20 @@ private:
public: public:
/* constructor */ /**
* Constructor with OpenCLBase as argument.
* Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
*/
OpenCLCollimatorPhysics(OpenCLBase *base) { OpenCLCollimatorPhysics(OpenCLBase *base) {
m_oclbase = base; m_oclbase = base;
} }
/* destructor */ /**
* Destructor.
*/
~OpenCLCollimatorPhysics() { ~OpenCLCollimatorPhysics() {
} }
/* execute degrader code on device */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherforScattering = true); bool enableRutherforScattering = true);

View File

@ -1,14 +1,3 @@
/*
Name: OpenCLFFT
Author: Uldis Locans
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
Data: 19.09.2014
*/
#ifndef H_OPENCL_FFT #ifndef H_OPENCL_FFT
#define H_OPENCL_FFT #define H_OPENCL_FFT
@ -22,6 +11,12 @@
#include "clFFT.h" #include "clFFT.h"
/**
* OpenCL FFT class based on BaseFFT interface.
* Uses clFFT library to perform FFTs on AMD gpus.
* clFFT library works also on nvida GPUs and other devices that
* support OpenCL.
*/
class OpenCLFFT : public BaseFFT { class OpenCLFFT : public BaseFFT {
private: private:

View File

@ -7,6 +7,7 @@
#include "../Algorithms/GreensFunction.h" #include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h" #include "OpenCLBase.h"
/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
class OpenCLGreensFunction : public GreensFunction { class OpenCLGreensFunction : public GreensFunction {
private: private:
@ -31,7 +32,7 @@ public:
int buildProgram(); int buildProgram();
/** /**
Info: calc itegral on device memory (taken from OPAL src code) Info: calc itegral on device memory (taken from OPAL src code).
Return: success or error code Return: success or error code
*/ */
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
@ -39,20 +40,20 @@ public:
int streamId = -1); int streamId = -1);
/** /**
Info: integration of rho2_m field (taken from OPAL src code) Info: integration of rho2_m field (taken from OPAL src code).
Return: success or error code Return: success or error code
*/ */
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1); int streamId = -1);
/** /**
Info: mirror rho field (taken from OPAL src code) Info: mirror rho field (taken from OPAL src code).
Return: succes or error code Return: succes or error code
*/ */
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1); int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/** /**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1 Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
Return: success or error code Return: success or error code
*/ */
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);

View File

@ -5,6 +5,10 @@
#include <string> #include <string>
#include <sys/time.h> #include <sys/time.h>
/**
* Custom timer class.
* Allows to insert timers in the code to get function exectution times.
*/
class DKSTimer { class DKSTimer {
private: private:
@ -17,39 +21,45 @@ private:
public: public:
/** Init DKSTimer by seting timer to zero */ /** Init DKSTimer by seting timer to zero. */
DKSTimer(); DKSTimer();
~DKSTimer(); ~DKSTimer();
/** Init the timer /**
* Set the name for timer and clear all values * Init the timer.
* Set the name for timer and clear all values
*/ */
void init(std::string n); void init(std::string n);
/** Start the timer. /**
* Get the curret time with gettimeofday and save in timeStart * Start the timer.
* Get the curret time with gettimeofday and save in timeStart
*/ */
void start(); void start();
/** Stop the timer /**
* Get the curretn time with gettimeofday and save in timeEnd * Stop the timer.
* Calculate elapsed time by timeEnd - timeStart and add to timervalue * Get the curretn time with gettimeofday and save in timeEnd
* Calculate elapsed time by timeEnd - timeStart and add to timervalue
*/ */
void stop(); void stop();
/** Reset timervalue to zero. /**
* Set timervalue, timeStart and timeEnd to zero * Reset timervalue to zero.
* Set timervalue, timeStart and timeEnd to zero
*/ */
void reset(); void reset();
/** Return elapsed time in seconds. /**
* Return the value of timervalue * Return elapsed time in seconds.
* Return the value of timervalue
*/ */
double gettime(); double gettime();
/** Print timer. /**
* Print the elapsed time of the timer * Print timer.
* Print the elapsed time of the timer
*/ */
void print(); void print();