diff --git a/src/Algorithms/ChiSquareRuntime.h b/src/Algorithms/ChiSquareRuntime.h index 85e8d9f..68e794b 100644 --- a/src/Algorithms/ChiSquareRuntime.h +++ b/src/Algorithms/ChiSquareRuntime.h @@ -15,6 +15,9 @@ class DKSBaseMuSR; +/** + * Interface to implement ChiSquareRuntime class for musrfit. + */ class ChiSquareRuntime { friend class DKSBaseMuSR; @@ -63,23 +66,54 @@ public: /** Default constructor */ //ChiSquareRuntime(); - /** Default destructor */ + /** Default destructor. */ virtual ~ChiSquareRuntime() { }; + /** + * Compile GPU programm generated at runtime. + */ virtual int compileProgram(std::string function, bool mlh = false) = 0; + + /** + * Launche the compiled chiSquare kernel. + */ virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, int numpar, int numfunc, int nummap, double timeStart, double timeStep, double &result) = 0; + /** + * Write the parameter values to the GPU. + */ virtual int writeParams(const double *params, int numparams) = 0; + + /** + * Write the function values to the GPU. + */ virtual int writeFunc(const double *func, int numfunc) = 0; + + /** + * Write map values to the GPU. + */ virtual int writeMap(const int *map, int nummap) = 0; + + /** + * Allocate temporary memory needed for the chi square calucaltios on the device. + */ virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0; + + /** + * Free device memory allocated for chi square calculations. + */ virtual int freeChiSquare() = 0; + + /** + * Check if available device can run the chi square GPU code. + */ virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0; - /** Set N0, tau and bgk values to use for the kernel. + /** + * Set N0, tau and bgk values to use for the kernel. * If values changes between data sets this needs to be called before * every kernel call. Returns DKS_SUCCESS. */ @@ -91,7 +125,8 @@ public: return DKS_SUCCESS; } - /** Set alpha and beta values to use for the kernel. + /** + * Set alpha and beta values to use for the kernel. * If values changes between data sets this needs to be called before * every kernel call. Returns DKS_SUCCESS. */ @@ -101,8 +136,9 @@ public: return DKS_SUCCESS; } - /** Set number of blocks and threads. - * Used to set parameters obtained from auto-tuning + /** + * Set number of blocks and threads. + * Used to set parameters obtained from auto-tuning */ int setKernelParams(int numBlocks, int blockSize) { int ierr = DKS_ERROR; @@ -118,8 +154,9 @@ public: return ierr; } - /** Get the number of operations in compiled kernel. - * Count the number of operation in the ptx file for the compiled program. + /** + * Get the number of operations in compiled kernel. + * Count the number of operation in the ptx file for the compiled program. */ int getOperations(int &oper) { diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h index b6c4c61..cbc497d 100644 --- a/src/Algorithms/CollimatorPhysics.h +++ b/src/Algorithms/CollimatorPhysics.h @@ -5,6 +5,9 @@ #include #include "../DKSDefinitions.h" +/** + * Interface to impelment particle matter interaction for OPAL. + */ class DKSCollimatorPhysics { protected: @@ -15,29 +18,61 @@ protected: public: virtual ~DKSCollimatorPhysics() { } - + + /** + * Execute collimator physics kernel. + * + */ virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, bool enableRutherforScattering = true) = 0; + /** + * Special calse CollimatorPhysics kernel that uses SoA instead of AoS. + * Used only on the MIC side, was not implemented on the GPU. + */ virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, void *par_ptr, int numparticles) = 0; + /** + * Sort particle array on GPU. + * Count particles that are dead (label -1) or leaving material (label -2) and sort particle + * array so these particles are at the end of array + */ virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0; + /** + * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS. + * Used only on the MIC side, was not implemented on the GPU. + */ virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, void *par_ptr, int numparticles, int &numaddback) = 0; + /** + * BorisPusher push function for integration from OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, double dt, double c, bool usedt = false, int streamId = -1) = 0; + /** + * BorisPusher kick function for integration from OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, void *bf_ptr, void *dt_ptr, double charge, double mass, int npart, double c, int streamId = -1) = 0; + /** + * BorisPusher push function with transformto function form OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr, double dt, double c, bool usedt = false, diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h index 6af8d35..6bf12fc 100644 --- a/src/Algorithms/FFT.h +++ b/src/Algorithms/FFT.h @@ -6,12 +6,21 @@ #include "../DKSDefinitions.h" +/** + * Abstract class defining methods for DKS FFT class. + * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes. + */ class BaseFFT { protected: int defaultN[3]; int defaultNdim; + /** + * Check if FFT plan is created for the needed dimension and FFT size. + * Returns true if the plan has been created and false if no plan for specified dimension + * and size exists. + */ bool useDefaultPlan(int ndim, int N[3]) { if (ndim != defaultNdim) return false; @@ -24,18 +33,57 @@ public: virtual ~BaseFFT() { } + /** Setup FFT - init FFT library used by chosen device. */ virtual int setupFFT(int ndim, int N[3]) = 0; + + /** Setup real to complex FFT - init FFT library used by chosen device. */ virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; + + /** Setup real to complex complex to real FFT - init FFT library used by chosen device. */ virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0; + + /** Clean up. */ virtual int destroyFFT() = 0; + + /** + * Exectute C2C FFT. + * mem_ptr - memory ptr on the device for complex data. + * Performs in place FFT. + */ virtual int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true) = 0; + + /** + * Exectute inverse C2C FFT. + * mem_ptr - memory ptr on the device for complex data. + * Performs in place FFT. + */ virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + + /** + * Normalize the FFT or IFFT. + * mem_ptr - memory to complex data. + */ virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + + /** + * Exectute R2C FFT. + * real_ptr - real input data for FFT, comp_ptr - memory on the device where + * results for the FFT are stored as complex numbers. + */ virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1) = 0; + + /** + * Exectute C2R FFT. + * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT. + */ virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1) = 0; + + /** + * Normalize CR FFT. + */ virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0; }; diff --git a/src/Algorithms/GreensFunction.h b/src/Algorithms/GreensFunction.h index 45674ec..8cc8ccc 100644 --- a/src/Algorithms/GreensFunction.h +++ b/src/Algorithms/GreensFunction.h @@ -4,24 +4,27 @@ #include #include +/** + * Interface to implement Greens function calculations for OPAL. + */ class GreensFunction { public: virtual ~GreensFunction() { } - /** calc greens integral, as defined in OPAL */ + /** calc greens integral, as defined in OPAL. */ virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0; - /** integration if rho2_m, see OPAL for more details */ + /** integration if rho2_m, see OPAL for more details. */ virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, int streamId = -1) = 0; - /** mirror rho2_m field */ + /** mirror rho2_m field. */ virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0; - /** multiply two complex fields from device memory */ + /** multiply two complex fields from device memory. */ virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0; }; diff --git a/src/Algorithms/ImageReconstruction.h b/src/Algorithms/ImageReconstruction.h index 3a6266e..1c63082 100644 --- a/src/Algorithms/ImageReconstruction.h +++ b/src/Algorithms/ImageReconstruction.h @@ -5,17 +5,22 @@ #define BLOCK_SIZE 128 +/** Struct to hold voxel position for PET image. */ struct VoxelPosition { float x; float y; float z; }; +/** Struct that holds pair of detectors that registered an envent. */ struct ListEvent { unsigned detA : 16; unsigned detB : 16; }; +/** + * Interface to implement PET image reconstruction. + */ class ImageReconstruction { protected: @@ -25,7 +30,8 @@ public: virtual ~ImageReconstruction() { } - /** Caluclate source. + /** + * Caluclate source. * Places a sphere at each voxel position and calculate the avg value and std value of pixels * that are inside this sphere. All the sphere used have the same diameter. */ @@ -33,7 +39,8 @@ public: void *avg, void *std, float diameter, int total_voxels, int total_sources, int start = 0) = 0; - /** Calculate background. + /** + * Calculate background. * Places two sphere at each voxel position, calculates the avg value and std value of pixels * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter. @@ -42,7 +49,8 @@ public: void *avg, void *std, float diameter, int total_voxels, int total_sources, int start = 0) = 0; - /** Caluclate source using differente sources. + /** + * Caluclate source using differente sources. * Places two sphere at each voxel position, calculates the avg value and std value of pixels * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * each sphere is given by *diameter array. @@ -52,7 +60,7 @@ public: int total_sources, int start = 0) = 0; /** - * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * Places two sphere at each voxel position, calculates the avg value and std value of pixels. * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the * smaller sphere. @@ -61,7 +69,8 @@ public: void *avg, void *std, void *diameter, int total_voxels, int total_sources, int start = 0) = 0; - /** Generate normalization. + /** + * Generate normalization. * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel * that updates voxel values in the image on the slope between these two detectors. */ @@ -69,14 +78,16 @@ public: void *det_position, int total_det) = 0; - /** Calculate forward projection. + /** + * Calculate forward projection. * For image reconstruction calculates forward projections. * see recon.cpp for details */ virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, void *image_position, int num_events) = 0; - /** Calculate backward projection. + /** + * Calculate backward projection. * For image reconstruction calculates backward projections. * see recon.cpp for details */ @@ -84,29 +95,29 @@ public: void *det_position, void *image_position, int num_events, int num_voxels) = 0; - /** Set the voxel dimensins on device. - * + /** + *Set the voxel dimensins on device. */ virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0; - /** Set the image edge variables on the device. - * + /** + * Set the image edge variables on the device. */ virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0; - /** Set the image edge1 on the device. - * + /** + * Set the image edge1 on the device. */ virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0; - /** Set the minimum crystan in one ring values on the device. - * + /** + * Set the minimum crystan in one ring values on the device. */ virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1) = 0; - /** Set all other required parameters for reconstruction. - * + /** + * Set all other required parameters for reconstruction. */ virtual int setParams(float matrix_distance_factor, float phantom_diameter, float atten_per_mm, float ring_diameter) = 0; diff --git a/src/AutoTuning/DKSAutoTuning.h b/src/AutoTuning/DKSAutoTuning.h index ca8f3a3..c14e7c1 100644 --- a/src/AutoTuning/DKSAutoTuning.h +++ b/src/AutoTuning/DKSAutoTuning.h @@ -18,6 +18,17 @@ typedef std::vector Parameters; typedef std::vector States; +/** + * DKS autotuning class, allows to auto-tune the defince function. + * Executes the defined function for auto-tuning and searches for optimal parameters to improve + * the function execution time. The function that is auto-tuned, parameters and the ranges + * need to be set. Includes multiple search methods, that searches the parameter space to finde + * the optimal solution. + * 1) exaustive search + * 2) line search + * 3) hill climbimg + * 4) simulated annealing + */ class DKSAutoTuning { private: @@ -36,12 +47,13 @@ private: int loops_m; - /** Update parameters from a state */ + /** Update parameters from a state. */ int setParameterValues(States states); - /** Evaluate the function and set execution time - * Returns DKS_ERROR if errors occured during function execution. - * Returns DKS_SUCCESS if function executed as planned. + /** + * Evaluate the function and set execution time + * Returns DKS_ERROR if errors occured during function execution. + * Returns DKS_SUCCESS if function executed as planned. */ int evaluateFunction(double &value); @@ -50,12 +62,13 @@ public: /** Constructor */ DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100); - /** Destructor */ + /** Destructor. */ ~DKSAutoTuning(); - /** Set function to auto tune. - * Caller of setFunction is responsible to bind the correct parameters - * to the function with std::bind. + /** + * Set function to auto tune. + * Caller of setFunction is responsible to bind the correct parameters + * to the function with std::bind. */ void setFunction(std::function f, std::string name, bool evaluate_time = true) { f_m = f; @@ -63,15 +76,21 @@ public: evaluate_time_m = evaluate_time; } + /** + * Set function to auto tune. + * Caller of setFunction is responsible to bind the correct parameters + * to the function with std::bind. + */ void setFunction(std::function f, std::string name, bool evaluate_time = false) { fd_m = f; function_name_m = name; evaluate_time_m = evaluate_time; } - /** Set parameter for auto tuning. - * Provide a pointer to a parameter that will be changed during auto-tuning - * and a min-max value for this element + /** + * Set parameter for auto tuning. + * Provide a pointer to a parameter that will be changed during auto-tuning + * and a min-max value for this element */ template void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) { @@ -85,9 +104,9 @@ public: /** Perform exaustive search evaluating all the parameter configurations */ void exaustiveSearch(); - /** Perform auto-tuning. - * Perform line-search auto-tuning by variying parameters one at a time and keeping other - * parameters constant. + /** + * Perform line-search auto-tuning by variying parameters one at a time. + * After one parameter is auto-tuned the next on is varied */ void lineSearch(); diff --git a/src/AutoTuning/DKSAutoTuningTester.h b/src/AutoTuning/DKSAutoTuningTester.h index 9c44309..96de6ba 100644 --- a/src/AutoTuning/DKSAutoTuningTester.h +++ b/src/AutoTuning/DKSAutoTuningTester.h @@ -4,6 +4,7 @@ #include #include +/** Tester class for auto-tuning search algorithms. */ class DKSAutoTuningTester { friend class DKSBaseMuSR; diff --git a/src/AutoTuning/DKSConfig.h b/src/AutoTuning/DKSConfig.h index 037a282..c747067 100644 --- a/src/AutoTuning/DKSConfig.h +++ b/src/AutoTuning/DKSConfig.h @@ -1,9 +1,3 @@ -/** Class to save and load DKS autotunning configs. - * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml. - * Uses boost xml_parser to read and write the xml file and boost property tree to store - * the xml content. - */ - #ifndef DKS_CONFIG #define DKS_CONFIG @@ -29,6 +23,13 @@ namespace pt = boost::property_tree; const std::string config_dir = "/.config/DKS"; const std::string config_file = "/autotuning.xml"; +/** Class to save and load DKS autotunning configs. + * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml. + * Uses boost xml_parser to read and write the xml file and boost property tree to store + * the xml content. + * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved + * so the auto-tuning has no effect. + */ class DKSConfig { private: diff --git a/src/AutoTuning/DKSSearchStates.h b/src/AutoTuning/DKSSearchStates.h index cdd8fb0..dbc8d23 100644 --- a/src/AutoTuning/DKSSearchStates.h +++ b/src/AutoTuning/DKSSearchStates.h @@ -9,6 +9,9 @@ enum VALUE_TYPE { DKS_INT, DKS_DOUBLE }; +/** + * Parameter class allows to change the searchable parameters during the auto-tuning. + */ class Parameter { private: @@ -64,6 +67,10 @@ public: }; +/** + * Struct to hold a auto-tuning state. + * Holds the current value, min, max and a step to witch a state can change. + */ struct State { double value; double min; @@ -74,6 +81,12 @@ struct State { typedef std::vector Parameters; typedef std::vector States; +/** + * Used by auto-tuning search algorithms to move between parameter configurations. + * Allows to move from one parameter stat to another, get neighboring states, + * move to neighboring states and save state information. Print functions are available + * for debugging purposes, to follow how algorithm muves between sates. + */ class DKSSearchStates { private: diff --git a/src/CUDA/CudaBase.cu b/src/CUDA/CudaBase.cu index 6317567..51f2729 100644 --- a/src/CUDA/CudaBase.cu +++ b/src/CUDA/CudaBase.cu @@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) { return DKS_SUCCESS; } - -/* - Info: allcate memory and write data (push) - Return: pointer to memory object -*/ -/* - void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) { - - void * mem_ptr; - mem_ptr = cuda_allocateMemory(size, ierr); - - if (ierr == DKS_SUCCESS) - ierr = cuda_writeData(mem_ptr, in_data, size); - - return mem_ptr; - } -*/ - -/* - Info: read data and free memory (pull) - Return: success or error code -*/ -/* - int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) { - - ierr = cuda_readData(mem_ptr, out_data, size); - if (ierr == DKS_SUCCESS) - ierr = cuda_freeMemory(mem_ptr); - else - return DKS_ERROR; - - - if (ierr == DKS_SUCCESS) - return DKS_SUCCESS; - else - return DKS_ERROR; - } -*/ - -/* - Info: execute function - Return: success or error code -*/ -int CudaBase::cuda_executeFunction() { - - std::cout << "Execute function" << std::endl; - return DKS_SUCCESS; -} - -/* - Info: clean up - Return: success or error code -*/ -int CudaBase::cuda_cleanUp() { - - std::cout << "clean up" << std::endl; - return DKS_SUCCESS; - -} diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh index 8c8dbd6..ce1da4b 100644 --- a/src/CUDA/CudaBase.cuh +++ b/src/CUDA/CudaBase.cuh @@ -16,6 +16,11 @@ #define BLOCK_SIZE 128 +/** + * CUDA base class handles device setup and basic communication with the device. + * Handles devicew setup, memory manegement, data transfers and stream setup for + * asynchronous data transfers and kernel executions. + */ class CudaBase { private: @@ -48,17 +53,17 @@ public: /** * Delete curandState. * Delete curandState array on the GPU and free memory. - * Return success or error code + * Return success or error code */ int cuda_deleteCurandStates(); - /** Create 'size' random numbers on the device and save in mem_ptr array - * + /** + * Create 'size' random numbers on the device and save in mem_ptr array. */ int cuda_createRandomNumbers(void *mem_ptr, int size); - /** Get a pointer to curand states - * + /** + * Get a pointer to curand states. */ curandState* cuda_getCurandStates(); @@ -75,93 +80,98 @@ public: int cuda_addStream(cudaStream_t tmpStream, int &streamId); /** - * delete cuda stream + * delete cuda stream. * success or error code */ int cuda_deleteStream(int id); /** - * delete all streams + * delete all streams. * success or error code */ int cuda_deleteStreams(); /** - * set stream to use + * set stream to use. * success or error code */ int cuda_setStream(int id); /** - * Info: get stream that is used - * Return: return id of curretn stream + * get stream that is used. + * Return: return id of curretn stream */ int cuda_getStreamId(); /** - * Info: reset to default stream + * reset to default stream. * Return: success or error code */ int cuda_defaultStream(); /** - * Info: get number of streams + * get number of streams. * Return: success or error code */ int cuda_numberOfStreams(); /** - * Info: get stream + * get stream. * Return: stream */ cudaStream_t cuda_getStream(int id); /** - * Get default cublass handle + * Get default cublass handle. */ cublasHandle_t cuda_getCublas(); /** - * Info: get information on cuda devices + * get information on cuda devices. * Return: success or error code */ int cuda_getDevices(); - /** Get CUDA device count. - * Sets the number of devices on the platform that can use CUDA. - * Returns DKS_SUCCESS + /** + * Get CUDA device count. + * Sets the number of devices on the platform that can use CUDA. */ int cuda_getDeviceCount(int &ndev); - /** Get the name of the device. - * QUery the device properties of the used device and set the string device_name + /** + * Get the name of the device. + * QUery the device properties of the used device and set the string device_name */ int cuda_getDeviceName(std::string &device_name); - /** Set CUDA device to use. - * If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR + /** + * Set CUDA device to use. + * If device passed in is larger than the number of devices use + * the default:0 and return DKS_ERROR */ int cuda_setDevice(int device); - /** Get unique devices - * Get array of indeces with the unique CUDA devices available on the paltform + /** + * Get unique devices. + * Get array of indeces with the unique CUDA devices available on the paltform */ int cuda_getUniqueDevices(std::vector &devices); /** - * Info: init device + * Initialize connection to the device. + * Only needed when runtime compilation is used. * Return: success or error code */ int cuda_setUp(); /** - * Info: allocate memory on cuda device + * Allocate memory on cuda device. * Return: pointer to memory object */ void * cuda_allocateMemory(size_t size, int &ierr); /** - * Info: allocate host memory in pinned memory + * Allocate host memory in pinned memory * Return: success or error code */ template @@ -174,8 +184,9 @@ public: return DKS_SUCCESS; } - /** Zero CUDA memory. - * Set all the elements of the array on the device to zero. + /** + * Zero CUDA memory. + * Set all the elements of the array on the device to zero. */ template int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) { @@ -189,8 +200,9 @@ public: return DKS_SUCCESS; } - /** Zero CUDA memory. - * Set all the elements of the array on the device to zero. + /** + * Zero CUDA memory async. + * Set all the elements of the array on the device to zero. */ template int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) { @@ -209,7 +221,7 @@ public: } /** - * Info: write data to memory + * Write data to memory * Retrun: success or error code */ template @@ -226,7 +238,7 @@ public: } /** - * Info: write data assynchonuously + * Write data assynchonuously * Return: success or error code */ template @@ -258,7 +270,7 @@ public: } /** - * Info: read data from memory + * Read data from memory * Return: success or error code */ template @@ -275,7 +287,7 @@ public: } /** - * Info: read data async from device memory + * Read data async from device memory * Return: success or error code */ template @@ -307,19 +319,19 @@ public: } /** - * Info: free memory on device + * Free memory on device * Return: success or error code */ int cuda_freeMemory(void * mem_ptr); /** - * Info: free page locked memory on host + * Free page locked memory on host * Return: success or erro code */ int cuda_freeHostMemory(void * mem_ptr); /** - * Info: allcate memory and write data (push) + * Allcate memory and write data (push) * Return: pointer to memory object */ template @@ -335,7 +347,7 @@ public: } /** - * Info: read data and free memory (pull) + * Read data and free memory (pull) * Return: success or error code */ template @@ -353,21 +365,10 @@ public: else return DKS_ERROR; } - - /** - * Info: execute function - * Return: success or error code - */ - int cuda_executeFunction(); - - /** - * Info: clean up - * Return: success or error code - */ - int cuda_cleanUp(); /** - * Info: sync cuda device + * Sync cuda device. + * Waits till all the tasks on the GPU are finished. * Return: success or error code */ int cuda_syncDevice() { @@ -376,7 +377,7 @@ public: } /** - * Page-lock host memory + * Page-lock host memory. */ template int cuda_hostRegister(T *ptr, int size) { @@ -390,7 +391,7 @@ public: } /** - * Release page locked memory + * Release page locked memory. */ template int cuda_hostUnregister(T *ptr) { @@ -403,7 +404,7 @@ public: } /** - * Info: print device memory info (total, used, avail) + * Print device memory info (total, used, avail) * Return: success or error code */ int cuda_memInfo() { diff --git a/src/CUDA/CudaChiSquare.cuh b/src/CUDA/CudaChiSquare.cuh index 588dec5..692362c 100644 --- a/src/CUDA/CudaChiSquare.cuh +++ b/src/CUDA/CudaChiSquare.cuh @@ -8,6 +8,7 @@ #include "CudaBase.cuh" +/** Deprecated, CUDA simpleFit implementation of ChiSquare. */ class CudaChiSquare { private: diff --git a/src/CUDA/CudaChiSquareRuntime.cuh b/src/CUDA/CudaChiSquareRuntime.cuh index 79a9af5..581132e 100644 --- a/src/CUDA/CudaChiSquareRuntime.cuh +++ b/src/CUDA/CudaChiSquareRuntime.cuh @@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double const std::string cudaFunctFooter = "}\n"; +/** + * CUDA implementation of ChiSquareRuntime class. + * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU. + */ class CudaChiSquareRuntime : public ChiSquareRuntime{ private: @@ -29,65 +33,72 @@ private: cublasHandle_t defaultCublasRT; - /** Setup to init device - * Create context and init device for RT compilation + /** + * Setup to init device. + * Create context and init device for RT compilation */ void setUpContext(); - /** Private function to add function to kernel string - * + /** + * Private function to add function to kernel string. */ std::string buildProgram(std::string function); public: - /** Constructor with CudaBase argument - * + /** + * Constructor with CudaBase argument */ CudaChiSquareRuntime(CudaBase *base); - /** Default constructor init cuda device - * + /** + * Default constructor init cuda device */ CudaChiSquareRuntime(); - /** Default destructor - * + /** + * Default destructor. */ ~CudaChiSquareRuntime(); - /** Compile program and save ptx. + /** + * Compile program and save ptx. * Add function string to the calcFunction kernel and compile the program * Function must be valid C math expression. Parameters can be addressed in * a form par[map[idx]] */ int compileProgram(std::string function, bool mlh = false); - /** Launch selected kernel + /** + * Launch selected kernel. * Launched the selected kernel from the compiled code. - * Result is put in &result variable + * Result is put in &result variable. */ int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, int numpar, int numfunc, int nummap, double timeStart, double timeStep, double &result); - /** Write params to device. + /** + * Write params to device. * Write params from double array to mem_param_m memory on the device. */ int writeParams(const double *params, int numparams); - /** Write functions to device. + /** + * Write functions to device. * Write function values from double array to mem_func_m memory on the device. */ int writeFunc(const double *func, int numfunc); - /** Write maps to device. + /** + * Write maps to device. * Write map values from int array to mem_map_m memory on the device. */ int writeMap(const int *map, int nummap); - /** Allocate temporary memory needed for chi square. + /** + * Allocate temporary memory needed for chi square. * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * the maximum number of elements in any datasets that will be used for calculations. Size_param, * size_func and size_map are the maximum number of parameters, functions and maps used in @@ -96,14 +107,16 @@ public: int initChiSquare(int size_data, int size_param, int size_func, int size_map); - /** Free temporary memory allocated for chi square. + /** + * Free temporary memory allocated for chi square. * Frees the chisq temporary memory and memory for params, functions and maps */ int freeChiSquare(); - /** Check if CUDA device is able to run the chi square kernel. - * Redundant - all new CUDA devices that support RT compilation will also support - * double precision, there are no other requirements to run chi square on GPU + /** + * Check if CUDA device is able to run the chi square kernel. + * Redundant - all new CUDA devices that support RT compilation will also support + * double precision, there are no other requirements to run chi square on GPU */ int checkChiSquareKernels(int fitType, int &threadsPerBlock) { return DKS_SUCCESS; diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu index 4aca093..2c600d5 100644 --- a/src/CUDA/CudaCollimatorPhysics.cu +++ b/src/CUDA/CudaCollimatorPhysics.cu @@ -1,5 +1,6 @@ #include "CudaCollimatorPhysics.cuh" +//constants used in OPAL //#define M_P 0.93827231e+00 #define M_P 0.93827204e+00 #define C 299792458.0 @@ -11,6 +12,7 @@ #define Z_P 1 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 +//parameter array indexes #define POSITION 0 #define ZSIZE 1 #define RHO_M 2 @@ -28,12 +30,18 @@ #define BLOCK_SIZE 128 #define NUMPAR 13 +/** + * CUDA device function for calculating dot product. + */ __device__ inline double dot(double3 &d1, double3 &d2) { return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); } +/** + * CUDA devce function to calculate cross product. + */ __device__ inline double3 cross(double3 &lhs, double3 &rhs) { double3 tmp; tmp.x = lhs.y * rhs.z - lhs.z * rhs.y; @@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) { return tmp; } +/** + * CUDA device function to calculate arbitrary rotation. + */ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) { double c=cos(Theta); double s=sin(Theta); @@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th return tmp; } +/** + * CUDA device function to check if particle is still in material. + * z - particle position, par - parameter array. Particle is considered inside the + * material if z is > material starting position and z < material starting position - mat size. + */ __device__ inline bool checkHit(double &z, double *par) { /* check if particle is in the degrader material */ @@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) { } +/** + * CUDA device function to calculate energyLoss for one particle. + * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss + * algorith are available in OPAL user guide. + */ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) { @@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, } +/** + * CUDA device function for rotation in 2 dimensions. + * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012), + * "Passage of particles through matter" + */ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, double &normP, double &thetacou, double &deltas, int coord, double *par) @@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); } +/** + * CUDA device function to calculate Coulomb scattering for one particle. + * Including Multiple Coulomb Scattering and large angle Rutherford Scattering. + * For details on the algorithm see OPAL user guide. + */ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par, bool enableRutherfordScattering) { @@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d } +/** + * CUDA kernel that performs one step in particle movement trough mater. + * One thread is launched for each particle in the simulation. The kernel checks if the particle + * is still in the material, performs energy loss caluclations and Coulomb scattering, and marks + * particles that are exiting the material. + * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL + * @param[in] *par array of material properties, always constant size - 13 + * @param[in] *state array holding cuRand states to preserve states between kernel launches + * @param[in] numparticles number of particles in the simulation + * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering + */ template __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, int numparticles, bool enableRutherfordScattering) @@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state volatile int tid = threadIdx.x; volatile int idx = blockIdx.x * blockDim.x + tid; - //transfer params to shared memory + //transfer params and particle positions to shared memory + //R is kept in shared memory in order to reduce register pressure for the kernel extern __shared__ double smem[]; double *p = (double*)smem; - double3 *R = (double3*)&smem[NUMPAR]; + double3 *R = (double3*)&smem[NUMPAR]; - curandState s; + curandState s; //each tread gets its own cuRand state for random number generation double3 P; + //load parameters to shared memory for (int tt = tid; tt < NUMPAR; tt += blockDim.x) p[tt] = par[tt]; + //sync threads to ensure that parameters are finished loading __syncthreads(); + //there might be some empty threads that do no work if (idx < numparticles) { - s = state[idx]; - R[tid] = data[idx].Rincol; - P = data[idx].Pincol; + s = state[idx]; //load cuRand state to local memory + R[tid] = data[idx].Rincol; //load position to shared memory + P = data[idx].Pincol; //load momentum to local memory bool pdead = false; volatile double sq = sqrt(1.0 + dot(P, P)); double Eng; + //check if particle is still in the material if (checkHit(R[tid].z, p)) { + //calculate enery loss Eng = (sq - 1) * M_P; energyLoss(Eng, pdead, s, p); + //check if particle is not dead if (!pdead) { double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; sq = sqrt(dot(P, P)); + //caluclate Coulomb scattering P.x = P.x * ptot / sq; P.y = P.y * ptot / sq; P.z = P.z * ptot / sq; coulombScat(R[tid], P, s, p, enableRutherfordScattering); + //update particle momentum data[idx].Pincol = P; } else { + //mark particle as dead (-1) data[idx].label = -1; } - + + //update cuRand state state[idx] = s; } else { - + //particle exits material - drift and mark as exiting (-2) R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; @@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state } + //update particle position data[idx].Rincol = R[tid]; } } -__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, - curandState *state, int numparticles, - bool enableRutherfordScattering) +/** + * CUDA kernel that performs one step in particle movement trough mater using SoA particles. + * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays. + * Deprecated - GPU version does not use SoA. + * @param[in] data structure of arrays containing particle data + * @param[in] *par array of material properties, always constant size - 13 + * @param[in] *state array holding cuRand states to preserve states between kernel launches + * @param[in] numparticles number of particles in the simulation + * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering + */ +__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, + curandState *state, int numparticles, + bool enableRutherfordScattering) { //get global id and thread id @@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, } - +/** + * Device function to swich off unitless positions. + */ inline __device__ void unitlessOff(double3 &a, const double &c) { a.x *= c; a.y *= c; a.z *= c; } +/** + * Device function to swich on unitless positions. + */ inline __device__ void unitlessOn(double3 &a, const double &c) { a.x /= c; a.y /= c; a.z /= c; } -//swithch to unitless positions with dtc -__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { - - volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < npart) { - double3 R = gR[idx]; - double3 X = gX[idx]; - - unitlessOn(R, dtc); - unitlessOn(X, dtc); - - gR[idx] = R; - gX[idx] = X; - } - -} - -//swithc to unitless positions with dt*c -__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { - - volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < npart) { - double3 R = gR[idx]; - double3 X = gX[idx]; - double dt = gdt[idx]; - - unitlessOff(R, dt*c); - unitlessOff(X, dt*c); - - gR[idx] = R; - gX[idx] = X; - } -} - -//swithc off unitless positions with dtc -__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { - - volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < npart) { - double3 R = gR[idx]; - double3 X = gX[idx]; - - unitlessOff(R, dtc); - unitlessOff(X, dtc); - - gR[idx] = R; - gX[idx] = X; - } - -} - -//switch off unitelss positions with dt*c -__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { - - volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < npart) { - double3 R = gR[idx]; - double3 X = gX[idx]; - double dt = gdt[idx]; - - unitlessOff(R, dt*c); - unitlessOff(X, dt*c); - - gR[idx] = R; - gX[idx] = X; - } -} - +/** + * CUDA kernel to perform particle push. + * @param[in] *gR array of particle positions + * @param[in] *gP array of particle momentums + * @param[in] npart number of particles + * @param[in] dtc dt*c + */ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { //get global id and thread id @@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { } } - +/** + * CUDA kernel to perform particle push. + * @param[in] *gR array of particle positions + * @param[in] *gP array of particle momentums + * @param[in] *gdt array of time steps for each particle + * @param[in] npart number of particles + * @param[in] c speed of light + */ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) { //get global id and thread id @@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou } } +/** + * CUDA kernel to perform particle kick. + * @param[in] *gR array of particle positions + * @param[in] *gP array of particle momentums + * @param[in] *gEf + * @param[in] *gBf + * @param[in] *gdt array of time steps for each particle + * @param[in] npart number of particles + * @param[in] c speed of light + */ __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, double3 *gBf, double *gdt, double charge, double mass, int npart, double c) @@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection } -struct compare_particle -{ - int threshold; - - compare_particle() { - threshold = 0; - } - - void set_threshold(int t) { - threshold = t; - } - - __host__ __device__ - bool operator()(CUDA_PART p1, CUDA_PART p2) { - return p1.label > p2.label; - } - - __host__ __device__ - bool operator()(CUDA_PART p1) { - return p1.label < threshold; - } -}; - - -struct compare_particle_small -{ - int threshold; - - compare_particle_small() { - threshold = 0; - } - - void set_threshold(int t) { - threshold = t; - } - - __host__ __device__ - bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) { - return p1.label > p2.label; - } - - __host__ __device__ - bool operator()(CUDA_PART_SMALL p1) { - return p1.label < threshold; - } -}; - - -struct less_then -{ - __host__ __device__ - bool operator()(int x) - { - return x < 0; - } -}; - int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, bool enableRutherfordScattering) { diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh index fe2f275..fda8f46 100644 --- a/src/CUDA/CudaCollimatorPhysics.cuh +++ b/src/CUDA/CudaCollimatorPhysics.cuh @@ -20,7 +20,8 @@ #include "CudaBase.cuh" /** - * Structure for storing particle on GPU + * Structure for storing particle on GPU or MIC as AoS. + * Structure for OPAL particle, can be used to store particles on the GPU in array of structures. */ typedef struct __align__(16) { int label; @@ -37,7 +38,10 @@ typedef struct __align__(16) { } CUDA_PART; /** - * Structure for storing particle on GPU + * Structure for storing particle on GPU as AoS + * Structure for OPAL particle, can be used to store particles on the GPU in array of structures, + * contains only data that are used by the GPU kernels, the rest of the particle data must be kept + * on the host side. */ typedef struct { int label; @@ -47,7 +51,8 @@ typedef struct { } CUDA_PART_SMALL; /** - * Structure for storing particle on GPU + * Structure for storing particle on GPU as SoA. + * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays. */ typedef struct { int *label; @@ -65,6 +70,9 @@ typedef struct { /** * Structure for storing particle on GPU + * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays, + * contains only data that are used by the GPU kernels, the rest of the particle data must be kept + * on the host side. */ typedef struct { int *label; @@ -73,11 +81,39 @@ typedef struct { double3 *Pincol; } CUDA_PART2_SMALL; -/** CudaCollimatorPhysics class. - * Contains kerenls that execute CollimatorPhysics functions form OPAL. - * For detailed documentation on CollimatorPhysics functions see OPAL documentation +/** + * Operator used in thrust sort to compare particles by label. + * Used to move dead particles to the end of array, since they have label -1 or -2. */ -class CudaCollimatorPhysics : public DKSCollimatorPhysics{ +struct compare_particle_small +{ + int threshold; + + compare_particle_small() { + threshold = 0; + } + + void set_threshold(int t) { + threshold = t; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) { + return p1.label > p2.label; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1) { + return p1.label < threshold; + } +}; + +/** + * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface. + * Contains kerenls that execute CollimatorPhysics functions form OPAL. + * For detailed documentation on CollimatorPhysics functions see OPAL documentation. + */ +class CudaCollimatorPhysics : public DKSCollimatorPhysics { private: @@ -86,32 +122,44 @@ private: public: - /** Constructor with CudaBase argument - * + /** + * Constructor with CudaBase as argument. + * Create a new instace of the CudaCollimatorPhysics using existing CudaBase object. */ CudaCollimatorPhysics(CudaBase *base) { m_base = base; base_create = false; } - /** Constructor - empty. */ + /** + * Empty constructor. + * Create a new instance of CudaCollimatorPhysics with its own CudaBase. + */ CudaCollimatorPhysics() { m_base = new CudaBase(); base_create = true; } - /** Destructor - empty */ + /** + * Destructor. + * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor. + */ ~CudaCollimatorPhysics() { if (base_create) delete m_base; }; - /** Execute collimator physics kernel. + /** + * Execute collimator physics kernel. * */ int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, bool enableRutherforScattering = true); + /** + * Special calse CollimatorPhysics kernel that uses SoA instead of AoS. + * Used only on the MIC side, was not implemented on the GPU. + */ int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, @@ -120,12 +168,17 @@ public: return DKS_ERROR; } - /** Sort particle array on GPU. + /** + * Sort particle array on GPU. * Count particles that are dead (label -1) or leaving material (label -2) and sort particle * array so these particles are at the end of array */ int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + /** + * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS. + * Used only on the MIC side, was not implemented on the GPU. + */ int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, @@ -134,18 +187,25 @@ public: return DKS_ERROR; } - /** BorisPusher push function for integration from OPAL. + /** + * BorisPusher push function for integration from OPAL. * ParallelTTracker integration from OPAL implemented in cuda. * For more details see ParallelTTracler docomentation in opal */ int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, double dt, double c, bool usedt = false, int streamId = -1); + /** + * BorisPusher kick function for integration from OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, void *bf_ptr, void *dt_ptr, double charge, double mass, int npart, double c, int streamId = -1); - /** BorisPusher push function with transformto function form OPAL + /** + * BorisPusher push function with transformto function form OPAL. * ParallelTTracker integration from OPAL implemented in cuda. * For more details see ParallelTTracler docomentation in opal */ diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh index 2fcbbec..f52ae9f 100644 --- a/src/CUDA/CudaFFT.cuh +++ b/src/CUDA/CudaFFT.cuh @@ -10,6 +10,10 @@ #include "../Algorithms/FFT.h" #include "CudaBase.cuh" +/** + * Cuda FFT class based on BaseFFT interface. + * Uses cuFFT library to perform FFTs on nvidias GPUs. + */ class CudaFFT : public BaseFFT { private: @@ -34,7 +38,7 @@ public: ~CudaFFT(); /** - * Info: init cufftPlans witch can be reused for all FFTs of the same size and type + * Init cufftPlans witch can be reused for all FFTs of the same size and type * Return: success or error code */ int setupFFT(int ndim, int N[3]); @@ -42,45 +46,21 @@ public: int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } /** - * Info: destroy default FFT plans + * Destroy default FFT plans * Return: success or error code */ int destroyFFT(); - /* - Info: execute complex to complex double precision fft using cufft library - Return: success or error code - */ int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); - /* - Info: execute ifft - Return: success or error code - */ int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); - /* - Info: execute normalize using cuda kernel for complex to complex iFFT - Return: success or error code - */ int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); - /* - Info: execute real to complex double precision FFT - Return: success or error code - */ int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); - /* - Info: exectue complex to real double precision FFT - Return: success or error code - */ int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); - /* - Info: execute normalize for complex to real iFFT - Return: success or error code - */ int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1); }; diff --git a/src/CUDA/CudaGreensFunction.cuh b/src/CUDA/CudaGreensFunction.cuh index 69f29de..57e8cd4 100644 --- a/src/CUDA/CudaGreensFunction.cuh +++ b/src/CUDA/CudaGreensFunction.cuh @@ -12,6 +12,7 @@ #include "../Algorithms/GreensFunction.h" #include "CudaBase.cuh" +/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */ class CudaGreensFunction : public GreensFunction{ private: diff --git a/src/CUDA/CudaImageReconstruction.cuh b/src/CUDA/CudaImageReconstruction.cuh index 4cf532c..ab10c2c 100644 --- a/src/CUDA/CudaImageReconstruction.cuh +++ b/src/CUDA/CudaImageReconstruction.cuh @@ -10,6 +10,7 @@ #include "../Algorithms/ImageReconstruction.h" #include "CudaBase.cuh" +/** CUDA implementation of ImageReconstruction interface. */ class CudaImageReconstruction : public ImageReconstruction { private: diff --git a/src/DKSBase.h b/src/DKSBase.h index be10898..d8eb708 100644 --- a/src/DKSBase.h +++ b/src/DKSBase.h @@ -1,11 +1,3 @@ -/** DKSBase class. - * DKSBase.h - * Author: Uldis Locans - * Date: 15.09.2014 - * Base class of Dynamic Kernel Scheduler that handles the function calls - * from host application to DKS - */ - #ifndef H_DKS_BASE #define H_DKS_BASE @@ -41,7 +33,12 @@ #include "AutoTuning/DKSConfig.h" -/** DKSBase class for handling function calls to DKS library */ +/** + * API for handling communication function calls to DKS library. + * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device, + * memory manegement, data transfer and other basic communication functions between + * the host and device. + */ class DKSBase { private: @@ -74,7 +71,7 @@ protected: DKSConfig dksconfig; /** - * Check if current API is set to OpenCL + * Check if current API is set to OpenCL. * Return true/false wether current api is opencl */ bool apiOpenCL(); @@ -91,11 +88,11 @@ protected: */ bool apiOpenMP(); - /** Check if device is GPU */ + /** Check if device is GPU. */ bool deviceGPU(); - /** Check if device is CPU */ + /** Check if device is CPU. */ bool deviceCPU(); - /** Check if device is MIC */ + /** Check if device is MIC. */ bool deviceMIC(); /** diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h index 9f640b6..00e95dc 100644 --- a/src/DKSBaseMuSR.h +++ b/src/DKSBaseMuSR.h @@ -20,6 +20,11 @@ #include "OpenCL/OpenCLChiSquareRuntime.h" #endif +/** + * API to handle musrfit calls to DKS library. + * Using ChiSquareRuntime interface allows to call chi square functions on the + * GPU or CPU using CUDA or OpenCL. + */ class DKSBaseMuSR : public DKSFFT { private: diff --git a/src/DKSFFT.h b/src/DKSFFT.h index cd87523..0e062c1 100644 --- a/src/DKSFFT.h +++ b/src/DKSFFT.h @@ -24,6 +24,10 @@ #include "MIC/MICFFT.h" #endif +/** + * API to handel calls to DKSFFT. + * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries. + */ class DKSFFT : public DKSBase { private: diff --git a/src/DKSImageReconstruction.h b/src/DKSImageReconstruction.h index 32f67ef..2388309 100644 --- a/src/DKSImageReconstruction.h +++ b/src/DKSImageReconstruction.h @@ -10,6 +10,9 @@ #include "CUDA/CudaImageReconstruction.cuh" #endif +/** + * API to handle PET image reconstruction calls. + */ class DKSImageRecon : public DKSBase { private: @@ -22,87 +25,88 @@ public: ~DKSImageRecon(); - /** Image reconstruction analaysis calculate source. - * - * + /** + * Image reconstruction analaysis calculate source. */ int callCalculateSource(void *image_space, void *image_position, void *source_position, void *avg, void *std, float diameter, int total_voxels, int total_sources, int start = 0); - /** Image reconstruction analaysis calculate source. - * - * + /** + * Image reconstruction analaysis calculate source. */ int callCalculateBackground(void *image_space, void *image_position, void *source_position, void *avg, void *std, float diameter, int total_voxels, int total_sources, int start = 0); - /** Image reconstruction analaysis calculate source. - * - * + /** + * Image reconstruction analaysis calculate source. */ int callCalculateSources(void *image_space, void *image_position, void *source_position, void *avg, void *std, void *diameter, int total_voxels, int total_sources, int start = 0); - /** Image reconstruction analaysis calculate source. - * - * + /** + * Image reconstruction analaysis calculate source. */ int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, void *avg, void *std, void *diameter, int total_voxels, int total_sources, int start = 0); - /** Image reconstruction - generate normalization. - * + /** + * Image reconstruction - generate normalization. */ int callGenerateNormalization(void *recon, void *image_position, void *det_position, int total_det); - /** Image reconstruction - forward correction. - * + /** + * Image reconstruction - forward correction. */ int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, void *image_position, int num_events); - /** Image reconstruction - backward projection. - * + /** + * Image reconstruction - backward projection. */ int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, void *det_position, void *image_position, int num_events, int num_voxels); - /** Set the voxel dimensins on device. + /** + * Set the voxel dimensins on device. * Values are stored in GPU memory and used in forward and backward projection calculations. * Call set function once to transfer the values from host side to GPU. * If value changes on the host side set functions needs to be called again to update GPU values. */ int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); - /** Set the image edge. + /** + * Set the image edge. * Values are stored in GPU memory and used in forward and backward projection calculations. * Call set function once to transfer the values from host side to GPU. * If value changes on the host side set functions needs to be called again to update GPU values. */ int setEdge(float x_edge, float y_edge, float z_edge); - /** Set the image edge1. + /** + * Set the image edge1. * Values are stored in GPU memory and used in forward and backward projection calculations. * Call set function once to transfer the values from host side to GPU. * If value changes on the host side set functions needs to be called again to update GPU values. */ int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); - /** Set the minimum crystan in one ring values. + /** + * Set the minimum crystal in one ring values. * Values are stored in GPU memory and used in forward and backward projection calculations. * Call set function once to transfer the values from host side to GPU. * If value changes on the host side set functions needs to be called again to update GPU values. */ int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); - /** Set all other required parameters for reconstruction. + /** + * Set all other required parameters for reconstruction. * Values are stored in GPU memory and used in forward and backward projection calculations. * Call set function once to transfer the values from host side to GPU. * If value changes on the host side set functions needs to be called again to update GPU values. diff --git a/src/DKSOPAL.h b/src/DKSOPAL.h index 0577f2d..202654e 100644 --- a/src/DKSOPAL.h +++ b/src/DKSOPAL.h @@ -33,6 +33,11 @@ #include "MIC/MICCollimatorPhysics.h" #endif +/** + * API to handle OPAL calls to DKS library. + * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase + * functions. + */ class DKSOPAL : public DKSFFT { private: diff --git a/src/MIC/MICBase.h b/src/MIC/MICBase.h index 0087778..9371dd5 100644 --- a/src/MIC/MICBase.h +++ b/src/MIC/MICBase.h @@ -26,6 +26,9 @@ #define MIC_WIDTH 128 +/** MIC Base class handles device setup and basic communication with the device. + * Handles devicew setup, memory manegement and data transfers. + */ class MICBase { private: @@ -45,58 +48,60 @@ public: int m_device_id; - /* constructor */ + /** constructor */ MICBase(); - /* destructor */ + /** destructor */ ~MICBase(); - /* - Info: create MKL rand streams for each thread - Return: success or error code - */ + /** + * Create MKL rand streams for each thread + * Return: success or error code + */ int mic_createRandStreams(int size); - /* - Info: delete MKL rand streams - Return: succes or error code - */ + /** + * Delete MKL rand streams + * Return: succes or error code + */ int mic_deleteRandStreams(); - /* - Info: create a new signal for the mic - Return: success or error code - */ + /** + * Create a new signal for the mic. + * Signals can be used for assynchronous data transfers. + * Return: success or error code + */ int mic_createStream(int & streamId); - /* - Info: get the signal from the vector - Return: mic signal + /** + * Info: get the signal from the vector. + * Return: mic signal */ int& mic_getStream(int id); - /* - Info: delete streams - Return: success or error code - */ + /** + * Info: delete streams. + * Return: success or error code + */ int mic_deleteStreams(); - /* - Info: set device id - Return: success or error code - */ + /** + * Info: set device id. + * Return: success or error code + */ int mic_setDeviceId(int id); - /* - Info: get mic devices - Return: success or error code - */ + /** + * Info: get mic devices. + * Prints information about mic devices. + * Return: success or error code + */ int mic_getDevices(); - /* - Info: allocate memory on MIC device - Return: success or error code - */ + /** + * Allocate memory on MIC device. + * Return: success or error code + */ template void * mic_allocateMemory(int size) { @@ -109,10 +114,10 @@ public: return tmp; } - /* - Info: transfer data to device - Return: success or error code - */ + /** + * Transfer data to device. + * Return: success or error code + */ template int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) { T* tmp_ptr = (T*)data_ptr; @@ -123,10 +128,10 @@ public: return DKS_SUCCESS; } - /* - Info: write data to device, non-blocking - Return: success or error code - */ + /** + * Write data to device, non-blocking. + * Return: success or error code + */ template int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) { @@ -139,10 +144,10 @@ public: } - /* - Info: read data from device - Return: success or error code - */ + /** + * Read data from device + * Return: success or error code + */ template int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) { T* tmp_ptr = (T*)data_ptr; @@ -154,10 +159,10 @@ public: return DKS_SUCCESS; } - /* - Info: read data from device waiting for signal - Return: success or error code - */ + /** + * Read data from device waiting for signal + * Return: success or error code + */ template int mic_readDataAsync(const void * data_ptr, void * result, int size, int streamId = -1, int offset = 0) { @@ -172,10 +177,10 @@ public: } - /* - Info: wait till all the signals are complete - Return siccess or error code - */ + /** + * Wait till all the signals are complete + * Return siccess or error code + */ int mic_syncDevice() { //empty offload to wait for all the signals to finish and launch a new empy signal @@ -193,10 +198,10 @@ public: } - /* - Info: free memory on device - Return: success or error code - */ + /** + * Free memory on device + * Return: success or error code + */ template int mic_freeMemory(void * data_ptr, int size) { @@ -210,10 +215,10 @@ public: return DKS_SUCCESS; } - /* - Info: allocate memory and write data to device - Return: success or error code - */ + /** + * Allocate memory and write data to device + * Return: success or error code + */ template void * mic_pushData(const void * data, int size) { T* tmp_ptr = new T[size]; @@ -227,10 +232,10 @@ public: return tmp_ptr; } -/* - Info: read data and free memory on device - Return: success or erro code -*/ + /** + * Read data and free memory on device + * Return: success or erro code + */ template int mic_pullData(void * data_ptr, void * result, int size) { T* tmp_ptr = (T*)data_ptr; diff --git a/src/MIC/MICChiSquare.h b/src/MIC/MICChiSquare.h index c62de0b..56c0e99 100644 --- a/src/MIC/MICChiSquare.h +++ b/src/MIC/MICChiSquare.h @@ -14,6 +14,9 @@ #include #include "MICBase.h" +/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. + * Not complete and untested because of the poor performance of first MIC devices. + */ class MICChiSquare { MICBase *m_micbase; diff --git a/src/MIC/MICCollimatorPhysics.cpp b/src/MIC/MICCollimatorPhysics.cpp index f334d0b..5a8929c 100644 --- a/src/MIC/MICCollimatorPhysics.cpp +++ b/src/MIC/MICCollimatorPhysics.cpp @@ -22,22 +22,34 @@ #define I_M 10 #define DT_M 11 +/** + * MIC device function for calculating dot product. + */ __declspec(target(mic)) double dot(mic_double3 d1, mic_double3 d2) { return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); } +/** + * MIC device function for calculating dot product. + */ __declspec(target(mic)) double dot(double dx, double dy, double dz) { return (dx * dx + dy * dy + dz * dz); } +/** + * MIC device function to check if particle is still in material. + */ __declspec(target(mic)) bool checkHit(double &z, double *par) { return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); } +/** + * MIC device function to calculate arbitrary rotation. + */ __declspec(target(mic)) void Rot(double &px, double &pz, double &x, double &z, double xplane, double normP, double thetacou, double deltas, int coord) @@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane, pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); } + +/** + * MIC device function to calculate Coulomb scattering for one particle. + * Including Multiple Coulomb Scattering and large angle Rutherford Scattering. + * Uses AoS to store particle positions and momentum, paralelized using OpenMP. + * For details on the algorithm see OPAL user guide. + * Deprecated on favor of SoA data layout. + */ __declspec(target(mic)) void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) { double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; @@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr } +/** + * MIC device function to calculate Coulomb scattering for one particle. + * Including Multiple Coulomb Scattering and large angle Rutherford Scattering. + * Uses SoA to store particle positions and momentum, paralelized using OpenMP. + * For details on the algorithm see OPAL user guide. + */ __declspec(target(mic)) -void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label, +void coulombScat(double *rx, double *ry, double *rz, + double *px, double *py, double *pz, int *label, double *par, VSLStreamStatePtr &stream, int ii, int size) { - + + //arrays for temporary storage, each core proceses MIC_WIDTH particles double normP[MIC_WIDTH] __attribute__((aligned(64))); double deltas[MIC_WIDTH] __attribute__((aligned(64))); double theta0[MIC_WIDTH] __attribute__((aligned(64))); @@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou double z2[MIC_WIDTH] __attribute__((aligned(64))); double thetacou[MIC_WIDTH] __attribute__((aligned(64))); + //simd instruction tells the compiler its safe to vectorize the loop #pragma vector aligned #pragma simd for (int i = ii; i < ii + MIC_WIDTH; i++) { @@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou } } + //vectorize the loop #pragma vector aligned #pragma simd for (int i = ii; i < ii + size; i++) { @@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou } } - //generate array of random numbers vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1); @@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou } +/** + * MIC device function to calculate energyLoss for one particle. + * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss + * algorith are available in OPAL user guide. + */ __declspec(target(mic)) void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) { @@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) pdead = 1; } +/** + * MIC device function to calculate energyLoss for one particle. + * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss + * algorith are available in OPAL user guide. + */ __declspec(target(mic)) void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) { @@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu double *par = (double*) par_ptr; VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; + /* offload the computation to the MIC, reuses the memory already allocated on the mic. + the memory allocation and data trasnfer need to be handled before */ #pragma offload target(mic:m_micbase->m_device_id) \ inout(data:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \ @@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu VSLStreamStatePtr stream = streamArr[omp_get_thread_num()]; //for loop trough particles if not checkhit set label to -2 and update R.x - #pragma omp for simd for (int i = 0; i < numparticles; i++) { if ( !checkHit(data[i].Rincol.z, par) ) { @@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt { - + //cast device memory pointers to appropriate types int *label = (int*)label_ptr; unsigned *localID = (unsigned*)localID_ptr; double *rx = (double*)rx_ptr; @@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; + /* offload the computation to the MIC, reuses the memory already allocated on the mic. + the memory allocation and data trasnfer need to be handled before */ #pragma offload target (mic:0) \ in(label:length(0) DKS_REUSE DKS_RETAIN) \ in(localID:length(0) DKS_REUSE DKS_RETAIN) \ diff --git a/src/MIC/MICCollimatorPhysics.h b/src/MIC/MICCollimatorPhysics.h index 910bc99..30bcccd 100644 --- a/src/MIC/MICCollimatorPhysics.h +++ b/src/MIC/MICCollimatorPhysics.h @@ -26,6 +26,12 @@ typedef struct { } MIC_PART_SMALL; +/** + * MICCollimatorPhysics class based on DKSCollimatorPhysics interface. + * Implementes OPALs collimator physics class for particle matter interactions using OpenMP + * and offload mode targetomg Intel Xeon Phi processors. + * For detailed documentation on CollimatorPhysics functions see OPAL documentation. + */ class MICCollimatorPhysics : public DKSCollimatorPhysics { private: diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h index 491ea06..38c87e9 100644 --- a/src/MIC/MICFFT.h +++ b/src/MIC/MICFFT.h @@ -10,6 +10,10 @@ #include "../Algorithms/FFT.h" #include "MICBase.h" +/** + * MIC FFT based on BaseFFT interface. + * uses MKL library to offload FFT on Intel Xeon Phi devices. + */ class MICFFT : public BaseFFT { private: diff --git a/src/MIC/MICGreensFunction.hpp b/src/MIC/MICGreensFunction.hpp index db3944f..a1e00fa 100644 --- a/src/MIC/MICGreensFunction.hpp +++ b/src/MIC/MICGreensFunction.hpp @@ -15,6 +15,7 @@ #define DKS_SUCCESS 0 #define DKS_ERROR 1 +/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */ class MICGreensFunction : public GreensFunction { private: diff --git a/src/MIC/MICMergeSort.h b/src/MIC/MICMergeSort.h index 408037b..3af4e75 100644 --- a/src/MIC/MICMergeSort.h +++ b/src/MIC/MICMergeSort.h @@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) { return p; } +/** + * Merge sort implementation for intel MIC. + * Paralellized over all the MIC cores using OpenMP tasks. + */ template void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) { @@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) { } } +/** + * Quicksort algorithm, developed for use on Intel MIC devices. + */ template void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) { @@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) { } +/** + * Insertion sort of @p list, developed for use on Intel MIC. + * Used by quick_sort to sort small lists. + */ template void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) { diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h index e8b9d16..b811ae4 100644 --- a/src/OpenCL/OpenCLBase.h +++ b/src/OpenCL/OpenCLBase.h @@ -1,16 +1,3 @@ -/* - - Name: OpenCLBase - - Author: Uldis Locans - - Info: OpenCL base class to handle all the common details associated - with kernel launch on OpenCL device - - Date: 2014.09.18 - -*/ - #ifndef H_OPENCL_BASE #define H_OPENCL_BASE @@ -32,7 +19,7 @@ #include "../DKSDefinitions.h" -/* struct for random number state */ +/** struct for random number state. */ typedef struct { double s10; double s11; @@ -44,168 +31,194 @@ typedef struct { bool gen; } RNDState; +/** + * OpenCL base class to handle device setup and basic communication wiht the device. + * Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch. + * The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase + * class contains methods to read the kernel files, compile the kernel codes and launch kernels + * from the compiled codes. Which kernel file needs to be loaded for the specif functin is + * handled by the base class that is launching the kernel. + */ class OpenCLBase { private: - + + //variables containig OpenCL device and platform ids static cl_platform_id m_platform_id; static cl_device_id m_device_id; + //variables containit compiled OpenCL program and kernel cl_context_properties m_context_properties[3]; cl_program m_program; cl_kernel m_kernel; + //variables for tracking OpenCL events static cl_event m_last_event; cl_int m_num_events; std::vector m_events; + //currently load kernel file char * m_kernel_file; + //type of device used by OpenCL cl_device_type m_device_type; - /* - Name: getPlatforms - Info: get all avaialble platforms and save in m_platform_ids, save number of platforms - Return: success or error code - */ + /** + * Get all available OpenCL platforms. + * Get all avaialble platforms and save in m_platform_ids, save number of platforms + * Return: success or error code + */ int ocl_getPlatforms(); - /* - Name: getDevice - Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu) - ReturnL success or error code - */ + /** + * Get first available OpenCL device of specified type. + * Get first avaialble devices and save device id and platform id for this device, + * device name: (-gpu, -mic, -cpu) + * ReturnL success or error code + */ int ocl_getDevice(const char* device_name); - /* - Name getDeviceType - Info: get device type from device name (-gpu, -cpu, -mic) - Return: success or error code - */ + /** + * Get cl_device_type from the specified device name. + * get device type from device name (-gpu, -cpu, -mic) + * Return: success or error code + */ int ocl_getDeviceType(const char* device_name, cl_device_type &device_type); - /* - Name: createContext - Info: create context with specified device - Return: success or error code - */ + /** + * Create OpenCL context with specified device. + * Return: success or error code + */ int ocl_createContext(); - /* - Name: buildProgram - Info: build program from specified kernel file - Return: success or error code + /** + * Build program from specified kernel file. + * Return: success or error code. */ int ocl_buildProgram(const char* kernel_file); - /** Compile program from kernel source string - * + /** + * Compile program from kernel source string. + * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the + * OpenCL program, that can be then executed on the device. + * opts is a string specifiend additional compiler flags. */ int ocl_compileProgram(const char* kernel_source, const char* opts = NULL); protected: + //memory for random number states int defaultRndSet; cl_mem defaultRndState; public: + //OpenCL context and commad queue static cl_context m_context; - static cl_command_queue m_command_queue; + static cl_command_queue m_command_queue; - /* - constructor - */ + /** + * constructor + */ OpenCLBase(); - /* - destructor - */ + /** + * destructor + */ ~OpenCLBase(); - /* - Create RND states - Return: success or error code - */ + /** + * Allocate memory for size random number states and init the rnd states. + * Uses AMD clRng library for random numbers. + * This library is only compatible with AMD devices. + */ int ocl_createRndStates(int size); - /* Create an array of random numbers on the device - * + /** + * Create an array of random numbers on the device. + * Filles hte mem_ptr with random numbers. */ int ocl_createRandomNumbers(void *mem_ptr, int size); - /* - Destroy rnd states - Return: success or error code - */ + /** + * Destroy rnd states and free device memory. + * Return: success or error code + */ int ocl_deleteRndStates(); - /* - Name: getAllDevices - Info: get all available devices - ReturnL success or error code + /** + * Prints info about all the available platforms and devices. + * Can be used for information purposes to see what devices are available on the system. + * ReturnL success or error code. */ int ocl_getAllDevices(); - /** Get the OpenCL device count for the set type of device - * + /** + * Get the OpenCL device count for the set type of device. + * Device count is set in ndev parameter, returns success or error code. */ int ocl_getDeviceCount(int &ndev); - /** Get the name of the device used + /** + * Get the name of the device currently us use. */ int ocl_getDeviceName(std::string &device_name); - /** Set the device to use for OpenCL kernels. - * device id to use is passed as integer. + /** + * Set the device to use for OpenCL kernels. + * Device id to use is passed as integer. */ int ocl_setDevice(int device); - /** Get a list of all the unique devices of the same type that can run OpenCL kernels - * Used when GPUs of different types might be pressent on the system. + /** + * Get a list of all the unique devices of the same type that can run OpenCL kernels. + * Used when GPUs of different types might be pressent on the system. */ int ocl_getUniqueDevices(std::vector &devices); - /* - Name: setUp - Info: set up opencl resources - Return: success or error code - */ + /** + * Initialize OpenCL connection with a device of specified type. + * Find if specified device is avaialble, creates a contex and command queue. + * Returns success or error code. + */ int ocl_setUp(const char* device_name); - /* - Name: loadKernel - Info: load and compile opencl kernel file if it has changed - Return: success or error code + /** + * Given a OpenCL kernel file name loads the content and compile the OpenCL code. + * Load and compile opencl kernel file if it has changed. + * Return: success or error code */ int ocl_loadKernel(const char* kernel_file); - /** Build program from kernel source. + /** + * Build program from kernel source. * Builds a program from source code provided in kernel_source. * If compilation fails will return DKS_ERROR */ int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL); - /* - Name: allocateMemory - Info: allocate memory on device - Return: return pointer to memory + /** + * Allocate memory on the device. + * Return: return pointer to memory */ cl_mem ocl_allocateMemory(size_t size, int &ierr); - /* - Name: allocateMemory - Info: allocate memory on device - Return: return pointer to memory + /** + * Allocate memory of specific type on device. + * The availabel types are cl_mem_flags type listed in OpenCL documentation: + * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, + * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR. + * Return: return pointer to memory */ cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); - /** Zero OpenCL memory buffer - * Set all the elemetns in the device array to zero + /** + * Zero OpenCL memory buffer. + * Set all the elemetns in the device array to zero. */ template int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) { @@ -218,92 +231,90 @@ public: return DKS_SUCCESS; } - /* - Name: writeData - Info: write data to device memory (needs ptr to mem object) - Return: success or error code - */ + /** + * Write data to device memory (needs ptr to mem object) + * Return: success or error code + */ int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); - /* - Name: copyData - Info: copy data from one buffer on the device to another - Return: success or error code - */ + /** + * Copy data from one buffer on the device to another + * Return: success or error code + */ int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size); - /* - Name: createKernel - Info: create kernel from program - Return: success or error code - */ + /** + * Create kernel from compiled OpenCL program. + * Return: success or error code + */ int ocl_createKernel(const char* kernel_name); - /* - Name: setKernelArgs - Info: set opencl kernel arguments - Return: success or error code - */ + /** + * Set argiments for the kernel that will be launched. + * Return: success or error code + */ int ocl_setKernelArg(int idx, size_t size, const void *arg_value); - /* - Name: executeKernel - Info: execute selected kernel (needs kernel parameters) - Return: success or error code + /** + * Execute selected kernel. + * Before kenrel can be executed buildProgram must be executed, create kernel must be executed + * and kenre specifeid in execute kerenel must be in compiled source, and the necessary + * kernel arguments must be set. + * Return: success or error code */ int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL); - /* - Name: readData - Info: read data from device (needs pointer to mem object) - Return: success or error code - */ + /** + * Read data from device (needs pointer to mem object). + * Return: success or error code + */ int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); - /* - Name: freeMemory - Info: free device memory (needs ptr to mem object) - Return: success or error code - */ + /** + * Free device memory (needs ptr to mem object). + * Return: success or error code + */ int ocl_freeMemory(cl_mem mem_ptr); - /* - Name: cleanUp - Info: free opencl resources - Return: success or error code - */ + /** + * Free opencl resources. + * Deletes the kernel, compiled program, command queue and colese the connection + * to device by releasing the context. + * Return: success or error code + */ int ocl_cleanUp(); - /* - Name: deviceInfo - Info: print device info (mostly for debugging purposes) - Return: success or error code - */ + /** + * Print info of currently selected device. + * Mostly for debugging purposes, but in verbose mode can be used to see device properties. + * Return: success or error code + */ int ocl_deviceInfo(bool verbose = true); - /* Check OpenCL kernel. - * Query device and check if it can run the kernel with required parameters + /* + * Check OpenCL kernel. + * Query device and check if it can run the kernel with required parameters. + * Also check the available OpenCL extensions - usefull for checking the supported device + * features, like double precission. */ int ocl_checkKernel(const char* kernel_name, int work_group_size, bool double_precision, int &threadsPerBlock); - /* - Name: clearEvents - Info: clear saved events (for debuging purposes) - Return: nothing - */ + /** + * Clear the event list. + * Events can be used for timing and synchronization purposes. + */ void ocl_clearEvents(); - /* - Name: eventInfo - Info: print information about kernel timings (for debuging purposes) - Return: nothing - */ + /** + * print information about kernel timings from event list. + * for debuging purposes + */ void ocl_eventInfo(); - /* - Return current command queue - */ + /** + * Return current command queue. + */ cl_command_queue ocl_getQueue() { return m_command_queue; } }; diff --git a/src/OpenCL/OpenCLChiSquare.h b/src/OpenCL/OpenCLChiSquare.h index bbc5da6..5c7ad51 100644 --- a/src/OpenCL/OpenCLChiSquare.h +++ b/src/OpenCL/OpenCLChiSquare.h @@ -14,7 +14,7 @@ #define DKS_SUCCESS 0 #define DKS_ERROR 1 - +/** Deprecated, SimpleFit implementation of ChiSquare. */ class OpenCLChiSquare { private: diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp index f8e21a6..2b97b87 100644 --- a/src/OpenCL/OpenCLChiSquareRuntime.cpp +++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp @@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType, } int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) { + //write params to gpu int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams); return ierr; } @@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) { if (numfunc == 0) return DKS_SUCCESS; + //write function values to the GPU int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc); return ierr; } @@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) { if (nummap == 0) return DKS_SUCCESS; + //wrtie map values to the GPU int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap); return ierr; } @@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, freeChiSquare(); } - //allocate temporary memory + //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr); mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr); if (size_func == 0) @@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() { int ierr = DKS_ERROR; if (initDone_m) { - //free memory + //free GPU memory ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m); @@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl return DKS_ERROR; } + //check the GPU kernel ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); return ierr; diff --git a/src/OpenCL/OpenCLChiSquareRuntime.h b/src/OpenCL/OpenCLChiSquareRuntime.h index 90b5c7c..350b6df 100644 --- a/src/OpenCL/OpenCLChiSquareRuntime.h +++ b/src/OpenCL/OpenCLChiSquareRuntime.h @@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double * const std::string openclFunctFooter = "}\n"; +/** + * OpenCL implementation of ChiSquareRuntime class. + * Implements ChiSquareRuntime interface to allow musrfit to target devices that + * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi. + */ class OpenCLChiSquareRuntime : public ChiSquareRuntime { private: OpenCLBase *m_oclbase; - /** Private function to add user defined function to kernel string - * + /** + * Private function to add user defined function to kernel string. */ std::string buildProgram(std::string function); + /** + * Launch parallel reduction kernel to calculate the sum of data array + */ double calculateSum(cl_mem data, int length); public: - /** Constructor wiht openclbase argument - * + /** + * Constructor wiht openclbase argument. */ OpenCLChiSquareRuntime(OpenCLBase *base); - /** Default constructor - * + /** + * Default constructor */ OpenCLChiSquareRuntime(); - /** Default destructor - * + /** + * Default destructor */ ~OpenCLChiSquareRuntime(); - /** Compile program and save ptx. + /** + * Compile program and save ptx. * Add function string to the calcFunction kernel and compile the program * Function must be valid C math expression. Parameters can be addressed in * a form par[map[idx]] */ int compileProgram(std::string function, bool mlh = false); - /** Launch selected kernel + /** + * Launch selected kernel. * Launched the selected kernel from the compiled code. * Result is put in &result variable */ @@ -64,22 +74,26 @@ public: double timeStart, double timeStep, double &result); - /** Write params to device. + /** + * Write params to device. * Write params from double array to mem_param_m memory on the device. */ int writeParams(const double *params, int numparams); - /** Write functions to device. + /** + * Write functions to device. * Write function values from double array to mem_func_m memory on the device. */ int writeFunc(const double *func, int numfunc); - /** Write maps to device. + /** + * Write maps to device. * Write map values from int array to mem_map_m memory on the device. */ int writeMap(const int *map, int nummap); - /** Allocate temporary memory needed for chi square. + /** + * Allocate temporary memory needed for chi square. * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * the maximum number of elements in any datasets that will be used for calculations. Size_param, * size_func and size_map are the maximum number of parameters, functions and maps used in @@ -87,14 +101,16 @@ public: */ int initChiSquare(int size_data, int size_param, int size_func, int size_map); - /** Free temporary memory allocated for chi square. + /** + * Free temporary memory allocated for chi square. * Frees the chisq temporary memory and memory for params, functions and maps */ int freeChiSquare(); - /** Check MuSR kernels for necessary resources. + /** + * Check MuSR kernels for necessary resources. * Query device properties to get if sufficient resources are - * available to run the kernels + * available to run the kernels. Also checks if double precission is enabled on the device. */ int checkChiSquareKernels(int fitType, int &threadsPerBlock); diff --git a/src/OpenCL/OpenCLCollimatorPhysics.h b/src/OpenCL/OpenCLCollimatorPhysics.h index 0f8accc..2de1408 100644 --- a/src/OpenCL/OpenCLCollimatorPhysics.h +++ b/src/OpenCL/OpenCLCollimatorPhysics.h @@ -17,12 +17,16 @@ #include "boost/compute/core.hpp" */ +/** Double3 structure for use in OpenCL code. */ typedef struct { double x; double y; double z; } Double3; +/** + * Structure for stroing particles in OpenCL code. + */ typedef struct { int label; unsigned localID; @@ -35,6 +39,10 @@ typedef struct { //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z)); //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol)); +/** + * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface. + * Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs. + */ class OpenCLCollimatorPhysics : public DKSCollimatorPhysics { private: @@ -42,16 +50,20 @@ private: public: - /* constructor */ + /** + * Constructor with OpenCLBase as argument. + * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object. + */ OpenCLCollimatorPhysics(OpenCLBase *base) { m_oclbase = base; } - /* destructor */ + /** + * Destructor. + */ ~OpenCLCollimatorPhysics() { } - /* execute degrader code on device */ int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, bool enableRutherforScattering = true); diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h index 6ab8c88..8f930a2 100644 --- a/src/OpenCL/OpenCLFFT.h +++ b/src/OpenCL/OpenCLFFT.h @@ -1,14 +1,3 @@ -/* - - Name: OpenCLFFT - - Author: Uldis Locans - - Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL - - Data: 19.09.2014 - -*/ #ifndef H_OPENCL_FFT #define H_OPENCL_FFT @@ -22,6 +11,12 @@ #include "clFFT.h" +/** + * OpenCL FFT class based on BaseFFT interface. + * Uses clFFT library to perform FFTs on AMD gpus. + * clFFT library works also on nvida GPUs and other devices that + * support OpenCL. + */ class OpenCLFFT : public BaseFFT { private: diff --git a/src/OpenCL/OpenCLGreensFunction.h b/src/OpenCL/OpenCLGreensFunction.h index 3e90fba..9e352ac 100644 --- a/src/OpenCL/OpenCLGreensFunction.h +++ b/src/OpenCL/OpenCLGreensFunction.h @@ -7,6 +7,7 @@ #include "../Algorithms/GreensFunction.h" #include "OpenCLBase.h" +/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */ class OpenCLGreensFunction : public GreensFunction { private: @@ -31,7 +32,7 @@ public: int buildProgram(); /** - Info: calc itegral on device memory (taken from OPAL src code) + Info: calc itegral on device memory (taken from OPAL src code). Return: success or error code */ int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, @@ -39,20 +40,20 @@ public: int streamId = -1); /** - Info: integration of rho2_m field (taken from OPAL src code) + Info: integration of rho2_m field (taken from OPAL src code). Return: success or error code */ int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int streamId = -1); /** - Info: mirror rho field (taken from OPAL src code) + Info: mirror rho field (taken from OPAL src code). Return: succes or error code */ int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1); /** - Info: multiply complex fields already on the GPU memory, result will be put in ptr1 + Info: multiply complex fields already on the GPU memory, result will be put in ptr1. Return: success or error code */ int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); diff --git a/src/Utility/DKSTimer.h b/src/Utility/DKSTimer.h index 80025c0..fcfce4a 100644 --- a/src/Utility/DKSTimer.h +++ b/src/Utility/DKSTimer.h @@ -5,6 +5,10 @@ #include #include +/** + * Custom timer class. + * Allows to insert timers in the code to get function exectution times. + */ class DKSTimer { private: @@ -17,39 +21,45 @@ private: public: - /** Init DKSTimer by seting timer to zero */ + /** Init DKSTimer by seting timer to zero. */ DKSTimer(); ~DKSTimer(); - /** Init the timer - * Set the name for timer and clear all values + /** + * Init the timer. + * Set the name for timer and clear all values */ void init(std::string n); - /** Start the timer. - * Get the curret time with gettimeofday and save in timeStart + /** + * Start the timer. + * Get the curret time with gettimeofday and save in timeStart */ void start(); - /** Stop the timer - * Get the curretn time with gettimeofday and save in timeEnd - * Calculate elapsed time by timeEnd - timeStart and add to timervalue + /** + * Stop the timer. + * Get the curretn time with gettimeofday and save in timeEnd + * Calculate elapsed time by timeEnd - timeStart and add to timervalue */ void stop(); - /** Reset timervalue to zero. - * Set timervalue, timeStart and timeEnd to zero + /** + * Reset timervalue to zero. + * Set timervalue, timeStart and timeEnd to zero */ void reset(); - /** Return elapsed time in seconds. - * Return the value of timervalue + /** + * Return elapsed time in seconds. + * Return the value of timervalue */ double gettime(); - /** Print timer. - * Print the elapsed time of the timer + /** + * Print timer. + * Print the elapsed time of the timer */ void print();