updated documentation

Uldis Locans
2017-08-10 14:57:48 +02:00
parent 7ca93a3a49
commit ccc4329bef
38 changed files with 939 additions and 673 deletions

View File

@ -15,6 +15,9 @@
class DKSBaseMuSR;
/**
* Interface to implement ChiSquareRuntime class for musrfit.
*/
class ChiSquareRuntime {
friend class DKSBaseMuSR;
@ -63,23 +66,54 @@ public:
/** Default constructor */
//ChiSquareRuntime();
/** Default destructor */
/** Default destructor. */
virtual ~ChiSquareRuntime() { };
/**
* Compile GPU program generated at runtime.
*/
virtual int compileProgram(std::string function, bool mlh = false) = 0;
/**
* Launch the compiled chiSquare kernel.
*/
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result) = 0;
/**
* Write the parameter values to the GPU.
*/
virtual int writeParams(const double *params, int numparams) = 0;
/**
* Write the function values to the GPU.
*/
virtual int writeFunc(const double *func, int numfunc) = 0;
/**
* Write map values to the GPU.
*/
virtual int writeMap(const int *map, int nummap) = 0;
/**
* Allocate temporary memory needed for the chi square calculations on the device.
*/
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
/**
* Free device memory allocated for chi square calculations.
*/
virtual int freeChiSquare() = 0;
/**
* Check if available device can run the chi square GPU code.
*/
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
/** Set N0, tau and bgk values to use for the kernel.
/**
* Set N0, tau and bgk values to use for the kernel.
* If values change between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS.
*/
@ -91,7 +125,8 @@ public:
return DKS_SUCCESS;
}
/** Set alpha and beta values to use for the kernel.
/**
* Set alpha and beta values to use for the kernel.
* If values change between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS.
*/
@ -101,7 +136,8 @@ public:
return DKS_SUCCESS;
}
/** Set number of blocks and threads.
/**
* Set number of blocks and threads.
* Used to set parameters obtained from auto-tuning
*/
int setKernelParams(int numBlocks, int blockSize) {
@ -118,7 +154,8 @@ public:
return ierr;
}
/** Get the number of operations in compiled kernel.
/**
* Get the number of operations in compiled kernel.
* Count the number of operations in the ptx file for the compiled program.
*/
int getOperations(int &oper) {
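The interface above is easiest to read as a call sequence. The following is a hedged sketch, not library code: it assumes a concrete ChiSquareRuntime implementation, device buffers that already hold the data and error arrays, and an illustrative theory string; the fitType, parameter values and sizes are placeholders.

// Hedged usage sketch (not part of DKS): typical call order for one data set.
// "chisq" is any concrete ChiSquareRuntime; mem_data/mem_err are device buffers.
int fitOnDevice(ChiSquareRuntime *chisq, void *mem_data, void *mem_err, int length) {
    double result = 0.0;
    double par[4] = {0.25, 2.2, 0.1, 100.0};    // illustrative parameter values
    int    map[4] = {0, 1, 2, 3};               // illustrative map

    chisq->initChiSquare(length, 4, 0, 4);      // allocate temporary device memory
    chisq->compileProgram("par[map[0]] * exp(-t / par[map[1]])"); // runtime-compiled theory (example string)
    chisq->writeParams(par, 4);
    chisq->writeMap(map, 4);
    chisq->launchChiSquare(1, mem_data, mem_err, length, 4, 0, 4, 0.0, 1e-3, result);
    chisq->freeChiSquare();
    return DKS_SUCCESS;                         // error handling omitted for brevity
}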

View File

@ -5,6 +5,9 @@
#include <string>
#include "../DKSDefinitions.h"
/**
* Interface to implement particle-matter interaction for OPAL.
*/
class DKSCollimatorPhysics {
protected:
@ -16,28 +19,60 @@ public:
virtual ~DKSCollimatorPhysics() { }
/**
* Execute collimator physics kernel.
*
*/
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
bool enableRutherforScattering = true) = 0;
/**
* Special case CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles) = 0;
/**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array
*/
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
/**
* Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback) = 0;
/**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0;
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0;
/**
* BorisPusher push function with transformto function from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,

View File

@ -6,12 +6,21 @@
#include "../DKSDefinitions.h"
/**
* Abstract class defining methods for DKS FFT class.
* Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
*/
class BaseFFT {
protected:
int defaultN[3];
int defaultNdim;
/**
* Check if FFT plan is created for the needed dimension and FFT size.
* Returns true if the plan has been created and false if no plan for specified dimension
* and size exists.
*/
bool useDefaultPlan(int ndim, int N[3]) {
if (ndim != defaultNdim)
return false;
@ -24,18 +33,57 @@ public:
virtual ~BaseFFT() { }
/** Setup FFT - init FFT library used by chosen device. */
virtual int setupFFT(int ndim, int N[3]) = 0;
/** Setup real to complex FFT - init FFT library used by chosen device. */
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
/** Setup complex to real FFT - init FFT library used by chosen device. */
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
/** Clean up. */
virtual int destroyFFT() = 0;
/**
* Execute C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
int streamId = -1, bool forward = true) = 0;
/**
* Execute inverse C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Normalize the FFT or IFFT.
* mem_ptr - memory to complex data.
*/
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Execute R2C FFT.
* real_ptr - real input data for FFT, comp_ptr - memory on the device where
* results for the FFT are stored as complex numbers.
*/
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0;
/**
* Execute C2R FFT.
* real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
*/
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0;
/**
* Normalize CR FFT.
*/
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
};
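A minimal sketch of how the interface above is meant to be driven (assumptions: a concrete BaseFFT instance such as CudaFFT, and a device buffer holding N[0]*N[1]*N[2] complex values obtained through the corresponding base class; not library code):

int runForwardInverse(BaseFFT *fft, void *mem_ptr) {
    int N[3] = {64, 64, 64};
    if (fft->setupFFT(3, N) != DKS_SUCCESS)     // create or reuse the FFT plan
        return DKS_ERROR;
    fft->executeFFT(mem_ptr, 3, N);             // forward C2C transform, in place
    fft->executeIFFT(mem_ptr, 3, N);            // inverse C2C transform, in place
    fft->normalizeFFT(mem_ptr, 3, N);           // undo the size scaling after the inverse
    return fft->destroyFFT();                   // release the plan
}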

View File

@ -4,24 +4,27 @@
#include <iostream>
#include <cmath>
/**
* Interface to implement Greens function calculations for OPAL.
*/
class GreensFunction {
public:
virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */
/** Calculate the Greens integral, as defined in OPAL. */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */
/** Integration of rho2_m, see OPAL for more details. */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0;
/** mirror rho2_m field */
/** mirror rho2_m field. */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */
/** multiply two complex fields from device memory. */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
};

View File

@ -5,17 +5,22 @@
#define BLOCK_SIZE 128
/** Struct to hold voxel position for PET image. */
struct VoxelPosition {
float x;
float y;
float z;
};
/** Struct that holds a pair of detectors that registered an event. */
struct ListEvent {
unsigned detA : 16;
unsigned detB : 16;
};
/**
* Interface to implement PET image reconstruction.
*/
class ImageReconstruction {
protected:
@ -25,7 +30,8 @@ public:
virtual ~ImageReconstruction() { }
/** Caluclate source.
/**
* Calculate source.
* Places a sphere at each voxel position and calculates the avg value and std value of pixels
* that are inside this sphere. All the spheres used have the same diameter.
*/
@ -33,7 +39,8 @@ public:
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Calculate background.
/**
* Calculate background.
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by the parameter diameter, the diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Caluclate source using differente sources.
/**
* Calculate source using different sources.
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of
* each sphere is given by the *diameter array.
@ -52,7 +60,7 @@ public:
int total_sources, int start = 0) = 0;
/**
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
* smaller sphere.
@ -61,7 +69,8 @@ public:
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Generate normalization.
/**
* Generate normalization.
* Goes through detector pairs and, if a detector pair crosses the image, launches a separate kernel
* that updates voxel values in the image on the slope between these two detectors.
*/
@ -69,14 +78,16 @@ public:
void *det_position, int total_det) = 0;
/** Calculate forward projection.
/**
* Calculate forward projection.
* For image reconstruction calculates forward projections.
* see recon.cpp for details
*/
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events) = 0;
/** Calculate backward projection.
/**
* Calculate backward projection.
* For image reconstruction calculates backward projections.
* see recon.cpp for details
*/
@ -84,29 +95,29 @@ public:
void *det_position, void *image_position,
int num_events, int num_voxels) = 0;
/** Set the voxel dimensins on device.
*
/**
* Set the voxel dimensions on the device.
*/
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
/** Set the image edge variables on the device.
*
/**
* Set the image edge variables on the device.
*/
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
/** Set the image edge1 on the device.
*
/**
* Set the image edge1 on the device.
*/
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
/** Set the minimum crystan in one ring values on the device.
*
/**
* Set the minimum crystal in one ring values on the device.
*/
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
float min_CrystalDist_InOneRing1) = 0;
/** Set all other required parameters for reconstruction.
*
/**
* Set all other required parameters for reconstruction.
*/
virtual int setParams(float matrix_distance_factor, float phantom_diameter,
float atten_per_mm, float ring_diameter) = 0;

View File

@ -18,6 +18,17 @@
typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States;
/**
* DKS autotuning class, allows auto-tuning of the defined function.
* Executes the defined function for auto-tuning and searches for optimal parameters to improve
* the function execution time. The function that is auto-tuned, the parameters and their ranges
* need to be set. Includes multiple search methods that search the parameter space to find
* the optimal solution:
* 1) exhaustive search
* 2) line search
* 3) hill climbing
* 4) simulated annealing
*/
class DKSAutoTuning {
private:
@ -36,10 +47,11 @@ private:
int loops_m;
/** Update parameters from a state */
/** Update parameters from a state. */
int setParameterValues(States states);
/** Evaluate the function and set execution time
/**
* Evaluate the function and set the execution time.
* Returns DKS_ERROR if errors occurred during function execution.
* Returns DKS_SUCCESS if function executed as planned.
*/
@ -50,10 +62,11 @@ public:
/** Constructor */
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
/** Destructor */
/** Destructor. */
~DKSAutoTuning();
/** Set function to auto tune.
/**
* Set function to auto tune.
* The caller of setFunction is responsible for binding the correct parameters
* to the function with std::bind.
*/
@ -63,13 +76,19 @@ public:
evaluate_time_m = evaluate_time;
}
/**
* Set function to auto tune.
* The caller of setFunction is responsible for binding the correct parameters
* to the function with std::bind.
*/
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
fd_m = f;
function_name_m = name;
evaluate_time_m = evaluate_time;
}
/** Set parameter for auto tuning.
/**
* Set parameter for auto tuning.
* Provide a pointer to a parameter that will be changed during auto-tuning
* and the min and max values for this element.
*/
@ -85,9 +104,9 @@ public:
/** Perform exhaustive search evaluating all the parameter configurations. */
void exaustiveSearch();
/** Perform auto-tuning.
* Perform line-search auto-tuning by variying parameters one at a time and keeping other
* parameters constant.
/**
* Perform line-search auto-tuning by varying parameters one at a time.
* After one parameter is auto-tuned the next one is varied.
*/
void lineSearch();
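As noted in the setFunction documentation above, the caller binds all fixed arguments before handing the function to the tuner. A hedged sketch, assuming an illustrative Solver type and kernel; Solver, runKernel and the buffer names are not DKS API:

#include <functional>

struct Solver { double runKernel(void *data, int n); };   // hypothetical kernel wrapper

void registerForTuning(DKSAutoTuning &autotuner, Solver &solver, void *devData, int n) {
    // bind the fixed arguments so the tuner can invoke the kernel with no arguments
    std::function<double()> f = std::bind(&Solver::runKernel, &solver, devData, n);
    autotuner.setFunction(f, "runKernel");
    // the pointers to tunable parameters (e.g. block size) are registered separately,
    // each with its min/max range, as described in the parameter documentation above
}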

View File

@ -4,6 +4,7 @@
#include <iostream>
#include <cmath>
/** Tester class for auto-tuning search algorithms. */
class DKSAutoTuningTester {
friend class DKSBaseMuSR;

View File

@ -1,9 +1,3 @@
/** Class to save and load DKS autotunning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
*/
#ifndef DKS_CONFIG
#define DKS_CONFIG
@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
const std::string config_dir = "/.config/DKS";
const std::string config_file = "/autotuning.xml";
/** Class to save and load DKS autotuning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
* TODO: needs an update; boost::filesystem is disabled at the moment, so no configuration file
* is saved and the auto-tuning has no effect.
*/
class DKSConfig {
private:
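For orientation, reading and writing such an XML file with boost property trees follows the pattern sketched below; the key names (autotuning.<function>.<parameter>) are assumptions for illustration, not the actual layout used by DKSConfig:

#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>

void sketchConfigRoundTrip(const std::string &filename) {
    namespace pt = boost::property_tree;
    pt::ptree tree;
    pt::read_xml(filename, tree);                                 // load existing settings
    int blocks = tree.get<int>("autotuning.fft.numblocks", 128);  // default if the key is missing
    tree.put("autotuning.fft.numblocks", blocks * 2);             // store an updated value
    pt::write_xml(filename, tree);                                // save back to disk
}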

View File

@ -9,6 +9,9 @@
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
/**
* Parameter class allows changing the searchable parameters during auto-tuning.
*/
class Parameter {
private:
@ -64,6 +67,10 @@ public:
};
/**
* Struct to hold an auto-tuning state.
* Holds the current value, min, max and the step by which a state can change.
*/
struct State {
double value;
double min;
@ -74,6 +81,12 @@ struct State {
typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States;
/**
* Used by auto-tuning search algorithms to move between parameter configurations.
* Allows moving from one parameter state to another, getting neighboring states,
* moving to neighboring states and saving state information. Print functions are available
* for debugging purposes, to follow how the algorithm moves between states.
*/
class DKSSearchStates {
private:

View File

@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
return DKS_SUCCESS;
}
/*
Info: allcate memory and write data (push)
Return: pointer to memory object
*/
/*
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
void * mem_ptr;
mem_ptr = cuda_allocateMemory(size, ierr);
if (ierr == DKS_SUCCESS)
ierr = cuda_writeData(mem_ptr, in_data, size);
return mem_ptr;
}
*/
/*
Info: read data and free memory (pull)
Return: success or error code
*/
/*
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
ierr = cuda_readData(mem_ptr, out_data, size);
if (ierr == DKS_SUCCESS)
ierr = cuda_freeMemory(mem_ptr);
else
return DKS_ERROR;
if (ierr == DKS_SUCCESS)
return DKS_SUCCESS;
else
return DKS_ERROR;
}
*/
/*
Info: execute function
Return: success or error code
*/
int CudaBase::cuda_executeFunction() {
std::cout << "Execute function" << std::endl;
return DKS_SUCCESS;
}
/*
Info: clean up
Return: success or error code
*/
int CudaBase::cuda_cleanUp() {
std::cout << "clean up" << std::endl;
return DKS_SUCCESS;
}

View File

@ -16,6 +16,11 @@
#define BLOCK_SIZE 128
/**
* CUDA base class handles device setup and basic communication with the device.
* Handles device setup, memory management, data transfers and stream setup for
* asynchronous data transfers and kernel executions.
*/
class CudaBase {
private:
@ -52,13 +57,13 @@ public:
*/
int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array
*
/**
* Create 'size' random numbers on the device and save in mem_ptr array.
*/
int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states
*
/**
* Get a pointer to curand states.
*/
curandState* cuda_getCurandStates();
@ -75,93 +80,98 @@ public:
int cuda_addStream(cudaStream_t tmpStream, int &streamId);
/**
* delete cuda stream
* delete cuda stream.
* success or error code
*/
int cuda_deleteStream(int id);
/**
* delete all streams
* delete all streams.
* success or error code
*/
int cuda_deleteStreams();
/**
* set stream to use
* set stream to use.
* success or error code
*/
int cuda_setStream(int id);
/**
* Info: get stream that is used
* get stream that is used.
* Return: id of the current stream
*/
int cuda_getStreamId();
/**
* Info: reset to default stream
* reset to default stream.
* Return: success or error code
*/
int cuda_defaultStream();
/**
* Info: get number of streams
* get number of streams.
* Return: success or error code
*/
int cuda_numberOfStreams();
/**
* Info: get stream
* get stream.
* Return: stream
*/
cudaStream_t cuda_getStream(int id);
/**
* Get default cublass handle
* Get the default cuBLAS handle.
*/
cublasHandle_t cuda_getCublas();
/**
* Info: get information on cuda devices
* get information on cuda devices.
* Return: success or error code
*/
int cuda_getDevices();
/** Get CUDA device count.
/**
* Get CUDA device count.
* Sets the number of devices on the platform that can use CUDA.
* Returns DKS_SUCCESS
*/
int cuda_getDeviceCount(int &ndev);
/** Get the name of the device.
/**
* Get the name of the device.
* Query the device properties of the used device and set the string device_name
*/
int cuda_getDeviceName(std::string &device_name);
/** Set CUDA device to use.
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR
/**
* Set CUDA device to use.
* If the device index passed in is larger than the number of devices, use
* the default (0) and return DKS_ERROR
*/
int cuda_setDevice(int device);
/** Get unique devices
/**
* Get unique devices.
* Get an array of indices of the unique CUDA devices available on the platform
*/
int cuda_getUniqueDevices(std::vector<int> &devices);
/**
* Info: init device
* Initialize connection to the device.
* Only needed when runtime compilation is used.
* Return: success or error code
*/
int cuda_setUp();
/**
* Info: allocate memory on cuda device
* Allocate memory on cuda device.
* Return: pointer to memory object
*/
void * cuda_allocateMemory(size_t size, int &ierr);
/**
* Info: allocate host memory in pinned memory
* Allocate host memory in pinned memory
* Return: success or error code
*/
template<typename T>
@ -174,7 +184,8 @@ public:
return DKS_SUCCESS;
}
/** Zero CUDA memory.
/**
* Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
@ -189,7 +200,8 @@ public:
return DKS_SUCCESS;
}
/** Zero CUDA memory.
/**
* Zero CUDA memory async.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
@ -209,7 +221,7 @@ public:
}
/**
* Info: write data to memory
* Write data to memory
* Retrun: success or error code
*/
template<typename T>
@ -226,7 +238,7 @@ public:
}
/**
* Info: write data assynchonuously
* Write data asynchronously
* Return: success or error code
*/
template<typename T>
@ -258,7 +270,7 @@ public:
}
/**
* Info: read data from memory
* Read data from memory
* Return: success or error code
*/
template<typename T>
@ -275,7 +287,7 @@ public:
}
/**
* Info: read data async from device memory
* Read data async from device memory
* Return: success or error code
*/
template<typename T>
@ -307,19 +319,19 @@ public:
}
/**
* Info: free memory on device
* Free memory on device
* Return: success or error code
*/
int cuda_freeMemory(void * mem_ptr);
/**
* Info: free page locked memory on host
* Free page locked memory on host
* Return: success or error code
*/
int cuda_freeHostMemory(void * mem_ptr);
/**
* Info: allcate memory and write data (push)
* Allocate memory and write data (push)
* Return: pointer to memory object
*/
template<typename T>
@ -335,7 +347,7 @@ public:
}
/**
* Info: read data and free memory (pull)
* Read data and free memory (pull)
* Return: success or error code
*/
template<typename T>
@ -355,19 +367,8 @@ public:
}
/**
* Info: execute function
* Return: success or error code
*/
int cuda_executeFunction();
/**
* Info: clean up
* Return: success or error code
*/
int cuda_cleanUp();
/**
* Info: sync cuda device
* Sync cuda device.
* Waits till all the tasks on the GPU are finished.
* Return: success or error code
*/
int cuda_syncDevice() {
@ -376,7 +377,7 @@ public:
}
/**
* Page-lock host memory
* Page-lock host memory.
*/
template<typename T>
int cuda_hostRegister(T *ptr, int size) {
@ -390,7 +391,7 @@ public:
}
/**
* Release page locked memory
* Release page locked memory.
*/
template<typename T>
int cuda_hostUnregister(T *ptr) {
@ -403,7 +404,7 @@ public:
}
/**
* Info: print device memory info (total, used, avail)
* Print device memory info (total, used, avail)
* Return: success or error code
*/
int cuda_memInfo() {

View File

@ -8,6 +8,7 @@
#include "CudaBase.cuh"
/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
class CudaChiSquare {
private:

View File

@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
const std::string cudaFunctFooter = "}\n";
/**
* CUDA implementation of ChiSquareRuntime class.
* Implements the ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPUs.
*/
class CudaChiSquareRuntime : public ChiSquareRuntime{
private:
@ -29,65 +33,72 @@ private:
cublasHandle_t defaultCublasRT;
/** Setup to init device
/**
* Setup to init device.
* Create context and init device for RT compilation
*/
void setUpContext();
/** Private function to add function to kernel string
*
/**
* Private function to add function to kernel string.
*/
std::string buildProgram(std::string function);
public:
/** Constructor with CudaBase argument
*
/**
* Constructor with CudaBase argument
*/
CudaChiSquareRuntime(CudaBase *base);
/** Default constructor init cuda device
*
/**
* Default constructor init cuda device
*/
CudaChiSquareRuntime();
/** Default destructor
*
/**
* Default destructor.
*/
~CudaChiSquareRuntime();
/** Compile program and save ptx.
/**
* Compile program and save ptx.
* Add the function string to the calcFunction kernel and compile the program.
* The function must be a valid C math expression. Parameters can be addressed in
* the form par[map[idx]].
*/
int compileProgram(std::string function, bool mlh = false);
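As an illustration of the documented format (a hedged example, not taken from musrfit), a theory string might look like the following; it is wrapped between cudaFunctHeader and cudaFunctFooter and compiled at runtime:

// Hypothetical theory string in the documented form; parameters are addressed via par[map[idx]]
// and "t" is the time argument supplied by cudaFunctHeader. Passed to compileProgram(...).
std::string theory = "par[map[0]] * exp(-t / par[map[1]]) * cos(par[map[2]] * t + par[map[3]])";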
/** Launch selected kernel
/**
* Launch selected kernel.
* Launched the selected kernel from the compiled code.
* Result is put in &result variable
* Result is put in &result variable.
*/
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result);
/** Write params to device.
/**
* Write params to device.
* Write params from double array to mem_param_m memory on the device.
*/
int writeParams(const double *params, int numparams);
/** Write functions to device.
/**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device.
*/
int writeFunc(const double *func, int numfunc);
/** Write maps to device.
/**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device.
*/
int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square.
/**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to be
* the maximum number of elements in any dataset that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in
@ -96,12 +107,14 @@ public:
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square.
/**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps
*/
int freeChiSquare();
/** Check if CUDA device is able to run the chi square kernel.
/**
* Check if CUDA device is able to run the chi square kernel.
* Redundant - all new CUDA devices that support RT compilation will also support
* double precision, there are no other requirements to run chi square on GPU
*/

View File

@ -1,5 +1,6 @@
#include "CudaCollimatorPhysics.cuh"
//constants used in OPAL
//#define M_P 0.93827231e+00
#define M_P 0.93827204e+00
#define C 299792458.0
@ -11,6 +12,7 @@
#define Z_P 1
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
//parameter array indexes
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
@ -28,12 +30,18 @@
#define BLOCK_SIZE 128
#define NUMPAR 13
/**
* CUDA device function for calculating dot product.
*/
__device__ inline double dot(double3 &d1, double3 &d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
}
/**
* CUDA device function to calculate cross product.
*/
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
return tmp;
}
/**
* CUDA device function to calculate arbitrary rotation.
*/
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
double c=cos(Theta);
double s=sin(Theta);
@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
return tmp;
}
/**
* CUDA device function to check if particle is still in material.
* z - particle position, par - parameter array. Particle is considered inside the
* material if z > material starting position and z <= material starting position + material size.
*/
__device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */
@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
}
/**
* CUDA device function to calculate energyLoss for one particle.
* Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
* algorithm are available in the OPAL user guide.
*/
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{
@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
}
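For reference, the mean stopping power behind such an energy-loss step is usually written in the standard Bethe-Bloch form below (PDG notation); the exact constants and corrections used by the kernel follow the OPAL user guide, so treat this only as orientation:

$$ -\left\langle \frac{dE}{dx} \right\rangle = K z^2 \frac{Z}{A}\,\frac{1}{\beta^2}\left[\frac{1}{2}\ln\frac{2 m_e c^2 \beta^2 \gamma^2 W_{\max}}{I^2} - \beta^2 - \frac{\delta(\beta\gamma)}{2}\right] $$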
/**
* CUDA device function for rotation in 2 dimensions.
* For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),
* "Passage of particles through matter"
*/
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
double &normP, double &thetacou, double &deltas, int coord,
double *par)
@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
/**
* CUDA device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* For details on the algorithm see OPAL user guide.
*/
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
bool enableRutherfordScattering)
{
@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
}
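For orientation, the multiple Coulomb scattering angle in such routines is typically taken from the Highland form given in the PDG review cited above for Rot(); the kernel's exact implementation follows the OPAL user guide:

$$ \theta_0 = \frac{13.6\,\mathrm{MeV}}{\beta c p}\, z \sqrt{x/X_0}\,\bigl[1 + 0.038 \ln(x/X_0)\bigr] $$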
/**
* CUDA kernel that performs one step in particle movement through matter.
* One thread is launched for each particle in the simulation. The kernel checks if the particle
* is still in the material, performs energy loss calculations and Coulomb scattering, and marks
* particles that are exiting the material.
* @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering)
@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory
//transfer params and particle positions to shared memory
//R is kept in shared memory in order to reduce register pressure for the kernel
extern __shared__ double smem[];
double *p = (double*)smem;
double3 *R = (double3*)&smem[NUMPAR];
curandState s;
curandState s; //each thread gets its own cuRand state for random number generation
double3 P;
//load parameters to shared memory
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
p[tt] = par[tt];
//sync threads to ensure that parameters are finished loading
__syncthreads();
//there might be some empty threads that do no work
if (idx < numparticles) {
s = state[idx];
R[tid] = data[idx].Rincol;
P = data[idx].Pincol;
s = state[idx]; //load cuRand state to local memory
R[tid] = data[idx].Rincol; //load position to shared memory
P = data[idx].Pincol; //load momentum to local memory
bool pdead = false;
volatile double sq = sqrt(1.0 + dot(P, P));
double Eng;
//check if particle is still in the material
if (checkHit(R[tid].z, p)) {
//calculate energy loss
Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p);
//check if particle is not dead
if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P));
//calculate Coulomb scattering
P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
//update particle momentum
data[idx].Pincol = P;
} else {
//mark particle as dead (-1)
data[idx].label = -1;
}
//update cuRand state
state[idx] = s;
} else {
//particle exits material - drift and mark as exiting (-2)
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -272,12 +325,23 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
}
//update particle position
data[idx].Rincol = R[tid];
}
}
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
/**
* CUDA kernel that performs one step in particle movement through matter using SoA particles.
* Identical to kernelCollimatorPhysics, only uses particles stored as a structure of arrays.
* Deprecated - GPU version does not use SoA.
* @param[in] data structure of arrays containing particle data
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles,
bool enableRutherfordScattering)
{
@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
}
/**
* Device function to switch off unitless positions.
*/
inline __device__ void unitlessOff(double3 &a, const double &c) {
a.x *= c;
a.y *= c;
a.z *= c;
}
/**
* Device function to switch on unitless positions.
*/
inline __device__ void unitlessOn(double3 &a, const double &c) {
a.x /= c;
a.y /= c;
a.z /= c;
}
//switch to unitless positions with dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOn(R, dtc);
unitlessOn(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch to unitless positions with dt*c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOff(R, dtc);
unitlessOff(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitless positions with dt*c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] npart number of particles
* @param[in] dtc dt*c
*/
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
//get global id and thread id
@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
}
}
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id
@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
}
}
/**
* CUDA kernel to perform particle kick.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gEf array of electric field values at the particle positions
* @param[in] *gBf array of magnetic field values at the particle positions
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
}
struct compare_particle
{
int threshold;
compare_particle() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART p1, CUDA_PART p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART p1) {
return p1.label < threshold;
}
};
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
struct less_then
{
__host__ __device__
bool operator()(int x)
{
return x < 0;
}
};
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{

View File

@ -20,7 +20,8 @@
#include "CudaBase.cuh"
/**
* Structure for storing particle on GPU
* Structure for storing particle on GPU or MIC as AoS.
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
*/
typedef struct __align__(16) {
int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
} CUDA_PART;
/**
* Structure for storing particle on GPU
* Structure for storing particle on GPU as AoS
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/
typedef struct {
int label;
@ -47,7 +51,8 @@ typedef struct {
} CUDA_PART_SMALL;
/**
* Structure for storing particle on GPU
* Structure for storing particle on GPU as SoA.
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
*/
typedef struct {
int *label;
@ -65,6 +70,9 @@ typedef struct {
/**
* Structure for storing particle on GPU
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/
typedef struct {
int *label;
@ -73,9 +81,37 @@ typedef struct {
double3 *Pincol;
} CUDA_PART2_SMALL;
/** CudaCollimatorPhysics class.
/**
* Operator used in thrust sort to compare particles by label.
* Used to move dead particles to the end of array, since they have label -1 or -2.
*/
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
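A hedged sketch of how this comparator can drive the sort and the add-back count with thrust (illustrative only; the actual CollimatorPhysicsSort implementation may differ):

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/count.h>

void sortAndCountAddback(CUDA_PART_SMALL *dev_particles, int numparticles, int &numaddback) {
    thrust::device_ptr<CUDA_PART_SMALL> p(dev_particles);
    compare_particle_small comp;
    comp.set_threshold(0);                        // labels below 0 are dead (-1) or leaving (-2)
    thrust::sort(p, p + numparticles, comp);      // live particles (label 0) end up first
    numaddback = (int)thrust::count_if(p, p + numparticles, comp);  // particles to hand back to the host
}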
/**
* CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Contains kernels that execute CollimatorPhysics functions from OPAL.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
*/
class CudaCollimatorPhysics : public DKSCollimatorPhysics {
@ -86,32 +122,44 @@ private:
public:
/** Constructor with CudaBase argument
*
/**
* Constructor with CudaBase as argument.
* Create a new instance of CudaCollimatorPhysics using an existing CudaBase object.
*/
CudaCollimatorPhysics(CudaBase *base) {
m_base = base;
base_create = false;
}
/** Constructor - empty. */
/**
* Empty constructor.
* Create a new instance of CudaCollimatorPhysics with its own CudaBase.
*/
CudaCollimatorPhysics() {
m_base = new CudaBase();
base_create = true;
}
/** Destructor - empty */
/**
* Destructor.
* Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
*/
~CudaCollimatorPhysics() {
if (base_create)
delete m_base;
};
/** Execute collimator physics kernel.
/**
* Execute collimator physics kernel.
*
*/
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numpartices, bool enableRutherforScattering = true);
/**
* Special case CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
return DKS_ERROR;
}
/** Sort particle array on GPU.
/**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array
*/
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,18 +187,25 @@ public:
return DKS_ERROR;
}
/** BorisPusher push function for integration from OPAL.
/**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1);
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL
/**
* BorisPusher push function with transformto function from OPAL.
* ParallelTTracker integration from OPAL implemented in CUDA.
* For more details see the ParallelTTracker documentation in OPAL.
*/

View File

@ -10,6 +10,10 @@
#include "../Algorithms/FFT.h"
#include "CudaBase.cuh"
/**
* Cuda FFT class based on BaseFFT interface.
* Uses the cuFFT library to perform FFTs on Nvidia GPUs.
*/
class CudaFFT : public BaseFFT {
private:
@ -34,7 +38,7 @@ public:
~CudaFFT();
/**
* Info: init cufftPlans witch can be reused for all FFTs of the same size and type
* Init cufftPlans which can be reused for all FFTs of the same size and type
* Return: success or error code
*/
int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
/**
* Info: destroy default FFT plans
* Destroy default FFT plans
* Return: success or error code
*/
int destroyFFT();
/*
Info: execute complex to complex double precision fft using cufft library
Return: success or error code
*/
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
/*
Info: execute ifft
Return: success or error code
*/
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize using cuda kernel for complex to complex iFFT
Return: success or error code
*/
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute real to complex double precision FFT
Return: success or error code
*/
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: exectue complex to real double precision FFT
Return: success or error code
*/
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
};

View File

@ -12,6 +12,7 @@
#include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh"
/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
class CudaGreensFunction : public GreensFunction{
private:

View File

@ -10,6 +10,7 @@
#include "../Algorithms/ImageReconstruction.h"
#include "CudaBase.cuh"
/** CUDA implementation of ImageReconstruction interface. */
class CudaImageReconstruction : public ImageReconstruction {
private:

View File

@ -1,11 +1,3 @@
/** DKSBase class.
* DKSBase.h
* Author: Uldis Locans
* Date: 15.09.2014
* Base class of Dynamic Kernel Scheduler that handles the function calls
* from host application to DKS
*/
#ifndef H_DKS_BASE
#define H_DKS_BASE
@ -41,7 +33,12 @@
#include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */
/**
* API for handling communication function calls to DKS library.
* DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
* memory management, data transfer and other basic communication functions between
* the host and device.
*/
class DKSBase {
private:
@ -74,7 +71,7 @@ protected:
DKSConfig dksconfig;
/**
* Check if current API is set to OpenCL
* Check if current API is set to OpenCL.
* Return true/false whether the current API is OpenCL
*/
bool apiOpenCL();
@ -91,11 +88,11 @@ protected:
*/
bool apiOpenMP();
/** Check if device is GPU */
/** Check if device is GPU. */
bool deviceGPU();
/** Check if device is CPU */
/** Check if device is CPU. */
bool deviceCPU();
/** Check if device is MIC */
/** Check if device is MIC. */
bool deviceMIC();
/**

View File

@ -20,6 +20,11 @@
#include "OpenCL/OpenCLChiSquareRuntime.h"
#endif
/**
* API to handle musrfit calls to DKS library.
* Using the ChiSquareRuntime interface allows calling chi square functions on the
* GPU or CPU using CUDA or OpenCL.
*/
class DKSBaseMuSR : public DKSFFT {
private:

View File

@ -24,6 +24,10 @@
#include "MIC/MICFFT.h"
#endif
/**
* API to handle calls to DKSFFT.
* Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
*/
class DKSFFT : public DKSBase {
private:

View File

@ -10,6 +10,9 @@
#include "CUDA/CudaImageReconstruction.cuh"
#endif
/**
* API to handle PET image reconstruction calls.
*/
class DKSImageRecon : public DKSBase {
private:
@ -22,87 +25,88 @@ public:
~DKSImageRecon();
/** Image reconstruction analaysis calculate source.
*
*
/**
* Image reconstruction analysis: calculate source.
*/
int callCalculateSource(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source.
*
*
/**
* Image reconstruction analysis: calculate background.
*/
int callCalculateBackground(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source.
*
*
/**
* Image reconstruction analysis: calculate sources.
*/
int callCalculateSources(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source.
*
*
/**
* Image reconstruction analysis: calculate backgrounds.
*/
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0);
/** Image reconstruction - generate normalization.
*
/**
* Image reconstruction - generate normalization.
*/
int callGenerateNormalization(void *recon, void *image_position,
void *det_position, int total_det);
/** Image reconstruction - forward correction.
*
/**
* Image reconstruction - forward correction.
*/
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events);
/** Image reconstruction - backward projection.
*
/**
* Image reconstruction - backward projection.
*/
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
void *det_position, void *image_position,
int num_events, int num_voxels);
/** Set the voxel dimensins on device.
/**
* Set the voxel dimensions on the device.
* Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values.
*/
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
/** Set the image edge.
/**
* Set the image edge.
* Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values.
*/
int setEdge(float x_edge, float y_edge, float z_edge);
/** Set the image edge1.
/**
* Set the image edge1.
* Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values.
*/
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
/** Set the minimum crystan in one ring values.
/**
* Set the minimum crystal in one ring values.
* Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values.
*/
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
/** Set all other required parameters for reconstruction.
/**
* Set all other required parameters for reconstruction.
* Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values.

View File

@ -33,6 +33,11 @@
#include "MIC/MICCollimatorPhysics.h"
#endif
/**
* API to handle OPAL calls to DKS library.
* Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
* functions.
*/
class DKSOPAL : public DKSFFT {
private:

View File

@ -26,6 +26,9 @@
#define MIC_WIDTH 128
/** MIC Base class handles device setup and basic communication with the device.
* Handles device setup, memory management and data transfers.
*/
class MICBase {
private:
@ -45,57 +48,59 @@ public:
int m_device_id;
/* constructor */
/** constructor */
MICBase();
/* destructor */
/** destructor */
~MICBase();
/*
Info: create MKL rand streams for each thread
Return: success or error code
/**
* Create MKL rand streams for each thread
* Return: success or error code
*/
int mic_createRandStreams(int size);
/*
Info: delete MKL rand streams
Return: succes or error code
/**
* Delete MKL rand streams
* Return: success or error code
*/
int mic_deleteRandStreams();
/*
Info: create a new signal for the mic
Return: success or error code
/**
* Create a new signal for the mic.
* Signals can be used for asynchronous data transfers.
* Return: success or error code
*/
int mic_createStream(int & streamId);
/*
Info: get the signal from the vector
Return: mic signal
/**
* Info: get the signal from the vector.
* Return: mic signal
*/
int& mic_getStream(int id);
/*
Info: delete streams
Return: success or error code
/**
* Info: delete streams.
* Return: success or error code
*/
int mic_deleteStreams();
/*
Info: set device id
Return: success or error code
/**
* Info: set device id.
* Return: success or error code
*/
int mic_setDeviceId(int id);
/*
Info: get mic devices
Return: success or error code
/**
* Info: get mic devices.
* Prints information about mic devices.
* Return: success or error code
*/
int mic_getDevices();
/*
Info: allocate memory on MIC device
Return: success or error code
/**
* Allocate memory on MIC device.
* Return: success or error code
*/
template<typename T>
void * mic_allocateMemory(int size) {
@ -109,9 +114,9 @@ public:
return tmp;
}
/*
Info: transfer data to device
Return: success or error code
/**
* Transfer data to device.
* Return: success or error code
*/
template<typename T>
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
@ -123,9 +128,9 @@ public:
return DKS_SUCCESS;
}
/*
Info: write data to device, non-blocking
Return: success or error code
/**
* Write data to device, non-blocking.
* Return: success or error code
*/
template<typename T>
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
@ -139,9 +144,9 @@ public:
}
/*
Info: read data from device
Return: success or error code
/**
* Read data from device
* Return: success or error code
*/
template<typename T>
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
@ -154,9 +159,9 @@ public:
return DKS_SUCCESS;
}
/*
Info: read data from device waiting for signal
Return: success or error code
/**
* Read data from device waiting for signal
* Return: success or error code
*/
template<typename T>
int mic_readDataAsync(const void * data_ptr, void * result, int size,
@ -172,9 +177,9 @@ public:
}
/*
Info: wait till all the signals are complete
Return siccess or error code
/**
* Wait till all the signals are complete
* Return: success or error code
*/
int mic_syncDevice() {
@ -193,9 +198,9 @@ public:
}
/*
Info: free memory on device
Return: success or error code
/**
* Free memory on device
* Return: success or error code
*/
template<typename T>
int mic_freeMemory(void * data_ptr, int size) {
@ -210,9 +215,9 @@ public:
return DKS_SUCCESS;
}
/*
Info: allocate memory and write data to device
Return: success or error code
/**
* Allocate memory and write data to device
* Return: success or error code
*/
template<typename T>
void * mic_pushData(const void * data, int size) {
@ -227,9 +232,9 @@ public:
return tmp_ptr;
}
/*
Info: read data and free memory on device
Return: success or erro code
/**
* Read data and free memory on device
* Return: success or error code
*/
template<typename T>
int mic_pullData(void * data_ptr, void * result, int size) {

View File

@ -14,6 +14,9 @@
#include <offload.h>
#include "MICBase.h"
/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices.
* Not complete and untested because of the poor performance of first MIC devices.
*/
class MICChiSquare {
MICBase *m_micbase;

View File

@ -22,22 +22,34 @@
#define I_M 10
#define DT_M 11
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
}
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic))
double dot(double dx, double dy, double dz) {
return (dx * dx + dy * dy + dz * dz);
}
/**
* MIC device function to check if particle is still in material.
*/
__declspec(target(mic))
bool checkHit(double &z, double *par) {
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
}
/**
* MIC device function for rotation in 2 dimensions.
*/
__declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane,
double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses AoS to store particle positions and momentum, parallelized using OpenMP.
* For details on the algorithm see the OPAL user guide.
* Deprecated in favor of the SoA data layout.
*/
__declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
}
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses SoA to store particle positions and momentum, parallelized using OpenMP.
* For details on the algorithm see the OPAL user guide.
*/
__declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
void coulombScat(double *rx, double *ry, double *rz,
double *px, double *py, double *pz, int *label,
double *par, VSLStreamStatePtr &stream, int ii, int size)
{
//arrays for temporary storage, each core processes MIC_WIDTH particles
double normP[MIC_WIDTH] __attribute__((aligned(64)));
double deltas[MIC_WIDTH] __attribute__((aligned(64)));
double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
double z2[MIC_WIDTH] __attribute__((aligned(64)));
double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
//simd instruction tells the compiler it is safe to vectorize the loop
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
}
}
//vectorize the loop
#pragma vector aligned
#pragma simd
for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
}
}
//generate array of random numbers
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
}
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using the Bethe-Bloch equation. More details on the energy loss
* algorithm are available in the OPAL user guide.
*/
__declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
pdead = 1;
}
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using the Bethe-Bloch equation. More details on the energy loss
* algorithm are available in the OPAL user guide.
*/
__declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reusing the memory already allocated on the MIC;
the memory allocation and data transfer need to be handled beforehand */
#pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//loop through particles; if checkHit fails, set label to -2 and update R.x
#pragma omp for simd
for (int i = 0; i < numparticles; i++) {
if ( !checkHit(data[i].Rincol.z, par) ) {
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
{
//cast device memory pointers to appropriate types
int *label = (int*)label_ptr;
unsigned *localID = (unsigned*)localID_ptr;
double *rx = (double*)rx_ptr;
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reusing the memory already allocated on the MIC;
the memory allocation and data transfer need to be handled beforehand */
#pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \

View File

@ -26,6 +26,12 @@ typedef struct {
} MIC_PART_SMALL;
/**
* MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implements OPAL's collimator physics for particle-matter interactions using OpenMP
* and offload mode targeting Intel Xeon Phi processors.
* For detailed documentation on the CollimatorPhysics functions see the OPAL documentation.
*/
class MICCollimatorPhysics : public DKSCollimatorPhysics {
private:

View File

@ -10,6 +10,10 @@
#include "../Algorithms/FFT.h"
#include "MICBase.h"
/**
* MIC FFT based on BaseFFT interface.
* Uses the MKL library to offload FFTs to Intel Xeon Phi devices.
*/
class MICFFT : public BaseFFT {
private:

View File

@ -15,6 +15,7 @@
#define DKS_SUCCESS 0
#define DKS_ERROR 1
/** OpenMP offload implementation of the GreensFunction calculation for OPAL's Poisson solver. */
class MICGreensFunction : public GreensFunction {
private:

View File

@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
return p;
}
/**
* Merge sort implementation for Intel MIC.
* Parallelized over all the MIC cores using OpenMP tasks.
*/
template <typename T>
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
}
}
/**
* Quicksort algorithm, developed for use on Intel MIC devices.
*/
template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
}
/**
* Insertion sort of @p list, developed for use on Intel MIC.
* Used by quick_sort to sort small lists.
*/
template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
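A brief usage sketch of the parallel merge sort; the comparator semantics (comp(a, b) true meaning a is placed before b) and the sample values are assumptions for illustration.

// Hypothetical sketch: sort a small array with the MIC merge sort.
bool ascending(double a, double b) { return a < b; }   // assumed ordering convention

void sortExample() {
    double values[8] = {3.0, 7.0, 1.0, 9.0, 4.0, 2.0, 8.0, 5.0};
    // OpenMP tasks are spawned internally to sort the sublists in parallel
    merge_sort<double>(values, 8, ascending);
}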

View File

@ -1,16 +1,3 @@
/*
Name: OpenCLBase
Author: Uldis Locans
Info: OpenCL base class to handle all the common details associated
with kernel launch on OpenCL device
Date: 2014.09.18
*/
#ifndef H_OPENCL_BASE
#define H_OPENCL_BASE
@ -32,7 +19,7 @@
#include "../DKSDefinitions.h"
/* struct for random number state */
/** struct for random number state. */
typedef struct {
double s10;
double s11;
@ -44,168 +31,194 @@ typedef struct {
bool gen;
} RNDState;
/**
* OpenCL base class to handle device setup and basic communication with the device.
* Handles initialization of the OpenCL device, memory management, data transfer and kernel launch.
* The OpenCL kernels are located in separate files in the OpenCLKernels folder; the OpenCLBase
* class contains methods to read the kernel files, compile the kernel code and launch kernels
* from the compiled code. Which kernel file needs to be loaded for a specific function is
* handled by the class that launches the kernel.
*/
class OpenCLBase {
private:
//variables containing OpenCL device and platform ids
static cl_platform_id m_platform_id;
static cl_device_id m_device_id;
//variables containing the compiled OpenCL program and kernel
cl_context_properties m_context_properties[3];
cl_program m_program;
cl_kernel m_kernel;
//variables for tracking OpenCL events
static cl_event m_last_event;
cl_int m_num_events;
std::vector<cl_event> m_events;
//currently loaded kernel file
char * m_kernel_file;
//type of device used by OpenCL
cl_device_type m_device_type;
/*
Name: getPlatforms
Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
Return: success or error code
/**
* Get all available OpenCL platforms.
* Get all available platforms and save them in m_platform_ids, saving the number of platforms.
* Return: success or error code
*/
int ocl_getPlatforms();
/*
Name: getDevice
Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
ReturnL success or error code
/**
* Get first available OpenCL device of specified type.
* Get the first available device and save the device id and platform id for this device;
* device name: (-gpu, -mic, -cpu).
* Return: success or error code
*/
int ocl_getDevice(const char* device_name);
/*
Name getDeviceType
Info: get device type from device name (-gpu, -cpu, -mic)
Return: success or error code
/**
* Get cl_device_type from the specified device name.
* Get the device type from the device name (-gpu, -cpu, -mic).
* Return: success or error code
*/
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
/*
Name: createContext
Info: create context with specified device
Return: success or error code
/**
* Create OpenCL context with specified device.
* Return: success or error code
*/
int ocl_createContext();
/*
Name: buildProgram
Info: build program from specified kernel file
Return: success or error code
/**
* Build program from specified kernel file.
* Return: success or error code.
*/
int ocl_buildProgram(const char* kernel_file);
/** Compile program from kernel source string
*
/**
* Compile program from kernel source string.
* Takes a string read from an OpenCL kernel file, saved in kernel_source, and compiles the
* OpenCL program, which can then be executed on the device.
* opts is a string specifying additional compiler flags.
*/
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
protected:
//memory for random number states
int defaultRndSet;
cl_mem defaultRndState;
public:
//OpenCL context and command queue
static cl_context m_context;
static cl_command_queue m_command_queue;
/*
constructor
/**
* Constructor.
*/
OpenCLBase();
/*
destructor
/**
* Destructor.
*/
~OpenCLBase();
/*
Create RND states
Return: success or error code
/**
* Allocate memory for size random number states and initialize the RND states.
* Uses the AMD clRNG library for random numbers.
* This library is only compatible with AMD devices.
*/
int ocl_createRndStates(int size);
/* Create an array of random numbers on the device
*
/**
* Create an array of random numbers on the device.
* Fills the mem_ptr array with random numbers.
*/
int ocl_createRandomNumbers(void *mem_ptr, int size);
/*
Destroy rnd states
Return: success or error code
/**
* Destroy rnd states and free device memory.
* Return: success or error code
*/
int ocl_deleteRndStates();
/*
Name: getAllDevices
Info: get all available devices
ReturnL success or error code
/**
* Prints info about all the available platforms and devices.
* Can be used for information purposes to see what devices are available on the system.
* Return: success or error code.
*/
int ocl_getAllDevices();
/** Get the OpenCL device count for the set type of device
*
/**
* Get the OpenCL device count for the set type of device.
* The device count is returned in the ndev parameter; returns success or error code.
*/
int ocl_getDeviceCount(int &ndev);
/** Get the name of the device used
/**
* Get the name of the device currently in use.
*/
int ocl_getDeviceName(std::string &device_name);
/** Set the device to use for OpenCL kernels.
* device id to use is passed as integer.
/**
* Set the device to use for OpenCL kernels.
* The device id to use is passed as an integer.
*/
int ocl_setDevice(int device);
/** Get a list of all the unique devices of the same type that can run OpenCL kernels
/**
* Get a list of all the unique devices of the same type that can run OpenCL kernels.
* Used when GPUs of different types might be present on the system.
*/
int ocl_getUniqueDevices(std::vector<int> &devices);
/*
Name: setUp
Info: set up opencl resources
Return: success or error code
/**
* Initialize OpenCL connection with a device of specified type.
* Checks whether the specified device is available and creates a context and command queue.
* Returns success or error code.
*/
int ocl_setUp(const char* device_name);
/*
Name: loadKernel
Info: load and compile opencl kernel file if it has changed
Return: success or error code
/**
* Given an OpenCL kernel file name, loads the content and compiles the OpenCL code.
* The OpenCL kernel file is loaded and compiled only if it has changed.
* Return: success or error code
*/
int ocl_loadKernel(const char* kernel_file);
/** Build program from kernel source.
/**
* Build program from kernel source.
* Builds a program from source code provided in kernel_source.
* If compilation fails, DKS_ERROR is returned.
*/
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
/*
Name: allocateMemory
Info: allocate memory on device
Return: return pointer to memory
/**
* Allocate memory on the device.
* Return: pointer to the allocated memory
*/
cl_mem ocl_allocateMemory(size_t size, int &ierr);
/*
Name: allocateMemory
Info: allocate memory on device
Return: return pointer to memory
/**
* Allocate memory of a specific type on the device.
* The available types are the cl_mem_flags values listed in the OpenCL documentation:
* CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR,
* CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
* Return: pointer to the allocated memory
*/
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer
* Set all the elemetns in the device array to zero
/**
* Fill an OpenCL memory buffer.
* Sets all the elements in the device array to the given value (typically zero).
*/
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@ -218,91 +231,89 @@ public:
return DKS_SUCCESS;
}
/*
Name: writeData
Info: write data to device memory (needs ptr to mem object)
Return: success or error code
/**
* Write data to device memory (needs ptr to mem object)
* Return: success or error code
*/
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/*
Name: copyData
Info: copy data from one buffer on the device to another
Return: success or error code
/**
* Copy data from one buffer on the device to another
* Return: success or error code
*/
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
/*
Name: createKernel
Info: create kernel from program
Return: success or error code
/**
* Create kernel from compiled OpenCL program.
* Return: success or error code
*/
int ocl_createKernel(const char* kernel_name);
/*
Name: setKernelArgs
Info: set opencl kernel arguments
Return: success or error code
/**
* Set arguments for the kernel that will be launched.
* Return: success or error code
*/
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
/*
Name: executeKernel
Info: execute selected kernel (needs kernel parameters)
Return: success or error code
/**
* Execute selected kernel.
* Before the kernel can be executed, buildProgram and createKernel must have been called,
* the kernel specified in executeKernel must exist in the compiled source, and the necessary
* kernel arguments must be set.
* Return: success or error code
*/
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
/*
Name: readData
Info: read data from device (needs pointer to mem object)
Return: success or error code
/**
* Read data from device (needs pointer to mem object).
* Return: success or error code
*/
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/*
Name: freeMemory
Info: free device memory (needs ptr to mem object)
Return: success or error code
/**
* Free device memory (needs ptr to mem object).
* Return: success or error code
*/
int ocl_freeMemory(cl_mem mem_ptr);
/*
Name: cleanUp
Info: free opencl resources
Return: success or error code
/**
* Free OpenCL resources.
* Deletes the kernel, compiled program and command queue, and closes the connection
* to the device by releasing the context.
* Return: success or error code
*/
int ocl_cleanUp();
/*
Name: deviceInfo
Info: print device info (mostly for debugging purposes)
Return: success or error code
/**
* Print info of currently selected device.
* Mostly for debugging purposes, but in verbose mode can be used to see device properties.
* Return: success or error code
*/
int ocl_deviceInfo(bool verbose = true);
/* Check OpenCL kernel.
* Query device and check if it can run the kernel with required parameters
/**
* Check OpenCL kernel.
* Query the device and check if it can run the kernel with the required parameters.
* Also check the available OpenCL extensions - useful for checking supported device
* features, like double precision.
*/
int ocl_checkKernel(const char* kernel_name, int work_group_size,
bool double_precision, int &threadsPerBlock);
/*
Name: clearEvents
Info: clear saved events (for debuging purposes)
Return: nothing
/**
* Clear the event list.
* Events can be used for timing and synchronization purposes.
*/
void ocl_clearEvents();
/*
Name: eventInfo
Info: print information about kernel timings (for debuging purposes)
Return: nothing
/**
* Print information about kernel timings from the event list.
* For debugging purposes.
*/
void ocl_eventInfo();
/*
Return current command queue
/**
* Return current command queue.
*/
cl_command_queue ocl_getQueue() { return m_command_queue; }
};
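A sketch of the typical call sequence through OpenCLBase, put together from the methods declared above; the kernel file name square.cl, the kernel name squareKernel and the buffer size are hypothetical.

#include <vector>

// Hypothetical sketch: set up the device, load and run one kernel, read the result back.
void runExample() {
    OpenCLBase ocl;
    int ierr = ocl.ocl_setUp("-gpu");                    // connect to the first available GPU
    ierr = ocl.ocl_loadKernel("square.cl");              // hypothetical kernel file

    const size_t n = 1024;
    std::vector<double> host(n, 2.0);
    cl_mem dev = ocl.ocl_allocateMemory(n * sizeof(double), ierr);
    ierr = ocl.ocl_writeData(dev, host.data(), n * sizeof(double));

    ierr = ocl.ocl_createKernel("squareKernel");         // hypothetical kernel name
    ierr = ocl.ocl_setKernelArg(0, sizeof(cl_mem), &dev);
    size_t work_items = n;
    ierr = ocl.ocl_executeKernel(1, &work_items);        // 1D launch, default work-group size

    ierr = ocl.ocl_readData(dev, host.data(), n * sizeof(double));
    ierr = ocl.ocl_freeMemory(dev);
    ierr = ocl.ocl_cleanUp();
}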

View File

@ -14,7 +14,7 @@
#define DKS_SUCCESS 0
#define DKS_ERROR 1
/** Deprecated, SimpleFit implementation of ChiSquare. */
class OpenCLChiSquare {
private:

View File

@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
}
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
//write params to gpu
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
return ierr;
}
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
if (numfunc == 0)
return DKS_SUCCESS;
//write function values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
return ierr;
}
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
if (nummap == 0)
return DKS_SUCCESS;
//write map values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
return ierr;
}
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
freeChiSquare();
}
//allocate temporary memory
//allocate temporary memory; memory is allocated for the data set, parameters, functions and maps
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
if (size_func == 0)
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
int ierr = DKS_ERROR;
if (initDone_m) {
//free memory
//free GPU memory
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
return DKS_ERROR;
}
//check the GPU kernel
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
return ierr;

View File

@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
const std::string openclFunctFooter = "}\n";
/**
* OpenCL implementation of ChiSquareRuntime class.
* Implements the ChiSquareRuntime interface to allow musrfit to target devices that
* support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
*/
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
private:
OpenCLBase *m_oclbase;
/** Private function to add user defined function to kernel string
*
/**
* Private function to add user defined function to kernel string.
*/
std::string buildProgram(std::string function);
/**
* Launch parallel reduction kernel to calculate the sum of data array
*/
double calculateSum(cl_mem data, int length);
public:
/** Constructor wiht openclbase argument
*
/**
* Constructor with an OpenCLBase argument.
*/
OpenCLChiSquareRuntime(OpenCLBase *base);
/** Default constructor
*
/**
* Default constructor
*/
OpenCLChiSquareRuntime();
/** Default destructor
*
/**
* Default destructor
*/
~OpenCLChiSquareRuntime();
/** Compile program and save ptx.
/**
* Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program
* The function must be a valid C math expression. Parameters can be addressed in
* the form par[map[idx]].
*/
int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel
/**
* Launch selected kernel.
* Launches the selected kernel from the compiled code.
* The result is put in the result variable.
*/
@ -64,22 +74,26 @@ public:
double timeStart, double timeStep,
double &result);
/** Write params to device.
/**
* Write params to device.
* Write params from double array to mem_param_m memory on the device.
*/
int writeParams(const double *params, int numparams);
/** Write functions to device.
/**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device.
*/
int writeFunc(const double *func, int numfunc);
/** Write maps to device.
/**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device.
*/
int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square.
/**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. size_data needs to be
* the maximum number of elements in any dataset that will be used for calculations. size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in
@ -87,14 +101,16 @@ public:
*/
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square.
/**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps
*/
int freeChiSquare();
/** Check MuSR kernels for necessary resources.
/**
* Check MuSR kernels for necessary resources.
* Query device properties to check if sufficient resources are
* available to run the kernels
* available to run the kernels. Also checks if double precision is enabled on the device.
*/
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
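A sketch of a plausible fitting sequence with this class; the theory function string, the array sizes and the device buffers mem_data and mem_err are assumptions for illustration and would normally be prepared on the device beforehand.

// Hypothetical sketch: "oclbase" is an initialized OpenCLBase; mem_data and mem_err
// are device buffers holding the data set and its errors; params and maps are host arrays.
void fitExample(OpenCLBase *oclbase, void *mem_data, void *mem_err,
                const double *params, const int *maps) {
    OpenCLChiSquareRuntime chisq(oclbase);
    int threadsPerBlock = 0;
    chisq.checkChiSquareKernels(1, threadsPerBlock);      // verify the device can run fitType 1
    chisq.initChiSquare(1000, 4, 0, 4);                   // data, parameter, function, map sizes
    chisq.compileProgram("par[map[0]] * exp(-t * par[map[1]])");  // hypothetical theory function
    chisq.writeParams(params, 4);
    chisq.writeMap(maps, 4);
    double result = 0.0;
    // fitType, data, errors, length, numpar, numfunc, nummap, timeStart, timeStep, result
    chisq.launchChiSquare(1, mem_data, mem_err, 1000, 4, 0, 4, 0.0, 0.01, result);
    chisq.freeChiSquare();
}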

View File

@ -17,12 +17,16 @@
#include "boost/compute/core.hpp"
*/
/** Double3 structure for use in OpenCL code. */
typedef struct {
double x;
double y;
double z;
} Double3;
/**
* Structure for storing particles in OpenCL code.
*/
typedef struct {
int label;
unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
/**
* OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implements CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
*/
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
private:
@ -42,16 +50,20 @@ private:
public:
/* constructor */
/**
* Constructor with OpenCLBase as argument.
* Create a new instance of OpenCLCollimatorPhysics using an existing OpenCLBase object.
*/
OpenCLCollimatorPhysics(OpenCLBase *base) {
m_oclbase = base;
}
/* destructor */
/**
* Destructor.
*/
~OpenCLCollimatorPhysics() {
}
/* execute degrader code on device */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherforScattering = true);
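A minimal construction sketch, assuming an OpenCL device connection can be opened here and that the particle and parameter buffers have already been filled on the device; the names mem_particles and mem_params are hypothetical.

// Hypothetical sketch of how the class would typically be set up and invoked.
void degraderExample(void *mem_particles, void *mem_params, int numparticles) {
    OpenCLBase oclbase;
    oclbase.ocl_setUp("-gpu");                         // connect to a GPU device
    OpenCLCollimatorPhysics degrader(&oclbase);
    // run the particle-matter interaction kernel on the device-resident particles
    degrader.CollimatorPhysics(mem_particles, mem_params, numparticles);
}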

View File

@ -1,14 +1,3 @@
/*
Name: OpenCLFFT
Author: Uldis Locans
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
Data: 19.09.2014
*/
#ifndef H_OPENCL_FFT
#define H_OPENCL_FFT
@ -22,6 +11,12 @@
#include "clFFT.h"
/**
* OpenCL FFT class based on BaseFFT interface.
* Uses the clFFT library to perform FFTs on AMD GPUs.
* The clFFT library also works on NVIDIA GPUs and other devices that
* support OpenCL.
*/
class OpenCLFFT : public BaseFFT {
private:

View File

@ -7,6 +7,7 @@
#include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h"
/** OpenCL implementation of the GreensFunction calculation for OPAL's Poisson solver. */
class OpenCLGreensFunction : public GreensFunction {
private:
@ -31,7 +32,7 @@ public:
int buildProgram();
/**
Info: calc itegral on device memory (taken from OPAL src code)
Info: calculate integral on device memory (taken from OPAL src code).
Return: success or error code
*/
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
@ -39,20 +40,20 @@ public:
int streamId = -1);
/**
Info: integration of rho2_m field (taken from OPAL src code)
Info: integration of rho2_m field (taken from OPAL src code).
Return: success or error code
*/
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/**
Info: mirror rho field (taken from OPAL src code)
Info: mirror rho field (taken from OPAL src code).
Return: succes or error code
*/
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Info: multiply complex fields already in GPU memory; the result will be put in ptr1.
Return: success or error code
*/
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
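A sketch of the order in which a Poisson solver might drive these calls; the buffer names, grid sizes and overall sequence are assumptions here, and greensIntegral is not shown because its full argument list is truncated in this diff.

// Hypothetical sketch: "greens" is an already constructed OpenCLGreensFunction;
// rho2_m, tmpgreen and grn_m are device buffers prepared beforehand.
void greensExample(OpenCLGreensFunction &greens, void *rho2_m, void *tmpgreen,
                   void *grn_m, int I, int J, int K, int size) {
    greens.buildProgram();                                        // compile the Greens-function kernels once
    // greensIntegral(...) would normally be called first to fill tmpgreen
    greens.integrationGreensFunction(rho2_m, tmpgreen, I, J, K);  // integrate tmpgreen into rho2_m
    greens.mirrorRhoField(rho2_m, I, J, K);                       // mirror the field to the full grid
    // after the forward FFTs, multiply the transformed fields; the result ends up in rho2_m
    greens.multiplyCompelxFields(rho2_m, grn_m, size);
}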

View File

@ -5,6 +5,10 @@
#include <string>
#include <sys/time.h>
/**
* Custom timer class.
* Allows timers to be inserted in the code to get function execution times.
*/
class DKSTimer {
private:
@ -17,38 +21,44 @@ private:
public:
/** Init DKSTimer by seting timer to zero */
/** Init DKSTimer by setting the timer to zero. */
DKSTimer();
~DKSTimer();
/** Init the timer
/**
* Init the timer.
* Set the name for the timer and clear all values.
*/
void init(std::string n);
/** Start the timer.
/**
* Start the timer.
* Get the current time with gettimeofday and save it in timeStart.
*/
void start();
/** Stop the timer
/**
* Stop the timer.
* Get the current time with gettimeofday and save it in timeEnd.
* Calculate the elapsed time as timeEnd - timeStart and add it to timervalue.
*/
void stop();
/** Reset timervalue to zero.
/**
* Reset timervalue to zero.
* Set timervalue, timeStart and timeEnd to zero
*/
void reset();
/** Return elapsed time in seconds.
/**
* Return elapsed time in seconds.
* Return the value of timervalue
*/
double gettime();
/** Print timer.
/**
* Print timer.
* Print the elapsed time of the timer
*/
void print();
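A short usage sketch of the timer; the timer name "fft" is illustrative.

// Hypothetical sketch: time a section of code with DKSTimer.
void timedSection() {
    DKSTimer t;
    t.init("fft");                      // name the timer and zero its value
    t.start();
    // ... code to be timed ...
    t.stop();
    t.print();                          // print the elapsed time
    double seconds = t.gettime();       // or read the accumulated value in seconds
    (void)seconds;
}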