updated documentation
This commit is contained in:
@ -15,6 +15,9 @@
|
||||
|
||||
class DKSBaseMuSR;
|
||||
|
||||
/**
|
||||
* Interface to implement ChiSquareRuntime class for musrfit.
|
||||
*/
|
||||
class ChiSquareRuntime {
|
||||
friend class DKSBaseMuSR;
|
||||
|
||||
@ -63,23 +66,54 @@ public:
|
||||
/** Default constructor */
|
||||
//ChiSquareRuntime();
|
||||
|
||||
/** Default destructor */
|
||||
/** Default destructor. */
|
||||
virtual ~ChiSquareRuntime() { };
|
||||
|
||||
/**
|
||||
* Compile GPU programm generated at runtime.
|
||||
*/
|
||||
virtual int compileProgram(std::string function, bool mlh = false) = 0;
|
||||
|
||||
/**
|
||||
* Launche the compiled chiSquare kernel.
|
||||
*/
|
||||
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result) = 0;
|
||||
|
||||
/**
|
||||
* Write the parameter values to the GPU.
|
||||
*/
|
||||
virtual int writeParams(const double *params, int numparams) = 0;
|
||||
|
||||
/**
|
||||
* Write the function values to the GPU.
|
||||
*/
|
||||
virtual int writeFunc(const double *func, int numfunc) = 0;
|
||||
|
||||
/**
|
||||
* Write map values to the GPU.
|
||||
*/
|
||||
virtual int writeMap(const int *map, int nummap) = 0;
|
||||
|
||||
/**
|
||||
* Allocate temporary memory needed for the chi square calucaltios on the device.
|
||||
*/
|
||||
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
|
||||
|
||||
/**
|
||||
* Free device memory allocated for chi square calculations.
|
||||
*/
|
||||
virtual int freeChiSquare() = 0;
|
||||
|
||||
/**
|
||||
* Check if available device can run the chi square GPU code.
|
||||
*/
|
||||
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
|
||||
|
||||
/** Set N0, tau and bgk values to use for the kernel.
|
||||
/**
|
||||
* Set N0, tau and bgk values to use for the kernel.
|
||||
* If values changes between data sets this needs to be called before
|
||||
* every kernel call. Returns DKS_SUCCESS.
|
||||
*/
|
||||
@ -91,7 +125,8 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Set alpha and beta values to use for the kernel.
|
||||
/**
|
||||
* Set alpha and beta values to use for the kernel.
|
||||
* If values changes between data sets this needs to be called before
|
||||
* every kernel call. Returns DKS_SUCCESS.
|
||||
*/
|
||||
@ -101,8 +136,9 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Set number of blocks and threads.
|
||||
* Used to set parameters obtained from auto-tuning
|
||||
/**
|
||||
* Set number of blocks and threads.
|
||||
* Used to set parameters obtained from auto-tuning
|
||||
*/
|
||||
int setKernelParams(int numBlocks, int blockSize) {
|
||||
int ierr = DKS_ERROR;
|
||||
@ -118,8 +154,9 @@ public:
|
||||
return ierr;
|
||||
}
|
||||
|
||||
/** Get the number of operations in compiled kernel.
|
||||
* Count the number of operation in the ptx file for the compiled program.
|
||||
/**
|
||||
* Get the number of operations in compiled kernel.
|
||||
* Count the number of operation in the ptx file for the compiled program.
|
||||
*/
|
||||
int getOperations(int &oper) {
|
||||
|
||||
|
@ -5,6 +5,9 @@
|
||||
#include <string>
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/**
|
||||
* Interface to impelment particle matter interaction for OPAL.
|
||||
*/
|
||||
class DKSCollimatorPhysics {
|
||||
|
||||
protected:
|
||||
@ -16,28 +19,60 @@ public:
|
||||
|
||||
virtual ~DKSCollimatorPhysics() { }
|
||||
|
||||
/**
|
||||
* Execute collimator physics kernel.
|
||||
*
|
||||
*/
|
||||
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
|
||||
bool enableRutherforScattering = true) = 0;
|
||||
|
||||
/**
|
||||
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
|
||||
* Used only on the MIC side, was not implemented on the GPU.
|
||||
*/
|
||||
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles) = 0;
|
||||
|
||||
/**
|
||||
* Sort particle array on GPU.
|
||||
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
|
||||
* array so these particles are at the end of array
|
||||
*/
|
||||
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
|
||||
|
||||
/**
|
||||
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
|
||||
* Used only on the MIC side, was not implemented on the GPU.
|
||||
*/
|
||||
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles, int &numaddback) = 0;
|
||||
|
||||
/**
|
||||
* BorisPusher push function for integration from OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false, int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* BorisPusher kick function for integration from OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
||||
void *bf_ptr, void *dt_ptr, double charge,
|
||||
double mass, int npart, double c, int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* BorisPusher push function with transformto function form OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
||||
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false,
|
||||
|
@ -6,12 +6,21 @@
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/**
|
||||
* Abstract class defining methods for DKS FFT class.
|
||||
* Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
|
||||
*/
|
||||
class BaseFFT {
|
||||
|
||||
protected:
|
||||
int defaultN[3];
|
||||
int defaultNdim;
|
||||
|
||||
/**
|
||||
* Check if FFT plan is created for the needed dimension and FFT size.
|
||||
* Returns true if the plan has been created and false if no plan for specified dimension
|
||||
* and size exists.
|
||||
*/
|
||||
bool useDefaultPlan(int ndim, int N[3]) {
|
||||
if (ndim != defaultNdim)
|
||||
return false;
|
||||
@ -24,18 +33,57 @@ public:
|
||||
|
||||
virtual ~BaseFFT() { }
|
||||
|
||||
/** Setup FFT - init FFT library used by chosen device. */
|
||||
virtual int setupFFT(int ndim, int N[3]) = 0;
|
||||
|
||||
/** Setup real to complex FFT - init FFT library used by chosen device. */
|
||||
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
|
||||
|
||||
/** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
|
||||
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
|
||||
|
||||
/** Clean up. */
|
||||
virtual int destroyFFT() = 0;
|
||||
|
||||
/**
|
||||
* Exectute C2C FFT.
|
||||
* mem_ptr - memory ptr on the device for complex data.
|
||||
* Performs in place FFT.
|
||||
*/
|
||||
virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
|
||||
int streamId = -1, bool forward = true) = 0;
|
||||
|
||||
/**
|
||||
* Exectute inverse C2C FFT.
|
||||
* mem_ptr - memory ptr on the device for complex data.
|
||||
* Performs in place FFT.
|
||||
*/
|
||||
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* Normalize the FFT or IFFT.
|
||||
* mem_ptr - memory to complex data.
|
||||
*/
|
||||
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* Exectute R2C FFT.
|
||||
* real_ptr - real input data for FFT, comp_ptr - memory on the device where
|
||||
* results for the FFT are stored as complex numbers.
|
||||
*/
|
||||
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||
int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* Exectute C2R FFT.
|
||||
* real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
|
||||
*/
|
||||
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||
int streamId = -1) = 0;
|
||||
|
||||
/**
|
||||
* Normalize CR FFT.
|
||||
*/
|
||||
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||
|
||||
};
|
||||
|
@ -4,24 +4,27 @@
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
/**
|
||||
* Interface to implement Greens function calculations for OPAL.
|
||||
*/
|
||||
class GreensFunction {
|
||||
|
||||
public:
|
||||
|
||||
virtual ~GreensFunction() { }
|
||||
|
||||
/** calc greens integral, as defined in OPAL */
|
||||
/** calc greens integral, as defined in OPAL. */
|
||||
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
||||
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
|
||||
|
||||
/** integration if rho2_m, see OPAL for more details */
|
||||
/** integration if rho2_m, see OPAL for more details. */
|
||||
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
|
||||
int streamId = -1) = 0;
|
||||
|
||||
/** mirror rho2_m field */
|
||||
/** mirror rho2_m field. */
|
||||
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
|
||||
|
||||
/** multiply two complex fields from device memory */
|
||||
/** multiply two complex fields from device memory. */
|
||||
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
|
||||
|
||||
};
|
||||
|
@ -5,17 +5,22 @@
|
||||
|
||||
#define BLOCK_SIZE 128
|
||||
|
||||
/** Struct to hold voxel position for PET image. */
|
||||
struct VoxelPosition {
|
||||
float x;
|
||||
float y;
|
||||
float z;
|
||||
};
|
||||
|
||||
/** Struct that holds pair of detectors that registered an envent. */
|
||||
struct ListEvent {
|
||||
unsigned detA : 16;
|
||||
unsigned detB : 16;
|
||||
};
|
||||
|
||||
/**
|
||||
* Interface to implement PET image reconstruction.
|
||||
*/
|
||||
class ImageReconstruction {
|
||||
|
||||
protected:
|
||||
@ -25,7 +30,8 @@ public:
|
||||
|
||||
virtual ~ImageReconstruction() { }
|
||||
|
||||
/** Caluclate source.
|
||||
/**
|
||||
* Caluclate source.
|
||||
* Places a sphere at each voxel position and calculate the avg value and std value of pixels
|
||||
* that are inside this sphere. All the sphere used have the same diameter.
|
||||
*/
|
||||
@ -33,7 +39,8 @@ public:
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0) = 0;
|
||||
|
||||
/** Calculate background.
|
||||
/**
|
||||
* Calculate background.
|
||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||
* smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
|
||||
@ -42,7 +49,8 @@ public:
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0) = 0;
|
||||
|
||||
/** Caluclate source using differente sources.
|
||||
/**
|
||||
* Caluclate source using differente sources.
|
||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||
* each sphere is given by *diameter array.
|
||||
@ -52,7 +60,7 @@ public:
|
||||
int total_sources, int start = 0) = 0;
|
||||
|
||||
/**
|
||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels.
|
||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
|
||||
* smaller sphere.
|
||||
@ -61,7 +69,8 @@ public:
|
||||
void *avg, void *std, void *diameter, int total_voxels,
|
||||
int total_sources, int start = 0) = 0;
|
||||
|
||||
/** Generate normalization.
|
||||
/**
|
||||
* Generate normalization.
|
||||
* Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
|
||||
* that updates voxel values in the image on the slope between these two detectors.
|
||||
*/
|
||||
@ -69,14 +78,16 @@ public:
|
||||
void *det_position, int total_det) = 0;
|
||||
|
||||
|
||||
/** Calculate forward projection.
|
||||
/**
|
||||
* Calculate forward projection.
|
||||
* For image reconstruction calculates forward projections.
|
||||
* see recon.cpp for details
|
||||
*/
|
||||
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
||||
void *image_position, int num_events) = 0;
|
||||
|
||||
/** Calculate backward projection.
|
||||
/**
|
||||
* Calculate backward projection.
|
||||
* For image reconstruction calculates backward projections.
|
||||
* see recon.cpp for details
|
||||
*/
|
||||
@ -84,29 +95,29 @@ public:
|
||||
void *det_position, void *image_position,
|
||||
int num_events, int num_voxels) = 0;
|
||||
|
||||
/** Set the voxel dimensins on device.
|
||||
*
|
||||
/**
|
||||
*Set the voxel dimensins on device.
|
||||
*/
|
||||
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
|
||||
|
||||
/** Set the image edge variables on the device.
|
||||
*
|
||||
/**
|
||||
* Set the image edge variables on the device.
|
||||
*/
|
||||
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
|
||||
|
||||
/** Set the image edge1 on the device.
|
||||
*
|
||||
/**
|
||||
* Set the image edge1 on the device.
|
||||
*/
|
||||
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
|
||||
|
||||
/** Set the minimum crystan in one ring values on the device.
|
||||
*
|
||||
/**
|
||||
* Set the minimum crystan in one ring values on the device.
|
||||
*/
|
||||
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
|
||||
float min_CrystalDist_InOneRing1) = 0;
|
||||
|
||||
/** Set all other required parameters for reconstruction.
|
||||
*
|
||||
/**
|
||||
* Set all other required parameters for reconstruction.
|
||||
*/
|
||||
virtual int setParams(float matrix_distance_factor, float phantom_diameter,
|
||||
float atten_per_mm, float ring_diameter) = 0;
|
||||
|
@ -18,6 +18,17 @@
|
||||
typedef std::vector<Parameter> Parameters;
|
||||
typedef std::vector<State> States;
|
||||
|
||||
/**
|
||||
* DKS autotuning class, allows to auto-tune the defince function.
|
||||
* Executes the defined function for auto-tuning and searches for optimal parameters to improve
|
||||
* the function execution time. The function that is auto-tuned, parameters and the ranges
|
||||
* need to be set. Includes multiple search methods, that searches the parameter space to finde
|
||||
* the optimal solution.
|
||||
* 1) exaustive search
|
||||
* 2) line search
|
||||
* 3) hill climbimg
|
||||
* 4) simulated annealing
|
||||
*/
|
||||
class DKSAutoTuning {
|
||||
|
||||
private:
|
||||
@ -36,12 +47,13 @@ private:
|
||||
|
||||
int loops_m;
|
||||
|
||||
/** Update parameters from a state */
|
||||
/** Update parameters from a state. */
|
||||
int setParameterValues(States states);
|
||||
|
||||
/** Evaluate the function and set execution time
|
||||
* Returns DKS_ERROR if errors occured during function execution.
|
||||
* Returns DKS_SUCCESS if function executed as planned.
|
||||
/**
|
||||
* Evaluate the function and set execution time
|
||||
* Returns DKS_ERROR if errors occured during function execution.
|
||||
* Returns DKS_SUCCESS if function executed as planned.
|
||||
*/
|
||||
int evaluateFunction(double &value);
|
||||
|
||||
@ -50,12 +62,13 @@ public:
|
||||
/** Constructor */
|
||||
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
|
||||
|
||||
/** Destructor */
|
||||
/** Destructor. */
|
||||
~DKSAutoTuning();
|
||||
|
||||
/** Set function to auto tune.
|
||||
* Caller of setFunction is responsible to bind the correct parameters
|
||||
* to the function with std::bind.
|
||||
/**
|
||||
* Set function to auto tune.
|
||||
* Caller of setFunction is responsible to bind the correct parameters
|
||||
* to the function with std::bind.
|
||||
*/
|
||||
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
|
||||
f_m = f;
|
||||
@ -63,15 +76,21 @@ public:
|
||||
evaluate_time_m = evaluate_time;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set function to auto tune.
|
||||
* Caller of setFunction is responsible to bind the correct parameters
|
||||
* to the function with std::bind.
|
||||
*/
|
||||
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
|
||||
fd_m = f;
|
||||
function_name_m = name;
|
||||
evaluate_time_m = evaluate_time;
|
||||
}
|
||||
|
||||
/** Set parameter for auto tuning.
|
||||
* Provide a pointer to a parameter that will be changed during auto-tuning
|
||||
* and a min-max value for this element
|
||||
/**
|
||||
* Set parameter for auto tuning.
|
||||
* Provide a pointer to a parameter that will be changed during auto-tuning
|
||||
* and a min-max value for this element
|
||||
*/
|
||||
template <typename T1>
|
||||
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
|
||||
@ -85,9 +104,9 @@ public:
|
||||
/** Perform exaustive search evaluating all the parameter configurations */
|
||||
void exaustiveSearch();
|
||||
|
||||
/** Perform auto-tuning.
|
||||
* Perform line-search auto-tuning by variying parameters one at a time and keeping other
|
||||
* parameters constant.
|
||||
/**
|
||||
* Perform line-search auto-tuning by variying parameters one at a time.
|
||||
* After one parameter is auto-tuned the next on is varied
|
||||
*/
|
||||
void lineSearch();
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
/** Tester class for auto-tuning search algorithms. */
|
||||
class DKSAutoTuningTester {
|
||||
|
||||
friend class DKSBaseMuSR;
|
||||
|
@ -1,9 +1,3 @@
|
||||
/** Class to save and load DKS autotunning configs.
|
||||
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
|
||||
* Uses boost xml_parser to read and write the xml file and boost property tree to store
|
||||
* the xml content.
|
||||
*/
|
||||
|
||||
#ifndef DKS_CONFIG
|
||||
#define DKS_CONFIG
|
||||
|
||||
@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
|
||||
const std::string config_dir = "/.config/DKS";
|
||||
const std::string config_file = "/autotuning.xml";
|
||||
|
||||
/** Class to save and load DKS autotunning configs.
|
||||
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
|
||||
* Uses boost xml_parser to read and write the xml file and boost property tree to store
|
||||
* the xml content.
|
||||
* TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
|
||||
* so the auto-tuning has no effect.
|
||||
*/
|
||||
class DKSConfig {
|
||||
|
||||
private:
|
||||
|
@ -9,6 +9,9 @@
|
||||
|
||||
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
|
||||
|
||||
/**
|
||||
* Parameter class allows to change the searchable parameters during the auto-tuning.
|
||||
*/
|
||||
class Parameter {
|
||||
|
||||
private:
|
||||
@ -64,6 +67,10 @@ public:
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* Struct to hold a auto-tuning state.
|
||||
* Holds the current value, min, max and a step to witch a state can change.
|
||||
*/
|
||||
struct State {
|
||||
double value;
|
||||
double min;
|
||||
@ -74,6 +81,12 @@ struct State {
|
||||
typedef std::vector<Parameter> Parameters;
|
||||
typedef std::vector<State> States;
|
||||
|
||||
/**
|
||||
* Used by auto-tuning search algorithms to move between parameter configurations.
|
||||
* Allows to move from one parameter stat to another, get neighboring states,
|
||||
* move to neighboring states and save state information. Print functions are available
|
||||
* for debugging purposes, to follow how algorithm muves between sates.
|
||||
*/
|
||||
class DKSSearchStates {
|
||||
|
||||
private:
|
||||
|
@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: allcate memory and write data (push)
|
||||
Return: pointer to memory object
|
||||
*/
|
||||
/*
|
||||
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
|
||||
|
||||
void * mem_ptr;
|
||||
mem_ptr = cuda_allocateMemory(size, ierr);
|
||||
|
||||
if (ierr == DKS_SUCCESS)
|
||||
ierr = cuda_writeData(mem_ptr, in_data, size);
|
||||
|
||||
return mem_ptr;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Info: read data and free memory (pull)
|
||||
Return: success or error code
|
||||
*/
|
||||
/*
|
||||
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
|
||||
|
||||
ierr = cuda_readData(mem_ptr, out_data, size);
|
||||
if (ierr == DKS_SUCCESS)
|
||||
ierr = cuda_freeMemory(mem_ptr);
|
||||
else
|
||||
return DKS_ERROR;
|
||||
|
||||
|
||||
if (ierr == DKS_SUCCESS)
|
||||
return DKS_SUCCESS;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Info: execute function
|
||||
Return: success or error code
|
||||
*/
|
||||
int CudaBase::cuda_executeFunction() {
|
||||
|
||||
std::cout << "Execute function" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: clean up
|
||||
Return: success or error code
|
||||
*/
|
||||
int CudaBase::cuda_cleanUp() {
|
||||
|
||||
std::cout << "clean up" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
@ -16,6 +16,11 @@
|
||||
|
||||
#define BLOCK_SIZE 128
|
||||
|
||||
/**
|
||||
* CUDA base class handles device setup and basic communication with the device.
|
||||
* Handles devicew setup, memory manegement, data transfers and stream setup for
|
||||
* asynchronous data transfers and kernel executions.
|
||||
*/
|
||||
class CudaBase {
|
||||
|
||||
private:
|
||||
@ -48,17 +53,17 @@ public:
|
||||
/**
|
||||
* Delete curandState.
|
||||
* Delete curandState array on the GPU and free memory.
|
||||
* Return success or error code
|
||||
* Return success or error code
|
||||
*/
|
||||
int cuda_deleteCurandStates();
|
||||
|
||||
/** Create 'size' random numbers on the device and save in mem_ptr array
|
||||
*
|
||||
/**
|
||||
* Create 'size' random numbers on the device and save in mem_ptr array.
|
||||
*/
|
||||
int cuda_createRandomNumbers(void *mem_ptr, int size);
|
||||
|
||||
/** Get a pointer to curand states
|
||||
*
|
||||
/**
|
||||
* Get a pointer to curand states.
|
||||
*/
|
||||
curandState* cuda_getCurandStates();
|
||||
|
||||
@ -75,93 +80,98 @@ public:
|
||||
int cuda_addStream(cudaStream_t tmpStream, int &streamId);
|
||||
|
||||
/**
|
||||
* delete cuda stream
|
||||
* delete cuda stream.
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_deleteStream(int id);
|
||||
|
||||
/**
|
||||
* delete all streams
|
||||
* delete all streams.
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_deleteStreams();
|
||||
|
||||
/**
|
||||
* set stream to use
|
||||
* set stream to use.
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_setStream(int id);
|
||||
|
||||
/**
|
||||
* Info: get stream that is used
|
||||
* Return: return id of curretn stream
|
||||
* get stream that is used.
|
||||
* Return: return id of curretn stream
|
||||
*/
|
||||
int cuda_getStreamId();
|
||||
|
||||
/**
|
||||
* Info: reset to default stream
|
||||
* reset to default stream.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_defaultStream();
|
||||
|
||||
/**
|
||||
* Info: get number of streams
|
||||
* get number of streams.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_numberOfStreams();
|
||||
|
||||
/**
|
||||
* Info: get stream
|
||||
* get stream.
|
||||
* Return: stream
|
||||
*/
|
||||
cudaStream_t cuda_getStream(int id);
|
||||
|
||||
/**
|
||||
* Get default cublass handle
|
||||
* Get default cublass handle.
|
||||
*/
|
||||
cublasHandle_t cuda_getCublas();
|
||||
|
||||
/**
|
||||
* Info: get information on cuda devices
|
||||
* get information on cuda devices.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_getDevices();
|
||||
|
||||
/** Get CUDA device count.
|
||||
* Sets the number of devices on the platform that can use CUDA.
|
||||
* Returns DKS_SUCCESS
|
||||
/**
|
||||
* Get CUDA device count.
|
||||
* Sets the number of devices on the platform that can use CUDA.
|
||||
*/
|
||||
int cuda_getDeviceCount(int &ndev);
|
||||
|
||||
/** Get the name of the device.
|
||||
* QUery the device properties of the used device and set the string device_name
|
||||
/**
|
||||
* Get the name of the device.
|
||||
* QUery the device properties of the used device and set the string device_name
|
||||
*/
|
||||
int cuda_getDeviceName(std::string &device_name);
|
||||
|
||||
/** Set CUDA device to use.
|
||||
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR
|
||||
/**
|
||||
* Set CUDA device to use.
|
||||
* If device passed in is larger than the number of devices use
|
||||
* the default:0 and return DKS_ERROR
|
||||
*/
|
||||
int cuda_setDevice(int device);
|
||||
|
||||
/** Get unique devices
|
||||
* Get array of indeces with the unique CUDA devices available on the paltform
|
||||
/**
|
||||
* Get unique devices.
|
||||
* Get array of indeces with the unique CUDA devices available on the paltform
|
||||
*/
|
||||
int cuda_getUniqueDevices(std::vector<int> &devices);
|
||||
|
||||
/**
|
||||
* Info: init device
|
||||
* Initialize connection to the device.
|
||||
* Only needed when runtime compilation is used.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_setUp();
|
||||
|
||||
/**
|
||||
* Info: allocate memory on cuda device
|
||||
* Allocate memory on cuda device.
|
||||
* Return: pointer to memory object
|
||||
*/
|
||||
void * cuda_allocateMemory(size_t size, int &ierr);
|
||||
|
||||
/**
|
||||
* Info: allocate host memory in pinned memory
|
||||
* Allocate host memory in pinned memory
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -174,8 +184,9 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Zero CUDA memory.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
/**
|
||||
* Zero CUDA memory.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
|
||||
@ -189,8 +200,9 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Zero CUDA memory.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
/**
|
||||
* Zero CUDA memory async.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
|
||||
@ -209,7 +221,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: write data to memory
|
||||
* Write data to memory
|
||||
* Retrun: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -226,7 +238,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: write data assynchonuously
|
||||
* Write data assynchonuously
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -258,7 +270,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: read data from memory
|
||||
* Read data from memory
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -275,7 +287,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: read data async from device memory
|
||||
* Read data async from device memory
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -307,19 +319,19 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: free memory on device
|
||||
* Free memory on device
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_freeMemory(void * mem_ptr);
|
||||
|
||||
/**
|
||||
* Info: free page locked memory on host
|
||||
* Free page locked memory on host
|
||||
* Return: success or erro code
|
||||
*/
|
||||
int cuda_freeHostMemory(void * mem_ptr);
|
||||
|
||||
/**
|
||||
* Info: allcate memory and write data (push)
|
||||
* Allcate memory and write data (push)
|
||||
* Return: pointer to memory object
|
||||
*/
|
||||
template<typename T>
|
||||
@ -335,7 +347,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: read data and free memory (pull)
|
||||
* Read data and free memory (pull)
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
@ -355,19 +367,8 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: execute function
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_executeFunction();
|
||||
|
||||
/**
|
||||
* Info: clean up
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_cleanUp();
|
||||
|
||||
/**
|
||||
* Info: sync cuda device
|
||||
* Sync cuda device.
|
||||
* Waits till all the tasks on the GPU are finished.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_syncDevice() {
|
||||
@ -376,7 +377,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Page-lock host memory
|
||||
* Page-lock host memory.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_hostRegister(T *ptr, int size) {
|
||||
@ -390,7 +391,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Release page locked memory
|
||||
* Release page locked memory.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_hostUnregister(T *ptr) {
|
||||
@ -403,7 +404,7 @@ public:
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: print device memory info (total, used, avail)
|
||||
* Print device memory info (total, used, avail)
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_memInfo() {
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
|
||||
class CudaChiSquare {
|
||||
|
||||
private:
|
||||
|
@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
|
||||
|
||||
const std::string cudaFunctFooter = "}\n";
|
||||
|
||||
/**
|
||||
* CUDA implementation of ChiSquareRuntime class.
|
||||
* Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
|
||||
*/
|
||||
class CudaChiSquareRuntime : public ChiSquareRuntime{
|
||||
|
||||
private:
|
||||
@ -29,65 +33,72 @@ private:
|
||||
|
||||
cublasHandle_t defaultCublasRT;
|
||||
|
||||
/** Setup to init device
|
||||
* Create context and init device for RT compilation
|
||||
/**
|
||||
* Setup to init device.
|
||||
* Create context and init device for RT compilation
|
||||
*/
|
||||
void setUpContext();
|
||||
|
||||
/** Private function to add function to kernel string
|
||||
*
|
||||
/**
|
||||
* Private function to add function to kernel string.
|
||||
*/
|
||||
std::string buildProgram(std::string function);
|
||||
|
||||
public:
|
||||
|
||||
/** Constructor with CudaBase argument
|
||||
*
|
||||
/**
|
||||
* Constructor with CudaBase argument
|
||||
*/
|
||||
CudaChiSquareRuntime(CudaBase *base);
|
||||
|
||||
/** Default constructor init cuda device
|
||||
*
|
||||
/**
|
||||
* Default constructor init cuda device
|
||||
*/
|
||||
CudaChiSquareRuntime();
|
||||
|
||||
/** Default destructor
|
||||
*
|
||||
/**
|
||||
* Default destructor.
|
||||
*/
|
||||
~CudaChiSquareRuntime();
|
||||
|
||||
/** Compile program and save ptx.
|
||||
/**
|
||||
* Compile program and save ptx.
|
||||
* Add function string to the calcFunction kernel and compile the program
|
||||
* Function must be valid C math expression. Parameters can be addressed in
|
||||
* a form par[map[idx]]
|
||||
*/
|
||||
int compileProgram(std::string function, bool mlh = false);
|
||||
|
||||
/** Launch selected kernel
|
||||
/**
|
||||
* Launch selected kernel.
|
||||
* Launched the selected kernel from the compiled code.
|
||||
* Result is put in &result variable
|
||||
* Result is put in &result variable.
|
||||
*/
|
||||
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result);
|
||||
|
||||
/** Write params to device.
|
||||
/**
|
||||
* Write params to device.
|
||||
* Write params from double array to mem_param_m memory on the device.
|
||||
*/
|
||||
int writeParams(const double *params, int numparams);
|
||||
|
||||
/** Write functions to device.
|
||||
/**
|
||||
* Write functions to device.
|
||||
* Write function values from double array to mem_func_m memory on the device.
|
||||
*/
|
||||
int writeFunc(const double *func, int numfunc);
|
||||
|
||||
/** Write maps to device.
|
||||
/**
|
||||
* Write maps to device.
|
||||
* Write map values from int array to mem_map_m memory on the device.
|
||||
*/
|
||||
int writeMap(const int *map, int nummap);
|
||||
|
||||
/** Allocate temporary memory needed for chi square.
|
||||
/**
|
||||
* Allocate temporary memory needed for chi square.
|
||||
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
|
||||
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
||||
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
||||
@ -96,14 +107,16 @@ public:
|
||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||
|
||||
|
||||
/** Free temporary memory allocated for chi square.
|
||||
/**
|
||||
* Free temporary memory allocated for chi square.
|
||||
* Frees the chisq temporary memory and memory for params, functions and maps
|
||||
*/
|
||||
int freeChiSquare();
|
||||
|
||||
/** Check if CUDA device is able to run the chi square kernel.
|
||||
* Redundant - all new CUDA devices that support RT compilation will also support
|
||||
* double precision, there are no other requirements to run chi square on GPU
|
||||
/**
|
||||
* Check if CUDA device is able to run the chi square kernel.
|
||||
* Redundant - all new CUDA devices that support RT compilation will also support
|
||||
* double precision, there are no other requirements to run chi square on GPU
|
||||
*/
|
||||
int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
|
||||
return DKS_SUCCESS;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "CudaCollimatorPhysics.cuh"
|
||||
|
||||
//constants used in OPAL
|
||||
//#define M_P 0.93827231e+00
|
||||
#define M_P 0.93827204e+00
|
||||
#define C 299792458.0
|
||||
@ -11,6 +12,7 @@
|
||||
#define Z_P 1
|
||||
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
|
||||
|
||||
//parameter array indexes
|
||||
#define POSITION 0
|
||||
#define ZSIZE 1
|
||||
#define RHO_M 2
|
||||
@ -28,12 +30,18 @@
|
||||
#define BLOCK_SIZE 128
|
||||
#define NUMPAR 13
|
||||
|
||||
/**
|
||||
* CUDA device function for calculating dot product.
|
||||
*/
|
||||
__device__ inline double dot(double3 &d1, double3 &d2) {
|
||||
|
||||
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA devce function to calculate cross product.
|
||||
*/
|
||||
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
|
||||
double3 tmp;
|
||||
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
|
||||
@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA device function to calculate arbitrary rotation.
|
||||
*/
|
||||
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
|
||||
double c=cos(Theta);
|
||||
double s=sin(Theta);
|
||||
@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA device function to check if particle is still in material.
|
||||
* z - particle position, par - parameter array. Particle is considered inside the
|
||||
* material if z is > material starting position and z < material starting position - mat size.
|
||||
*/
|
||||
__device__ inline bool checkHit(double &z, double *par) {
|
||||
|
||||
/* check if particle is in the degrader material */
|
||||
@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* CUDA device function to calculate energyLoss for one particle.
|
||||
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
|
||||
* algorith are available in OPAL user guide.
|
||||
*/
|
||||
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
|
||||
{
|
||||
|
||||
@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA device function for rotation in 2 dimensions.
|
||||
* For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),
|
||||
* "Passage of particles through matter"
|
||||
*/
|
||||
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
|
||||
double &normP, double &thetacou, double &deltas, int coord,
|
||||
double *par)
|
||||
@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
|
||||
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA device function to calculate Coulomb scattering for one particle.
|
||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
||||
* For details on the algorithm see OPAL user guide.
|
||||
*/
|
||||
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
|
||||
bool enableRutherfordScattering)
|
||||
{
|
||||
@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* CUDA kernel that performs one step in particle movement trough mater.
|
||||
* One thread is launched for each particle in the simulation. The kernel checks if the particle
|
||||
* is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
|
||||
* particles that are exiting the material.
|
||||
* @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
|
||||
* @param[in] *par array of material properties, always constant size - 13
|
||||
* @param[in] *state array holding cuRand states to preserve states between kernel launches
|
||||
* @param[in] numparticles number of particles in the simulation
|
||||
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
|
||||
int numparticles, bool enableRutherfordScattering)
|
||||
@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
||||
volatile int tid = threadIdx.x;
|
||||
volatile int idx = blockIdx.x * blockDim.x + tid;
|
||||
|
||||
//transfer params to shared memory
|
||||
//transfer params and particle positions to shared memory
|
||||
//R is kept in shared memory in order to reduce register pressure for the kernel
|
||||
extern __shared__ double smem[];
|
||||
double *p = (double*)smem;
|
||||
double3 *R = (double3*)&smem[NUMPAR];
|
||||
|
||||
curandState s;
|
||||
curandState s; //each tread gets its own cuRand state for random number generation
|
||||
double3 P;
|
||||
|
||||
//load parameters to shared memory
|
||||
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
|
||||
p[tt] = par[tt];
|
||||
|
||||
//sync threads to ensure that parameters are finished loading
|
||||
__syncthreads();
|
||||
|
||||
//there might be some empty threads that do no work
|
||||
if (idx < numparticles) {
|
||||
s = state[idx];
|
||||
R[tid] = data[idx].Rincol;
|
||||
P = data[idx].Pincol;
|
||||
s = state[idx]; //load cuRand state to local memory
|
||||
R[tid] = data[idx].Rincol; //load position to shared memory
|
||||
P = data[idx].Pincol; //load momentum to local memory
|
||||
|
||||
bool pdead = false;
|
||||
volatile double sq = sqrt(1.0 + dot(P, P));
|
||||
|
||||
double Eng;
|
||||
|
||||
//check if particle is still in the material
|
||||
if (checkHit(R[tid].z, p)) {
|
||||
|
||||
//calculate enery loss
|
||||
Eng = (sq - 1) * M_P;
|
||||
energyLoss(Eng, pdead, s, p);
|
||||
|
||||
//check if particle is not dead
|
||||
if (!pdead) {
|
||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||
sq = sqrt(dot(P, P));
|
||||
|
||||
//caluclate Coulomb scattering
|
||||
P.x = P.x * ptot / sq;
|
||||
P.y = P.y * ptot / sq;
|
||||
P.z = P.z * ptot / sq;
|
||||
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
|
||||
|
||||
//update particle momentum
|
||||
data[idx].Pincol = P;
|
||||
} else {
|
||||
//mark particle as dead (-1)
|
||||
data[idx].label = -1;
|
||||
}
|
||||
|
||||
//update cuRand state
|
||||
state[idx] = s;
|
||||
} else {
|
||||
|
||||
//particle exits material - drift and mark as exiting (-2)
|
||||
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
|
||||
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
|
||||
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
|
||||
@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
||||
|
||||
}
|
||||
|
||||
//update particle position
|
||||
data[idx].Rincol = R[tid];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
||||
curandState *state, int numparticles,
|
||||
bool enableRutherfordScattering)
|
||||
/**
|
||||
* CUDA kernel that performs one step in particle movement trough mater using SoA particles.
|
||||
* Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
|
||||
* Deprecated - GPU version does not use SoA.
|
||||
* @param[in] data structure of arrays containing particle data
|
||||
* @param[in] *par array of material properties, always constant size - 13
|
||||
* @param[in] *state array holding cuRand states to preserve states between kernel launches
|
||||
* @param[in] numparticles number of particles in the simulation
|
||||
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
|
||||
*/
|
||||
__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
|
||||
curandState *state, int numparticles,
|
||||
bool enableRutherfordScattering)
|
||||
{
|
||||
|
||||
//get global id and thread id
|
||||
@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Device function to swich off unitless positions.
|
||||
*/
|
||||
inline __device__ void unitlessOff(double3 &a, const double &c) {
|
||||
a.x *= c;
|
||||
a.y *= c;
|
||||
a.z *= c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Device function to swich on unitless positions.
|
||||
*/
|
||||
inline __device__ void unitlessOn(double3 &a, const double &c) {
|
||||
a.x /= c;
|
||||
a.y /= c;
|
||||
a.z /= c;
|
||||
}
|
||||
|
||||
//swithch to unitless positions with dtc
|
||||
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
|
||||
|
||||
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (idx < npart) {
|
||||
double3 R = gR[idx];
|
||||
double3 X = gX[idx];
|
||||
|
||||
unitlessOn(R, dtc);
|
||||
unitlessOn(X, dtc);
|
||||
|
||||
gR[idx] = R;
|
||||
gX[idx] = X;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//swithc to unitless positions with dt*c
|
||||
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
|
||||
|
||||
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (idx < npart) {
|
||||
double3 R = gR[idx];
|
||||
double3 X = gX[idx];
|
||||
double dt = gdt[idx];
|
||||
|
||||
unitlessOff(R, dt*c);
|
||||
unitlessOff(X, dt*c);
|
||||
|
||||
gR[idx] = R;
|
||||
gX[idx] = X;
|
||||
}
|
||||
}
|
||||
|
||||
//swithc off unitless positions with dtc
|
||||
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
|
||||
|
||||
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (idx < npart) {
|
||||
double3 R = gR[idx];
|
||||
double3 X = gX[idx];
|
||||
|
||||
unitlessOff(R, dtc);
|
||||
unitlessOff(X, dtc);
|
||||
|
||||
gR[idx] = R;
|
||||
gX[idx] = X;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//switch off unitelss positions with dt*c
|
||||
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
|
||||
|
||||
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (idx < npart) {
|
||||
double3 R = gR[idx];
|
||||
double3 X = gX[idx];
|
||||
double dt = gdt[idx];
|
||||
|
||||
unitlessOff(R, dt*c);
|
||||
unitlessOff(X, dt*c);
|
||||
|
||||
gR[idx] = R;
|
||||
gX[idx] = X;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* CUDA kernel to perform particle push.
|
||||
* @param[in] *gR array of particle positions
|
||||
* @param[in] *gP array of particle momentums
|
||||
* @param[in] npart number of particles
|
||||
* @param[in] dtc dt*c
|
||||
*/
|
||||
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
||||
|
||||
//get global id and thread id
|
||||
@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* CUDA kernel to perform particle push.
|
||||
* @param[in] *gR array of particle positions
|
||||
* @param[in] *gP array of particle momentums
|
||||
* @param[in] *gdt array of time steps for each particle
|
||||
* @param[in] npart number of particles
|
||||
* @param[in] c speed of light
|
||||
*/
|
||||
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
|
||||
|
||||
//get global id and thread id
|
||||
@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA kernel to perform particle kick.
|
||||
* @param[in] *gR array of particle positions
|
||||
* @param[in] *gP array of particle momentums
|
||||
* @param[in] *gEf
|
||||
* @param[in] *gBf
|
||||
* @param[in] *gdt array of time steps for each particle
|
||||
* @param[in] npart number of particles
|
||||
* @param[in] c speed of light
|
||||
*/
|
||||
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
|
||||
double3 *gBf, double *gdt, double charge,
|
||||
double mass, int npart, double c)
|
||||
@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
|
||||
|
||||
}
|
||||
|
||||
struct compare_particle
|
||||
{
|
||||
int threshold;
|
||||
|
||||
compare_particle() {
|
||||
threshold = 0;
|
||||
}
|
||||
|
||||
void set_threshold(int t) {
|
||||
threshold = t;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART p1, CUDA_PART p2) {
|
||||
return p1.label > p2.label;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART p1) {
|
||||
return p1.label < threshold;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct compare_particle_small
|
||||
{
|
||||
int threshold;
|
||||
|
||||
compare_particle_small() {
|
||||
threshold = 0;
|
||||
}
|
||||
|
||||
void set_threshold(int t) {
|
||||
threshold = t;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
|
||||
return p1.label > p2.label;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART_SMALL p1) {
|
||||
return p1.label < threshold;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct less_then
|
||||
{
|
||||
__host__ __device__
|
||||
bool operator()(int x)
|
||||
{
|
||||
return x < 0;
|
||||
}
|
||||
};
|
||||
|
||||
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
||||
bool enableRutherfordScattering)
|
||||
{
|
||||
|
@ -20,7 +20,8 @@
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/**
|
||||
* Structure for storing particle on GPU
|
||||
* Structure for storing particle on GPU or MIC as AoS.
|
||||
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
|
||||
*/
|
||||
typedef struct __align__(16) {
|
||||
int label;
|
||||
@ -37,7 +38,10 @@ typedef struct __align__(16) {
|
||||
} CUDA_PART;
|
||||
|
||||
/**
|
||||
* Structure for storing particle on GPU
|
||||
* Structure for storing particle on GPU as AoS
|
||||
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
|
||||
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
|
||||
* on the host side.
|
||||
*/
|
||||
typedef struct {
|
||||
int label;
|
||||
@ -47,7 +51,8 @@ typedef struct {
|
||||
} CUDA_PART_SMALL;
|
||||
|
||||
/**
|
||||
* Structure for storing particle on GPU
|
||||
* Structure for storing particle on GPU as SoA.
|
||||
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
|
||||
*/
|
||||
typedef struct {
|
||||
int *label;
|
||||
@ -65,6 +70,9 @@ typedef struct {
|
||||
|
||||
/**
|
||||
* Structure for storing particle on GPU
|
||||
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
|
||||
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
|
||||
* on the host side.
|
||||
*/
|
||||
typedef struct {
|
||||
int *label;
|
||||
@ -73,11 +81,39 @@ typedef struct {
|
||||
double3 *Pincol;
|
||||
} CUDA_PART2_SMALL;
|
||||
|
||||
/** CudaCollimatorPhysics class.
|
||||
* Contains kerenls that execute CollimatorPhysics functions form OPAL.
|
||||
* For detailed documentation on CollimatorPhysics functions see OPAL documentation
|
||||
/**
|
||||
* Operator used in thrust sort to compare particles by label.
|
||||
* Used to move dead particles to the end of array, since they have label -1 or -2.
|
||||
*/
|
||||
class CudaCollimatorPhysics : public DKSCollimatorPhysics{
|
||||
struct compare_particle_small
|
||||
{
|
||||
int threshold;
|
||||
|
||||
compare_particle_small() {
|
||||
threshold = 0;
|
||||
}
|
||||
|
||||
void set_threshold(int t) {
|
||||
threshold = t;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
|
||||
return p1.label > p2.label;
|
||||
}
|
||||
|
||||
__host__ __device__
|
||||
bool operator()(CUDA_PART_SMALL p1) {
|
||||
return p1.label < threshold;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
||||
* Contains kerenls that execute CollimatorPhysics functions form OPAL.
|
||||
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
|
||||
*/
|
||||
class CudaCollimatorPhysics : public DKSCollimatorPhysics {
|
||||
|
||||
private:
|
||||
|
||||
@ -86,32 +122,44 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
/** Constructor with CudaBase argument
|
||||
*
|
||||
/**
|
||||
* Constructor with CudaBase as argument.
|
||||
* Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
|
||||
*/
|
||||
CudaCollimatorPhysics(CudaBase *base) {
|
||||
m_base = base;
|
||||
base_create = false;
|
||||
}
|
||||
|
||||
/** Constructor - empty. */
|
||||
/**
|
||||
* Empty constructor.
|
||||
* Create a new instance of CudaCollimatorPhysics with its own CudaBase.
|
||||
*/
|
||||
CudaCollimatorPhysics() {
|
||||
m_base = new CudaBase();
|
||||
base_create = true;
|
||||
}
|
||||
|
||||
/** Destructor - empty */
|
||||
/**
|
||||
* Destructor.
|
||||
* Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
|
||||
*/
|
||||
~CudaCollimatorPhysics() {
|
||||
if (base_create)
|
||||
delete m_base;
|
||||
};
|
||||
|
||||
/** Execute collimator physics kernel.
|
||||
/**
|
||||
* Execute collimator physics kernel.
|
||||
*
|
||||
*/
|
||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||
int numpartices, bool enableRutherforScattering = true);
|
||||
|
||||
/**
|
||||
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
|
||||
* Used only on the MIC side, was not implemented on the GPU.
|
||||
*/
|
||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
@ -120,12 +168,17 @@ public:
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/** Sort particle array on GPU.
|
||||
/**
|
||||
* Sort particle array on GPU.
|
||||
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
|
||||
* array so these particles are at the end of array
|
||||
*/
|
||||
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
||||
|
||||
/**
|
||||
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
|
||||
* Used only on the MIC side, was not implemented on the GPU.
|
||||
*/
|
||||
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
@ -134,18 +187,25 @@ public:
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/** BorisPusher push function for integration from OPAL.
|
||||
/**
|
||||
* BorisPusher push function for integration from OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false, int streamId = -1);
|
||||
|
||||
/**
|
||||
* BorisPusher kick function for integration from OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
||||
void *bf_ptr, void *dt_ptr, double charge, double mass,
|
||||
int npart, double c, int streamId = -1);
|
||||
|
||||
/** BorisPusher push function with transformto function form OPAL
|
||||
/**
|
||||
* BorisPusher push function with transformto function form OPAL.
|
||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||
* For more details see ParallelTTracler docomentation in opal
|
||||
*/
|
||||
|
@ -10,6 +10,10 @@
|
||||
#include "../Algorithms/FFT.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/**
|
||||
* Cuda FFT class based on BaseFFT interface.
|
||||
* Uses cuFFT library to perform FFTs on nvidias GPUs.
|
||||
*/
|
||||
class CudaFFT : public BaseFFT {
|
||||
|
||||
private:
|
||||
@ -34,7 +38,7 @@ public:
|
||||
~CudaFFT();
|
||||
|
||||
/**
|
||||
* Info: init cufftPlans witch can be reused for all FFTs of the same size and type
|
||||
* Init cufftPlans witch can be reused for all FFTs of the same size and type
|
||||
* Return: success or error code
|
||||
*/
|
||||
int setupFFT(int ndim, int N[3]);
|
||||
@ -42,45 +46,21 @@ public:
|
||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||
|
||||
/**
|
||||
* Info: destroy default FFT plans
|
||||
* Destroy default FFT plans
|
||||
* Return: success or error code
|
||||
*/
|
||||
int destroyFFT();
|
||||
|
||||
/*
|
||||
Info: execute complex to complex double precision fft using cufft library
|
||||
Return: success or error code
|
||||
*/
|
||||
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
|
||||
|
||||
/*
|
||||
Info: execute ifft
|
||||
Return: success or error code
|
||||
*/
|
||||
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: execute normalize using cuda kernel for complex to complex iFFT
|
||||
Return: success or error code
|
||||
*/
|
||||
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: execute real to complex double precision FFT
|
||||
Return: success or error code
|
||||
*/
|
||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: exectue complex to real double precision FFT
|
||||
Return: success or error code
|
||||
*/
|
||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: execute normalize for complex to real iFFT
|
||||
Return: success or error code
|
||||
*/
|
||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
};
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "../Algorithms/GreensFunction.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
||||
class CudaGreensFunction : public GreensFunction{
|
||||
|
||||
private:
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "../Algorithms/ImageReconstruction.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/** CUDA implementation of ImageReconstruction interface. */
|
||||
class CudaImageReconstruction : public ImageReconstruction {
|
||||
|
||||
private:
|
||||
|
@ -1,11 +1,3 @@
|
||||
/** DKSBase class.
|
||||
* DKSBase.h
|
||||
* Author: Uldis Locans
|
||||
* Date: 15.09.2014
|
||||
* Base class of Dynamic Kernel Scheduler that handles the function calls
|
||||
* from host application to DKS
|
||||
*/
|
||||
|
||||
#ifndef H_DKS_BASE
|
||||
#define H_DKS_BASE
|
||||
|
||||
@ -41,7 +33,12 @@
|
||||
|
||||
#include "AutoTuning/DKSConfig.h"
|
||||
|
||||
/** DKSBase class for handling function calls to DKS library */
|
||||
/**
|
||||
* API for handling communication function calls to DKS library.
|
||||
* DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
|
||||
* memory manegement, data transfer and other basic communication functions between
|
||||
* the host and device.
|
||||
*/
|
||||
class DKSBase {
|
||||
|
||||
private:
|
||||
@ -74,7 +71,7 @@ protected:
|
||||
DKSConfig dksconfig;
|
||||
|
||||
/**
|
||||
* Check if current API is set to OpenCL
|
||||
* Check if current API is set to OpenCL.
|
||||
* Return true/false wether current api is opencl
|
||||
*/
|
||||
bool apiOpenCL();
|
||||
@ -91,11 +88,11 @@ protected:
|
||||
*/
|
||||
bool apiOpenMP();
|
||||
|
||||
/** Check if device is GPU */
|
||||
/** Check if device is GPU. */
|
||||
bool deviceGPU();
|
||||
/** Check if device is CPU */
|
||||
/** Check if device is CPU. */
|
||||
bool deviceCPU();
|
||||
/** Check if device is MIC */
|
||||
/** Check if device is MIC. */
|
||||
bool deviceMIC();
|
||||
|
||||
/**
|
||||
|
@ -20,6 +20,11 @@
|
||||
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* API to handle musrfit calls to DKS library.
|
||||
* Using ChiSquareRuntime interface allows to call chi square functions on the
|
||||
* GPU or CPU using CUDA or OpenCL.
|
||||
*/
|
||||
class DKSBaseMuSR : public DKSFFT {
|
||||
|
||||
private:
|
||||
|
@ -24,6 +24,10 @@
|
||||
#include "MIC/MICFFT.h"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* API to handel calls to DKSFFT.
|
||||
* Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
|
||||
*/
|
||||
class DKSFFT : public DKSBase {
|
||||
|
||||
private:
|
||||
|
@ -10,6 +10,9 @@
|
||||
#include "CUDA/CudaImageReconstruction.cuh"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* API to handle PET image reconstruction calls.
|
||||
*/
|
||||
class DKSImageRecon : public DKSBase {
|
||||
|
||||
private:
|
||||
@ -22,87 +25,88 @@ public:
|
||||
|
||||
~DKSImageRecon();
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
/**
|
||||
* Image reconstruction analaysis calculate source.
|
||||
*/
|
||||
int callCalculateSource(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
/**
|
||||
* Image reconstruction analaysis calculate source.
|
||||
*/
|
||||
int callCalculateBackground(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
/**
|
||||
* Image reconstruction analaysis calculate source.
|
||||
*/
|
||||
int callCalculateSources(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, void *diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
/**
|
||||
* Image reconstruction analaysis calculate source.
|
||||
*/
|
||||
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, void *diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction - generate normalization.
|
||||
*
|
||||
/**
|
||||
* Image reconstruction - generate normalization.
|
||||
*/
|
||||
int callGenerateNormalization(void *recon, void *image_position,
|
||||
void *det_position, int total_det);
|
||||
|
||||
/** Image reconstruction - forward correction.
|
||||
*
|
||||
/**
|
||||
* Image reconstruction - forward correction.
|
||||
*/
|
||||
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
||||
void *image_position, int num_events);
|
||||
|
||||
/** Image reconstruction - backward projection.
|
||||
*
|
||||
/**
|
||||
* Image reconstruction - backward projection.
|
||||
*/
|
||||
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
|
||||
void *det_position, void *image_position,
|
||||
int num_events, int num_voxels);
|
||||
|
||||
/** Set the voxel dimensins on device.
|
||||
/**
|
||||
* Set the voxel dimensins on device.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
|
||||
|
||||
/** Set the image edge.
|
||||
/**
|
||||
* Set the image edge.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setEdge(float x_edge, float y_edge, float z_edge);
|
||||
|
||||
/** Set the image edge1.
|
||||
/**
|
||||
* Set the image edge1.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
|
||||
|
||||
/** Set the minimum crystan in one ring values.
|
||||
/**
|
||||
* Set the minimum crystal in one ring values.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
|
||||
|
||||
/** Set all other required parameters for reconstruction.
|
||||
/**
|
||||
* Set all other required parameters for reconstruction.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
|
@ -33,6 +33,11 @@
|
||||
#include "MIC/MICCollimatorPhysics.h"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* API to handle OPAL calls to DKS library.
|
||||
* Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
|
||||
* functions.
|
||||
*/
|
||||
class DKSOPAL : public DKSFFT {
|
||||
|
||||
private:
|
||||
|
@ -26,6 +26,9 @@
|
||||
|
||||
#define MIC_WIDTH 128
|
||||
|
||||
/** MIC Base class handles device setup and basic communication with the device.
|
||||
* Handles devicew setup, memory manegement and data transfers.
|
||||
*/
|
||||
class MICBase {
|
||||
|
||||
private:
|
||||
@ -45,58 +48,60 @@ public:
|
||||
|
||||
int m_device_id;
|
||||
|
||||
/* constructor */
|
||||
/** constructor */
|
||||
MICBase();
|
||||
|
||||
/* destructor */
|
||||
/** destructor */
|
||||
~MICBase();
|
||||
|
||||
/*
|
||||
Info: create MKL rand streams for each thread
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Create MKL rand streams for each thread
|
||||
* Return: success or error code
|
||||
*/
|
||||
int mic_createRandStreams(int size);
|
||||
|
||||
/*
|
||||
Info: delete MKL rand streams
|
||||
Return: succes or error code
|
||||
*/
|
||||
/**
|
||||
* Delete MKL rand streams
|
||||
* Return: succes or error code
|
||||
*/
|
||||
int mic_deleteRandStreams();
|
||||
|
||||
/*
|
||||
Info: create a new signal for the mic
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Create a new signal for the mic.
|
||||
* Signals can be used for assynchronous data transfers.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int mic_createStream(int & streamId);
|
||||
|
||||
/*
|
||||
Info: get the signal from the vector
|
||||
Return: mic signal
|
||||
/**
|
||||
* Info: get the signal from the vector.
|
||||
* Return: mic signal
|
||||
*/
|
||||
int& mic_getStream(int id);
|
||||
|
||||
/*
|
||||
Info: delete streams
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Info: delete streams.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int mic_deleteStreams();
|
||||
|
||||
/*
|
||||
Info: set device id
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Info: set device id.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int mic_setDeviceId(int id);
|
||||
|
||||
/*
|
||||
Info: get mic devices
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Info: get mic devices.
|
||||
* Prints information about mic devices.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int mic_getDevices();
|
||||
|
||||
/*
|
||||
Info: allocate memory on MIC device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Allocate memory on MIC device.
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
void * mic_allocateMemory(int size) {
|
||||
|
||||
@ -109,10 +114,10 @@ public:
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: transfer data to device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Transfer data to device.
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
@ -123,10 +128,10 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: write data to device, non-blocking
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Write data to device, non-blocking.
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
|
||||
{
|
||||
@ -139,10 +144,10 @@ public:
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Info: read data from device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Read data from device
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
@ -154,10 +159,10 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: read data from device waiting for signal
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Read data from device waiting for signal
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_readDataAsync(const void * data_ptr, void * result, int size,
|
||||
int streamId = -1, int offset = 0) {
|
||||
@ -172,10 +177,10 @@ public:
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Info: wait till all the signals are complete
|
||||
Return siccess or error code
|
||||
*/
|
||||
/**
|
||||
* Wait till all the signals are complete
|
||||
* Return siccess or error code
|
||||
*/
|
||||
int mic_syncDevice() {
|
||||
|
||||
//empty offload to wait for all the signals to finish and launch a new empy signal
|
||||
@ -193,10 +198,10 @@ public:
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Info: free memory on device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Free memory on device
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_freeMemory(void * data_ptr, int size) {
|
||||
|
||||
@ -210,10 +215,10 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: allocate memory and write data to device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Allocate memory and write data to device
|
||||
* Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
void * mic_pushData(const void * data, int size) {
|
||||
T* tmp_ptr = new T[size];
|
||||
@ -227,10 +232,10 @@ public:
|
||||
return tmp_ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: read data and free memory on device
|
||||
Return: success or erro code
|
||||
*/
|
||||
/**
|
||||
* Read data and free memory on device
|
||||
* Return: success or erro code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_pullData(void * data_ptr, void * result, int size) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
|
@ -14,6 +14,9 @@
|
||||
#include <offload.h>
|
||||
#include "MICBase.h"
|
||||
|
||||
/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices.
|
||||
* Not complete and untested because of the poor performance of first MIC devices.
|
||||
*/
|
||||
class MICChiSquare {
|
||||
|
||||
MICBase *m_micbase;
|
||||
|
@ -22,22 +22,34 @@
|
||||
#define I_M 10
|
||||
#define DT_M 11
|
||||
|
||||
/**
|
||||
* MIC device function for calculating dot product.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
double dot(mic_double3 d1, mic_double3 d2) {
|
||||
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
||||
}
|
||||
|
||||
/**
|
||||
* MIC device function for calculating dot product.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
double dot(double dx, double dy, double dz) {
|
||||
return (dx * dx + dy * dy + dz * dz);
|
||||
}
|
||||
|
||||
/**
|
||||
* MIC device function to check if particle is still in material.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
bool checkHit(double &z, double *par) {
|
||||
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* MIC device function to calculate arbitrary rotation.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
void Rot(double &px, double &pz, double &x, double &z, double xplane,
|
||||
double normP, double thetacou, double deltas, int coord)
|
||||
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
|
||||
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* MIC device function to calculate Coulomb scattering for one particle.
|
||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
||||
* Uses AoS to store particle positions and momentum, paralelized using OpenMP.
|
||||
* For details on the algorithm see OPAL user guide.
|
||||
* Deprecated on favor of SoA data layout.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
|
||||
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
||||
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* MIC device function to calculate Coulomb scattering for one particle.
|
||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
||||
* Uses SoA to store particle positions and momentum, paralelized using OpenMP.
|
||||
* For details on the algorithm see OPAL user guide.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
|
||||
void coulombScat(double *rx, double *ry, double *rz,
|
||||
double *px, double *py, double *pz, int *label,
|
||||
double *par, VSLStreamStatePtr &stream, int ii, int size)
|
||||
{
|
||||
|
||||
//arrays for temporary storage, each core proceses MIC_WIDTH particles
|
||||
double normP[MIC_WIDTH] __attribute__((aligned(64)));
|
||||
double deltas[MIC_WIDTH] __attribute__((aligned(64)));
|
||||
double theta0[MIC_WIDTH] __attribute__((aligned(64)));
|
||||
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
|
||||
double z2[MIC_WIDTH] __attribute__((aligned(64)));
|
||||
double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
|
||||
|
||||
//simd instruction tells the compiler its safe to vectorize the loop
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (int i = ii; i < ii + MIC_WIDTH; i++) {
|
||||
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
|
||||
}
|
||||
}
|
||||
|
||||
//vectorize the loop
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (int i = ii; i < ii + size; i++) {
|
||||
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//generate array of random numbers
|
||||
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
|
||||
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
|
||||
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* MIC device function to calculate energyLoss for one particle.
|
||||
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
|
||||
* algorith are available in OPAL user guide.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
|
||||
|
||||
@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
|
||||
pdead = 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* MIC device function to calculate energyLoss for one particle.
|
||||
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
|
||||
* algorith are available in OPAL user guide.
|
||||
*/
|
||||
__declspec(target(mic))
|
||||
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
||||
|
||||
@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
|
||||
double *par = (double*) par_ptr;
|
||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
||||
|
||||
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
|
||||
the memory allocation and data trasnfer need to be handled before */
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
|
||||
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
|
||||
|
||||
//for loop trough particles if not checkhit set label to -2 and update R.x
|
||||
|
||||
#pragma omp for simd
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
if ( !checkHit(data[i].Rincol.z, par) ) {
|
||||
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
||||
{
|
||||
|
||||
|
||||
|
||||
//cast device memory pointers to appropriate types
|
||||
int *label = (int*)label_ptr;
|
||||
unsigned *localID = (unsigned*)localID_ptr;
|
||||
double *rx = (double*)rx_ptr;
|
||||
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
||||
|
||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
||||
|
||||
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
|
||||
the memory allocation and data trasnfer need to be handled before */
|
||||
#pragma offload target (mic:0) \
|
||||
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
|
@ -26,6 +26,12 @@ typedef struct {
|
||||
} MIC_PART_SMALL;
|
||||
|
||||
|
||||
/**
|
||||
* MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
||||
* Implementes OPALs collimator physics class for particle matter interactions using OpenMP
|
||||
* and offload mode targetomg Intel Xeon Phi processors.
|
||||
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
|
||||
*/
|
||||
class MICCollimatorPhysics : public DKSCollimatorPhysics {
|
||||
|
||||
private:
|
||||
|
@ -10,6 +10,10 @@
|
||||
#include "../Algorithms/FFT.h"
|
||||
#include "MICBase.h"
|
||||
|
||||
/**
|
||||
* MIC FFT based on BaseFFT interface.
|
||||
* uses MKL library to offload FFT on Intel Xeon Phi devices.
|
||||
*/
|
||||
class MICFFT : public BaseFFT {
|
||||
|
||||
private:
|
||||
|
@ -15,6 +15,7 @@
|
||||
#define DKS_SUCCESS 0
|
||||
#define DKS_ERROR 1
|
||||
|
||||
/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
||||
class MICGreensFunction : public GreensFunction {
|
||||
|
||||
private:
|
||||
|
@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge sort implementation for intel MIC.
|
||||
* Paralellized over all the MIC cores using OpenMP tasks.
|
||||
*/
|
||||
template <typename T>
|
||||
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
||||
|
||||
@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Quicksort algorithm, developed for use on Intel MIC devices.
|
||||
*/
|
||||
template <typename T>
|
||||
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
||||
|
||||
@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Insertion sort of @p list, developed for use on Intel MIC.
|
||||
* Used by quick_sort to sort small lists.
|
||||
*/
|
||||
template <typename T>
|
||||
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
||||
|
||||
|
@ -1,16 +1,3 @@
|
||||
/*
|
||||
|
||||
Name: OpenCLBase
|
||||
|
||||
Author: Uldis Locans
|
||||
|
||||
Info: OpenCL base class to handle all the common details associated
|
||||
with kernel launch on OpenCL device
|
||||
|
||||
Date: 2014.09.18
|
||||
|
||||
*/
|
||||
|
||||
#ifndef H_OPENCL_BASE
|
||||
#define H_OPENCL_BASE
|
||||
|
||||
@ -32,7 +19,7 @@
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/* struct for random number state */
|
||||
/** struct for random number state. */
|
||||
typedef struct {
|
||||
double s10;
|
||||
double s11;
|
||||
@ -44,168 +31,194 @@ typedef struct {
|
||||
bool gen;
|
||||
} RNDState;
|
||||
|
||||
/**
|
||||
* OpenCL base class to handle device setup and basic communication wiht the device.
|
||||
* Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
|
||||
* The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
|
||||
* class contains methods to read the kernel files, compile the kernel codes and launch kernels
|
||||
* from the compiled codes. Which kernel file needs to be loaded for the specif functin is
|
||||
* handled by the base class that is launching the kernel.
|
||||
*/
|
||||
class OpenCLBase {
|
||||
|
||||
private:
|
||||
|
||||
//variables containig OpenCL device and platform ids
|
||||
static cl_platform_id m_platform_id;
|
||||
static cl_device_id m_device_id;
|
||||
|
||||
//variables containit compiled OpenCL program and kernel
|
||||
cl_context_properties m_context_properties[3];
|
||||
cl_program m_program;
|
||||
cl_kernel m_kernel;
|
||||
|
||||
//variables for tracking OpenCL events
|
||||
static cl_event m_last_event;
|
||||
cl_int m_num_events;
|
||||
std::vector<cl_event> m_events;
|
||||
|
||||
//currently load kernel file
|
||||
char * m_kernel_file;
|
||||
|
||||
//type of device used by OpenCL
|
||||
cl_device_type m_device_type;
|
||||
|
||||
/*
|
||||
Name: getPlatforms
|
||||
Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Get all available OpenCL platforms.
|
||||
* Get all avaialble platforms and save in m_platform_ids, save number of platforms
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_getPlatforms();
|
||||
|
||||
|
||||
/*
|
||||
Name: getDevice
|
||||
Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
|
||||
ReturnL success or error code
|
||||
*/
|
||||
/**
|
||||
* Get first available OpenCL device of specified type.
|
||||
* Get first avaialble devices and save device id and platform id for this device,
|
||||
* device name: (-gpu, -mic, -cpu)
|
||||
* ReturnL success or error code
|
||||
*/
|
||||
int ocl_getDevice(const char* device_name);
|
||||
|
||||
/*
|
||||
Name getDeviceType
|
||||
Info: get device type from device name (-gpu, -cpu, -mic)
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Get cl_device_type from the specified device name.
|
||||
* get device type from device name (-gpu, -cpu, -mic)
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
|
||||
|
||||
/*
|
||||
Name: createContext
|
||||
Info: create context with specified device
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Create OpenCL context with specified device.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_createContext();
|
||||
|
||||
/*
|
||||
Name: buildProgram
|
||||
Info: build program from specified kernel file
|
||||
Return: success or error code
|
||||
/**
|
||||
* Build program from specified kernel file.
|
||||
* Return: success or error code.
|
||||
*/
|
||||
int ocl_buildProgram(const char* kernel_file);
|
||||
|
||||
/** Compile program from kernel source string
|
||||
*
|
||||
/**
|
||||
* Compile program from kernel source string.
|
||||
* Takes a string read from OpenCL kernel file saved in kernel_source and compiles the
|
||||
* OpenCL program, that can be then executed on the device.
|
||||
* opts is a string specifiend additional compiler flags.
|
||||
*/
|
||||
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
|
||||
|
||||
protected:
|
||||
|
||||
//memory for random number states
|
||||
int defaultRndSet;
|
||||
cl_mem defaultRndState;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
//OpenCL context and commad queue
|
||||
static cl_context m_context;
|
||||
static cl_command_queue m_command_queue;
|
||||
|
||||
/*
|
||||
constructor
|
||||
*/
|
||||
/**
|
||||
* constructor
|
||||
*/
|
||||
OpenCLBase();
|
||||
|
||||
/*
|
||||
destructor
|
||||
*/
|
||||
/**
|
||||
* destructor
|
||||
*/
|
||||
~OpenCLBase();
|
||||
|
||||
/*
|
||||
Create RND states
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Allocate memory for size random number states and init the rnd states.
|
||||
* Uses AMD clRng library for random numbers.
|
||||
* This library is only compatible with AMD devices.
|
||||
*/
|
||||
int ocl_createRndStates(int size);
|
||||
|
||||
/* Create an array of random numbers on the device
|
||||
*
|
||||
/**
|
||||
* Create an array of random numbers on the device.
|
||||
* Filles hte mem_ptr with random numbers.
|
||||
*/
|
||||
int ocl_createRandomNumbers(void *mem_ptr, int size);
|
||||
|
||||
/*
|
||||
Destroy rnd states
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Destroy rnd states and free device memory.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_deleteRndStates();
|
||||
|
||||
|
||||
/*
|
||||
Name: getAllDevices
|
||||
Info: get all available devices
|
||||
ReturnL success or error code
|
||||
/**
|
||||
* Prints info about all the available platforms and devices.
|
||||
* Can be used for information purposes to see what devices are available on the system.
|
||||
* ReturnL success or error code.
|
||||
*/
|
||||
int ocl_getAllDevices();
|
||||
|
||||
/** Get the OpenCL device count for the set type of device
|
||||
*
|
||||
/**
|
||||
* Get the OpenCL device count for the set type of device.
|
||||
* Device count is set in ndev parameter, returns success or error code.
|
||||
*/
|
||||
int ocl_getDeviceCount(int &ndev);
|
||||
|
||||
/** Get the name of the device used
|
||||
/**
|
||||
* Get the name of the device currently us use.
|
||||
*/
|
||||
int ocl_getDeviceName(std::string &device_name);
|
||||
|
||||
/** Set the device to use for OpenCL kernels.
|
||||
* device id to use is passed as integer.
|
||||
/**
|
||||
* Set the device to use for OpenCL kernels.
|
||||
* Device id to use is passed as integer.
|
||||
*/
|
||||
int ocl_setDevice(int device);
|
||||
|
||||
/** Get a list of all the unique devices of the same type that can run OpenCL kernels
|
||||
* Used when GPUs of different types might be pressent on the system.
|
||||
/**
|
||||
* Get a list of all the unique devices of the same type that can run OpenCL kernels.
|
||||
* Used when GPUs of different types might be pressent on the system.
|
||||
*/
|
||||
int ocl_getUniqueDevices(std::vector<int> &devices);
|
||||
|
||||
/*
|
||||
Name: setUp
|
||||
Info: set up opencl resources
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Initialize OpenCL connection with a device of specified type.
|
||||
* Find if specified device is avaialble, creates a contex and command queue.
|
||||
* Returns success or error code.
|
||||
*/
|
||||
int ocl_setUp(const char* device_name);
|
||||
|
||||
/*
|
||||
Name: loadKernel
|
||||
Info: load and compile opencl kernel file if it has changed
|
||||
Return: success or error code
|
||||
/**
|
||||
* Given a OpenCL kernel file name loads the content and compile the OpenCL code.
|
||||
* Load and compile opencl kernel file if it has changed.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_loadKernel(const char* kernel_file);
|
||||
|
||||
|
||||
/** Build program from kernel source.
|
||||
/**
|
||||
* Build program from kernel source.
|
||||
* Builds a program from source code provided in kernel_source.
|
||||
* If compilation fails will return DKS_ERROR
|
||||
*/
|
||||
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
|
||||
|
||||
/*
|
||||
Name: allocateMemory
|
||||
Info: allocate memory on device
|
||||
Return: return pointer to memory
|
||||
/**
|
||||
* Allocate memory on the device.
|
||||
* Return: return pointer to memory
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||
|
||||
/*
|
||||
Name: allocateMemory
|
||||
Info: allocate memory on device
|
||||
Return: return pointer to memory
|
||||
/**
|
||||
* Allocate memory of specific type on device.
|
||||
* The availabel types are cl_mem_flags type listed in OpenCL documentation:
|
||||
* CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR,
|
||||
* CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
|
||||
* Return: return pointer to memory
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||
|
||||
/** Zero OpenCL memory buffer
|
||||
* Set all the elemetns in the device array to zero
|
||||
/**
|
||||
* Zero OpenCL memory buffer.
|
||||
* Set all the elemetns in the device array to zero.
|
||||
*/
|
||||
template <typename T>
|
||||
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
|
||||
@ -218,92 +231,90 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Name: writeData
|
||||
Info: write data to device memory (needs ptr to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Write data to device memory (needs ptr to mem object)
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||
|
||||
/*
|
||||
Name: copyData
|
||||
Info: copy data from one buffer on the device to another
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Copy data from one buffer on the device to another
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
|
||||
|
||||
/*
|
||||
Name: createKernel
|
||||
Info: create kernel from program
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Create kernel from compiled OpenCL program.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_createKernel(const char* kernel_name);
|
||||
|
||||
/*
|
||||
Name: setKernelArgs
|
||||
Info: set opencl kernel arguments
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Set argiments for the kernel that will be launched.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
|
||||
|
||||
/*
|
||||
Name: executeKernel
|
||||
Info: execute selected kernel (needs kernel parameters)
|
||||
Return: success or error code
|
||||
/**
|
||||
* Execute selected kernel.
|
||||
* Before kenrel can be executed buildProgram must be executed, create kernel must be executed
|
||||
* and kenre specifeid in execute kerenel must be in compiled source, and the necessary
|
||||
* kernel arguments must be set.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
|
||||
|
||||
/*
|
||||
Name: readData
|
||||
Info: read data from device (needs pointer to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Read data from device (needs pointer to mem object).
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||
|
||||
/*
|
||||
Name: freeMemory
|
||||
Info: free device memory (needs ptr to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Free device memory (needs ptr to mem object).
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_freeMemory(cl_mem mem_ptr);
|
||||
|
||||
/*
|
||||
Name: cleanUp
|
||||
Info: free opencl resources
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Free opencl resources.
|
||||
* Deletes the kernel, compiled program, command queue and colese the connection
|
||||
* to device by releasing the context.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_cleanUp();
|
||||
|
||||
/*
|
||||
Name: deviceInfo
|
||||
Info: print device info (mostly for debugging purposes)
|
||||
Return: success or error code
|
||||
*/
|
||||
/**
|
||||
* Print info of currently selected device.
|
||||
* Mostly for debugging purposes, but in verbose mode can be used to see device properties.
|
||||
* Return: success or error code
|
||||
*/
|
||||
int ocl_deviceInfo(bool verbose = true);
|
||||
|
||||
/* Check OpenCL kernel.
|
||||
* Query device and check if it can run the kernel with required parameters
|
||||
/*
|
||||
* Check OpenCL kernel.
|
||||
* Query device and check if it can run the kernel with required parameters.
|
||||
* Also check the available OpenCL extensions - usefull for checking the supported device
|
||||
* features, like double precission.
|
||||
*/
|
||||
int ocl_checkKernel(const char* kernel_name, int work_group_size,
|
||||
bool double_precision, int &threadsPerBlock);
|
||||
|
||||
/*
|
||||
Name: clearEvents
|
||||
Info: clear saved events (for debuging purposes)
|
||||
Return: nothing
|
||||
*/
|
||||
/**
|
||||
* Clear the event list.
|
||||
* Events can be used for timing and synchronization purposes.
|
||||
*/
|
||||
void ocl_clearEvents();
|
||||
|
||||
/*
|
||||
Name: eventInfo
|
||||
Info: print information about kernel timings (for debuging purposes)
|
||||
Return: nothing
|
||||
*/
|
||||
/**
|
||||
* print information about kernel timings from event list.
|
||||
* for debuging purposes
|
||||
*/
|
||||
void ocl_eventInfo();
|
||||
|
||||
/*
|
||||
Return current command queue
|
||||
*/
|
||||
/**
|
||||
* Return current command queue.
|
||||
*/
|
||||
cl_command_queue ocl_getQueue() { return m_command_queue; }
|
||||
};
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
#define DKS_SUCCESS 0
|
||||
#define DKS_ERROR 1
|
||||
|
||||
|
||||
/** Deprecated, SimpleFit implementation of ChiSquare. */
|
||||
class OpenCLChiSquare {
|
||||
|
||||
private:
|
||||
|
@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
|
||||
//write params to gpu
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
|
||||
return ierr;
|
||||
}
|
||||
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
|
||||
if (numfunc == 0)
|
||||
return DKS_SUCCESS;
|
||||
|
||||
//write function values to the GPU
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
|
||||
return ierr;
|
||||
}
|
||||
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
|
||||
if (nummap == 0)
|
||||
return DKS_SUCCESS;
|
||||
|
||||
//wrtie map values to the GPU
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
|
||||
return ierr;
|
||||
}
|
||||
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
|
||||
freeChiSquare();
|
||||
}
|
||||
|
||||
//allocate temporary memory
|
||||
//allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
|
||||
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
|
||||
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
|
||||
if (size_func == 0)
|
||||
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
|
||||
int ierr = DKS_ERROR;
|
||||
if (initDone_m) {
|
||||
|
||||
//free memory
|
||||
//free GPU memory
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
|
||||
@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
//check the GPU kernel
|
||||
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
|
||||
|
||||
return ierr;
|
||||
|
@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
|
||||
|
||||
const std::string openclFunctFooter = "}\n";
|
||||
|
||||
/**
|
||||
* OpenCL implementation of ChiSquareRuntime class.
|
||||
* Implements ChiSquareRuntime interface to allow musrfit to target devices that
|
||||
* support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
|
||||
*/
|
||||
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
|
||||
|
||||
private:
|
||||
|
||||
OpenCLBase *m_oclbase;
|
||||
|
||||
/** Private function to add user defined function to kernel string
|
||||
*
|
||||
/**
|
||||
* Private function to add user defined function to kernel string.
|
||||
*/
|
||||
std::string buildProgram(std::string function);
|
||||
|
||||
/**
|
||||
* Launch parallel reduction kernel to calculate the sum of data array
|
||||
*/
|
||||
double calculateSum(cl_mem data, int length);
|
||||
|
||||
public:
|
||||
|
||||
/** Constructor wiht openclbase argument
|
||||
*
|
||||
/**
|
||||
* Constructor wiht openclbase argument.
|
||||
*/
|
||||
OpenCLChiSquareRuntime(OpenCLBase *base);
|
||||
|
||||
/** Default constructor
|
||||
*
|
||||
/**
|
||||
* Default constructor
|
||||
*/
|
||||
OpenCLChiSquareRuntime();
|
||||
|
||||
/** Default destructor
|
||||
*
|
||||
/**
|
||||
* Default destructor
|
||||
*/
|
||||
~OpenCLChiSquareRuntime();
|
||||
|
||||
/** Compile program and save ptx.
|
||||
/**
|
||||
* Compile program and save ptx.
|
||||
* Add function string to the calcFunction kernel and compile the program
|
||||
* Function must be valid C math expression. Parameters can be addressed in
|
||||
* a form par[map[idx]]
|
||||
*/
|
||||
int compileProgram(std::string function, bool mlh = false);
|
||||
|
||||
/** Launch selected kernel
|
||||
/**
|
||||
* Launch selected kernel.
|
||||
* Launched the selected kernel from the compiled code.
|
||||
* Result is put in &result variable
|
||||
*/
|
||||
@ -64,22 +74,26 @@ public:
|
||||
double timeStart, double timeStep,
|
||||
double &result);
|
||||
|
||||
/** Write params to device.
|
||||
/**
|
||||
* Write params to device.
|
||||
* Write params from double array to mem_param_m memory on the device.
|
||||
*/
|
||||
int writeParams(const double *params, int numparams);
|
||||
|
||||
/** Write functions to device.
|
||||
/**
|
||||
* Write functions to device.
|
||||
* Write function values from double array to mem_func_m memory on the device.
|
||||
*/
|
||||
int writeFunc(const double *func, int numfunc);
|
||||
|
||||
/** Write maps to device.
|
||||
/**
|
||||
* Write maps to device.
|
||||
* Write map values from int array to mem_map_m memory on the device.
|
||||
*/
|
||||
int writeMap(const int *map, int nummap);
|
||||
|
||||
/** Allocate temporary memory needed for chi square.
|
||||
/**
|
||||
* Allocate temporary memory needed for chi square.
|
||||
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
|
||||
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
||||
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
||||
@ -87,14 +101,16 @@ public:
|
||||
*/
|
||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||
|
||||
/** Free temporary memory allocated for chi square.
|
||||
/**
|
||||
* Free temporary memory allocated for chi square.
|
||||
* Frees the chisq temporary memory and memory for params, functions and maps
|
||||
*/
|
||||
int freeChiSquare();
|
||||
|
||||
/** Check MuSR kernels for necessary resources.
|
||||
/**
|
||||
* Check MuSR kernels for necessary resources.
|
||||
* Query device properties to get if sufficient resources are
|
||||
* available to run the kernels
|
||||
* available to run the kernels. Also checks if double precission is enabled on the device.
|
||||
*/
|
||||
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
|
||||
|
||||
|
@ -17,12 +17,16 @@
|
||||
#include "boost/compute/core.hpp"
|
||||
*/
|
||||
|
||||
/** Double3 structure for use in OpenCL code. */
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double z;
|
||||
} Double3;
|
||||
|
||||
/**
|
||||
* Structure for stroing particles in OpenCL code.
|
||||
*/
|
||||
typedef struct {
|
||||
int label;
|
||||
unsigned localID;
|
||||
@ -35,6 +39,10 @@ typedef struct {
|
||||
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
|
||||
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
|
||||
|
||||
/**
|
||||
* OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
||||
* Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
|
||||
*/
|
||||
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
|
||||
|
||||
private:
|
||||
@ -42,16 +50,20 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
/* constructor */
|
||||
/**
|
||||
* Constructor with OpenCLBase as argument.
|
||||
* Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
|
||||
*/
|
||||
OpenCLCollimatorPhysics(OpenCLBase *base) {
|
||||
m_oclbase = base;
|
||||
}
|
||||
|
||||
/* destructor */
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~OpenCLCollimatorPhysics() {
|
||||
}
|
||||
|
||||
/* execute degrader code on device */
|
||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
||||
bool enableRutherforScattering = true);
|
||||
|
||||
|
@ -1,14 +1,3 @@
|
||||
/*
|
||||
|
||||
Name: OpenCLFFT
|
||||
|
||||
Author: Uldis Locans
|
||||
|
||||
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
|
||||
|
||||
Data: 19.09.2014
|
||||
|
||||
*/
|
||||
#ifndef H_OPENCL_FFT
|
||||
#define H_OPENCL_FFT
|
||||
|
||||
@ -22,6 +11,12 @@
|
||||
|
||||
#include "clFFT.h"
|
||||
|
||||
/**
|
||||
* OpenCL FFT class based on BaseFFT interface.
|
||||
* Uses clFFT library to perform FFTs on AMD gpus.
|
||||
* clFFT library works also on nvida GPUs and other devices that
|
||||
* support OpenCL.
|
||||
*/
|
||||
class OpenCLFFT : public BaseFFT {
|
||||
|
||||
private:
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "../Algorithms/GreensFunction.h"
|
||||
#include "OpenCLBase.h"
|
||||
|
||||
/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
||||
class OpenCLGreensFunction : public GreensFunction {
|
||||
|
||||
private:
|
||||
@ -31,7 +32,7 @@ public:
|
||||
int buildProgram();
|
||||
|
||||
/**
|
||||
Info: calc itegral on device memory (taken from OPAL src code)
|
||||
Info: calc itegral on device memory (taken from OPAL src code).
|
||||
Return: success or error code
|
||||
*/
|
||||
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
||||
@ -39,20 +40,20 @@ public:
|
||||
int streamId = -1);
|
||||
|
||||
/**
|
||||
Info: integration of rho2_m field (taken from OPAL src code)
|
||||
Info: integration of rho2_m field (taken from OPAL src code).
|
||||
Return: success or error code
|
||||
*/
|
||||
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
||||
int streamId = -1);
|
||||
|
||||
/**
|
||||
Info: mirror rho field (taken from OPAL src code)
|
||||
Info: mirror rho field (taken from OPAL src code).
|
||||
Return: succes or error code
|
||||
*/
|
||||
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
|
||||
|
||||
/**
|
||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
|
||||
Return: success or error code
|
||||
*/
|
||||
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
||||
|
@ -5,6 +5,10 @@
|
||||
#include <string>
|
||||
#include <sys/time.h>
|
||||
|
||||
/**
|
||||
* Custom timer class.
|
||||
* Allows to insert timers in the code to get function exectution times.
|
||||
*/
|
||||
class DKSTimer {
|
||||
|
||||
private:
|
||||
@ -17,39 +21,45 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
/** Init DKSTimer by seting timer to zero */
|
||||
/** Init DKSTimer by seting timer to zero. */
|
||||
DKSTimer();
|
||||
|
||||
~DKSTimer();
|
||||
|
||||
/** Init the timer
|
||||
* Set the name for timer and clear all values
|
||||
/**
|
||||
* Init the timer.
|
||||
* Set the name for timer and clear all values
|
||||
*/
|
||||
void init(std::string n);
|
||||
|
||||
/** Start the timer.
|
||||
* Get the curret time with gettimeofday and save in timeStart
|
||||
/**
|
||||
* Start the timer.
|
||||
* Get the curret time with gettimeofday and save in timeStart
|
||||
*/
|
||||
void start();
|
||||
|
||||
/** Stop the timer
|
||||
* Get the curretn time with gettimeofday and save in timeEnd
|
||||
* Calculate elapsed time by timeEnd - timeStart and add to timervalue
|
||||
/**
|
||||
* Stop the timer.
|
||||
* Get the curretn time with gettimeofday and save in timeEnd
|
||||
* Calculate elapsed time by timeEnd - timeStart and add to timervalue
|
||||
*/
|
||||
void stop();
|
||||
|
||||
/** Reset timervalue to zero.
|
||||
* Set timervalue, timeStart and timeEnd to zero
|
||||
/**
|
||||
* Reset timervalue to zero.
|
||||
* Set timervalue, timeStart and timeEnd to zero
|
||||
*/
|
||||
void reset();
|
||||
|
||||
/** Return elapsed time in seconds.
|
||||
* Return the value of timervalue
|
||||
/**
|
||||
* Return elapsed time in seconds.
|
||||
* Return the value of timervalue
|
||||
*/
|
||||
double gettime();
|
||||
|
||||
/** Print timer.
|
||||
* Print the elapsed time of the timer
|
||||
/**
|
||||
* Print timer.
|
||||
* Print the elapsed time of the timer
|
||||
*/
|
||||
void print();
|
||||
|
||||
|
Reference in New Issue
Block a user