From eee9dfd89e955cbeb8532f7f61b140d7406ba8d4 Mon Sep 17 00:00:00 2001 From: Uldis Locans Date: Tue, 28 Feb 2017 15:06:45 +0100 Subject: [PATCH] separate OPAL DKS functions from base --- src/DKSBase.cpp | 363 ++---------------------------------------------- src/DKSBase.h | 220 ++--------------------------- src/DKSOPAL.cpp | 277 ++++++++++++++++++++++++++++++++++++ src/DKSOPAL.h | 217 +++++++++++++++++++++++++++++ 4 files changed, 520 insertions(+), 557 deletions(-) create mode 100644 src/DKSOPAL.cpp create mode 100644 src/DKSOPAL.h diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp index 57567db..2960414 100644 --- a/src/DKSBase.cpp +++ b/src/DKSBase.cpp @@ -103,17 +103,14 @@ DKSBase::DKSBase() { #ifdef DKS_CUDA cbase = new CudaBase(); - cchi = new CudaChiSquare(cbase); #endif -ls#ifdef DKS_OPENCL +#ifdef DKS_OPENCL oclbase = new OpenCLBase(); - oclchi = new OpenCLChiSquare(oclbase); #endif #ifdef DKS_MIC micbase = new MICBase(); - micchi = new MICChiSquare(micbase); #endif } @@ -157,10 +154,6 @@ DKSBase::~DKSBase() { if (m_function_name != NULL) delete[] m_function_name; - delete dksfft; - delete dkscol; - delete dksgreens; - #ifdef DKS_CUDA delete cchi; delete cbase; @@ -287,37 +280,7 @@ int DKSBase::getDeviceList(std::vector &devices) { return DKS_ERROR; } -int DKSBase::setup() { - - int ierr = DKS_ERROR; - - if (apiOpenCL()) { - ierr = OPENCL_SAFECALL( DKS_SUCCESS ); - //TODO: only enable if AMD libraries are available - dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(oclbase) ); - dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(oclbase) ); - dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(oclbase) ); - } else if (apiCuda()) { - ierr = CUDA_SAFECALL( DKS_SUCCESS ); - dksfft = CUDA_SAFEINIT( new CudaFFT(cbase) ); - dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(cbase) ); - dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(cbase) ); - } else if (apiOpenMP()) { - ierr = MIC_SAFECALL( DKS_SUCCESS ); - dksfft = MIC_SAFEINIT( new MICFFT(micbase) ); - 
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(micbase) ); - dksgreens = MIC_SAFEINIT( new MICGreensFunction(micbase) ); - } else { - ierr = DKS_ERROR; - } - - return ierr; -} - -/* - init device -*/ -int DKSBase::initDevice() { +int DKSBase::setupDevice() { int ierr = DKS_ERROR; @@ -347,10 +310,15 @@ int DKSBase::initDevice() { } } - if (ierr == DKS_SUCCESS) - ierr = setup(); - return ierr; + +} + +/* + init device +*/ +int DKSBase::initDevice() { + return setupDevice(); } /* @@ -468,292 +436,16 @@ int DKSBase::syncDevice() { return DKS_ERROR; } -/* setup fft plans to reuse if multiple ffts of same size are needed */ -int DKSBase::setupFFT(int ndim, int N[3]) { - - if (apiCuda()) { - return dksfft->setupFFT(ndim, N); - } else if (apiOpenCL()) { - int ierr1 = dksfft->setupFFT(ndim, N); - int ierr2 = dksfft->setupFFTRC(ndim, N); - int ierr3 = dksfft->setupFFTCR(ndim, N); - if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS) - return DKS_ERROR; - - return DKS_SUCCESS; - } else if (apiOpenMP()) { - //micbase.mic_setupFFT(ndim, N); - //BENI: setting up RC and CR transformations on MIC - int ierr1 = dksfft->setupFFTRC(ndim, N, 1.); - int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])); - if (ierr1 != DKS_SUCCESS) - return ierr1; - if (ierr2 != DKS_SUCCESS) - return ierr2; - return DKS_SUCCESS; - } - - return DKS_ERROR; - -} -//BENI: -int DKSBase::setupFFTRC(int ndim, int N[3], double scale) { +int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) { if (apiCuda()) - return dksfft->setupFFT(ndim, N); + return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size)); if (apiOpenCL()) - return dksfft->setupFFTRC(ndim, N); - else if (apiOpenMP()) - return dksfft->setupFFTRC(ndim, N, scale); + return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size)); return DKS_ERROR; - } -//BENI: -int DKSBase::setupFFTCR(int ndim, int N[3], double scale) { - - if (apiCuda()) - return dksfft->setupFFT(ndim, N); - if (apiOpenCL()) - 
return dksfft->setupFFTCR(ndim, N); - else if (apiOpenMP()) - return dksfft->setupFFTCR(ndim, N, scale); - - return DKS_ERROR; - -} - -/* call OpenCL FFT function for selected platform */ -int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { - - if (apiOpenCL() || apiOpenMP()) - return dksfft->executeFFT(data_ptr, ndim, dimsize); - else if (apiCuda()) - return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId); - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -/* call OpenCL IFFT function for selected platform */ -int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { - if (apiOpenCL() || apiOpenMP()) - return dksfft->executeIFFT(data_ptr, ndim, dimsize); - else if (apiCuda()) - return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId); - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -/* call normalize FFT function for selected platform */ -int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { - - if (apiOpenCL()) { - if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) - return dksfft->normalizeFFT(data_ptr, ndim, dimsize); - else - return DKS_ERROR; - } else if (apiCuda()) { - return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId); - } else if (apiOpenMP()) { - return dksfft->normalizeFFT(data_ptr, ndim, dimsize); - } - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -/* call real to complex FFT */ -int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { - - if (apiCuda()) - return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); - else if (apiOpenCL() || apiOpenMP()) - return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize); - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -/* call complex to real FFT */ -int DKSBase::callC2RFFT(void * 
real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { - if (apiCuda()) - return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); - else if (apiOpenCL() || apiOpenMP()) - return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize); - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -/* normalize complex to real iFFT */ -int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { - if (apiCuda()) - return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId); - else if (apiOpenCL()) - return DKS_ERROR; - else if (apiOpenMP()) - return DKS_ERROR; - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; -} - -int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, - double hz_m0, double hz_m1, double hz_m2, int streamId) { - - return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, - hz_m0, hz_m1, hz_m2, streamId); - -} - -int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, - int I, int J, int K, int streamId) { - - return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId); -} - -int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { - - return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId); -} - -int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) { - - return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId); -} - - -int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, - double fTimeResolution, double fRebin, - int sensors, int length, int numpar, double &result) -{ - - if (apiCuda()) { - return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq, - fTimeResolution, fRebin, - sensors, length, numpar, - result)); - } else if (apiOpenCL()) { - - if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) - return 
OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq, - fTimeResolution, fRebin, - sensors, length, numpar, result)); - else - return DKS_ERROR; - } - - DEBUG_MSG("No implementation for selceted platform"); - return DKS_ERROR; - -} - -int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, - double fTimeResolution, double fRebin, double fGoodBinOffset, - int sensors, int length, int numpar, - double &result) -{ - if (apiCuda()) { - return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, - fTimeResolution, fRebin, fGoodBinOffset, - sensors, length, numpar, - result)); - } else if (apiOpenCL()) { - if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) - return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, - fTimeResolution, fRebin, fGoodBinOffset, - sensors, length, numpar, result)); - else - return DKS_ERROR; - } - - DEBUG_MSG("No implementation for selceted platform"); - return DKS_ERROR; - -} - -int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, - double fTimeResolution, double fRebin, double fGoodBinOffset, - int sensors, int length, int numpar, - double &result) -{ - if (apiCuda()) { - return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, - fTimeResolution, fRebin, fGoodBinOffset, - sensors, length, numpar, - result)); - } else if (apiOpenCL()) { - - if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) - return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, - fTimeResolution, fRebin, fGoodBinOffset, - sensors, length, numpar, result)); - else - return DKS_ERROR; - } - - DEBUG_MSG("No implementation for selceted platform"); - return DKS_ERROR; - -} - -int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, - int numparticles, int numparams, - int &numaddback, int &numdead) 
-{ - - return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); - -} - - -int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles) -{ - - return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); - -} - -int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, - void *rx_ptr, void *ry_ptr, void *rz_ptr, - void *px_ptr, void *py_ptr, void *pz_ptr, - void *par_ptr, int numparticles) -{ - - - return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr, - rx_ptr, ry_ptr, rz_ptr, - px_ptr, py_ptr, pz_ptr, - par_ptr, numparticles); - -} - - -int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) -{ - - - return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback); - -} - -int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, - void *rx_ptr, void *ry_ptr, void *rz_ptr, - void *px_ptr, void *py_ptr, void *pz_ptr, - void *par_ptr, int numparticles, int &numaddback) -{ - - return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, - rx_ptr, ry_ptr, rz_ptr, - px_ptr, py_ptr, pz_ptr, - par_ptr, numparticles, numaddback)); - -} - - int DKSBase::callInitRandoms(int size) { if (apiCuda()) return CUDA_SAFECALL(cbase->cuda_createCurandStates(size)); @@ -766,32 +458,3 @@ int DKSBase::callInitRandoms(int size) { return DKS_ERROR; } - -int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, - void *dt_ptr, double dt, double c, - bool usedt, int streamId) -{ - - return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId); - -} - -int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, - void *lastSec_ptr, void *orient_ptr, - int npart, int nsec, void *dt_ptr, double dt, - double c, bool usedt, int streamId) -{ - - return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr, - npart, nsec, dt_ptr, dt, c, usedt, streamId); - -} - -int 
DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) { - if (apiCuda()) - return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size)); - if (apiOpenCL()) - return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size)); - - return DKS_ERROR; -} diff --git a/src/DKSBase.h b/src/DKSBase.h index ebd8d78..d07a7e6 100644 --- a/src/DKSBase.h +++ b/src/DKSBase.h @@ -76,10 +76,6 @@ private: bool m_auto_tuning; bool m_use_config; - DKSFFT *dksfft; - DKSCollimatorPhysics *dkscol; - GreensFunction *dksgreens; - #ifdef DKS_OPENCL OpenCLBase *oclbase; OpenCLChiSquare *oclchi; @@ -140,6 +136,12 @@ protected: } #endif +#ifdef DKS_MIC + MICBase *getMICBase() { + return micbase; + } +#endif + /** Call OpenCL base to load specified kenrel file. * */ @@ -155,10 +157,6 @@ protected: return device_name; } - /** Private function to initialize objects based on the device used. - * - */ - int setup(); public: @@ -179,6 +177,11 @@ public: */ ~DKSBase(); + /** Function to initialize objects based on the device used. + * + */ + int setupDevice(); + /** Turn on auto tuning */ void setAutoTuningOn() { m_auto_tuning = true; } @@ -891,184 +894,10 @@ public: return DKS_ERROR; } - - /////////////////////////////////////////////// - ///////Function library part of dksbase//////// - /////////////////////////////////////////////// - - /** - * Setup FFT function. - * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls. - * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case - * each fft will do its own setup according to fft size and dimensions. - * TODO: opencl and mic implementations - */ - int setupFFT(int ndim, int N[3]); - //BENI: - int setupFFTRC(int ndim, int N[3], double scale = 1.0); - //BENI: - int setupFFTCR(int ndim, int N[3], double scale = 1.0); - - /** - * Call complex-to-complex fft. - * Executes in place complex to compelx fft on the device on data pointed by data_ptr. 
- * stream id can be specified to use other streams than default. - * TODO: mic implementation - */ - int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Call complex-to-complex ifft. - * Executes in place complex to compelx ifft on the device on data pointed by data_ptr. - * stream id can be specified to use other streams than default. - * TODO: mic implementation. - */ - int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Normalize complex to complex ifft. - * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by - * fft size - * TODO: mic implementation. - */ - int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Call real to complex FFT. - * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points - * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size - * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast - * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] - * TODO: opencl and mic implementations - */ - int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Call complex to real iFFT. - * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points - * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size - * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast - * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] - * TODO: opencl and mic implementations. - */ - int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Normalize compelx to real ifft. - * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by - * fft size. - * TODO: opencl and mic implementations. 
- */ - int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1); - - /** - * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. - * For specifics check OPAL docs. - * TODO: opencl and mic implementations. - */ - int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, - double hz_m0, double hz_m1, double hz_m2, int streamId = -1); - - /** - * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. - * For specifics check OPAL docs. - * TODO: opencl and mic implementations. - */ - int callGreensIntegration(void *mem_ptr, void *tmp_ptr, - int I, int J, int K, int streamId = -1); - - /** - * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. - * For specifics check OPAL docs. - * TODO: opencl and mic implementations. - */ - int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); - - /** - * Element by element multiplication. - * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies - * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1. - * TODO: opencl and mic implementations. - */ - int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1); - - /** - * Chi square for parameter fitting on device. - * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for - * intermediate results. Chi square results are put in &results - */ - int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, - double fTimeResolution, double fRebin, - int sensors, int length, int numpar, double &result); - - /** - * max-log-likelihood for parameter fitting on device. - * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, - * mem_par - pointer to parameter set, mem_results - pointer for - * intermediate results. Chi square results are put in &results. - * TODO: opencl and mic implementations. 
- */ - int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, - double fTimeResolution, double fRebin, double fGoodBinOffser, - int sensors, int length, int numpar, - double &result); - - /** - * max-log-likelihood for parameter fitting on device. - * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, - * mem_par - pointer to parameter set, mem_results - pointer for - * intermediate results. Chi square results are put in &results. - * TODO: opencl and mic implementations. - */ - int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, - double fTimeResolution, double fRebin, double fGoodBinOffser, - int sensors, int length, int numpar, - double &result); - - /** - * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. - * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. - * TODO: opencl and mic implementations. - */ - int callCollimatorPhysics(void *mem_ptr, void *par_ptr, - int numparticles, int numparams, - int &numaddback, int &numdead); - - - - /** - * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. - * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. - * TODO: opencl and mic implementations. - */ - int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles); - - /** - * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. - * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. 
- * Test function for the MIC to test SoA layout vs AoS layout used in previous versions - */ - int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, - void *rx_ptr, void *ry_ptr, void *rz_ptr, - void *px_ptr, void *py_ptr, void *pz_ptr, - void *par_ptr, int numparticles); - /** - * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. - * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. - * TODO: opencl and mic implementations. + * Create random numbers on the device and fille mem_data array */ - int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); - - /** - * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. - * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. - * TODO: opencl and mic implementations. - */ - int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, - void *rx_ptr, void *ry_ptr, void *rz_ptr, - void *px_ptr, void *py_ptr, void *pz_ptr, - void *par_ptr, int numparticles, int &numaddback); + int callCreateRandomNumbers(void *mem_ptr, int size); /** * Init random number states and save for reuse on device. @@ -1076,29 +905,6 @@ public: */ int callInitRandoms(int size); - /** - * Integration code from ParallelTTracker from OPAL. - * For specifics check OPAL docs and CudaCollimatorPhysics class docs - */ - int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, - void *dt_ptr, double dt, double c, - bool usedt = false, int streamId = -1); - - /** - * Integration code from ParallelTTracker from OPAL. 
- * For specifics check OPAL docs and CudaCollimatorPhysics class docs - */ - int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, - void *lastSec_ptr, void *orient_ptr, - int npart, int nsec, void *dt_ptr, - double dt, double c, bool usedt = false, - int streamId = -1); - - /** - * Create random numbers on the device and fille mem_data array - */ - int callCreateRandomNumbers(void *mem_ptr, int size); - /** * Print memory information on device (total, used, available) * TODO: opencl and mic imlementation diff --git a/src/DKSOPAL.cpp b/src/DKSOPAL.cpp new file mode 100644 index 0000000..111c3ba --- /dev/null +++ b/src/DKSOPAL.cpp @@ -0,0 +1,277 @@ +#include "DKSOPAL.h" + +DKSOPAL::DKSOPAL() { + dksfft = nullptr; + dkscol = nullptr; + dksgreens = nullptr; +} + +DKSOPAL::~DKSOPAL() { + delete dksfft; + delete dkscol; + delete dksgreens; +} + +int DKSOPAL::setupOPAL() { + int ierr = DKS_ERROR; + + if (apiOpenCL()) { + ierr = OPENCL_SAFECALL( DKS_SUCCESS ); + //TODO: only enable if AMD libraries are available + dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) ); + dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) ); + dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) ); + } else if (apiCuda()) { + ierr = CUDA_SAFECALL( DKS_SUCCESS ); + dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) ); + dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) ); + dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) ); + } else if (apiOpenMP()) { + ierr = MIC_SAFECALL( DKS_SUCCESS ); + dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) ); + dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) ); + dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) ); + } else { + ierr = DKS_ERROR; + } + + return ierr; +} + +int DKSOPAL::initDevice() { + int ierr = setupDevice(); + if (ierr == DKS_SUCCESS) /* create the OPAL helper objects only when device setup succeeded; on failure keep the device error */ + ierr = setupOPAL(); + return ierr; +} + +/* setup fft plans to reuse if multiple ffts 
of same size are needed */ +int DKSOPAL::setupFFT(int ndim, int N[3]) { + + if (apiCuda()) { + return dksfft->setupFFT(ndim, N); + } else if (apiOpenCL()) { + int ierr1 = dksfft->setupFFT(ndim, N); + int ierr2 = dksfft->setupFFTRC(ndim, N); + int ierr3 = dksfft->setupFFTCR(ndim, N); + if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS) + return DKS_ERROR; + + return DKS_SUCCESS; + } else if (apiOpenMP()) { + //micbase.mic_setupFFT(ndim, N); + //BENI: setting up RC and CR transformations on MIC (RC with scale 1, CR with scale 1 over the product of all three entries of N, whatever ndim is) + int ierr1 = dksfft->setupFFTRC(ndim, N, 1.); + int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])); + if (ierr1 != DKS_SUCCESS) + return ierr1; + if (ierr2 != DKS_SUCCESS) + return ierr2; + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} +//BENI: real-to-complex setup; scale is forwarded only on the OpenMP path, the CUDA path calls the plain setupFFT +int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) { + + if (apiCuda()) + return dksfft->setupFFT(ndim, N); + if (apiOpenCL()) + return dksfft->setupFFTRC(ndim, N); + else if (apiOpenMP()) + return dksfft->setupFFTRC(ndim, N, scale); + + return DKS_ERROR; + +} + +//BENI: complex-to-real setup; scale is forwarded only on the OpenMP path, the CUDA path calls the plain setupFFT +int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) { + + if (apiCuda()) + return dksfft->setupFFT(ndim, N); + if (apiOpenCL()) + return dksfft->setupFFTCR(ndim, N); + else if (apiOpenMP()) + return dksfft->setupFFTCR(ndim, N, scale); + + return DKS_ERROR; + +} + +/* call FFT function for selected platform; streamId is forwarded only on the CUDA path */ +int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL() || apiOpenMP()) + return dksfft->executeFFT(data_ptr, ndim, dimsize); + else if (apiCuda()) + return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call IFFT function for selected platform; streamId is forwarded only on the CUDA path */ +int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + if (apiOpenCL() || apiOpenMP()) + return dksfft->executeIFFT(data_ptr, ndim, dimsize); + else if (apiCuda()) + return
dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call normalize FFT function for selected platform; the OpenCL path first loads the OpenCLFFT.cl kernel file and returns DKS_ERROR if that load fails */ +int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL()) { + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return dksfft->normalizeFFT(data_ptr, ndim, dimsize); + else + return DKS_ERROR; + } else if (apiCuda()) { + return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId); + } else if (apiOpenMP()) { + return dksfft->normalizeFFT(data_ptr, ndim, dimsize); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call real to complex FFT; streamId is forwarded only on the CUDA path */ +int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiCuda()) + return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); + else if (apiOpenCL() || apiOpenMP()) + return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call complex to real FFT; streamId is forwarded only on the CUDA path */ +int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); + else if (apiOpenCL() || apiOpenMP()) + return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* normalize complex to real iFFT; only the CUDA path is implemented, the OpenCL and OpenMP paths return DKS_ERROR */ +int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId); + else if (apiOpenCL()) + return DKS_ERROR; + else if (apiOpenMP()) + return DKS_ERROR; + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +int DKSOPAL::callGreensIntegral(void *tmp_ptr,
int I, int J, int K, int NI, int NJ, + double hz_m0, double hz_m1, double hz_m2, int streamId) { + + return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, + hz_m0, hz_m1, hz_m2, streamId); + +} + +int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr, + int I, int J, int K, int streamId) { + + return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId); +} + +int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { + + return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId); +} + +int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) { + + return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId); +} + +int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr, + int numparticles, int numparams, + int &numaddback, int &numdead) +{ /* NOTE(review): numparams, numaddback and numdead are not used in this body */ + + return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); + +} + + +int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles) +{ + + return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); + +} + +int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) +{ + + + return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr, pz_ptr, + par_ptr, numparticles); + +} + + +int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) +{ + + + return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback); + +} + +int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) +{ /* NOTE(review): of the collimator wrappers here only this one goes through MIC_SAFECALL, with no API check - confirm this is intended */ + + return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr,
pz_ptr, + par_ptr, numparticles, numaddback)); + +} + + +int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, + bool usedt, int streamId) +{ + + return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId); + +} + +int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, void *dt_ptr, double dt, + double c, bool usedt, int streamId) +{ + + return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr, + npart, nsec, dt_ptr, dt, c, usedt, streamId); + +} diff --git a/src/DKSOPAL.h b/src/DKSOPAL.h new file mode 100644 index 0000000..f182c1d --- /dev/null +++ b/src/DKSOPAL.h @@ -0,0 +1,217 @@ +#ifndef H_DKS_OPAL +#define H_DKS_OPAL + +#include +#include "AutoTuning/DKSAutoTuning.h" + +#include "DKSBase.h" + +#include "DKSDefinitions.h" + +#include "Algorithms/GreensFunction.h" +#include "Algorithms/CollimatorPhysics.h" +#include "Algorithms/FFT.h" + + +#ifdef DKS_AMD +#include "OpenCL/OpenCLFFT.h" +#include "OpenCL/OpenCLGreensFunction.h" +#include "OpenCL/OpenCLCollimatorPhysics.h" +#endif + +#ifdef DKS_CUDA +#include "CUDA/CudaFFT.cuh" +#include "CUDA/CudaGreensFunction.cuh" +#include "CUDA/CudaCollimatorPhysics.cuh" + +#endif + +#ifdef DKS_MIC +#include "MIC/MICFFT.h" +#include "MIC/MICGreensFunction.hpp" +#include "MIC/MICCollimatorPhysics.h" +#endif + +class DKSOPAL : public DKSBase { + +private: + + DKSFFT *dksfft; + DKSCollimatorPhysics *dkscol; + GreensFunction *dksgreens; + + int setupOPAL(); + +public: + + DKSOPAL(); + + ~DKSOPAL(); + + int initDevice(); + + /////////////////////////////////////////////// + ///////Function library part of dksbase//////// + /////////////////////////////////////////////// + + /** + * Setup FFT function. + * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
+ * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case + * each fft will do its own setup according to fft size and dimensions. + * TODO: opencl and mic implementations + */ + int setupFFT(int ndim, int N[3]); + //BENI: + int setupFFTRC(int ndim, int N[3], double scale = 1.0); + //BENI: + int setupFFTCR(int ndim, int N[3], double scale = 1.0); + +/** + * Call complex-to-complex fft. + * Executes in place complex to complex fft on the device on data pointed by data_ptr. + * stream id can be specified to use other streams than default. + * TODO: mic implementation + */ + int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call complex-to-complex ifft. + * Executes in place complex to complex ifft on the device on data pointed by data_ptr. + * stream id can be specified to use other streams than default. + * TODO: mic implementation. + */ + int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Normalize complex to complex ifft. + * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by + * fft size + * TODO: mic implementation. + */ + int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call real to complex FFT. + * Executes out of place real to complex fft, real_ptr points to real data, comp_ptr - points + * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size + * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least + * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] + * TODO: opencl and mic implementations + */ + int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call complex to real iFFT. + * Executes out of place complex to real ifft, real_ptr points to real data, comp_ptr - points + * to complex data, ndim - dimension of data, dimsize size of each dimension.
real_ptr size + * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast + * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] + * TODO: opencl and mic implementations. + */ + int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Normalize complex to real ifft. + * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by + * fft size. + * TODO: opencl and mic implementations. The OpenCL and OpenMP paths currently return DKS_ERROR. + */ + int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. Forwards its arguments to dksgreens->greensIntegral(). + * TODO: opencl and mic implementations. + */ + int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, + double hz_m0, double hz_m1, double hz_m2, int streamId = -1); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. Forwards its arguments to dksgreens->integrationGreensFunction(). + * TODO: opencl and mic implementations. + */ + int callGreensIntegration(void *mem_ptr, void *tmp_ptr, + int I, int J, int K, int streamId = -1); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. Forwards its arguments to dksgreens->mirrorRhoField(). + * TODO: opencl and mic implementations. + */ + int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); + + /** + * Element by element multiplication. + * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies + * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1. + * TODO: opencl and mic implementations. + */ + int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+ * TODO: opencl and mic implementations. NOTE(review): the implementation forwards only mem_ptr, par_ptr and numparticles; numparams, numaddback and numdead are not used. + */ + int callCollimatorPhysics(void *mem_ptr, void *par_ptr, + int numparticles, int numparams, + int &numaddback, int &numdead); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. Makes the same dkscol->CollimatorPhysics() call as callCollimatorPhysics. + */ + int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * Test function for the MIC to test SoA layout vs AoS layout used in previous versions + */ + int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. + */ + int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. + */ + int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback); + + /** + * Integration code from ParallelTTracker from OPAL.
+ * For specifics check OPAL docs and CudaCollimatorPhysics class docs. Forwards its arguments unchanged to dkscol->ParallelTTrackerPush(). + */ + int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + + /** + * Integration code from ParallelTTracker from OPAL. + * For specifics check OPAL docs and CudaCollimatorPhysics class docs. Forwards its arguments unchanged to dkscol->ParallelTTrackerPushTransform(). + */ + int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1); + + +}; + +#endif