#include "DKSBase.h" #define API_OPENCL "OpenCL" #define API_CUDA "Cuda" #define API_OPENMP "OpenMP" #define DEVICE_GPU "-gpu" #define DEVICE_CPU "-cpu" #define DEVICE_MIC "-mic" //=====================================// //==========Private functions==========// //=====================================// bool DKSBase::apiOpenCL() { if (!m_api_set) return false; if (strcmp(m_api_name, API_OPENCL) != 0) return false; return true; } bool DKSBase::apiCuda() { if (!m_api_set) return false; if (strcmp(m_api_name, API_CUDA) != 0) return false; return true; } bool DKSBase::apiOpenMP() { if (!m_api_set) return false; if (strcmp(m_api_name, API_OPENMP) != 0) return false; return true; } bool DKSBase::deviceGPU() { if (!m_device_set) return false; if (strcmp(m_device_name, DEVICE_GPU) != 0) return false; return true; } bool DKSBase::deviceCPU() { if (!m_device_set) return false; if (strcmp(m_device_name, DEVICE_CPU) != 0) return false; return true; } bool DKSBase::deviceMIC() { if (!m_device_set) return false; if (strcmp(m_device_name, DEVICE_MIC) != 0) return false; return true; } int DKSBase::loadOpenCLKernel(const char *kernel_name) { //load kernel char * kernel_file = new char[500]; kernel_file[0] = '\0'; strcat(kernel_file, OPENCL_KERNELS); strcat(kernel_file, kernel_name); int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) ); delete[] kernel_file; return ierr; } //=====================================// //==========Public functions===========// //=====================================// DKSBase::DKSBase() { m_device_name = NULL; m_api_name = NULL; m_function_name = NULL; m_device_set = false; m_api_set = false; m_function_set = false; m_auto_tuning = false; m_use_config = false; #ifdef DKS_CUDA cbase = new CudaBase(); cchi = new CudaChiSquare(cbase); #endif ls#ifdef DKS_OPENCL oclbase = new OpenCLBase(); oclchi = new OpenCLChiSquare(oclbase); #endif #ifdef DKS_MIC micbase = new MICBase(); micchi = new MICChiSquare(micbase); #endif } DKSBase::DKSBase(const char* api_name, const char* device_name) { setAPI(api_name, strlen(api_name)); setDevice(device_name, strlen(device_name)); m_function_name = NULL; m_function_set = false; m_auto_tuning = false; m_use_config = false; #ifdef DKS_CUDA cbase = new CudaBase(); cchi = new CudaChiSquare(cbase); #endif #ifdef DKS_OPENCL oclbase = new OpenCLBase(); oclchi = new OpenCLChiSquare(oclbase); #endif #ifdef DKS_MIC micbase = new MICBase(); miccol = new MICCollimatorPhysics(micbase); #endif } DKSBase::~DKSBase() { if (m_device_name != NULL) delete[] m_device_name; if (m_api_name != NULL) delete[] m_api_name; if (m_function_name != NULL) delete[] m_function_name; delete dksfft; delete dkscol; delete dksgreens; #ifdef DKS_CUDA delete cchi; delete cbase; #endif #ifdef DKS_OPENCL delete oclchi; delete oclbase; #endif #ifdef DKS_MIC delete micchi; delete micbase; #endif } /* Name: setDevice Info: sets specific device to use. length specifies device_name string length (deprecated) Return: success or error code */ int DKSBase::setDevice(const char* device_name, int length) { if (m_device_set) delete[] m_device_name; int l = strlen(device_name); m_device_name = new char[l+1]; for (int i = 0; i < l; i++) m_device_name[i] = device_name[i]; m_device_name[l] = '\0'; m_device_set = true; return DKS_SUCCESS; } /* Name: setAPI Info: sets specific api (OpenCL, CUDA, OpenACC, OpenMP) to use Return: success or error code */ int DKSBase::setAPI(const char* api_name, int length) { if (m_api_set) delete[] m_api_name; int l = strlen(api_name); m_api_name = new char[l+1]; for (int i = 0; i < l; i++) m_api_name[i] = api_name[i]; m_api_name[l] = '\0'; m_api_set = true; return DKS_SUCCESS; } /* Name: getDevices Info: get all available devices Return: success or error code */ int DKSBase::getDevices() { int ierr1 = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() ); int ierr2 = CUDA_SAFECALL( cbase->cuda_getDevices() ); int ierr3 = MIC_SAFECALL( micbase->mic_getDevices() ); if (ierr1 + ierr2 + ierr3 != DKS_SUCCESS) return DKS_ERROR; return DKS_SUCCESS; } int DKSBase::getDeviceCount(int &ndev) { ndev = 0; if (apiOpenCL()) return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) ); else if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) ); else if (apiOpenMP()) return DKS_ERROR; else return DKS_ERROR; } int DKSBase::getDeviceName(std::string &device_name) { if (apiOpenCL()) return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) ); else if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) ); else if (apiOpenMP()) return DKS_ERROR; else return DKS_ERROR; } int DKSBase::setDefaultDevice(int device) { std::cout << "Set device " << device << std::endl; if (apiOpenCL()) return OPENCL_SAFECALL( oclbase->ocl_setDevice(device) ); else if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_setDevice(device) ); else if (apiOpenMP()) return DKS_ERROR; else return DKS_ERROR; } int DKSBase::getDeviceList(std::vector &devices) { if (apiOpenCL()) return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) ); else if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices) ); else if (apiOpenMP()) return DKS_ERROR; else return DKS_ERROR; } int DKSBase::setup() { int ierr = DKS_ERROR; if (apiOpenCL()) { ierr = OPENCL_SAFECALL( DKS_SUCCESS ); //TODO: only enable if AMD libraries are available dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(oclbase) ); dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(oclbase) ); dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(oclbase) ); } else if (apiCuda()) { ierr = CUDA_SAFECALL( DKS_SUCCESS ); dksfft = CUDA_SAFEINIT( new CudaFFT(cbase) ); dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(cbase) ); dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(cbase) ); } else if (apiOpenMP()) { ierr = MIC_SAFECALL( DKS_SUCCESS ); dksfft = MIC_SAFEINIT( new MICFFT(micbase) ); dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(micbase) ); dksgreens = MIC_SAFEINIT( new MICGreensFunction(micbase) ); } else { ierr = DKS_ERROR; } return ierr; } /* init device */ int DKSBase::initDevice() { int ierr = DKS_ERROR; //if api is not set default is OpenCL if (!m_api_set) { setDevice("-gpu", 4); setAPI(API_OPENCL, 6); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); } else { if (apiOpenCL()) { if (!m_device_set) { setDevice("-gpu", 4); setAPI(API_OPENCL, 6); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); } else { setAPI(API_OPENCL, 6); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) ); } } else if (apiCuda()) { setDevice("-gpu", 4); setAPI(API_CUDA, 4); ierr = CUDA_SAFECALL(DKS_SUCCESS); } else if (apiOpenMP()) { setDevice("-mic", 4); setAPI(API_OPENMP, 6); ierr = MIC_SAFECALL(DKS_SUCCESS); } } if (ierr == DKS_SUCCESS) ierr = setup(); return ierr; } /* set up cuda, opencl and mic to allow async data transfer and kernel execution. name stream 'stolen' from cuda. opencl context ~ cuda stream. TODO: implementations for OpenCL and MIC still needed */ int DKSBase::createStream(int &streamId) { if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_createStream(streamId) ); else if (apiOpenMP()) return MIC_SAFECALL( micbase->mic_createStream(streamId) ); DEBUG_MSG("Streams not enbled for this platforms jet"); return DKS_ERROR; } /* send device pointer to other processes */ #ifdef DKS_MPI int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) { if ( apiCuda() ) { #ifdef DKS_CUDA cudaError cerror; cudaIpcMemHandle_t shandle; cerror = cudaIpcGetMemHandle(&shandle, mem_ptr); MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm); if (cerror != cudaSuccess) { DEBUG_MSG("Error geting mem handle"); return DKS_ERROR; } return DKS_SUCCESS; #endif } else if (apiOpenMP()) { #ifdef DKS_MIC //BENI: DEBUG_MSG("No SendPointer for MIC is implemented"); return DKS_ERROR; #endif } else { DEBUG_MSG("Send device pointer not implemented on selected platform"); return DKS_ERROR; } return DKS_ERROR; } #endif /* receive device pointer */ #ifdef DKS_MPI void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) { void *mem_ptr; if (apiCuda()) { #ifdef DKS_CUDA cudaError cerror; cudaIpcMemHandle_t rhandle; MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm, NULL); cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess); if (cerror != cudaSuccess) { DEBUG_MSG("Error opening received handle"); ierr = DKS_ERROR; } #endif return mem_ptr; } else if (apiOpenMP()) { #ifdef DKS_MIC //BENI: DEBUG_MSG("No ReceivePointer for MIC is implemented"); return DKS_SUCCESS; #endif return mem_ptr; } else { ierr = DKS_ERROR; DEBUG_MSG("Receive device pointer not implemented for selected platform"); return mem_ptr; } } #endif /* close received handle */ int DKSBase::closeHandle(void *mem_ptr) { if (apiCuda()) { #ifdef DKS_CUDA cudaError cerror; cerror = cudaIpcCloseMemHandle(mem_ptr); if (cerror != cudaSuccess) { DEBUG_MSG("Error closing memory handle"); return DKS_ERROR; } return DKS_SUCCESS; #endif } DEBUG_MSG("Memory handles not implemented for selected platform"); return DKS_ERROR; } /* sync device calls */ int DKSBase::syncDevice() { if (apiCuda()) return CUDA_SAFECALL( cbase->cuda_syncDevice() ); else if (apiOpenMP()) return MIC_SAFECALL( micbase->mic_syncDevice() ); return DKS_ERROR; } /* setup fft plans to reuse if multiple ffts of same size are needed */ int DKSBase::setupFFT(int ndim, int N[3]) { if (apiCuda()) { return dksfft->setupFFT(ndim, N); } else if (apiOpenCL()) { int ierr1 = dksfft->setupFFT(ndim, N); int ierr2 = dksfft->setupFFTRC(ndim, N); int ierr3 = dksfft->setupFFTCR(ndim, N); if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS) return DKS_ERROR; return DKS_SUCCESS; } else if (apiOpenMP()) { //micbase.mic_setupFFT(ndim, N); //BENI: setting up RC and CR transformations on MIC int ierr1 = dksfft->setupFFTRC(ndim, N, 1.); int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])); if (ierr1 != DKS_SUCCESS) return ierr1; if (ierr2 != DKS_SUCCESS) return ierr2; return DKS_SUCCESS; } return DKS_ERROR; } //BENI: int DKSBase::setupFFTRC(int ndim, int N[3], double scale) { if (apiCuda()) return dksfft->setupFFT(ndim, N); if (apiOpenCL()) return dksfft->setupFFTRC(ndim, N); else if (apiOpenMP()) return dksfft->setupFFTRC(ndim, N, scale); return DKS_ERROR; } //BENI: int DKSBase::setupFFTCR(int ndim, int N[3], double scale) { if (apiCuda()) return dksfft->setupFFT(ndim, N); if (apiOpenCL()) return dksfft->setupFFTCR(ndim, N); else if (apiOpenMP()) return dksfft->setupFFTCR(ndim, N, scale); return DKS_ERROR; } /* call OpenCL FFT function for selected platform */ int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { if (apiOpenCL() || apiOpenMP()) return dksfft->executeFFT(data_ptr, ndim, dimsize); else if (apiCuda()) return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId); DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } /* call OpenCL IFFT function for selected platform */ int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { if (apiOpenCL() || apiOpenMP()) return dksfft->executeIFFT(data_ptr, ndim, dimsize); else if (apiCuda()) return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId); DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } /* call normalize FFT function for selected platform */ int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { if (apiOpenCL()) { if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) return dksfft->normalizeFFT(data_ptr, ndim, dimsize); else return DKS_ERROR; } else if (apiCuda()) { return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId); } else if (apiOpenMP()) { return dksfft->normalizeFFT(data_ptr, ndim, dimsize); } DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } /* call real to complex FFT */ int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { if (apiCuda()) return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); else if (apiOpenCL() || apiOpenMP()) return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize); DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } /* call complex to real FFT */ int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { if (apiCuda()) return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId); else if (apiOpenCL() || apiOpenMP()) return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize); DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } /* normalize complex to real iFFT */ int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { if (apiCuda()) return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId); else if (apiOpenCL()) return DKS_ERROR; else if (apiOpenMP()) return DKS_ERROR; DEBUG_MSG("No implementation for selected platform"); return DKS_ERROR; } int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, double hz_m0, double hz_m1, double hz_m2, int streamId) { return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, hz_m0, hz_m1, hz_m2, streamId); } int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, int I, int J, int K, int streamId) { return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId); } int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId); } int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) { return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId); } int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, double fTimeResolution, double fRebin, int sensors, int length, int numpar, double &result) { if (apiCuda()) { return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq, fTimeResolution, fRebin, sensors, length, numpar, result)); } else if (apiOpenCL()) { if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq, fTimeResolution, fRebin, sensors, length, numpar, result)); else return DKS_ERROR; } DEBUG_MSG("No implementation for selceted platform"); return DKS_ERROR; } int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, double fTimeResolution, double fRebin, double fGoodBinOffset, int sensors, int length, int numpar, double &result) { if (apiCuda()) { return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, fTimeResolution, fRebin, fGoodBinOffset, sensors, length, numpar, result)); } else if (apiOpenCL()) { if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, fTimeResolution, fRebin, fGoodBinOffset, sensors, length, numpar, result)); else return DKS_ERROR; } DEBUG_MSG("No implementation for selceted platform"); return DKS_ERROR; } int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, double fTimeResolution, double fRebin, double fGoodBinOffset, int sensors, int length, int numpar, double &result) { if (apiCuda()) { return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, fTimeResolution, fRebin, fGoodBinOffset, sensors, length, numpar, result)); } else if (apiOpenCL()) { if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, fTimeResolution, fRebin, fGoodBinOffset, sensors, length, numpar, result)); else return DKS_ERROR; } DEBUG_MSG("No implementation for selceted platform"); return DKS_ERROR; } int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int numparams, int &numaddback, int &numdead) { return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); } int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles) { return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles); } int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, void *par_ptr, int numparticles) { return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, par_ptr, numparticles); } int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) { return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback); } int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr, void *par_ptr, int numparticles, int &numaddback) { return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, py_ptr, pz_ptr, par_ptr, numparticles, numaddback)); } int DKSBase::callInitRandoms(int size) { if (apiCuda()) return CUDA_SAFECALL(cbase->cuda_createCurandStates(size)); else if (apiOpenCL()) return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size)); else if (apiOpenMP()) return MIC_SAFECALL(micbase->mic_createRandStreams(size)); DEBUG_MSG("No implementation for selceted platform"); return DKS_ERROR; } int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, double dt, double c, bool usedt, int streamId) { return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId); } int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr, double dt, double c, bool usedt, int streamId) { return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr, npart, nsec, dt_ptr, dt, c, usedt, streamId); } int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) { if (apiCuda()) return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size)); if (apiOpenCL()) return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size)); return DKS_ERROR; }