diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c08e39..3d9370c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,10 +38,17 @@ IF (Boost_FOUND) LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) ENDIF (Boost_FOUND) +#find clFFT +SET (clFFT_USE_STATIC_LIBS OFF) +FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT}) +MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}") +MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}") +INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS}) +LINK_DIRECTORIES (${CLFFT_LIBRARIES}) + #enable UQTK OPTION (USE_UQTK "Use UQTK" OFF) - #intel icpc compiler specific flags IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) diff --git a/auto-tuning/CMakeLists.txt b/auto-tuning/CMakeLists.txt index e3be789..5a90088 100644 --- a/auto-tuning/CMakeLists.txt +++ b/auto-tuning/CMakeLists.txt @@ -3,17 +3,17 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) #chi square kernel tests ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) -TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES}) +TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) -TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES}) +TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) IF (USE_UQTK) ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp) - TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) ENDIF (USE_UQTK) #TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES}) #test to verify search functions ADD_EXECUTABLE(testSearch testSearch.cpp) -TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES}) +TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp index 96e9b19..7194971 100644 --- a/src/DKSBase.cpp +++ b/src/DKSBase.cpp @@ -189,6 +189,7 @@ DKSBase::~DKSBase() { delete oclbase; #endif + #ifdef DKS_MIC delete micfft; delete miccol; @@ -461,6 +462,14 @@ int DKSBase::setupFFT(int ndim, int N[3]) { if (apiCuda()) { return CUDA_SAFECALL( cfft->setupFFT(ndim, N) ); + } else if (apiOpenCL()) { + int ierr1 = OPENCL_SAFECALL( oclfft->setupFFT(ndim, N) ); + int ierr2 = OPENCL_SAFECALL( oclfft->setupFFTRC(ndim, N) ); + int ierr3 = OPENCL_SAFECALL( oclfft->setupFFTCR(ndim, N) ); + if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS) + return DKS_ERROR; + + return DKS_SUCCESS; } else if (apiOpenMP()) { //micbase.mic_setupFFT(ndim, N); //BENI: setting up RC and CR transformations on MIC @@ -481,6 +490,8 @@ int DKSBase::setupFFTRC(int ndim, int N[3], double scale) { if (apiCuda()) return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + if (apiOpenCL()) + return OPENCL_SAFECALL(oclfft->setupFFTRC(ndim, N)); else if (apiOpenMP()) return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale)); @@ -493,6 +504,8 @@ int DKSBase::setupFFTCR(int ndim, int N[3], double scale) { if (apiCuda()) return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + if (apiOpenCL()) + return OPENCL_SAFECALL(oclfft->setupFFTCR(ndim, N)); else if (apiOpenMP()) return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale)); @@ -559,6 +572,8 @@ int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[ if (apiCuda()) return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenCL()) + return OPENCL_SAFECALL( oclfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize) ); else if (apiOpenMP()) return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) ); @@ -570,6 +585,8 @@ int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[ int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { if (apiCuda()) return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenCL()) + return OPENCL_SAFECALL( oclfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize) ); else if (apiOpenMP()) return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) ); @@ -581,25 +598,15 @@ int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[ int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { if (apiCuda()) return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) ); + else if (apiOpenCL()) + return DKS_SUCCESS; + else if (apiOpenMP()) + return DKS_SUCCESS; DEBUG_MSG("No implementation for selected platform"); return DKS_SUCCESS; } -/* normalize complex to real iFFT */ -int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) { - if (apiOpenCL()) { - if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS) - return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim)); - else - return DKS_ERROR; - } - - DEBUG_MSG("No implementation for selected platform"); - return DKS_ERROR; - -} - int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, double hz_m0, double hz_m1, double hz_m2, int streamId) { diff --git a/src/DKSBase.h b/src/DKSBase.h index ea8bc39..dccb022 100644 --- a/src/DKSBase.h +++ b/src/DKSBase.h @@ -405,7 +405,7 @@ public: } else if (apiOpenMP()) { #ifdef DKS_MIC void * mem_ptr = NULL; - mem_ptr = micbase.mic_allocateMemory(elements); + mem_ptr = micbase->mic_allocateMemory(elements); return mem_ptr; #endif } @@ -498,7 +498,7 @@ public: return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset)); } else if (apiOpenMP()) { - return MIC_SAFECALL(micbase.mic_writeData(mem_ptr, data, elements, offset)); + return MIC_SAFECALL(micbase->mic_writeData(mem_ptr, data, elements, offset)); } @@ -532,7 +532,7 @@ public: size_t size = sizeof(T)*elements; return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset)); } else if (apiOpenMP()) { - return MIC_SAFECALL(micbase.mic_writeDataAsync(mem_ptr, data, elements, streamId, offset)); + return MIC_SAFECALL(micbase->mic_writeDataAsync(mem_ptr, data, elements, streamId, offset)); } return DKS_ERROR; @@ -832,7 +832,7 @@ public: size_t size = sizeof(T)*elements; return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset)); } else if (apiOpenMP()) { - return MIC_SAFECALL(micbase.mic_readData(mem_ptr, out_data, elements, offset)); + return MIC_SAFECALL(micbase->mic_readData(mem_ptr, out_data, elements, offset)); } return DKS_ERROR; @@ -860,7 +860,7 @@ public: size_t size = sizeof(T)*elements; return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset)); } else if (apiOpenMP()) { - return MIC_SAFECALL(micbase.mic_readDataAsync(mem_ptr, out_data, elements, + return MIC_SAFECALL(micbase->mic_readDataAsync(mem_ptr, out_data, elements, streamId, offset)); } @@ -880,7 +880,7 @@ public: else if (apiCuda()) return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr)); else if (apiOpenMP()) - return MIC_SAFECALL(micbase.mic_freeMemory(mem_ptr, elements)); + return MIC_SAFECALL(micbase->mic_freeMemory(mem_ptr, elements)); return DKS_ERROR; } @@ -955,12 +955,6 @@ public: */ int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1); - /** - * Transpose 2D and 3D arrays, OpenCL implementation - * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose - */ - int callTranspose(void *mem_ptr, int N[3], int ndim, int dim); - /** * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. * For specifics check OPAL docs. diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h index ae0a15c..ece3ac8 100644 --- a/src/OpenCL/OpenCLBase.h +++ b/src/OpenCL/OpenCLBase.h @@ -52,9 +52,6 @@ class OpenCLBase { private: - static cl_context m_context; - static cl_command_queue m_command_queue; - static cl_platform_id m_platform_id; static cl_device_id m_device_id; @@ -118,6 +115,9 @@ protected: public: + + static cl_context m_context; + static cl_command_queue m_command_queue; /* constructor diff --git a/src/OpenCL/OpenCLFFT.cpp b/src/OpenCL/OpenCLFFT.cpp index 5cbe9e9..66ab3fe 100644 --- a/src/OpenCL/OpenCLFFT.cpp +++ b/src/OpenCL/OpenCLFFT.cpp @@ -89,26 +89,82 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) call fft execution on device for every dimension */ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { - int ierr; - + + int dkserr = DKS_SUCCESS; + cl_int ierr; cl_mem inout = (cl_mem)data; - int n = N[0]; - for (int dim = 0; dim < ndim; dim++) { - ierr = ocl_callBitReverseKernel(inout, dim, ndim, n); - if (ierr != OCL_SUCCESS) { - DEBUG_MSG("Error executing bit reverse"); - return OCL_ERROR; - } + if (forward) + ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, + 0, NULL, NULL, &inout, NULL, NULL); + else + ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, + 0, NULL, NULL, &inout, NULL, NULL); - ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward); - if (ierr != OCL_SUCCESS) { - DEBUG_MSG("Error executing fft reverse"); - return OCL_ERROR; - } + if (ierr != OCL_SUCCESS) { + dkserr = DKS_ERROR; + DEBUG_MSG("Error executing cfFFT\n"); + if (ierr == CLFFT_INVALID_PLAN) + std::cout << "Invlalid plan" << std::endl; + else + std::cout << "CLFFT error" << std::endl; } - return OCL_SUCCESS; + return dkserr; +} + +/* + call rcfft execution on device for every dimension +*/ +int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) { + + std::cout << "execute RCFFT" << std::endl; + + int dkserr = DKS_SUCCESS; + cl_int ierr; + cl_mem real_in = (cl_mem)real_ptr; + cl_mem comp_out = (cl_mem)comp_ptr; + + ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, + 0, NULL, NULL, &real_in, &comp_out, NULL); + + if (ierr != OCL_SUCCESS) { + dkserr = DKS_ERROR; + DEBUG_MSG("Error executing cfFFT\n"); + if (ierr == CLFFT_INVALID_PLAN) + std::cout << "Invlalid plan" << std::endl; + else + std::cout << "CLFFT error" << std::endl; + } + + return dkserr; +} + +/* + call rcfft execution on device for every dimension +*/ +int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) { + + std::cout << "execute CRFFT" << std::endl; + + int dkserr = DKS_SUCCESS; + cl_int ierr; + cl_mem real_in = (cl_mem)real_ptr; + cl_mem comp_out = (cl_mem)comp_ptr; + + ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, + 0, NULL, NULL, &comp_out, &real_in, NULL); + + if (ierr != OCL_SUCCESS) { + dkserr = DKS_ERROR; + DEBUG_MSG("Error executing cfFFT\n"); + if (ierr == CLFFT_INVALID_PLAN) + std::cout << "Invlalid plan" << std::endl; + else + std::cout << "CLFFT error" << std::endl; + } + + return dkserr; } /* @@ -120,10 +176,11 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) { } /* - call kernel to normalize fft + call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled. */ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { +/* cl_mem inout = (cl_mem)data; int n = N[0]; @@ -150,132 +207,143 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { DEBUG_MSG("Error executing kernel"); return OCL_ERROR; } - +*/ return OCL_SUCCESS; } -int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) { - - int ierr; - int size = sizeof(cl_double2)*pow(N,ndim); - - cl_mem mem_tmp; - cl_mem mem_src = (cl_mem)src; - cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr); +int OpenCLFFT::setupFFT(int ndim, int N[3]) { - //set the number of work items in each dimension - size_t work_items[3]; - int p = 1; - int threads = N / 2; - int f = (forward) ? -1 : 1; - - //execute kernel - int n = (int)log2(N); - for (int i = 0; i < ndim; i++) { + cl_int err; - int dim = i+1; - p = 1; - work_items[0] = (dim == 1) ? N/2 : N; - work_items[1] = (dim == 2) ? N/2 : N; - work_items[2] = (dim == 3) ? N/2 : N; - - //transpose array if calculating dimension larger than 1 - //if (dim > 1) - // ocl_executeTranspose(mem_src, N, ndim, dim); - - //create kernel and set kernel arguments - if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS) - return OCL_ERROR; - - for (int t = 1; t <= log2(N); t++) { - - m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); - m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst); - m_oclbase->ocl_setKernelArg(2, sizeof(int), &p); - m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads); - m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim); - m_oclbase->ocl_setKernelArg(5, sizeof(int), &f); - - if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) - return OCL_ERROR; + clfftDim dim = CLFFT_3D; + size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; - mem_tmp = mem_src; - mem_src = mem_dst; - mem_dst = mem_tmp; - - p = 2*p; + /* Create 3D fft plan*/ + err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength); + + /* Set plan parameters */ + err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE); + if (err != CL_SUCCESS) + std::cout << "Error setting precision" << std::endl; + err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); + if (err != CL_SUCCESS) + std::cout << "Error setting layout" << std::endl; + err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE); + if (err != CL_SUCCESS) + std::cout << "Error setting result location" << std::endl; + /* Bake the plan */ + err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL); + + if (err != CL_SUCCESS) { + DEBUG_MSG("Error creating Complex-to-complex plan"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) { + cl_int err; + + clfftDim dim = CLFFT_3D; + size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; + + /* Create 3D fft plan*/ + err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength); + + /* Set plan parameters */ + err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE); + err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED); + err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE); + + /* Bake the plan */ + err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL); + + if (err != CL_SUCCESS) { + DEBUG_MSG("Error creating Real-to-complex plan"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) { + cl_int err; + + clfftDim dim = CLFFT_3D; + size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; + + /* Create 3D fft plan*/ + err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength); + + /* Set plan parameters */ + err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE); + err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL); + err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE); + + /* Bake the plan */ + err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL); + + if (err != CL_SUCCESS) { + DEBUG_MSG("Error creating Complex-to-real plan"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +int OpenCLFFT::destroyFFT() { + clfftDestroyPlan(&planHandleZ2Z); + clfftDestroyPlan(&planHandleD2Z); + clfftDestroyPlan(&planHandleZ2D); + + clfftTeardown(); + + return DKS_SUCCESS; +} + + +void OpenCLFFT::printError(clfftStatus err) { + + if (err != CL_SUCCESS) { + std::cout << "Error creating default plan " << err << std::endl; + switch(err) { + case CLFFT_BUGCHECK: + std::cout << "bugcheck" << std::endl; + break; + case CLFFT_NOTIMPLEMENTED: + std::cout << "not implemented" << std::endl; + break; + case CLFFT_TRANSPOSED_NOTIMPLEMENTED: + std::cout << "transposed not implemented" << std::endl; + break; + case CLFFT_FILE_NOT_FOUND: + std::cout << "file not found" << std::endl; + break; + case CLFFT_FILE_CREATE_FAILURE: + std::cout << "file create failure" << std::endl; + break; + case CLFFT_VERSION_MISMATCH: + std::cout << "version missmatch" << std::endl; + break; + case CLFFT_INVALID_PLAN: + std::cout << "invalid plan" << std::endl; + break; + case CLFFT_DEVICE_NO_DOUBLE: + std::cout << "no double" << std::endl; + break; + case CLFFT_DEVICE_MISMATCH: + std::cout << "device missmatch" << std::endl; + break; + case CLFFT_ENDSTATUS: + std::cout << "end status" << std::endl; + break; + default: + std::cout << "other: " << err << std::endl; + break; } - - //transpose array back if calculating dimension larger than 1 - //if (dim > 1) - // ocl_executeTranspose(mem_src, N, ndim, dim); - } - - if (ndim*n % 2 == 1) { - m_oclbase->ocl_copyData(mem_src, mem_dst, size); - mem_tmp = mem_src; - mem_src = mem_dst; - mem_dst = mem_tmp; } - m_oclbase->ocl_freeMemory(mem_dst); - - return OCL_SUCCESS; - -} - -int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) { - - cl_mem mem_src = (cl_mem)src; - - size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N}; - size_t work_group_size[3] = {(size_t)N/2, 1, 1}; - - m_oclbase->ocl_createKernel("fft_batch3D"); - - m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); - m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL); - m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL); - m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL); - m_oclbase->ocl_setKernelArg(4, sizeof(int), &N); - - - for (int dim = 1; dim < ndim+1; dim++) { - m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim); - m_oclbase->ocl_executeKernel(3, work_items, work_group_size); - } - - return OCL_SUCCESS; -} - -int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) { - - cl_mem mem_src = (cl_mem)src; - - if (ndim == 1) - return OCL_SUCCESS; - - size_t work_items[3]; - work_items[0] = N[0]; - work_items[1] = N[1]; - work_items[2] = 1; - - size_t work_group_size[3]; - work_group_size[0] = N[0]; - work_group_size[1] = N[1]; - work_group_size[2] = 1; - - size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2]; - - m_oclbase->ocl_createKernel("transpose"); - m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); - m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src); - m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]); - m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]); - m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL); - m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size); - - return OCL_SUCCESS; } /* diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h index 31816f9..dfa7772 100644 --- a/src/OpenCL/OpenCLFFT.h +++ b/src/OpenCL/OpenCLFFT.h @@ -20,12 +20,19 @@ #include "../Algorithms/FFT.h" #include "OpenCLBase.h" +#include "clFFT.h" + class OpenCLFFT : public DKSFFT { private: OpenCLBase *m_oclbase; + clfftSetupData fftSetup; + clfftPlanHandle planHandleZ2Z; + clfftPlanHandle planHandleD2Z; + clfftPlanHandle planHandleZ2D; + /* Info: call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform, @@ -42,15 +49,31 @@ private: */ int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N); + /** Get clfftStatus and print the corresponding error message. + * clfftStatus is returned from all clFFT library functions, print error displays the + * corresponding error message. If "other" is printed then error code corresponds to + * OpenCL error code and not specifically to clFFT library, then OpenCL error codes should + * be checked to determine the reason for the error. + */ + void printError(clfftStatus err); + public: /* constructor - currently does nothing*/ OpenCLFFT(OpenCLBase *base) { m_oclbase = base; + + /* Set up fft */ + cl_int err; + err = clfftInitSetupData(&fftSetup); + err = clfftSetup(&fftSetup); + + if (err != CL_SUCCESS) + DEBUG_MSG("Error seting up clFFT"); } /* destructor - currently does nothing*/ - ~OpenCLFFT() { } + ~OpenCLFFT() { destroyFFT(); } /* Info: execute forward fft function with data set on device @@ -77,35 +100,23 @@ public: Info: set FFT size Return: success or error code */ - int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; } + int setupFFT(int ndim, int N[3]); - int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + int setupFFTRC(int ndim, int N[3], double scale = 1.0); - int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + int setupFFTCR(int ndim, int N[3], double scale = 1.0); - int destroyFFT() { return DKS_SUCCESS; } + int destroyFFT(); int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], - int streamId = -1) - { - return DKS_ERROR; - } + int streamId = -1); int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], - int streamId = -1) - { - return DKS_ERROR; - } + int streamId = -1); int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_ERROR; } - int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true); - - int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true); - - int ocl_executeTranspose(void *src, int N[3], int ndim, int dim); - //void printData3DN4(cl_double2* &data, int N); }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 01f33fb..f8f7399 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) #ADD_EXECUTABLE(testFFT testFFT.cpp) #ADD_EXECUTABLE(testMIC testMIC.cpp) #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp) -#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) -#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) +ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) +ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp) #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp) #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp) @@ -23,7 +23,7 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp) #ADD_EXECUTABLE(testTranspose testTranspose.cpp) ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) -#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) +ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) #ADD_EXECUTABLE(testPush testPush.cpp) #ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) @@ -38,8 +38,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) #TARGET_LINK_LIBRARIES(testFFT dks) #TARGET_LINK_LIBRARIES(testMIC dks) #TARGET_LINK_LIBRARIES(testMICOpenCL dks) -#TARGET_LINK_LIBRARIES(testFFT3D dks) -#TARGET_LINK_LIBRARIES(testFFT3DRC dks) +TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) +TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) #TARGET_LINK_LIBRARIES(testFFT3DTiming dks) #TARGET_LINK_LIBRARIES(testStockhamFFT dks) @@ -53,8 +53,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) #TARGET_LINK_LIBRARIES(testGather dks) #TARGET_LINK_LIBRARIES(testGatherAsync dks) #TARGET_LINK_LIBRARIES(testTranspose dks) -TARGET_LINK_LIBRARIES(testCollimatorPhysics dks) -#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks) +TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) +TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testPush dks) #TARGET_LINK_LIBRARIES(testFFTSolverMIC dks) #TARGET_LINK_LIBRARIES(testIntegration dks) diff --git a/test/testFFT3D.cpp b/test/testFFT3D.cpp index ff14242..fc034ae 100644 --- a/test/testFFT3D.cpp +++ b/test/testFFT3D.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "Utility/TimeStamp.h" #include "DKSBase.h" @@ -18,22 +19,30 @@ int main(int argc, char *argv[]) { int N = 16; char *api_name = new char[10]; char *device_name = new char[10]; - if (argc == 2) { - N = atoi(argv[1]); - strcpy(api_name, "Cuda"); - strcpy(device_name, "-gpu"); - } else if (argc == 3) { - N = atoi(argv[1]); - strcpy(api_name, argv[2]); - strcpy(device_name, "-gpu"); - } else if (argc == 4) { - N = atoi(argv[1]); - strcpy(api_name, argv[2]); - strcpy(device_name, argv[3]); - } else { - N = 16; - strcpy(api_name, "OpenCL"); - strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-N")) + N = atoi(argv[i+1]); } cout << "Use api: " << api_name << ", " << device_name << endl; @@ -74,9 +83,16 @@ int main(int argc, char *argv[]) { /* write data to device */ ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + if (N < 5) + printData3DN4(cdata, N, 3); + /* execute fft */ base.callFFT(mem_ptr, 3, dimsize); + if (N < 5) { + base.readData< complex > (mem_ptr, cfft, N*N*N); + printData3DN4(cfft, N, 3); + } /* execute ifft */ base.callIFFT(mem_ptr, 3, dimsize); @@ -86,7 +102,9 @@ int main(int argc, char *argv[]) { /* read data from device */ base.readData< complex >(mem_ptr, cifft, N*N*N); - + if (N < 5) + printData3DN4(cifft, N, 3); + /* free device memory */ base.freeMemory< complex >(mem_ptr, N*N*N); @@ -130,7 +148,7 @@ void printData3DN4(complex* &data, int N, int dim) { if (a < 10e-5 && a > -10e-5) a = 0; - cout << d << "; " << a << "\t"; + cout << "(" << d << "," << a << ") "; } } cout << endl; @@ -157,3 +175,5 @@ void compareData(complex* &data1, complex* &data2, int N, int di cout << "Size " << N << " CC <--> CC diff: " << sum << endl; } + + diff --git a/test/testFFT3DRC.cpp b/test/testFFT3DRC.cpp index b0a0625..c5a2c04 100644 --- a/test/testFFT3DRC.cpp +++ b/test/testFFT3DRC.cpp @@ -9,20 +9,29 @@ using namespace std; void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); void initData(double *data, int dimsize[3]); -bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop); +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, + char *api_name, char *device_name); void printHelp(); +void printData3DN4(complex* &data, int N, int dim); +void printData3DN4(double* &data, int N, int dim); + + int main(int argc, char *argv[]) { int N1 = 8; int N2 = 8; int N3 = 8; int dim = 3; - int loop = 10; + int loop = 0; + char *api_name = new char[10]; + char *device_name = new char[10]; - if ( readParams(argc, argv, N1, N2, N3, loop) ) + if ( readParams(argc, argv, N1, N2, N3, loop, api_name, device_name) ) return 0; + cout << "Use api: " << api_name << ", " << device_name << endl; + int dimsize[3] = {N3, N2, N1}; int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; @@ -30,32 +39,19 @@ int main(int argc, char *argv[]) { double *rdata = new double[sizereal]; double *outdata = new double[sizereal]; complex *cfft = new complex[sizecomp]; - - for (int i=0; iREAL) */ base.setupFFTCR(dim, dimsize,1./(N1*N2*N3)); -#endif - -#ifdef DKS_CUDA - DKSBase base; - base.setAPI("Cuda", 4); - base.setDevice("-gpu", 4); - base.initDevice(); - base.setupFFT(dim, dimsize); -#endif // allocate memory on device int ierr; @@ -68,6 +64,7 @@ int main(int argc, char *argv[]) { base.writeData(real_ptr, rdata, sizereal); base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); + base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize); base.readData(real_res_ptr, outdata, sizereal); //timer for total loop time, FFT and IFFT calls @@ -92,9 +89,7 @@ int main(int argc, char *argv[]) { gettimeofday(&timeIFFTEnd[i], NULL); //normalize -#ifdef DKS_CUDA base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize); -#endif // read IFFT data from device base.readData(real_res_ptr, outdata, sizereal); @@ -173,7 +168,9 @@ void printHelp() { std::cout << std::endl; } -bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) { +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, + char *api_name, char *device_name) +{ for (int i = 1; i < argc; i++) { @@ -193,7 +190,68 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) { printHelp(); return true; } + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } } return false; } + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N/2 + 1; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << "(" << d << "," << a << ") "; + } + } + cout << endl; + } + cout << endl; + +} + +void printData3DN4(double* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k]; + + if (d < 10e-5 && d > -10e-5) + d = 0; + + cout << d << " "; + } + } + cout << endl; + } + cout << endl; + +}