FFT for OpenCL using clFFT library
This commit is contained in:
@ -38,10 +38,17 @@ IF (Boost_FOUND)
|
|||||||
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
||||||
ENDIF (Boost_FOUND)
|
ENDIF (Boost_FOUND)
|
||||||
|
|
||||||
|
#find clFFT
|
||||||
|
SET (clFFT_USE_STATIC_LIBS OFF)
|
||||||
|
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
|
||||||
|
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
|
||||||
|
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
|
||||||
|
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
|
||||||
|
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
|
||||||
|
|
||||||
#enable UQTK
|
#enable UQTK
|
||||||
OPTION (USE_UQTK "Use UQTK" OFF)
|
OPTION (USE_UQTK "Use UQTK" OFF)
|
||||||
|
|
||||||
|
|
||||||
#intel icpc compiler specific flags
|
#intel icpc compiler specific flags
|
||||||
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
||||||
|
|
||||||
|
@ -3,17 +3,17 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
|
|
||||||
#chi square kernel tests
|
#chi square kernel tests
|
||||||
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
|
|
||||||
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
|
|
||||||
IF (USE_UQTK)
|
IF (USE_UQTK)
|
||||||
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
ENDIF (USE_UQTK)
|
ENDIF (USE_UQTK)
|
||||||
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
|
||||||
|
|
||||||
#test to verify search functions
|
#test to verify search functions
|
||||||
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
|
@ -189,6 +189,7 @@ DKSBase::~DKSBase() {
|
|||||||
delete oclbase;
|
delete oclbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
delete micfft;
|
delete micfft;
|
||||||
delete miccol;
|
delete miccol;
|
||||||
@ -461,6 +462,14 @@ int DKSBase::setupFFT(int ndim, int N[3]) {
|
|||||||
|
|
||||||
if (apiCuda()) {
|
if (apiCuda()) {
|
||||||
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
|
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
int ierr1 = OPENCL_SAFECALL( oclfft->setupFFT(ndim, N) );
|
||||||
|
int ierr2 = OPENCL_SAFECALL( oclfft->setupFFTRC(ndim, N) );
|
||||||
|
int ierr3 = OPENCL_SAFECALL( oclfft->setupFFTCR(ndim, N) );
|
||||||
|
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
return DKS_SUCCESS;
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
//micbase.mic_setupFFT(ndim, N);
|
//micbase.mic_setupFFT(ndim, N);
|
||||||
//BENI: setting up RC and CR transformations on MIC
|
//BENI: setting up RC and CR transformations on MIC
|
||||||
@ -481,6 +490,8 @@ int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL(oclfft->setupFFTRC(ndim, N));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
|
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
|
||||||
|
|
||||||
@ -493,6 +504,8 @@ int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL(oclfft->setupFFTCR(ndim, N));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
|
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
|
||||||
|
|
||||||
@ -559,6 +572,8 @@ int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize) );
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
|
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
@ -570,6 +585,8 @@ int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[
|
|||||||
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize) );
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
|
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
@ -581,25 +598,15 @@ int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[
|
|||||||
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
|
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* normalize complex to real iFFT */
|
|
||||||
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
|
|
||||||
if (apiOpenCL()) {
|
|
||||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
|
|
||||||
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
|
|
||||||
else
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||||
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
||||||
|
|
||||||
|
@ -405,7 +405,7 @@ public:
|
|||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
void * mem_ptr = NULL;
|
void * mem_ptr = NULL;
|
||||||
mem_ptr = micbase.mic_allocateMemory<T>(elements);
|
mem_ptr = micbase->mic_allocateMemory<T>(elements);
|
||||||
return mem_ptr;
|
return mem_ptr;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -498,7 +498,7 @@ public:
|
|||||||
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
||||||
|
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
|
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -532,7 +532,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -832,7 +832,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
|
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -860,7 +860,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
||||||
streamId, offset));
|
streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -880,7 +880,7 @@ public:
|
|||||||
else if (apiCuda())
|
else if (apiCuda())
|
||||||
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
|
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
@ -955,12 +955,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
/**
|
|
||||||
* Transpose 2D and 3D arrays, OpenCL implementation
|
|
||||||
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
|
|
||||||
*/
|
|
||||||
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
* For specifics check OPAL docs.
|
* For specifics check OPAL docs.
|
||||||
|
@ -52,9 +52,6 @@ class OpenCLBase {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
static cl_context m_context;
|
|
||||||
static cl_command_queue m_command_queue;
|
|
||||||
|
|
||||||
static cl_platform_id m_platform_id;
|
static cl_platform_id m_platform_id;
|
||||||
static cl_device_id m_device_id;
|
static cl_device_id m_device_id;
|
||||||
|
|
||||||
@ -119,6 +116,9 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
static cl_context m_context;
|
||||||
|
static cl_command_queue m_command_queue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
constructor
|
constructor
|
||||||
*/
|
*/
|
||||||
|
@ -89,26 +89,82 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
|
|||||||
call fft execution on device for every dimension
|
call fft execution on device for every dimension
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
||||||
int ierr;
|
|
||||||
|
|
||||||
|
int dkserr = DKS_SUCCESS;
|
||||||
|
cl_int ierr;
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
int n = N[0];
|
|
||||||
|
|
||||||
for (int dim = 0; dim < ndim; dim++) {
|
if (forward)
|
||||||
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
|
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
||||||
|
0, NULL, NULL, &inout, NULL, NULL);
|
||||||
|
else
|
||||||
|
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
||||||
|
0, NULL, NULL, &inout, NULL, NULL);
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Error executing bit reverse");
|
dkserr = DKS_ERROR;
|
||||||
return OCL_ERROR;
|
DEBUG_MSG("Error executing cfFFT\n");
|
||||||
|
if (ierr == CLFFT_INVALID_PLAN)
|
||||||
|
std::cout << "Invlalid plan" << std::endl;
|
||||||
|
else
|
||||||
|
std::cout << "CLFFT error" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
|
return dkserr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
call rcfft execution on device for every dimension
|
||||||
|
*/
|
||||||
|
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
|
std::cout << "execute RCFFT" << std::endl;
|
||||||
|
|
||||||
|
int dkserr = DKS_SUCCESS;
|
||||||
|
cl_int ierr;
|
||||||
|
cl_mem real_in = (cl_mem)real_ptr;
|
||||||
|
cl_mem comp_out = (cl_mem)comp_ptr;
|
||||||
|
|
||||||
|
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
||||||
|
0, NULL, NULL, &real_in, &comp_out, NULL);
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Error executing fft reverse");
|
dkserr = DKS_ERROR;
|
||||||
return OCL_ERROR;
|
DEBUG_MSG("Error executing cfFFT\n");
|
||||||
}
|
if (ierr == CLFFT_INVALID_PLAN)
|
||||||
|
std::cout << "Invlalid plan" << std::endl;
|
||||||
|
else
|
||||||
|
std::cout << "CLFFT error" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
return OCL_SUCCESS;
|
return dkserr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
call crfft execution on device for every dimension
|
||||||
|
*/
|
||||||
|
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
|
std::cout << "execute CRFFT" << std::endl;
|
||||||
|
|
||||||
|
int dkserr = DKS_SUCCESS;
|
||||||
|
cl_int ierr;
|
||||||
|
cl_mem real_in = (cl_mem)real_ptr;
|
||||||
|
cl_mem comp_out = (cl_mem)comp_ptr;
|
||||||
|
|
||||||
|
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
||||||
|
0, NULL, NULL, &comp_out, &real_in, NULL);
|
||||||
|
|
||||||
|
if (ierr != OCL_SUCCESS) {
|
||||||
|
dkserr = DKS_ERROR;
|
||||||
|
DEBUG_MSG("Error executing cfFFT\n");
|
||||||
|
if (ierr == CLFFT_INVALID_PLAN)
|
||||||
|
std::cout << "Invlalid plan" << std::endl;
|
||||||
|
else
|
||||||
|
std::cout << "CLFFT error" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return dkserr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -120,10 +176,11 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
call kernel to normalize fft
|
call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
|
/*
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
|
|
||||||
int n = N[0];
|
int n = N[0];
|
||||||
@ -150,132 +207,143 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
DEBUG_MSG("Error executing kernel");
|
DEBUG_MSG("Error executing kernel");
|
||||||
return OCL_ERROR;
|
return OCL_ERROR;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
return OCL_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
|
int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
||||||
|
|
||||||
int ierr;
|
cl_int err;
|
||||||
int size = sizeof(cl_double2)*pow(N,ndim);
|
|
||||||
|
|
||||||
cl_mem mem_tmp;
|
clfftDim dim = CLFFT_3D;
|
||||||
cl_mem mem_src = (cl_mem)src;
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
|
|
||||||
|
|
||||||
//set the number of work items in each dimension
|
/* Create 3D fft plan*/
|
||||||
size_t work_items[3];
|
err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
|
||||||
int p = 1;
|
|
||||||
int threads = N / 2;
|
|
||||||
int f = (forward) ? -1 : 1;
|
|
||||||
|
|
||||||
//execute kernel
|
/* Set plan parameters */
|
||||||
int n = (int)log2(N);
|
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
|
||||||
for (int i = 0; i < ndim; i++) {
|
if (err != CL_SUCCESS)
|
||||||
|
std::cout << "Error setting precision" << std::endl;
|
||||||
|
err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
|
||||||
|
if (err != CL_SUCCESS)
|
||||||
|
std::cout << "Error setting layout" << std::endl;
|
||||||
|
err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
|
||||||
|
if (err != CL_SUCCESS)
|
||||||
|
std::cout << "Error setting result location" << std::endl;
|
||||||
|
/* Bake the plan */
|
||||||
|
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||||
|
|
||||||
int dim = i+1;
|
if (err != CL_SUCCESS) {
|
||||||
p = 1;
|
DEBUG_MSG("Error creating Complex-to-complex plan");
|
||||||
work_items[0] = (dim == 1) ? N/2 : N;
|
return DKS_ERROR;
|
||||||
work_items[1] = (dim == 2) ? N/2 : N;
|
|
||||||
work_items[2] = (dim == 3) ? N/2 : N;
|
|
||||||
|
|
||||||
//transpose array if calculating dimension larger than 1
|
|
||||||
//if (dim > 1)
|
|
||||||
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
|
||||||
|
|
||||||
//create kernel and set kernel arguments
|
|
||||||
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
|
|
||||||
return OCL_ERROR;
|
|
||||||
|
|
||||||
for (int t = 1; t <= log2(N); t++) {
|
|
||||||
|
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
|
|
||||||
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
|
|
||||||
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
|
|
||||||
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
|
|
||||||
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
|
|
||||||
|
|
||||||
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
|
||||||
return OCL_ERROR;
|
|
||||||
|
|
||||||
mem_tmp = mem_src;
|
|
||||||
mem_src = mem_dst;
|
|
||||||
mem_dst = mem_tmp;
|
|
||||||
|
|
||||||
p = 2*p;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//transpose array back if calculating dimension larger than 1
|
return DKS_SUCCESS;
|
||||||
//if (dim > 1)
|
|
||||||
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ndim*n % 2 == 1) {
|
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||||
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
|
cl_int err;
|
||||||
mem_tmp = mem_src;
|
|
||||||
mem_src = mem_dst;
|
clfftDim dim = CLFFT_3D;
|
||||||
mem_dst = mem_tmp;
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
|
|
||||||
|
/* Create 3D fft plan*/
|
||||||
|
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
||||||
|
|
||||||
|
/* Set plan parameters */
|
||||||
|
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
||||||
|
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
||||||
|
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
||||||
|
|
||||||
|
/* Bake the plan */
|
||||||
|
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||||
|
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
DEBUG_MSG("Error creating Real-to-complex plan");
|
||||||
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_oclbase->ocl_freeMemory(mem_dst);
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
return OCL_SUCCESS;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
|
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
||||||
|
cl_int err;
|
||||||
|
|
||||||
cl_mem mem_src = (cl_mem)src;
|
clfftDim dim = CLFFT_3D;
|
||||||
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
|
|
||||||
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
|
/* Create 3D fft plan*/
|
||||||
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
|
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
||||||
|
|
||||||
m_oclbase->ocl_createKernel("fft_batch3D");
|
/* Set plan parameters */
|
||||||
|
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
||||||
|
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
||||||
|
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
||||||
|
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
/* Bake the plan */
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
|
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
|
|
||||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
|
|
||||||
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
|
|
||||||
|
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
for (int dim = 1; dim < ndim+1; dim++) {
|
DEBUG_MSG("Error creating Complex-to-real plan");
|
||||||
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
|
return DKS_ERROR;
|
||||||
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return OCL_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
|
int OpenCLFFT::destroyFFT() {
|
||||||
|
clfftDestroyPlan(&planHandleZ2Z);
|
||||||
|
clfftDestroyPlan(&planHandleD2Z);
|
||||||
|
clfftDestroyPlan(&planHandleZ2D);
|
||||||
|
|
||||||
cl_mem mem_src = (cl_mem)src;
|
clfftTeardown();
|
||||||
|
|
||||||
if (ndim == 1)
|
return DKS_SUCCESS;
|
||||||
return OCL_SUCCESS;
|
}
|
||||||
|
|
||||||
size_t work_items[3];
|
|
||||||
work_items[0] = N[0];
|
|
||||||
work_items[1] = N[1];
|
|
||||||
work_items[2] = 1;
|
|
||||||
|
|
||||||
size_t work_group_size[3];
|
void OpenCLFFT::printError(clfftStatus err) {
|
||||||
work_group_size[0] = N[0];
|
|
||||||
work_group_size[1] = N[1];
|
|
||||||
work_group_size[2] = 1;
|
|
||||||
|
|
||||||
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
|
if (err != CL_SUCCESS) {
|
||||||
|
std::cout << "Error creating default plan " << err << std::endl;
|
||||||
|
switch(err) {
|
||||||
|
case CLFFT_BUGCHECK:
|
||||||
|
std::cout << "bugcheck" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_NOTIMPLEMENTED:
|
||||||
|
std::cout << "not implemented" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
|
||||||
|
std::cout << "transposed not implemented" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_FILE_NOT_FOUND:
|
||||||
|
std::cout << "file not found" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_FILE_CREATE_FAILURE:
|
||||||
|
std::cout << "file create failure" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_VERSION_MISMATCH:
|
||||||
|
std::cout << "version missmatch" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_INVALID_PLAN:
|
||||||
|
std::cout << "invalid plan" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_DEVICE_NO_DOUBLE:
|
||||||
|
std::cout << "no double" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_DEVICE_MISMATCH:
|
||||||
|
std::cout << "device missmatch" << std::endl;
|
||||||
|
break;
|
||||||
|
case CLFFT_ENDSTATUS:
|
||||||
|
std::cout << "end status" << std::endl;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
std::cout << "other: " << err << std::endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
m_oclbase->ocl_createKernel("transpose");
|
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
|
|
||||||
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
|
|
||||||
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
|
|
||||||
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
|
|
||||||
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
|
|
||||||
|
|
||||||
return OCL_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -20,12 +20,19 @@
|
|||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithms/FFT.h"
|
||||||
#include "OpenCLBase.h"
|
#include "OpenCLBase.h"
|
||||||
|
|
||||||
|
#include "clFFT.h"
|
||||||
|
|
||||||
class OpenCLFFT : public DKSFFT {
|
class OpenCLFFT : public DKSFFT {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
OpenCLBase *m_oclbase;
|
OpenCLBase *m_oclbase;
|
||||||
|
|
||||||
|
clfftSetupData fftSetup;
|
||||||
|
clfftPlanHandle planHandleZ2Z;
|
||||||
|
clfftPlanHandle planHandleD2Z;
|
||||||
|
clfftPlanHandle planHandleZ2D;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: call fft kernels to execute FFT of the given domain,
|
Info: call fft kernels to execute FFT of the given domain,
|
||||||
data - device memory ptr, cdim - current dim to transform,
|
data - device memory ptr, cdim - current dim to transform,
|
||||||
@ -42,15 +49,31 @@ private:
|
|||||||
*/
|
*/
|
||||||
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
||||||
|
|
||||||
|
/** Get clfftStatus and print the corresponding error message.
|
||||||
|
* clfftStatus is returned from all clFFT library functions; printError displays the
|
||||||
|
* corresponding error message. If "other" is printed, the error code is a generic
|
||||||
|
* OpenCL error code rather than a clFFT-specific one, and the OpenCL error codes should
|
||||||
|
* be checked to determine the reason for the error.
|
||||||
|
*/
|
||||||
|
void printError(clfftStatus err);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/* constructor - currently does nothing*/
|
/* constructor - currently does nothing*/
|
||||||
OpenCLFFT(OpenCLBase *base) {
|
OpenCLFFT(OpenCLBase *base) {
|
||||||
m_oclbase = base;
|
m_oclbase = base;
|
||||||
|
|
||||||
|
/* Set up fft */
|
||||||
|
cl_int err;
|
||||||
|
err = clfftInitSetupData(&fftSetup);
|
||||||
|
err = clfftSetup(&fftSetup);
|
||||||
|
|
||||||
|
if (err != CL_SUCCESS)
|
||||||
|
DEBUG_MSG("Error seting up clFFT");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* destructor - currently does nothing*/
|
/* destructor - currently does nothing*/
|
||||||
~OpenCLFFT() { }
|
~OpenCLFFT() { destroyFFT(); }
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: execute forward fft function with data set on device
|
Info: execute forward fft function with data set on device
|
||||||
@ -77,35 +100,23 @@ public:
|
|||||||
Info: set FFT size
|
Info: set FFT size
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
|
int setupFFT(int ndim, int N[3]);
|
||||||
|
|
||||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
||||||
|
|
||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
||||||
|
|
||||||
int destroyFFT() { return DKS_SUCCESS; }
|
int destroyFFT();
|
||||||
|
|
||||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1)
|
int streamId = -1);
|
||||||
{
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1)
|
int streamId = -1);
|
||||||
{
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
|
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
|
||||||
{
|
{
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
|
|
||||||
|
|
||||||
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
|
|
||||||
|
|
||||||
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
|
|
||||||
|
|
||||||
//void printData3DN4(cl_double2* &data, int N);
|
//void printData3DN4(cl_double2* &data, int N);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
||||||
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
||||||
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
||||||
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
||||||
@ -23,7 +23,7 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
||||||
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
||||||
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
||||||
#ADD_EXECUTABLE(testPush testPush.cpp)
|
#ADD_EXECUTABLE(testPush testPush.cpp)
|
||||||
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
||||||
@ -38,8 +38,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
|||||||
#TARGET_LINK_LIBRARIES(testFFT dks)
|
#TARGET_LINK_LIBRARIES(testFFT dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMIC dks)
|
#TARGET_LINK_LIBRARIES(testMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3D dks)
|
TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
|
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
||||||
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
||||||
@ -53,8 +53,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
|||||||
#TARGET_LINK_LIBRARIES(testGather dks)
|
#TARGET_LINK_LIBRARIES(testGather dks)
|
||||||
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
||||||
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks)
|
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
|
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testPush dks)
|
#TARGET_LINK_LIBRARIES(testPush dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
|
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -18,24 +19,32 @@ int main(int argc, char *argv[]) {
|
|||||||
int N = 16;
|
int N = 16;
|
||||||
char *api_name = new char[10];
|
char *api_name = new char[10];
|
||||||
char *device_name = new char[10];
|
char *device_name = new char[10];
|
||||||
if (argc == 2) {
|
|
||||||
N = atoi(argv[1]);
|
for (int i = 1; i < argc; i++) {
|
||||||
|
if (argv[i] == string("-cuda")) {
|
||||||
strcpy(api_name, "Cuda");
|
strcpy(api_name, "Cuda");
|
||||||
strcpy(device_name, "-gpu");
|
strcpy(device_name, "-gpu");
|
||||||
} else if (argc == 3) {
|
}
|
||||||
N = atoi(argv[1]);
|
|
||||||
strcpy(api_name, argv[2]);
|
if (argv[i] == string("-opencl")) {
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
} else if (argc == 4) {
|
|
||||||
N = atoi(argv[1]);
|
|
||||||
strcpy(api_name, argv[2]);
|
|
||||||
strcpy(device_name, argv[3]);
|
|
||||||
} else {
|
|
||||||
N = 16;
|
|
||||||
strcpy(api_name, "OpenCL");
|
strcpy(api_name, "OpenCL");
|
||||||
strcpy(device_name, "-gpu");
|
strcpy(device_name, "-gpu");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-mic")) {
|
||||||
|
strcpy(api_name, "OpenMP");
|
||||||
|
strcpy(device_name, "-mic");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-cpu")) {
|
||||||
|
strcpy(api_name, "OpenCL");
|
||||||
|
strcpy(device_name, "-cpu");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-N"))
|
||||||
|
N = atoi(argv[i+1]);
|
||||||
|
}
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
|
|
||||||
int dimsize[3] = {N, N, N};
|
int dimsize[3] = {N, N, N};
|
||||||
@ -74,9 +83,16 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* write data to device */
|
/* write data to device */
|
||||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||||
|
if (N < 5)
|
||||||
|
printData3DN4(cdata, N, 3);
|
||||||
|
|
||||||
|
|
||||||
/* execute fft */
|
/* execute fft */
|
||||||
base.callFFT(mem_ptr, 3, dimsize);
|
base.callFFT(mem_ptr, 3, dimsize);
|
||||||
|
if (N < 5) {
|
||||||
|
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
|
||||||
|
printData3DN4(cfft, N, 3);
|
||||||
|
}
|
||||||
|
|
||||||
/* execute ifft */
|
/* execute ifft */
|
||||||
base.callIFFT(mem_ptr, 3, dimsize);
|
base.callIFFT(mem_ptr, 3, dimsize);
|
||||||
@ -86,6 +102,8 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* read data from device */
|
/* read data from device */
|
||||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||||
|
if (N < 5)
|
||||||
|
printData3DN4(cifft, N, 3);
|
||||||
|
|
||||||
/* free device memory */
|
/* free device memory */
|
||||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||||
@ -130,7 +148,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
|
|||||||
if (a < 10e-5 && a > -10e-5)
|
if (a < 10e-5 && a > -10e-5)
|
||||||
a = 0;
|
a = 0;
|
||||||
|
|
||||||
cout << d << "; " << a << "\t";
|
cout << "(" << d << "," << a << ") ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
@ -157,3 +175,5 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
|
|||||||
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,20 +9,29 @@ using namespace std;
|
|||||||
|
|
||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||||
void initData(double *data, int dimsize[3]);
|
void initData(double *data, int dimsize[3]);
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
|
||||||
|
char *api_name, char *device_name);
|
||||||
void printHelp();
|
void printHelp();
|
||||||
|
|
||||||
|
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||||
|
void printData3DN4(double* &data, int N, int dim);
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
int N1 = 8;
|
int N1 = 8;
|
||||||
int N2 = 8;
|
int N2 = 8;
|
||||||
int N3 = 8;
|
int N3 = 8;
|
||||||
int dim = 3;
|
int dim = 3;
|
||||||
int loop = 10;
|
int loop = 0;
|
||||||
|
char *api_name = new char[10];
|
||||||
|
char *device_name = new char[10];
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, loop) )
|
if ( readParams(argc, argv, N1, N2, N3, loop, api_name, device_name) )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
|
|
||||||
int dimsize[3] = {N3, N2, N1};
|
int dimsize[3] = {N3, N2, N1};
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||||
@ -30,32 +39,19 @@ int main(int argc, char *argv[]) {
|
|||||||
double *rdata = new double[sizereal];
|
double *rdata = new double[sizereal];
|
||||||
double *outdata = new double[sizereal];
|
double *outdata = new double[sizereal];
|
||||||
complex<double> *cfft = new complex<double>[sizecomp];
|
complex<double> *cfft = new complex<double>[sizecomp];
|
||||||
|
|
||||||
for (int i=0; i<sizecomp; ++i) {
|
|
||||||
cfft[i].real() = 7.;
|
|
||||||
cfft[i].imag() = 3.33;
|
|
||||||
}
|
|
||||||
initData(rdata, dimsize);
|
initData(rdata, dimsize);
|
||||||
|
|
||||||
/* init DKSBase */
|
/* init DKSBase */
|
||||||
cout << "Init device and set function" << endl;
|
cout << "Init device and set function" << endl;
|
||||||
#ifdef DKS_MIC
|
|
||||||
DKSBase base;
|
DKSBase base;
|
||||||
base.setAPI("OpenMP", 6);
|
base.setAPI(api_name, strlen(api_name));
|
||||||
base.setDevice("-mic", 4);
|
base.setDevice(device_name, strlen(device_name));
|
||||||
base.initDevice();
|
base.initDevice();
|
||||||
|
base.setupFFT(3, dimsize);
|
||||||
|
|
||||||
base.setupFFTRC(dim, dimsize);
|
base.setupFFTRC(dim, dimsize);
|
||||||
/* setup backward fft (COMPLEX->REAL) */
|
/* setup backward fft (COMPLEX->REAL) */
|
||||||
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
|
||||||
DKSBase base;
|
|
||||||
base.setAPI("Cuda", 4);
|
|
||||||
base.setDevice("-gpu", 4);
|
|
||||||
base.initDevice();
|
|
||||||
base.setupFFT(dim, dimsize);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// allocate memory on device
|
// allocate memory on device
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -68,6 +64,7 @@ int main(int argc, char *argv[]) {
|
|||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
|
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
//timer for total loop time, FFT and IFFT calls
|
//timer for total loop time, FFT and IFFT calls
|
||||||
@ -92,9 +89,7 @@ int main(int argc, char *argv[]) {
|
|||||||
gettimeofday(&timeIFFTEnd[i], NULL);
|
gettimeofday(&timeIFFTEnd[i], NULL);
|
||||||
|
|
||||||
//normalize
|
//normalize
|
||||||
#ifdef DKS_CUDA
|
|
||||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||||
#endif
|
|
||||||
|
|
||||||
// read IFFT data from device
|
// read IFFT data from device
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
@ -173,7 +168,9 @@ void printHelp() {
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
|
||||||
|
char *api_name, char *device_name)
|
||||||
|
{
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
|
|
||||||
@ -193,7 +190,68 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
|
|||||||
printHelp();
|
printHelp();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-cuda")) {
|
||||||
|
strcpy(api_name, "Cuda");
|
||||||
|
strcpy(device_name, "-gpu");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-opencl")) {
|
||||||
|
strcpy(api_name, "OpenCL");
|
||||||
|
strcpy(device_name, "-gpu");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-mic")) {
|
||||||
|
strcpy(api_name, "OpenMP");
|
||||||
|
strcpy(device_name, "-mic");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv[i] == string("-cpu")) {
|
||||||
|
strcpy(api_name, "OpenCL");
|
||||||
|
strcpy(device_name, "-cpu");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void printData3DN4(complex<double>* &data, int N, int dim) {
|
||||||
|
|
||||||
|
for (int j = 0; j < N; j++) {
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
for (int k = 0; k < N/2 + 1; k++) {
|
||||||
|
double d = data[i*N*N + j*N + k].real();
|
||||||
|
double a = data[i*N*N + j*N + k].imag();
|
||||||
|
|
||||||
|
if (d < 10e-5 && d > -10e-5)
|
||||||
|
d = 0;
|
||||||
|
if (a < 10e-5 && a > -10e-5)
|
||||||
|
a = 0;
|
||||||
|
|
||||||
|
cout << "(" << d << "," << a << ") ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void printData3DN4(double* &data, int N, int dim) {
|
||||||
|
|
||||||
|
for (int j = 0; j < N; j++) {
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
for (int k = 0; k < N; k++) {
|
||||||
|
double d = data[i*N*N + j*N + k];
|
||||||
|
|
||||||
|
if (d < 10e-5 && d > -10e-5)
|
||||||
|
d = 0;
|
||||||
|
|
||||||
|
cout << d << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user