OpenCL FFT using clfft and tests

This commit is contained in:
Uldis Locans
2016-11-21 16:14:11 +01:00
parent 4432d32480
commit b3a12e02a8
14 changed files with 482 additions and 390 deletions

View File

@ -167,6 +167,40 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
cudaError cerror;
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error zeroing cuda memory!\n");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
int dkserror = DKS_SUCCESS;
cudaError cerror;
if (streamId < cuda_numberOfStreams()) {
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
cuda_getStream(streamId));
if (cerror != cudaSuccess)
dkserror = DKS_ERROR;
} else
dkserror = DKS_ERROR;
return dkserror;
}
/** /**
* Info: write data to memory * Info: write data to memory
* Retrun: success or error code * Retrun: success or error code

View File

@ -194,7 +194,6 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
rho2_m[i + j*ni + k*ni*nj] = tmp_rho; rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
} }
} }
@ -273,7 +272,6 @@ __global__ void mirroredRhoField(double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i; id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri; id8 = rk * NI * NJ + rj * NI + ri;
double data = rho2_m[id1]; double data = rho2_m[id1];
if (i != 0) rho2_m[id2] = data; if (i != 0) rho2_m[id2] = data;
@ -389,8 +387,10 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
int thread = 128; int thread = 128;
int block = (I * J * K / thread) + 1; int block = (I * J * K / thread) + 1;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
if (streamId == -1) { if (streamId == -1) {
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
@ -399,6 +399,7 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;

View File

@ -114,6 +114,7 @@ DKSBase::DKSBase() {
oclfft = new OpenCLFFT(oclbase); oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase); oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase); oclcol = new OpenCLCollimatorPhysics(oclbase);
oclgreens = new OpenCLGreensFunction(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
@ -149,6 +150,7 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
oclfft = new OpenCLFFT(oclbase); oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase); oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase); oclcol = new OpenCLCollimatorPhysics(oclbase);
oclgreens = new OpenCLGreensFunction(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
@ -187,6 +189,7 @@ DKSBase::~DKSBase() {
delete oclchi; delete oclchi;
delete oclcol; delete oclcol;
delete oclbase; delete oclbase;
delete oclgreens;
#endif #endif
@ -613,6 +616,9 @@ int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int
if (apiCuda()) { if (apiCuda()) {
return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId) ); hz_m0, hz_m1, hz_m2, streamId) );
} else if (apiOpenCL()) {
return OPENCL_SAFECALL(oclgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2) );
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
//BENI: //BENI:
return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2)); return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
@ -627,6 +633,8 @@ int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId)); return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
else if (apiOpenMP()) else if (apiOpenMP())
return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K)); return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
@ -638,6 +646,8 @@ int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId)); return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenMP()) else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K)); return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
@ -649,6 +659,8 @@ int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size,
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId)); return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
else if (apiOpenMP()) else if (apiOpenMP())
return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size)); return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));

View File

@ -32,6 +32,7 @@
#include "OpenCL/OpenCLFFT.h" #include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLChiSquare.h" #include "OpenCL/OpenCLChiSquare.h"
#include "OpenCL/OpenCLCollimatorPhysics.h" #include "OpenCL/OpenCLCollimatorPhysics.h"
#include "OpenCL/OpenCLGreensFunction.h"
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA
@ -76,6 +77,7 @@ private:
OpenCLFFT *oclfft; OpenCLFFT *oclfft;
OpenCLChiSquare *oclchi; OpenCLChiSquare *oclchi;
OpenCLCollimatorPhysics *oclcol; OpenCLCollimatorPhysics *oclcol;
OpenCLGreensFunction *oclgreens;
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA

View File

@ -4,6 +4,7 @@ SET (_SRCS
OpenCLChiSquare.cpp OpenCLChiSquare.cpp
OpenCLCollimatorPhysics.cpp OpenCLCollimatorPhysics.cpp
OpenCLChiSquareRuntime.cpp OpenCLChiSquareRuntime.cpp
OpenCLGreensFunction.cpp
) )
SET (_HDRS SET (_HDRS
@ -12,6 +13,7 @@ SET (_HDRS
OpenCLChiSquare.h OpenCLChiSquare.h
OpenCLCollimatorPhysics.h OpenCLCollimatorPhysics.h
OpenCLChiSquareRuntime.h OpenCLChiSquareRuntime.h
OpenCLGreensFunction.h
) )
#INCLUDE_DIRECTORIES ( #INCLUDE_DIRECTORIES (
@ -25,6 +27,7 @@ SET (_KERNELS
OpenCLKernels/OpenCLTranspose.cl OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLChiSquareRuntime.cl OpenCLKernels/OpenCLChiSquareRuntime.cl
OpenCLKernels/OpenCLGreensFunction.cl
) )
ADD_SOURCES (${_SRCS}) ADD_SOURCES (${_SRCS})

View File

@ -428,7 +428,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
int ierr; int ierr;
//create program from kernel //create program from kernel
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr); m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
NULL, &ierr);
if (ierr != CL_SUCCESS) { if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr); DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
return DKS_ERROR; return DKS_ERROR;
@ -438,7 +439,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL); ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
/* /*
check if compileng kernel source succeded, if failed return error code check if compiling kernel source succeded, if failed return error code
if in debug mode get compilation info and print program build log witch if in debug mode get compilation info and print program build log witch
will give indication what made the compilation fail will give indication what made the compilation fail
*/ */
@ -447,7 +448,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
//get build status //get build status
cl_build_status status; cl_build_status status;
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &status, NULL);
//get log size //get log size
size_t log_size; size_t log_size;

View File

@ -30,24 +30,11 @@
#include <CL/cl_ext.h> #include <CL/cl_ext.h>
#endif #endif
#include "clRNG/clRNG.h"
#include "clRNG/mrg31k3p.h"
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/* struct for random number state */
typedef struct {
double s10;
double s11;
double s12;
double s20;
double s21;
double s22;
double z;
bool gen;
} RNDState;
class OpenCLBase { class OpenCLBase {
private: private:
@ -203,6 +190,20 @@ public:
*/ */
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer
* Set all the elemetns in the device array to zero
*/
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
cl_int ierr;
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
sizeof(T)*size, 0, nullptr, nullptr);
if (ierr != CL_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
}
/* /*
Name: writeData Name: writeData
Info: write data to device memory (needs ptr to mem object) Info: write data to device memory (needs ptr to mem object)

View File

@ -117,15 +117,13 @@ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool for
*/ */
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) { int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
std::cout << "execute RCFFT" << std::endl;
int dkserr = DKS_SUCCESS; int dkserr = DKS_SUCCESS;
cl_int ierr; cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr; cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr; cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &real_in, &comp_out, NULL); 0, NULL, NULL, &real_in, &comp_out, NULL);
if (ierr != OCL_SUCCESS) { if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR; dkserr = DKS_ERROR;
@ -144,8 +142,6 @@ int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3],
*/ */
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) { int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
std::cout << "execute CRFFT" << std::endl;
int dkserr = DKS_SUCCESS; int dkserr = DKS_SUCCESS;
cl_int ierr; cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr; cl_mem real_in = (cl_mem)real_ptr;
@ -214,7 +210,13 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
cl_int err; cl_int err;
clfftDim dim = CLFFT_3D; clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
/* Create 3D fft plan*/ /* Create 3D fft plan*/
@ -244,9 +246,20 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) { int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
cl_int err; cl_int err;
clfftDim dim = CLFFT_3D; clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
size_t clOutStride[3] = {1, half, half * N[1]};
/* Create 3D fft plan*/ /* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength); err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
@ -254,6 +267,8 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE); err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED); err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE); err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
/* Bake the plan */ /* Bake the plan */
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL); err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
@ -269,9 +284,20 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) { int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
cl_int err; cl_int err;
clfftDim dim = CLFFT_3D; clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, half, half * N[1]};
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
/* Create 3D fft plan*/ /* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength); err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
@ -279,6 +305,8 @@ int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE); err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL); err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE); err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
/* Bake the plan */ /* Bake the plan */
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL); err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);

View File

@ -1,5 +1,5 @@
#include "OpenCLGreensFunction.h" #include "OpenCLGreensFunction.h"
#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl" #define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) { OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
m_base = base; m_base = base;
@ -29,6 +29,8 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
double hr_m0, double hr_m1, double hr_m2, double hr_m0, double hr_m1, double hr_m2,
int streamId) int streamId)
{ {
int ierr = DKS_SUCCESS;
//compile opencl program from source //compile opencl program from source
buildProgram(); buildProgram();
@ -42,26 +44,28 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
work_items = (work_items / work_size + 1) * work_size; work_items = (work_items / work_size + 1) * work_size;
//create kernel //create kernel
ierr = m_oclbase->ocl_createKernel("kernelTmpgreen"); ierr = m_base->ocl_createKernel("kernelTmpgreen");
//set kernel parameters //set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr); m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->setKernelArg(1, sizeof(double), &hr_m0); m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
m_base->setKernelArg(2, sizeof(double), &hr_m1); m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
m_base->setKernelArg(3, sizeof(double), &hr_m2); m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
m_base->setKernelArg(4, sizeof(int), &I); m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->setKernelArg(5, sizeof(int), &J); m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->setKernelArg(6, sizeof(int), &K); m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel //execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr; return ierr;
} }
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
int streamId) int K, int streamId)
{ {
int ierr = DKS_SUCCESS;
//compile opencl program from source //compile opencl program from source
buildProgram(); buildProgram();
@ -70,8 +74,6 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen; cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
int NI = 2*(I - 1); int NI = 2*(I - 1);
int NJ = 2*(J - 1); int NJ = 2*(J - 1);
int NK = 2*(K - 1);
//set the work item size //set the work item size
size_t work_size = 128; size_t work_size = 128;
@ -80,20 +82,22 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
work_items = (work_items / work_size + 1) * work_size; work_items = (work_items / work_size + 1) * work_size;
//create kernel //create kernel
ierr = m_oclbase->ocl_createKernel("kernelIntegration"); ierr = m_base->ocl_createKernel("kernelIntegration");
//set kernel parameters //set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr); m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr); m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->setKernelArg(2, sizeof(int), &I); m_base->ocl_setKernelArg(2, sizeof(int), &NI);
m_base->setKernelArg(3, sizeof(int), &J); m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
m_base->setKernelArg(4, sizeof(int), &K); m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->setKernelArg(5, sizeof(int), &NI); m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->setKernelArg(6, sizeof(int), &NJ); m_base->ocl_setKernelArg(6, sizeof(int), &K);
m_base->setKernelArg(7, sizeof(int), &NK);
//execute kernel //execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); double zero = 0.0;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr; return ierr;
@ -102,6 +106,8 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
{ {
int ierr = DKS_SUCCESS;
//compile opencl program from source //compile opencl program from source
buildProgram(); buildProgram();
@ -114,6 +120,8 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
int J2 = 2*J; int J2 = 2*J;
int K2 = 2*K; int K2 = 2*K;
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
//set the work item size //set the work item size
size_t work_size = 128; size_t work_size = 128;
size_t work_items = NI * NJ * NK; size_t work_items = NI * NJ * NK;
@ -121,19 +129,20 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
work_items = (work_items / work_size + 1) * work_size; work_items = (work_items / work_size + 1) * work_size;
//create kernel //create kernel
ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField"); ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
//set kernel parameters //set kernel parameters
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr); m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->setKernelArg(1, sizeof(int), &I2); m_base->ocl_setKernelArg(1, sizeof(int), &I2);
m_base->setKernelArg(2, sizeof(int), &J2); m_base->ocl_setKernelArg(2, sizeof(int), &J2);
m_base->setKernelArg(3, sizeof(int), &K2); m_base->ocl_setKernelArg(3, sizeof(int), &K2);
m_base->setKernelArg(4, sizeof(int), &NI); m_base->ocl_setKernelArg(4, sizeof(int), &NI);
m_base->setKernelArg(5, sizeof(int), &NJ); m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
m_base->setKernelArg(6, sizeof(int), &NK); m_base->ocl_setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
//execute kernel //execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr; return ierr;
} }
@ -141,4 +150,32 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId) int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
{ {
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem mem_ptr1 = (cl_mem) ptr1;
cl_mem mem_ptr2 = (cl_mem) ptr2;
//set the work item size
size_t work_size = 128;
size_t work_items = size;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("multiplyComplexFields");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
m_base->ocl_setKernelArg(2, sizeof(int), &size);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
} }

View File

@ -60,4 +60,4 @@ public:
}; };
#endif H_OPENCL_GREENSFUNCTION #endif

View File

@ -86,23 +86,24 @@ __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgre
rho2_m[i + j*ni + k*ni*nj] = tmp_rho; rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
} }
} }
/** miror rho-field */ /** miror rho-field */
__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) { __kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
rho2_m[0] = rho2_m[NI*NJ]; rho2_m[0] = rho2_m[NI*NJ];
} }
__kernel void mirroredRhoField(__global double *rho2_m, __kernel void kernelMirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK, int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp) { int NI_tmp, int NJ_tmp, int NK_tmp,
int size)
{
int tid = get_local_id(0); int tid = get_local_id(0);
int id = get_global_id(0); int id = get_global_id(0);
if (id == 0) if (id == 0)
rho2_m[0] = rho2_m[NI * NJ]; rho2_m[0] = rho2_m[NI * NJ];
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
@ -127,27 +128,29 @@ __kernel void mirroredRhoField(__global double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i; id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri; id8 = rk * NI * NJ + rj * NI + ri;
double data = 0.0;
if (id1 < size)
data = rho2_m[id1];
double data = rho2_m[id1]; if (i != 0 && id2 < size) rho2_m[id2] = data;
if (i != 0) rho2_m[id2] = data;
if (j != 0) rho2_m[id3] = data; if (j != 0 && id3 < size) rho2_m[id3] = data;
if (i != 0 && j != 0) rho2_m[id4] = data; if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
if (k != 0) rho2_m[id5] = data; if (k != 0 && id5 < size) rho2_m[id5] = data;
if (k != 0 && i != 0) rho2_m[id6] = data; if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
if (k!= 0 && j != 0) rho2_m[id7] = data; if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data; if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
} }
} }
/** multiply complex fields */ /** multiply complex fields */
double2 CompelxMul(double2 a, double2 b) { double2 ComplexMul(double2 a, double2 b) {
double2 c; double2 c;
c.x = a.x * b.x - a.y * b.y; c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x; c.y = a.x * b.y + a.y * b.x;
@ -155,12 +158,13 @@ double2 CompelxMul(double2 a, double2 b) {
return c; return c;
} }
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
__kernel void multiplyComplexFields_2(__global double2 *ptr1, __global double2 *ptr2, int size)
int size)
{ {
int idx = get_global_id(0); int idx = get_global_id(0);
if (idx < size) if (idx < size)
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]); ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
} }

View File

@ -25,7 +25,7 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp) #ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp) #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -56,7 +56,7 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testPush dks) #TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks) TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testIntegration dks) #TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks) #TARGET_LINK_LIBRARIES(testImageReconstruction dks)

View File

@ -1,6 +1,8 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include <complex> #include <complex>
#include <fstream>
#include <iomanip>
#include "Utility/TimeStamp.h" #include "Utility/TimeStamp.h"
#include "DKSBase.h" #include "DKSBase.h"
@ -8,14 +10,20 @@
using namespace std; using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3]); void initData(double *data, int dimsize[3], int dim);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name); char *api_name, char *device_name, char *file_name);
void printHelp(); void printHelp();
void printData3DN4(complex<double>* &data, int N, int dim); void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* &data, int N, int dim); void printData3DN4(double* &data, int N, int dim);
double precision(double a) {
//if (a < 1e-10)
// return 0.0;
//else
return a;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
@ -26,32 +34,29 @@ int main(int argc, char *argv[]) {
int loop = 0; int loop = 0;
char *api_name = new char[10]; char *api_name = new char[10];
char *device_name = new char[10]; char *device_name = new char[10];
char *file_name = new char[50];
if ( readParams(argc, argv, N1, N2, N3, loop, api_name, device_name) ) if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
return 0; return 0;
cout << "Use api: " << api_name << ", " << device_name << endl; cout << "Use api: " << api_name << ", " << device_name << endl;
int dimsize[3] = {N3, N2, N1}; int dimsize[3] = {N1, N2, N3};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal]; double *rdata = new double[sizereal];
double *outdata = new double[sizereal]; double *outdata = new double[sizereal];
complex<double> *cfft = new complex<double>[sizecomp]; complex<double> *cfft = new complex<double>[sizecomp];
initData(rdata, dimsize); initData(rdata, dimsize, dim);
/* init DKSBase */ /* init DKSBase */
cout << "Init device and set function" << endl; cout << "Init device and set function" << endl;
DKSBase base; DKSBase base;
base.setAPI(api_name, strlen(api_name)); base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(device_name)); base.setDevice(device_name, strlen(device_name));
base.initDevice(); base.initDevice();
base.setupFFT(3, dimsize); base.setupFFT(dim, dimsize);
base.setupFFTRC(dim, dimsize);
/* setup backward fft (COMPLEX->REAL) */
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
// allocate memory on device // allocate memory on device
int ierr; int ierr;
@ -63,68 +68,59 @@ DKSBase base;
// execute one run before starting the timers // execute one run before starting the timers
base.writeData<double>(real_ptr, rdata, sizereal); base.writeData<double>(real_ptr, rdata, sizereal);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize); base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
base.readData<double>(real_res_ptr, outdata, sizereal); base.readData<double>(real_res_ptr, outdata, sizereal);
//timer for total loop time, FFT and IFFT calls
struct timeval timeStart, timeEnd;
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
gettimeofday(&timeStart, NULL);
for (int i=0; i<loop; ++i){
// write data to device
base.writeData<double>(real_ptr, rdata, sizereal);
// execute rcfft
gettimeofday(&timeFFTStart[i], NULL);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeFFTEnd[i], NULL);
// execute crfft
gettimeofday(&timeIFFTStart[i], NULL);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeIFFTEnd[i], NULL);
//normalize
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
// read IFFT data from device
base.readData<double>(real_res_ptr, outdata, sizereal);
ofstream myfile;
myfile.open(file_name);
myfile<< "in\tout\treal\timag\n";
for (int i = 0; i < sizereal; i++) {
//myfile << precision(rdata[i]) << "\t";
//myfile << precision(outdata[i]) << "\t";
if (i < sizecomp) {
myfile << precision(cfft[i].real()) << "\t";
myfile << precision(cfft[i].imag());
}
myfile << "\n";
} }
gettimeofday(&timeEnd, NULL); myfile.close();
/*
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1; j++) {
cout << rdata[i*N1 + j] << " ";
}
cout << endl;
}
cout << endl;
}
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1 / 2 + 1; j++) {
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
}
cout << endl;
}
cout << endl;
}
*/
// free device memory // free device memory
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp); base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
base.freeMemory<double>(real_ptr, sizereal); base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(real_res_ptr, sizereal); base.freeMemory<double>(real_res_ptr, sizereal);
// compare in and out data to see if we get back the same results // compare in and out data to see if we get back the same results
cout << "comp" << endl;
compareData(rdata, outdata, N1, N2, N3, dim); compareData(rdata, outdata, N1, N2, N3, dim);
cout << "done" << endl;
//calculate seconds for total time and fft times
double tfft = 0;
double tifft = 0;
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
for (int i = 0; i < loop; i++) {
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
}
//print timing results
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
<< "\n\n";
return 0; return 0;
} }
@ -132,10 +128,10 @@ DKSBase base;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) { void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
int id; int id;
double sum = 0; double sum = 0;
for (int i = 0; i < NI; i++) { for (int i = 0; i < NK; i++) {
for (int j = 0; j < NJ; j++) { for (int j = 0; j < NJ; j++) {
for (int k = 0; k < NK; k++) { for (int k = 0; k < NI; k++) {
id = k*NI*NJ + j*NI + i; id = i*NI*NJ + j*NI + k;
sum += fabs(data1[id] - data2[id]); sum += fabs(data1[id] - data2[id]);
} }
} }
@ -143,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
std::cout << "RC <--> CR diff: " << sum << std::endl; std::cout << "RC <--> CR diff: " << sum << std::endl;
} }
void initData(double *data, int dimsize[3]) { void initData(double *data, int dimsize[3], int dim) {
for (int i = 0; i < dimsize[2]; i++) { if (dim == 3) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) { for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) { for (int k = 0; k < dimsize[0]; k++) {
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; data[j*dimsize[0] + k] = sin(k);
} }
} }
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = sin(k);
} }
} }
@ -168,12 +172,17 @@ void printHelp() {
std::cout << std::endl; std::cout << std::endl;
} }
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name) char *api_name, char *device_name, char *file_name)
{ {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
if ( argv[i] == std::string("-dim")) {
dim = atoi(argv[i + 1]);
i++;
}
if ( argv[i] == std::string("-grid") ) { if ( argv[i] == std::string("-grid") ) {
N1 = atoi(argv[i + 1]); N1 = atoi(argv[i + 1]);
N2 = atoi(argv[i + 2]); N2 = atoi(argv[i + 2]);
@ -194,21 +203,25 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
if (argv[i] == string("-cuda")) { if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda"); strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu"); strcpy(device_name, "-gpu");
strcpy(file_name, "cuda_fft.dat");
} }
if (argv[i] == string("-opencl")) { if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL"); strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu"); strcpy(device_name, "-gpu");
strcpy(file_name, "opencl_fft.dat");
} }
if (argv[i] == string("-mic")) { if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP"); strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic"); strcpy(device_name, "-mic");
strcpy(file_name, "openmp_fft.dat");
} }
if (argv[i] == string("-cpu")) { if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL"); strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu"); strcpy(device_name, "-cpu");
strcpy(file_name, "opencl_cpu_fft.dat");
} }
} }

View File

@ -1,5 +1,4 @@
#include <iostream> #include <iostream>
//#include <mpi.h>
#include <string.h> #include <string.h>
#include "DKSBase.h" #include "DKSBase.h"
@ -11,309 +10,265 @@ using namespace std;
void printData3D(double* data, int N, int NI, const char *message = "") { void printData3D(double* data, int N, int NI, const char *message = "") {
if (strcmp(message, "") != 0) if (strcmp(message, "") != 0)
cout << message; cout << message;
for (int i = 0; i < NI; i++) { for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) { for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t"; cout << data[i*N*N + j*N + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
} }
void initData(double *data, int N) { void initData(double *data, int N) {
for (int i = 0; i < N/4 + 1; i++) { for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) { for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) { for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1; data[i*N*N + j*N + k] = k+1;
} }
} }
} }
} }
void initData2(double *data, int N) { void initData2(double *data, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
data[i] = i; data[i] = i;
} }
void initComplex( complex<double> *d, int N) { void initComplex( complex<double> *d, int N) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0); d[i] = complex<double>(2, 0);
} }
} }
void printComplex(complex<double> *d, int N) { void printComplex(complex<double> *d, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
cout << d[i] << "\t"; cout << d[i] << "\t";
cout << endl; cout << endl;
}
void printDouble(double *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << ", ";
cout << endl;
} }
void initMirror(double *data, int n1, int n2, int n3) { void initMirror(double *data, int n1, int n2, int n3) {
int d = 1; int d = 1;
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++; data[i * n2 * n1 + j * n1 + k] = d++;
else else
data[i * n2 * n1 + j * n1 + k] = 0; data[i * n2 * n1 + j * n1 + k] = 0;
} }
} }
} }
} }
void printDiv(int c) { void printDiv(int c) {
for (int i = 0; i < c; i++) for (int i = 0; i < c; i++)
cout << "-"; cout << "-";
cout << endl; cout << endl;
} }
void printMirror(double *data, int n1, int n2, int n3) { void printMirror(double *data, int n1, int n2, int n3) {
printDiv(75); printDiv(75);
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t"; cout << data[i * n2 * n1 + j * n1 + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
double sumData(double *data, int datasize) { double sumData(double *data, int datasize) {
double sum = 0; double sum = 0;
for (int i = 0; i < datasize; i++) for (int i = 0; i < datasize; i++)
sum += data[i]; sum += data[i];
return sum; return sum;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
/* mpi init */ char *api_name = new char[10];
//int rank, nprocs; char *device_name = new char[10];
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* for (int i = 1; i < argc; i++) {
if (nprocs != 8) { if (argv[i] == string("-cuda")) {
cout << "example was set to run with 8 processes" << endl; strcpy(api_name, "Cuda");
cout << "exit..." << endl; strcpy(device_name, "-gpu");
return 0; }
}
*/
/* set domain size */ if (argv[i] == string("-opencl")) {
int NG[3] = {64, 64, 32}; strcpy(api_name, "OpenCL");
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; strcpy(device_name, "-gpu");
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; }
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
//id[0] = 0; if (argv[i] == string("-mic")) {
//id[1] = NL[1] * (rank % 4); strcpy(api_name, "OpenMP");
//id[2] = NL[2] * (rank / 4); strcpy(device_name, "-mic");
}
/* print some messages bout the example in the begginig */ if (argv[i] == string("-cpu")) {
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; strcpy(api_name, "OpenCL");
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; strcpy(device_name, "-cpu");
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; }
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; }
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
/* dks init and create 2 streams */ cout << "Use api: " << api_name << ", " << device_name << endl;
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
#ifdef DKS_CUDA /* set domain size */
DKSBase base; int NG[3] = {64, 64, 32};
base.setAPI("Cuda", 4); int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
base.setDevice("-gpu", 4); int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
base.initDevice(); int sizerho = NG[0] * NG[1] * NG[2];
#endif int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
//base.createStream(streamFFT); /* print some messages bout the example in the begginig */
//if (rank == 0) { cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
// base.createStream(streamGreens); cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
base.setupFFT(3, NG);
//}
/* allocate memory and init rho field */ /* dks init and create 2 streams */
double *rho = new double[sizerho]; int dkserr;
double *rho_out = new double[sizerho]; DKSBase base;
//double *green_out = new double[sizegreen]; base.setAPI(api_name, strlen(api_name));
initMirror(rho, NL[0], NL[1], NL[2]); base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFT(3, NG);
/* /* allocate memory and init rho field */
allocate memory on device for double *rho = new double[sizerho];
- rho field double *rho_out = new double[sizerho];
- rho FFT //double *green_out = new double[sizegreen];
- tmpgreen double *mirror_out = new double[sizerho];
- greens integral //initMirror(rho, NL[0], NL[1], NL[2]);
- greens integral FFT initMirror(rho, NG[0], NG[1], NG[2]);
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* send and receive pointer to allocated memory on device */ /* =================================================*/
/* /* =================================================*/
if (rank == 0) { /* =====loop trough fftpoison solver iterations=====*/
for (int p = 1; p < nprocs; p++) /* =================================================*/
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); /* =================================================*/
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
double old_sum = 0;
/* =================================================*/ int hr_m[3] = {1, 1, 1};
/* =================================================*/ base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
double old_sum = 0; /* calculate greens integral on gpu */
double tmp_sum = 0; base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* calculate greens integral on gpu */ /* mirror the field */
//if (rank == 0) base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); /*
base.readData<double>(grn_ptr, mirror_out, sizerho);
for (int i = 0; i < sizerho; i++)
cout << mirror_out[i] << " ";
cout << endl << endl;
/* mirror the field */ for (int i = 0; i < sizerho; i++)
//if (rank == 0) cout << rho[i] << " ";
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); cout << endl << endl;
*/
/* transfer rho field to device */
base.writeData<double>(rho2_ptr, rho, sizerho);
/* get FFT of rho field */
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
/* get FFT of mirrored greens integral */ /* get FFT of mirrored greens integral */
//if (rank == 0) base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* transfer rho field to device */ /* multiply both FFTs */
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT); base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* get FFT of rho field */ /*
//if (rank == 0) { complex<double> *crho = new complex<double>[sizecomp];
//base.syncDevice(); complex<double> *cgre = new complex<double>[sizecomp];
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
//} base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
/* multiply both FFTs */ for (int i = 0; i < sizecomp; i++)
//if (rank == 0) cout << cgre[i].real() << " ";
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); cout << endl << endl;
//MPI_Barrier(MPI_COMM_WORLD);
/* inverse fft and transfer data back */ for (int i = 0; i < sizecomp; i++)
/* cout << crho[i].real() << " ";
multiple device syncs and mpi barriers are used to make sure data cout << endl << endl;
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
//if (rank == 0) {
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
delete[] crho;
delete[] cgre;
*/
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
base.readData<double> (rho2_ptr, rho_out, sizerho);
for (int i = 0; i < 10; i++)
cout << rho_out[i] << " ";
cout << endl;
old_sum = sumData(rho_out, sizerho);
}
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
/* ==========end fftpoison solver test run==========*/ /* ==========end fftpoison solver test run==========*/
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
base.freeMemory<double>(rho2_ptr, sizerho);
delete[] rho_out;
/* free memory on device */ delete[] rho;
//if (rank == 0) { delete[] mirror_out;
base.freeMemory<double>(tmpgreen_ptr, sizegreen); cout << "Final sum: " << old_sum << endl;
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
} }