OpenCL FFT using clfft and tests
This commit is contained in:
@ -167,6 +167,40 @@ public:
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Zero CUDA memory.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
|
||||
cudaError cerror;
|
||||
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error zeroing cuda memory!\n");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Zero CUDA memory.
|
||||
* Set all the elements of the array on the device to zero.
|
||||
*/
|
||||
template<typename T>
|
||||
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
|
||||
int dkserror = DKS_SUCCESS;
|
||||
cudaError cerror;
|
||||
if (streamId < cuda_numberOfStreams()) {
|
||||
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
|
||||
cuda_getStream(streamId));
|
||||
|
||||
if (cerror != cudaSuccess)
|
||||
dkserror = DKS_ERROR;
|
||||
} else
|
||||
dkserror = DKS_ERROR;
|
||||
|
||||
return dkserror;
|
||||
}
|
||||
|
||||
/**
|
||||
* Info: write data to memory
|
||||
* Return: success or error code
|
||||
|
@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
|
||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
|
||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||
|
||||
|
||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -273,7 +272,6 @@ __global__ void mirroredRhoField(double *rho2_m,
|
||||
id7 = rk * NI * NJ + rj * NI + i;
|
||||
id8 = rk * NI * NJ + rj * NI + ri;
|
||||
|
||||
|
||||
double data = rho2_m[id1];
|
||||
if (i != 0) rho2_m[id2] = data;
|
||||
|
||||
@ -389,8 +387,10 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
||||
|
||||
int thread = 128;
|
||||
int block = (I * J * K / thread) + 1;
|
||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
||||
|
||||
if (streamId == -1) {
|
||||
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
|
||||
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
||||
2*(I - 1), 2*(J - 1), I, J, K);
|
||||
return DKS_SUCCESS;
|
||||
@ -399,6 +399,7 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
||||
|
||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
|
||||
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
||||
2*(I - 1), 2*(J - 1), I, J, K);
|
||||
return DKS_SUCCESS;
|
||||
|
@ -114,6 +114,7 @@ DKSBase::DKSBase() {
|
||||
oclfft = new OpenCLFFT(oclbase);
|
||||
oclchi = new OpenCLChiSquare(oclbase);
|
||||
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||
oclgreens = new OpenCLGreensFunction(oclbase);
|
||||
#endif
|
||||
|
||||
#ifdef DKS_MIC
|
||||
@ -149,6 +150,7 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
|
||||
oclfft = new OpenCLFFT(oclbase);
|
||||
oclchi = new OpenCLChiSquare(oclbase);
|
||||
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||
oclgreens = new OpenCLGreensFunction(oclbase);
|
||||
#endif
|
||||
|
||||
#ifdef DKS_MIC
|
||||
@ -187,6 +189,7 @@ DKSBase::~DKSBase() {
|
||||
delete oclchi;
|
||||
delete oclcol;
|
||||
delete oclbase;
|
||||
delete oclgreens;
|
||||
#endif
|
||||
|
||||
|
||||
@ -613,6 +616,9 @@ int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||
hz_m0, hz_m1, hz_m2, streamId) );
|
||||
} else if (apiOpenCL()) {
|
||||
return OPENCL_SAFECALL(oclgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||
hz_m0, hz_m1, hz_m2) );
|
||||
} else if (apiOpenMP()) {
|
||||
//BENI:
|
||||
return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
||||
@ -627,6 +633,8 @@ int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
||||
else if (apiOpenCL())
|
||||
return OPENCL_SAFECALL(oclgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||
|
||||
@ -638,6 +646,8 @@ int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||
else if (apiOpenCL())
|
||||
return OPENCL_SAFECALL(oclgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
|
||||
|
||||
@ -649,6 +659,8 @@ int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size,
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
||||
else if (apiOpenCL())
|
||||
return OPENCL_SAFECALL(oclgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "OpenCL/OpenCLFFT.h"
|
||||
#include "OpenCL/OpenCLChiSquare.h"
|
||||
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
||||
#include "OpenCL/OpenCLGreensFunction.h"
|
||||
#endif
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
@ -76,6 +77,7 @@ private:
|
||||
OpenCLFFT *oclfft;
|
||||
OpenCLChiSquare *oclchi;
|
||||
OpenCLCollimatorPhysics *oclcol;
|
||||
OpenCLGreensFunction *oclgreens;
|
||||
#endif
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
|
@ -4,6 +4,7 @@ SET (_SRCS
|
||||
OpenCLChiSquare.cpp
|
||||
OpenCLCollimatorPhysics.cpp
|
||||
OpenCLChiSquareRuntime.cpp
|
||||
OpenCLGreensFunction.cpp
|
||||
)
|
||||
|
||||
SET (_HDRS
|
||||
@ -12,6 +13,7 @@ SET (_HDRS
|
||||
OpenCLChiSquare.h
|
||||
OpenCLCollimatorPhysics.h
|
||||
OpenCLChiSquareRuntime.h
|
||||
OpenCLGreensFunction.h
|
||||
)
|
||||
|
||||
#INCLUDE_DIRECTORIES (
|
||||
@ -25,6 +27,7 @@ SET (_KERNELS
|
||||
OpenCLKernels/OpenCLTranspose.cl
|
||||
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
||||
OpenCLKernels/OpenCLChiSquareRuntime.cl
|
||||
OpenCLKernels/OpenCLGreensFunction.cl
|
||||
)
|
||||
|
||||
ADD_SOURCES (${_SRCS})
|
||||
|
@ -428,7 +428,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
||||
int ierr;
|
||||
|
||||
//create program from kernel
|
||||
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
|
||||
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
|
||||
NULL, &ierr);
|
||||
if (ierr != CL_SUCCESS) {
|
||||
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
||||
return DKS_ERROR;
|
||||
@ -438,7 +439,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
||||
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
||||
|
||||
/*
|
||||
check if compileng kernel source succeded, if failed return error code
|
||||
check if compiling kernel source succeeded, if failed return error code
|
||||
if in debug mode get compilation info and print program build log which
|
||||
will give indication what made the compilation fail
|
||||
*/
|
||||
@ -447,7 +448,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
||||
|
||||
//get build status
|
||||
cl_build_status status;
|
||||
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
|
||||
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
|
||||
sizeof(cl_build_status), &status, NULL);
|
||||
|
||||
//get log size
|
||||
size_t log_size;
|
||||
|
@ -30,24 +30,11 @@
|
||||
#include <CL/cl_ext.h>
|
||||
#endif
|
||||
|
||||
|
||||
#include "clRNG/clRNG.h"
|
||||
#include "clRNG/mrg31k3p.h"
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/* struct for random number state */
|
||||
typedef struct {
|
||||
|
||||
double s10;
|
||||
double s11;
|
||||
double s12;
|
||||
double s20;
|
||||
double s21;
|
||||
double s22;
|
||||
double z;
|
||||
bool gen;
|
||||
|
||||
} RNDState;
|
||||
|
||||
class OpenCLBase {
|
||||
|
||||
private:
|
||||
@ -195,7 +182,7 @@ public:
|
||||
Return: return pointer to memory
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||
|
||||
|
||||
/*
|
||||
Name: allocateMemory
|
||||
Info: allocate memory on device
|
||||
@ -203,6 +190,20 @@ public:
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||
|
||||
/** Zero OpenCL memory buffer
|
||||
* Set all the elemetns in the device array to zero
|
||||
*/
|
||||
template <typename T>
|
||||
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
|
||||
|
||||
cl_int ierr;
|
||||
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
|
||||
sizeof(T)*size, 0, nullptr, nullptr);
|
||||
if (ierr != CL_SUCCESS)
|
||||
return DKS_ERROR;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Name: writeData
|
||||
Info: write data to device memory (needs ptr to mem object)
|
||||
|
@ -117,15 +117,13 @@ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool for
|
||||
*/
|
||||
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||
|
||||
std::cout << "execute RCFFT" << std::endl;
|
||||
|
||||
int dkserr = DKS_SUCCESS;
|
||||
cl_int ierr;
|
||||
cl_mem real_in = (cl_mem)real_ptr;
|
||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
||||
|
||||
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
||||
0, NULL, NULL, &real_in, &comp_out, NULL);
|
||||
0, NULL, NULL, &real_in, &comp_out, NULL);
|
||||
|
||||
if (ierr != OCL_SUCCESS) {
|
||||
dkserr = DKS_ERROR;
|
||||
@ -144,8 +142,6 @@ int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3],
|
||||
*/
|
||||
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||
|
||||
std::cout << "execute CRFFT" << std::endl;
|
||||
|
||||
int dkserr = DKS_SUCCESS;
|
||||
cl_int ierr;
|
||||
cl_mem real_in = (cl_mem)real_ptr;
|
||||
@ -214,7 +210,13 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
||||
|
||||
cl_int err;
|
||||
|
||||
clfftDim dim = CLFFT_3D;
|
||||
clfftDim dim;
|
||||
if (ndim == 1)
|
||||
dim = CLFFT_1D;
|
||||
else if (ndim == 2)
|
||||
dim = CLFFT_2D;
|
||||
else
|
||||
dim = CLFFT_3D;
|
||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||
|
||||
/* Create 3D fft plan*/
|
||||
@ -244,9 +246,20 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
||||
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||
cl_int err;
|
||||
|
||||
clfftDim dim = CLFFT_3D;
|
||||
clfftDim dim;
|
||||
if (ndim == 1)
|
||||
dim = CLFFT_1D;
|
||||
else if (ndim == 2)
|
||||
dim = CLFFT_2D;
|
||||
else
|
||||
dim = CLFFT_3D;
|
||||
|
||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||
|
||||
size_t half = (size_t)N[0] / 2 + 1;
|
||||
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
||||
size_t clOutStride[3] = {1, half, half * N[1]};
|
||||
|
||||
/* Create 3D fft plan*/
|
||||
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
||||
|
||||
@ -254,6 +267,8 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
||||
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
||||
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
||||
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
|
||||
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
|
||||
|
||||
/* Bake the plan */
|
||||
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||
@ -269,9 +284,20 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
||||
cl_int err;
|
||||
|
||||
clfftDim dim = CLFFT_3D;
|
||||
clfftDim dim;
|
||||
if (ndim == 1)
|
||||
dim = CLFFT_1D;
|
||||
else if (ndim == 2)
|
||||
dim = CLFFT_2D;
|
||||
else
|
||||
dim = CLFFT_3D;
|
||||
|
||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||
|
||||
size_t half = (size_t)N[0] / 2 + 1;
|
||||
size_t clInStride[3] = {1, half, half * N[1]};
|
||||
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
||||
|
||||
/* Create 3D fft plan*/
|
||||
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
||||
|
||||
@ -279,6 +305,8 @@ int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
||||
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
||||
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
||||
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
||||
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
|
||||
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
|
||||
|
||||
/* Bake the plan */
|
||||
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "OpenCLGreensFunction.h"
|
||||
#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl"
|
||||
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
|
||||
|
||||
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
|
||||
m_base = base;
|
||||
@ -29,6 +29,8 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
|
||||
double hr_m0, double hr_m1, double hr_m2,
|
||||
int streamId)
|
||||
{
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
//compile opencl program from source
|
||||
buildProgram();
|
||||
|
||||
@ -42,26 +44,28 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
|
||||
work_items = (work_items / work_size + 1) * work_size;
|
||||
|
||||
//create kernel
|
||||
ierr = m_oclbase->ocl_createKernel("kernelTmpgreen");
|
||||
ierr = m_base->ocl_createKernel("kernelTmpgreen");
|
||||
|
||||
//set kernel parameters
|
||||
m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
||||
m_base->setKernelArg(1, sizeof(double), &hr_m0);
|
||||
m_base->setKernelArg(2, sizeof(double), &hr_m1);
|
||||
m_base->setKernelArg(3, sizeof(double), &hr_m2);
|
||||
m_base->setKernelArg(4, sizeof(int), &I);
|
||||
m_base->setKernelArg(5, sizeof(int), &J);
|
||||
m_base->setKernelArg(6, sizeof(int), &K);
|
||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
||||
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
|
||||
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
|
||||
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
|
||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
||||
|
||||
//execute kernel
|
||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
||||
int streamId)
|
||||
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
|
||||
int K, int streamId)
|
||||
{
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
//compile opencl program from source
|
||||
buildProgram();
|
||||
|
||||
@ -70,8 +74,6 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
||||
int NI = 2*(I - 1);
|
||||
int NJ = 2*(J - 1);
|
||||
int NK = 2*(K - 1);
|
||||
|
||||
|
||||
//set the work item size
|
||||
size_t work_size = 128;
|
||||
@ -80,20 +82,22 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
||||
work_items = (work_items / work_size + 1) * work_size;
|
||||
|
||||
//create kernel
|
||||
ierr = m_oclbase->ocl_createKernel("kernelIntegration");
|
||||
ierr = m_base->ocl_createKernel("kernelIntegration");
|
||||
|
||||
//set kernel parameters
|
||||
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||
m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
||||
m_base->setKernelArg(2, sizeof(int), &I);
|
||||
m_base->setKernelArg(3, sizeof(int), &J);
|
||||
m_base->setKernelArg(4, sizeof(int), &K);
|
||||
m_base->setKernelArg(5, sizeof(int), &NI);
|
||||
m_base->setKernelArg(6, sizeof(int), &NJ);
|
||||
m_base->setKernelArg(7, sizeof(int), &NK);
|
||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
||||
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
|
||||
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
|
||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
||||
|
||||
//execute kernel
|
||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
double zero = 0.0;
|
||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
||||
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
|
||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
return ierr;
|
||||
|
||||
@ -102,6 +106,8 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
||||
|
||||
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
|
||||
{
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
//compile opencl program from source
|
||||
buildProgram();
|
||||
|
||||
@ -114,6 +120,8 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
||||
int J2 = 2*J;
|
||||
int K2 = 2*K;
|
||||
|
||||
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
|
||||
|
||||
//set the work item size
|
||||
size_t work_size = 128;
|
||||
size_t work_items = NI * NJ * NK;
|
||||
@ -121,19 +129,20 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
||||
work_items = (work_items / work_size + 1) * work_size;
|
||||
|
||||
//create kernel
|
||||
ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField");
|
||||
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
|
||||
|
||||
//set kernel parameters
|
||||
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||
m_base->setKernelArg(1, sizeof(int), &I2);
|
||||
m_base->setKernelArg(2, sizeof(int), &J2);
|
||||
m_base->setKernelArg(3, sizeof(int), &K2);
|
||||
m_base->setKernelArg(4, sizeof(int), &NI);
|
||||
m_base->setKernelArg(5, sizeof(int), &NJ);
|
||||
m_base->setKernelArg(6, sizeof(int), &NK);
|
||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
|
||||
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
|
||||
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
|
||||
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
|
||||
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
|
||||
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
|
||||
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
|
||||
|
||||
//execute kernel
|
||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
return ierr;
|
||||
}
|
||||
@ -141,4 +150,32 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
||||
|
||||
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
|
||||
{
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
//compile opencl program from source
|
||||
buildProgram();
|
||||
|
||||
//cast the input data ptr to cl_mem
|
||||
cl_mem mem_ptr1 = (cl_mem) ptr1;
|
||||
cl_mem mem_ptr2 = (cl_mem) ptr2;
|
||||
|
||||
//set the work item size
|
||||
size_t work_size = 128;
|
||||
size_t work_items = size;
|
||||
if (work_items % work_size > 0)
|
||||
work_items = (work_items / work_size + 1) * work_size;
|
||||
|
||||
//create kernel
|
||||
ierr = m_base->ocl_createKernel("multiplyComplexFields");
|
||||
|
||||
//set kernel parameters
|
||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
|
||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
|
||||
m_base->ocl_setKernelArg(2, sizeof(int), &size);
|
||||
|
||||
//execute kernel
|
||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
return ierr;
|
||||
|
||||
}
|
||||
|
@ -60,4 +60,4 @@ public:
|
||||
};
|
||||
|
||||
|
||||
#endif H_OPENCL_GREENSFUNCTION
|
||||
#endif
|
||||
|
@ -81,28 +81,29 @@ __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgre
|
||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
|
||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||
|
||||
|
||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** mirror rho-field */
|
||||
__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
|
||||
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
|
||||
rho2_m[0] = rho2_m[NI*NJ];
|
||||
}
|
||||
|
||||
__kernel void mirroredRhoField(__global double *rho2_m,
|
||||
int NI, int NJ, int NK,
|
||||
int NI_tmp, int NJ_tmp, int NK_tmp) {
|
||||
__kernel void kernelMirroredRhoField(__global double *rho2_m,
|
||||
int NI, int NJ, int NK,
|
||||
int NI_tmp, int NJ_tmp, int NK_tmp,
|
||||
int size)
|
||||
{
|
||||
|
||||
int tid = get_local_id(0);
|
||||
int id = get_global_id(0);
|
||||
|
||||
if (id == 0)
|
||||
rho2_m[0] = rho2_m[NI * NJ];
|
||||
rho2_m[0] = rho2_m[NI * NJ];
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
@ -127,27 +128,29 @@ __kernel void mirroredRhoField(__global double *rho2_m,
|
||||
id7 = rk * NI * NJ + rj * NI + i;
|
||||
id8 = rk * NI * NJ + rj * NI + ri;
|
||||
|
||||
double data = 0.0;
|
||||
if (id1 < size)
|
||||
data = rho2_m[id1];
|
||||
|
||||
double data = rho2_m[id1];
|
||||
if (i != 0) rho2_m[id2] = data;
|
||||
if (i != 0 && id2 < size) rho2_m[id2] = data;
|
||||
|
||||
if (j != 0) rho2_m[id3] = data;
|
||||
if (j != 0 && id3 < size) rho2_m[id3] = data;
|
||||
|
||||
if (i != 0 && j != 0) rho2_m[id4] = data;
|
||||
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
|
||||
|
||||
if (k != 0) rho2_m[id5] = data;
|
||||
if (k != 0 && id5 < size) rho2_m[id5] = data;
|
||||
|
||||
if (k != 0 && i != 0) rho2_m[id6] = data;
|
||||
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
|
||||
|
||||
if (k!= 0 && j != 0) rho2_m[id7] = data;
|
||||
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
|
||||
|
||||
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
|
||||
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** multiply complex fields */
|
||||
double2 CompelxMul(double2 a, double2 b) {
|
||||
double2 ComplexMul(double2 a, double2 b) {
|
||||
double2 c;
|
||||
c.x = a.x * b.x - a.y * b.y;
|
||||
c.y = a.x * b.y + a.y * b.x;
|
||||
@ -155,12 +158,13 @@ double2 CompelxMul(double2 a, double2 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
__kernel void multiplyComplexFields_2(__global double2 *ptr1, __global double2 *ptr2,
|
||||
int size)
|
||||
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
|
||||
int size)
|
||||
{
|
||||
|
||||
int idx = get_global_id(0);
|
||||
|
||||
if (idx < size)
|
||||
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user