OpenCL FFT using clfft and tests
This commit is contained in:
@ -167,6 +167,40 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Zero CUDA memory.
|
||||||
|
* Set all the elements of the array on the device to zero.
|
||||||
|
*/
|
||||||
|
template<typename T>
|
||||||
|
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
|
||||||
|
cudaError cerror;
|
||||||
|
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
|
||||||
|
if (cerror != cudaSuccess) {
|
||||||
|
DEBUG_MSG("Error zeroing cuda memory!\n");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Zero CUDA memory.
|
||||||
|
* Set all the elements of the array on the device to zero.
|
||||||
|
*/
|
||||||
|
template<typename T>
|
||||||
|
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
|
||||||
|
int dkserror = DKS_SUCCESS;
|
||||||
|
cudaError cerror;
|
||||||
|
if (streamId < cuda_numberOfStreams()) {
|
||||||
|
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
|
||||||
|
cuda_getStream(streamId));
|
||||||
|
|
||||||
|
if (cerror != cudaSuccess)
|
||||||
|
dkserror = DKS_ERROR;
|
||||||
|
} else
|
||||||
|
dkserror = DKS_ERROR;
|
||||||
|
|
||||||
|
return dkserror;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Info: write data to memory
|
* Info: write data to memory
|
||||||
* Retrun: success or error code
|
* Retrun: success or error code
|
||||||
|
@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
|
|||||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -273,7 +272,6 @@ __global__ void mirroredRhoField(double *rho2_m,
|
|||||||
id7 = rk * NI * NJ + rj * NI + i;
|
id7 = rk * NI * NJ + rj * NI + i;
|
||||||
id8 = rk * NI * NJ + rj * NI + ri;
|
id8 = rk * NI * NJ + rj * NI + ri;
|
||||||
|
|
||||||
|
|
||||||
double data = rho2_m[id1];
|
double data = rho2_m[id1];
|
||||||
if (i != 0) rho2_m[id2] = data;
|
if (i != 0) rho2_m[id2] = data;
|
||||||
|
|
||||||
@ -389,8 +387,10 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
int block = (I * J * K / thread) + 1;
|
int block = (I * J * K / thread) + 1;
|
||||||
|
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
|
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
|
||||||
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
@ -399,6 +399,7 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
|
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
|
||||||
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
@ -114,6 +114,7 @@ DKSBase::DKSBase() {
|
|||||||
oclfft = new OpenCLFFT(oclbase);
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
oclchi = new OpenCLChiSquare(oclbase);
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
|
oclgreens = new OpenCLGreensFunction(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
@ -149,6 +150,7 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
|
|||||||
oclfft = new OpenCLFFT(oclbase);
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
oclchi = new OpenCLChiSquare(oclbase);
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
|
oclgreens = new OpenCLGreensFunction(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
@ -187,6 +189,7 @@ DKSBase::~DKSBase() {
|
|||||||
delete oclchi;
|
delete oclchi;
|
||||||
delete oclcol;
|
delete oclcol;
|
||||||
delete oclbase;
|
delete oclbase;
|
||||||
|
delete oclgreens;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
@ -613,6 +616,9 @@ int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int
|
|||||||
if (apiCuda()) {
|
if (apiCuda()) {
|
||||||
return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||||
hz_m0, hz_m1, hz_m2, streamId) );
|
hz_m0, hz_m1, hz_m2, streamId) );
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
return OPENCL_SAFECALL(oclgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||||
|
hz_m0, hz_m1, hz_m2) );
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
//BENI:
|
//BENI:
|
||||||
return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
||||||
@ -627,6 +633,8 @@ int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL(oclgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||||
|
|
||||||
@ -638,6 +646,8 @@ int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
|
return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL(oclgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
|
return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
|
||||||
|
|
||||||
@ -649,6 +659,8 @@ int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size,
|
|||||||
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
||||||
|
else if (apiOpenCL())
|
||||||
|
return OPENCL_SAFECALL(oclgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||||
|
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
#include "OpenCL/OpenCLFFT.h"
|
#include "OpenCL/OpenCLFFT.h"
|
||||||
#include "OpenCL/OpenCLChiSquare.h"
|
#include "OpenCL/OpenCLChiSquare.h"
|
||||||
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
||||||
|
#include "OpenCL/OpenCLGreensFunction.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
@ -76,6 +77,7 @@ private:
|
|||||||
OpenCLFFT *oclfft;
|
OpenCLFFT *oclfft;
|
||||||
OpenCLChiSquare *oclchi;
|
OpenCLChiSquare *oclchi;
|
||||||
OpenCLCollimatorPhysics *oclcol;
|
OpenCLCollimatorPhysics *oclcol;
|
||||||
|
OpenCLGreensFunction *oclgreens;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
|
@ -4,6 +4,7 @@ SET (_SRCS
|
|||||||
OpenCLChiSquare.cpp
|
OpenCLChiSquare.cpp
|
||||||
OpenCLCollimatorPhysics.cpp
|
OpenCLCollimatorPhysics.cpp
|
||||||
OpenCLChiSquareRuntime.cpp
|
OpenCLChiSquareRuntime.cpp
|
||||||
|
OpenCLGreensFunction.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
SET (_HDRS
|
SET (_HDRS
|
||||||
@ -12,6 +13,7 @@ SET (_HDRS
|
|||||||
OpenCLChiSquare.h
|
OpenCLChiSquare.h
|
||||||
OpenCLCollimatorPhysics.h
|
OpenCLCollimatorPhysics.h
|
||||||
OpenCLChiSquareRuntime.h
|
OpenCLChiSquareRuntime.h
|
||||||
|
OpenCLGreensFunction.h
|
||||||
)
|
)
|
||||||
|
|
||||||
#INCLUDE_DIRECTORIES (
|
#INCLUDE_DIRECTORIES (
|
||||||
@ -25,6 +27,7 @@ SET (_KERNELS
|
|||||||
OpenCLKernels/OpenCLTranspose.cl
|
OpenCLKernels/OpenCLTranspose.cl
|
||||||
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
||||||
OpenCLKernels/OpenCLChiSquareRuntime.cl
|
OpenCLKernels/OpenCLChiSquareRuntime.cl
|
||||||
|
OpenCLKernels/OpenCLGreensFunction.cl
|
||||||
)
|
)
|
||||||
|
|
||||||
ADD_SOURCES (${_SRCS})
|
ADD_SOURCES (${_SRCS})
|
||||||
|
@ -428,7 +428,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
//create program from kernel
|
//create program from kernel
|
||||||
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
|
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
|
||||||
|
NULL, &ierr);
|
||||||
if (ierr != CL_SUCCESS) {
|
if (ierr != CL_SUCCESS) {
|
||||||
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -438,7 +439,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
check if compileng kernel source succeded, if failed return error code
|
check if compiling kernel source succeded, if failed return error code
|
||||||
if in debug mode get compilation info and print program build log witch
|
if in debug mode get compilation info and print program build log witch
|
||||||
will give indication what made the compilation fail
|
will give indication what made the compilation fail
|
||||||
*/
|
*/
|
||||||
@ -447,7 +448,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
|
|
||||||
//get build status
|
//get build status
|
||||||
cl_build_status status;
|
cl_build_status status;
|
||||||
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
|
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
|
||||||
|
sizeof(cl_build_status), &status, NULL);
|
||||||
|
|
||||||
//get log size
|
//get log size
|
||||||
size_t log_size;
|
size_t log_size;
|
||||||
|
@ -30,24 +30,11 @@
|
|||||||
#include <CL/cl_ext.h>
|
#include <CL/cl_ext.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "clRNG/clRNG.h"
|
||||||
|
#include "clRNG/mrg31k3p.h"
|
||||||
|
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
/* struct for random number state */
|
|
||||||
typedef struct {
|
|
||||||
|
|
||||||
double s10;
|
|
||||||
double s11;
|
|
||||||
double s12;
|
|
||||||
double s20;
|
|
||||||
double s21;
|
|
||||||
double s22;
|
|
||||||
double z;
|
|
||||||
bool gen;
|
|
||||||
|
|
||||||
} RNDState;
|
|
||||||
|
|
||||||
class OpenCLBase {
|
class OpenCLBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -195,7 +182,7 @@ public:
|
|||||||
Return: return pointer to memory
|
Return: return pointer to memory
|
||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Name: allocateMemory
|
Name: allocateMemory
|
||||||
Info: allocate memory on device
|
Info: allocate memory on device
|
||||||
@ -203,6 +190,20 @@ public:
|
|||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||||
|
|
||||||
|
/** Zero OpenCL memory buffer
|
||||||
|
* Set all the elemetns in the device array to zero
|
||||||
|
*/
|
||||||
|
template <typename T>
|
||||||
|
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
|
||||||
|
|
||||||
|
cl_int ierr;
|
||||||
|
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
|
||||||
|
sizeof(T)*size, 0, nullptr, nullptr);
|
||||||
|
if (ierr != CL_SUCCESS)
|
||||||
|
return DKS_ERROR;
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Name: writeData
|
Name: writeData
|
||||||
Info: write data to device memory (needs ptr to mem object)
|
Info: write data to device memory (needs ptr to mem object)
|
||||||
|
@ -117,15 +117,13 @@ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool for
|
|||||||
*/
|
*/
|
||||||
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
std::cout << "execute RCFFT" << std::endl;
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
int dkserr = DKS_SUCCESS;
|
||||||
cl_int ierr;
|
cl_int ierr;
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
cl_mem real_in = (cl_mem)real_ptr;
|
||||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
cl_mem comp_out = (cl_mem)comp_ptr;
|
||||||
|
|
||||||
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
||||||
0, NULL, NULL, &real_in, &comp_out, NULL);
|
0, NULL, NULL, &real_in, &comp_out, NULL);
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
if (ierr != OCL_SUCCESS) {
|
||||||
dkserr = DKS_ERROR;
|
dkserr = DKS_ERROR;
|
||||||
@ -144,8 +142,6 @@ int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3],
|
|||||||
*/
|
*/
|
||||||
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
std::cout << "execute CRFFT" << std::endl;
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
int dkserr = DKS_SUCCESS;
|
||||||
cl_int ierr;
|
cl_int ierr;
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
cl_mem real_in = (cl_mem)real_ptr;
|
||||||
@ -214,7 +210,13 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
|||||||
|
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
|
||||||
clfftDim dim = CLFFT_3D;
|
clfftDim dim;
|
||||||
|
if (ndim == 1)
|
||||||
|
dim = CLFFT_1D;
|
||||||
|
else if (ndim == 2)
|
||||||
|
dim = CLFFT_2D;
|
||||||
|
else
|
||||||
|
dim = CLFFT_3D;
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
/* Create 3D fft plan*/
|
||||||
@ -244,9 +246,20 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
|||||||
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
|
||||||
clfftDim dim = CLFFT_3D;
|
clfftDim dim;
|
||||||
|
if (ndim == 1)
|
||||||
|
dim = CLFFT_1D;
|
||||||
|
else if (ndim == 2)
|
||||||
|
dim = CLFFT_2D;
|
||||||
|
else
|
||||||
|
dim = CLFFT_3D;
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
|
|
||||||
|
size_t half = (size_t)N[0] / 2 + 1;
|
||||||
|
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
||||||
|
size_t clOutStride[3] = {1, half, half * N[1]};
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
/* Create 3D fft plan*/
|
||||||
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
||||||
|
|
||||||
@ -254,6 +267,8 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|||||||
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
||||||
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
||||||
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
||||||
|
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
|
||||||
|
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
|
||||||
|
|
||||||
/* Bake the plan */
|
/* Bake the plan */
|
||||||
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||||
@ -269,9 +284,20 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|||||||
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
|
||||||
clfftDim dim = CLFFT_3D;
|
clfftDim dim;
|
||||||
|
if (ndim == 1)
|
||||||
|
dim = CLFFT_1D;
|
||||||
|
else if (ndim == 2)
|
||||||
|
dim = CLFFT_2D;
|
||||||
|
else
|
||||||
|
dim = CLFFT_3D;
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
||||||
|
|
||||||
|
size_t half = (size_t)N[0] / 2 + 1;
|
||||||
|
size_t clInStride[3] = {1, half, half * N[1]};
|
||||||
|
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
/* Create 3D fft plan*/
|
||||||
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
||||||
|
|
||||||
@ -279,6 +305,8 @@ int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
|||||||
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
||||||
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
||||||
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
||||||
|
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
|
||||||
|
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
|
||||||
|
|
||||||
/* Bake the plan */
|
/* Bake the plan */
|
||||||
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#include "OpenCLGreensFunction.h"
|
#include "OpenCLGreensFunction.h"
|
||||||
#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl"
|
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
|
||||||
|
|
||||||
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
|
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
|
||||||
m_base = base;
|
m_base = base;
|
||||||
@ -29,6 +29,8 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
|
|||||||
double hr_m0, double hr_m1, double hr_m2,
|
double hr_m0, double hr_m1, double hr_m2,
|
||||||
int streamId)
|
int streamId)
|
||||||
{
|
{
|
||||||
|
int ierr = DKS_SUCCESS;
|
||||||
|
|
||||||
//compile opencl program from source
|
//compile opencl program from source
|
||||||
buildProgram();
|
buildProgram();
|
||||||
|
|
||||||
@ -42,26 +44,28 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
|
|||||||
work_items = (work_items / work_size + 1) * work_size;
|
work_items = (work_items / work_size + 1) * work_size;
|
||||||
|
|
||||||
//create kernel
|
//create kernel
|
||||||
ierr = m_oclbase->ocl_createKernel("kernelTmpgreen");
|
ierr = m_base->ocl_createKernel("kernelTmpgreen");
|
||||||
|
|
||||||
//set kernel parameters
|
//set kernel parameters
|
||||||
m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
||||||
m_base->setKernelArg(1, sizeof(double), &hr_m0);
|
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
|
||||||
m_base->setKernelArg(2, sizeof(double), &hr_m1);
|
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
|
||||||
m_base->setKernelArg(3, sizeof(double), &hr_m2);
|
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
|
||||||
m_base->setKernelArg(4, sizeof(int), &I);
|
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
||||||
m_base->setKernelArg(5, sizeof(int), &J);
|
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
||||||
m_base->setKernelArg(6, sizeof(int), &K);
|
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
||||||
|
|
||||||
//execute kernel
|
//execute kernel
|
||||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
|
||||||
int streamId)
|
int K, int streamId)
|
||||||
{
|
{
|
||||||
|
int ierr = DKS_SUCCESS;
|
||||||
|
|
||||||
//compile opencl program from source
|
//compile opencl program from source
|
||||||
buildProgram();
|
buildProgram();
|
||||||
|
|
||||||
@ -70,8 +74,6 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
|||||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
||||||
int NI = 2*(I - 1);
|
int NI = 2*(I - 1);
|
||||||
int NJ = 2*(J - 1);
|
int NJ = 2*(J - 1);
|
||||||
int NK = 2*(K - 1);
|
|
||||||
|
|
||||||
|
|
||||||
//set the work item size
|
//set the work item size
|
||||||
size_t work_size = 128;
|
size_t work_size = 128;
|
||||||
@ -80,20 +82,22 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
|||||||
work_items = (work_items / work_size + 1) * work_size;
|
work_items = (work_items / work_size + 1) * work_size;
|
||||||
|
|
||||||
//create kernel
|
//create kernel
|
||||||
ierr = m_oclbase->ocl_createKernel("kernelIntegration");
|
ierr = m_base->ocl_createKernel("kernelIntegration");
|
||||||
|
|
||||||
//set kernel parameters
|
//set kernel parameters
|
||||||
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||||
m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
||||||
m_base->setKernelArg(2, sizeof(int), &I);
|
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
|
||||||
m_base->setKernelArg(3, sizeof(int), &J);
|
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
|
||||||
m_base->setKernelArg(4, sizeof(int), &K);
|
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
||||||
m_base->setKernelArg(5, sizeof(int), &NI);
|
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
||||||
m_base->setKernelArg(6, sizeof(int), &NJ);
|
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
||||||
m_base->setKernelArg(7, sizeof(int), &NK);
|
|
||||||
|
|
||||||
//execute kernel
|
//execute kernel
|
||||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
double zero = 0.0;
|
||||||
|
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
||||||
|
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
|
||||||
|
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
@ -102,6 +106,8 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
|
|||||||
|
|
||||||
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
|
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
|
||||||
{
|
{
|
||||||
|
int ierr = DKS_SUCCESS;
|
||||||
|
|
||||||
//compile opencl program from source
|
//compile opencl program from source
|
||||||
buildProgram();
|
buildProgram();
|
||||||
|
|
||||||
@ -114,6 +120,8 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
|||||||
int J2 = 2*J;
|
int J2 = 2*J;
|
||||||
int K2 = 2*K;
|
int K2 = 2*K;
|
||||||
|
|
||||||
|
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
|
||||||
|
|
||||||
//set the work item size
|
//set the work item size
|
||||||
size_t work_size = 128;
|
size_t work_size = 128;
|
||||||
size_t work_items = NI * NJ * NK;
|
size_t work_items = NI * NJ * NK;
|
||||||
@ -121,19 +129,20 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
|||||||
work_items = (work_items / work_size + 1) * work_size;
|
work_items = (work_items / work_size + 1) * work_size;
|
||||||
|
|
||||||
//create kernel
|
//create kernel
|
||||||
ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField");
|
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
|
||||||
|
|
||||||
//set kernel parameters
|
//set kernel parameters
|
||||||
m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
||||||
m_base->setKernelArg(1, sizeof(int), &I2);
|
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
|
||||||
m_base->setKernelArg(2, sizeof(int), &J2);
|
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
|
||||||
m_base->setKernelArg(3, sizeof(int), &K2);
|
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
|
||||||
m_base->setKernelArg(4, sizeof(int), &NI);
|
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
|
||||||
m_base->setKernelArg(5, sizeof(int), &NJ);
|
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
|
||||||
m_base->setKernelArg(6, sizeof(int), &NK);
|
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
|
||||||
|
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
|
||||||
|
|
||||||
//execute kernel
|
//execute kernel
|
||||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
@ -141,4 +150,32 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
|
|||||||
|
|
||||||
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
|
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
|
||||||
{
|
{
|
||||||
|
int ierr = DKS_SUCCESS;
|
||||||
|
|
||||||
|
//compile opencl program from source
|
||||||
|
buildProgram();
|
||||||
|
|
||||||
|
//cast the input data ptr to cl_mem
|
||||||
|
cl_mem mem_ptr1 = (cl_mem) ptr1;
|
||||||
|
cl_mem mem_ptr2 = (cl_mem) ptr2;
|
||||||
|
|
||||||
|
//set the work item size
|
||||||
|
size_t work_size = 128;
|
||||||
|
size_t work_items = size;
|
||||||
|
if (work_items % work_size > 0)
|
||||||
|
work_items = (work_items / work_size + 1) * work_size;
|
||||||
|
|
||||||
|
//create kernel
|
||||||
|
ierr = m_base->ocl_createKernel("multiplyComplexFields");
|
||||||
|
|
||||||
|
//set kernel parameters
|
||||||
|
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
|
||||||
|
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
|
||||||
|
m_base->ocl_setKernelArg(2, sizeof(int), &size);
|
||||||
|
|
||||||
|
//execute kernel
|
||||||
|
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
||||||
|
|
||||||
|
return ierr;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -60,4 +60,4 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#endif H_OPENCL_GREENSFUNCTION
|
#endif
|
||||||
|
@ -81,28 +81,29 @@ __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgre
|
|||||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** miror rho-field */
|
/** miror rho-field */
|
||||||
__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
|
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
|
||||||
rho2_m[0] = rho2_m[NI*NJ];
|
rho2_m[0] = rho2_m[NI*NJ];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void mirroredRhoField(__global double *rho2_m,
|
__kernel void kernelMirroredRhoField(__global double *rho2_m,
|
||||||
int NI, int NJ, int NK,
|
int NI, int NJ, int NK,
|
||||||
int NI_tmp, int NJ_tmp, int NK_tmp) {
|
int NI_tmp, int NJ_tmp, int NK_tmp,
|
||||||
|
int size)
|
||||||
|
{
|
||||||
|
|
||||||
int tid = get_local_id(0);
|
int tid = get_local_id(0);
|
||||||
int id = get_global_id(0);
|
int id = get_global_id(0);
|
||||||
|
|
||||||
if (id == 0)
|
if (id == 0)
|
||||||
rho2_m[0] = rho2_m[NI * NJ];
|
rho2_m[0] = rho2_m[NI * NJ];
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||||
|
|
||||||
@ -127,27 +128,29 @@ __kernel void mirroredRhoField(__global double *rho2_m,
|
|||||||
id7 = rk * NI * NJ + rj * NI + i;
|
id7 = rk * NI * NJ + rj * NI + i;
|
||||||
id8 = rk * NI * NJ + rj * NI + ri;
|
id8 = rk * NI * NJ + rj * NI + ri;
|
||||||
|
|
||||||
|
double data = 0.0;
|
||||||
|
if (id1 < size)
|
||||||
|
data = rho2_m[id1];
|
||||||
|
|
||||||
double data = rho2_m[id1];
|
if (i != 0 && id2 < size) rho2_m[id2] = data;
|
||||||
if (i != 0) rho2_m[id2] = data;
|
|
||||||
|
|
||||||
if (j != 0) rho2_m[id3] = data;
|
if (j != 0 && id3 < size) rho2_m[id3] = data;
|
||||||
|
|
||||||
if (i != 0 && j != 0) rho2_m[id4] = data;
|
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
|
||||||
|
|
||||||
if (k != 0) rho2_m[id5] = data;
|
if (k != 0 && id5 < size) rho2_m[id5] = data;
|
||||||
|
|
||||||
if (k != 0 && i != 0) rho2_m[id6] = data;
|
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
|
||||||
|
|
||||||
if (k!= 0 && j != 0) rho2_m[id7] = data;
|
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
|
||||||
|
|
||||||
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
|
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** multiply complex fields */
|
/** multiply complex fields */
|
||||||
double2 CompelxMul(double2 a, double2 b) {
|
double2 ComplexMul(double2 a, double2 b) {
|
||||||
double2 c;
|
double2 c;
|
||||||
c.x = a.x * b.x - a.y * b.y;
|
c.x = a.x * b.x - a.y * b.y;
|
||||||
c.y = a.x * b.y + a.y * b.x;
|
c.y = a.x * b.y + a.y * b.x;
|
||||||
@ -155,12 +158,13 @@ double2 CompelxMul(double2 a, double2 b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
|
||||||
__kernel void multiplyComplexFields_2(__global double2 *ptr1, __global double2 *ptr2,
|
int size)
|
||||||
int size)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
int idx = get_global_id(0);
|
int idx = get_global_id(0);
|
||||||
|
|
||||||
if (idx < size)
|
if (idx < size)
|
||||||
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
|
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -25,7 +25,7 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
|||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
||||||
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
||||||
#ADD_EXECUTABLE(testPush testPush.cpp)
|
#ADD_EXECUTABLE(testPush testPush.cpp)
|
||||||
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
||||||
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
||||||
|
|
||||||
@ -56,7 +56,7 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
|||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testPush dks)
|
#TARGET_LINK_LIBRARIES(testPush dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
|
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
|
||||||
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
||||||
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -8,14 +10,20 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||||
void initData(double *data, int dimsize[3]);
|
void initData(double *data, int dimsize[3], int dim);
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
||||||
char *api_name, char *device_name);
|
char *api_name, char *device_name, char *file_name);
|
||||||
void printHelp();
|
void printHelp();
|
||||||
|
|
||||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||||
void printData3DN4(double* &data, int N, int dim);
|
void printData3DN4(double* &data, int N, int dim);
|
||||||
|
|
||||||
|
double precision(double a) {
|
||||||
|
//if (a < 1e-10)
|
||||||
|
// return 0.0;
|
||||||
|
//else
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
@ -26,32 +34,29 @@ int main(int argc, char *argv[]) {
|
|||||||
int loop = 0;
|
int loop = 0;
|
||||||
char *api_name = new char[10];
|
char *api_name = new char[10];
|
||||||
char *device_name = new char[10];
|
char *device_name = new char[10];
|
||||||
|
char *file_name = new char[50];
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, loop, api_name, device_name) )
|
if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
|
|
||||||
int dimsize[3] = {N3, N2, N1};
|
int dimsize[3] = {N1, N2, N3};
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||||
|
|
||||||
double *rdata = new double[sizereal];
|
double *rdata = new double[sizereal];
|
||||||
double *outdata = new double[sizereal];
|
double *outdata = new double[sizereal];
|
||||||
complex<double> *cfft = new complex<double>[sizecomp];
|
complex<double> *cfft = new complex<double>[sizecomp];
|
||||||
initData(rdata, dimsize);
|
initData(rdata, dimsize, dim);
|
||||||
|
|
||||||
/* init DKSBase */
|
/* init DKSBase */
|
||||||
cout << "Init device and set function" << endl;
|
cout << "Init device and set function" << endl;
|
||||||
DKSBase base;
|
DKSBase base;
|
||||||
base.setAPI(api_name, strlen(api_name));
|
base.setAPI(api_name, strlen(api_name));
|
||||||
base.setDevice(device_name, strlen(device_name));
|
base.setDevice(device_name, strlen(device_name));
|
||||||
base.initDevice();
|
base.initDevice();
|
||||||
base.setupFFT(3, dimsize);
|
base.setupFFT(dim, dimsize);
|
||||||
|
|
||||||
base.setupFFTRC(dim, dimsize);
|
|
||||||
/* setup backward fft (COMPLEX->REAL) */
|
|
||||||
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
|
||||||
|
|
||||||
// allocate memory on device
|
// allocate memory on device
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -63,68 +68,59 @@ DKSBase base;
|
|||||||
// execute one run before starting the timers
|
// execute one run before starting the timers
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
|
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
//timer for total loop time, FFT and IFFT calls
|
|
||||||
struct timeval timeStart, timeEnd;
|
|
||||||
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
|
|
||||||
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
|
|
||||||
|
|
||||||
gettimeofday(&timeStart, NULL);
|
|
||||||
for (int i=0; i<loop; ++i){
|
|
||||||
|
|
||||||
// write data to device
|
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
|
||||||
|
|
||||||
// execute rcfft
|
|
||||||
gettimeofday(&timeFFTStart[i], NULL);
|
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
|
||||||
gettimeofday(&timeFFTEnd[i], NULL);
|
|
||||||
|
|
||||||
// execute crfft
|
|
||||||
gettimeofday(&timeIFFTStart[i], NULL);
|
|
||||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
|
||||||
gettimeofday(&timeIFFTEnd[i], NULL);
|
|
||||||
|
|
||||||
//normalize
|
|
||||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
|
||||||
|
|
||||||
// read IFFT data from device
|
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
|
||||||
|
|
||||||
|
|
||||||
|
ofstream myfile;
|
||||||
|
myfile.open(file_name);
|
||||||
|
myfile<< "in\tout\treal\timag\n";
|
||||||
|
for (int i = 0; i < sizereal; i++) {
|
||||||
|
//myfile << precision(rdata[i]) << "\t";
|
||||||
|
//myfile << precision(outdata[i]) << "\t";
|
||||||
|
if (i < sizecomp) {
|
||||||
|
myfile << precision(cfft[i].real()) << "\t";
|
||||||
|
myfile << precision(cfft[i].imag());
|
||||||
|
}
|
||||||
|
myfile << "\n";
|
||||||
}
|
}
|
||||||
gettimeofday(&timeEnd, NULL);
|
myfile.close();
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
if (dim == 2) {
|
||||||
|
for (int i = 0; i < N2; i++) {
|
||||||
|
for (int j = 0; j < N1; j++) {
|
||||||
|
cout << rdata[i*N1 + j] << " ";
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (dim == 2) {
|
||||||
|
for (int i = 0; i < N2; i++) {
|
||||||
|
for (int j = 0; j < N1 / 2 + 1; j++) {
|
||||||
|
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
|
*/
|
||||||
// free device memory
|
// free device memory
|
||||||
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
||||||
base.freeMemory<double>(real_ptr, sizereal);
|
base.freeMemory<double>(real_ptr, sizereal);
|
||||||
base.freeMemory<double>(real_res_ptr, sizereal);
|
base.freeMemory<double>(real_res_ptr, sizereal);
|
||||||
|
|
||||||
// compare in and out data to see if we get back the same results
|
// compare in and out data to see if we get back the same results
|
||||||
|
cout << "comp" << endl;
|
||||||
compareData(rdata, outdata, N1, N2, N3, dim);
|
compareData(rdata, outdata, N1, N2, N3, dim);
|
||||||
|
cout << "done" << endl;
|
||||||
//calculate seconds for total time and fft times
|
|
||||||
double tfft = 0;
|
|
||||||
double tifft = 0;
|
|
||||||
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
|
|
||||||
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
|
|
||||||
|
|
||||||
for (int i = 0; i < loop; i++) {
|
|
||||||
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
|
|
||||||
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
|
|
||||||
|
|
||||||
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
|
|
||||||
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
|
|
||||||
}
|
|
||||||
|
|
||||||
//print timing results
|
|
||||||
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
|
|
||||||
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
|
|
||||||
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
|
|
||||||
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
|
|
||||||
<< "\n\n";
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -132,10 +128,10 @@ DKSBase base;
|
|||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
||||||
int id;
|
int id;
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < NI; i++) {
|
for (int i = 0; i < NK; i++) {
|
||||||
for (int j = 0; j < NJ; j++) {
|
for (int j = 0; j < NJ; j++) {
|
||||||
for (int k = 0; k < NK; k++) {
|
for (int k = 0; k < NI; k++) {
|
||||||
id = k*NI*NJ + j*NI + i;
|
id = i*NI*NJ + j*NI + k;
|
||||||
sum += fabs(data1[id] - data2[id]);
|
sum += fabs(data1[id] - data2[id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -143,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
|
|||||||
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int dimsize[3]) {
|
void initData(double *data, int dimsize[3], int dim) {
|
||||||
for (int i = 0; i < dimsize[2]; i++) {
|
if (dim == 3) {
|
||||||
|
for (int i = 0; i < dimsize[2]; i++)
|
||||||
|
for (int j = 0; j < dimsize[1]; j++)
|
||||||
|
for (int k = 0; k < dimsize[0]; k++)
|
||||||
|
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
|
||||||
|
} else if (dim == 2) {
|
||||||
for (int j = 0; j < dimsize[1]; j++) {
|
for (int j = 0; j < dimsize[1]; j++) {
|
||||||
for (int k = 0; k < dimsize[0]; k++) {
|
for (int k = 0; k < dimsize[0]; k++) {
|
||||||
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
|
data[j*dimsize[0] + k] = sin(k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
for (int k = 0; k < dimsize[0]; k++)
|
||||||
|
data[k] = sin(k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -168,12 +172,17 @@ void printHelp() {
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
||||||
char *api_name, char *device_name)
|
char *api_name, char *device_name, char *file_name)
|
||||||
{
|
{
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
|
|
||||||
|
if ( argv[i] == std::string("-dim")) {
|
||||||
|
dim = atoi(argv[i + 1]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
if ( argv[i] == std::string("-grid") ) {
|
if ( argv[i] == std::string("-grid") ) {
|
||||||
N1 = atoi(argv[i + 1]);
|
N1 = atoi(argv[i + 1]);
|
||||||
N2 = atoi(argv[i + 2]);
|
N2 = atoi(argv[i + 2]);
|
||||||
@ -194,21 +203,25 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
|
|||||||
if (argv[i] == string("-cuda")) {
|
if (argv[i] == string("-cuda")) {
|
||||||
strcpy(api_name, "Cuda");
|
strcpy(api_name, "Cuda");
|
||||||
strcpy(device_name, "-gpu");
|
strcpy(device_name, "-gpu");
|
||||||
|
strcpy(file_name, "cuda_fft.dat");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
if (argv[i] == string("-opencl")) {
|
||||||
strcpy(api_name, "OpenCL");
|
strcpy(api_name, "OpenCL");
|
||||||
strcpy(device_name, "-gpu");
|
strcpy(device_name, "-gpu");
|
||||||
|
strcpy(file_name, "opencl_fft.dat");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
if (argv[i] == string("-mic")) {
|
||||||
strcpy(api_name, "OpenMP");
|
strcpy(api_name, "OpenMP");
|
||||||
strcpy(device_name, "-mic");
|
strcpy(device_name, "-mic");
|
||||||
|
strcpy(file_name, "openmp_fft.dat");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
if (argv[i] == string("-cpu")) {
|
||||||
strcpy(api_name, "OpenCL");
|
strcpy(api_name, "OpenCL");
|
||||||
strcpy(device_name, "-cpu");
|
strcpy(device_name, "-cpu");
|
||||||
|
strcpy(file_name, "opencl_cpu_fft.dat");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
//#include <mpi.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -11,309 +10,265 @@ using namespace std;
|
|||||||
|
|
||||||
|
|
||||||
void printData3D(double* data, int N, int NI, const char *message = "") {
|
void printData3D(double* data, int N, int NI, const char *message = "") {
|
||||||
if (strcmp(message, "") != 0)
|
if (strcmp(message, "") != 0)
|
||||||
cout << message;
|
cout << message;
|
||||||
|
|
||||||
for (int i = 0; i < NI; i++) {
|
for (int i = 0; i < NI; i++) {
|
||||||
for (int j = 0; j < N; j++) {
|
for (int j = 0; j < N; j++) {
|
||||||
for (int k = 0; k < N; k++) {
|
for (int k = 0; k < N; k++) {
|
||||||
cout << data[i*N*N + j*N + k] << "\t";
|
cout << data[i*N*N + j*N + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int N) {
|
void initData(double *data, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N/4 + 1; i++) {
|
for (int i = 0; i < N/4 + 1; i++) {
|
||||||
for (int j = 0; j < N/2 + 1; j++) {
|
for (int j = 0; j < N/2 + 1; j++) {
|
||||||
for (int k = 0; k < N/2 + 1; k++) {
|
for (int k = 0; k < N/2 + 1; k++) {
|
||||||
data[i*N*N + j*N + k] = k+1;
|
data[i*N*N + j*N + k] = k+1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData2(double *data, int N) {
|
void initData2(double *data, int N) {
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
data[i] = i;
|
data[i] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initComplex( complex<double> *d, int N) {
|
void initComplex( complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
d[i] = complex<double>(2, 0);
|
d[i] = complex<double>(2, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printComplex(complex<double> *d, int N) {
|
void printComplex(complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
cout << d[i] << "\t";
|
cout << d[i] << "\t";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void printDouble(double *d, int N) {
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++)
|
||||||
|
cout << d[i] << ", ";
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initMirror(double *data, int n1, int n2, int n3) {
|
void initMirror(double *data, int n1, int n2, int n3) {
|
||||||
int d = 1;
|
int d = 1;
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
||||||
data[i * n2 * n1 + j * n1 + k] = d++;
|
data[i * n2 * n1 + j * n1 + k] = d++;
|
||||||
else
|
else
|
||||||
data[i * n2 * n1 + j * n1 + k] = 0;
|
data[i * n2 * n1 + j * n1 + k] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void printDiv(int c) {
|
void printDiv(int c) {
|
||||||
for (int i = 0; i < c; i++)
|
for (int i = 0; i < c; i++)
|
||||||
cout << "-";
|
cout << "-";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printMirror(double *data, int n1, int n2, int n3) {
|
void printMirror(double *data, int n1, int n2, int n3) {
|
||||||
|
|
||||||
printDiv(75);
|
printDiv(75);
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
double sumData(double *data, int datasize) {
|
double sumData(double *data, int datasize) {
|
||||||
|
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < datasize; i++)
|
for (int i = 0; i < datasize; i++)
|
||||||
sum += data[i];
|
sum += data[i];
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
/* mpi init */
|
char *api_name = new char[10];
|
||||||
//int rank, nprocs;
|
char *device_name = new char[10];
|
||||||
//MPI_Init(&argc, &argv);
|
|
||||||
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
||||||
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
|
||||||
|
|
||||||
/*
|
for (int i = 1; i < argc; i++) {
|
||||||
if (nprocs != 8) {
|
if (argv[i] == string("-cuda")) {
|
||||||
cout << "example was set to run with 8 processes" << endl;
|
strcpy(api_name, "Cuda");
|
||||||
cout << "exit..." << endl;
|
strcpy(device_name, "-gpu");
|
||||||
return 0;
|
}
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* set domain size */
|
if (argv[i] == string("-opencl")) {
|
||||||
int NG[3] = {64, 64, 32};
|
strcpy(api_name, "OpenCL");
|
||||||
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
strcpy(device_name, "-gpu");
|
||||||
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
}
|
||||||
int sizerho = NG[0] * NG[1] * NG[2];
|
|
||||||
int sizegreen = ng[0] * ng[1] * ng[2];
|
|
||||||
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
|
||||||
int id[3];
|
|
||||||
|
|
||||||
//id[0] = 0;
|
if (argv[i] == string("-mic")) {
|
||||||
//id[1] = NL[1] * (rank % 4);
|
strcpy(api_name, "OpenMP");
|
||||||
//id[2] = NL[2] * (rank / 4);
|
strcpy(device_name, "-mic");
|
||||||
|
}
|
||||||
|
|
||||||
/* print some messages bout the example in the begginig */
|
if (argv[i] == string("-cpu")) {
|
||||||
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
strcpy(api_name, "OpenCL");
|
||||||
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
|
strcpy(device_name, "-cpu");
|
||||||
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
}
|
||||||
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
|
}
|
||||||
int tmp[3];
|
|
||||||
/* for (int p = 1; p < nprocs; p++) {
|
|
||||||
MPI_Status mpistatus;
|
|
||||||
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
|
|
||||||
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
|
|
||||||
}*/
|
|
||||||
// } else {
|
|
||||||
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
|
|
||||||
// }
|
|
||||||
|
|
||||||
/* dks init and create 2 streams */
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
int dkserr;
|
|
||||||
//int streamGreens, streamFFT;
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
DKSBase base;
|
|
||||||
base.setAPI("OpenMP", 6);
|
|
||||||
base.setDevice("-mic", 4);
|
|
||||||
base.initDevice();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
/* set domain size */
|
||||||
DKSBase base;
|
int NG[3] = {64, 64, 32};
|
||||||
base.setAPI("Cuda", 4);
|
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
||||||
base.setDevice("-gpu", 4);
|
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
||||||
base.initDevice();
|
int sizerho = NG[0] * NG[1] * NG[2];
|
||||||
#endif
|
int sizegreen = ng[0] * ng[1] * ng[2];
|
||||||
|
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
||||||
|
|
||||||
//base.createStream(streamFFT);
|
/* print some messages bout the example in the begginig */
|
||||||
//if (rank == 0) {
|
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
||||||
// base.createStream(streamGreens);
|
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
||||||
base.setupFFT(3, NG);
|
|
||||||
//}
|
|
||||||
|
|
||||||
/* allocate memory and init rho field */
|
/* dks init and create 2 streams */
|
||||||
double *rho = new double[sizerho];
|
int dkserr;
|
||||||
double *rho_out = new double[sizerho];
|
DKSBase base;
|
||||||
//double *green_out = new double[sizegreen];
|
base.setAPI(api_name, strlen(api_name));
|
||||||
initMirror(rho, NL[0], NL[1], NL[2]);
|
base.setDevice(device_name, strlen(device_name));
|
||||||
|
base.initDevice();
|
||||||
|
base.setupFFT(3, NG);
|
||||||
|
|
||||||
/*
|
/* allocate memory and init rho field */
|
||||||
allocate memory on device for
|
double *rho = new double[sizerho];
|
||||||
- rho field
|
double *rho_out = new double[sizerho];
|
||||||
- rho FFT
|
//double *green_out = new double[sizegreen];
|
||||||
- tmpgreen
|
double *mirror_out = new double[sizerho];
|
||||||
- greens integral
|
//initMirror(rho, NL[0], NL[1], NL[2]);
|
||||||
- greens integral FFT
|
initMirror(rho, NG[0], NG[1], NG[2]);
|
||||||
*/
|
|
||||||
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
|
||||||
// if (rank == 0) {
|
|
||||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
|
||||||
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
/* } else {
|
|
||||||
grntr_ptr = NULL;
|
|
||||||
rho2_ptr = NULL;
|
|
||||||
grn_ptr = NULL;
|
|
||||||
rho2tr_ptr = NULL;
|
|
||||||
tmpgreen_ptr = NULL;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
allocate memory on device for
|
||||||
|
- rho field
|
||||||
|
- rho FFT
|
||||||
|
- tmpgreen
|
||||||
|
- greens integral
|
||||||
|
- greens integral FFT
|
||||||
|
*/
|
||||||
|
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
||||||
|
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
||||||
|
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
|
||||||
/* send and receive pointer to allocated memory on device */
|
/* =================================================*/
|
||||||
/*
|
/* =================================================*/
|
||||||
if (rank == 0) {
|
/* =====loop trough fftpoison solver iterations=====*/
|
||||||
for (int p = 1; p < nprocs; p++)
|
/* =================================================*/
|
||||||
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
|
/* =================================================*/
|
||||||
} else {
|
|
||||||
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
|
|
||||||
}
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
double old_sum = 0;
|
||||||
|
|
||||||
/* =================================================*/
|
int hr_m[3] = {1, 1, 1};
|
||||||
/* =================================================*/
|
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
|
||||||
/* =====loop trough fftpoison solver iterations=====*/
|
|
||||||
/* =================================================*/
|
|
||||||
/* =================================================*/
|
|
||||||
|
|
||||||
double old_sum = 0;
|
/* calculate greens integral on gpu */
|
||||||
double tmp_sum = 0;
|
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
||||||
for (int l = 0; l < 100; l++) {
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
/* on node 0, calculate tmpgreen on gpu */
|
|
||||||
int hr_m[3] = {1, 1, 1};
|
|
||||||
//if (rank == 0)
|
|
||||||
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
|
|
||||||
hr_m[0], hr_m[1], hr_m[2]);
|
|
||||||
|
|
||||||
/* calculate greens integral on gpu */
|
/* mirror the field */
|
||||||
//if (rank == 0)
|
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
||||||
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
/*
|
||||||
|
base.readData<double>(grn_ptr, mirror_out, sizerho);
|
||||||
|
for (int i = 0; i < sizerho; i++)
|
||||||
|
cout << mirror_out[i] << " ";
|
||||||
|
cout << endl << endl;
|
||||||
|
|
||||||
/* mirror the field */
|
for (int i = 0; i < sizerho; i++)
|
||||||
//if (rank == 0)
|
cout << rho[i] << " ";
|
||||||
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
cout << endl << endl;
|
||||||
|
*/
|
||||||
|
/* transfer rho field to device */
|
||||||
|
base.writeData<double>(rho2_ptr, rho, sizerho);
|
||||||
|
|
||||||
|
/* get FFT of rho field */
|
||||||
|
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
||||||
|
|
||||||
/* get FFT of mirrored greens integral */
|
/* get FFT of mirrored greens integral */
|
||||||
//if (rank == 0)
|
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
||||||
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
|
||||||
|
|
||||||
/* transfer rho field to device */
|
/* multiply both FFTs */
|
||||||
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
|
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
||||||
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
/* get FFT of rho field */
|
/*
|
||||||
//if (rank == 0) {
|
complex<double> *crho = new complex<double>[sizecomp];
|
||||||
//base.syncDevice();
|
complex<double> *cgre = new complex<double>[sizecomp];
|
||||||
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
|
||||||
//}
|
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
|
||||||
|
|
||||||
/* multiply both FFTs */
|
for (int i = 0; i < sizecomp; i++)
|
||||||
//if (rank == 0)
|
cout << cgre[i].real() << " ";
|
||||||
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
cout << endl << endl;
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
/* inverse fft and transfer data back */
|
for (int i = 0; i < sizecomp; i++)
|
||||||
/*
|
cout << crho[i].real() << " ";
|
||||||
multiple device syncs and mpi barriers are used to make sure data
|
cout << endl << endl;
|
||||||
transfer is started when results are ready and progam moves on
|
|
||||||
only when data transfer is finished
|
delete[] crho;
|
||||||
*/
|
delete[] cgre;
|
||||||
//if (rank == 0) {
|
*/
|
||||||
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
|
||||||
//base.syncDevice();
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
|
||||||
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
//base.syncDevice();
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
//cout << "result: " << sumData(rho_out, sizerho) << endl;
|
|
||||||
if (l == 0) {
|
|
||||||
old_sum = sumData(rho_out, sizerho);
|
|
||||||
} else {
|
|
||||||
tmp_sum = sumData(rho_out, sizerho);
|
|
||||||
if (old_sum != tmp_sum) {
|
|
||||||
cout << "diff in iteration: " << l << endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/*} else {
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
/* inverse fft and transfer data back */
|
||||||
|
/*
|
||||||
|
multiple device syncs and mpi barriers are used to make sure data
|
||||||
|
transfer is started when results are ready and progam moves on
|
||||||
|
only when data transfer is finished
|
||||||
|
*/
|
||||||
|
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
||||||
|
|
||||||
|
base.readData<double> (rho2_ptr, rho_out, sizerho);
|
||||||
|
|
||||||
|
for (int i = 0; i < 10; i++)
|
||||||
|
cout << rho_out[i] << " ";
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
old_sum = sumData(rho_out, sizerho);
|
||||||
|
|
||||||
}
|
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* ==========end fftpoison solver test run==========*/
|
/* ==========end fftpoison solver test run==========*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
|
|
||||||
|
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||||
|
base.freeMemory<double>(grn_ptr, sizerho);
|
||||||
|
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
||||||
|
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
||||||
|
base.freeMemory<double>(rho2_ptr, sizerho);
|
||||||
|
|
||||||
|
delete[] rho_out;
|
||||||
/* free memory on device */
|
delete[] rho;
|
||||||
//if (rank == 0) {
|
delete[] mirror_out;
|
||||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
cout << "Final sum: " << old_sum << endl;
|
||||||
base.freeMemory<double>(grn_ptr, sizerho);
|
|
||||||
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
|
||||||
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
base.freeMemory<double>(rho2_ptr, sizerho);
|
|
||||||
cout << "Final sum: " << old_sum << endl;
|
|
||||||
/*} else {
|
|
||||||
base.closeHandle(rho2_ptr);
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
}*/
|
|
||||||
|
|
||||||
//MPI_Finalize();
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user