diff --git a/src/OpenCL/OpenCLGreensFunction.cpp b/src/OpenCL/OpenCLGreensFunction.cpp new file mode 100644 index 0000000..fab16b4 --- /dev/null +++ b/src/OpenCL/OpenCLGreensFunction.cpp @@ -0,0 +1,144 @@ +#include "OpenCLGreensFunction.h" +#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl" + +OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) { + m_base = base; + base_create = false; +} + +OpenCLGreensFunction::OpenCLGreensFunction() { + m_base = new OpenCLBase(); + base_create = true; +} + +OpenCLGreensFunction::~OpenCLGreensFunction() { + if (base_create) + delete m_base; +} + +int OpenCLGreensFunction::buildProgram() { + char *kernel_file = new char[500]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, GREENS_KERNEL); + + return m_base->ocl_loadKernel(kernel_file); +} + +int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId) +{ + //compile opencl program from source + buildProgram(); + + //cast the input data ptr to cl_mem + cl_mem tmpgreen_ptr = (cl_mem)tmpgreen; + + //set the work item size + size_t work_size = 128; + size_t work_items = I * J * K; + if (work_items % work_size > 0) + work_items = (work_items / work_size + 1) * work_size; + + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelTmpgreen"); + + //set kernel parameters + m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr); + m_base->setKernelArg(1, sizeof(double), &hr_m0); + m_base->setKernelArg(2, sizeof(double), &hr_m1); + m_base->setKernelArg(3, sizeof(double), &hr_m2); + m_base->setKernelArg(4, sizeof(int), &I); + m_base->setKernelArg(5, sizeof(int), &J); + m_base->setKernelArg(6, sizeof(int), &K); + + //execute kernel + ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + return ierr; +} + +int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, + int streamId) +{ + //compile opencl program from source + buildProgram(); + + //cast the input data ptr to cl_mem + cl_mem rho2_ptr = (cl_mem)rho2_m; + cl_mem tmpgreen_ptr = (cl_mem)tmpgreen; + int NI = 2*(I - 1); + int NJ = 2*(J - 1); + int NK = 2*(K - 1); + + + //set the work item size + size_t work_size = 128; + size_t work_items = I * J * K; + if (work_items % work_size > 0) + work_items = (work_items / work_size + 1) * work_size; + + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelIntegration"); + + //set kernel parameters + m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr); + m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr); + m_base->setKernelArg(2, sizeof(int), &I); + m_base->setKernelArg(3, sizeof(int), &J); + m_base->setKernelArg(4, sizeof(int), &K); + m_base->setKernelArg(5, sizeof(int), &NI); + m_base->setKernelArg(6, sizeof(int), &NJ); + m_base->setKernelArg(7, sizeof(int), &NK); + + //execute kernel + ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + return ierr; + +} + + +int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) +{ + //compile opencl program from source + buildProgram(); + + //cast the input data ptr to cl_mem + cl_mem rho2_ptr = (cl_mem)rho2_m; + int NI = I + 1; + int NJ = J + 1; + int NK = K + 1; + int I2 = 2*I; + int J2 = 2*J; + int K2 = 2*K; + + //set the work item size + size_t work_size = 128; + size_t work_items = NI * NJ * NK; + if (work_items % work_size > 0) + work_items = (work_items / work_size + 1) * work_size; + + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField"); + + //set kernel parameters + m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr); + m_base->setKernelArg(1, sizeof(int), &I2); + m_base->setKernelArg(2, sizeof(int), &J2); + m_base->setKernelArg(3, sizeof(int), &K2); + m_base->setKernelArg(4, sizeof(int), &NI); + m_base->setKernelArg(5, sizeof(int), &NJ); + m_base->setKernelArg(6, sizeof(int), &NK); + + //execute kernel + ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + return ierr; +} + + +int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId) +{ +} diff --git a/src/OpenCL/OpenCLGreensFunction.h b/src/OpenCL/OpenCLGreensFunction.h new file mode 100644 index 0000000..33b7502 --- /dev/null +++ b/src/OpenCL/OpenCLGreensFunction.h @@ -0,0 +1,63 @@ +#ifndef H_OPENCL_GREENSFUNCTION +#define H_OPENCL_GREENSFUNCTION + +#include +#include + +#include "../Algorithms/GreensFunction.h" +#include "OpenCLBase.h" + +class OpenCLGreensFunction : public GreensFunction { + +private: + + bool base_create; + OpenCLBase *m_base; + +public: + + /** Constructor with OpenCLBase argument */ + OpenCLGreensFunction(OpenCLBase *base); + + /** Default constructor */ + OpenCLGreensFunction(); + + /** Destructor */ + ~OpenCLGreensFunction(); + + /** Load OpenCL kernel file containing greens function kernels. + * m_base takes the kernel file and compiles the OpenCL programm. + */ + int buildProgram(); + + /** + Info: calc itegral on device memory (taken from OPAL src code) + Return: success or error code + */ + int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId = -1); + + /** + Info: integration of rho2_m field (taken from OPAL src code) + Return: success or error code + */ + int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, + int streamId = -1); + + /** + Info: mirror rho field (taken from OPAL src code) + Return: succes or error code + */ + int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1); + + /** + Info: multiply complex fields already on the GPU memory, result will be put in ptr1 + Return: success or error code + */ + int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); + +}; + + +#endif H_OPENCL_GREENSFUNCTION diff --git a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl new file mode 100644 index 0000000..fae4526 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl @@ -0,0 +1,166 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/** compute the greens integral analytically */ +__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, + int NI, int NJ, int NK) +{ + + int tid = get_local_size(0); + int id = get_global_id(0); + + if (id < NI * NJ * NK) { + int i = id % NI; + int k = id / (NI * NJ); + int j = (id - k * NI * NJ) / NI; + + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgreen[id] = tmpgrn / cellVolume; + + } + +} + +/** perform the actual integration */ +__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen, + int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) +{ + + int tid = get_local_id(0); + int id = get_global_id(0); + + int ni = NI; + int nj = NJ; + + double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + if (id < NI_tmp * NJ_tmp * NK_tmp) { + int i = id % NI_tmp; + int k = id / (NI_tmp * NJ_tmp); + int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp; + + tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0; + tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0; + + if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp) + tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp) + tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp) + tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (k+1 < NK_tmp) + tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && j+1 < NJ_tmp) + tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && k+1 < NK_tmp) + tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp && k+1 < NK_tmp) + tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; + + double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; + + rho2_m[i + j*ni + k*ni*nj] = tmp_rho; + } + +} + +/** miror rho-field */ +__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) { + rho2_m[0] = rho2_m[NI*NJ]; +} + +__kernel void mirroredRhoField(__global double *rho2_m, + int NI, int NJ, int NK, + int NI_tmp, int NJ_tmp, int NK_tmp) { + + int tid = get_local_id(0); + int id = get_global_id(0); + + if (id == 0) + rho2_m[0] = rho2_m[NI * NJ]; + + barrier(CLK_GLOBAL_MEM_FENCE); + + int id1, id2, id3, id4, id5, id6, id7, id8; + + if (id < NI_tmp * NJ_tmp * NK_tmp) { + int i = id % NI_tmp; + int k = id / (NI_tmp * NJ_tmp); + int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp; + + int ri = NI - i; + int rj = NJ - j; + int rk = NK - k; + + id1 = k * NI * NJ + j * NI + i; + id2 = k * NI * NJ + j * NI + ri; + id3 = k * NI * NJ + rj * NI + i; + id4 = k * NI * NJ + rj * NI + ri; + + id5 = rk * NI * NJ + j * NI + i; + id6 = rk * NI * NJ + j * NI + ri; + id7 = rk * NI * NJ + rj * NI + i; + id8 = rk * NI * NJ + rj * NI + ri; + + + double data = rho2_m[id1]; + if (i != 0) rho2_m[id2] = data; + + if (j != 0) rho2_m[id3] = data; + + if (i != 0 && j != 0) rho2_m[id4] = data; + + if (k != 0) rho2_m[id5] = data; + + if (k != 0 && i != 0) rho2_m[id6] = data; + + if (k!= 0 && j != 0) rho2_m[id7] = data; + + if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data; + } + +} + +/** multiply complex fields */ +double2 CompelxMul(double2 a, double2 b) { + double2 c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + + return c; +} + + +__kernel void multiplyComplexFields_2(__global double2 *ptr1, __global double2 *ptr2, + int size) +{ + int idx = get_global_id(0); + + if (idx < size) + ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]); +}