#include "OpenCLFFT.h" //=====================================// //==========Private functions==========// //=====================================// /* call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform, ndim - totla number of dimmensions, N - size of dimension */ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) { //set the number of work items in each dimension size_t work_items[3]; work_items[0] = N; work_items[1] = (ndim > 1) ? N : 1; work_items[2] = (ndim > 1) ? N : 1; work_items[cdim] = N / 2; int f = (forward) ? 1 : 0; //create kernel and set kernel arguments if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS) return OCL_ERROR; //execute kernel for (int step = 1; step < N; step <<= 1) { if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) return OCL_ERROR; } return OCL_SUCCESS; } /* call ifft kernel to execute the bit reverse sort data - devevice memory ptr, cdim - current dim to transform, ndim - totla number of dimmensions, N - size of dimension */ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) { //set work item size size_t work_items[3]; work_items[0] = N; work_items[1] = (ndim > 1) ? N : 1; work_items[2] = (ndim > 2) ? N : 1; //create kernel and set kernel arguments if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS) return OCL_ERROR; int bits = log2(N); if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS) return OCL_ERROR; //execute kernel if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { DEBUG_MSG("Error executing kernel"); return OCL_ERROR; } return OCL_SUCCESS; } //=====================================// //==========Public functions==========// //=====================================// /* call fft execution on device for every dimension */ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { int ierr; cl_mem inout = (cl_mem)data; int n = N[0]; for (int dim = 0; dim < ndim; dim++) { ierr = ocl_callBitReverseKernel(inout, dim, ndim, n); if (ierr != OCL_SUCCESS) { DEBUG_MSG("Error executing bit reverse"); return OCL_ERROR; } ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward); if (ierr != OCL_SUCCESS) { DEBUG_MSG("Error executing fft reverse"); return OCL_ERROR; } } return OCL_SUCCESS; } /* execute ifft */ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) { executeFFT(data, ndim, N, streamId, false); return OCL_SUCCESS; } /* call kernel to normalize fft */ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { cl_mem inout = (cl_mem)data; int n = N[0]; //set work item size size_t work_items[3]; work_items[0] = n; work_items[1] = (ndim > 1) ? n : 1; work_items[2] = (ndim > 2) ? n : 1; //create kernel if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS) return OCL_ERROR; //set kernel args unsigned int elements = pow(n, ndim); if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS) return OCL_ERROR; if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS) return OCL_ERROR; //execute kernel if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { DEBUG_MSG("Error executing kernel"); return OCL_ERROR; } return OCL_SUCCESS; } int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) { int ierr; int size = sizeof(cl_double2)*pow(N,ndim); cl_mem mem_tmp; cl_mem mem_src = (cl_mem)src; cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr); //set the number of work items in each dimension size_t work_items[3]; int p = 1; int threads = N / 2; int f = (forward) ? -1 : 1; //execute kernel int n = (int)log2(N); for (int i = 0; i < ndim; i++) { int dim = i+1; p = 1; work_items[0] = (dim == 1) ? N/2 : N; work_items[1] = (dim == 2) ? N/2 : N; work_items[2] = (dim == 3) ? N/2 : N; //transpose array if calculating dimension larger than 1 //if (dim > 1) // ocl_executeTranspose(mem_src, N, ndim, dim); //create kernel and set kernel arguments if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS) return OCL_ERROR; for (int t = 1; t <= log2(N); t++) { m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst); m_oclbase->ocl_setKernelArg(2, sizeof(int), &p); m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads); m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim); m_oclbase->ocl_setKernelArg(5, sizeof(int), &f); if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) return OCL_ERROR; mem_tmp = mem_src; mem_src = mem_dst; mem_dst = mem_tmp; p = 2*p; } //transpose array back if calculating dimension larger than 1 //if (dim > 1) // ocl_executeTranspose(mem_src, N, ndim, dim); } if (ndim*n % 2 == 1) { m_oclbase->ocl_copyData(mem_src, mem_dst, size); mem_tmp = mem_src; mem_src = mem_dst; mem_dst = mem_tmp; } m_oclbase->ocl_freeMemory(mem_dst); return OCL_SUCCESS; } int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) { cl_mem mem_src = (cl_mem)src; size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N}; size_t work_group_size[3] = {(size_t)N/2, 1, 1}; m_oclbase->ocl_createKernel("fft_batch3D"); m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL); m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL); m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL); m_oclbase->ocl_setKernelArg(4, sizeof(int), &N); for (int dim = 1; dim < ndim+1; dim++) { m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim); m_oclbase->ocl_executeKernel(3, work_items, work_group_size); } return OCL_SUCCESS; } int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) { cl_mem mem_src = (cl_mem)src; if (ndim == 1) return OCL_SUCCESS; size_t work_items[3]; work_items[0] = N[0]; work_items[1] = N[1]; work_items[2] = 1; size_t work_group_size[3]; work_group_size[0] = N[0]; work_group_size[1] = N[1]; work_group_size[2] = 1; size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2]; m_oclbase->ocl_createKernel("transpose"); m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src); m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]); m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]); m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL); m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size); return OCL_SUCCESS; } /* void OpenCLFFT::printData3DN4(cl_double2* &data, int N) { for (int j = 0; j < N; j++) { for (int i = 0; i < N; i++) { for (int k = 0; k < N; k++) { double d = data[i*N*N + j*N + k].x; if (d > 10e-5 || d < -10e-5) std::cout << d << "\t"; else std::cout << 0 << "\t"; } } std::cout << std::endl; } std::cout << std::endl; } */