Files
DKS/src/OpenCL/OpenCLFFT.cpp
2016-10-10 14:49:32 +02:00

304 lines
7.9 KiB
C++

#include "OpenCLFFT.h"
//=====================================//
//==========Private functions==========//
//=====================================//
/*
  Run the "FFT3D" kernel over one dimension of the domain.

  data    - device memory ptr holding the values to transform
  cdim    - current dimension to transform (0-based index into the work item array)
  ndim    - total number of dimensions
  N       - size of each dimension
  forward - transform direction, handed to the kernel as 1 when true and 0 when false

  The kernel is launched once for every step = 1, 2, 4, ... while step < N.
  Returns OCL_SUCCESS, or OCL_ERROR as soon as any call on m_oclbase fails or
  when cdim does not address one of the three work item dimensions.
*/
int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) {

  //cdim indexes the 3-element work item array below, reject anything outside of it
  if (cdim < 0 || cdim > 2)
    return OCL_ERROR;

  //set the number of work items in each dimension, dimensions that are not
  //part of the domain get a single work item (same rule as the other kernels in this file)
  size_t work_items[3];
  work_items[0] = N;
  work_items[1] = (ndim > 1) ? N : 1;
  work_items[2] = (ndim > 2) ? N : 1;
  //the dimension being transformed is launched with N / 2 work items
  work_items[cdim] = N / 2;

  int f = (forward) ? 1 : 0;

  //create kernel and set the kernel arguments that stay fixed for all steps
  if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
    return OCL_ERROR;

  //execute kernel once per step, argument 1 carries the current step
  for (int step = 1; step < N; step <<= 1) {
    if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
      return OCL_ERROR;

    if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
      return OCL_ERROR;
  }

  return OCL_SUCCESS;
}
/*
  Launch the "BitReverseSort3D" kernel for one dimension of the domain.

  data - device memory ptr
  cdim - current dimension to transform
  ndim - total number of dimensions
  N    - size of each dimension

  Returns OCL_SUCCESS, or OCL_ERROR if creating the kernel, setting an argument
  or executing the kernel fails.
*/
int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {

  //one work item per element; dimensions beyond ndim collapse to a single item
  size_t global_size[3];
  for (int d = 0; d < 3; d++)
    global_size[d] = (d == 0 || ndim > d) ? N : 1;

  if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS)
    return OCL_ERROR;

  //log2(N) truncated to int is handed to the kernel as argument 1
  int bit_count = log2(N);

  //kernel arguments: 0 - data, 1 - bit count, 2 - current dimension
  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS ||
      m_oclbase->ocl_setKernelArg(1, sizeof(int), &bit_count) != OCL_SUCCESS ||
      m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
    return OCL_ERROR;

  //run the kernel
  if (m_oclbase->ocl_executeKernel(ndim, global_size) == OCL_SUCCESS)
    return OCL_SUCCESS;

  DEBUG_MSG("Error executing kernel");
  return OCL_ERROR;
}
//=====================================//
//==========Public functions==========//
//=====================================//
/*
call fft execution on device for every dimension
*/
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
int ierr;
cl_mem inout = (cl_mem)data;
int n = N[0];
for (int dim = 0; dim < ndim; dim++) {
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Error executing bit reverse");
return OCL_ERROR;
}
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Error executing fft reverse");
return OCL_ERROR;
}
}
return OCL_SUCCESS;
}
/*
  Execute the inverse FFT: forwards to executeFFT with forward = false.

  data     - device memory ptr
  ndim     - number of dimensions to transform
  N        - dimension sizes
  streamId - passed through to executeFFT

  Returns whatever executeFFT returns, so a failed kernel call is reported to
  the caller as OCL_ERROR instead of being silently turned into OCL_SUCCESS.
*/
int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
  return executeFFT(data, ndim, N, streamId, false);
}
/*
call kernel to normalize fft
*/
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
cl_mem inout = (cl_mem)data;
int n = N[0];
//set work item size
size_t work_items[3];
work_items[0] = n;
work_items[1] = (ndim > 1) ? n : 1;
work_items[2] = (ndim > 2) ? n : 1;
//create kernel
if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS)
return OCL_ERROR;
//set kernel args
unsigned int elements = pow(n, ndim);
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS)
return OCL_ERROR;
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS)
return OCL_ERROR;
//execute kernel
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
DEBUG_MSG("Error executing kernel");
return OCL_ERROR;
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
int ierr;
int size = sizeof(cl_double2)*pow(N,ndim);
cl_mem mem_tmp;
cl_mem mem_src = (cl_mem)src;
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
//set the number of work items in each dimension
size_t work_items[3];
int p = 1;
int threads = N / 2;
int f = (forward) ? -1 : 1;
//execute kernel
int n = (int)log2(N);
for (int i = 0; i < ndim; i++) {
int dim = i+1;
p = 1;
work_items[0] = (dim == 1) ? N/2 : N;
work_items[1] = (dim == 2) ? N/2 : N;
work_items[2] = (dim == 3) ? N/2 : N;
//transpose array if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
//create kernel and set kernel arguments
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
return OCL_ERROR;
for (int t = 1; t <= log2(N); t++) {
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
return OCL_ERROR;
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
p = 2*p;
}
//transpose array back if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
}
if (ndim*n % 2 == 1) {
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
}
m_oclbase->ocl_freeMemory(mem_dst);
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
cl_mem mem_src = (cl_mem)src;
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
m_oclbase->ocl_createKernel("fft_batch3D");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
for (int dim = 1; dim < ndim+1; dim++) {
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
cl_mem mem_src = (cl_mem)src;
if (ndim == 1)
return OCL_SUCCESS;
size_t work_items[3];
work_items[0] = N[0];
work_items[1] = N[1];
work_items[2] = 1;
size_t work_group_size[3];
work_group_size[0] = N[0];
work_group_size[1] = N[1];
work_group_size[2] = 1;
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
m_oclbase->ocl_createKernel("transpose");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
return OCL_SUCCESS;
}
/*
void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N; k++) {
double d = data[i*N*N + j*N + k].x;
if (d > 10e-5 || d < -10e-5)
std::cout << d << "\t";
else
std::cout << 0 << "\t";
}
}
std::cout << std::endl;
}
std::cout << std::endl;
}
*/