304 lines
7.9 KiB
C++
304 lines
7.9 KiB
C++
#include "OpenCLFFT.h"
|
|
|
|
//=====================================//
|
|
//==========Private functions==========//
|
|
//=====================================//
|
|
|
|
/*
|
|
call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform,
|
|
ndim - totla number of dimmensions, N - size of dimension
|
|
*/
|
|
int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) {
|
|
|
|
//set the number of work items in each dimension
|
|
size_t work_items[3];
|
|
work_items[0] = N;
|
|
work_items[1] = (ndim > 1) ? N : 1;
|
|
work_items[2] = (ndim > 1) ? N : 1;
|
|
work_items[cdim] = N / 2;
|
|
|
|
int f = (forward) ? 1 : 0;
|
|
|
|
//create kernel and set kernel arguments
|
|
if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
|
|
//execute kernel
|
|
for (int step = 1; step < N; step <<= 1) {
|
|
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
}
|
|
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
call ifft kernel to execute the bit reverse sort data - devevice memory ptr, cdim - current dim to transform,
|
|
ndim - totla number of dimmensions, N - size of dimension
|
|
*/
|
|
int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {
|
|
//set work item size
|
|
size_t work_items[3];
|
|
work_items[0] = N;
|
|
work_items[1] = (ndim > 1) ? N : 1;
|
|
work_items[2] = (ndim > 2) ? N : 1;
|
|
|
|
//create kernel and set kernel arguments
|
|
if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
int bits = log2(N);
|
|
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
//execute kernel
|
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
|
|
DEBUG_MSG("Error executing kernel");
|
|
return OCL_ERROR;
|
|
}
|
|
|
|
return OCL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
//=====================================//
|
|
//==========Public functions==========//
|
|
//=====================================//
|
|
|
|
/*
|
|
call fft execution on device for every dimension
|
|
*/
|
|
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
|
int ierr;
|
|
|
|
cl_mem inout = (cl_mem)data;
|
|
int n = N[0];
|
|
|
|
for (int dim = 0; dim < ndim; dim++) {
|
|
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
|
|
if (ierr != OCL_SUCCESS) {
|
|
DEBUG_MSG("Error executing bit reverse");
|
|
return OCL_ERROR;
|
|
}
|
|
|
|
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
|
|
if (ierr != OCL_SUCCESS) {
|
|
DEBUG_MSG("Error executing fft reverse");
|
|
return OCL_ERROR;
|
|
}
|
|
}
|
|
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
execute ifft
|
|
*/
|
|
int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
|
|
executeFFT(data, ndim, N, streamId, false);
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
call kernel to normalize fft
|
|
*/
|
|
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
|
|
|
cl_mem inout = (cl_mem)data;
|
|
|
|
int n = N[0];
|
|
|
|
//set work item size
|
|
size_t work_items[3];
|
|
work_items[0] = n;
|
|
work_items[1] = (ndim > 1) ? n : 1;
|
|
work_items[2] = (ndim > 2) ? n : 1;
|
|
|
|
//create kernel
|
|
if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
//set kernel args
|
|
unsigned int elements = pow(n, ndim);
|
|
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
//execute kernel
|
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
|
|
DEBUG_MSG("Error executing kernel");
|
|
return OCL_ERROR;
|
|
}
|
|
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
|
|
|
|
int ierr;
|
|
int size = sizeof(cl_double2)*pow(N,ndim);
|
|
|
|
cl_mem mem_tmp;
|
|
cl_mem mem_src = (cl_mem)src;
|
|
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
|
|
|
|
//set the number of work items in each dimension
|
|
size_t work_items[3];
|
|
int p = 1;
|
|
int threads = N / 2;
|
|
int f = (forward) ? -1 : 1;
|
|
|
|
//execute kernel
|
|
int n = (int)log2(N);
|
|
for (int i = 0; i < ndim; i++) {
|
|
|
|
int dim = i+1;
|
|
p = 1;
|
|
work_items[0] = (dim == 1) ? N/2 : N;
|
|
work_items[1] = (dim == 2) ? N/2 : N;
|
|
work_items[2] = (dim == 3) ? N/2 : N;
|
|
|
|
//transpose array if calculating dimension larger than 1
|
|
//if (dim > 1)
|
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
|
|
|
//create kernel and set kernel arguments
|
|
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
for (int t = 1; t <= log2(N); t++) {
|
|
|
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
|
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
|
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
|
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
|
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
|
|
|
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
|
return OCL_ERROR;
|
|
|
|
mem_tmp = mem_src;
|
|
mem_src = mem_dst;
|
|
mem_dst = mem_tmp;
|
|
|
|
p = 2*p;
|
|
}
|
|
|
|
//transpose array back if calculating dimension larger than 1
|
|
//if (dim > 1)
|
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
|
}
|
|
|
|
if (ndim*n % 2 == 1) {
|
|
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
|
|
mem_tmp = mem_src;
|
|
mem_src = mem_dst;
|
|
mem_dst = mem_tmp;
|
|
}
|
|
|
|
m_oclbase->ocl_freeMemory(mem_dst);
|
|
|
|
return OCL_SUCCESS;
|
|
|
|
}
|
|
|
|
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
|
|
|
|
cl_mem mem_src = (cl_mem)src;
|
|
|
|
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
|
|
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
|
|
|
|
m_oclbase->ocl_createKernel("fft_batch3D");
|
|
|
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
|
|
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
|
|
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
|
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
|
|
|
|
|
|
for (int dim = 1; dim < ndim+1; dim++) {
|
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
|
|
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
|
|
}
|
|
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
|
|
|
|
cl_mem mem_src = (cl_mem)src;
|
|
|
|
if (ndim == 1)
|
|
return OCL_SUCCESS;
|
|
|
|
size_t work_items[3];
|
|
work_items[0] = N[0];
|
|
work_items[1] = N[1];
|
|
work_items[2] = 1;
|
|
|
|
size_t work_group_size[3];
|
|
work_group_size[0] = N[0];
|
|
work_group_size[1] = N[1];
|
|
work_group_size[2] = 1;
|
|
|
|
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
|
|
|
|
m_oclbase->ocl_createKernel("transpose");
|
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
|
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
|
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
|
|
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
|
|
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
|
|
|
|
return OCL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
|
|
|
|
for (int j = 0; j < N; j++) {
|
|
for (int i = 0; i < N; i++) {
|
|
for (int k = 0; k < N; k++) {
|
|
double d = data[i*N*N + j*N + k].x;
|
|
if (d > 10e-5 || d < -10e-5)
|
|
std::cout << d << "\t";
|
|
else
|
|
std::cout << 0 << "\t";
|
|
}
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|