// Patch commentary (free text placed before the first "diff --git" header; the patch
// body below is unchanged).
//
// What the hunks below do:
//  * auto-tuning/CMakeLists.txt: adds a testFFT executable built from testFFT.cpp and
//    linked against dks, ${Boost_LIBRARIES} and ${CLFFT_LIBRARIES}.
//  * auto-tuning/testFFT.cpp (new file): command-line test driver. main() creates a
//    DKSBaseMuSR, sets API/device, calls setupFFT(dim, dimsize), then
//      - writes real data, runs callR2CFFT -> callC2RFFT -> callNormalizeC2RFFT, reads
//        the result back and prints the summed absolute difference to the input;
//      - writes complex data, runs callFFT -> callIFFT -> callNormalizeFFT, reads the
//        result back and prints the summed absolute difference to the input.
//    Input data is sin(k) along the fastest index (initData). readParams() recognises
//    -dim, -grid, -cuda, -opencl, -mic and -cpu.
//  * src/Algorithms/FFT.h: the abstract backend interface class is renamed
//    DKSFFT -> FFT; CudaFFT, MICFFT and OpenCLFFT now derive from FFT.
//  * src/DKSFFT.h / src/DKSFFT.cpp: DKSFFT (derived from DKSBase) holds `FFT *dksfft`;
//    setupFFT() now creates the backend object (CudaFFT / OpenCLFFT / MICFFT) before
//    using it; the destructor definition is corrected to `DKSFFT::~DKSFFT()`; the
//    include guard is renamed and a closing #endif is added.
//  * src/DKSBaseMuSR.h, src/DKSOPAL.h: both classes now derive from DKSFFT instead of
//    DKSBase. DKSOPAL drops its own dksfft member, its creation in setupOPAL() and its
//    delete in ~DKSOPAL().
//  * src/CMakeLists.txt: DKSFFT.h/.cpp replace DKSOPAL.h/.cpp in the base header/source lists.
//  * src/CUDA, src/MIC CMake lists: the FFT sources/headers move out of the ENABLE_OPAL
//    block into the unconditional lists. src/OpenCL: they move from the
//    `ENABLE_AMD AND ENABLE_OPAL` block into a new `ENABLE_AMD` block.
//  * src/OpenCL/OpenCLFFT.h: normalizeCRFFT is only re-indented; it still returns DKS_ERROR.
//
// NOTE(review): in this copy the hunk lines are joined onto a few long lines, and text
// that would sit inside angle brackets appears to be missing (#include lines without a
// header name, `complex` without template arguments, allocateMemory/writeData/readData
// on real data without a template argument) -- presumably an extraction artifact;
// verify against the original patch before applying.
// NOTE(review): testFFT.cpp main() allocates api_name/device_name with new char[10];
// readParams() writes them only when -cuda/-opencl/-mic/-cpu is given, yet main() prints
// them and passes them to strlen() unconditionally. Neither buffer is delete[]'d.
// NOTE(review): readParams() reads argv[i+1] (and argv[i+2], argv[i+3] for -grid)
// without checking against argc, and always returns false, so the early `return 0` in
// main() is never taken. printHelp() is declared but neither defined nor called in the
// added lines.
// NOTE(review): compareData() is called with N1 and a hard-coded 3, loops N times per
// axis and indexes with i*ni*ni + j*nj + k; this matches the dimsize layout used by
// initData() only when all extents are equal. initData() and the complex FFT calls also
// receive a hard-coded 3 while setupFFT() and the R2C/C2R calls receive dim.
// NOTE(review): DKSFFT::setupFFT() assigns a freshly created object to dksfft on every
// call and no release of a previous object is visible in the hunk -- looks like a
// repeated setupFFT() would drop the old object; confirm against the *_SAFEINIT macros.
// NOTE(review): if callNormalizeC2RFFT dispatches to OpenCLFFT::normalizeCRFFT, the
// OpenCL real round trip in testFFT would report DKS_ERROR from that step -- confirm.
//
diff --git a/auto-tuning/CMakeLists.txt b/auto-tuning/CMakeLists.txt index 77e5f67..07611b8 100644 --- a/auto-tuning/CMakeLists.txt +++ b/auto-tuning/CMakeLists.txt @@ -28,4 +28,7 @@ IF (ENABLE_OPAL) TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) ENDIF(ENABLE_OPAL) +ADD_EXECUTABLE(testFFT testFFT.cpp) +TARGET_LINK_LIBRARIES(testFFT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) + diff --git a/auto-tuning/testFFT.cpp b/auto-tuning/testFFT.cpp new file mode 100644 index 0000000..69bbff6 --- /dev/null +++ b/auto-tuning/testFFT.cpp @@ -0,0 +1,216 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSFFT.h" +#include "DKSOPAL.h" +#include "DKSBaseMuSR.h" + +using namespace std; + +void compareData(complex* data1, complex* data2, int N, int dim); +void compareData(double* data1, double *data2, int N, int dim); + +void initData(complex *data, int dimsize[3], int dim); +void initData(double *data, int dimsize[3], int dim); + +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim, + char *api_name, char *device_name); + +void printHelp(); + +int main(int argc, char *argv[]) { + + int ierr; + int N1 = 8; + int N2 = 8; + int N3 = 8; + int dim = 3; + char *api_name = new char[10]; + char *device_name = new char[10]; + + if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) ) + return 0; + + cout << "Use api: " << api_name << ", " << device_name << endl; + + int dimsize[3] = {N1, N2, N3}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; + + double *rdata = new double[sizereal]; + double *ordata = new double[sizereal]; + complex *cdata = new complex[sizereal]; + complex *codata = new complex[sizereal]; + + initData(rdata, dimsize, 3); + initData(cdata, dimsize, 3); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBaseMuSR base; + base.setAPI(api_name, strlen(api_name)); + 
base.setDevice(device_name, strlen(device_name)); + cout << "init device" << endl; + base.initDevice(); + cout << "setup fft" << endl; + base.setupFFT(dim, dimsize); + + //Test RC FFT -> CR FFT + void *real_ptr, *comp_ptr, *res_ptr; + cout << "allocate memory" << endl; + real_ptr = base.allocateMemory(sizereal, ierr); + res_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + cout << "write data" << endl; + base.writeData(real_ptr, rdata, sizereal); + + cout << "perform fft" << endl; + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize); + base.callNormalizeC2RFFT(res_ptr, dim, dimsize); + + cout << "read data" << endl; + base.readData(res_ptr, ordata, sizereal); + + compareData(rdata, ordata, N1, 3); + + base.freeMemory(real_ptr, sizereal); + base.freeMemory(res_ptr, sizereal); + base.freeMemory< complex >(comp_ptr, sizecomp); + + //Test CC FFT + void *mem_ptr; + mem_ptr = base.allocateMemory< complex >(sizereal, ierr); + base.writeData< complex >(mem_ptr, cdata, sizereal); + base.callFFT(mem_ptr, 3, dimsize); + base.callIFFT(mem_ptr, 3, dimsize); + base.callNormalizeFFT(mem_ptr, 3, dimsize); + base.readData< complex >(mem_ptr, codata, sizereal); + + compareData(cdata, codata, N1, 3); + + base.freeMemory< complex > (mem_ptr, sizereal); + + delete[] rdata; + delete[] ordata; + delete[] cdata; + delete[] codata; + +} + +void compareData(complex* data1, complex* data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + +void compareData(double* data1, double* data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id] - data2[id]); + } + } + } + cout << "Size " << N << " RC <--> CR diff: " << sum << endl; +} + +void initData(complex *data, int dimsize[3], int dim) { + if (dim == 3) { + for (int i = 0; i < dimsize[2]; i++) + for (int j = 0; j < dimsize[1]; j++) + for (int k = 0; k < dimsize[0]; k++) + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = complex(sin(k), 0.0); + } else if (dim == 2) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[j*dimsize[0] + k] = complex(sin(k), 0.0); + } + } + } else { + for (int k = 0; k < dimsize[0]; k++) + data[k] = complex(sin(k), 0.0); + } +} + +void initData(double *data, int dimsize[3], int dim) { + if (dim == 3) { + for (int i = 0; i < dimsize[2]; i++) + for (int j = 0; j < dimsize[1]; j++) + for (int k = 0; k < dimsize[0]; k++) + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k); + } else if (dim == 2) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[j*dimsize[0] + k] = sin(k); + } + } + } else { + for (int k = 0; k < dimsize[0]; k++) + data[k] = sin(k); + } +} + +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim, + char *api_name, char *device_name) +{ + + for (int i = 1; i < argc; i++) { + + if ( argv[i] == std::string("-dim")) { + dim = atoi(argv[i + 1]); + i++; + } + 
+ if ( argv[i] == std::string("-grid") ) { + N1 = atoi(argv[i + 1]); + N2 = atoi(argv[i + 2]); + N3 = atoi(argv[i + 3]); + i += 3; + } + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + } + + return false; +} + diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h index b16e5f6..2711a4d 100644 --- a/src/Algorithms/FFT.h +++ b/src/Algorithms/FFT.h @@ -6,7 +6,7 @@ #include "../DKSDefinitions.h" -class DKSFFT { +class FFT { protected: int defaultN[3]; @@ -22,7 +22,7 @@ protected: public: - virtual ~DKSFFT() { } + virtual ~FFT() { } virtual int setupFFT(int ndim, int N[3]) = 0; virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d2edeb7..fb15e27 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,12 +35,12 @@ ENDMACRO () SET (DKS_BASEDIR_HDRS DKSBase.h DKSDefinitions.h - DKSOPAL.h + DKSFFT.h ) SET (DKS_BASEDIR_SRCS DKSBase.cpp - DKSOPAL.cpp + DKSFFT.cpp ) #add opal to DKS if enable_opal is set diff --git a/src/CUDA/CMakeLists.txt b/src/CUDA/CMakeLists.txt index e75647c..93d8dc4 100644 --- a/src/CUDA/CMakeLists.txt +++ b/src/CUDA/CMakeLists.txt @@ -1,9 +1,9 @@ -SET (_HDRS CudaBase.cuh) -SET (_SRCS CudaBase.cu) +SET (_HDRS CudaBase.cuh CudaFFT.cuh) +SET (_SRCS CudaBase.cu CudaFFT.cu) IF (ENABLE_OPAL) - SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh) - SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu) + SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh) + SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu) ENDIF (ENABLE_OPAL) IF 
(ENABLE_MUSR) diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh index 0c22f2c..88e8486 100644 --- a/src/CUDA/CudaFFT.cuh +++ b/src/CUDA/CudaFFT.cuh @@ -10,7 +10,7 @@ #include "../Algorithms/FFT.h" #include "CudaBase.cuh" -class CudaFFT : public DKSFFT{ +class CudaFFT : public FFT { private: diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h index 30f2d89..9f640b6 100644 --- a/src/DKSBaseMuSR.h +++ b/src/DKSBaseMuSR.h @@ -8,6 +8,7 @@ #include "AutoTuning/DKSAutoTuningTester.h" #include "DKSBase.h" +#include "DKSFFT.h" #include "Algorithms/ChiSquareRuntime.h" @@ -19,7 +20,7 @@ #include "OpenCL/OpenCLChiSquareRuntime.h" #endif -class DKSBaseMuSR : public DKSBase { +class DKSBaseMuSR : public DKSFFT { private: diff --git a/src/DKSFFT.cpp b/src/DKSFFT.cpp index 9942fc2..02746ef 100644 --- a/src/DKSFFT.cpp +++ b/src/DKSFFT.cpp @@ -4,7 +4,7 @@ DKSFFT::DKSFFT() { dksfft = nullptr; } -~DKSFFT::DKSFFT() { +DKSFFT::~DKSFFT() { delete dksfft; } @@ -12,8 +12,10 @@ DKSFFT::DKSFFT() { int DKSFFT::setupFFT(int ndim, int N[3]) { if (apiCuda()) { + dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) ); return dksfft->setupFFT(ndim, N); } else if (apiOpenCL()) { + dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) ); int ierr1 = dksfft->setupFFT(ndim, N); int ierr2 = dksfft->setupFFTRC(ndim, N); int ierr3 = dksfft->setupFFTCR(ndim, N); @@ -24,6 +26,7 @@ int DKSFFT::setupFFT(int ndim, int N[3]) { } else if (apiOpenMP()) { //micbase.mic_setupFFT(ndim, N); //BENI: setting up RC and CR transformations on MIC + dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) ); int ierr1 = dksfft->setupFFTRC(ndim, N, 1.); int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])); if (ierr1 != DKS_SUCCESS) diff --git a/src/DKSFFT.h b/src/DKSFFT.h index c13fff5..4931e65 100644 --- a/src/DKSFFT.h +++ b/src/DKSFFT.h @@ -1,5 +1,5 @@ -#ifndef H_DKS_FFT -#define H_DKS_FFT +#ifndef H_DKSBASE_FFT +#define H_DKSBASE_FFT #include #include "AutoTuning/DKSAutoTuning.h" @@ -18,7 +18,6 @@ #ifdef 
DKS_CUDA #include "CUDA/CudaFFT.cuh" - #endif #ifdef DKS_MIC @@ -29,7 +28,7 @@ class DKSFFT : public DKSBase { private: - DKSFFT *dksfft; + FFT *dksfft; int initFFT(); @@ -105,3 +104,5 @@ public: }; + +#endif diff --git a/src/DKSOPAL.cpp b/src/DKSOPAL.cpp index 1b8ca9e..39d8077 100644 --- a/src/DKSOPAL.cpp +++ b/src/DKSOPAL.cpp @@ -11,7 +11,6 @@ DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) { } DKSOPAL::~DKSOPAL() { - delete dksfft; delete dkscol; delete dksgreens; } @@ -21,17 +20,14 @@ int DKSOPAL::setupOPAL() { if (apiOpenCL()) { ierr = OPENCL_SAFECALL( DKS_SUCCESS ); //TODO: only enable if AMD libraries are available - dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) ); dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) ); dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) ); } else if (apiCuda()) { ierr = CUDA_SAFECALL( DKS_SUCCESS ); - dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) ); dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) ); dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) ); } else if (apiOpenMP()) { ierr = MIC_SAFECALL( DKS_SUCCESS ); - dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) ); dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) ); dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) ); } else { diff --git a/src/DKSOPAL.h b/src/DKSOPAL.h index feee92f..0577f2d 100644 --- a/src/DKSOPAL.h +++ b/src/DKSOPAL.h @@ -5,6 +5,7 @@ #include "AutoTuning/DKSAutoTuning.h" #include "DKSBase.h" +#include "DKSFFT.h" #include "DKSDefinitions.h" @@ -32,11 +33,10 @@ #include "MIC/MICCollimatorPhysics.h" #endif -class DKSOPAL : public DKSBase { +class DKSOPAL : public DKSFFT { private: - DKSFFT *dksfft; DKSCollimatorPhysics *dkscol; GreensFunction *dksgreens; diff --git a/src/MIC/CMakeLists.txt b/src/MIC/CMakeLists.txt index 11c1480..ab90c1e 100644 --- a/src/MIC/CMakeLists.txt +++ b/src/MIC/CMakeLists.txt @@ -1,11 +1,10 @@ -SET 
(_SRCS MICBase.cpp) -SET (_HDRS MICBase.h) +SET (_SRCS MICBase.cpp MICFFT.cpp) +SET (_HDRS MICBase.h MICFFT.h) IF (ENABLE_OPAL) SET (_SRCS ${_SRCS} MICChiSquare.cpp - MICFFT.cpp MICGreensFunction.cpp MICCollimatorPhysics.cpp ) @@ -13,7 +12,6 @@ IF (ENABLE_OPAL) SET (_HDRS ${_HDRS} MICChiSquare.h - MICFFT.h MICCollimatorPhysics.h MICGreensFunction.hpp MICMergeSort.h diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h index ade95c7..fd585d6 100644 --- a/src/MIC/MICFFT.h +++ b/src/MIC/MICFFT.h @@ -10,7 +10,7 @@ #include "../Algorithms/FFT.h" #include "MICBase.h" -class MICFFT : public DKSFFT { +class MICFFT : public FFT { private: diff --git a/src/OpenCL/CMakeLists.txt b/src/OpenCL/CMakeLists.txt index f3bf3f7..18f14c8 100644 --- a/src/OpenCL/CMakeLists.txt +++ b/src/OpenCL/CMakeLists.txt @@ -4,6 +4,25 @@ SET (_HDRS OpenCLBase.h) SET (_SRCS OpenCLBase.cpp) SET (_KERNELS "") +IF (ENABLE_AMD) + SET (_SRCS + ${_SRCS} + OpenCLFFT.cpp + ) + + SET (_HDRS + ${_HDRS} + OpenCLFFT.h + ) + + SET (_KERNELS + ${_KERNELS} + OpenCLKernels/OpenCLFFT.cl + OpenCLKernels/OpenCLFFTStockham.cl + OpenCLKernels/OpenCLTranspose.cl + ) +ENDIF (ENABLE_AMD) + IF (ENABLE_MUSR) SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h) SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp) @@ -13,23 +32,18 @@ ENDIF (ENABLE_MUSR) IF (ENABLE_AMD AND ENABLE_OPAL) SET (_SRCS ${_SRCS} - OpenCLFFT.cpp OpenCLCollimatorPhysics.cpp OpenCLGreensFunction.cpp ) SET (_HDRS ${_HDRS} - OpenCLFFT.h OpenCLCollimatorPhysics.h OpenCLGreensFunction.h ) SET (_KERNELS ${_KERNELS} - OpenCLKernels/OpenCLFFT.cl - OpenCLKernels/OpenCLFFTStockham.cl - OpenCLKernels/OpenCLTranspose.cl OpenCLKernels/OpenCLCollimatorPhysics.cl OpenCLKernels/OpenCLGreensFunction.cl ) diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h index dfa7772..b73420e 100644 --- a/src/OpenCL/OpenCLFFT.h +++ b/src/OpenCL/OpenCLFFT.h @@ -22,7 +22,7 @@ #include "clFFT.h" -class OpenCLFFT : public DKSFFT { +class OpenCLFFT : public FFT { private: @@ -112,10 +112,9 @@ 
public: int streamId = -1); int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); - int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) - { - return DKS_ERROR; - } + int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { + return DKS_ERROR; + } //void printData3DN4(cl_double2* &data, int N);