#include #include #include #include #include #include "Utility/TimeStamp.h" #include "DKSBase.h" using namespace std; void initData(double *data, int dimsize[3]) { for (int i = 0; i < dimsize[2]; i++) { for (int j = 0; j < dimsize[1]; j++) { for (int k = 0; k < dimsize[0]; k++) { data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; } } } } int main(int argc, char *argv[]) { int N = 8; if (argc == 2) N = atoi(argv[1]); int N1 = N; int N2 = N; int N3 = N; int dim = 3; int dimsize[3] = {N3, N2, N1}; int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1); double *data1 = new double[sizereal]; double *data2 = new double[sizereal]; initData(data1, dimsize); initData(data2, dimsize); /* init DKSBase */ cout << "Init device and set function" << endl; DKSBase base; base.setAPI("Cuda", 4); base.setDevice("-gpu", 4); base.initDevice(); base.setupFFT(3, dimsize); /* pagelock data */ base.allocateHostMemory(data1, sizereal); base.allocateHostMemory(data2, sizereal); /* create streams */ int fft1, fft2; base.createStream(fft1); base.createStream(fft2); int ierr; void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2; cout << "allocating memory ..." << endl; /* allocate memory on device */; real_ptr1 = base.allocateMemory(sizereal, ierr); real_ptr2 = base.allocateMemory(sizereal, ierr); comp_ptr1 = base.allocateMemory< complex >(sizecomp*2, ierr); comp_ptr2 = base.allocateMemory< complex >(sizecomp*2, ierr); cufftHandle defaultPlan; cudaStream_t cfft1, cfft2; cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z); cudaStreamCreate(&cfft1); cudaStreamCreate(&cfft2); for (int i = 0; i < 5; i++) { cufftHandle plan = defaultPlan; cout << "Iteration: " << i << endl; /* write data to device */ base.writeDataAsync(real_ptr1, data1, sizereal, fft1); //cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1); /* execute rcfft */ base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1); //cufftSetStream(plan, cfft1); //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2); /* write data to device */ base.writeDataAsync(real_ptr2, data2, sizereal, fft2); //cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2); /* execute rcfft */ base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2); //cufftSetStream(plan, cfft2); //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2); } base.freeMemory(real_ptr1, sizereal); base.freeMemory(real_ptr2, sizereal); base.freeMemory< complex >(comp_ptr1, sizereal); base.freeMemory< complex >(comp_ptr2, sizereal); /* free pagelock data */ base.freeHostMemory(data1, sizereal); base.freeHostMemory(data2, sizereal); return 0; }