#include #include #include "DKSBase.h" #include "nvToolsExt.h" #include "cuda_profiler_api.h" #include "cuda_runtime.h" using namespace std; void printData3D(double* data, int N, int NI, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < NI; i++) { for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) { cout << data[i*N*N + j*N + k] << "\t"; } cout << endl; } cout << endl; } } void initData(double *data, int N) { for (int i = 0; i < N/4 + 1; i++) { for (int j = 0; j < N/2 + 1; j++) { for (int k = 0; k < N/2 + 1; k++) { data[i*N*N + j*N + k] = k+1; } } } } void initData2(double *data, int N) { for (int i = 0; i < N; i++) data[i] = i; } void initComplex( complex *d, int N) { for (int i = 0; i < N; i++) { d[i] = complex(2, 0); } } void printComplex(complex *d, int N) { for (int i = 0; i < N; i++) cout << d[i] << "\t"; cout << endl; } void printDouble(double *d, int N) { for (int i = 0; i < N; i++) cout << d[i] << ", "; cout << endl; } void initMirror(double *data, int n1, int n2, int n3) { int d = 1; for (int i = 0; i < n3; i++) { for (int j = 0; j < n2; j++) { for (int k = 0; k < n1; k++) { if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) data[i * n2 * n1 + j * n1 + k] = d++; else data[i * n2 * n1 + j * n1 + k] = 0; } } } } void printDiv(int c) { for (int i = 0; i < c; i++) cout << "-"; cout << endl; } void printMirror(double *data, int n1, int n2, int n3) { printDiv(75); for (int i = 0; i < n3; i++) { for (int j = 0; j < n2; j++) { for (int k = 0; k < n1; k++) { cout << data[i * n2 * n1 + j * n1 + k] << "\t"; } cout << endl; } cout << endl; } cout << endl; } double sumData(double *data, int datasize) { double sum = 0; for (int i = 0; i < datasize; i++) sum += data[i]; return sum; } int main(int argc, char *argv[]) { char *api_name = new char[10]; char *device_name = new char[10]; for (int i = 1; i < argc; i++) { if (argv[i] == string("-cuda")) { strcpy(api_name, "Cuda"); strcpy(device_name, "-gpu"); } if (argv[i] == string("-opencl")) { strcpy(api_name, "OpenCL"); strcpy(device_name, "-gpu"); } if (argv[i] == string("-mic")) { strcpy(api_name, "OpenMP"); strcpy(device_name, "-mic"); } if (argv[i] == string("-cpu")) { strcpy(api_name, "OpenCL"); strcpy(device_name, "-cpu"); } } cout << "Use api: " << api_name << ", " << device_name << endl; /* set domain size */ int NG[3] = {64, 64, 32}; int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; int sizerho = NG[0] * NG[1] * NG[2]; int sizegreen = ng[0] * ng[1] * ng[2]; int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; /* print some messages bout the example in the begginig */ cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; /* dks init and create 2 streams */ int dkserr; DKSBase base; base.setAPI(api_name, strlen(api_name)); base.setDevice(device_name, strlen(device_name)); base.initDevice(); base.setupFFT(3, NG); /* allocate memory and init rho field */ double *rho = new double[sizerho]; double *rho_out = new double[sizerho]; //double *green_out = new double[sizegreen]; double *mirror_out = new double[sizerho]; //initMirror(rho, NL[0], NL[1], NL[2]); initMirror(rho, NG[0], NG[1], NG[2]); /* allocate memory on device for - rho field - rho FFT - tmpgreen - greens integral - greens integral FFT */ void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; tmpgreen_ptr = base.allocateMemory(sizegreen, dkserr); rho2_ptr = base.allocateMemory(sizerho, dkserr); grn_ptr = base.allocateMemory(sizerho, dkserr); rho2tr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); /* =================================================*/ /* =================================================*/ /* =====loop trough fftpoison solver iterations=====*/ /* =================================================*/ /* =================================================*/ double old_sum = 0; int hr_m[3] = {1, 1, 1}; base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]); /* calculate greens integral on gpu */ base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); /* mirror the field */ base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); /* base.readData(grn_ptr, mirror_out, sizerho); for (int i = 0; i < sizerho; i++) cout << mirror_out[i] << " "; cout << endl << endl; for (int i = 0; i < sizerho; i++) cout << rho[i] << " "; cout << endl << endl; */ /* transfer rho field to device */ base.writeData(rho2_ptr, rho, sizerho); /* get FFT of rho field */ base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); /* get FFT of mirrored greens integral */ base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG); /* multiply both FFTs */ base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); /* complex *crho = new complex[sizecomp]; complex *cgre = new complex[sizecomp]; base.readData< complex >(rho2tr_ptr, crho, sizecomp); base.readData< complex >(grntr_ptr, cgre, sizecomp); for (int i = 0; i < sizecomp; i++) cout << cgre[i].real() << " "; cout << endl << endl; for (int i = 0; i < sizecomp; i++) cout << crho[i].real() << " "; cout << endl << endl; delete[] crho; delete[] cgre; */ /* inverse fft and transfer data back */ /* multiple device syncs and mpi barriers are used to make sure data transfer is started when results are ready and progam moves on only when data transfer is finished */ base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); base.readData (rho2_ptr, rho_out, sizerho); for (int i = 0; i < 10; i++) cout << rho_out[i] << " "; cout << endl; old_sum = sumData(rho_out, sizerho); /* =================================================*/ /* =================================================*/ /* ==========end fftpoison solver test run==========*/ /* =================================================*/ /* =================================================*/ base.freeMemory(tmpgreen_ptr, sizegreen); base.freeMemory(grn_ptr, sizerho); base.freeMemory< complex >(rho2tr_ptr, sizecomp); base.freeMemory< complex >(grntr_ptr, sizecomp); base.freeMemory(rho2_ptr, sizerho); delete[] rho_out; delete[] rho; delete[] mirror_out; cout << "Final sum: " << old_sum << endl; }