#include #include #include #include "DKSBase.h" #include "nvToolsExt.h" #include "cuda_profiler_api.h" #include "cuda_runtime.h" using namespace std; void printData3D(double* data, int N, int NI, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < NI; i++) { for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) { cout << data[i*N*N + j*N + k] << "\t"; } cout << endl; } cout << endl; } } void initData(double *data, int N) { for (int i = 0; i < N/4 + 1; i++) { for (int j = 0; j < N/2 + 1; j++) { for (int k = 0; k < N/2 + 1; k++) { data[i*N*N + j*N + k] = k+1; } } } } void initData2(double *data, int N) { for (int i = 0; i < N; i++) data[i] = i; } void initComplex( complex *d, int N) { for (int i = 0; i < N; i++) { d[i] = complex(2, 0); } } void printComplex(complex *d, int N) { for (int i = 0; i < N; i++) cout << d[i] << "\t"; cout << endl; } void initMirror(double *data, int n1, int n2, int n3) { int d = 1; for (int i = 0; i < n3; i++) { for (int j = 0; j < n2; j++) { for (int k = 0; k < n1; k++) { if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) data[i * n2 * n1 + j * n1 + k] = d++; else data[i * n2 * n1 + j * n1 + k] = 0; } } } } void printDiv(int c) { for (int i = 0; i < c; i++) cout << "-"; cout << endl; } void printMirror(double *data, int n1, int n2, int n3) { printDiv(75); for (int i = 0; i < n3; i++) { for (int j = 0; j < n2; j++) { for (int k = 0; k < n1; k++) { cout << data[i * n2 * n1 + j * n1 + k] << "\t"; } cout << endl; } cout << endl; } cout << endl; } double sumData(double *data, int datasize) { double sum = 0; for (int i = 0; i < datasize; i++) sum += data[i]; return sum; } int main(int argc, char *argv[]) { /* mpi init */ int rank, nprocs; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (nprocs != 8) { cout << "example was set to run with 8 processes" << endl; cout << "exit..." << endl; return 0; } /* set domain size */ int NG[3] = {64, 64, 32}; int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; int sizerho = NG[0] * NG[1] * NG[2]; int sizegreen = ng[0] * ng[1] * ng[2]; int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; int id[3]; id[0] = 0; id[1] = NL[1] * (rank % 4); id[2] = NL[2] * (rank / 4); /* print some messages bout the example in the begginig */ if (rank == 0) { cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; int tmp[3]; for (int p = 1; p < nprocs; p++) { MPI_Status mpistatus; MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus); cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl; } } else { MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD); } /* dks init and create 2 streams */ int dkserr; int streamGreens, streamFFT; DKSBase base;// = DKSBase(); base.setAPI("Cuda", 4); base.setDevice("-gpu", 4); base.initDevice(); base.createStream(streamFFT); if (rank == 0) { base.createStream(streamGreens); base.setupFFT(3, NG); } /* allocate memory and init rho field */ double *rho = new double[sizerho]; double *rho_out = new double[sizerho]; //double *green_out = new double[sizegreen]; initMirror(rho, NL[0], NL[1], NL[2]); /* allocate memory on device for - rho field - rho FFT - tmpgreen - greens integral - greens integral FFT */ void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; if (rank == 0) { tmpgreen_ptr = base.allocateMemory(sizegreen, dkserr); rho2_ptr = base.allocateMemory(sizerho, dkserr); grn_ptr = base.allocateMemory(sizerho, dkserr); rho2tr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); } else { grntr_ptr = NULL; rho2_ptr = NULL; grn_ptr = NULL; rho2tr_ptr = NULL; tmpgreen_ptr = NULL; } /* send and receive pointer to allocated memory on device */ if (rank == 0) { for (int p = 1; p < nprocs; p++) base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); } else { rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr); } MPI_Barrier(MPI_COMM_WORLD); /* =================================================*/ /* =================================================*/ /* =====loop trough fftpoison solver iterations=====*/ /* =================================================*/ /* =================================================*/ double old_sum = 0; double tmp_sum = 0; for (int l = 0; l < 10000; l++) { MPI_Barrier(MPI_COMM_WORLD); /* on node 0, calculate tmpgreen on gpu */ int hr_m[3] = {1, 1, 1}; if (rank == 0) base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2], streamGreens); /* calculate greens integral on gpu */ if (rank == 0) base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens); /* mirror the field */ if (rank == 0) base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens); /* get FFT of mirrored greens integral */ if (rank == 0) base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens); /* transfer rho field to device */ base.gather3DDataAsync ( rho2_ptr, rho, NG, NL, id, streamFFT); MPI_Barrier(MPI_COMM_WORLD); /* get FFT of rho field */ if (rank == 0) { base.syncDevice(); base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); } /* multiply both FFTs */ if (rank == 0) base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); MPI_Barrier(MPI_COMM_WORLD); /* inverse fft and transfer data back */ /* multiple device syncs and mpi barriers are used to make sure data transfer is started when results are ready and progam moves on only when data transfer is finished */ if (rank == 0) { base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); base.syncDevice(); MPI_Barrier(MPI_COMM_WORLD); base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); MPI_Barrier(MPI_COMM_WORLD); base.syncDevice(); MPI_Barrier(MPI_COMM_WORLD); //cout << "result: " << sumData(rho_out, sizerho) << endl; if (l == 0) { old_sum = sumData(rho_out, sizerho); } else { tmp_sum = sumData(rho_out, sizerho); if (old_sum != tmp_sum) { cout << "diff in iteration: " << l << endl; } } } else { MPI_Barrier(MPI_COMM_WORLD); base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); } } /* =================================================*/ /* =================================================*/ /* ==========end fftpoison solver test run==========*/ /* =================================================*/ /* =================================================*/ /* free memory on device */ if (rank == 0) { base.freeMemory(tmpgreen_ptr, sizegreen); base.freeMemory(grn_ptr, sizerho); base.freeMemory< complex >(rho2tr_ptr, sizecomp); base.freeMemory< complex >(grntr_ptr, sizecomp); MPI_Barrier(MPI_COMM_WORLD); base.freeMemory(rho2_ptr, sizerho); cout << "Final sum: " << old_sum << endl; } else { base.closeHandle(rho2_ptr); MPI_Barrier(MPI_COMM_WORLD); } MPI_Finalize(); }