#include #include #include #include "nvToolsExt.h" #include "cuda_profiler_api.h" #include "DKSBase.h" using namespace std; void printData3D(int* data, int N, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) { cout << data[i*N*N + j*N + k] << "\t"; } cout << endl; } cout << endl; } } void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < nz; i++) { for (int j = 0; j < ny; j++) { for (int k = 0; k < nx; k++) { cout << data[i*ny*nx + j*nx + k] << "\t"; } cout << endl; } cout << endl; } } void printData(int *data, int N, int nprocs, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < nprocs*N; i++) cout << data[i] << "\t"; cout << endl << endl; } void initData(int *data, int N, int rank) { for (int i = 0; i < N; i++) data[i] = (rank+1); } int main(int argc, char *argv[]) { int ierr; int rank, nprocs; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); //cout << "Rank " << (rank+1) << " from " << nprocs << endl; int Ng[3] = {128, 128, 64}; int Nl[3] = {128, 64, 32}; int nglobal = Ng[0] * Ng[1] * Ng[2]; int nlocal = Nl[0] * Nl[1] * Nl[2]; DKSBase base = DKSBase(); base.setAPI("Cuda", 4); base.setDevice("-gpu", 4); base.initDevice(); int *hdata_in; if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) { hdata_in = new int[nlocal]; cout << "pinned allocation failed!" << endl; } initData(hdata_in, nlocal, rank); int *hdata_out; if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) { hdata_out = new int[nlocal]; cout << "pinned allocation failed!" << endl; } //create streams for async execution int stream1, stream2; base.createStream(stream1); base.createStream(stream2); if (rank == 0) base.setupFFT(3, Ng); for (int i = 0; i < 1; i++) { MPI_Barrier(MPI_COMM_WORLD); if (i == 1) nvtxMarkA("start gather"); if (rank == 0) { int id[3] = {0, 0, 0}; void *mem_ptr, *tmpgreen_ptr, *comp_ptr; //allocate memory on device int sizegreen = 65 * 65 * 33; int sizecomp = 65 * 128 * 64; mem_ptr = base.allocateMemory(nglobal, ierr); tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); //send pointer to other processes nvtxMarkA("call gather"); for (int j = 1; j < nprocs; j++) base.sendPointer(mem_ptr, j, MPI_COMM_WORLD); //call another kernel while data transfer is processing nvtxMarkA("call green"); base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2); //write data to device base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); /* execute rcfft */ //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng); base.syncDevice(); MPI_Barrier(MPI_COMM_WORLD); //read data from device base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); MPI_Barrier(MPI_COMM_WORLD); base.syncDevice(); MPI_Barrier(MPI_COMM_WORLD); base.freeMemory(mem_ptr, nglobal); base.freeMemory(tmpgreen_ptr, sizegreen); base.freeMemory< complex >(comp_ptr, sizecomp); } else { void *mem_ptr; int idy = 0; int idz = 0;//Nl[2]*rank; if (rank / 2 == 1) idy = Ng[1] / 2; if (rank % 2 == 1) idz = Ng[2] / 2; int id[3] = {0, idy, idz}; nvtxMarkA("call gather"); mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr); base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); MPI_Barrier(MPI_COMM_WORLD); base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); base.closeHandle(mem_ptr); } int sum1 = 0; for (int c = 0; c < nlocal; c++) sum1 += hdata_in[c]; int sum2 = 0; for (int c = 0; c < nlocal; c++) sum2 += hdata_out[c]; cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl; if (i == 1) nvtxMarkA("end gather"); } //printData(hdata_in, nlocal, 1); MPI_Barrier(MPI_COMM_WORLD); base.freeHostMemory(hdata_in, nlocal); //delete[] hdata_in; MPI_Finalize(); return 0; }