#include #include #include #include "nvToolsExt.h" #include "cuda_profiler_api.h" #include "DKSBase.h" using namespace std; void printData3D(int* data, int N, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) { cout << data[i*N*N + j*N + k] << "\t"; } cout << endl; } cout << endl; } } void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < nz; i++) { for (int j = 0; j < ny; j++) { for (int k = 0; k < nx; k++) { cout << data[i*ny*nx + j*nx + k] << "\t"; } cout << endl; } cout << endl; } } void printData(int *data, int N, int nprocs, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < nprocs; i++) { for (int j = 0; j < N; j++) cout << data[i*N + j] << "\t"; cout << endl; } } void initData(int *data, int N, int rank) { for (int i = 0; i < N; i++) data[i] = (rank+1); } int main(int argc, char *argv[]) { int ierr; int rank, nprocs; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); cout << "Rank " << (rank+1) << " from " << nprocs << endl; int N_global[3] = {64, 64, 32}; int N_local[3] = {64, 32, 16}; int n = N_local[0] * N_local[1] * N_local[2]; int idx[4] = {0, 0, 0, 0}; int idy[4] = {0, 32, 0, 32}; int idz[4] = {0, 0, 16, 16}; DKSBase base = DKSBase(); base.setAPI("Cuda", 4); base.setDevice("-gpu", 4); base.initDevice(); int *hdata_in; if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { hdata_in = new int[n]; cout << "pinned allocation failed!" << endl; } initData(hdata_in, n, rank); for (int i = 0; i < 2; i++) { MPI_Barrier(MPI_COMM_WORLD); if (i == 1) nvtxMarkA("start gather"); if (rank == 0) { void *mem_ptr, *tmpgreen_ptr; mem_ptr = base.allocateMemory(nprocs*n, ierr); //call another kernel int sizegreen = 33 * 33 * 17; tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); nvtxMarkA("call green"); base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007); nvtxMarkA("call gather"); base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); //read and print data once for debug only /* if (i == 0 && nprocs*n < 257) { int *hdata_out_all = new int[nprocs*n]; base.readData(mem_ptr, hdata_out_all, n*nprocs); printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]); } else { int *hout_data = new int[nprocs*n]; base.readData(mem_ptr, hout_data, nprocs*n); int sum = 0; for (int s = 0; s < nprocs*n; s++) sum += hout_data[s]; cout << "Sum: " << sum << endl; } */ MPI_Barrier(MPI_COMM_WORLD); nvtxMarkA("call scatter"); base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); base.freeMemory(mem_ptr, n*nprocs); base.freeMemory(tmpgreen_ptr, sizegreen); } else { nvtxMarkA("call gather"); base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); nvtxMarkA("call scatter"); base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); } if (i == 1) nvtxMarkA("end gather"); } MPI_Barrier(MPI_COMM_WORLD); base.freeHostMemory(hdata_in, n); MPI_Finalize(); return 0; }