#include #include #include #include "nvToolsExt.h" #include "cuda_profiler_api.h" #include "DKSBase.h" using namespace std; void printData3D(int* data, int N, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) { cout << data[i*N*N + j*N + k] << "\t"; } cout << endl; } cout << endl; } } void printData(int *data, int N, int nprocs, const char *message = "") { if (strcmp(message, "") != 0) cout << message; for (int i = 0; i < nprocs; i++) { for (int j = 0; j < N; j++) cout << data[i*N + j] << "\t"; cout << endl; } } void initData(int *data, int N, int rank) { for (int i = 0; i < N; i++) data[i] = (rank+1); } int main(int argc, char *argv[]) { int ierr; int rank, nprocs; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); cout << "Rank " << (rank+1) << " from " << nprocs << endl; //mpi copy int n = 32*16*16; int N_global[3] = {32, 32, 32}; int N_local[3] = {32, 16, 16}; int idx[4] = {0, 0, 0, 0}; int idy[4] = {0, 0, 16, 16}; int idz[4] = {0, 16, 0, 16}; //greens kernel int n1 = 33; int n2 = 33; int n3 = 17; int sizegreen = n1*n2*n3; DKSBase base = DKSBase(); base.setAPI("Cuda", 4); base.setDevice("-gpu", 4); base.initDevice(); int *hdata_in; if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { hdata_in = new int[n]; cout << "pinned allocation failed!" << endl; } initData(hdata_in, n, rank); int stream2; for (int i = 0; i < 2; i++) { if (rank == 0) { if (i == 0) { cudaProfilerStart(); base.createStream(stream2); } nvtxMarkA("start gather"); void *mem_ptr, *green_ptr; mem_ptr = base.allocateMemory(nprocs*n, ierr); green_ptr = base.allocateMemory(sizegreen, ierr); nvtxMarkA("call gather"); MPI_Request request; MPI_Status status; base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, request); nvtxMarkA("call kernel"); base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1, 4.160715e-03, 4.474911e-03, 1.247311e-02, stream2); MPI_Wait(&request, &status); base.freeMemory(mem_ptr, n*nprocs); base.freeMemory(green_ptr, sizegreen); MPI_Barrier(MPI_COMM_WORLD); nvtxMarkA("end gather"); if (i == 1) cudaProfilerStop(); } else { MPI_Request request; base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local, idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, request); MPI_Barrier(MPI_COMM_WORLD); } } base.freeHostMemory(hdata_in, n); MPI_Finalize(); return 0; }