206 lines
4.7 KiB
C++
206 lines
4.7 KiB
C++
#include <iostream>
|
|
#include <mpi.h>
|
|
#include <string.h>
|
|
|
|
#include "nvToolsExt.h"
|
|
#include "cuda_profiler_api.h"
|
|
#include "DKSBase.h"
|
|
|
|
using namespace std;
|
|
|
|
|
|
/*
 * Dump an N x N x N cube of ints, stored contiguously in data, to stdout.
 *
 * Element (i, j, k) lives at data[i*N*N + j*N + k], so the cube is walked
 * front to back with a single running cursor. Each value is followed by a
 * tab, each (i, j) row is terminated by a newline, and every i-slab is
 * followed by one extra blank line.
 *
 * data    - array holding at least N*N*N ints
 * N       - edge length of the cube (nothing but the header is printed for N <= 0)
 * message - optional header written verbatim before the values; skipped when empty
 */
void printData3D(int* data, int N, const char *message = "") {
    // An empty header string means "no header".
    if (message[0] != '\0') {
        std::cout << message;
    }

    const int *cursor = data;
    for (int slab = 0; slab < N; ++slab) {
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col) {
                std::cout << *cursor++ << "\t";
            }
            std::cout << std::endl;   // end of one row
        }
        std::cout << std::endl;       // blank line between slabs
    }
}
|
|
|
|
/*
 * Dump an nx x ny x nz block of ints, stored contiguously in data, to stdout.
 *
 * The outer loop runs over nz, the middle over ny and the inner over nx;
 * element (i, j, k) of that nest is read from data[i*ny*nx + j*nx + k].
 * Each value is followed by a tab, each inner row ends with a newline, and
 * every outer plane is followed by one extra blank line.
 *
 * data       - array holding at least nx*ny*nz ints
 * nx, ny, nz - extents of the inner, middle and outer loop respectively
 * message    - optional header written verbatim before the values; skipped when empty
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
    // An empty header string means "no header".
    if (message[0] != '\0') {
        std::cout << message;
    }

    for (int plane = 0; plane < nz; ++plane) {
        for (int row = 0; row < ny; ++row) {
            // Offset of the first element of this row: plane*ny*nx + row*nx.
            const int rowBase = (plane * ny + row) * nx;
            for (int col = 0; col < nx; ++col) {
                std::cout << data[rowBase + col] << "\t";
            }
            std::cout << std::endl;   // end of one row
        }
        std::cout << std::endl;       // blank line between planes
    }
}
|
|
|
|
|
|
/*
 * Print the first nprocs*N ints of data on a single tab-separated line,
 * followed by a blank line.
 *
 * data    - array holding at least nprocs*N ints
 * N       - number of elements per process
 * nprocs  - number of per-process chunks stored back to back in data
 * message - optional header written verbatim before the values; skipped when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
    // An empty header string means "no header".
    if (message[0] != '\0') {
        std::cout << message;
    }

    const int total = nprocs * N;
    int idx = 0;
    while (idx < total) {
        std::cout << data[idx] << "\t";
        ++idx;
    }

    std::cout << std::endl << std::endl;
}
|
|
|
|
/*
 * Fill the first N entries of data with the value rank + 1, so that every
 * rank's buffer carries a distinct, non-zero marker.
 *
 * data - destination array with room for at least N ints
 * N    - number of entries to fill (nothing is written for N <= 0)
 * rank - caller's rank; the stored value is rank + 1
 */
void initData(int *data, int N, int rank) {
    const int marker = rank + 1;
    int *dst = data;
    int remaining = N;
    while (remaining-- > 0) {
        *dst++ = marker;
    }
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
|
|
int ierr;
|
|
int rank, nprocs;
|
|
|
|
MPI_Init(&argc, &argv);
|
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
|
|
|
//cout << "Rank " << (rank+1) << " from " << nprocs << endl;
|
|
|
|
int Ng[3] = {128, 128, 64};
|
|
int Nl[3] = {128, 64, 32};
|
|
int nglobal = Ng[0] * Ng[1] * Ng[2];
|
|
int nlocal = Nl[0] * Nl[1] * Nl[2];
|
|
|
|
DKSBase base = DKSBase();
|
|
base.setAPI("Cuda", 4);
|
|
base.setDevice("-gpu", 4);
|
|
base.initDevice();
|
|
|
|
int *hdata_in;
|
|
if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
|
|
hdata_in = new int[nlocal];
|
|
cout << "pinned allocation failed!" << endl;
|
|
}
|
|
initData(hdata_in, nlocal, rank);
|
|
|
|
int *hdata_out;
|
|
if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
|
|
hdata_out = new int[nlocal];
|
|
cout << "pinned allocation failed!" << endl;
|
|
}
|
|
|
|
//create streams for async execution
|
|
int stream1, stream2;
|
|
base.createStream(stream1);
|
|
base.createStream(stream2);
|
|
|
|
if (rank == 0)
|
|
base.setupFFT(3, Ng);
|
|
|
|
for (int i = 0; i < 1; i++) {
|
|
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
if (i == 1)
|
|
nvtxMarkA("start gather");
|
|
|
|
if (rank == 0) {
|
|
|
|
int id[3] = {0, 0, 0};
|
|
|
|
void *mem_ptr, *tmpgreen_ptr, *comp_ptr;
|
|
|
|
//allocate memory on device
|
|
int sizegreen = 65 * 65 * 33;
|
|
int sizecomp = 65 * 128 * 64;
|
|
mem_ptr = base.allocateMemory<double>(nglobal, ierr);
|
|
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
|
|
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
|
|
|
//send pointer to other processes
|
|
nvtxMarkA("call gather");
|
|
for (int j = 1; j < nprocs; j++)
|
|
base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);
|
|
|
|
//call another kernel while data transfer is processing
|
|
nvtxMarkA("call green");
|
|
base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);
|
|
|
|
//write data to device
|
|
base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
|
|
|
|
/* execute rcfft */
|
|
//base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);
|
|
|
|
base.syncDevice();
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
//read data from device
|
|
base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
|
|
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
base.syncDevice();
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
|
|
base.freeMemory<double>(mem_ptr, nglobal);
|
|
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
|
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
|
|
|
} else {
|
|
|
|
|
|
void *mem_ptr;
|
|
int idy = 0;
|
|
int idz = 0;//Nl[2]*rank;
|
|
if (rank / 2 == 1) idy = Ng[1] / 2;
|
|
if (rank % 2 == 1) idz = Ng[2] / 2;
|
|
int id[3] = {0, idy, idz};
|
|
|
|
nvtxMarkA("call gather");
|
|
mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);
|
|
base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
|
|
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
|
|
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
base.closeHandle(mem_ptr);
|
|
|
|
}
|
|
|
|
int sum1 = 0;
|
|
for (int c = 0; c < nlocal; c++)
|
|
sum1 += hdata_in[c];
|
|
|
|
int sum2 = 0;
|
|
for (int c = 0; c < nlocal; c++)
|
|
sum2 += hdata_out[c];
|
|
|
|
cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;
|
|
|
|
|
|
if (i == 1)
|
|
nvtxMarkA("end gather");
|
|
|
|
}
|
|
|
|
//printData(hdata_in, nlocal, 1);
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
base.freeHostMemory(hdata_in, nlocal);
|
|
//delete[] hdata_in;
|
|
|
|
MPI_Finalize();
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
|
|
|