DKS/test/testGatherAsync2.cpp

#include <iostream>
#include <mpi.h>
#include <string.h>

#include "nvToolsExt.h"
#include "cuda_profiler_api.h"
#include "DKSBase.h"

using namespace std;


/**
 * Dump an N x N x N cube of integers to standard output.
 *
 * The cube is read from `data` as N*N*N consecutive values, with the first
 * loop index varying slowest. Every value is followed by a tab, every row by
 * a newline, and every N x N plane by an additional blank line.
 *
 * @param data    pointer to at least N*N*N integers
 * @param N       edge length of the cube
 * @param message optional header written before the data; skipped when empty
 */
void printData3D(int* data, int N, const char *message = "") {
  // An empty string means "no header".
  if (message[0] != '\0')
    std::cout << message;

  // The elements are visited in storage order, so a single cursor replaces
  // the explicit i*N*N + j*N + k index computation.
  const int *cursor = data;
  for (int plane = 0; plane < N; ++plane) {
    for (int row = 0; row < N; ++row) {
      for (int col = 0; col < N; ++col, ++cursor)
        std::cout << *cursor << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}

/**
 * Dump an nx x ny x nz box of integers to standard output.
 *
 * Values are read from `data` with x varying fastest, then y, then z.
 * Each value is followed by a tab, each x-row by a newline, and each
 * z-slice by an additional blank line.
 *
 * @param data    pointer to at least nx*ny*nz integers
 * @param nx      extent of the fastest-varying dimension
 * @param ny      extent of the middle dimension
 * @param nz      extent of the slowest-varying dimension
 * @param message optional header written before the data; skipped when empty
 */
void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {

  // An empty string means "no header".
  if (message[0] != '\0')
    std::cout << message;

  // Running offset into the flat array; equals z*ny*nx + y*nx + x throughout.
  int offset = 0;
  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      for (int x = 0; x < nx; ++x)
        std::cout << data[offset++] << "\t";
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}


/**
 * Print nprocs*N integers from `data` on a single tab-separated line,
 * followed by one blank line.
 *
 * @param data    pointer to at least nprocs*N integers
 * @param N       number of values per process
 * @param nprocs  number of processes whose values are stored back to back
 * @param message optional header written before the data; skipped when empty
 */
void printData(int *data, int N, int nprocs, const char *message = "") {
  // An empty string means "no header".
  if (message[0] != '\0')
    std::cout << message;

  const int *next = data;
  int remaining = nprocs * N;
  while (remaining-- > 0)
    std::cout << *next++ << "\t";

  std::cout << std::endl;
  std::cout << std::endl;

}

/**
 * Fill the first N entries of `data` with the value rank + 1.
 *
 * @param data buffer with room for at least N integers
 * @param N    number of entries to set
 * @param rank rank of the calling process; the stored value is rank + 1
 */
void initData(int *data, int N, int rank) {
  const int fill = rank + 1;
  int idx = 0;
  while (idx < N) {
    data[idx] = fill;
    ++idx;
  }
}

int main(int argc, char *argv[]) {

  int ierr;
  int rank, nprocs;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  //cout << "Rank " << (rank+1) << " from " << nprocs << endl;

  int Ng[3] = {128, 128, 64};
  int Nl[3] = {128, 64, 32};
  int nglobal = Ng[0] * Ng[1] * Ng[2];
  int nlocal = Nl[0] * Nl[1] * Nl[2];

  DKSBase base = DKSBase();
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();

  int *hdata_in;
  if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
    hdata_in = new int[nlocal];
    cout << "pinned allocation failed!" << endl;
  }
  initData(hdata_in, nlocal, rank);

  int *hdata_out;
  if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
    hdata_out = new int[nlocal];
    cout << "pinned allocation failed!" << endl;
  }

  //create streams for async execution
  int stream1, stream2;
  base.createStream(stream1);
  base.createStream(stream2);

  if (rank == 0)
    base.setupFFT(3, Ng);

  for (int i = 0; i < 1; i++) {

    MPI_Barrier(MPI_COMM_WORLD);
    if (i == 1)
      nvtxMarkA("start gather");

    if (rank == 0) {

      int id[3] = {0, 0, 0};

      void *mem_ptr, *tmpgreen_ptr, *comp_ptr;

      //allocate memory on device
      int sizegreen = 65 * 65 * 33;
      int sizecomp = 65 * 128 * 64;
      mem_ptr = base.allocateMemory<double>(nglobal, ierr);
      tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
      comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);

      //send pointer to other processes
      nvtxMarkA("call gather");
      for (int j = 1; j < nprocs; j++)
	base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);

      //call another kernel while data transfer is processing
      nvtxMarkA("call green");
      base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);

      //write data to device
      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);

      /* execute rcfft */
      //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);

      base.syncDevice();
      MPI_Barrier(MPI_COMM_WORLD);

      //read data from device
      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);

      MPI_Barrier(MPI_COMM_WORLD);
      base.syncDevice();
      MPI_Barrier(MPI_COMM_WORLD);


      base.freeMemory<double>(mem_ptr, nglobal);
      base.freeMemory<double>(tmpgreen_ptr, sizegreen);
      base.freeMemory< complex<double> >(comp_ptr, sizecomp);

    } else {


      void *mem_ptr;
      int idy = 0;
      int idz = 0;//Nl[2]*rank;
      if (rank / 2 == 1) idy = Ng[1] / 2;
      if (rank % 2 == 1) idz = Ng[2] / 2;
      int id[3] = {0, idy, idz};

      nvtxMarkA("call gather");
      mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);
      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);

      MPI_Barrier(MPI_COMM_WORLD);

      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);

      MPI_Barrier(MPI_COMM_WORLD);

      MPI_Barrier(MPI_COMM_WORLD);

      base.closeHandle(mem_ptr);

    }

    int sum1 = 0;
    for (int c = 0; c < nlocal; c++)
      sum1 += hdata_in[c];

    int sum2 = 0;
    for (int c = 0; c < nlocal; c++)
      sum2 += hdata_out[c];

    cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;


    if (i == 1)
      nvtxMarkA("end gather");

  }

  //printData(hdata_in, nlocal, 1);
  MPI_Barrier(MPI_COMM_WORLD);
  base.freeHostMemory(hdata_in, nlocal);
  //delete[] hdata_in;

  MPI_Finalize();
  return 0;
}