snapshot of svn

2016-10-10 14:49:32 +02:00
commit 4fa529aaea
122 changed files with 23153 additions and 0 deletions
--- a/test/testGatherAsync.cpp
+++ b/test/testGatherAsync.cpp
@ -0,0 +1,144 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+/**
+ * Print an N x N x N block of integers to stdout.
+ *
+ * Values are read from a flat array in the order i*N*N + j*N + k. Each run
+ * of N consecutive values goes on one line, every value followed by a tab,
+ * and an additional empty line is written after every N lines.
+ *
+ * @param data    flat array that is read at indices 0 .. N*N*N - 1
+ * @param N       extent of each of the three dimensions
+ * @param message text written verbatim before the values; nothing is
+ *                written for it when it is the empty string
+ */
+void printData3D(int* data, int N, const char *message = "") {
+  const bool hasMessage = (message[0] != '\0');
+  if (hasMessage)
+    cout << message;
+
+  // The three nested indices walk the array strictly sequentially, so a
+  // single running cursor visits the same elements in the same order.
+  const int *cursor = data;
+  for (int plane = 0; plane < N; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *cursor++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+/**
+ * Print a table with nprocs rows of N integers each to stdout.
+ *
+ * Row i is read from data[i*N] .. data[i*N + N - 1]; every value is
+ * followed by a tab and every row is terminated with a newline.
+ *
+ * @param data    flat array that is read at indices 0 .. nprocs*N - 1
+ * @param N       number of values per row
+ * @param nprocs  number of rows
+ * @param message text written verbatim before the table; nothing is
+ *                written for it when it is the empty string
+ */
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  if (*message != '\0')
+    cout << message;
+
+  for (int proc = 0; proc < nprocs; ++proc) {
+    const int rowStart = proc * N;
+    for (int col = 0; col < N; ++col)
+      cout << data[rowStart + col] << "\t";
+    cout << endl;
+  }
+}
+
+/**
+ * Set the first N entries of data to rank + 1.
+ *
+ * @param data  array to fill; entries 0 .. N-1 are overwritten
+ * @param N     number of entries to write (nothing is written if N <= 0)
+ * @param rank  value the fill constant is derived from
+ */
+void initData(int *data, int N, int rank) {
+  const int fillValue = rank + 1;
+  int pos = 0;
+  while (pos < N) {
+    data[pos] = fillValue;
+    ++pos;
+  }
+}
+
+/**
+ * Test driver for the asynchronous 3D gather.
+ *
+ * Every rank fills a host buffer of n = 32*16*16 ints with (rank + 1) and
+ * then runs two iterations. In each iteration rank 0 allocates two device
+ * buffers, starts gather3DDataAsync, calls callGreensIntegral with the
+ * stream created in the first iteration while the gather is pending,
+ * waits for the gather request and frees the device buffers. All other
+ * ranks only take part in the gather. All ranks meet at an MPI barrier at
+ * the end of every iteration. On rank 0 CUDA profiling is started in the
+ * first iteration and stopped in the second.
+ *
+ * NOTE(review): idx/idy/idz hold four entries and the 32x32x32 global grid
+ * is four times the 32x16x16 local block, so this presumably expects to be
+ * launched with exactly 4 MPI ranks -- confirm before using another count.
+ *
+ * @return 0 on normal completion; the job is terminated through MPI_Abort
+ *         if a device allocation on rank 0 reports an error.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  //mpi copy
+  int n = 32*16*16;
+  int N_global[3] = {32, 32, 32};
+  int N_local[3] = {32, 16, 16};
+  int idx[4] = {0, 0, 0, 0};
+  int idy[4] = {0, 0, 16, 16};
+  int idz[4] = {0, 16, 0, 16};
+
+  //greens kernel
+  int n1 = 33;
+  int n2 = 33;
+  int n3 = 17;
+  int sizegreen = n1*n2*n3;
+
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  // Remember which allocator produced hdata_in so that it is released with
+  // the matching deallocator at the end.
+  int *hdata_in = NULL;
+  bool pinned = true;
+  if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
+    hdata_in = new int[n];
+    pinned = false;
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, n, rank);
+
+  int stream2 = 0;
+  for (int i = 0; i < 2; i++) {
+
+    if (rank == 0) {
+      if (i == 0) {
+        cudaProfilerStart();
+        base.createStream(stream2);
+      }
+
+      nvtxMarkA("start gather");
+
+      void *mem_ptr, *green_ptr;
+
+      // Preset the status so the check below is well defined even if the
+      // library only writes ierr when something goes wrong.
+      ierr = DKS_SUCCESS;
+      mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+      if (ierr != DKS_SUCCESS) {
+        cerr << "device allocation for gather buffer failed!" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      ierr = DKS_SUCCESS;
+      green_ptr = base.allocateMemory<int>(sizegreen, ierr);
+      if (ierr != DKS_SUCCESS) {
+        cerr << "device allocation for greens buffer failed!" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      nvtxMarkA("call gather");
+      // MPI_REQUEST_NULL makes the later MPI_Wait a no-op should the gather
+      // return without handing back an active request.
+      MPI_Request request = MPI_REQUEST_NULL;
+      MPI_Status status;
+
+      base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
+                             idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
+                             request);
+
+
+      nvtxMarkA("call kernel");
+      base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1,
+                              4.160715e-03, 4.474911e-03, 1.247311e-02, stream2);
+
+      MPI_Wait(&request, &status);
+
+
+      base.freeMemory<int>(mem_ptr, n*nprocs);
+      base.freeMemory<int>(green_ptr, sizegreen);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("end gather");
+
+      if (i == 1) cudaProfilerStop();
+    } else {
+
+      MPI_Request request = MPI_REQUEST_NULL;
+      base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local,
+                             idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
+                             request);
+
+      // A barrier does not complete nonblocking operations; finish the
+      // request explicitly so hdata_in is no longer in use by MPI when it
+      // is reused in the next iteration or freed below.
+      MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+  }
+
+  // Release the host buffer with the deallocator matching its allocation.
+  if (pinned)
+    base.freeHostMemory(hdata_in, n);
+  else
+    delete[] hdata_in;
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+