snapshot of svn

2016-10-10 14:49:32 +02:00
commit 4fa529aaea
122 changed files with 23153 additions and 0 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -0,0 +1,84 @@
+# Add src/ to the include search path (the tests include "DKSBase.h" and "Utility/..." headers).
+INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
+
+# Add src/ to the linker search path (the test targets below link against "dks").
+LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
+
+#ADD_EXECUTABLE(testDKS testDKS.cpp)
+#ADD_EXECUTABLE(testChi testChi.cpp)
+#ADD_EXECUTABLE(testFFT testFFT.cpp)
+#ADD_EXECUTABLE(testMIC testMIC.cpp)
+#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
+#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
+#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
+#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
+#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
+#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
+#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp)
+#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp)
+#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp)
+#ADD_EXECUTABLE(testOffset testOffset.cpp)
+#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp)
+#ADD_EXECUTABLE(testMPI testMPI.cpp)
+#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp)
+#ADD_EXECUTABLE(testGather testGather.cpp)
+#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
+#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
+ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
+#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
+#ADD_EXECUTABLE(testPush testPush.cpp)
+#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
+#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
+
+#shared library
+#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp)
+
+
+#TARGET_LINK_LIBRARIES(testDKS dks)
+#TARGET_LINK_LIBRARIES(testChi dks)
+#TARGET_LINK_LIBRARIES(testFFT dks)
+#TARGET_LINK_LIBRARIES(testMIC dks)
+#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
+#TARGET_LINK_LIBRARIES(testFFT3D dks)
+#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
+#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
+#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
+#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
+#TARGET_LINK_LIBRARIES(testStockFFT3D dks)
+#TARGET_LINK_LIBRARIES(testMemObjects dks)
+#TARGET_LINK_LIBRARIES(testRCFFT dks)
+#TARGET_LINK_LIBRARIES(testOffset dks)
+#TARGET_LINK_LIBRARIES(testOffsetMPI dks)
+#TARGET_LINK_LIBRARIES(testMPI dks)
+#TARGET_LINK_LIBRARIES(testMPIFFT dks)
+#TARGET_LINK_LIBRARIES(testGather dks)
+#TARGET_LINK_LIBRARIES(testGatherAsync dks)
+#TARGET_LINK_LIBRARIES(testTranspose dks)
+TARGET_LINK_LIBRARIES(testCollimatorPhysics dks)
+#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
+#TARGET_LINK_LIBRARIES(testPush dks)
+#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
+#TARGET_LINK_LIBRARIES(testIntegration dks)
+#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
+
+
+#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared)
+
+
+#IF (${COMPILER_NAME} STREQUAL "mpicxx")
+   #ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp)
+   #ADD_EXECUTABLE(testGreens testGreens.cpp)
+   #ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp)
+   #ADD_EXECUTABLE(testCollimatorPhysicsMPI testCollimatorPhysicsMPI.cpp)
+   #TARGET_LINK_LIBRARIES(testGatherAsync2 dks)
+   #TARGET_LINK_LIBRARIES(testGreens dks)
+   #TARGET_LINK_LIBRARIES(testFFTSolver dks)
+   #TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks)	
+#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
+
+#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp)
+#TARGET_LINK_LIBRARIES(testChiSquare dks)
+
+#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
+  #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+  #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
+#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
--- a/test/testChi.cpp
+++ b/test/testChi.cpp
@ -0,0 +1,141 @@
+#include <iostream>
+#include <complex>
+#include <cstdlib>
+
+#include "DKSBase.h"
+#include "Utility/TimeStamp.h"
+
+using namespace std;
+
+/**
+ * Chi^2 pipeline smoke test for DKSBase.
+ *
+ * Usage: testChi [api_name [device_name]]
+ *   - one argument:  api from argv[1], device "-gpu"
+ *   - two arguments: api from argv[1], device from argv[2]
+ *   - anything else: api "OpenCL", device "-gpu"
+ *
+ * Fills a host data set and a parameter set, uploads them to the device and
+ * runs callNt -> callChi2 -> callSum five times, printing the single value
+ * read back after each pass, then prints the elapsed wall time.
+ *
+ * @return 0 always; DKS error codes (ierr) are not inspected here.
+ */
+int main(int argc, char *argv[]) {
+
+  //pick the source strings first so the buffers can be sized to fit them
+  const char *api_arg = "OpenCL";
+  const char *device_arg = "-gpu";
+  if (argc == 3) {
+    api_arg = argv[1];
+    device_arg = argv[2];
+  } else if (argc == 2) {
+    api_arg = argv[1];
+  }
+
+  //buffers hold the full string plus the terminating NUL
+  //(the old fixed sizes of 10 and 4 overflowed, e.g. "-gpu" needs 5 bytes)
+  char *api_name = new char[strlen(api_arg) + 1];
+  char *device_name = new char[strlen(device_arg) + 1];
+  strcpy(api_name, api_arg);
+  strcpy(device_name, device_arg);
+
+  cout << "Use api: " << api_name << endl;
+
+  cout << "Begin DKS Base tests" << endl;
+
+  /* init data */
+  int ierr;
+  int nsize = 4000000;
+  int jsize = 16;
+  int psize = 6;
+  double *data = new double[nsize*jsize];
+  double *p = new double[psize*jsize];
+  double data_out = 0;
+
+  srand(time(NULL));
+  for (int i = 0; i < nsize*jsize; i++) {
+    //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+    //data[i] = sign*(double)rand()/RAND_MAX;
+    data[i] = (double)i / (nsize*jsize);
+    //data[i] = 1;
+  }
+  for (int i = 0; i < psize*jsize; i++) {
+    //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+    //p[i] = sign*(double)rand()/RAND_MAX;
+    p[i] = (double)i / (nsize*jsize);
+    //p[i] = 1;
+  }
+  /* end init */
+
+  timestamp_t tstart, tend;
+  //timestamp_t t0, t1;
+
+  tstart = get_timestamp();
+
+  //init dks base class, set the requested API and init connection with the device
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  //ptrs to hold reference to device memory
+  void *dptr, *ntptr, *pptr;
+
+  //allocate memory on device
+  //t0 = get_timestamp();
+  dptr = base.allocateMemory<double>(nsize*jsize, ierr);
+  ntptr = base.allocateMemory<double>(nsize*jsize, ierr);
+  pptr = base.allocateMemory<double>(psize*jsize, ierr);
+  //t1 = get_timestamp();
+  //cout << "Allocate memory: " << get_secs(t0, t1) << endl;
+
+  //write data to device
+  //t0 = get_timestamp();
+  base.writeData<double>(dptr, data, nsize*jsize);
+  //t1 = get_timestamp();
+  //cout << "Write data set: " << get_secs(t0, t1) << endl << endl;
+
+  for (int i = 0; i < 5; i++) {
+    //write parameters to device
+    //t0 = get_timestamp();
+    base.writeData<double>(pptr, p, psize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Write parameters: " << get_secs(t0, t1) << endl;
+
+    //set function to calcNt and execute it with necessary parameters
+    //t0 = get_timestamp();
+    base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);
+    //t1 = get_timestamp();
+
+    //cout << "Calc N(t): " << get_secs(t0, t1) << endl;
+
+    //set function to chi2 and execute it with necessary parameters
+    //(ntptr is passed as first and third argument, as in the original test)
+    //t0 = get_timestamp();
+    base.callChi2<double>(ntptr, dptr, ntptr, nsize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Calc chi^2: " << get_secs(t0, t1) << endl;
+
+    //set function to sum and execute it with necessary parameters
+    //t0 = get_timestamp();
+    base.callSum<double>(ntptr, ntptr, nsize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Calc sum: " << get_secs(t0, t1) << endl;
+
+    //read calculated sum (one value)
+    //t0 = get_timestamp();
+    base.readData<double>(ntptr, &data_out, 1);
+    //t1 = get_timestamp();
+    //cout << "Read sum: " << get_secs(t0, t1) << endl;
+    cout << "Sum nt: " << data_out << endl;
+
+    /*
+      for (int i = 0; i < psize*jsize; i++) {
+      int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+      p[i] = sign*(double)rand()/RAND_MAX;
+      }
+    */
+
+    //cout << endl;
+  }
+
+  //free device memory
+  //t0 = get_timestamp();
+  base.freeMemory<double>(dptr, nsize*jsize);
+  base.freeMemory<double>(ntptr, nsize*jsize);
+  base.freeMemory<double>(pptr, psize*jsize);
+  //t1 = get_timestamp();
+  //cout << "Free memory: " << get_secs(t0, t1) << endl;
+
+  tend = get_timestamp();
+
+  cout << endl  << "time: " << get_secs(tstart, tend) << endl;
+
+  //release host buffers (previously leaked)
+  delete[] p;
+  delete[] data;
+  delete[] device_name;
+  delete[] api_name;
+
+  return 0;
+}
+
+
--- a/test/testChiSquare.cpp
+++ b/test/testChiSquare.cpp
@ -0,0 +1,168 @@
+#include <iostream>
+#include <vector>
+#include "DKSBase.h"
+
+using namespace std;
+
+/**
+ * Append the sequence 0, 1, ..., length-1 to every row of v.
+ * Existing elements of a row are kept; nothing is appended when length <= 0.
+ */
+void initData(vector< vector<double> > &v, int length) {
+  for (vector< vector<double> >::iterator row = v.begin(); row != v.end(); ++row) {
+    int value = 0;
+    while (value < length) {
+      row->push_back(value);
+      ++value;
+    }
+  }
+}
+
+
+/**
+ * Print v to stdout: one line per row, every value followed by a tab.
+ */
+void printData(vector< vector<double> > &v) {
+  for (vector< vector<double> >::size_type r = 0; r != v.size(); ++r) {
+    const vector<double> &row = v[r];
+    for (vector<double>::const_iterator it = row.begin(); it != row.end(); ++it)
+      cout << *it << "\t";
+    cout << endl;
+  }
+}
+
+/**
+ * Fill a flat sensors x length array (row-major) so that each row holds
+ * 0, 1, ..., length-1.
+ */
+void initData(double *data, int sensors, int length) {
+  for (int s = 0; s < sensors; ++s) {
+    int k = 0;
+    while (k < length) {
+      data[s*length + k] = k;
+      ++k;
+    }
+  }
+}
+
+
+/**
+ * Print a flat sensors x length array (row-major) to stdout: one line per
+ * sensor, every value followed by a tab.
+ */
+void printData(double *data, int sensors, int length) {
+  for (int s = 0; s < sensors; ++s) {
+    int k = 0;
+    while (k < length) {
+      cout << data[s*length + k] << "\t";
+      ++k;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Set par[i] = i / npar for i in [0, npar).
+ */
+void initPar(double *par, int npar) {
+  int idx = 0;
+  while (idx < npar) {
+    par[idx] = static_cast<double>(idx) / npar;
+    ++idx;
+  }
+}
+
+/**
+ * Print a divider line made of `size` '=' characters followed by a newline.
+ * A non-positive size prints just the newline.
+ */
+void printDiv(int size) {
+  for (int remaining = size; remaining > 0; --remaining)
+    cout << "=";
+  cout << endl;
+}
+
+/**
+ * Host reference computation of chi^2, printed to stdout.
+ *
+ * For every row i and bin j the theory value is
+ *   par[2+4i] * exp(-t/tau) * (1 + par[3+4i] * exp(-0.5*(par[1]*t)^2)
+ *                                   * cos(w*t + par[4+4i]*deg2rad)) + par[5+4i]
+ * with t = dt0 + fTimeResolution*fRebin*j and w = par[0]*0.08516155035269027.
+ * Each per-bin contribution is printed (tab separated, one line per row),
+ * followed by "Chisq: <total>".
+ *
+ * @param fData            one row per sensor; taken by const reference so the
+ *                         data set is no longer copied on every call
+ * @param par              parameter array; must hold at least 4*rows + 2 values
+ * @param fTimeResolution  time width of one unbinned sample
+ * @param fRebin           rebinning factor
+ */
+void calcChisq(const vector< vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
+{
+
+  double chisq = 0.0;
+  double theo, data;
+  const double tau=2.197019;
+  const double dt0 = fTimeResolution*0.5*(fRebin-1);
+  double time;
+  double w = par[0]*0.08516155035269027;
+
+  unsigned int i, j;
+
+  for (i=0; i<fData.size(); i++) {
+    //bound by this row's own length; using fData[0].size() read past the end of shorter rows
+    for (j=0; j<fData[i].size(); j++) {
+      data = fData[i][j];
+      time = dt0+fTimeResolution*fRebin*j;
+
+      theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4];
+      //empty bins cannot be used as the variance estimate, so fall back to theo^2
+      const double term = (data != 0.0) ? (theo-data)*(theo-data)/data : theo*theo;
+      chisq += term;
+      cout << term << "\t";
+    }
+    cout << endl;
+  }
+
+  cout << "Chisq: " << chisq << endl;
+
+}
+
+
+/**
+ * Compares the device chi^2 kernel (callPHistoTFFcn) with the host reference
+ * calcChisq on a small synthetic data set (5 sensors x 10 bins).
+ *
+ * Usage: testChiSquare [1]
+ *   - argument "1" selects the OpenCL backend, anything else uses Cuda.
+ *
+ * @return 0 always; DKS error codes (ierr) are not inspected here.
+ */
+int main(int argc, char *argv[]) {
+
+  bool useCuda = true;
+  if (argc == 2 && atoi(argv[1]) == 1)
+    useCuda = false;
+
+  int ierr;
+  int sensors = 5;
+  int length = 10;
+  int npar = 4 * sensors + 2;
+  int ndata = sensors * length;
+
+  //initialised so a failing device call cannot make us print garbage
+  double result = 0.0;
+
+  double fTimeResolution = 0.05;
+  double fRebin = 5;
+
+  double *par = new double[npar];
+  initPar(par, npar);
+
+  vector< vector< double > > fData;
+  fData.resize(sensors);
+  initData(fData, length);
+  printData(fData);
+  printDiv(75);
+
+  DKSBase dksbase;
+  if (useCuda)
+    dksbase.setAPI("Cuda", 4);
+  else
+    dksbase.setAPI("OpenCL", 6);
+  dksbase.setDevice("-gpu", 4);
+  dksbase.initDevice();
+  dksbase.setupFFT(0, NULL);
+
+
+  void *mem_data, *mem_par, *mem_chisq;
+  cout << "Allocate memory" << endl;
+  mem_par = dksbase.allocateMemory<double>(npar, ierr);
+  mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
+  mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
+
+
+  cout << "Write data" << endl;
+  dksbase.writeData<double>(mem_par, par, npar);
+  //upload row by row; the last argument is the element offset inside mem_data
+  for (int i = 0; i < sensors; i++)
+    dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);
+
+
+  cout << "Call PHistoTFFcn" << endl;
+  dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq,
+			  fTimeResolution, fRebin,
+			  sensors, length, npar, result);
+  cout << "Result: " << result << endl;
+
+
+  double *out_data = new double[ndata];
+  dksbase.readData<double>(mem_chisq, out_data, ndata);
+  printDiv(75);
+  printData(out_data, sensors, length);
+  printDiv(75);
+
+  calcChisq(fData, par, fTimeResolution, fRebin);
+  printDiv(75);
+
+  cout << "Free memory" << endl;
+  dksbase.freeMemory<double>(mem_par, npar);
+  dksbase.freeMemory<double>(mem_data, ndata);
+  dksbase.freeMemory<double>(mem_chisq, ndata);
+
+  //release host buffers (previously leaked)
+  delete[] out_data;
+  delete[] par;
+
+  return 0;
+
+}
--- a/test/testChiSquareRT.cpp
+++ b/test/testChiSquareRT.cpp
@ -0,0 +1,193 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <omp.h>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+/**
+ * Fill data[0..N) either with 1.0 (ones == true) or with rand()/RAND_MAX.
+ * rand() is only called when ones is false.
+ */
+void initData(double *data, int N, bool ones = false) {
+  int idx = 0;
+  while (idx < N) {
+    data[idx] = ones ? 1.0 : (double)rand() / RAND_MAX;
+    ++idx;
+  }
+}
+
+/**
+ * Print data[0..N) on one line of stdout, every value followed by a tab.
+ */
+template <typename T>
+void printData(T *data, int N) {
+  T *const last = data + (N > 0 ? N : 0);
+  for (T *cur = data; cur != last; ++cur)
+    std::cout << *cur << "\t";
+  std::cout << std::endl;
+}
+
+
+// Theory expression handed to DKSBaseMuSR::callCompileProgram() in main.
+// fTheory() in this file evaluates the same expression on the host, so the
+// two must be kept in sync when switching to one of the alternatives below.
+const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])";
+//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
+//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
+//const std::string funct = "p[m[1]] + p[m[0]]";
+
+/**
+ * Host evaluation of the theory function cos(t*p[0]) - exp(-t*p[m[0]]).
+ *
+ * @param time  time t
+ * @param par   parameter array p
+ * @param func  unused by this expression
+ * @param map   index map m; only map[0] is read
+ */
+double fTheory(double time, double *par, double *func, int *map) {
+  const double oscillation = cos(time*par[0]);
+  const double decay = exp(-time*par[map[0]]);
+  return oscillation - decay;
+}
+
+/**
+ * Serial host chi^2 over the bins [startTimeBin, endTimeBin).
+ *
+ * For bin i: t = i*timeStep, theo = N0*exp(-t/tau)*(1 + fTheory(t, ...)) + bkg,
+ * and (data[i] - theo)^2 / data[i] is added to the sum.
+ * data[i] is used as the divisor without a zero check.
+ *
+ * @return the accumulated chi^2
+ */
+double testFunctionSerial(double *data, double *par, double *func, int *map,
+                          double N0, double tau, double bkg, double timeStep,
+                          int startTimeBin, int endTimeBin)
+{
+  double total = 0;
+  int bin = startTimeBin;
+  while (bin < endTimeBin) {
+    const double t = bin * timeStep;
+    const double expected = N0 * exp(-t/tau) * (1.0 + fTheory(t, par, func, map)) + bkg;
+    const double delta = data[bin] - expected;
+
+    total += delta * delta / data[bin];
+    ++bin;
+  }
+
+  return total;
+}
+
+/**
+ * OpenMP host chi^2 over the bins [startTimeBin, endTimeBin).
+ *
+ * Same per-bin formula as the serial version: t = i*timeStep,
+ * theo = N0*exp(-t/tau)*(1 + fTheory(t, ...)) + bkg, contribution
+ * (data[i] - theo)^2 / data[i]. The contributions are combined with an
+ * OpenMP sum reduction, so the result may differ from the serial sum in the
+ * last bits because of summation order.
+ *
+ * @return the accumulated chi^2
+ */
+double testFunctionParallel(double *data, double *par, double *func, int *map,
+			    double N0, double tau, double bkg, double timeStep,
+			    int startTimeBin, int endTimeBin)
+{
+  int i, chunk;
+  double time, diff, theo;
+  double chisq = 0;
+
+  //dynamic schedule chunk: one share per processor, but never less than 10 bins
+  chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  //theo must be private as well: with default(shared) it was written by all threads at once (data race)
+#pragma omp parallel for default(shared) private (i,time,diff,theo) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
+  for (i = startTimeBin; i < endTimeBin; ++i) {
+    time = i * timeStep;
+    theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
+    diff = data[i] - theo;
+
+    chisq += diff * diff / data[i];
+  }
+
+  return chisq;
+}
+  
+/**
+ * Runtime-compiled chi^2 benchmark: evaluates the same chi^2 on the host
+ * (serial) and on the device (DKSBaseMuSR) and prints results and timings.
+ * The OpenMP variant is currently disabled, so its result is reported as 0.
+ *
+ * Usage: testChiSquareRT [Ndata [api]]
+ *   - Ndata: number of data points (default 8, must be positive)
+ *   - api:   1 selects Cuda (default), any other value selects OpenCL
+ *
+ * @return 0 on success; exits with EXIT_FAILURE on invalid Ndata or when
+ *         callLaunchChiSquare reports an error.
+ */
+int main(int argc, char *argv[]) {
+
+  int Loop = 100;
+
+  //init test data on the host
+  int Ndata = 8;
+  if (argc  > 1)
+    Ndata = atoi(argv[1]);
+
+  //atoi yields 0 for garbage and may be negative; both are unusable as array sizes
+  if (Ndata < 1) {
+    std::cerr << "Ndata must be a positive integer" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  int api = 1;
+  if (argc > 2)
+    api = atoi(argv[2]);
+
+  int Npar = 66;
+  int Nfunc = 1;
+  int Nmap = 4;
+
+  double *data = new double[Ndata];
+  double *par = new double[Npar];
+  double *func = new double[Nfunc];
+  int *map = new int[Nmap];
+
+  initData(data, Ndata);
+  initData(par, Npar);
+  initData(func, Nfunc);
+  map[0] = 1;
+  map[1] = 2;
+  map[2] = 3;
+  map[3] = 4;
+
+  //create timers
+  DKSTimer serialTimer;
+  DKSTimer cudaTimer;
+  DKSTimer ompTimer;
+  DKSTimer gpuOverhead;
+  serialTimer.init("Serial timer");
+  cudaTimer.init("Cuda timer");
+  ompTimer.init("OpenMP timer");
+  gpuOverhead.init("Overhead for gpu");
+
+
+  //serial version
+  double resultSerial = 0.0;
+
+  serialTimer.start();
+  for (int i = 0; i < Loop; i++)
+    resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
+  serialTimer.stop();
+
+  //openmp version (disabled; the timer is still started/stopped so it prints)
+  double resultOMP = 0.0;
+
+  ompTimer.start();
+  //for (int i = 0; i < Loop; i++)
+  //  resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
+  ompTimer.stop();
+
+
+  //create and init dksbase
+  gpuOverhead.start();
+
+  DKSBaseMuSR dksbase;
+  if (api == 1)
+    dksbase.setAPI("Cuda");
+  else
+    dksbase.setAPI("OpenCL");
+
+  dksbase.setDevice("-gpu");
+  dksbase.initDevice();
+  dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);
+
+  //allocate memory on the device
+  int ierr;
+  void *data_ptr;
+
+  data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+
+  dksbase.writeData<double>(data_ptr, data, Ndata);
+  dksbase.writeFunctions(func, Nfunc);
+  dksbase.writeMaps(map, Nmap);
+
+  dksbase.callCompileProgram(funct);
+  gpuOverhead.stop();
+
+  double resultCuda = 0.0;
+
+  cudaTimer.start();
+  for (int i = 0; i < Loop; i++) {
+    dksbase.writeParams(par, Npar);
+    //reuse the outer ierr instead of shadowing it with a second declaration
+    ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
+				       0.0, 0.1, 0, resultCuda);
+
+    if (ierr != 0)
+      exit (EXIT_FAILURE);
+
+  }
+  cudaTimer.stop();
+
+  std::cout << std::endl;
+  std::cout << "=======================Results=======================" << std::endl;
+  std::cout << "Result serial  = " << resultSerial << std::endl;
+  std::cout << "Result prallel = " << resultOMP << std::endl;
+  std::cout << "Result cuda    = " << resultCuda << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "=======================Timings=======================" << std::endl;
+  serialTimer.print();
+  ompTimer.print();
+  cudaTimer.print();
+  gpuOverhead.print();
+  std::cout << std::endl;
+
+  dksbase.freeMemory<double>(data_ptr, Ndata);
+
+  //release host buffers (previously leaked)
+  delete[] map;
+  delete[] func;
+  delete[] par;
+  delete[] data;
+
+  return 0;
+
+
+}
--- a/test/testCollimatorPhysics.cpp
+++ b/test/testCollimatorPhysics.cpp
@ -0,0 +1,248 @@
+#include <iostream>
+
+#include <vector>
+#include <sys/time.h>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+
+using namespace std;
+
+// Per-particle record used by this test. Arrays of PART_SMALL are copied to
+// and from the device as raw memory (writeData/readData<PART_SMALL> in main),
+// so the member order and types must not be changed casually.
+// NOTE(review): presumably mirrors a device-side struct in the DKS library — confirm.
+typedef struct {
+  int label;          // set to 0 by initPartSmall
+  unsigned localID;   // set to the particle index by initPartSmall
+  double Rincol[3];   // presumably position (x, y, z) — confirm
+  double Pincol[3];   // presumably momentum (x, y, z) — confirm
+} PART_SMALL;
+
+// Plain 3-component vector of doubles. Only the init/print helpers in this
+// file use it; main does not.
+typedef struct {
+  double x;
+  double y;
+  double z;
+} Vector;
+
+/**
+ * Build the start particle with local id d: label 0, Rincol = (0, 0, 0.02),
+ * Pincol = (0, 0, 3.9920183237269791e-01).
+ */
+PART_SMALL initPartSmall(int d) {
+
+  PART_SMALL particle;
+  particle.label = 0;
+  particle.localID = d;
+
+  // transverse components start at zero; only the third component is non-zero
+  for (int axis = 0; axis < 2; ++axis) {
+    particle.Rincol[axis] = 0.0;
+    particle.Pincol[axis] = 0.0;
+  }
+  particle.Rincol[2] = 0.02;
+  particle.Pincol[2] = 3.9920183237269791e-01;
+
+  return particle;
+}
+
+/**
+ * Return a Vector with all three components set to 0.5.
+ */
+Vector initVector() {
+  const double component = 0.5;
+
+  Vector result;
+  result.x = component;
+  result.y = component;
+  result.z = component;
+  return result;
+}
+
+/**
+ * Print all fields of one particle on a single line of stdout.
+ */
+void printPart(PART_SMALL p) {
+  cout << "label: " << p.label << ", "
+       << "localid: " << p.localID << ","
+       << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
+       << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]
+       << endl;
+}
+
+/**
+ * Print the three components of v, tab separated, on one line of stdout.
+ */
+void printVector(Vector v) {
+  cout << v.x << "\t";
+  cout << v.y << "\t";
+  cout << v.z << endl;
+}
+
+/**
+ * Initialise p[0..N) with initPartSmall, using the array index as local id.
+ */
+void initParts(PART_SMALL *p, int N) {
+  int idx = 0;
+  while (idx < N) {
+    p[idx] = initPartSmall(idx);
+    ++idx;
+  }
+}
+
+/**
+ * Print p[0..N), one particle per line, followed by an empty line.
+ */
+void printParts(PART_SMALL *p, int N) {
+  int idx = 0;
+  while (idx < N) {
+    printPart(p[idx]);
+    ++idx;
+  }
+  cout << endl;
+}
+
+/**
+ * Set every element of v[0..N) to the vector returned by initVector.
+ */
+void initVectors(Vector *v, int N) {
+  int idx = 0;
+  while (idx < N) {
+    v[idx] = initVector();
+    ++idx;
+  }
+}
+
+/**
+ * Print v[0..N), one vector per line, followed by an empty line.
+ */
+void printVectors(Vector *v, int N) {
+  int idx = 0;
+  while (idx < N) {
+    printVector(v[idx]);
+    ++idx;
+  }
+  cout << endl;
+}
+
+
+/**
+ * Write the 12 collimator parameters used by this test into data[0..12).
+ * NOTE(review): the physical meaning of the individual entries is defined by
+ * the DKS collimator kernel, not by this file — confirm before changing any.
+ */
+void initParams(double *data) {
+  static const double kParams[12] = {
+    0.0,                     // [0]  previously tried: 2.0000000000000000e-02
+    1.0,                     // [1]  previously tried: 1.0000000000000000e-02
+    2.2100000000000000e+00,  // [2]
+    6.0000000000000000e+00,  // [3]
+    1.2010700000000000e+01,  // [4]
+    2.6010000000000000e+00,  // [5]
+    1.7010000000000000e+03,  // [6]
+    1.2790000000000000e+03,  // [7]
+    1.6379999999999999e-02,  // [8]
+    1.9321266968325795e-01,  // [9]
+    7.9000000000000000e+01,  // [10]
+    1.0000000000000002e-12   // [11]
+  };
+
+  for (int idx = 0; idx < 12; ++idx)
+    data[idx] = kParams[idx];
+}
+
+/**
+ * Print data[0..N) on one line of stdout, every value followed by a tab.
+ */
+void printDouble(double *data, int N) {
+  int idx = 0;
+  while (idx < N) {
+    std::cout << data[idx] << "\t";
+    ++idx;
+  }
+  std::cout << std::endl;
+}
+
+/**
+ * Collimator-physics Monte Carlo benchmark (array-of-structs particle layout).
+ *
+ * Options:
+ *   -mic        use api "OpenMP" on device "-mic" (default: "Cuda" on "-gpu")
+ *   -npart N    number of particles (default 1e5, must be positive)
+ *   -loop  N    number of timed MC iterations (default 10, must be positive)
+ *
+ * Uploads numpart identical start particles, performs one untimed warm-up
+ * pass, times `loop` passes of callCollimatorPhysics2 +
+ * callCollimatorPhysicsSort, then reads the particles back and prints the
+ * first/last ten plus the mean absolute R and P components.
+ *
+ * @return 0 on success, 1 when -npart or -loop is not positive.
+ */
+int main(int argc, char *argv[]) {
+
+  int loop = 10;
+  int numpart = 1e5;
+  char api_name[10];
+  char device_name[10];
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);
+    //options that take a value are only honoured when the value is present;
+    //the old code read argv[i+1] (and then argv[i]) past the end of argv
+    const bool has_value = (i + 1 < argc);
+
+    if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (arg == "-npart" && has_value) {
+      numpart = atoi(argv[++i]);
+    } else if (arg == "-loop" && has_value) {
+      loop = atoi(argv[++i]);
+    }
+
+  }
+
+  //numpart sizes the host array and loop is used as a divisor below
+  if (numpart < 1 || loop < 1) {
+    cerr << "-npart and -loop must be positive integers" << endl;
+    return 1;
+  }
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "Number of loops: " << loop << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init part vector to test mc
+  PART_SMALL *parts = new PART_SMALL[numpart];
+  initParts(parts, numpart);
+
+  double params[12];
+  initParams(params);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  //length of the device string itself (was strlen(api_name), wrong for "OpenMP"/"-mic")
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  //init random
+  base.callInitRandoms(numpart);
+
+  //**test collimator physics and sort***//
+  void *part_ptr, *param_ptr;
+
+  //allocate memory for particles
+  part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
+  param_ptr = base.allocateMemory<double>(12, ierr);
+
+  //transfer data to device
+  base.writeData<PART_SMALL>(part_ptr, parts, numpart);
+  base.writeData<double>(param_ptr, params, 12);
+
+  int numaddback = 0;
+  //test calls to do some first executions
+  base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
+  base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
+  base.syncDevice();
+  //std::cout << "particles to add back: " << numaddback << std::endl;
+
+  struct timeval timeStart, timeEnd;
+  std::cout << "Start MC" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+    base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
+    base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "addback: " << numaddback << std::endl;
+
+  std::cout << "End MC" << std::endl;
+  //elapsed wall time in microseconds
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
+	       (timeEnd.tv_usec - timeStart.tv_usec));
+
+  std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
+
+  //read data from device
+  base.readData<PART_SMALL>(part_ptr, parts, numpart);
+
+  //free memory
+  base.freeMemory<PART_SMALL>(part_ptr, numpart);
+  base.freeMemory<double>(param_ptr, 12);
+
+
+  //show at most ten particles from each end; fewer when numpart < 10
+  //(the old tail loop started at a negative index in that case)
+  const int nshow = (numpart < 10) ? numpart : 10;
+
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < nshow; i++) {
+    std::cout << parts[i].label << "\t"
+	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
+	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+	      << std::endl;
+  }
+
+  std:: cout << "..." << std::endl;
+
+  for (int i = numpart - nshow; i < numpart; i++) {
+    std::cout << parts[i].label << "\t"
+	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
+	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+	      << std::endl;
+  }
+
+  //mean absolute value of every R and P component (sqrt(x*x) == |x|)
+  double arx = 0, ary = 0, arz = 0;
+  double apx = 0, apy = 0, apz = 0;
+  for (int i = 0; i < numpart; i++) {
+
+    arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
+    ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
+    arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
+
+    apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
+    apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
+    apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
+
+  }
+
+  std::cout << std::fixed << std::setprecision(10);
+  std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
+	    << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
+
+
+  cout << "==========================END TEST==========================" << endl;
+
+  //release the host particle array (previously leaked)
+  delete[] parts;
+
+  return 0;
+
+}
--- a/test/testCollimatorPhysicsMPI.cpp
+++ b/test/testCollimatorPhysicsMPI.cpp
@ -0,0 +1,126 @@
+#include <iostream>
+
+#include <vector>
+
+#include "DKSBase.h"
+#include "cuda_runtime.h"
+
+#include <mpi.h>
+
+using namespace std;
+
+// Full per-particle record used by this test. Arrays of PART are copied to
+// and from the device as raw memory (writeData/readData<PART> in main), so
+// the member order and types must not be changed casually.
+// NOTE(review): presumably mirrors a device-side struct in the DKS library — confirm.
+typedef struct {
+  int label;
+  unsigned localID;
+  double Rincol[3];     // presumably position — confirm
+  double Pincol[3];     // presumably momentum — confirm
+  long IDincol;
+  int Binincol;
+  double DTincol;
+  double Qincol;
+  long LastSecincol;
+  double Bfincol[3];    // presumably magnetic field — confirm
+  double Efincol[3];    // presumably electric field — confirm
+} PART;
+
+/**
+ * Build a test particle from the number d: all scalar fields are set to d,
+ * Rincol/Pincol components to 0.5 and Bfincol/Efincol components to 1/(d+1).
+ */
+PART initPart(int d) {
+
+  PART particle;
+
+  // scalar fields all carry the particle number
+  particle.label = d;
+  particle.localID = d;
+  particle.IDincol = d;
+  particle.Binincol = d;
+  particle.DTincol = d;
+  particle.Qincol = d;
+  particle.LastSecincol = d;
+
+  // vector fields (a 0.5 / (d+1) variant for R and P was tried and disabled)
+  const double field = 1.0 / (d+1);
+  for (int axis = 0; axis < 3; ++axis) {
+    particle.Rincol[axis] = 0.5;
+    particle.Pincol[axis] = 0.5;
+    particle.Bfincol[axis] = field;
+    particle.Efincol[axis] = field;
+  }
+
+  return particle;
+
+}
+
+/**
+ * Print label, Rincol and Pincol of one particle on a single line of stdout.
+ * The remaining fields (localID, IDincol, Binincol, DTincol, Qincol,
+ * LastSecincol, Bfincol, Efincol) are intentionally not printed.
+ */
+void printPart(PART p) {
+
+  cout << "label: " << p.label << ", "
+       << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
+       << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "
+       << endl;
+
+}
+
+/**
+ * MPI variant of the collimator-physics test: every rank initialises the
+ * Cuda device, uploads numpart test particles and 19 parameters, runs
+ * callCollimatorPhysics 100 times and reads the particles back.
+ * Particles are only printed when numpart <= 20.
+ *
+ * @return 0 always; DKS error codes (ierr) are not inspected here.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  int numpart = 500501;
+
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.callInitRandoms(numpart);
+
+  PART tmp;
+  vector<PART> p;
+  vector<PART> p_out;
+  p_out.resize(numpart);
+
+  for (int i = 0; i < numpart; i++) {
+    tmp = initPart(i + 1);
+    p.push_back(tmp);
+  }
+
+  if (numpart <= 20) {
+    //print at most the first ten particles; never index past numpart
+    for (int i = 0; i < 10 && i < numpart; i++)
+      printPart(p[i]);
+    cout << endl;
+  }
+
+  double params[19];
+  for (int i = 0; i < 19; i++)
+    params[i] = 0.05;
+  params[0] = 0;
+  params[1] = 1;
+
+  void *mem_ptr, *par_ptr;
+
+  par_ptr = base.allocateMemory<double>(19, ierr);
+  base.writeData<double>(par_ptr, params, 19);
+
+  mem_ptr = base.allocateMemory<PART>(numpart, ierr);
+  base.writeData<PART>(mem_ptr, &p[0], numpart);
+
+  //initialised so the summary line below never prints indeterminate values
+  int addback = 0, dead = 0;
+  for (int i = 0; i < 100; i++)
+    base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead);
+  cout << "Add back: " << addback << ", dead: " << dead << endl;
+
+  base.readData<PART>(mem_ptr, &p_out[0], numpart);
+  //second argument is the element count used at allocation (was ierr by mistake)
+  base.freeMemory<PART>(mem_ptr, numpart);
+  base.freeMemory<double>(par_ptr, 19);
+
+  if (numpart <= 20) {
+    for (int i = 0; i < numpart; i++)
+      printPart(p_out[i]);
+  }
+
+  MPI_Finalize();
+  return 0;
+
+}
--- a/test/testCollimatorPhysicsSoA.cpp
+++ b/test/testCollimatorPhysicsSoA.cpp
@ -0,0 +1,250 @@
+#include <iostream>
+#include <iomanip>
+
+#include <vector>
+#include <sys/time.h>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+#include <omp.h>
+
+using namespace std;
+
+// Struct-of-arrays particle storage on the host: every member points to an
+// array with one entry per particle (main allocates numpart entries each).
+typedef struct {
+  int *label;
+  unsigned *localID;
+  double *rx;   // presumably position components — confirm
+  double *ry;
+  double *rz;
+  double *px;   // presumably momentum components — confirm
+  double *py;
+  double *pz;
+} PART;
+
+
+/**
+ * Initialise npart particles stored as separate arrays: label 0, localID equal
+ * to the index, r = (0, 0, 0.02) and p = (0, 0, 3.9920183237269791e-01).
+ */
+void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
+               double *px, double *py, double *pz, int npart) {
+
+  const double start_rz = 0.02;
+  const double start_pz = 3.9920183237269791e-01;
+
+  int idx = 0;
+  while (idx < npart) {
+    label[idx] = 0;
+    localID[idx] = idx;
+
+    rx[idx] = 0.0;
+    ry[idx] = 0.0;
+    rz[idx] = start_rz;
+
+    px[idx] = 0.0;
+    py[idx] = 0.0;
+    pz[idx] = start_pz;
+
+    ++idx;
+  }
+}
+
+/**
+ * Write the 12 collimator parameters used by this test into data[0..12).
+ * NOTE(review): the physical meaning of the individual entries is defined by
+ * the DKS collimator kernel, not by this file — confirm before changing any.
+ */
+void initParams(double *data) {
+  static const double kParams[12] = {
+    0.0,                     // [0]  previously tried: 2.0000000000000000e-02
+    1.0,                     // [1]  previously tried: 1.0000000000000000e-02
+    2.2100000000000000e+00,  // [2]
+    6.0000000000000000e+00,  // [3]
+    1.2010700000000000e+01,  // [4]
+    2.6010000000000000e+00,  // [5]
+    1.7010000000000000e+03,  // [6]
+    1.2790000000000000e+03,  // [7]
+    1.6379999999999999e-02,  // [8]
+    1.9321266968325795e-01,  // [9]
+    7.9000000000000000e+01,  // [10]
+    1.0000000000000002e-12   // [11]
+  };
+
+  for (int idx = 0; idx < 12; ++idx)
+    data[idx] = kParams[idx];
+}
+
+/**
+ * Collimator-physics Monte Carlo benchmark (struct-of-arrays particle layout).
+ *
+ * Options:
+ *   -mic        use api "OpenMP" on device "-mic" (default: "Cuda" on "-gpu")
+ *   -npart N    number of particles (default 1e5, must be positive)
+ *   -loop  N    number of timed MC iterations (default 10, must be positive)
+ *
+ * Uploads the particle arrays, performs one untimed warm-up pass, times
+ * `loop` passes of callCollimatorPhysicsSoA + callCollimatorPhysicsSortSoA
+ * and reads the arrays back. Result printing is currently disabled.
+ *
+ * @return 0 on success, 1 when -npart or -loop is not positive.
+ */
+int main(int argc, char *argv[]) {
+
+  int loop = 10;
+  int numpart = 1e5;
+  char api_name[10];
+  char device_name[10];
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);
+    //options that take a value are only honoured when the value is present;
+    //the old code read argv[i+1] (and then argv[i]) past the end of argv
+    const bool has_value = (i + 1 < argc);
+
+    if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (arg == "-npart" && has_value) {
+      numpart = atoi(argv[++i]);
+    } else if (arg == "-loop" && has_value) {
+      loop = atoi(argv[++i]);
+    }
+
+  }
+
+  //numpart sizes the host arrays and loop is used as a divisor below
+  if (numpart < 1 || loop < 1) {
+    cerr << "-npart and -loop must be positive integers" << endl;
+    return 1;
+  }
+
+  int threads = 0;
+  /*
+#pragma offload target(mic:0) out(threads)
+  {
+    #pragma omp parallel
+    {
+      threads = omp_get_num_threads();
+    }
+  }
+  */
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "Number of loops: " << loop << endl;
+  cout << "Number of threads: " << threads << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init part vector to test mc (64-byte aligned host arrays)
+  PART p;
+  p.label   = (int*) _mm_malloc(sizeof(int)*numpart, 64);
+  p.localID = (unsigned*) _mm_malloc(sizeof(unsigned)*numpart, 64);
+  p.rx      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.ry      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.rz      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.px      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.py      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.pz      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
+
+  double params[12];
+  initParams(params);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  //length of the device string itself (was strlen(api_name), wrong for "OpenMP"/"-mic")
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  //init random
+  base.callInitRandoms(numpart);
+
+  //**test collimator physics and sort***//
+  void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
+
+  //allocate memory for particles
+  label_ptr   = base.allocateMemory<int>(numpart, ierr);
+  localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
+  rx_ptr      = base.allocateMemory<double>(numpart, ierr);
+  ry_ptr      = base.allocateMemory<double>(numpart, ierr);
+  rz_ptr      = base.allocateMemory<double>(numpart, ierr);
+  px_ptr      = base.allocateMemory<double>(numpart, ierr);
+  py_ptr      = base.allocateMemory<double>(numpart, ierr);
+  pz_ptr      = base.allocateMemory<double>(numpart, ierr);
+
+  param_ptr = base.allocateMemory<double>(12, ierr);
+
+  //transfer data to device
+  base.writeData<int>(label_ptr, p.label, numpart);
+  base.writeData<unsigned>(localID_ptr, p.localID, numpart);
+  base.writeData<double>(rx_ptr, p.rx, numpart);
+  base.writeData<double>(ry_ptr, p.ry, numpart);
+  base.writeData<double>(rz_ptr, p.rz, numpart);
+  base.writeData<double>(px_ptr, p.px, numpart);
+  base.writeData<double>(py_ptr, p.py, numpart);
+  base.writeData<double>(pz_ptr, p.pz, numpart);
+
+  //transfer params to device
+  base.writeData<double>(param_ptr, params, 12);
+
+  std::cout << "test runs" << std::endl;
+
+  int numaddback = 0;
+  //test calls to do some first executions
+  base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+  				py_ptr, pz_ptr, param_ptr, numpart);
+  base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+				    py_ptr, pz_ptr, param_ptr, numpart, numaddback);
+  base.syncDevice();
+
+  struct timeval timeStart, timeEnd;
+  std::cout << "Start MC" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+    base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+    				  py_ptr, pz_ptr, param_ptr, numpart);
+    base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+				      py_ptr, pz_ptr, param_ptr, numpart, numaddback);
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "addback: " << numaddback << std::endl;
+
+  std::cout << "End MC" << std::endl;
+  //elapsed wall time in microseconds
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
+	       (timeEnd.tv_usec - timeStart.tv_usec));
+
+  //the measured interval covers `loop` runs (the old message printed numpart here)
+  std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
+
+  //read data from device
+  base.readData<int>(label_ptr, p.label, numpart);
+  base.readData<unsigned>(localID_ptr, p.localID, numpart);
+  base.readData<double>(rx_ptr, p.rx, numpart);
+  base.readData<double>(ry_ptr, p.ry, numpart);
+  base.readData<double>(rz_ptr, p.rz, numpart);
+  base.readData<double>(px_ptr, p.px, numpart);
+  base.readData<double>(py_ptr, p.py, numpart);
+  base.readData<double>(pz_ptr, p.pz, numpart);
+
+  //free memory
+  base.freeMemory<int>(label_ptr, numpart);
+  base.freeMemory<unsigned>(localID_ptr, numpart);
+  base.freeMemory<double>(rx_ptr, numpart);
+  base.freeMemory<double>(ry_ptr, numpart);
+  base.freeMemory<double>(rz_ptr, numpart);
+  base.freeMemory<double>(px_ptr, numpart);
+  base.freeMemory<double>(py_ptr, numpart);
+  base.freeMemory<double>(pz_ptr, numpart);
+
+  base.freeMemory<double>(param_ptr, 12);
+
+  /*
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < 10; i++) {
+    std::cout <<  p.label[i] << "\t" << p.rx[i]
+	      << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
+	      << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
+  }
+  std:: cout << "..." << std::endl;
+
+  for (int i = numpart - 10; i < numpart; i++) {
+    std::cout << p.label[i] << "\t" << p.rx[i]
+	      << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
+	      << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
+  }
+
+  double arx = 0, ary = 0, arz = 0;
+  double apx = 0, apy = 0, apz = 0;
+  for (int i = 0; i < numpart; i++) {
+
+    arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
+    ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
+    arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
+
+    apx += sqrt(p.px[i] * p.px[i]) / numpart;
+    apy += sqrt(p.py[i] * p.py[i]) / numpart;
+    apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
+
+  }
+
+  std::cout << std::fixed << std::setprecision(10);
+  std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
+	    << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
+  */
+
+  //release the aligned host arrays (previously leaked); _mm_malloc memory must go through _mm_free
+  _mm_free(p.pz);
+  _mm_free(p.py);
+  _mm_free(p.px);
+  _mm_free(p.rz);
+  _mm_free(p.ry);
+  _mm_free(p.rx);
+  _mm_free(p.localID);
+  _mm_free(p.label);
+
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
--- a/test/testDKS.cpp
+++ b/test/testDKS.cpp
@ -0,0 +1,15 @@
+#include <iostream>
+#include <complex>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+/**
+ * Smoke test: construct a DKSBase object and call getDevices() on it.
+ * The command line arguments are accepted but not used.
+ */
+int main(int argc, char *argv[]) {
+	// the arguments are intentionally ignored by this test
+	(void)argc;
+	(void)argv;
+
+	DKSBase dks = DKSBase();
+	dks.getDevices();
+
+	return 0;
+}
+
--- a/test/testFFT.cpp
+++ b/test/testFFT.cpp
@ -0,0 +1,83 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+/**
+ * 1D complex FFT round trip on a DKS device.
+ *
+ * Pushes N complex samples to the device, runs callFFT, callIFFT and
+ * callNormalizeFFT on them, pulls the buffer back and prints the input next
+ * to the result.
+ *
+ * usage: ./testFFT [api_name [device_name]]
+ * With no arguments (or more than two) the api is "OpenCL" and the device is
+ * "-gpu"; with only api_name given the device is "-gpu".
+ */
+int main(int argc, char *argv[]) {
+
+  // Writable default strings keep api_name/device_name plain char*, the same
+  // type that was handed to DKSBase before. Pointing at argv instead of
+  // copying into a 10 byte buffer removes the overflow for long arguments.
+  char default_api[] = "OpenCL";
+  char default_device[] = "-gpu";
+  char *api_name = default_api;
+  char *device_name = default_device;
+  if (argc == 2 || argc == 3)
+    api_name = argv[1];
+  if (argc == 3)
+    device_name = argv[2];
+
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+
+  cout << "Begin DKS Base tests" << endl;
+
+  int N = 2;
+  int dimsize[3] = {N, N, N};
+
+  complex<double> *cdata = new complex<double>[N];
+  complex<double> *cfft = new complex<double>[N];
+  for (int i = 0; i < N; i++) {
+    cdata[i] = complex<double>(0, 0);
+    cfft[i] = complex<double>(0, 0);
+  }
+
+  // single non-zero sample as test input
+  cdata[0] = complex<double>(1.73205, 1.73205);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  // the length passed with device_name has to be device_name's own length
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  void *mem_ptr;
+  int ierr;
+
+  /* write data to device */
+  mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 1, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 1, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 1, dimsize);
+
+  /* read data from device */
+  base.pullData< complex<double> >(mem_ptr, cfft, N);
+
+  /* print results */
+
+  cout << "Data" << endl;
+  for (int i = 0; i < N; i++)
+    cout << cdata[i] << "\t";
+  cout << endl;
+
+  cout << "FFT" << endl;
+  for (int i = 0; i < N; i++)
+    cout << cfft[i] << "\t";
+  cout << endl;
+
+  /* release host buffers */
+  delete[] cdata;
+  delete[] cfft;
+
+  return 0;
+}
+
--- a/test/testFFT3D.cpp
+++ b/test/testFFT3D.cpp
@ -0,0 +1,159 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+/* usage - ./testFFT3D [N [api_name [device_name]]] */
+/**
+ * 3D complex FFT round trip on a DKS device.
+ *
+ * Fills an N*N*N complex array, writes it to the device, runs callFFT,
+ * callIFFT and callNormalizeFFT in place, reads the result back and prints
+ * the summed absolute difference to the input via compareData.
+ *
+ * Argument handling:
+ *   no arguments (or more than three): N = 16, api "OpenCL", device "-gpu"
+ *   N                                : api "Cuda", device "-gpu"
+ *   N api_name                       : device "-gpu"
+ *   N api_name device_name
+ */
+int main(int argc, char *argv[]) {
+
+  int N = 16;
+  // Writable default strings keep api_name/device_name plain char*. Pointing
+  // at argv instead of copying into a 10 byte buffer removes the overflow for
+  // long arguments.
+  char default_opencl[] = "OpenCL";
+  char default_cuda[] = "Cuda";
+  char default_device[] = "-gpu";
+  char *api_name = default_opencl;
+  char *device_name = default_device;
+  if (argc == 2) {
+    N = atoi(argv[1]);
+    api_name = default_cuda;
+  } else if (argc == 3) {
+    N = atoi(argv[1]);
+    api_name = argv[2];
+  } else if (argc == 4) {
+    N = atoi(argv[1]);
+    api_name = argv[2];
+    device_name = argv[3];
+  }
+
+  // atoi yields 0 for non-numeric input; a non-positive size cannot be allocated
+  if (N <= 0) {
+    cout << "N must be a positive integer" << endl;
+    return 1;
+  }
+
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N, N, N};
+
+  cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+  int dim = 3;
+  complex<double> *cdata = new complex<double>[N*N*N];
+  complex<double> *cifft = new complex<double>[N*N*N];
+
+  // input depends only on the fastest index k; the result buffer starts at zero
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
+        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+      }
+    }
+  }
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  void *mem_ptr;
+  int ierr;
+
+  /* allocate memory on device */
+  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+  /* write data to device */
+  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 3, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 3, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+  /* read data from device */
+  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+  /* free device memory */
+  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+
+  /* compare results */
+  compareData(cdata, cifft, N, dim);
+
+  /* release host buffers */
+  delete[] cdata;
+  delete[] cifft;
+
+  return 0;
+}
+
+/**
+ * Print a complex array to stdout, one row of N values per line.
+ *
+ * @param data      array to print
+ * @param N         number of elements per dimension
+ * @param dim       number of dimensions; the two outer loops collapse to a
+ *                  single iteration when dim is 2 or 1
+ * @param normalize when false print real and imaginary part of every value,
+ *                  when true print only the real part divided by N
+ */
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  const int nk = N;
+
+  for (int i = 0; i < ni; ++i) {
+    for (int j = 0; j < nj; ++j) {
+      for (int k = 0; k < nk; ++k) {
+        const complex<double> &value = data[i*ni*ni + j*nj + k];
+        if (normalize) {
+          cout << value.real() / N << "\t";
+        } else {
+          cout << value.real() << " ";
+          cout << value.imag() << "\t";
+        }
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Print an N*N*N complex array as "real; imag" pairs, one output line per
+ * middle index j. Components with magnitude below 10e-5 are shown as 0.
+ *
+ * @param data array to print
+ * @param N    number of elements per dimension
+ * @param dim  not used
+ */
+void printData3DN4(complex<double>* &data, int N, int dim) {
+
+  const double cutoff = 10e-5;
+
+  for (int j = 0; j < N; ++j) {
+    for (int i = 0; i < N; ++i) {
+      for (int k = 0; k < N; ++k) {
+        const int pos = i*N*N + j*N + k;
+        double re = data[pos].real();
+        double im = data[pos].imag();
+
+        // suppress numerical noise around zero
+        if (-cutoff < re && re < cutoff)
+          re = 0;
+        if (-cutoff < im && im < cutoff)
+          im = 0;
+
+        cout << re << "; " << im << "\t";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+
+}
+
+/**
+ * Print the summed absolute difference (real and imaginary parts) between
+ * two complex arrays.
+ *
+ * @param data1 first array
+ * @param data2 second array
+ * @param N     number of elements per dimension
+ * @param dim   number of dimensions; the two outer loops collapse to a single
+ *              iteration when dim is 2 or 1
+ */
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  const int nk = N;
+
+  double total = 0;
+  for (int i = 0; i < ni; ++i)
+    for (int j = 0; j < nj; ++j)
+      for (int k = 0; k < nk; ++k) {
+        const int pos = i*ni*ni + j*nj + k;
+        total += fabs(data1[pos].real() - data2[pos].real());
+        total += fabs(data1[pos].imag() - data2[pos].imag());
+      }
+
+  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
--- a/test/testFFT3DRC.cpp
+++ b/test/testFFT3DRC.cpp
@ -0,0 +1,199 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
+void initData(double *data, int dimsize[3]);
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
+void printHelp();
+
+/**
+ * Timing test for 3D real-to-complex / complex-to-real FFT round trips.
+ *
+ * Grid size and repetition count come from readParams (defaults 8x8x8 and
+ * 10 loops). After one untimed warm-up run, every loop iteration writes the
+ * real data to the device, runs callR2CFFT and callC2RFFT and reads the
+ * result back; total, FFT and IFFT times are printed at the end together
+ * with the difference between input and output.
+ *
+ * The device backend is chosen at compile time: DKS_MIC selects the OpenMP
+ * api on "-mic", otherwise DKS_CUDA selects the Cuda api on "-gpu".
+ */
+int main(int argc, char *argv[]) {
+
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 8;
+  int dim = 3;
+  int loop = 10;
+
+  if ( readParams(argc, argv, N1, N2, N3, loop) )
+    return 0;
+
+  // non-positive sizes would lead to an invalid array allocation below
+  if (N1 <= 0 || N2 <= 0 || N3 <= 0) {
+    cout << "Grid dimensions must be positive integers" << endl;
+    return 1;
+  }
+
+  int dimsize[3] = {N3, N2, N1};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
+
+  double *rdata = new double[sizereal];
+  double *outdata = new double[sizereal];
+  complex<double> *cfft = new complex<double>[sizecomp];
+
+  // complex::real()/imag() return by value in C++11 and later, so the
+  // element is assigned as a whole instead of through the accessors
+  for (int i=0; i<sizecomp; ++i)
+    cfft[i] = complex<double>(7., 3.33);
+  initData(rdata, dimsize);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  // declared exactly once so the code below compiles for every combination
+  // of DKS_MIC / DKS_CUDA
+  DKSBase base;
+#if defined(DKS_MIC)
+  base.setAPI("OpenMP", 6);
+  base.setDevice("-mic", 4);
+  base.initDevice();
+  base.setupFFTRC(dim, dimsize);
+  /* setup backward fft (COMPLEX->REAL) */
+  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
+#elif defined(DKS_CUDA)
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.setupFFT(dim, dimsize);
+#else
+  // no backend selected at compile time: there is no device to run on
+  cout << "testFFT3DRC was built without DKS_MIC or DKS_CUDA" << endl;
+  delete[] rdata;
+  delete[] outdata;
+  delete[] cfft;
+  return 1;
+#endif
+
+  // allocate memory on device
+  int ierr;
+  void *real_ptr, *comp_ptr, *real_res_ptr;
+  real_ptr = base.allocateMemory<double>(sizereal, ierr);
+  real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);
+
+  // execute one run before starting the timers
+  base.writeData<double>(real_ptr, rdata, sizereal);
+  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
+  base.readData<double>(real_res_ptr, outdata, sizereal);
+
+  // timers for total loop time and for the individual FFT / IFFT calls;
+  // the per-call times are accumulated directly, so no per-iteration
+  // (variable length) arrays are needed
+  struct timeval timeStart, timeEnd;
+  struct timeval tic, toc;
+  double tfft = 0;
+  double tifft = 0;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i=0; i<loop; ++i){
+
+    // write data to device
+    base.writeData<double>(real_ptr, rdata, sizereal);
+
+    // execute rcfft
+    gettimeofday(&tic, NULL);
+    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+    gettimeofday(&toc, NULL);
+    tfft += ( (toc.tv_sec - tic.tv_sec) * 1e6 +
+              (toc.tv_usec - tic.tv_usec) ) * 1e-6;
+
+    // execute crfft
+    gettimeofday(&tic, NULL);
+    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
+    gettimeofday(&toc, NULL);
+    tifft += ( (toc.tv_sec - tic.tv_sec) * 1e6 +
+               (toc.tv_usec - tic.tv_usec) ) * 1e-6;
+
+    // normalize: only on the Cuda path, the MIC path passes the scale
+    // factor to setupFFTCR instead
+#if defined(DKS_CUDA) && !defined(DKS_MIC)
+    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
+#endif
+
+    // read IFFT data from device
+    base.readData<double>(real_res_ptr, outdata, sizereal);
+
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  // free device memory
+  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
+  base.freeMemory<double>(real_ptr, sizereal);
+  base.freeMemory<double>(real_res_ptr, sizereal);
+
+  // compare in and out data to see if we get back the same results
+  compareData(rdata, outdata, N1, N2, N3, dim);
+
+  // total wall time of the loop in seconds
+  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
+                  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
+
+  //print timing results
+  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
+            << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
+            << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
+            << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
+            << "\n\n";
+
+  // release host buffers
+  delete[] rdata;
+  delete[] outdata;
+  delete[] cfft;
+
+  return 0;
+}
+
+/**
+ * Print the summed absolute difference between two real arrays of
+ * NI*NJ*NK elements.
+ *
+ * @param data1 first array
+ * @param data2 second array
+ * @param NI    extent of the index with stride 1
+ * @param NJ    extent of the index with stride NI
+ * @param NK    extent of the index with stride NI*NJ
+ * @param dim   not used
+ */
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
+  double total = 0;
+  for (int i = 0; i < NI; ++i)
+    for (int j = 0; j < NJ; ++j)
+      for (int k = 0; k < NK; ++k) {
+        const int pos = k*NI*NJ + j*NI + i;
+        total += fabs(data1[pos] - data2[pos]);
+      }
+  std::cout << "RC <--> CR diff: " << total << std::endl;
+}
+
+/**
+ * Fill a 3D array so that every element holds its index along the fastest
+ * varying dimension.
+ *
+ * @param data    array with dimsize[0]*dimsize[1]*dimsize[2] elements
+ * @param dimsize extents; dimsize[0] is the fastest varying dimension
+ */
+void initData(double *data, int dimsize[3]) {
+  const int nx = dimsize[0];
+  const int ny = dimsize[1];
+  const int nz = dimsize[2];
+
+  for (int z = 0; z < nz; ++z) {
+    for (int y = 0; y < ny; ++y) {
+      // start of the current row of nx contiguous values
+      double *row = data + z*ny*nx + y*nx;
+      for (int x = 0; x < nx; ++x)
+        row[x] = x;
+    }
+  }
+}
+
+/**
+ * Print a short description of testFFT3DRC and its command line options
+ * (-grid and -loop) to stdout.
+ */
+void printHelp() {
+  std::cout << std::endl;
+
+  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real"
+            << "function on the Intel MIC.\n"
+            << "Operations performed by testRC are: "
+            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n"
+            << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
+            << "-loop $l\n"
+            << "where $x $y $z are number of elements in each dimension and "
+            << "$l is the number of times all the operations will be performed.\n";
+
+  std::cout << std::endl;
+}
+
+/**
+ * Parse the command line options of testFFT3DRC.
+ *
+ * Recognised options:
+ *   -grid x y z   sets N1, N2 and N3
+ *   -loop l       sets loop
+ *   -h, -help     prints the help text
+ * Unknown arguments are ignored. Output parameters keep their incoming
+ * values unless the matching option is given.
+ *
+ * @return true when the help text was printed (requested explicitly, or
+ *         because an option is missing its values) and the caller should
+ *         exit; false otherwise
+ */
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
+
+  for (int i = 1; i < argc; i++) {
+    const std::string arg(argv[i]);
+
+    if (arg == "-grid") {
+      // all three values must be present; argv[argc] is a null pointer and
+      // anything beyond it is out of bounds
+      if (i + 3 >= argc) {
+        printHelp();
+        return true;
+      }
+      N1 = atoi(argv[i + 1]);
+      N2 = atoi(argv[i + 2]);
+      N3 = atoi(argv[i + 3]);
+      i += 3;
+      continue;
+    }
+
+    if (arg == "-loop") {
+      // the value must be present
+      if (i + 1 >= argc) {
+        printHelp();
+        return true;
+      }
+      loop = atoi(argv[i + 1]);
+      i += 1;
+      continue;
+    }
+
+    if (arg == "-h" || arg == "-help") {
+      printHelp();
+      return true;
+    }
+  }
+
+  return false;
+}
--- a/test/testFFT3DRC_MIC.cpp
+++ b/test/testFFT3DRC_MIC.cpp
@ -0,0 +1,220 @@
+#include <iostream>
+#include <stdlib.h>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+void printData3DN4(double* data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+void compareData(double* data1, double* data2, int N, int dim);
+
+/* Compute (K*L)%M in 64 bit integer arithmetic and return it as a double */
+static double moda(int K, int L, int M)
+{
+	// widen before multiplying so the product is formed as long long
+	const long long product = static_cast<long long>(K) * L;
+	return static_cast<double>(product % M);
+}
+/*
+ * Initialize the real array x from a cosine whose phase is built from
+ * moda(n, H, N) / N in each dimension, scaled by factor / (N1*N2*N3).
+ * Elements are addressed row-major with a padded fastest dimension:
+ * consecutive n2 are (N3/2+1)*2 elements apart.
+ */
+static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
+{
+	const double TWOPI = 6.2831853071795864769;
+
+	/* Generalized strides for row-major addressing of x */
+	const int S3 = 1;
+	const int S2 = (N3/2+1)*2;
+	const int S1 = N2*(N3/2+1)*2;
+
+	const bool unit = (N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0;
+	const double factor = unit ? 1.0 : 2.0;
+
+	for (int n1 = 0; n1 < N1; ++n1)
+	{
+		// phase contributions are added in the order n1, n2, n3
+		const double phase1 = moda(n1,H1,N1) / N1;
+		for (int n2 = 0; n2 < N2; ++n2)
+		{
+			const double phase12 = phase1 + moda(n2,H2,N2) / N2;
+			for (int n3 = 0; n3 < N3; ++n3)
+			{
+				const double phase = phase12 + moda(n3,H3,N3) / N3;
+				const int index = n1*S1 + n2*S2 + n3*S3;
+				x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
+			}
+		}
+	}
+}
+
+
+/**
+ * 3D real-to-complex / complex-to-real FFT round trip using the OpenMP api
+ * on the "-mic" device.
+ *
+ * usage: ./testFFT3DRC_MIC N
+ *
+ * Initializes an N x N x N real data set with init_r, runs callR2CFFT
+ * followed by callC2RFFT (scaled by 1/(N*N*N) through setupFFTCR), reads the
+ * result back and prints the difference to the input via compareData.
+ */
+int main(int argc, char *argv[]) {
+
+	// N is mandatory; argv[1] must not be read when it was not supplied
+	if (argc < 2) {
+		cout << "usage: ./testFFT3DRC_MIC N" << endl;
+		return 1;
+	}
+
+	int N = atoi(argv[1]);
+	// atoi yields 0 for non-numeric input
+	if (N <= 0) {
+		cout << "N must be a positive integer" << endl;
+		return 1;
+	}
+
+	int dim = 3;
+	int dimsize[3] = {N, N, N};
+	int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+	int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];
+
+	// host buffers; the real arrays are sized for the padded layout that
+	// init_r addresses (last dimension (N/2+1)*2 elements long)
+	double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
+	double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
+	complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
+	if (rdata == NULL || outdata == NULL || cfft == NULL) {
+		cout << "Host memory allocation failed" << endl;
+		free(rdata);
+		free(outdata);
+		free(cfft);
+		return 1;
+	}
+
+	init_r(rdata, N,N,N);
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+
+	DKSBase base;
+	base.setAPI("OpenMP", 6);
+	base.setDevice("-mic", 4);
+	base.initDevice();
+
+	/* setup forward fft (REAL->COMPLEX) */
+	base.setupFFTRC(dim, dimsize);
+
+	int ierr;
+	void *real_ptr, *comp_ptr;
+
+	/* allocate memory on device */
+	real_ptr = base.allocateMemory<double>(sizereal, ierr);
+	comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+	/* write data to device */
+	base.writeData<double>(real_ptr, rdata, sizereal);
+
+	//printData3DN4(rdata,N,3);
+
+	/* execute rcfft */
+	base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+
+	/* read FFT data from device */
+	base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+	// NOTE(review): this writes the complex result back as sizereal doubles,
+	// which is not the element type/count the buffer was read with - confirm
+	// whether complex<double>/sizecomp was intended
+	base.writeData<double>(comp_ptr, cfft, sizereal);
+
+
+	/* setup backward fft (COMPLEX->REAL) */
+	base.setupFFTCR(dim, dimsize,1./(N*N*N));
+	/* execute crfft */
+	base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);
+
+	/* normalize */
+	//base.callNormalizeC2RFFT(real_ptr, dim, dimsize);
+
+	/* read FFT data from device */
+	//base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+
+	/* read IFFT data from device */
+	base.readData<double>(real_ptr, outdata, sizereal);
+
+	/* free device memory */
+	base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+	base.freeMemory<double>(real_ptr, sizereal);
+
+	/* compare data */
+	compareData(rdata, outdata, N, dim);
+
+	/* release host buffers */
+	free(rdata);
+	free(outdata);
+	free(cfft);
+
+	return 0;
+}
+
+/**
+ * Print the real parts of a complex array to stdout, one row of N values
+ * per line.
+ *
+ * @param data      array to print
+ * @param N         number of elements per dimension
+ * @param dim       number of dimensions; the two outer loops collapse to a
+ *                  single iteration when dim is 2 or 1
+ * @param normalize when true every printed value is divided by N
+ */
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	for (int i = 0; i < ni; ++i) {
+		for (int j = 0; j < nj; ++j) {
+			for (int k = 0; k < nk; ++k) {
+				const double re = data[i*ni*ni + j*nj + k].real();
+				if (normalize)
+					cout << re / N << "\t";
+				else
+					cout << re << "\t";
+			}
+			cout << endl;
+		}
+		cout << endl;
+	}
+}
+
+/**
+ * Print an N*N*N complex array as "real; imag" pairs, one output line per
+ * middle index j. Components with magnitude below 10e-5 are shown as 0.
+ *
+ * @param data array to print
+ * @param N    number of elements per dimension
+ * @param dim  not used
+ */
+void printData3DN4(complex<double>* &data, int N, int dim) {
+
+	const double cutoff = 10e-5;
+
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			for (int k = 0; k < N; ++k) {
+				const int pos = i*N*N + j*N + k;
+				double re = data[pos].real();
+				double im = data[pos].imag();
+
+				// suppress numerical noise around zero
+				if (-cutoff < re && re < cutoff)
+					re = 0;
+				if (-cutoff < im && im < cutoff)
+					im = 0;
+
+				cout << re << "; " << im << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+
+}
+/**
+ * Print an N*N*N real array, one output line per middle index j.
+ * Values with magnitude below 10e-5 are shown as 0.
+ *
+ * @param data array to print
+ * @param N    number of elements per dimension
+ * @param dim  not used
+ */
+void printData3DN4(double* data, int N, int dim) {
+
+	const double cutoff = 10e-5;
+
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			for (int k = 0; k < N; ++k) {
+				double value = data[i*N*N + j*N + k];
+
+				// suppress numerical noise around zero
+				if (-cutoff < value && value < cutoff)
+					value = 0;
+
+				cout << value << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+
+}
+/**
+ * Print the summed absolute difference (real and imaginary parts) between
+ * two complex arrays.
+ *
+ * @param data1 first array
+ * @param data2 second array
+ * @param N     number of elements per dimension
+ * @param dim   number of dimensions; the two outer loops collapse to a single
+ *              iteration when dim is 2 or 1
+ */
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	double total = 0;
+	for (int i = 0; i < ni; ++i)
+		for (int j = 0; j < nj; ++j)
+			for (int k = 0; k < nk; ++k) {
+				const int pos = i*ni*ni + j*nj + k;
+				total += fabs(data1[pos].real() - data2[pos].real());
+				total += fabs(data1[pos].imag() - data2[pos].imag());
+			}
+
+	cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
+/**
+ * Print the summed absolute difference between two real arrays.
+ *
+ * @param data1 first array
+ * @param data2 second array (compared as is, without rescaling)
+ * @param N     number of elements per dimension
+ * @param dim   number of dimensions; the two outer loops collapse to a single
+ *              iteration when dim is 2 or 1
+ */
+void compareData(double* data1, double* data2, int N, int dim) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	double total = 0;
+	for (int i = 0; i < ni; ++i)
+		for (int j = 0; j < nj; ++j)
+			for (int k = 0; k < nk; ++k) {
+				const int pos = i*ni*ni + j*nj + k;
+				total += fabs(data1[pos] - data2[pos]);
+			}
+
+	cout << "Size " << N << " RC <--> CR diff: " << total << endl;
+}
--- a/test/testFFT3DSO.cpp
+++ b/test/testFFT3DSO.cpp
@ -0,0 +1,159 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+/* usage - ./testFFT3DSO [N [api_name [device_name]]] */
+/**
+ * 3D complex FFT round trip on a DKS device.
+ *
+ * Fills an N*N*N complex array, writes it to the device, runs callFFT,
+ * callIFFT and callNormalizeFFT in place, reads the result back and prints
+ * the summed absolute difference to the input via compareData.
+ *
+ * Argument handling:
+ *   no arguments (or more than three): N = 16, api "OpenCL", device "-gpu"
+ *   N                                : api "Cuda", device "-gpu"
+ *   N api_name                       : device "-gpu"
+ *   N api_name device_name
+ */
+int main(int argc, char *argv[]) {
+
+  int N = 16;
+  // Writable default strings keep api_name/device_name plain char*. Pointing
+  // at argv instead of copying into a 10 byte buffer removes the overflow for
+  // long arguments.
+  char default_opencl[] = "OpenCL";
+  char default_cuda[] = "Cuda";
+  char default_device[] = "-gpu";
+  char *api_name = default_opencl;
+  char *device_name = default_device;
+  if (argc == 2) {
+    N = atoi(argv[1]);
+    api_name = default_cuda;
+  } else if (argc == 3) {
+    N = atoi(argv[1]);
+    api_name = argv[2];
+  } else if (argc == 4) {
+    N = atoi(argv[1]);
+    api_name = argv[2];
+    device_name = argv[3];
+  }
+
+  // atoi yields 0 for non-numeric input; a non-positive size cannot be allocated
+  if (N <= 0) {
+    cout << "N must be a positive integer" << endl;
+    return 1;
+  }
+
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N, N, N};
+
+  cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+  int dim = 3;
+  complex<double> *cdata = new complex<double>[N*N*N];
+  complex<double> *cifft = new complex<double>[N*N*N];
+
+  // input depends only on the fastest index k; the result buffer starts at zero
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
+        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+      }
+    }
+  }
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  void *mem_ptr;
+  int ierr;
+
+  /* allocate memory on device */
+  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+  /* write data to device */
+  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 3, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 3, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+  /* read data from device */
+  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+  /* free device memory */
+  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+
+  /* compare results */
+  compareData(cdata, cifft, N, dim);
+
+  /* release host buffers */
+  delete[] cdata;
+  delete[] cifft;
+
+  return 0;
+}
+
+/**
+ * Print a complex array to stdout, one row of N values per line.
+ *
+ * @param data      array to print
+ * @param N         number of elements per dimension
+ * @param dim       number of dimensions; the two outer loops collapse to a
+ *                  single iteration when dim is 2 or 1
+ * @param normalize when false print real and imaginary part of every value,
+ *                  when true print only the real part divided by N
+ */
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  const int nk = N;
+
+  for (int i = 0; i < ni; ++i) {
+    for (int j = 0; j < nj; ++j) {
+      for (int k = 0; k < nk; ++k) {
+        const complex<double> &value = data[i*ni*ni + j*nj + k];
+        if (normalize) {
+          cout << value.real() / N << "\t";
+        } else {
+          cout << value.real() << " ";
+          cout << value.imag() << "\t";
+        }
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Print an N*N*N complex array as "real; imag" pairs, one output line per
+ * middle index j. Components with magnitude below 10e-5 are shown as 0.
+ *
+ * @param data array to print
+ * @param N    number of elements per dimension
+ * @param dim  not used
+ */
+void printData3DN4(complex<double>* &data, int N, int dim) {
+
+  const double cutoff = 10e-5;
+
+  for (int j = 0; j < N; ++j) {
+    for (int i = 0; i < N; ++i) {
+      for (int k = 0; k < N; ++k) {
+        const int pos = i*N*N + j*N + k;
+        double re = data[pos].real();
+        double im = data[pos].imag();
+
+        // suppress numerical noise around zero
+        if (-cutoff < re && re < cutoff)
+          re = 0;
+        if (-cutoff < im && im < cutoff)
+          im = 0;
+
+        cout << re << "; " << im << "\t";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+
+}
+
+/**
+ * Print the summed absolute difference (real and imaginary parts) between
+ * two complex arrays.
+ *
+ * @param data1 first array
+ * @param data2 second array
+ * @param N     number of elements per dimension
+ * @param dim   number of dimensions; the two outer loops collapse to a single
+ *              iteration when dim is 2 or 1
+ */
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  const int nk = N;
+
+  double total = 0;
+  for (int i = 0; i < ni; ++i)
+    for (int j = 0; j < nj; ++j)
+      for (int k = 0; k < nk; ++k) {
+        const int pos = i*ni*ni + j*nj + k;
+        total += fabs(data1[pos].real() - data2[pos].real());
+        total += fabs(data1[pos].imag() - data2[pos].imag());
+      }
+
+  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
--- a/test/testFFT3DTiming.cpp
+++ b/test/testFFT3DTiming.cpp
@ -0,0 +1,130 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+
+/**
+ * Timing test for the 3D complex FFT round trip.
+ *
+ * usage: ./testFFT3DTiming [api_name [device_name [N]]]
+ * Defaults are api "OpenCL", device "-gpu" and N = 4.
+ *
+ * After one untimed warm-up pass, the sequence allocate -> write -> FFT ->
+ * IFFT -> normalize -> read -> free is repeated `steps` times and the
+ * average wall time per repetition is printed.
+ */
+int main(int argc, char *argv[]) {
+
+	int N = 4;
+	// Writable default strings keep api_name/device_name plain char*.
+	// Pointing at argv instead of copying into a 10 byte buffer removes the
+	// overflow for long arguments.
+	char default_api[] = "OpenCL";
+	char default_device[] = "-gpu";
+	char *api_name = default_api;
+	char *device_name = default_device;
+	if (argc >= 2)
+		api_name = argv[1];
+	if (argc >= 3)
+		device_name = argv[2];
+	// argv[3] only exists when at least three arguments were given
+	if (argc >= 4)
+		N = atoi(argv[3]);
+
+	// atoi yields 0 for non-numeric input
+	if (N <= 0) {
+		cout << "N must be a positive integer" << endl;
+		return 1;
+	}
+	int dimsize[3] = {N, N, N};
+
+
+	cout << "Use api: " << api_name << endl;
+
+	cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+	complex<double> *cdata = new complex<double>[N*N*N];
+	complex<double> *cifft = new complex<double>[N*N*N];
+
+	// input depends only on the slowest index i; the result buffer starts at zero
+	for (int i = 0; i < N; i++) {
+		for (int j = 0; j < N; j++) {
+			for (int k = 0; k < N; k++) {
+				cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
+				cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+			}
+		}
+	}
+
+	timestamp_t t0, t1;
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	// the length passed with device_name has to be device_name's own length
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+
+	void *mem_ptr;
+	int ierr;
+
+	/* run test funct to init device */
+	mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+	ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+	base.callFFT(mem_ptr, 3, dimsize);
+	base.callIFFT(mem_ptr, 3, dimsize);
+	base.callNormalizeFFT(mem_ptr, 3, dimsize);
+	base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+	base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+	/* end test */
+
+	int steps = 10;
+	base.oclClearEvents();
+	t0 = get_timestamp();
+	for (int i = 0; i < steps; i++) {
+
+		/* allocate memory on device */
+		mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+		/* write data to device */
+		ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+		/* execute fft */
+		base.callFFT(mem_ptr, 3, dimsize);
+
+		/* execute ifft */
+		base.callIFFT(mem_ptr, 3, dimsize);
+
+		/* execute normalize */
+		base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+		/* read data from device */
+		base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+		/* free device memory: same element count as the allocation above */
+		base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+
+		//compareData(cdata, cifft, N, 3);
+	}
+	t1 = get_timestamp();
+
+	cout << "=========================" << endl;
+	//base.oclEventInfo();
+	cout << "Average total: " << get_secs(t0, t1) / steps << endl;
+	cout << "=========================" << endl;
+
+	/* release host buffers */
+	delete[] cdata;
+	delete[] cifft;
+
+	return 0;
+}
+
+/**
+ * Print the summed absolute difference (real and imaginary parts) between
+ * two complex arrays.
+ *
+ * @param data1 first array
+ * @param data2 second array
+ * @param N     number of elements per dimension
+ * @param dim   number of dimensions; the two outer loops collapse to a single
+ *              iteration when dim is 2 or 1
+ */
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	double total = 0;
+	for (int i = 0; i < ni; ++i)
+		for (int j = 0; j < nj; ++j)
+			for (int k = 0; k < nk; ++k) {
+				const int pos = i*ni*ni + j*nj + k;
+				total += fabs(data1[pos].real() - data2[pos].real());
+				total += fabs(data1[pos].imag() - data2[pos].imag());
+			}
+
+	cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
--- a/test/testFFTAsync.cpp
+++ b/test/testFFTAsync.cpp
@ -0,0 +1,117 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+
+
+using namespace std;
+
+/**
+ * Fill a 3D array so that every element holds its index along the fastest
+ * varying dimension.
+ *
+ * @param data    array with dimsize[0]*dimsize[1]*dimsize[2] elements
+ * @param dimsize extents; dimsize[0] is the fastest varying dimension
+ */
+void initData(double *data, int dimsize[3]) {
+  const int nx = dimsize[0];
+  const int ny = dimsize[1];
+  const int nz = dimsize[2];
+
+  for (int z = 0; z < nz; ++z) {
+    for (int y = 0; y < ny; ++y) {
+      // start of the current row of nx contiguous values
+      double *row = data + z*ny*nx + y*nx;
+      for (int x = 0; x < nx; ++x)
+        row[x] = x;
+    }
+  }
+}
+
+/**
+ * Issues real-to-complex 3D FFTs on two DKS streams.
+ *
+ * usage: ./testFFTAsync [N]      (default N = 8)
+ *
+ * Two host arrays are initialised and handed to allocateHostMemory, two
+ * streams are created through DKSBase, and in each of 5 iterations both data
+ * sets are written with writeDataAsync and transformed with callR2CFFT on
+ * their own stream. All device buffers, the cuFFT plan and the CUDA streams
+ * created here are released before returning.
+ */
+int main(int argc, char *argv[]) {
+
+  int N = 8;
+  if (argc == 2)
+    N = atoi(argv[1]);
+
+  int N1 = N;
+  int N2 = N;
+  int N3 = N;
+  int dim = 3;
+
+  int dimsize[3] = {N3, N2, N1};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1);
+
+  double *data1 = new double[sizereal];
+  double *data2 = new double[sizereal];
+
+  initData(data1, dimsize);
+  initData(data2, dimsize);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  /* pagelock data */
+  base.allocateHostMemory(data1, sizereal);
+  base.allocateHostMemory(data2, sizereal);
+
+  /* create streams */
+  int fft1, fft2;
+  base.createStream(fft1);
+  base.createStream(fft2);
+
+  int ierr;
+  void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2;
+
+  cout << "allocating memory ..." << endl;
+  /* allocate memory on device */
+  real_ptr1 = base.allocateMemory<double>(sizereal, ierr);
+  real_ptr2 = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr1 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
+  comp_ptr2 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
+
+  /* plain cuFFT plan and CUDA streams, only referenced by the commented-out
+     reference calls inside the loop */
+  cufftHandle defaultPlan;
+  cudaStream_t cfft1, cfft2;
+  cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z);
+  cudaStreamCreate(&cfft1);
+  cudaStreamCreate(&cfft2);
+
+
+  for (int i = 0; i < 5; i++) {
+
+    cout << "Iteration: " << i << endl;
+    /* write data to device */
+    base.writeDataAsync<double>(real_ptr1, data1, sizereal, fft1);
+    //cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1);
+
+    /* execute rcfft */
+    base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1);
+    //cufftSetStream(defaultPlan, cfft1);
+    //cufftExecD2Z(defaultPlan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2);
+
+    /* write data to device */
+    base.writeDataAsync<double>(real_ptr2, data2, sizereal, fft2);
+    //cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2);
+
+    /* execute rcfft */
+    base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2);
+    //cufftSetStream(defaultPlan, cfft2);
+    //cufftExecD2Z(defaultPlan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2);
+
+  }
+
+  /* free device memory with the element counts used for the allocations */
+  base.freeMemory<double>(real_ptr1, sizereal);
+  base.freeMemory<double>(real_ptr2, sizereal);
+  base.freeMemory< complex<double> >(comp_ptr1, sizecomp*2);
+  base.freeMemory< complex<double> >(comp_ptr2, sizecomp*2);
+
+  /* release the cuFFT plan and the CUDA streams created above */
+  cufftDestroy(defaultPlan);
+  cudaStreamDestroy(cfft1);
+  cudaStreamDestroy(cfft2);
+
+  /* free pagelock data */
+  base.freeHostMemory(data1, sizereal);
+  base.freeHostMemory(data2, sizereal);
+
+  return 0;
+
+}
--- a/test/testFFTSolver.cpp
+++ b/test/testFFTSolver.cpp
@ -0,0 +1,301 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+/**
+ * Print NI consecutive N x N planes of data to stdout.
+ *
+ * Values are tab separated, one grid row per output line, with an empty
+ * line after every plane. A non-empty message is written first, verbatim.
+ */
+void printData3D(double* data, int N, int NI, const char *message = "") {
+  if (*message != '\0')
+    cout << message;
+
+  /* the three loops cover the buffer contiguously, so walk it with a cursor */
+  const double *cell = data;
+  for (int plane = 0; plane < NI; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Fill the leading corner of an N x N x N grid with a ramp along the
+ * fastest index: the first N/4+1 planes, N/2+1 rows and N/2+1 columns get
+ * the value column+1. All other cells are left untouched.
+ */
+void initData(double *data, int N) {
+  const int planes = N / 4 + 1;
+  const int extent = N / 2 + 1;
+
+  for (int plane = 0; plane < planes; ++plane) {
+    for (int row = 0; row < extent; ++row) {
+      double *line = data + plane * N * N + row * N;
+      for (int col = 0; col < extent; ++col)
+        line[col] = col + 1;
+    }
+  }
+}
+
+/** Store the sequence 0, 1, ..., N-1 in the first N entries of data. */
+void initData2(double *data, int N) {
+  int pos = 0;
+  while (pos < N) {
+    data[pos] = pos;
+    ++pos;
+  }
+}
+
+/** Set the first N entries of d to the complex constant (2, 0). */
+void initComplex( complex<double> *d, int N) {
+  const complex<double> fill(2, 0);
+
+  for (int remaining = N; remaining > 0; --remaining)
+    *d++ = fill;
+}
+
+/** Print the first N entries of d on one line, tab separated. */
+void printComplex(complex<double> *d, int N) {
+  const complex<double> *entry = d;
+  for (int left = N; left > 0; --left, ++entry)
+    cout << *entry << "\t";
+
+  cout << endl;
+}
+
+/**
+ * Initialise an n1 x n2 x n3 field (n1 is the fastest index).
+ *
+ * Cells whose three indices are all below n/2 + 1 in their dimension get
+ * consecutive values 1, 2, 3, ... in memory order; every other cell is
+ * set to 0.
+ */
+void initMirror(double *data, int n1, int n2, int n3) {
+  const int lim1 = n1 / 2 + 1;
+  const int lim2 = n2 / 2 + 1;
+  const int lim3 = n3 / 2 + 1;
+
+  int next = 1;
+  double *cell = data;
+  for (int z = 0; z < n3; ++z) {
+    const bool zInside = z < lim3;
+    for (int y = 0; y < n2; ++y) {
+      const bool yzInside = zInside && y < lim2;
+      for (int x = 0; x < n1; ++x, ++cell)
+        *cell = (yzInside && x < lim1) ? next++ : 0;
+    }
+  }
+}
+
+/** Print a divider made of c dashes, followed by a newline. */
+void printDiv(int c) {
+  int remaining = c;
+  while (remaining-- > 0)
+    cout << "-";
+
+  cout << endl;
+}
+
+/**
+ * Print an n1 x n2 x n3 field (n1 is the fastest index) below a
+ * 75-character divider: tab separated values, one row per line, an empty
+ * line after every plane and one more at the end.
+ */
+void printMirror(double *data, int n1, int n2, int n3) {
+  /* divider line */
+  for (int dash = 0; dash < 75; ++dash)
+    cout << "-";
+  cout << endl;
+
+  const double *cell = data;
+  for (int plane = 0; plane < n3; ++plane) {
+    for (int row = 0; row < n2; ++row) {
+      for (int col = 0; col < n1; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+/** Return the sum of the first datasize entries of data, added in order. */
+double sumData(double *data, int datasize) {
+  double total = 0;
+
+  const double *cursor = data;
+  for (int left = datasize; left > 0; --left)
+    total += *cursor++;
+
+  return total;
+}
+
+/**
+ * Driver for the multi-process FFT Poisson solver test.
+ *
+ * Rank 0 allocates all device buffers and runs the Green's function
+ * kernels and the FFTs. Every rank gathers its local slab of the rho field
+ * into the rho buffer of rank 0 (shared through sendPointer /
+ * receivePointer) and scatters the result back. The solve is repeated
+ * 10000 times and rank 0 reports every iteration whose result sum differs
+ * from that of the first iteration.
+ *
+ * The slab layout (4 slabs along the second dimension, 2 along the third)
+ * is hard-coded, so the test only runs with exactly 8 MPI processes.
+ */
+int main(int argc, char *argv[]) {
+
+  /* mpi init */
+  int rank, nprocs;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  if (nprocs != 8) {
+    cout << "example was set to run with 8 processes" << endl;
+    cout << "exit..." << endl;
+    /* MPI has to be shut down on every exit path, not only the normal one */
+    MPI_Finalize();
+    return 0;
+  }
+
+  /* set domain size */
+  int NG[3] = {64, 64, 32};
+  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+  int sizerho = NG[0] * NG[1] * NG[2];
+  int sizegreen = ng[0] * ng[1] * ng[2];
+  /* NOTE(review): testGatherAsync2 sizes its complex buffer as
+     (N0/2 + 1) * N1 * N2, which is larger than this value - confirm which
+     size callR2CFFT actually needs */
+  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
+  int id[3];
+
+  /* start index of this rank's slab inside the global domain */
+  id[0] = 0;
+  id[1] = NL[1] * (rank % 4);
+  id[2] = NL[2] * (rank / 4);
+
+  /* print some messages about the example at the beginning */
+  if (rank == 0) {
+    cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+    cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
+    cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+    cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
+    int tmp[3];
+    for (int p = 1; p < nprocs; p++) {
+      MPI_Status mpistatus;
+      MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
+      cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
+    }
+  } else {
+    MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
+  }
+
+  /* dks init and create 2 streams */
+  int dkserr;
+  int streamGreens, streamFFT;
+  DKSBase base;// = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.createStream(streamFFT);
+  if (rank == 0) {
+    /* streamGreens is only created, and only used, on rank 0 */
+    base.createStream(streamGreens);
+    base.setupFFT(3, NG);
+  }
+
+  /* allocate memory and init rho field */
+  double *rho = new double[sizerho];
+  double *rho_out = new double[sizerho];
+  //double *green_out = new double[sizegreen];
+  initMirror(rho, NL[0], NL[1], NL[2]);
+
+  /*
+    allocate memory on device for
+    - rho field
+    - rho FFT
+    - tmpgreen
+    - greens integral
+    - greens integral FFT
+  */
+  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+  if (rank == 0) {
+    tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+    rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+    grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+    rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+    grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+  } else {
+    grntr_ptr = NULL;
+    rho2_ptr = NULL;
+    grn_ptr = NULL;
+    rho2tr_ptr = NULL;
+    tmpgreen_ptr = NULL;
+  }
+
+  /* send and receive pointer to allocated memory on device */
+  if (rank == 0) {
+    for (int p = 1; p < nprocs; p++)
+      base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
+  } else {
+    rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  /* =================================================*/
+  /* =================================================*/
+  /* ====loop through fftpoisson solver iterations====*/
+  /* =================================================*/
+  /* =================================================*/
+
+  double old_sum = 0;
+  double tmp_sum = 0;
+  for (int l = 0; l < 10000; l++) {
+    MPI_Barrier(MPI_COMM_WORLD);
+    /* on node 0, calculate tmpgreen on gpu */
+    int hr_m[3] = {1, 1, 1};
+    if (rank == 0)
+      base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
+			      hr_m[0], hr_m[1], hr_m[2], streamGreens);
+
+    /* calculate greens integral on gpu */
+    if (rank == 0)
+      base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens);
+
+    /* mirror the field */
+    if (rank == 0)
+      base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens);
+
+
+    /* get FFT of mirrored greens integral */
+    if (rank == 0)
+      base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens);
+
+    /* transfer rho field to device */
+    base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* get FFT of rho field */
+    if (rank == 0) {
+      base.syncDevice();
+      base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
+    }
+
+    /* multiply both FFTs */
+    if (rank == 0)
+      base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* inverse fft and transfer data back */
+    /*
+       multiple device syncs and mpi barriers are used to make sure data
+       transfer is started when results are ready and program moves on
+       only when data transfer is finished
+    */
+    if (rank == 0) {
+      base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+      //cout << "result: " << sumData(rho_out, sizerho) << endl;
+      /* exact comparison on purpose: repeated runs are expected to be
+         bit-identical to the first one */
+      if (l == 0) {
+	old_sum = sumData(rho_out, sizerho);
+      } else {
+	tmp_sum = sumData(rho_out, sizerho);
+	if (old_sum != tmp_sum) {
+	  cout << "diff in iteration: " << l << endl;
+	}
+      }
+    } else {
+      /* the three barriers pair up with the three barriers of rank 0 */
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+
+  }
+  /* =================================================*/
+  /* =================================================*/
+  /* ==========end fftpoisson solver test run=========*/
+  /* =================================================*/
+  /* =================================================*/
+
+
+
+  /* free memory on device */
+  if (rank == 0) {
+    base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+    base.freeMemory<double>(grn_ptr, sizerho);
+    base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+    base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+    /* wait until every other rank has closed its handle to rho2_ptr */
+    MPI_Barrier(MPI_COMM_WORLD);
+    base.freeMemory<double>(rho2_ptr, sizerho);
+    cout << "Final sum: " << old_sum << endl;
+  } else {
+    base.closeHandle(rho2_ptr);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+
+  /* free host memory */
+  delete[] rho;
+  delete[] rho_out;
+
+  MPI_Finalize();
+  return 0;
+
+}
--- a/test/testFFTSolver_MIC.cpp
+++ b/test/testFFTSolver_MIC.cpp
@ -0,0 +1,319 @@
+#include <iostream>
+//#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+/**
+ * Dump NI planes of N x N values to stdout: tab separated, one row per
+ * line, an empty line after each plane. A non-empty message is printed
+ * first, verbatim.
+ */
+void printData3D(double* data, int N, int NI, const char *message = "") {
+  const bool hasMessage = (message[0] != '\0');
+  if (hasMessage)
+    cout << message;
+
+  for (int plane = 0; plane < NI; ++plane) {
+    const double *planeStart = data + plane * N * N;
+    for (int row = 0; row < N; ++row) {
+      const double *rowStart = planeStart + row * N;
+      for (int col = 0; col < N; ++col)
+        cout << rowStart[col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Write a ramp (column index + 1) into the leading corner of an
+ * N x N x N grid: planes 0..N/4, rows 0..N/2 and columns 0..N/2.
+ * Cells outside that corner are not touched.
+ */
+void initData(double *data, int N) {
+  const int planeCount = N / 4 + 1;
+  const int halfCount = N / 2 + 1;
+  const int planeStride = N * N;
+
+  for (int plane = 0; plane < planeCount; ++plane)
+    for (int row = 0; row < halfCount; ++row)
+      for (int col = 0; col < halfCount; ++col)
+        data[plane * planeStride + row * N + col] = col + 1;
+}
+
+/** Fill data[0..N-1] with the index of each entry. */
+void initData2(double *data, int N) {
+  double *slot = data;
+  for (int value = 0; value < N; ++value, ++slot)
+    *slot = value;
+}
+
+/** Assign the complex value (2, 0) to each of the first N entries of d. */
+void initComplex( complex<double> *d, int N) {
+  int index = 0;
+  while (index < N) {
+    d[index] = complex<double>(2, 0);
+    ++index;
+  }
+}
+
+/** Write the first N entries of d to stdout on a single tab separated line. */
+void printComplex(complex<double> *d, int N) {
+  int index = 0;
+  while (index < N) {
+    cout << d[index] << "\t";
+    ++index;
+  }
+  cout << endl;
+}
+
+/**
+ * Fill an n1 x n2 x n3 field (n1 is the fastest index): cells with every
+ * index below n/2 + 1 of its dimension are numbered 1, 2, 3, ... in memory
+ * order, all remaining cells are zeroed.
+ */
+void initMirror(double *data, int n1, int n2, int n3) {
+  const int bound1 = n1 / 2 + 1;
+  const int bound2 = n2 / 2 + 1;
+  const int bound3 = n3 / 2 + 1;
+
+  int counter = 1;
+  for (int z = 0; z < n3; ++z) {
+    for (int y = 0; y < n2; ++y) {
+      double *line = data + z * n2 * n1 + y * n1;
+      const bool lineInside = (z < bound3) && (y < bound2);
+      for (int x = 0; x < n1; ++x) {
+        if (lineInside && x < bound1)
+          line[x] = counter++;
+        else
+          line[x] = 0;
+      }
+    }
+  }
+}
+
+/** Emit c dash characters and end the line. */
+void printDiv(int c) {
+  for (int left = c; left > 0; --left)
+    cout << "-";
+
+  cout << endl;
+}
+
+/**
+ * Print a 75-dash divider followed by the n1 x n2 x n3 field (n1 is the
+ * fastest index): tab separated rows, an empty line after each plane and
+ * a final empty line.
+ */
+void printMirror(double *data, int n1, int n2, int n3) {
+  /* divider line */
+  for (int dash = 0; dash < 75; ++dash)
+    cout << "-";
+  cout << endl;
+
+  for (int plane = 0; plane < n3; ++plane) {
+    for (int row = 0; row < n2; ++row) {
+      const double *line = data + plane * n2 * n1 + row * n1;
+      for (int col = 0; col < n1; ++col)
+        cout << line[col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+/** Add up data[0..datasize-1] front to back and return the total. */
+double sumData(double *data, int datasize) {
+  double total = 0;
+  int index = 0;
+
+  while (index < datasize)
+    total += data[index++];
+
+  return total;
+}
+
+/**
+ * Single-process variant of the FFT Poisson solver test (no MPI).
+ *
+ * Runs the Green's function kernels, the forward FFTs, the complex
+ * multiplication and the inverse FFT 100 times on one device and reports
+ * every iteration whose result sum differs from that of the first
+ * iteration. The backend (MIC/OpenMP or CUDA) is chosen at compile time
+ * through DKS_MIC / DKS_CUDA.
+ */
+int main(int argc, char *argv[]) {
+
+  /* set domain size */
+  int NG[3] = {64, 64, 32};
+  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+  int sizerho = NG[0] * NG[1] * NG[2];
+  int sizegreen = ng[0] * ng[1] * ng[2];
+  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
+
+  /* print some messages about the example at the beginning */
+  cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+  cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+
+  /* dks init: exactly one backend is selected at compile time */
+  int dkserr;
+#if defined(DKS_MIC)
+  DKSBase base;
+  base.setAPI("OpenMP", 6);
+  base.setDevice("-mic", 4);
+  base.initDevice();
+#elif defined(DKS_CUDA)
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+#else
+#error "testFFTSolver_MIC needs DKS_MIC or DKS_CUDA to be defined"
+#endif
+
+  base.setupFFT(3, NG);
+
+  /*
+    allocate memory and init rho field; initMirror only fills the leading
+    NL-sized part, but the whole buffer is written to the device below, so
+    the remainder is value-initialised to zero instead of being left
+    indeterminate
+  */
+  double *rho = new double[sizerho]();
+  double *rho_out = new double[sizerho];
+  initMirror(rho, NL[0], NL[1], NL[2]);
+
+  /*
+    allocate memory on device for
+    - rho field
+    - rho FFT
+    - tmpgreen
+    - greens integral
+    - greens integral FFT
+  */
+  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+  tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+  rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+  grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+
+  /* =================================================*/
+  /* =================================================*/
+  /* ====loop through fftpoisson solver iterations====*/
+  /* =================================================*/
+  /* =================================================*/
+
+  double old_sum = 0;
+  double tmp_sum = 0;
+  for (int l = 0; l < 100; l++) {
+    /* calculate tmpgreen on the device */
+    int hr_m[3] = {1, 1, 1};
+    base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
+                            hr_m[0], hr_m[1], hr_m[2]);
+
+    /* calculate greens integral on the device */
+    base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+
+    /* mirror the field */
+    base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+
+    /* get FFT of mirrored greens integral */
+    base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+
+    /* transfer rho field to device */
+    base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
+
+    /* get FFT of rho field */
+    base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
+
+    /* multiply both FFTs */
+    base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+
+    /* inverse fft and transfer data back */
+    base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+    base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
+
+    /* exact comparison on purpose: repeated runs are expected to be
+       bit-identical to the first one */
+    if (l == 0) {
+      old_sum = sumData(rho_out, sizerho);
+    } else {
+      tmp_sum = sumData(rho_out, sizerho);
+      if (old_sum != tmp_sum) {
+        cout << "diff in iteration: " << l << endl;
+      }
+    }
+  }
+  /* =================================================*/
+  /* =================================================*/
+  /* ==========end fftpoisson solver test run=========*/
+  /* =================================================*/
+  /* =================================================*/
+
+  /* free memory on device */
+  base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+  base.freeMemory<double>(grn_ptr, sizerho);
+  base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+  base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+  base.freeMemory<double>(rho2_ptr, sizerho);
+  cout << "Final sum: " << old_sum << endl;
+
+  /* free host memory */
+  delete[] rho;
+  delete[] rho_out;
+
+  return 0;
+}
--- a/test/testGather.cpp
+++ b/test/testGather.cpp
@ -0,0 +1,172 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+/**
+ * Print an N x N x N integer cube to stdout: tab separated values, one
+ * row per line, an empty line after each plane. A non-empty message is
+ * printed first, verbatim.
+ */
+void printData3D(int* data, int N, const char *message = "") {
+  if (*message != '\0')
+    cout << message;
+
+  /* all three loops run over the full extent, so the walk is contiguous */
+  const int *cell = data;
+  for (int plane = 0; plane < N; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+/**
+ * Print an nx x ny x nz integer field (nx is the fastest index): tab
+ * separated values, one row of nx values per line, an empty line after
+ * each of the nz planes. A non-empty message is printed first, verbatim.
+ */
+void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
+
+  if (*message != '\0')
+    cout << message;
+
+  const int *cell = data;
+  for (int plane = 0; plane < nz; ++plane) {
+    for (int row = 0; row < ny; ++row) {
+      for (int col = 0; col < nx; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+/**
+ * Print nprocs rows of N integers each, tab separated, one row per line.
+ * A non-empty message is printed first, verbatim.
+ */
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  if (*message != '\0')
+    cout << message;
+
+  for (int proc = 0; proc < nprocs; ++proc) {
+    const int *row = data + proc * N;
+    for (int col = 0; col < N; ++col)
+      cout << row[col] << "\t";
+    cout << endl;
+  }
+}
+
+/** Set the first N entries of data to rank + 1. */
+void initData(int *data, int N, int rank) {
+  const int value = rank + 1;
+  int pos = 0;
+  while (pos < N)
+    data[pos++] = value;
+}
+
+/**
+ * Test of the blocking 3D gather / scatter through rank 0.
+ *
+ * Every rank fills a local block with rank + 1. In each of two rounds
+ * rank 0 allocates the device buffers, runs the Green's integral kernel,
+ * gathers all blocks into device memory and scatters them back; the other
+ * ranks only take part in the gather and the scatter. nvtx markers
+ * bracket the individual steps for profiling.
+ *
+ * NOTE(review): idx/idy/idz hold four entries and are passed together
+ * with nprocs - presumably one start index per process, so running with
+ * more than 4 processes looks unsafe; confirm against gather3DData.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  int N_global[3] = {64, 64, 32};
+  int N_local[3] = {64, 32, 16};
+  int n = N_local[0] * N_local[1] * N_local[2];
+
+  int idx[4] = {0, 0, 0, 0};
+  int idy[4] = {0, 32, 0, 32};
+  int idz[4] = {0, 0, 16, 16};
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+
+  /* remember which allocator produced hdata_in so that the matching
+     deallocation is used at the end */
+  int *hdata_in;
+  bool pinned = true;
+  if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
+    hdata_in = new int[n];
+    pinned = false;
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, n, rank);
+
+
+  for (int i = 0; i < 2; i++) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (i == 1)
+      nvtxMarkA("start gather");
+
+    if (rank == 0) {
+
+      void *mem_ptr, *tmpgreen_ptr;
+
+      mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+
+      //call another kernel
+      int sizegreen = 33 * 33 * 17;
+      tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
+      nvtxMarkA("call green");
+      base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007);
+
+      nvtxMarkA("call gather");
+      base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
+			idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      //read and print data once for debug only
+      /*
+      if (i == 0 && nprocs*n < 257) {
+      int *hdata_out_all = new int[nprocs*n];
+      base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
+      printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]);
+      }
+
+      else {
+	int *hout_data = new int[nprocs*n];
+	base.readData<int>(mem_ptr, hout_data, nprocs*n);
+	int sum = 0;
+	for (int s = 0; s < nprocs*n; s++)
+	  sum += hout_data[s];
+
+	cout << "Sum: " << sum << endl;
+      }
+      */
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("call scatter");
+      base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
+			 idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      base.freeMemory<int>(mem_ptr, n*nprocs);
+      base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+
+    } else {
+
+      /* non-root ranks own no device buffer and pass NULL */
+      nvtxMarkA("call gather");
+      base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
+			idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("call scatter");
+      base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local,
+			 idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+    }
+
+    if (i == 1)
+      nvtxMarkA("end gather");
+
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+  /* the new[] fallback buffer must not go through the pinned-memory free */
+  if (pinned)
+    base.freeHostMemory(hdata_in, n);
+  else
+    delete[] hdata_in;
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
--- a/test/testGatherAsync.cpp
+++ b/test/testGatherAsync.cpp
@ -0,0 +1,144 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+/**
+ * Write an N x N x N integer cube to stdout, preceded by message when it
+ * is non-empty. Values are tab separated, each row ends with a newline
+ * and each plane is followed by an empty line.
+ */
+void printData3D(int* data, int N, const char *message = "") {
+  if (message[0] != '\0')
+    cout << message;
+
+  const int planeSize = N * N;
+  for (int plane = 0; plane < N; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      const int *line = data + plane * planeSize + row * N;
+      for (int col = 0; col < N; ++col)
+        cout << line[col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+/**
+ * Write nprocs lines of N tab separated integers to stdout, preceded by
+ * message when it is non-empty.
+ */
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  if (message[0] != '\0')
+    cout << message;
+
+  const int *cell = data;
+  for (int proc = 0; proc < nprocs; ++proc) {
+    for (int col = 0; col < N; ++col)
+      cout << *cell++ << "\t";
+    cout << endl;
+  }
+}
+
+/** Write rank + 1 into each of the first N entries of data. */
+void initData(int *data, int N, int rank) {
+  const int value = rank + 1;
+  for (int *slot = data; N > 0; --N, ++slot)
+    *slot = value;
+}
+
+/**
+ * Test that overlaps a non-blocking 3D gather with a kernel launch.
+ *
+ * In each of two rounds rank 0 starts gather3DDataAsync, launches the
+ * Green's integral kernel on a second stream while the gather is in
+ * flight, then waits for the gather request. The other ranks only take
+ * part in the gather. The cuda profiler is active on rank 0 for the whole
+ * run.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  //mpi copy
+  int n = 32*16*16;
+  int N_global[3] = {32, 32, 32};
+  int N_local[3] = {32, 16, 16};
+  int idx[4] = {0, 0, 0, 0};
+  int idy[4] = {0, 0, 16, 16};
+  int idz[4] = {0, 16, 0, 16};
+
+  //greens kernel
+  int n1 = 33;
+  int n2 = 33;
+  int n3 = 17;
+  int sizegreen = n1*n2*n3;
+
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  /* remember which allocator produced hdata_in so that the matching
+     deallocation is used at the end */
+  int *hdata_in;
+  bool pinned = true;
+  if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
+    hdata_in = new int[n];
+    pinned = false;
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, n, rank);
+
+  int stream2;
+  for (int i = 0; i < 2; i++) {
+
+    if (rank == 0) {
+      if (i == 0) {
+	cudaProfilerStart();
+	base.createStream(stream2);
+      }
+
+      nvtxMarkA("start gather");
+
+      void *mem_ptr, *green_ptr;
+
+      mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+      /* the other tests allocate and read the Green's integral buffer as
+         double; an int buffer holds only half of that */
+      green_ptr = base.allocateMemory<double>(sizegreen, ierr);
+
+      nvtxMarkA("call gather");
+      /* start from MPI_REQUEST_NULL so the wait below is well defined even
+         if the gather does not set the request */
+      MPI_Request request = MPI_REQUEST_NULL;
+      MPI_Status status;
+
+      base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local,
+			     idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
+			     request);
+
+
+      nvtxMarkA("call kernel");
+      base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1,
+			      4.160715e-03, 4.474911e-03, 1.247311e-02, stream2);
+
+      MPI_Wait(&request, &status);
+
+
+      base.freeMemory<int>(mem_ptr, n*nprocs);
+      base.freeMemory<double>(green_ptr, sizegreen);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("end gather");
+
+      if (i == 1) cudaProfilerStop();
+    } else {
+
+      MPI_Request request = MPI_REQUEST_NULL;
+      base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local,
+			     idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD,
+			     request);
+
+      /* complete the request before hdata_in is reused or freed;
+         waiting on MPI_REQUEST_NULL returns immediately */
+      MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+  }
+
+  /* the new[] fallback buffer must not go through the pinned-memory free */
+  if (pinned)
+    base.freeHostMemory(hdata_in, n);
+  else
+    delete[] hdata_in;
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
--- a/test/testGatherAsync2.cpp
+++ b/test/testGatherAsync2.cpp
@ -0,0 +1,205 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+/**
+ * Print an N x N x N integer cube, plane by plane, to stdout. Values are
+ * tab separated; every row ends with a newline and every plane with an
+ * extra empty line. A non-empty message is printed first, verbatim.
+ */
+void printData3D(int* data, int N, const char *message = "") {
+  const bool hasMessage = (*message != '\0');
+  if (hasMessage)
+    cout << message;
+
+  int offset = 0;
+  for (int plane = 0; plane < N; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col, ++offset)
+        cout << data[offset] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Print an nx x ny x nz integer field (nx is the fastest index) to
+ * stdout: nz planes of ny rows with nx tab separated values each, with an
+ * empty line after every plane. A non-empty message is printed first.
+ */
+void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
+
+  if (message[0] != '\0')
+    cout << message;
+
+  int offset = 0;
+  for (int plane = 0; plane < nz; ++plane) {
+    for (int row = 0; row < ny; ++row) {
+      for (int col = 0; col < nx; ++col, ++offset)
+        cout << data[offset] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+/**
+ * Print the first nprocs * N integers of data on a single tab separated
+ * line, followed by an empty line. A non-empty message is printed first.
+ */
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  if (*message != '\0')
+    cout << message;
+
+  const int total = nprocs * N;
+  int pos = 0;
+  while (pos < total) {
+    cout << data[pos] << "\t";
+    ++pos;
+  }
+  cout << endl << endl;
+
+}
+
+/** Fill data[0..N-1] with the constant rank + 1. */
+void initData(int *data, int N, int rank) {
+  const int value = rank + 1;
+  int *slot = data;
+  for (int left = N; left > 0; --left)
+    *slot++ = value;
+}
+
+/**
+ * Test of the stream based 3D gather / scatter with a shared device
+ * pointer.
+ *
+ * Rank 0 allocates the device buffers and sends the field pointer to all
+ * other ranks. Every rank gathers its local block (filled with rank + 1)
+ * into that buffer and scatters it back into a second host buffer; the
+ * sums of the sent and the received block are printed for comparison.
+ * Rank 0 additionally launches the Green's integral kernel on a second
+ * stream while the transfer is in progress.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  //cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  int Ng[3] = {128, 128, 64};
+  int Nl[3] = {128, 64, 32};
+  int nglobal = Ng[0] * Ng[1] * Ng[2];
+  int nlocal = Nl[0] * Nl[1] * Nl[2];
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  /* remember which allocator produced each host buffer so that the
+     matching deallocation is used at the end */
+  int *hdata_in;
+  bool pinned_in = true;
+  if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
+    hdata_in = new int[nlocal];
+    pinned_in = false;
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, nlocal, rank);
+
+  int *hdata_out;
+  bool pinned_out = true;
+  if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
+    hdata_out = new int[nlocal];
+    pinned_out = false;
+    cout << "pinned allocation failed!" << endl;
+  }
+
+  //create streams for async execution
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+
+  if (rank == 0)
+    base.setupFFT(3, Ng);
+
+  for (int i = 0; i < 1; i++) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (i == 1)
+      nvtxMarkA("start gather");
+
+    if (rank == 0) {
+
+      int id[3] = {0, 0, 0};
+
+      void *mem_ptr, *tmpgreen_ptr, *comp_ptr;
+
+      //allocate memory on device
+      int sizegreen = 65 * 65 * 33;
+      int sizecomp = 65 * 128 * 64;
+      /* mem_ptr is sized for doubles although only ints are gathered in
+         this test (the R2C FFT call below is disabled) */
+      mem_ptr = base.allocateMemory<double>(nglobal, ierr);
+      tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);
+      comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+      //send pointer to other processes
+      nvtxMarkA("call gather");
+      for (int j = 1; j < nprocs; j++)
+	base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);
+
+      //call another kernel while data transfer is processing
+      nvtxMarkA("call green");
+      base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);
+
+      //write data to device
+      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
+
+      /* execute rcfft */
+      //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);
+
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      //read data from device
+      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+
+
+      base.freeMemory<double>(mem_ptr, nglobal);
+      base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+      base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+
+    } else {
+
+      /* start index of this rank's block inside the global domain */
+      void *mem_ptr;
+      int idy = 0;
+      int idz = 0;//Nl[2]*rank;
+      if (rank / 2 == 1) idy = Ng[1] / 2;
+      if (rank % 2 == 1) idz = Ng[2] / 2;
+      int id[3] = {0, idy, idz};
+
+      nvtxMarkA("call gather");
+      mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);
+      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
+
+      /* the three barriers pair up with the three barriers of rank 0 */
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      base.closeHandle(mem_ptr);
+
+    }
+
+    int sum1 = 0;
+    for (int c = 0; c < nlocal; c++)
+      sum1 += hdata_in[c];
+
+    int sum2 = 0;
+    for (int c = 0; c < nlocal; c++)
+      sum2 += hdata_out[c];
+
+    cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;
+
+
+    if (i == 1)
+      nvtxMarkA("end gather");
+
+  }
+
+  //printData(hdata_in, nlocal, 1);
+  MPI_Barrier(MPI_COMM_WORLD);
+  /* release both host buffers; a new[] fallback buffer must not go
+     through the pinned-memory free */
+  if (pinned_in)
+    base.freeHostMemory(hdata_in, nlocal);
+  else
+    delete[] hdata_in;
+  if (pinned_out)
+    base.freeHostMemory(hdata_out, nlocal);
+  else
+    delete[] hdata_out;
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
--- a/test/testGreens.cpp
+++ b/test/testGreens.cpp
@ -0,0 +1,239 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+#include <complex>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+/**
+ * Print NI planes of an N x N grid to stdout. Values are tab separated,
+ * each row is terminated by a newline and each plane by an extra empty
+ * line. A non-empty message is printed first, verbatim.
+ */
+void printData3D(double* data, int N, int NI, const char *message = "") {
+  if (message[0] != '\0')
+    cout << message;
+
+  int offset = 0;
+  for (int plane = 0; plane < NI; ++plane) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col, ++offset)
+        cout << data[offset] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+/**
+ * Initialise the leading corner of an N x N x N grid (N/4+1 planes,
+ * N/2+1 rows, N/2+1 columns) so that each cell holds its column index
+ * plus one. Cells outside the corner keep their previous contents.
+ */
+void initData(double *data, int N) {
+  const int planes = N / 4 + 1;
+  const int half = N / 2 + 1;
+
+  for (int plane = 0; plane < planes; ++plane) {
+    double *planeStart = data + plane * N * N;
+    for (int row = 0; row < half; ++row) {
+      double *line = planeStart + row * N;
+      int col = 0;
+      while (col < half) {
+        line[col] = col + 1;
+        ++col;
+      }
+    }
+  }
+}
+
+/** Write the ramp 0, 1, ..., N-1 into data. */
+void initData2(double *data, int N) {
+  for (int value = 0; value != N && value < N; ++value)
+    *(data + value) = value;
+}
+
+/** Overwrite the first N entries of d with the complex value (2, 0). */
+void initComplex( complex<double> *d, int N) {
+  const complex<double> two(2, 0);
+  int index = 0;
+
+  while (index < N)
+    d[index++] = two;
+}
+
+/** Print d[0..N-1] to stdout as one tab separated line. */
+void printComplex(complex<double> *d, int N) {
+  for (const complex<double> *entry = d; N > 0; --N, ++entry)
+    cout << *entry << "\t";
+
+  cout << endl;
+}
+
+/**
+ * Prepare an n1 x n2 x n3 field (n1 is the fastest index): every cell
+ * whose indices are all below n/2 + 1 of the respective dimension gets
+ * the next value of a counter starting at 1; all other cells become 0.
+ */
+void initMirror(double *data, int n1, int n2, int n3) {
+  const int cut1 = n1 / 2 + 1;
+  const int cut2 = n2 / 2 + 1;
+  const int cut3 = n3 / 2 + 1;
+
+  int counter = 1;
+  int offset = 0;
+  for (int z = 0; z < n3; ++z) {
+    for (int y = 0; y < n2; ++y) {
+      for (int x = 0; x < n1; ++x, ++offset) {
+        const bool inside = (z < cut3) && (y < cut2) && (x < cut1);
+        data[offset] = inside ? counter++ : 0;
+      }
+    }
+  }
+}
+
+/** Print a horizontal rule of c dashes and terminate the line. */
+void printDiv(int c) {
+  int printed = 0;
+  while (printed < c) {
+    cout << "-";
+    ++printed;
+  }
+  cout << endl;
+}
+
+/**
+ * Print a 75-dash divider, then the n1 x n2 x n3 field (n1 is the fastest
+ * index) as tab separated rows with an empty line after each plane,
+ * followed by one more empty line.
+ */
+void printMirror(double *data, int n1, int n2, int n3) {
+  /* divider line */
+  for (int dash = 0; dash < 75; ++dash)
+    cout << "-";
+  cout << endl;
+
+  int offset = 0;
+  for (int plane = 0; plane < n3; ++plane) {
+    for (int row = 0; row < n2; ++row) {
+      for (int col = 0; col < n1; ++col, ++offset)
+        cout << data[offset] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+/** Accumulate data[0..datasize-1] in index order and return the sum. */
+double sumData(double *data, int datasize) {
+  double total = 0;
+
+  for (const double *cursor = data; datasize > 0; --datasize, ++cursor)
+    total += *cursor;
+
+  return total;
+}
+
+
+
+/**
+ * Test of the Green's function kernels and of the complex field
+ * multiplication on a small 8 x 8 x 4 grid.
+ *
+ * The Green's integral, the Green's integration and the field mirroring
+ * are run one after the other; after each step the result is read back
+ * and its sum is printed. Afterwards two complex arrays are multiplied on
+ * the device and the first entries are printed before and after.
+ */
+int main(int argc, char *argv[]) {
+
+  int ierr;
+
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 4;
+
+  int n1 = N1 / 2;
+  int n2 = N2 / 2;
+  int n3 = N3 / 2;
+
+  int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1);
+  int sizerho = N1 * N2 * N3;
+
+  double *data_green; //= new double[sizegreen];
+  double *data_rho; //= new double[sizerho];
+
+  double hr_m0 = +4.0264984513873269e-04;
+  double hr_m1 = +4.3305596731911289e-04;
+  double hr_m2 = +8.3154085085560838e-04;
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+  cout << "ID stream1: " << stream1 << endl;
+  cout << "ID stream2: " << stream2 << endl;
+
+  /* mem_green2 / mem_rho2 are only used by the disabled second pass below */
+  void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2;
+
+  mem_green1 = base.allocateMemory<double>(sizegreen, ierr);
+  mem_green2 = base.allocateMemory<double>(sizegreen, ierr);
+  mem_rho1 = base.allocateMemory<double>(sizerho, ierr);
+  mem_rho2 = base.allocateMemory<double>(sizerho, ierr);
+
+  printDiv(50);
+
+  data_green = new double[sizegreen];
+  data_rho = new double[sizerho];
+
+  base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1,
+			  hr_m0, hr_m1, hr_m2, stream1);
+  base.readData<double>(mem_green1, data_green, sizegreen);
+  cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
+  /* the number format set here stays in effect for all later output */
+  cout << scientific << setprecision(16);
+  for (int p = 0; p < 7; p++)
+    cout << data_green[p] << "\t";
+  cout << endl;
+  //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
+
+  base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1);
+  base.readData<double>(mem_rho1, data_rho, sizerho);
+  cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1);
+  base.readData<double>(mem_rho1, data_rho, sizerho);
+  cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  printDiv(50);
+
+  /*
+  base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1,
+			  1, 1, 1, -2);
+  base.readData<double>(mem_green2, data_green, sizegreen);
+  cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
+  //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
+
+  base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2);
+  base.readData<double>(mem_rho2, data_rho, sizerho);
+  cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2);
+  base.readData<double>(mem_rho2, data_rho, sizerho);
+  cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+  */
+  printDiv(50);
+
+  base.freeMemory<double>(mem_green1, sizegreen);
+  base.freeMemory<double>(mem_green2, sizegreen);
+  base.freeMemory<double>(mem_rho1, sizerho);
+  base.freeMemory<double>(mem_rho2, sizerho);
+
+  delete [] data_green;
+  delete [] data_rho;
+
+  //test complex multiplication
+  int compsize = 300;
+  complex<double> *data1 = new complex<double>[compsize];
+  complex<double> *data2 = new complex<double>[compsize];
+  for (int i = 0; i < compsize; i++) {
+    data1[i] = complex<double>(i+1, i+2);
+    data2[i] = complex<double>(i+3, i+4);
+  }
+
+  for (int i = 0; i < 3; i++)
+    cout << data1[i] << "\t";
+  cout << endl;
+  for (int i = 0; i < 3; i++)
+    cout << data2[i] << "\t";
+  cout << endl;
+
+  void *ptr1, *ptr2;
+  ptr1 = base.allocateMemory< complex<double> >(compsize, ierr);
+  ptr2 = base.allocateMemory< complex<double> >(compsize, ierr);
+
+  base.writeData< complex<double> >(ptr1, data1, compsize);
+  base.writeData< complex<double> >(ptr2, data2, compsize);
+
+  base.callMultiplyComplexFields(ptr1, ptr2, compsize);
+
+  /* the result is read back from ptr1 into data1 */
+  base.readData< complex<double> >(ptr1, data1, compsize);
+
+  for (int i = 0; i < 3; i++)
+    cout << data1[i] << "\t";
+  cout << endl;
+
+  base.freeMemory< complex<double> >(ptr1, compsize);
+  base.freeMemory< complex<double> >(ptr2, compsize);
+
+  /* free the host side complex arrays as well */
+  delete [] data1;
+  delete [] data2;
+
+  return 0;
+}
--- a/test/testImageReconstruction.cpp
+++ b/test/testImageReconstruction.cpp
@ -0,0 +1,191 @@
+#include <iostream>
+#include <cstdlib>
+#include <sys/time.h>
+#include "DKSImageReconstruction.h"
+
+// Cartesian coordinates of one voxel; main() also uses it to hold the position
+// of the source that is passed to calculate_source().
+struct voxelPosition {
+  float x;
+  float y;
+  float z;
+};
+
+// Populate image[0 .. size-1] with pseudo-random values: each entry is a rand()
+// draw divided by RAND_MAX, so every value lies in [0, 1].
+void initImage(float *image, int size) {
+  int filled = 0;
+  while (filled < size) {
+    const float draw = (float)rand();
+    image[filled] = draw / RAND_MAX;
+    ++filled;
+  }
+}
+
+// Lay out an N x N x N lattice in voxel[]: k (x) is the fastest running index,
+// then j (y), then i (z). Along each axis the first voxel sits at 0.0 and every
+// following voxel is the previous one on that axis plus 0.1.
+void initPosition(voxelPosition *voxel, int N) {
+  const int rowStride = N;
+  const int slabStride = N * N;
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        voxelPosition *cur = voxel + (i * slabStride + j * rowStride + k);
+
+        // each coordinate is chained from the neighbour one step back on its axis
+        cur->x = (k == 0) ? 0.0 : (cur - 1)->x + 0.1;
+        cur->y = (j == 0) ? 0.0 : (cur - rowStride)->y + 0.1;
+        cur->z = (i == 0) ? 0.0 : (cur - slabStride)->z + 0.1;
+      }
+    }
+  }
+}
+
+// Print the first `size` voxels as three tab-separated rows: all x values,
+// then all y values, then all z values.
+void printPosition(voxelPosition *voxel, int size) {
+  float voxelPosition::*const axes[3] = {
+    &voxelPosition::x, &voxelPosition::y, &voxelPosition::z
+  };
+
+  for (int axis = 0; axis < 3; ++axis) {
+    for (int idx = 0; idx < size; ++idx)
+      std::cout << voxel[idx].*axes[axis] << "\t";
+    std::cout << std::endl;
+  }
+}
+
+#define DIAMETER 2.0
+// Returns true when voxel `id` of image_tmp lies strictly closer to source_temp
+// than half of DIAMETER (Euclidean distance), false otherwise.
+bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id)
+{
+  const voxelPosition &candidate = image_tmp[id];
+
+  // squared offsets per axis
+  float dx2 = pow(candidate.x-source_temp.x,2);
+  float dy2 = pow(candidate.y-source_temp.y,2);
+  float dz2 = pow(candidate.z-source_temp.z,2);
+
+  float separation = sqrt(dx2 + dy2 + dz2);
+  return separation < DIAMETER*0.5;
+}
+
+// Host reference for the source statistics.
+//
+// Collects image_space[v] for every voxel v in [0, total_voxels) that
+// select_source() accepts for `source`, then writes
+//   *average = mean of the collected values
+//   *std     = sqrt( sum((mean - value)^2) / n / (n - 1) ),  n = number collected
+//
+// The scratch buffer holds total_voxels entries because every voxel may be
+// selected; a fixed-size buffer would be overrun on larger grids.
+// When fewer than two voxels are selected the divisions above are by zero and
+// the outputs are NaN.
+void calculate_source(float *image_space , voxelPosition *image_geometry, 
+		      voxelPosition source, int total_voxels, 
+		      float *average, float *std)
+{
+  const int capacity = (total_voxels > 0) ? total_voxels : 0;
+  float *select = new float[capacity];
+  int number_selected=0;
+
+  // gather the image values of all voxels close enough to the source
+  for (int voxel_id = 0; voxel_id < total_voxels; voxel_id++) {
+    if ( select_source( image_geometry, source, voxel_id ) ) {
+      select[number_selected] = image_space[voxel_id];
+      number_selected += 1;
+    }
+  }
+
+  *average = 0.0;
+  *std = 0.0;
+
+  for (int j=0;j<number_selected;j++)
+    *average += select[j];
+  *average /= float(number_selected);
+
+  for (int j=0;j<number_selected;j++)
+    *std += pow(*average-select[j],2);
+  *std = sqrt(*std/number_selected/(number_selected-1));
+
+  delete[] select;
+}
+
+// Compares the host and DKS device implementations of the source statistics.
+//
+// Builds a random N^3 image with a regular voxel lattice, evaluates
+// calculate_source() on the host with every voxel used as the source, then runs
+// DKSImageRecon::callCalculateSource on the same data and prints the mean of
+// the averages / deviations and the elapsed time for both runs.
+//
+// Usage: testImageReconstruction [N]   (N defaults to 8)
+int main(int argc, char *argv[]) {
+
+  int N = 8;
+  if (argc == 2)
+    N = atoi(argv[1]);
+  // atoi yields 0 on garbage; an empty grid would make the prints below read
+  // geometry[0] out of bounds
+  if (N < 1) {
+    std::cerr << "Grid size must be a positive integer" << std::endl;
+    return 1;
+  }
+
+  double ttotal;
+  struct timeval timeStart, timeEnd;
+
+  int total = N*N*N;
+  float *image = new float[total];
+  voxelPosition *geometry = new voxelPosition[total];
+
+  initImage(image, total);
+  initPosition(geometry, N);
+
+  voxelPosition source;
+  // result arrays grow with N^3, so they live on the heap rather than in
+  // variable-length stack arrays
+  float *avg = new float[total];
+  float *stdev = new float[total];
+
+  // host reference run: every voxel is used once as the source position
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < total; i++) {
+    source.x = geometry[i].x;
+    source.y = geometry[i].y;
+    source.z = geometry[i].z;
+    calculate_source(image , geometry, source, total, &avg[i], &stdev[i]);
+  }    
+  gettimeofday(&timeEnd, NULL);
+  ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	     (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
+
+  float avgavg = 0;
+  float avgstdev = 0;
+  for (int i = 0; i < total; i++) {
+    avgavg += avg[i] / total;
+    avgstdev += stdev[i] / total;
+  }
+
+  // the last voxel of the first row / first slab / whole grid carries the
+  // largest x / y / z coordinate respectively
+  std::cout << "Total voxels: " << N*N*N << std::endl;
+  std::cout << "Dimensions [" << geometry[0].x << ":" << geometry[N-1].x << "]"
+	    << "[" << geometry[0].y << ":" << geometry[N*N-1].y << "]"
+	    << "[" << geometry[0].z << ":" << geometry[N*N*N-1].z << "]" << std::endl;
+  std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
+
+
+  // device run
+  void *image_space, *image_position, *source_position, *davg, *dstd;
+
+  int ierr;
+  DKSImageRecon base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  image_space = base.allocateMemory<float>(total, ierr);
+  image_position = base.allocateMemory<voxelPosition>(total, ierr);
+  source_position = base.allocateMemory<voxelPosition>(total, ierr);
+  davg = base.allocateMemory<float>(total, ierr);
+  dstd = base.allocateMemory<float>(total, ierr);
+
+  base.writeData<float>(image_space, image, total);
+  base.writeData<voxelPosition>(image_position, geometry, total);
+  // the voxel lattice doubles as the list of source positions
+  base.writeData<voxelPosition>(source_position, geometry, total);
+
+
+  // the timed section covers the kernel call and both result transfers
+  gettimeofday(&timeStart, NULL);
+  base.callCalculateSource(image_space, image_position, source_position, 
+			   davg, dstd, DIAMETER, total, total);
+
+
+  base.readData<float>(davg, avg, total);
+  base.readData<float>(dstd, stdev, total);
+  gettimeofday(&timeEnd, NULL);
+  ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	     (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
+
+  base.freeMemory<float>(image_space, total);
+  base.freeMemory<voxelPosition>(image_position, total);
+  base.freeMemory<voxelPosition>(source_position, total);
+  base.freeMemory<float>(dstd, total);
+  base.freeMemory<float>(davg, total);
+  
+  avgavg = 0;
+  avgstdev = 0;
+  for (int i = 0; i < total; i++) {
+    avgavg += avg[i] / total;
+    avgstdev += stdev[i] / total;
+  }
+  std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
+
+  delete[] avg;
+  delete[] stdev;
+  delete[] geometry;
+  delete[] image;
+
+  // report success through the exit status instead of echoing N
+  return 0;
+
+}
--- a/test/testMIC.cpp
+++ b/test/testMIC.cpp
@ -0,0 +1,51 @@
+#include <iostream>
+#include "DKSBase.h"
+
+using namespace std;
+
+// Round-trip test for the "OpenMP" API: writes two host arrays into freshly
+// allocated device buffers, reads them back into separate host arrays and
+// prints what was read.
+int main() {
+
+  DKSBase base;
+	
+  base.setAPI("OpenMP", 6);
+  base.initDevice();
+	
+  //init data
+  int ierr;
+  int N = 8;
+  double *in_data = new double[N];
+  double *in_data2 = new double[N];
+  double *out_data = new double[N];
+  double *out_data2 = new double[N];
+	
+  for (int i = 0; i < N; i++) {
+    in_data[i] = i;
+    in_data2[i] = i*i;
+  }
+		
+  //test memory allocation, write and read operations
+  void *d_ptr, *d2_ptr;
+  
+  d_ptr = base.allocateMemory<double>(N, ierr);
+  d2_ptr = base.allocateMemory<double>(N, ierr);
+ 	
+  base.writeData<double>(d_ptr, in_data, N);
+  base.writeData<double>(d2_ptr, in_data2, N);
+	
+  base.readData<double>(d_ptr, out_data, N);
+  base.readData<double>(d2_ptr, out_data2, N);
+  base.freeMemory<double>(d_ptr, N);
+  base.freeMemory<double>(d2_ptr, N);
+ 		
+  //print results
+  for (int i = 0; i < N; i++)
+    cout << out_data[i] << "\t";
+  cout << endl;	
+	
+  for (int i = 0; i < N; i++)
+    cout << out_data2[i] << "\t";
+  cout << endl;	
+
+  //release host buffers
+  delete[] in_data;
+  delete[] in_data2;
+  delete[] out_data;
+  delete[] out_data2;
+	
+  return 0;
+
+}
--- a/test/testMICOpenCL.cpp
+++ b/test/testMICOpenCL.cpp
@ -0,0 +1,94 @@
+#include <iostream>
+#include <cstdlib>
+#include "DKSBase.h"
+#include "Utility/TimeStamp.h"
+
+using namespace std;
+
+// Writes the same host array into two device buffers, runs callSum on the
+// second one and reads both buffers back.
+//
+// Usage: testMICOpenCL [api [device]]
+//   no arguments -> "OpenCL" on "-gpu"; one argument -> given api on "-gpu";
+//   two arguments -> given api and device. Any other count uses the defaults.
+int main(int argc, char *argv[]) {
+
+	const char *api_arg = "OpenCL";
+	const char *device_arg = "-gpu";
+	if (argc == 2 || argc == 3)
+		api_arg = argv[1];
+	if (argc == 3)
+		device_arg = argv[2];
+
+	// size the buffers from the strings themselves (plus terminator): fixed
+	// sizes overflow on long arguments, and 4 bytes cannot even hold "-gpu"
+	char *api_name = new char[strlen(api_arg) + 1];
+	char *device_name = new char[strlen(device_arg) + 1];
+	strcpy(api_name, api_arg);
+	strcpy(device_name, device_arg);
+
+	cout << "Use api: " << api_name << endl;
+	cout << "Use device: " << device_name << endl;
+
+
+	int ierr;
+	int N = 10000;
+	double *data = new double[N];
+	double *data_out = new double[N];
+	double *data_out2 = new double[N];
+	
+	for (int i = 0; i < N; i++) {
+		data[i] = i;
+	}
+	
+	//init dks base class, set the requested API and init connection with the device
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+	
+	//data ptr
+	void *data_ptr, *data_ptr2;
+	
+	//allocate memory
+	data_ptr = base.allocateMemory<double>(N, ierr);
+	data_ptr2 = base.allocateMemory<double>(N, ierr);
+	
+	//write data to memory and fill data on device
+	base.writeData<double>(data_ptr, data, N);
+	base.writeData<double>(data_ptr2, data, N);
+	//base.callNt<double>(data_ptr2, data_ptr, 6, N, 1, 0);
+	
+	//calc sum
+	base.callSum<double>(data_ptr2, data_ptr2, N);
+	
+	//base.callSum<double>(data_ptr, data_ptr, N);
+	
+	//chi^2
+	//base.callChi2<double>(data_ptr, data_ptr, data_ptr, N);
+	//base.callChi2<double>(data_ptr2, data_ptr2, data_ptr2, N);
+	
+	//read data
+	base.readData<double>(data_ptr, data_out, N);
+	base.readData<double>(data_ptr2, data_out2, N);
+	
+	//base.oclEventInfo();
+	
+	//free memory
+	base.freeMemory<double>(data_ptr, N);
+	base.freeMemory<double>(data_ptr2, N);
+	
+	
+	/*
+	for (int i = 0; i < N; i++) {
+		cout << data[i] << "\t";
+	}
+	cout << endl << endl;
+	for (int i = 0; i < N; i++) {
+		cout << data_out[i] << "\t";
+	}
+	cout << endl << endl;
+	for (int i = 0; i < N; i++) {
+		cout << data_out2[i] << "\t";
+	}
+	cout << endl;
+	*/
+
+	//release host buffers
+	delete[] data;
+	delete[] data_out;
+	delete[] data_out2;
+	delete[] api_name;
+	delete[] device_name;
+
+	return 0;
+}
--- a/test/testMICPush.cpp
+++ b/test/testMICPush.cpp
@ -0,0 +1,68 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Three double components of one particle quantity (main() uses it for both
+// the R and the P arrays).
+typedef struct {
+  double x;
+  double y;
+  double z;
+} Part;
+
+// Fill data[0 .. N-1] with pseudo-random components in [0, 1].
+// rand() is converted to double before dividing: the integer division
+// rand() / RAND_MAX yields 0 for every draw except RAND_MAX itself.
+void initData(Part *data, int N) {
+  for (int i = 0; i < N; i++) {
+    data[i].x = (double)rand() / RAND_MAX;
+    data[i].y = (double)rand() / RAND_MAX;
+    data[i].z = (double)rand() / RAND_MAX;
+  }
+}
+
+// Exercises callParallelTTrackerPush through the "OpenMP" API on the "-mic"
+// device: five rounds of write R -> push -> asynchronous read of R.
+int main() {
+
+  int ierr;
+  int N = 100000;
+
+  //__declspec(align(64)) Part *R = new Part[N];
+  //__declspec(align(64)) Part *P = new Part[N];
+  Part *R = new Part[N];
+  Part *P = new Part[N];
+
+  initData(R, N);
+  initData(P, N);
+
+  DKSBase dksbase;
+  dksbase.setAPI("OpenMP", 6);
+  dksbase.setDevice("-mic", 4);
+  dksbase.initDevice();
+
+  void *r_ptr, *p_ptr, *dt_ptr;
+  r_ptr = dksbase.allocateMemory<Part>(N, ierr);
+  p_ptr = dksbase.allocateMemory<Part>(N, ierr);
+  dt_ptr = dksbase.allocateMemory<double>(N, ierr);
+
+  dksbase.writeData<Part>(r_ptr, R, N);
+  //P was initialised on the host for the push; without this transfer the
+  //kernel would read whatever the fresh device buffer happens to contain
+  dksbase.writeData<Part>(p_ptr, P, N);
+  //NOTE(review): dt_ptr is allocated but never written; presumably it is
+  //ignored because the flag passed below is false - confirm
+
+  cout << "====================START PUSH====================" << endl;
+
+  for (int i = 0; i < 5; i++) {
+    //write r to device
+    dksbase.writeData<Part>(r_ptr, R, N);
+    //calc push
+    dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr,
+				      0.001, 1, false, NULL);
+    //read R from device
+    //NOTE(review): nothing visible here waits for the asynchronous read before
+    //R is written again or the buffers are freed - confirm the backend syncs
+    dksbase.readDataAsync<Part> (r_ptr, R, N, NULL);
+  }
+
+  cout << "====================END PUSH====================" << endl;
+
+
+
+  dksbase.freeMemory<Part>(r_ptr, N);
+  dksbase.freeMemory<Part>(p_ptr, N);
+  dksbase.freeMemory<double>(dt_ptr, N);
+
+  //release host buffers
+  delete[] R;
+  delete[] P;
+
+  return 0;
+}
--- a/test/testMPI.cpp
+++ b/test/testMPI.cpp
@ -0,0 +1,89 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Print `data` as nprocs rows of N tab-separated ints, preceded by `message`
+// when it is not the empty string.
+void printData(int *data, int N, int nprocs, const char *message = "") {
+	const bool has_message = (message[0] != '\0');
+	if (has_message)
+		std::cout << message;
+
+	for (int row = 0; row < nprocs; ++row) {
+		const int *line = data + row * N;
+		for (int col = 0; col < N; ++col)
+			std::cout << line[col] << "\t";
+		std::cout << std::endl;
+	}
+}
+
+// Set data[0 .. N-1] to rank + 1, so every rank contributes a distinct value.
+void initData(int *data, int N, int rank) {
+	const int value = rank + 1;
+	int idx = 0;
+	while (idx < N) {
+		data[idx] = value;
+		++idx;
+	}
+}
+
+// MPI + DKS test: every rank contributes n ints, rank 0 gathers them straight
+// into a DKS buffer, reads that buffer back to the host, and scatters it from
+// the DKS buffer to all ranks again. Each rank prints what it received.
+int main(int argc, char *argv[]) {
+
+	int ierr;
+	int rank, nprocs;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+	cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+	int n = 8;
+
+	int *hdata_in = new int[n];
+	int *hdata_out = new int[n];
+	initData(hdata_in, n, rank);
+	cout << "In data for process " << rank+1 << ":\t";
+	printData(hdata_in, n, 1);
+
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+
+	if (rank == 0) {
+
+		int *hdata_out_all = new int[nprocs*n];
+		void* mem_ptr;
+		mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+
+		// NOTE(review): mem_ptr is handed to MPI unchanged; presumably this
+		// relies on an MPI build that accepts device pointers - confirm
+		MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
+
+		MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		base.freeMemory<int>(mem_ptr, n*nprocs);
+
+		printData(hdata_out_all, n, nprocs, "Out data 1:\n");
+		// show the scattered chunk, not the input that was sent
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+		delete[] hdata_out_all;
+	} else {
+
+		// receive-side (gather) and send-side (scatter) arguments are only
+		// significant at the root; pass a well-typed count and datatype
+		// instead of NULL
+		MPI_Gather(hdata_in, n, MPI_INT, NULL, 0, MPI_INT, 0, MPI_COMM_WORLD);
+
+		MPI_Scatter(NULL, 0, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+	}
+
+	delete[] hdata_in;
+	delete[] hdata_out;
+
+	MPI_Finalize();
+	return 0;
+}
+
+
+
+
+
--- a/test/testMPIFFT.cpp
+++ b/test/testMPIFFT.cpp
@ -0,0 +1,91 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Print `data` as nprocs rows of N tab-separated complex values, preceded by
+// `message` when it is not the empty string.
+void printData(std::complex<double> *data, int N, int nprocs, const char *message = "") {
+	const bool has_message = (message[0] != '\0');
+	if (has_message)
+		std::cout << message;
+
+	for (int row = 0; row < nprocs; ++row) {
+		const std::complex<double> *line = data + row * N;
+		for (int col = 0; col < N; ++col)
+			std::cout << line[col] << "\t";
+		std::cout << std::endl;
+	}
+}
+
+// Set data[0 .. N-1] to the complex value (rank + 1, 0).
+void initData(std::complex<double> *data, int N, int rank) {
+	const std::complex<double> value((double)rank+1.0, 0.0);
+	int idx = 0;
+	while (idx < N) {
+		data[idx] = value;
+		++idx;
+	}
+}
+
+// MPI + DKS FFT test: every rank contributes n complex values, rank 0 gathers
+// them straight into a DKS buffer, runs a 1D FFT of length n*nprocs on it,
+// reads the result back and scatters it from the DKS buffer to all ranks.
+int main(int argc, char *argv[]) {
+
+	int ierr;
+	int rank, nprocs;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+	cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+	int n = 8;
+
+	complex<double> *hdata_in = new complex<double>[n];
+	complex<double> *hdata_out = new complex<double>[n];
+	initData(hdata_in, n, rank);
+	cout << "In data for process " << rank+1 << ":\t";
+	printData(hdata_in, n, 1);
+
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+
+	if (rank == 0) {
+
+		complex<double> *hdata_out_all = new complex<double>[nprocs*n];
+		void* mem_ptr;
+		mem_ptr = base.allocateMemory< complex<double> >(nprocs*n, ierr);
+
+		// NOTE(review): mem_ptr is handed to MPI unchanged; presumably this
+		// relies on an MPI build that accepts device pointers - confirm
+		MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+
+		int dimsize[3] = {n*nprocs, 1, 1};
+		base.callFFT(mem_ptr, 1, dimsize);
+		base.readData< complex<double> >(mem_ptr, hdata_out_all, n*nprocs);
+
+		MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		base.freeMemory< complex<double> >(mem_ptr, n*nprocs);
+
+		printData(hdata_out_all, n, nprocs, "Out data 1:\n");
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+		delete[] hdata_out_all;
+	} else {
+
+		// receive-side (gather) and send-side (scatter) arguments are only
+		// significant at the root; pass a well-typed count and datatype
+		// instead of NULL
+		MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		MPI_Scatter(NULL, 0, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+	}
+
+	delete[] hdata_in;
+	delete[] hdata_out;
+
+	MPI_Finalize();
+	return 0;
+}
+
+
+
+
+
--- a/test/testMemObjects.cpp
+++ b/test/testMemObjects.cpp
@ -0,0 +1,75 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Allocates eight device buffers of N doubles one after another, writing the
+// same host array into each right after its allocation, then frees them in
+// allocation order. N is 2 << n (that is 2^(n+1)); n comes from the first
+// command line argument and defaults to 10.
+int main(int argc, char *argv[]) {
+
+	int ierr;
+	const int n = (argc > 1) ? atoi(argv[1]) : 10;
+	const int N = 2 << n;
+	std::cout << "Elements: " << N << std::endl;
+
+	// host data: i / N for i in [0, N)
+	double *data = new double[N];
+	for (int i = 0; i < N; i++)
+		data[i] = (double)i / N;
+
+	DKSBase base = DKSBase();
+	base.setAPI("OpenCL", 6);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	const int kBuffers = 8;
+	void *ptrs[kBuffers];
+
+	// allocate + fill, one buffer at a time
+	for (int b = 0; b < kBuffers; ++b) {
+		ptrs[b] = base.allocateMemory<double>(N, ierr);
+		ierr = base.writeData<double>(ptrs[b], data, N);
+	}
+
+	// release in the same order the buffers were created
+	for (int b = 0; b < kBuffers; ++b)
+		base.freeMemory<double>(ptrs[b], N);
+
+	return 0;
+}
+
--- a/test/testOffset.cpp
+++ b/test/testOffset.cpp
@ -0,0 +1,73 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Tests offset reads/writes on a DKS buffer of N doubles: the first n host
+// values are written at offsets 0 and 4, then the whole buffer is read into
+// data_out_1 and n values starting at offset 2 are read into data_out_2.
+//
+// Usage: testOffset [api [device]]   (defaults: "OpenCL", "-gpu")
+int main(int argc, char *argv[]) {
+
+	const char *api_arg = "OpenCL";
+	const char *device_arg = "-gpu";
+	if (argc == 2 || argc == 3)
+		api_arg = argv[1];
+	if (argc == 3)
+		device_arg = argv[2];
+
+	// buffers sized from the strings themselves so long arguments cannot overflow
+	char *api_name = new char[strlen(api_arg) + 1];
+	char *device_name = new char[strlen(device_arg) + 1];
+	strcpy(api_name, api_arg);
+	strcpy(device_name, device_arg);
+
+
+	int ierr,n, N;
+
+	N = 8;
+	n = 4;
+
+	double *data_in = new double[N];
+	double *data_out_1 = new double[N];
+	double *data_out_2 = new double[N];
+	for (int i = 0; i < N; i++) {
+		data_in[i] = (double)i / N;
+		data_out_1[i] = 0.0;
+		data_out_2[i] = 0.0;
+	}
+
+	cout << "Run example on: " << api_name << " using " << device_name << endl;
+
+	DKSBase base = DKSBase();
+	base.setAPI(api_name, strlen(api_name));
+	// the length must describe device_name, not api_name
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+
+	void *ptr1;
+	ptr1 = base.allocateMemory<double>(N, ierr);
+
+	// same n host values written to both halves of the device buffer
+	ierr = base.writeData<double>(ptr1, data_in, n, 0);
+	ierr = base.writeData<double>(ptr1, data_in, n, 4);
+
+	ierr = base.readData<double>(ptr1, data_out_1, N);
+	ierr = base.readData<double>(ptr1, data_out_2, n, 2);
+
+	base.freeMemory<double>(ptr1, N);
+
+	for (int i = 0; i < N; i++)
+		cout << data_in[i] << "\t";
+	cout << endl;
+
+	for (int i = 0; i < N; i++)
+		cout << data_out_1[i] << "\t";
+	cout << endl;
+
+	for (int i = 0; i < N; i++)
+		cout << data_out_2[i] << "\t";
+	cout << endl;
+
+	// release host buffers
+	delete[] data_in;
+	delete[] data_out_1;
+	delete[] data_out_2;
+	delete[] api_name;
+	delete[] device_name;
+
+	return 0;
+}
+
--- a/test/testOffsetMPI.cpp
+++ b/test/testOffsetMPI.cpp
@ -0,0 +1,81 @@
+#include <mpi.h>
+#include <iostream>
+#include <cstdlib>
+
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Offset-write test across MPI ranks: rank 0 allocates a DKS buffer, sends the
+// raw pointer value to the other ranks, and every rank writes its n doubles at
+// offset rank*n. After a barrier rank 0 reads back the first N doubles and
+// prints them.
+// NOTE(review): the device pointer is shipped as plain bytes; whether it is
+// usable in another process depends on the DKS backend / setup - confirm.
+int main(int argc, char *argv[]) {
+
+	int rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	cout << "Rank " << rank << " from " << size << endl;
+
+
+	int ierr, N, n;
+
+	N = 8;
+	n = N / 2;
+
+	double *data_in = new double[n];
+
+	for (int i = 0; i < n; i++)
+		data_in[i] = (double)rank + 1.0 + (double)i / n;
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	if (rank == 0) {
+		//allocate memory for size*N doubles
+		void *ptr1;
+		ptr1 = base.allocateMemory<double>(size*N, ierr);
+		cout << "Sent pointer: " << ptr1 << endl;
+
+		//send ptr to every other process; each of them blocks in MPI_Recv, so
+		//sending to rank 1 only would hang runs with more than two ranks and
+		//address a non-existent rank in a single-rank run
+		for (int dest = 1; dest < size; dest++)
+			MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, dest, 123, MPI_COMM_WORLD);
+
+		//write n data with no offset to device and wait for other processes
+		ierr = base.writeData<double>(ptr1, data_in, n, rank*n);
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		//read memory of size N from device
+		double *data_out = new double[N];
+		ierr = base.readData<double>(ptr1, data_out, N);
+
+		//free device memory
+		base.freeMemory<double>(ptr1, size*N);
+
+		//print results
+		for (int i = 0; i < n; i++)
+			cout << data_in[i] << "\t";
+		cout << endl;
+
+		for (int i = 0; i < N; i++)
+			cout << data_out[i] << "\t";
+		cout << endl;
+
+		delete[] data_out;
+
+	} else {
+		//receive device memory pointer; the status is not needed
+		void *ptr2;
+		MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		cout << "Received pointer: " << ptr2 << endl;
+		//write data with an offset
+		base.writeData<double>(ptr2, data_in, n, rank*n);
+
+		MPI_Barrier(MPI_COMM_WORLD);
+	}
+
+	delete[] data_in;
+
+	MPI_Finalize();
+
+
+	return 0;
+}
+
--- a/test/testPush.cpp
+++ b/test/testPush.cpp
@ -0,0 +1,57 @@
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+// Fill data[0 .. N-1] with pseudo-random components in [0, 1].
+// rand() is converted to double before dividing: the integer division
+// rand() / RAND_MAX yields 0 for every draw except RAND_MAX itself.
+void initData(double3 *data, int N) {
+  for (int i = 0; i < N; i++) {
+    data[i].x = (double)rand() / RAND_MAX;
+    data[i].y = (double)rand() / RAND_MAX;
+    data[i].z = (double)rand() / RAND_MAX;
+  }
+}
+
+
+// Runs callParallelTTrackerPush 100 times on N random double3 pairs (R, P)
+// through the "Cuda" API and reads both arrays back afterwards.
+int main() {
+  
+  int ierr;
+  int N = 1000000;
+  double3 *R = new double3[N];
+  double3 *P = new double3[N];
+
+  initData(R, N);
+  initData(P, N);
+
+  DKSBase dksbase;
+  dksbase.setAPI("Cuda", 4);
+  dksbase.setDevice("-gpu", 4);
+  dksbase.initDevice();
+
+  void *r_ptr, *p_ptr;
+  
+  r_ptr = dksbase.allocateMemory<double3>(N, ierr);
+  p_ptr = dksbase.allocateMemory<double3>(N, ierr);
+
+  dksbase.writeData<double3>(r_ptr, R, N);
+  dksbase.writeData<double3>(p_ptr, P, N);
+
+  // no per-particle dt buffer is supplied (NULL)
+  for (int i = 0; i < 100; i++)
+    dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false);
+
+
+  dksbase.readData<double3>(r_ptr, R, N);
+  dksbase.readData<double3>(p_ptr, P, N);
+
+  dksbase.freeMemory<double3>(r_ptr, N);
+  dksbase.freeMemory<double3>(p_ptr, N);
+
+  // release host buffers
+  delete[] R;
+  delete[] P;
+
+  return 0;
+}
--- a/test/testRCFFT.cpp
+++ b/test/testRCFFT.cpp
@ -0,0 +1,168 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(double* &data, int N1, int N2);
+void printData(complex<double>* &data, int N1, int N2);
+void printData3DN4(complex<double>* &data, int N, int dim);
+void printData3DN4(double* &data, int N, int dim);
+
+
+void compareData(double* &data1, double* &data2, int N, int dim);
+
+
+
+// Real-to-complex FFT test on an N1 x N2 grid through the "Cuda" API: uploads
+// the real input, calls callR2CFFT with two dimensions, downloads the
+// N1*(N2/2+1) complex results and prints input and output.
+//
+// Usage: testRCFFT [N1 N2]   (both default to 4)
+int main(int argc, char *argv[]) {
+
+	int N1 = 4;
+	int N2 = 4;
+
+	if (argc == 3) {
+		N1 = atoi(argv[1]);
+		N2 = atoi(argv[2]);
+	}
+
+	int dimsize[3] = {N1, N2, 1};
+
+	cout << "Begin RC 3D FFT tests, grid = " <<  N1 << "\t" << N2 << endl;		
+	int sizereal = N1*N2;
+	int sizecomp = N1*(N2/2+1);
+
+	double *cdata = new double[sizereal];
+	complex<double> *cfft = new complex<double>[sizecomp];
+
+	// every row holds the ramp j / N1
+	for (int i = 0; i < N2; i++) {
+		for (int j = 0; j < N1; j++) {
+			cdata[i*N1 + j] = (double)(j) / N1;
+		}
+	}
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	void *real_ptr, *comp_ptr;
+	int ierr;
+	/* allocate memory on device */
+	real_ptr = base.allocateMemory<double>(sizereal, ierr);
+	comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+	/* write data to device */	
+	ierr = base.writeData<double>(real_ptr, cdata, sizereal);
+
+	/* execute fft */
+	base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize);
+
+	/* read data from device */
+	base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+
+	/* free device memory */
+	base.freeMemory<double>(real_ptr, sizereal);
+	base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+
+	cout << "FFT complete" << endl;
+
+
+	/* print results */
+	printData(cdata, N1, N2);
+	printData(cfft, N1, N2);
+
+	/* release host buffers */
+	delete[] cdata;
+	delete[] cfft;
+
+	return 0;
+}
+
+// Print `data` as N2 rows of N1 space-separated values, followed by one empty
+// line.
+void printData(double* &data, int N1, int N2) {
+	for (int row = 0; row < N2; ++row) {
+		const double *line = data + row * N1;
+		for (int col = 0; col < N1; ++col)
+			std::cout << line[col] << " ";
+		std::cout << std::endl;
+	}
+	std::cout << std::endl;
+}
+
+// Print the first N2/2+1 rows (N1 values each) of a complex array, followed by
+// one empty line. Real or imaginary parts strictly inside (-0.00001, 0.00001)
+// are shown as 0.
+void printData(std::complex<double>* &data, int N1, int N2) {
+	const int rows = N2/2+1;
+	for (int row = 0; row < rows; ++row) {
+		for (int col = 0; col < N1; ++col) {
+			const std::complex<double> &raw = data[row*N1 + col];
+			double re = raw.real();
+			double im = raw.imag();
+
+			// flush near-zero components so noise does not clutter the output
+			if (re < 0.00001 && re > -0.00001) re = 0.0;
+			if (im < 0.00001 && im > -0.00001) im = 0.0;
+
+			std::cout << std::complex<double>(re, im) << " ";
+		}
+		std::cout << std::endl;
+	}
+	std::cout << std::endl;
+}
+
+// Print an N x N x N complex array as "re; im" pairs separated by tabs: one
+// output line per j, holding all (i, k) entries with element index
+// i*N*N + j*N + k. Components strictly inside (-10e-5, 10e-5) are shown as 0.
+// `dim` is not used.
+void printData3DN4(std::complex<double>* &data, int N, int dim) {
+	const int slab = N*N;
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			const std::complex<double> *row = data + (i*slab + j*N);
+			for (int k = 0; k < N; ++k) {
+				const double re = row[k].real();
+				const double im = row[k].imag();
+
+				const double d = (re < 10e-5 && re > -10e-5) ? 0 : re;
+				const double a = (im < 10e-5 && im > -10e-5) ? 0 : im;
+
+				std::cout << d << "; " << a << "\t";
+			}
+		}
+		std::cout << std::endl;
+	}
+	std::cout << std::endl;
+}
+
+// Print an N x N x N real array, tab separated: one output line per j, holding
+// all (i, k) entries with element index i*N*N + j*N + k. Values that are not
+// outside [-10e-5, 10e-5] are shown as 0. `dim` is not used.
+void printData3DN4(double* &data, int N, int dim) {
+	const int slab = N*N;
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			const double *row = data + (i*slab + j*N);
+			for (int k = 0; k < N; ++k) {
+				const double d = row[k];
+				const bool significant = (d > 10e-5 || d < -10e-5);
+				if (significant)
+					std::cout << d;
+				else
+					std::cout << 0;
+				std::cout << "\t";
+			}
+		}
+		std::cout << std::endl;
+	}
+	std::cout << std::endl;
+}
+
+// Print the sum of absolute element-wise differences between data1 and data2.
+// The compared range is the first N (dim <= 1), N*N (dim == 2) or N*N*N
+// (dim > 2) elements, visited in increasing index order.
+void compareData(double* &data1, double* &data2, int N, int dim) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	// the nested (i, j, k) walk touches exactly the contiguous indices
+	// 0 .. ni*nj*nk-1; a non-positive N compares nothing
+	const int count = (N > 0) ? ni * nj * nk : 0;
+
+	double sum = 0;
+	for (int id = 0; id < count; ++id)
+		sum += fabs(data1[id] - data2[id]);
+
+	std::cout << "Size " << N << " CC <--> CC diff: " << sum << std::endl;
+}
+
--- a/test/testStockFFT3D.cpp
+++ b/test/testStockFFT3D.cpp
@ -0,0 +1,181 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData3DN4(complex<double>* &data, int N, int dim);
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+// Runs the same N x N x N complex FFT (N = 2^n, n from the first argument,
+// default 2) through three paths - callFFTStockham on OpenCL, callFFT on Cuda
+// and callFFT on OpenCL - timing five repetitions of each, and prints the
+// pairwise differences of the results.
+int main(int argc, char *argv[]) {
+
+	int n = 2;
+	if (argc == 2) 
+		n = atoi(argv[1]);
+
+	int N = pow(2,n);
+
+	cout << "Begin DKS Base tests" << endl;
+
+	cout << "FFT size: " << N << endl;
+
+	int dimsize[3] = {N, N, N};
+
+
+	complex<double> *cdata = new complex<double>[N*N*N];
+	complex<double> *cfft = new complex<double>[N*N*N];
+	complex<double> *cfft2 = new complex<double>[N*N*N];
+	complex<double> *cfft3 = new complex<double>[N*N*N];
+
+
+	// input is the ramp k along the fastest axis; result buffers start at zero
+	for (int i = 0; i < N; i++) {
+		for (int j = 0; j < N; j++) {
+			for (int k = 0; k < N; k++) {
+				const int id = i*N*N + j*N + k;
+				//cdata[id] = complex<double>((double)k/(N*N*N), 0);
+				cdata[id] = complex<double>(k, 0);
+				cfft[id] = complex<double>(0, 0);
+				cfft2[id] = complex<double>(0, 0);
+				// same 3D index as the other buffers (i*N would only cover
+				// the first few rows)
+				cfft3[id] = complex<double>(0, 0);
+			}
+		}
+	}
+
+	if (N == 4)
+		printData3DN4(cdata, N, 3);
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	int ierr;
+
+
+	timestamp_t t0, t1;
+
+	/* stockham radix-2 out-of-place fft */
+	DKSBase base2;
+	base2.setAPI("OpenCL", 6);
+	base2.setDevice("-gpu", 4);
+	base2.initDevice();
+
+	cout << endl;
+	void *src_ptr;
+	// each timed repetition covers allocate, upload, transform, download, free
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		src_ptr = base2.allocateMemory< complex<double> >(N*N*N, ierr);
+		base2.writeData< complex<double> >(src_ptr, cdata, N*N*N);
+		base2.callFFTStockham(src_ptr, 3, dimsize);
+		base2.readData< complex<double> >(src_ptr, cfft2, N*N*N);
+		base2.freeMemory< complex<double> >(src_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	if (N == 4)
+		printData3DN4(cfft2, N, 3);
+
+	//delete base2;
+	cout << endl;
+
+	/* CUDA cufft */
+	DKSBase base3;
+	base3.setAPI("Cuda", 4);
+	base3.setDevice("-gpu", 4);
+	base3.initDevice();
+
+	cout << endl;
+	void *cuda_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		cuda_ptr = base3.allocateMemory< complex<double> >(N*N*N, ierr);
+		base3.writeData< complex<double> >(cuda_ptr, cdata, N*N*N);
+		base3.callFFT(cuda_ptr, 3, dimsize);
+		base3.readData< complex<double> >(cuda_ptr, cfft3, N*N*N);
+		base3.freeMemory< complex<double> >(cuda_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "Cuda FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	if (N == 4)
+		printData3DN4(cfft3, N, 3);
+
+	//delete base3;
+	cout << endl;
+
+
+	/* radix-2 in place fft */
+	DKSBase base;
+	base.setAPI("OpenCL", 6);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	cout << endl;
+	void *mem_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+		base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+		base.callFFT(mem_ptr, 3, dimsize);
+		base.readData< complex<double> >(mem_ptr, cfft, N*N*N);
+		base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	if (N == 4)	
+		printData3DN4(cfft, N, 3);
+
+	//delete base;
+	cout << endl;
+
+	/* compare results */	
+	cout << endl;
+
+	cout << "Radix 2 vs Stockham: ";
+	compareData(cfft, cfft2, N, 3);
+
+	cout << "Radix 2 vs Cufft: ";
+	compareData(cfft, cfft3, N, 3);
+
+	cout << "Stockham vs Cufft: ";
+	compareData(cfft2, cfft3, N, 3);	
+
+	/* release host buffers */
+	delete[] cdata;
+	delete[] cfft;
+	delete[] cfft2;
+	delete[] cfft3;
+
+	return 0;
+}
+
+// Print the real parts of an N x N x N complex array, tab separated: one output
+// line per j, holding all (i, k) entries with element index i*N*N + j*N + k.
+// Real parts that are not outside [-10e-5, 10e-5] are shown as 0. `dim` is not
+// used.
+void printData3DN4(std::complex<double>* &data, int N, int dim) {
+	const int slab = N*N;
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			const std::complex<double> *row = data + (i*slab + j*N);
+			for (int k = 0; k < N; ++k) {
+				const double d = row[k].real();
+				const bool significant = (d > 10e-5 || d < -10e-5);
+				if (significant)
+					std::cout << d;
+				else
+					std::cout << 0;
+				std::cout << "\t";
+			}
+		}
+		std::cout << std::endl;
+	}
+	std::cout << std::endl;
+}
+
+// Print the sum of absolute differences of the real and imaginary parts of
+// data1 and data2. The compared range is the first N (dim <= 1), N*N
+// (dim == 2) or N*N*N (dim > 2) elements, visited in increasing index order.
+void compareData(std::complex<double>* &data1, std::complex<double>* &data2, int N, int dim) {
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	const int nk = N;
+
+	// the nested (i, j, k) walk touches exactly the contiguous indices
+	// 0 .. ni*nj*nk-1; a non-positive N compares nothing
+	const int count = (N > 0) ? ni * nj * nk : 0;
+
+	double sum = 0;
+	for (int id = 0; id < count; ++id) {
+		const std::complex<double> &lhs = data1[id];
+		const std::complex<double> &rhs = data2[id];
+		sum += fabs(lhs.real() - rhs.real());
+		sum += fabs(lhs.imag() - rhs.imag());
+	}
+
+	std::cout << "CC <--> CC diff: " << sum << std::endl;
+}
--- a/test/testStockhamFFT.cpp
+++ b/test/testStockhamFFT.cpp
@ -0,0 +1,107 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+// Compares callFFT with callFFTStockham on a 1D complex signal of N = 2^n
+// points, timing five repetitions of each and printing the summed absolute
+// difference of the two results.
+//
+// Usage: testStockhamFFT [api [device [n]]]
+//   defaults: api "OpenCL", device "-gpu", n = 2
+int main(int argc, char *argv[]) {
+
+	int n = 2;
+	const char *api_arg = "OpenCL";
+	const char *device_arg = "-gpu";
+	if (argc >= 2 && argc <= 4)
+		api_arg = argv[1];
+	if (argc == 3 || argc == 4)
+		device_arg = argv[2];
+	if (argc == 4)
+		n = atoi(argv[3]);
+
+	// buffers sized from the strings themselves so long arguments cannot overflow
+	char *api_name = new char[strlen(api_arg) + 1];
+	char *device_name = new char[strlen(device_arg) + 1];
+	strcpy(api_name, api_arg);
+	strcpy(device_name, device_arg);
+
+	int N = pow(2,n);
+	cout << "Use api: " << api_name << endl;
+
+	cout << "Begin DKS Base tests" << endl;
+
+	cout << "FFT size: " << N << endl;
+
+	int dimsize[3] = {N, N, N};
+
+	complex<double> *cdata = new complex<double>[N];
+	complex<double> *cfft = new complex<double>[N];
+	complex<double> *cfft2 = new complex<double>[N];
+	complex<double> *cfftsrc = new complex<double>[N];
+	for (int i = 0; i < N; i++) {
+		cdata[i] = complex<double>((double)i / N, 0);
+		cfft[i] = complex<double>(0, 0);
+		cfft2[i] = complex<double>(0, 0);
+		cfftsrc[i] = complex<double>(0, 0);
+	}
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	// the length must describe device_name, not api_name
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+
+
+	timestamp_t t0, t1;
+
+	/* radix-2 in place fft */
+	void *mem_ptr;
+	int ierr;
+
+	// each timed repetition covers allocate, upload, transform, download, free
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		mem_ptr = base.allocateMemory< complex<double> >(N, ierr);
+		base.writeData< complex<double> >(mem_ptr, cdata, N);
+		base.callFFT(mem_ptr, 1, dimsize);
+		base.readData< complex<double> >(mem_ptr, cfft, N);
+		base.freeMemory< complex<double> >(mem_ptr, N);
+		t1 = get_timestamp();
+		cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	cout << endl;
+
+	/* stockham radix-2 out-of-place fft */
+	void *src_ptr;
+
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		src_ptr = base.allocateMemory< complex<double> >(N, ierr);
+		base.writeData< complex<double> >(src_ptr, cdata, N);
+		base.callFFTStockham(src_ptr, 1, dimsize);
+		base.readData< complex<double> >(src_ptr, cfft2, N);
+		base.freeMemory< complex<double> >(src_ptr, N);
+		t1 = get_timestamp();
+		cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	double diff = 0;
+	for (int i = 0; i < N; i++) {
+		diff += fabs(cfft[i].real() - cfft2[i].real());
+		diff += fabs(cfft[i].imag() - cfft2[i].imag());
+	}
+
+	cout << endl << "Difference: " << diff << endl;
+
+	if (diff > 0.00001) {
+		// dump at most the first 10 pairs; N is only 4 with the default n
+		const int shown = (N < 10) ? N : 10;
+		for (int i = 0; i < shown; i++) {
+			cout << cfft[i] << "\t" << cfft2[i] << endl;
+		}
+	}
+
+	/* release host buffers */
+	delete[] cdata;
+	delete[] cfft;
+	delete[] cfft2;
+	delete[] cfftsrc;
+	delete[] api_name;
+	delete[] device_name;
+
+	return 0;
+}
+
--- a/test/testTimeIntegration.cpp
+++ b/test/testTimeIntegration.cpp
@ -0,0 +1,227 @@
+#include <iostream>
+#include <vector>
+#include <time.h>
+#include <sys/time.h>
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+using namespace std;
+
+// Three-component vector of doubles. Used as the element type for the
+// host arrays and the DKSBase templated memory calls in this test.
+struct Vector {
+  double x;
+  double y;
+  double z;
+};
+
+// Returns a Vector with all three components set to 0.5.
+Vector initVector() {
+  Vector result = {0.5, 0.5, 0.5};
+  return result;
+}
+
+// Overwrites the first N elements of v with the value produced by initVector().
+// Does nothing when N is zero or negative.
+void initVectors(Vector *v, int N) {
+  int idx = 0;
+  while (idx < N) {
+    v[idx] = initVector();
+    ++idx;
+  }
+}
+
+// Sets the first N entries of data to 0.005.
+// Does nothing when N is zero or negative.
+void initDouble(double *data, int N) {
+  int remaining = N;
+  while (remaining-- > 0)
+    *data++ = 0.005;
+}
+
+// Sets the first N entries of data to -1.
+// Does nothing when N is zero or negative.
+void initLastSect(long *data, int N) {
+  int idx = 0;
+  while (idx < N) {
+    data[idx] = -1;
+    ++idx;
+  }
+}
+
+// Prints "checksum: <sum>" to stdout, where <sum> is the total of the x, y
+// and z components of the first N elements of v.
+void checkSum(Vector *v, int N) {
+  double total = 0;
+  int idx = 0;
+  while (idx < N) {
+    const Vector &cur = v[idx];
+    // keep the per-element grouping (x + y + z) so rounding matches element by element
+    total += cur.x + cur.y + cur.z;
+    ++idx;
+  }
+
+  std::cout << "checksum: " << total << std::endl;
+}
+
+// Test driver for the DKS time-integration calls
+// (callParallelTTrackerPush / callParallelTTrackerPushTransform).
+//
+// Command line options:
+//   -mic        use API "OpenMP" on device "-mic" (default: "Cuda" on "-gpu")
+//   -npart <n>  number of particles (default 10)
+//   -loop <n>   number of timed integration iterations (default 10)
+// Unrecognised arguments are ignored.
+//
+// Returns 0 when the test ran to completion, 1 when an option is missing its
+// value or -npart / -loop is not a positive integer.
+int main(int argc, char *argv[]) {
+
+  int loop = 10;
+  int numpart = 10;
+
+  // Stack buffers: big enough for every name copied in below and nothing
+  // to release on exit.
+  char api_name[10];
+  char device_name[10];
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);
+
+    if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (arg == "-npart" || arg == "-loop") {
+      // both options take a value; make sure it exists before reading argv[i+1]
+      if (i + 1 >= argc) {
+        cerr << "Missing value for option " << arg << endl;
+        return 1;
+      }
+      const int value = atoi(argv[++i]);
+      if (arg == "-npart")
+        numpart = value;
+      else
+        loop = value;
+    }
+  }
+
+  // array sizes and the average below need strictly positive counts
+  if (numpart < 1 || loop < 1) {
+    cerr << "-npart and -loop must be positive integers" << endl;
+    return 1;
+  }
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init p,r and dt arrays to test time integration
+  Vector *r = new Vector[numpart];
+  Vector *p = new Vector[numpart];
+  Vector *x = new Vector[numpart];
+  Vector *ori = new Vector[5];
+  initVectors(r, numpart);
+  initVectors(p, numpart);
+  initVectors(x, numpart);
+  initVectors(ori, 5);
+
+  double *dt = new double[numpart];
+  initDouble(dt, numpart);
+
+  long *ls = new long[numpart];
+  initLastSect(ls, numpart);
+
+  //init dks
+  // NOTE(review): ierr is filled by allocateMemory but never inspected here;
+  // its success value is not visible in this file - confirm and add checks.
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  // length must be that of the device name itself, not of the API name
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+
+  base.registerHostMemory(r, numpart);
+  base.registerHostMemory(p, numpart);
+  base.registerHostMemory(x, numpart);
+  base.registerHostMemory(dt, numpart);
+  base.registerHostMemory(ls, numpart);
+
+  //***test parallelttrackerpush***//
+  void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr;
+
+  //allocate memory on the device
+  r_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  p_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  x_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  dt_ptr = base.allocateMemory<double>(numpart, ierr);
+  ls_ptr = base.allocateMemory<long>(numpart, ierr);
+  ori_ptr = base.allocateMemory<Vector>(5, ierr);
+
+  //transfer data to device
+  base.writeData<Vector>(r_ptr, r, numpart);
+  base.writeData<Vector>(p_ptr, p, numpart);
+  base.writeData<Vector>(x_ptr, x, numpart);
+  base.writeData<Vector>(ori_ptr, ori, 5);
+  // dt is handed to both push calls, so the time steps set up on the host
+  // have to reach the device as well
+  base.writeData<double>(dt_ptr, dt, numpart);
+
+  //do a couple of integration loops before the timer is started
+  for (int i = 0; i < 5; i++) {
+    //calc push
+    base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr,
+                                  0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync<Vector>(r_ptr, r, numpart, stream1);
+
+    //write LastSection to device
+    base.writeDataAsync<long>(ls_ptr, ls, numpart, stream2);
+
+    //calc push transform
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+                                           dt_ptr, 0.05, 1, false, stream2);
+
+    //read x from device
+    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
+
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+
+  //start the timing of integration
+  struct timeval timeStart, timeEnd;
+  std::cout << "start integration" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+
+    //calc push
+    base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync<Vector>(r_ptr, r, numpart, stream1);
+
+    //write LastSection to device
+    base.writeDataAsync<long>(ls_ptr, ls, numpart, stream2);
+
+    //calc push transform
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+                                           dt_ptr, 0.05, 1, false, stream2);
+
+    //read x from device
+    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
+
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "end integration" << std::endl;
+
+  // elapsed wall-clock time in microseconds; computed in double so the
+  // seconds term cannot overflow a 32-bit long
+  const double t = (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
+                   (timeEnd.tv_usec - timeStart.tv_usec);
+
+  // the timed section ran `loop` iterations, so report that count
+  std::cout << "Time for " << loop << " integrations: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl;
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+
+  //free memory
+  base.freeMemory<Vector>(r_ptr, numpart);
+  base.freeMemory<Vector>(p_ptr, numpart);
+  base.freeMemory<Vector>(x_ptr, numpart);
+  base.freeMemory<Vector>(ori_ptr, 5);
+  base.freeMemory<double>(dt_ptr, numpart);
+  base.freeMemory<long>(ls_ptr, numpart);
+
+  //unregister host memory
+  base.unregisterHostMemory(r);
+  base.unregisterHostMemory(p);
+  base.unregisterHostMemory(x);
+  base.unregisterHostMemory(dt);
+  base.unregisterHostMemory(ls);
+
+  //free host memory
+  delete[] r;
+  delete[] x;
+  delete[] p;
+  delete[] dt;
+  delete[] ls;
+  delete[] ori;
+
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
--- a/test/testTranspose.cpp
+++ b/test/testTranspose.cpp
@ -0,0 +1,76 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+// Fills d with the values 0, 1, 2, ... (imaginary part zero).
+// The number of entries written is N*N for dim == 2, N*N*N for dim == 3
+// and N for any other dim.
+void initData(complex<double> *d, int N, int dim) {
+
+  int count;
+  switch (dim) {
+  case 2:
+    count = N*N;
+    break;
+  case 3:
+    count = N*N*N;
+    break;
+  default:
+    count = N;
+    break;
+  }
+
+  int idx = 0;
+  while (idx < count) {
+    d[idx] = complex<double>(idx, 0);
+    ++idx;
+  }
+
+}
+
+// Prints the real parts of d to stdout as tab-separated rows of N values.
+// There are N rows per slab when dim > 1 (otherwise one row) and N slabs
+// when dim > 2 (otherwise one slab). A blank line follows every slab and
+// one more blank line ends the output.
+void printData(complex<double> *d, int N, int dim) {
+
+  const int cols = N;
+  const int rows = (dim > 1) ? N : 1;
+  const int slabs = (dim > 2) ? N : 1;
+
+  for (int s = 0; s < slabs; ++s) {
+    const int slabStart = s*N*N;
+    for (int r = 0; r < rows; ++r) {
+      const int rowStart = slabStart + r*N;
+      for (int c = 0; c < cols; ++c)
+        std::cout << d[rowStart + c].real() << "\t";
+      std::cout << std::endl;
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+} 
+
+// Test driver for DKSBase::callTranspose.
+//
+// Builds an N x N array of complex values (N from the first command line
+// argument, default 4), prints it, sends it through callTranspose using
+// API "OpenCL" on device "-gpu", reads it back and prints the result.
+//
+// Returns 0 on completion, 1 when N is not a positive integer.
+int main(int argc, char *argv[]) {
+
+  const int N = (argc > 1) ? atoi(argv[1]) : 4;
+  // atoi yields 0 for non-numeric input and may yield a negative value;
+  // neither is a usable array extent
+  if (N < 1) {
+    std::cerr << "N must be a positive integer" << std::endl;
+    return 1;
+  }
+
+  int dimN[3] = {N, N, 1};
+  int dim = 2;
+  int ndim = 1;
+  int size = dimN[0] * dimN[1] * dimN[2];
+
+  std::complex<double> *hd_in = new std::complex<double>[size];
+  std::complex<double> *hd_out = new std::complex<double>[size];
+  initData(hd_in, N, dim);
+  printData(hd_in, N, dim);
+
+  DKSBase base;
+  base.setAPI("OpenCL", 6);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  // NOTE(review): ierr is filled by allocateMemory but never inspected here;
+  // its success value is not visible in this file - confirm and add a check.
+  int ierr;
+  void *mem_ptr;
+
+  mem_ptr = base.allocateMemory< std::complex<double> >(size, ierr);
+  base.writeData< std::complex<double> >(mem_ptr, hd_in, size);
+
+  base.callTranspose(mem_ptr, dimN, dim, ndim);
+
+  base.readData< std::complex<double> >(mem_ptr, hd_out, size);
+  base.freeMemory< std::complex<double> >(mem_ptr, size);
+
+  // print with the same dimensionality that was used for the input
+  printData(hd_out, N, dim);
+
+  delete[] hd_in;
+  delete[] hd_out;
+
+  return 0;
+
+}