add enableRutherfordScattering option to OPALs collimatorPhysics GPU version

Allow other applications check for DKS version
add seed to random number initialization
2017-04-24 10:44:41 +02:00 · 2017-04-05 17:00:52 +02:00 · 2017-03-17 10:43:41 +01:00
47 changed files with 1447 additions and 2591 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,8 +1,10 @@
 CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
-SET (DKS_VERSION_MINOR 1)
+SET (DKS_VERSION_MINOR 0)
-SET (DKS_VERSION_PATCH 0)
+SET (DKS_VERSION_PATCH 2)
 SET (PACKAGE \"dks\")
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -37,44 +39,10 @@ IF (Boost_FOUND)
  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
 ENDIF (Boost_FOUND)
 #include OPAL, musrfit or pet kernels
 OPTION(DKS_FULL "Compile DKS with full library" OFF)
 OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
 OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
 OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
 IF (DKS_FULL)
  SET(ENABLE_OPAL ON)
  SET(ENABLE_MUSR ON)
  SET(ENABLE_PET ON)
 ENDIF(DKS_FULL)
 #find clFFT
 OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
 IF (ENABLE_AMD)
  SET (clFFT_USE_STATIC_LIBS OFF)
  FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
  MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
  MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
  INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
  LINK_DIRECTORIES (${CLFFT_LIBRARIES})
  #find clRNG
  #SET (clRNG_USE_STATIC_LIBS OFF)
  #FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
  #MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
  #MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
  #INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
  #LINK_DIRECTORIES (${CLRNG_LIBRARIES})
  #find_package(PkgConfig)
  #pkg_check_modules(clRng REQUIRED)
  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
 ENDIF (ENABLE_AMD)
 #enable UQTK
 OPTION (USE_UQTK "Use UQTK" OFF)
 #intel icpc compiler specific flags
 IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
@ -217,3 +185,4 @@ INSTALL (
  DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
  RENAME ${PROJECT_NAME}ConfigVersion.cmake
  )
--- a/auto-tuning/CMakeLists.txt
+++ b/auto-tuning/CMakeLists.txt
@ -2,30 +2,18 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #chi square kernel tests
-IF (ENABLE_MUSR)
+ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
-  ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
  TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
-  ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
+ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
-  TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
  IF (USE_UQTK)
    ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
  ENDIF (USE_UQTK)
  #TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
  #test to verify search functions
  ADD_EXECUTABLE(testSearch testSearch.cpp)
  TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 ENDIF (ENABLE_MUSR)
 IF (ENABLE_OPAL)
  ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
  TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
  ADD_EXECUTABLE(testPushKick testPushKick.cpp)
  TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 ENDIF(ENABLE_OPAL)
 IF (USE_UQTK)
  ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
  TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
 ENDIF (USE_UQTK)
 #TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
 #test to verify search functions
 ADD_EXECUTABLE(testSearch testSearch.cpp)
 TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
--- a/auto-tuning/testCollimatorPhysics.cpp
+++ b/auto-tuning/testCollimatorPhysics.cpp
@ -1,161 +0,0 @@
 #include <iostream>
 #include <vector>
 #include <string>
 #include "DKSOPAL.h"
 typedef struct {
  int label;
  unsigned localID;
  double Rincol[3];
  double Pincol[3];
 } PART;
 PART initPartSmall(int d) {
  PART p;
  p.label = 0;
  p.localID = d;
  p.Rincol[0] = 0.0;
  p.Rincol[1] = 0.0;
  p.Rincol[2] = 0.02;
  p.Pincol[0] = 0.0;
  p.Pincol[1] = 0.0;
  p.Pincol[2] = 3.9920183237269791e-01;
  return p;
 }
 void printPart(PART p) {
  std::cout << "label: " << p.label << ", ";
  std::cout << "localid: " << p.localID << ",";
  std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
  std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
  std::cout << std::endl;
 }
 void initParts(PART *p, int N) {
  for (int i = 0; i < N; i++)
    p[i] = initPartSmall(i);
 }
 void printParts(PART *p, int N) {
  for (int i = 0; i < N; i++)
    printPart(p[i]);
  std::cout << std::endl;
 }
 void initParams(double *data) {
  data[0]  = 0.0;//2.0000000000000000e-02;
  data[1]  = 1.0;//1.0000000000000000e-02;	
  data[2]  = 2.2100000000000000e+00;
  data[3]  = 6.0000000000000000e+00;	
  data[4]  = 1.2010700000000000e+01;	
  data[5]  = 2.6010000000000000e+00;	
  data[6]  = 1.7010000000000000e+03;	
  data[7]  = 1.2790000000000000e+03;	
  data[8]  = 1.6379999999999999e-02;	
  data[9]  = 1.9321266968325795e-01;	
  data[10] = 7.9000000000000000e+01;	
  data[11] = 1.0000000000000002e-12;
 }
 int main(int argc, char *argv[]) {
  int loop = 10;
  int numpart = 1e5;
  char *api_name = new char[10];
  char *device_name = new char[10];
  strcpy(api_name, "Cuda");
  strcpy(device_name, "-gpu");
  for (int i = 1; i < argc; i++) {
    if (argv[i] == std::string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    }
    if (argv[i] == std::string("-npart")) {
      numpart = atoi(argv[i+1]);
      i++;
    }
    if (argv[i] == std::string("-loop")) {
      loop = atoi(argv[i+1]);
      i++;
    }
  }
  std::cout << "=========================BEGIN TEST=========================" << std::endl;
  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
  std::cout << "Number of particles: " << numpart << std::endl;
  std::cout << "Number of loops: " << loop << std::endl;
  std::cout << "------------------------------------------------------------" << std::endl;
  //init part vector to test mc
  PART *parts = new PART[numpart];
  initParts(parts, numpart);
  double *params = new double[12];
  initParams(params);
  //init dks
  int ierr;
  DKSOPAL base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(api_name));
  ierr = base.initDevice();
  if (ierr != DKS_SUCCESS)
    std::cout << "Error with init device!" << std::endl;
  //init random
  base.callInitRandoms(numpart);
  //**test collimator physics and sort***//
  void *part_ptr, *param_ptr;
  //allocate memory for particles
  part_ptr = base.allocateMemory<PART>(numpart, ierr);
  param_ptr = base.allocateMemory<double>(12, ierr);
  //transfer data to device
  base.writeData<PART>(part_ptr, parts, numpart);
  base.writeData<double>(param_ptr, params, 12);
  int numaddback;
  base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
  base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);  
  base.syncDevice();
  //read data from device
  base.readData<PART>(part_ptr, parts, numpart);
  //free memory
  base.freeMemory<PART>(part_ptr, numpart);
  base.freeMemory<double>(param_ptr, 12);  
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++) {
    std::cout << parts[i].label << "\t" 
 	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
 	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
 	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
 	      << std::endl;
  }
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++) {
    std::cout << parts[i].label << "\t" 
 	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
 	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
 	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
 	      << std::endl;
  }
  return 0;
 }
--- a/auto-tuning/testPushKick.cpp
+++ b/auto-tuning/testPushKick.cpp
@ -1,132 +0,0 @@
 #include <iostream>
 #include <vector>
 #include <string>
 #include "DKSOPAL.h"
 #include <vector_types.h>
 #include "cuda_runtime.h"
 void initData(double3 *data, int N) {
  for (int i = 0; i < N; i++) {
    data[i].x = (double)rand() / RAND_MAX;
    data[i].y = (double)rand() / RAND_MAX;
    data[i].z = (double)rand() / RAND_MAX;
  }
 }
 void initDt(double *data, int N) {
  for (int i = 0; i < N; i++) {
    data[i] = 0.00001;
  }
 }
 int main(int argc, char *argv[]) {
  int loop = 10;
  int numpart = 1e5;
  char *api_name = new char[10];
  char *device_name = new char[10];
  strcpy(api_name, "Cuda");
  strcpy(device_name, "-gpu");
  for (int i = 1; i < argc; i++) {
    if (argv[i] == std::string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    }
    if (argv[i] == std::string("-npart")) {
      numpart = atoi(argv[i+1]);
      i++;
    }
    if (argv[i] == std::string("-loop")) {
      loop = atoi(argv[i+1]);
      i++;
    }
  }
  std::cout << "=========================BEGIN TEST=========================" << std::endl;
  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
  std::cout << "Number of particles: " << numpart << std::endl;
  std::cout << "Number of loops: " << loop << std::endl;
  std::cout << "------------------------------------------------------------" << std::endl;
  int ierr;
  DKSOPAL dksbase;
  dksbase.setAPI(api_name, strlen(api_name));
  dksbase.setDevice(device_name, strlen(api_name));
  ierr = dksbase.initDevice();
  if (ierr != DKS_SUCCESS)
    std::cout << "Error with init device!" << std::endl;
  double3 *R = new double3[numpart];
  double3 *P = new double3[numpart];
  double3 *Ef = new double3[numpart];
  double3 *Bf = new double3[numpart];
  double *dt = new double[numpart];
  initData(R, numpart);
  initData(P, numpart);
  initData(Ef, numpart);
  initData(Bf, numpart);
  initDt(dt, numpart);
  void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
  r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
  dksbase.writeData<double3>(r_ptr, R, numpart);
  dksbase.writeData<double3>(p_ptr, P, numpart);
  dksbase.writeData<double3>(ef_ptr, Ef, numpart);
  dksbase.writeData<double3>(bf_ptr, Bf, numpart);
  dksbase.writeData<double>(dt_ptr, dt, numpart);
  for (int i = 0; i < loop; ++i)
    dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std::cout << "============" << std::endl;
  dksbase.readData<double3>(r_ptr, R, numpart);
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  dksbase.freeMemory<double3>(r_ptr, numpart);
  dksbase.freeMemory<double3>(p_ptr, numpart);
  dksbase.freeMemory<double3>(ef_ptr, numpart);
  dksbase.freeMemory<double3>(bf_ptr, numpart);
  dksbase.freeMemory<double>(dt_ptr, numpart);
  delete[] R;
  delete[] P;
  delete[] Ef;
  delete[] Bf;
  delete[] dt;
 }
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -4,4 +4,4 @@ SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 SET(${PROJECT_NAME}_LIBRARY "dks")
 SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
 SET(DKS_VERSION ${DKS_VERSION})
-SET(DKS_VERSION_STR ${DKS_VERSION_STR})
+SET(DKS_VERSION_STR ${DKS_VERSION_STR})
--- a/src/Algorithms/CMakeLists.txt
+++ b/src/Algorithms/CMakeLists.txt
@ -6,7 +6,6 @@ SET (_HDRS
 	ImageReconstruction.h
 	CollimatorPhysics.h
 	FFT.h
 	GreensFunction.h
  )
 ADD_SOURCES (${_SRCS})
--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@ -5,7 +5,10 @@
 #include <string>
 #include "../DKSDefinitions.h"
 class DKSBaseMuSR;
 class DKSCollimatorPhysics {
  friend class DKSBaseMuSR;
 protected:
@ -16,7 +19,8 @@ public:
  virtual ~DKSCollimatorPhysics() { }
-  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;
+  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
 				bool enableRutherfordScattering = true) = 0;
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
@ -33,10 +37,6 @@ public:
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;
  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@ -1,29 +0,0 @@
 #ifndef H_GREENSFUNCTION
 #define H_GREENSFUNCTION
 #include <iostream>
 #include <cmath>
 class GreensFunction {
 public:
  virtual ~GreensFunction() { }
  /** calc greens integral, as defined in OPAL */
  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
  /** integration if rho2_m, see OPAL for more details */
  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;
  /** mirror rho2_m field */
  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
  /** multiply two complex fields from device memory */
  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
 };
 #endif
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -35,29 +35,13 @@ ENDMACRO ()
 SET (DKS_BASEDIR_HDRS
  DKSBase.h
  DKSDefinitions.h
  DKSOPAL.h
  )
 SET (DKS_BASEDIR_SRCS
  DKSBase.cpp
  DKSOPAL.cpp
  )
-#add opal to DKS if enable_opal is set
+IF (USE_CUDA OR USE_OPENCL)
 IF (ENABLE_OPAL)
  SET (DKS_BASEDIR_HDRS
    ${DKS_BASEDIR_HDRS}
    DKSOPAL.h
    )
  SET (DKS_BASEDIR_SRCS
    ${DKS_BASEDIR_SRCS}
    DKSOPAL.cpp
    )
 ENDIF (ENABLE_OPAL)
 #and musrt to DKS if cuda or opencl is used and enable_musr is set
 IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
   SET (DKS_BASEDIR_HDRS
       ${DKS_BASEDIR_HDRS}
       DKSBaseMuSR.h
@ -67,10 +51,9 @@ IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
       ${DKS_BASEDIR_SRCS}
       DKSBaseMuSR.cpp
       )
-ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
+ENDIF (USE_CUDA OR USE_OPENCL)
-#add image reconstruction to DKS if cuda is used and enable_pet is set
+IF (USE_CUDA)
 IF (USE_CUDA AND ENABLE_PET)
  SET (DKS_BASEDIR_HDRS
    ${DKS_BASEDIR_HDRS}
    DKSImageReconstruction.h
@ -80,7 +63,7 @@ IF (USE_CUDA AND ENABLE_PET)
    ${DKS_BASEDIR_SRCS}
    DKSImageReconstruction.cpp
    )
-ENDIF (USE_CUDA AND ENABLE_PET)
+ENDIF (USE_CUDA)
 ADD_HEADERS (${DKS_BASEDIR_HDRS})
 ADD_SOURCES (${DKS_BASEDIR_SRCS})
--- a/src/CUDA/CMakeLists.txt
+++ b/src/CUDA/CMakeLists.txt
@ -1,27 +1,35 @@
-SET (_HDRS CudaBase.cuh)
+SET (_HDRS
-SET (_SRCS CudaBase.cu)
+	CudaBase.cuh
 	CudaFFT.cuh
 	CudaGreensFunction.cuh
 	CudaChiSquare.cuh
 	CudaCollimatorPhysics.cuh
 	CudaImageReconstruction.cuh
 	CudaChiSquareRuntime.cuh
  )
 SET (_SRCS
 	CudaBase.cu
 	CudaFFT.cu
 	CudaGreensFunction.cu
 	CudaChiSquare.cu
 	CudaCollimatorPhysics.cu
 	CudaImageReconstruction.cu
 	CudaChiSquareRuntime.cu
 )
-IF (ENABLE_OPAL)
+#INCLUDE_DIRECTORIES (
-  SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
+#  ${CMAKE_CURRENT_SOURCE_DIR}
-  SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu)
+#)
 ENDIF (ENABLE_OPAL)
 IF (ENABLE_MUSR)
  SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
  SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
  SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
 ENDIF (ENABLE_MUSR)
 IF (ENABLE_PET)
  SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
  SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
 ENDIF (ENABLE_PET)
 MESSAGE (STATUS "CUDA headers: ${_HDRS}")
 ADD_SOURCES(${_SRCS})
 ADD_HEADERS(${_HDRS})
 INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
 SET (_KERNELS
  NVRTCKernels/CudaChiSquareKernel.cu
  )
 INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -13,13 +13,6 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
 }
 __global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size)
    data[idx] = curand_uniform_double(&state[idx]);
 }
 //=====================================//
 //==========Private functions==========//
@ -55,7 +48,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
  int threads = 128;
  int blocks = size / threads + 1;
-  if (seed == -1) 
+  if (seed == -1)
    seed = time(NULL);
  //std::cout << "sizeof: " << sizeof(curandState) << std::endl;
@ -76,15 +69,6 @@ int CudaBase::cuda_deleteCurandStates() {
  return DKS_SUCCESS;
 }
 int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
  int threads = BLOCK_SIZE;
  int blocks = size / threads + 1;
  kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
  return DKS_SUCCESS;
 }
 curandState* CudaBase::cuda_getCurandStates() {
  return defaultRndState;
 }
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@ -15,8 +15,6 @@
 #include <nvToolsExt.h>
 #include <time.h>
 #define BLOCK_SIZE 128
 class CudaBase {
 private:
@ -41,7 +39,6 @@ public:
   * Init cuda random number (cuRand) states.
   * Create an array of type curandState  with "size" elements on the GPU
   * and create a curandState with different seed for each array entry.
   * If no seed is given create a seed based on current time.
   * Return success or error code
   */
  int cuda_createCurandStates(int size, int seed = -1);
@ -53,11 +50,6 @@ public:
   */
  int cuda_deleteCurandStates();
  /** Create 'size' random numbers on the device and save in mem_ptr array
   *
   */
  int cuda_createRandomNumbers(void *mem_ptr, int size);
  /** Get a pointer to curand states
   *
   */
@ -175,40 +167,6 @@ public:
    return DKS_SUCCESS;
  }		
  /** Zero CUDA memory.
   *  Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
    cudaError cerror;
    cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error zeroing cuda memory!\n");
      return DKS_ERROR;
    }
    return DKS_SUCCESS;
  }
  /** Zero CUDA memory.
   *  Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
    int dkserror = DKS_SUCCESS;
    cudaError cerror;
    if (streamId < cuda_numberOfStreams()) {
 	cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size, 
 				 cuda_getStream(streamId));
 	if (cerror != cudaSuccess)
 	  dkserror = DKS_ERROR;
    } else
      dkserror = DKS_ERROR;
    return dkserror;
  }
  /** 
   * Info: write data to memory
   * Retrun: success or error code
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@ -23,6 +23,7 @@
 #define X0_M 9
 #define I_M 10
 #define DT_M 11
 #define LOWENERGY_THR 12
 #define BLOCK_SIZE 128
 #define NUMPAR 12
@ -33,14 +34,6 @@ __device__ inline double dot(double3 &d1, double3 &d2) {
 }
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  double3 tmp;
  tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
  tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
  tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
  return tmp;
 }
 __device__ inline bool checkHit(double &z, double *par) {
  /* check if particle is in the degrader material */
@ -89,7 +82,7 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
    Eng = Eng + delta_E / 1E3;
  }
-  pdead = ((Eng<1E-4) || (dEdx>0));
+  pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );
 }
@ -125,7 +118,9 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
-__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
+__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, 
 				   double* par, bool enableRutherfordScattering) 
 {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
  double gamma = (Eng + M_P) / M_P;
@ -153,7 +148,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-  if(P2 < 0.0047) {
+  if( (P2 < 0.0047) && enableRutherfordScattering) {
    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -179,7 +174,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
  Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
  P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-  if(P2 < 0.0047) {
+  if( (P2 < 0.0047) && enableRutherfordScattering) {
    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -193,7 +188,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
-					int numparticles)
+					int numparticles, bool enableRutherfordScattering)
 {
  //get global id and thread id
@ -235,7 +230,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
-	coulombScat(R[tid], P, s, p); 
+	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
 	data[idx].Pincol = P;
      } else {
@ -258,7 +253,8 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
 }
 __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
-					 curandState *state, int numparticles)
+					 curandState *state, int numparticles,  
 					 bool enableRutherfordScattering)
 {
  //get global id and thread id
@ -296,7 +292,7 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
    	P.x = P.x * ptot / sq;
    	P.y = P.y * ptot / sq;
    	P.z = P.z * ptot / sq;
-    	coulombScat(R[tid], P, s, p); 
+    	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
    	data.Pincol[idx] = P;
    } else {
@ -431,7 +427,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
 }
-__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
+__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
  //get global id and thread id
  volatile int tid = threadIdx.x;
@ -457,51 +453,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
  }
 }
-__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
+//TODO: kernel for push with switch off unitless positions with dt[i]*c
 			   double3 *gBf, double *gdt, double charge, 
 			   double mass, int npart, double c)
 {
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 P = gP[idx];
    double3 Ef = gEf[idx];
    double3 Bf = gBf[idx];
    double dt = gdt[idx];
    P.x += 0.5 * dt * charge * c / mass * Ef.x;
    P.y += 0.5 * dt * charge * c / mass * Ef.y;
    P.z += 0.5 * dt * charge * c / mass * Ef.z;
    double gamma = sqrt(1.0 + dot(P, P));
    double3 t, w, s;
    t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
    t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
    t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
    double3 crossPt = cross(P, t);
    w.x = P.x + crossPt.x;
    w.y = P.y + crossPt.y;
    w.z = P.z + crossPt.z;
    s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
    s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
    s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
    double3 crossws = cross(w, s);
    P.x += crossws.x;
    P.y += crossws.y;
    P.z += crossws.z;
    P.x += 0.5 * dt * charge * c / mass * Ef.x;
    P.y += 0.5 * dt * charge * c / mass * Ef.y;
    P.z += 0.5 * dt * charge * c / mass * Ef.z;
    gP[idx] = P;
  }
 }
 __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
@ -663,7 +615,8 @@ struct less_then
  }
 };
-int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
+int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 					     bool enableRutherfordScattering)
 {
  int threads = BLOCK_SIZE;
@ -676,7 +629,8 @@ int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int n
  kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr, 
 							  (double*)par_ptr,
 							  m_base->cuda_getCurandStates(),
-							  numparticles);
+							  numparticles,
 							  enableRutherfordScattering);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
@ -723,12 +677,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
    }
  } else {
    if (streamId == -1) {
-      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, 
+      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-				      (double*)dt_ptr, npart, c);
+				      (double*)dt_ptr, c);
    } else {
      cudaStream_t cs = m_base->cuda_getStream(streamId);
-      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
+      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-					      (double*)dt_ptr, npart, c);
+					      (double*)dt_ptr, c);
    }
  }
@ -736,29 +690,6 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
  return DKS_SUCCESS;
 }
 int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 						void *bf_ptr, void *dt_ptr, double charge,
 						double mass, int npart,
 						double c, int streamId) 
 {
  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;
  //call kernel
  if (streamId == -1) {
    kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
 				    (double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
 					    (double3*)ef_ptr, (double3*)bf_ptr, 
 					    (double*)dt_ptr, charge, mass,  npart, c);
  }
  return DKS_SUCCESS;
 }
 int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 							 void *lastSec_ptr, void *orient_ptr, 
 							 int npart, int nsec, 
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@ -110,7 +110,7 @@ public:
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-			int numpartices);
+			int numpartices, bool enableRutherfordScattering = true);
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
@ -141,10 +141,6 @@ public:
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);
  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 
  /** BorisPusher push function with transformto function form OPAL
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
--- a/src/CUDA/CudaGreensFunction.cu
+++ b/src/CUDA/CudaGreensFunction.cu
@ -189,11 +189,12 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
      tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
    tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
-    
+  
    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
-
+  
    rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
  }
 }
@ -272,20 +273,28 @@ __global__ void mirroredRhoField(double *rho2_m,
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;
    double data = rho2_m[id1];
-    if (i != 0) rho2_m[id2] = data;
+    if (i != 0)
      rho2_m[id2] = data;
-    if (j != 0) rho2_m[id3] = data;
+    if (j != 0)
      rho2_m[id3] = data;
-    if (i != 0 && j != 0) rho2_m[id4] = data;
+    if (i != 0 && j != 0)
      rho2_m[id4] = data;
-    if (k != 0) rho2_m[id5] = data;
+    if (k != 0) 
      rho2_m[id5] = data;
-    if (k !=  0 && i != 0) rho2_m[id6] = data;
+    if (k !=  0 && i != 0)
      rho2_m[id6] = data;
-    if (k!= 0 && j != 0) rho2_m[id7] = data;
+    if (k!= 0 && j != 0)
      rho2_m[id7] = data;
-    if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
+    if (k != 0 && j != 0 & i != 0)
      rho2_m[id8] = data;
  }
@ -354,9 +363,9 @@ CudaGreensFunction::~CudaGreensFunction() {
    delete m_base;
 }
-int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-				       double hr_m0, double hr_m1, double hr_m2,
+					    double hr_m0, double hr_m1, double hr_m2,
-				       int streamId)
+					    int streamId)
 {
  int thread = 128;
@ -364,7 +373,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
  //if no stream specified use default stream
  if (streamId == -1) {
-    kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
    return DKS_SUCCESS;
  }
@ -372,7 +381,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
    return DKS_SUCCESS;
  }
@ -380,17 +389,15 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
 }
-int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, 
+int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, 
-						  int I, int J, int K,
+						       int I, int J, int K,
-						  int streamId) 
+						       int streamId) 
 {
  int thread = 128;
  int block = (I * J * K / thread) + 1;
  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
  if (streamId == -1) {
    m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
    kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, 
 					    2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -399,7 +406,6 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
    kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, 
 						  2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -409,22 +415,22 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
  return DKS_ERROR;
 }
-int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
+int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  int thread = 128;
  int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
  if (streamId == -1) {
-    mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread >>>( (double *) rho2_m,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
+    mirroredRhoField<<< block, thread >>>( (double *) mem_ptr,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
    return DKS_SUCCESS;
  }
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
+    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
    return DKS_SUCCESS;
  }
@ -434,13 +440,13 @@ int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int st
  return DKS_ERROR;
 }
-int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, 
+int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, 
-					      int size, int streamId) {
+						   int size, int streamId) {
  int threads = 128;
  int blocks = size / threads + 1;
  int datasize = 2 * threads * sizeof(cuDoubleComplex);
-  
+
  if (streamId == -1) {
    multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1, 
 							     (cuDoubleComplex*)ptr2, 
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@ -2,17 +2,17 @@
 #define H_CUDA_GREENSFUNCTION
 #include <iostream>
-#include <cmath>
+#include <math.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuComplex.h>
 #include "cublas_v2.h"
-#include "../Algorithms/GreensFunction.h"
+
 #include "CudaBase.cuh"
-class CudaGreensFunction : public GreensFunction{
+class CudaGreensFunction {
 private:
@ -30,32 +30,32 @@ public:
  /* destructor */
  ~CudaGreensFunction();
-  /**
+  /*
    Info: calc itegral on device memory (taken from OPAL src code)
    Return: success or error code
  */
-  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+  int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-		       double hr_m0, double hr_m1, double hr_m2, 
+			  double hr_m0, double hr_m1, double hr_m2, 
-		       int streamId = -1);
+			  int streamId = -1);
-  /**
+  /*
    Info: integration of rho2_m field (taken from OPAL src code)
    Return: success or error code
  */
-  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
+  int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
-				  int streamId = -1);
+				     int streamId = -1);
-  /**
+  /*
    Info: mirror rho field (taken from OPAL src code)
    Return: succes or error code
  */
-  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
+  int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
-  /**
+  /*
    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
    Return: success or error code
  */
-  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
+  int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
 };
--- a/src/DKSBase.cpp
+++ b/src/DKSBase.cpp
@ -103,14 +103,25 @@ DKSBase::DKSBase() {
 #ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
 #endif
 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif
 #ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
 #endif
 }
@ -127,14 +138,25 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
 #ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
 #endif
 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif
 #ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
 #endif
 }
@ -151,16 +173,27 @@ DKSBase::~DKSBase() {
  if (m_function_name != NULL)
    delete[] m_function_name;
 #ifdef DKS_CUDA
  delete cfft;
  delete cgreens;
  delete cchi;
  delete ccol;
  delete cbase;
 #endif
 #ifdef DKS_OPENCL
  delete oclfft;
  delete oclchi;
  delete oclcol;
  delete oclbase;
 #endif
 #ifdef DKS_MIC
  delete micfft;
  delete miccol;
  delete micgreens;
  delete micchi;
  delete micbase;
 #endif
@ -274,45 +307,38 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
    return DKS_ERROR;
 }
-int DKSBase::setupDevice() {
+/*
-
+  init device
-  int ierr = DKS_ERROR;
+*/
 int DKSBase::initDevice() {
  //if api is not set default is OpenCL
  if (!m_api_set) {
    setDevice("-gpu", 4);
    setAPI(API_OPENCL, 6);
-    ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+    return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
  } else {
    if (apiOpenCL()) {
      if (!m_device_set) {
 	setDevice("-gpu", 4);
 	setAPI(API_OPENCL, 6);
-	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+	return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
      } else {
 	setAPI(API_OPENCL, 6);
-	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
+	return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
      }
    } else if (apiCuda()) {
      setDevice("-gpu", 4);
      setAPI(API_CUDA, 4);			
-      ierr = CUDA_SAFECALL(DKS_SUCCESS);
+      return CUDA_SAFECALL(DKS_SUCCESS);
    } else if (apiOpenMP()) {
      setDevice("-mic", 4);
      setAPI(API_OPENMP, 6);
-      ierr = MIC_SAFECALL(DKS_SUCCESS);
+      return MIC_SAFECALL(DKS_SUCCESS);
    }
  }
-  return ierr;
+  return DKS_ERROR;
 }
 /*
  init device
 */
 int DKSBase::initDevice() {
  return setupDevice();
 }
 /* 
@ -430,15 +456,358 @@ int DKSBase::syncDevice() {
  return DKS_ERROR;
 }
 /* setup fft plans to reuse if multiple ffts of same size are needed */
 int DKSBase::setupFFT(int ndim, int N[3]) {
-int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
+  if (apiCuda()) {
-  if (apiCuda())
+    return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
-    return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
+  } else if (apiOpenMP()) {
-  if (apiOpenCL())
+    //micbase.mic_setupFFT(ndim, N);
-    return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
+    //BENI: setting up RC and CR transformations on MIC
    int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
    int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
    if (ierr1 != DKS_SUCCESS)
      return ierr1;
    if (ierr2 != DKS_SUCCESS)
      return ierr2;
    return DKS_SUCCESS;
  }
  return DKS_ERROR;
 }
 //BENI:
 int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
  if (apiCuda())
    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
  else if (apiOpenMP())
    return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
  return DKS_ERROR;
 }
 //BENI:
 int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
  if (apiCuda())
    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
  else if (apiOpenMP())
    return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
  return DKS_ERROR;
 }
 /* call OpenCL FFT function for selected platform */
 int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    //load kernel and execute
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
      return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
    else
      return DKS_ERROR;
  } else if (apiCuda()) {
    return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
  } else if (apiOpenMP()) {
    return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call OpenCL IFFT function for selected platform */
 int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
      return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
    else
      return DKS_ERROR;
  } else if (apiCuda()) {
    return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
  } else if (apiOpenMP()) {
    return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call normalize FFT function for selected platform */
 int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
      return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
    else 
      return DKS_ERROR;
  } else if (apiCuda()) {
    return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
  } else if (apiOpenMP()) {
    return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call real to complex FFT */
 int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
  else if (apiOpenMP())
    return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call complex to real FFT */
 int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
  else if (apiOpenMP())
    return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* normalize complex to real iFFT */
 int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
  DEBUG_MSG("No implementation for selected platform");
  return DKS_SUCCESS;
 }
 /* normalize complex to real iFFT */
 int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
  if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
      return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
    else
      return DKS_ERROR;
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 				double hz_m0, double hz_m1, double hz_m2, int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ, 
 						      hz_m0, hz_m1, hz_m2, streamId) );
  } else if (apiOpenMP()) {
    //BENI:
    return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
  } 
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 				   int I, int J, int K, int streamId) {
  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
  else if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  if (apiCuda()) 
    return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
  else if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
  else if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
 			     double fTimeResolution, double fRebin,
 			     int sensors, int length, int numpar, double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
 						fTimeResolution, fRebin,
 						sensors, length, numpar, 
 						result));
  } else if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
      return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
 						     fTimeResolution, fRebin,
 						     sensors, length, numpar, result));
    else
      return DKS_ERROR;
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 			       double fTimeResolution, double fRebin, double fGoodBinOffset,
 			       int sensors, int length, int numpar,
 			       double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
 						  fTimeResolution, fRebin, fGoodBinOffset,
 						  sensors, length, numpar,
 						  result));
  } else if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
      return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
 						       fTimeResolution, fRebin, fGoodBinOffset,
 						       sensors, length, numpar, result));
    else
      return DKS_ERROR;
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 				 double fTimeResolution, double fRebin, double fGoodBinOffset,
 				 int sensors, int length, int numpar,
 				 double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
 						    fTimeResolution, fRebin, fGoodBinOffset,
 						    sensors, length, numpar,
 						    result));
  } else if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
      return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
 							 fTimeResolution, fRebin, fGoodBinOffset,
 							 sensors, length, numpar, result));
    else
      return DKS_ERROR;
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 				   int numparticles, int numparams,
 				   int &numaddback, int &numdead) 
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  } else if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
      return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
    else
      return DKS_ERROR;
  } else if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  } 
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles, 
 				    bool enableRutherfordScattering) 
 {
  if (apiCuda())
    return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, 
 						  enableRutherfordScattering) );
  else if (apiOpenMP())
    return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				      void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				      void *px_ptr, void *py_ptr, void *pz_ptr,
 				      void *par_ptr, int numparticles)
 {
  if (apiOpenMP()) {
    return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr, 
 						      rx_ptr, ry_ptr, rz_ptr, 
 						      px_ptr, py_ptr, pz_ptr,
 						      par_ptr,  numparticles) );
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) 
 {
  if (apiCuda())
    return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  else if (apiOpenMP())
    return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 					  void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 					  void *px_ptr, void *py_ptr, void *pz_ptr,
 					  void *par_ptr, int numparticles, int &numaddback) 
 {
  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, 
 							 rx_ptr, ry_ptr, rz_ptr, 
 							 px_ptr, py_ptr, pz_ptr,
 							 par_ptr,  numparticles, numaddback));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callInitRandoms(int size, int seed) {
  if (apiCuda()) 
@ -452,3 +821,43 @@ int DKSBase::callInitRandoms(int size, int seed) {
  return DKS_ERROR;
 }
 int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 				      void *dt_ptr, double dt, double c, 
 				      bool usedt, int streamId) 
 {
  if (apiCuda()) 
    return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, 
 						    usedt, streamId));
  else if (apiOpenMP())
    return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, 
 						     c, usedt, streamId));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					       void *lastSec_ptr, void *orient_ptr, 
 					       int npart, int nsec, void *dt_ptr, double dt, 
 					       double c, bool usedt, int streamId)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, 
 							     lastSec_ptr, orient_ptr,
 							     npart, nsec, dt_ptr, dt, 
 							     c, usedt, streamId));
  } else if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, 
 							      lastSec_ptr, orient_ptr,
 							      npart, nsec, dt_ptr, dt, 
 							      c, usedt, streamId));
  } 
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@ -29,17 +29,31 @@
 #endif
 #include "OpenCL/OpenCLBase.h"
 #include "OpenCL/OpenCLFFT.h"
 #include "OpenCL/OpenCLChiSquare.h"
 #include "OpenCL/OpenCLCollimatorPhysics.h"
 #endif
 #ifdef DKS_CUDA
 #include "CUDA/CudaBase.cuh"
 #include "CUDA/CudaFFT.cuh"
 #include "CUDA/CudaGreensFunction.cuh"
 #include "CUDA/CudaChiSquare.cuh"
 #include "CUDA/CudaCollimatorPhysics.cuh"
 #include "nvToolsExt.h"
 #endif
 #ifdef DKS_MIC
 #include "MIC/MICBase.h"
 #include "MIC/MICChiSquare.h"
 #include "MIC/MICFFT.h"
 #include "MIC/MICCollimatorPhysics.h"
 #include "MIC/MICGreensFunction.hpp"
 #endif
 #include "Algorithms/CollimatorPhysics.h"
 #include "Algorithms/FFT.h"
 #include "AutoTuning/DKSConfig.h"
 /** DKSBase class for handling function calls to DKS library */
@ -59,14 +73,25 @@ private:
 #ifdef DKS_OPENCL	
  OpenCLBase *oclbase;
  OpenCLFFT *oclfft;
  OpenCLChiSquare *oclchi;
  OpenCLCollimatorPhysics *oclcol;
 #endif
 #ifdef DKS_CUDA
  CudaBase *cbase;
  CudaFFT *cfft;
  CudaGreensFunction *cgreens;
  CudaChiSquare *cchi;
  CudaCollimatorPhysics *ccol;
 #endif
 #ifdef DKS_MIC
  MICBase *micbase;
  MICFFT *micfft;
  MICCollimatorPhysics *miccol;
  MICGreensFunction *micgreens;
  MICChiSquare *micchi;
 #endif
 protected:
@ -114,12 +139,6 @@ protected:
  }
 #endif
 #ifdef DKS_MIC
  MICBase *getMICBase() {
    return micbase;
  }
 #endif
  /** Call OpenCL base to load specified kenrel file.
   *
   */
@ -135,7 +154,6 @@ protected:
    return device_name;
  }
 public:
  /** 
@ -155,11 +173,6 @@ public:
   */
  ~DKSBase();
  /** Function to initialize objects based on the device used.
   *  
   */
  int setupDevice();
  /** Turn on auto tuning */
  void setAutoTuningOn() { m_auto_tuning = true; }
@ -392,7 +405,7 @@ public:
    } else if (apiOpenMP()) {
 #ifdef DKS_MIC
      void * mem_ptr = NULL;
-      mem_ptr = micbase->mic_allocateMemory<T>(elements);	
+      mem_ptr = micbase.mic_allocateMemory<T>(elements);	
      return mem_ptr;
 #endif
    }
@ -485,7 +498,7 @@ public:
      return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
+      return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
    } 
@ -519,7 +532,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
+      return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
    } 
    return DKS_ERROR;
@ -819,7 +832,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
+      return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
    } 
    return DKS_ERROR;
@ -847,7 +860,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements, 
+      return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements, 
 						       streamId, offset));
    }
@ -867,23 +880,221 @@ public:
    else if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
    else if (apiOpenMP())
-      return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
+      return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
    return DKS_ERROR;
  }
-  /**
+
-   * Create random numbers on the device and fille mem_data array
+  ///////////////////////////////////////////////
  ///////Function library part of dksbase////////
  ///////////////////////////////////////////////
  /** 
   * Setup FFT function.
   * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
   * each fft will do its own setup according to fft size and dimensions.
   * TODO: opencl and mic implementations
   */
-  int callCreateRandomNumbers(void *mem_ptr, int size);
+  int setupFFT(int ndim, int N[3]);
  //BENI:
  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
  //BENI:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
  /** 
   * Call complex-to-complex fft.
   * Executes in place complex to compelx fft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation
   */
  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex-to-complex ifft.
   * Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation.
   */
  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to complex ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size
   * TODO: mic implementation.
   */
  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call real to complex FFT.
   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations
   */
  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex to real iFFT.
   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations.
   */
  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize compelx to real ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size.
   * TODO: opencl and mic implementations.
   */
  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
  /**
   * Transpose 2D and 3D arrays, OpenCL implementation
   * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose 
   */
  int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 			    int I, int J, int K, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
  /** 
   * Element by element multiplication.
   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
   * TODO: opencl and mic implementations.
   */
  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
  /** 
   * Chi square for parameter fitting on device.
   * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for 
   * intermediate results. Chi square results are put in &results
   */
  int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
 		      double fTimeResolution, double fRebin,
 		      int sensors, int length, int numpar, double &result);
  /** 
   * max-log-likelihood for parameter fitting on device.
   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
   * mem_par - pointer to parameter set, mem_results - pointer for 
   * intermediate results. Chi square results are put in &results.
   * TODO: opencl and mic implementations.
   */
  int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 			double fTimeResolution, double fRebin, double fGoodBinOffser,
 			int sensors, int length, int numpar,
 			double &result);
  /** 
   * max-log-likelihood for parameter fitting on device.
   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
   * mem_par - pointer to parameter set, mem_results - pointer for 
   * intermediate results. Chi square results are put in &results.
   * TODO: opencl and mic implementations.
   */
  int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 			  double fTimeResolution, double fRebin, double fGoodBinOffser,
 			  int sensors, int length, int numpar,
 			  double &result);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			    int numparticles, int numparams, 
 			    int &numaddback, int &numdead);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles, 
 			     bool enableRutherfordScattering = true);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
   */
  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
 			       void *par_ptr, int numparticles);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles, int &numaddback);
  /** 
   * Init random number states and save for reuse on device.
   * If seed is -1, a random seed based on current time is taken.
   * TODO: opencl and mic implementations.
   */
  int callInitRandoms(int size, int seed = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 			       void *dt_ptr, double dt, double c, 
 			       bool usedt = false, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					void *lastSec_ptr, void *orient_ptr, 
 					int npart, int nsec, void *dt_ptr,
 					double dt, double c, bool usedt = false, 
 					int streamId = -1);
  /**
   * Print memory information on device (total, used, available)
   * TODO: opencl and mic imlementation
--- a/src/DKSDefinitions.h
+++ b/src/DKSDefinitions.h
@ -62,12 +62,6 @@
 #define OPENCL_SAFEINIT(x) ( NULL )
 #endif
 #ifdef DKS_AMD
 #define OPENCL_SAFEINIT_AMD(x) ( x )
 #else
 #define OPENCL_SAFEINIT_AMD(x) ( NULL )
 #endif
 #ifdef DKS_MIC
 #define MIC_SAFEINIT(x) ( x )
 #else
--- a/src/DKSOPAL.cpp
+++ b/src/DKSOPAL.cpp
@ -1,298 +0,0 @@
 #include "DKSOPAL.h"
 DKSOPAL::DKSOPAL() {
  dksfft = nullptr;
  dkscol = nullptr;
  dksgreens = nullptr;
 }
 DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
  setAPI(api_name, strlen(api_name));
  setDevice(device_name, strlen(device_name));
 }
 DKSOPAL::~DKSOPAL() {
  delete dksfft;
  delete dkscol;
  delete dksgreens;
 }
 int DKSOPAL::setupOPAL() {
  int ierr = DKS_ERROR;
  if (apiOpenCL()) {
    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
    //TODO: only enable if AMD libraries are available
    dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
    dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
    dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
  } else if (apiCuda()) {
    ierr = CUDA_SAFECALL( DKS_SUCCESS );
    dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
    dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
    dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
  } else if (apiOpenMP()) {
    ierr = MIC_SAFECALL( DKS_SUCCESS );
    dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
    dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
    dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
  } else {
    ierr = DKS_ERROR;
  }
  return ierr;
 }
 int DKSOPAL::initDevice() {
  int ierr = setupDevice();
  if (ierr == DKS_SUCCESS)
    ierr = setupOPAL();
  return ierr;
 }
 /* setup fft plans to reuse if multiple ffts of same size are needed */
 int DKSOPAL::setupFFT(int ndim, int N[3]) {
  if (apiCuda()) {
    return dksfft->setupFFT(ndim, N);
  } else if (apiOpenCL()) {
    int ierr1 = dksfft->setupFFT(ndim, N);
    int ierr2 = dksfft->setupFFTRC(ndim, N);
    int ierr3 = dksfft->setupFFTCR(ndim, N);
    if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
      return DKS_ERROR;
    return DKS_SUCCESS;
  } else if (apiOpenMP()) {
    //micbase.mic_setupFFT(ndim, N);
    //BENI: setting up RC and CR transformations on MIC
    int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
    int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
    if (ierr1 != DKS_SUCCESS)
      return ierr1;
    if (ierr2 != DKS_SUCCESS)
      return ierr2;
    return DKS_SUCCESS;
  }
  return DKS_ERROR;
 }
 //BENI:
 int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
  if (apiCuda())
    return dksfft->setupFFT(ndim, N);
  if (apiOpenCL())
    return dksfft->setupFFTRC(ndim, N);
  else if (apiOpenMP())
    return dksfft->setupFFTRC(ndim, N, scale);
  return DKS_ERROR;
 }
 //BENI:
 int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
  if (apiCuda())
    return dksfft->setupFFT(ndim, N);
  if (apiOpenCL())
    return dksfft->setupFFTCR(ndim, N);
  else if (apiOpenMP())
    return dksfft->setupFFTCR(ndim, N, scale);
  return DKS_ERROR;
 }
 /* call OpenCL FFT function for selected platform */
 int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL() || apiOpenMP()) 
    return dksfft->executeFFT(data_ptr, ndim, dimsize);
  else if (apiCuda())
    return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call OpenCL IFFT function for selected platform */
 int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL() || apiOpenMP())
      return dksfft->executeIFFT(data_ptr, ndim, dimsize);
  else if (apiCuda()) 
    return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call normalize FFT function for selected platform */
 int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
      return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
    else 
      return DKS_ERROR;
  } else if (apiCuda()) {
    return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
  } else if (apiOpenMP()) {
    return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call real to complex FFT */
 int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
  else if (apiOpenCL() || apiOpenMP())
    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* call complex to real FFT */
 int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
  else if (apiOpenCL() || apiOpenMP())
    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /* normalize complex to real iFFT */
 int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda())
    return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
  else if (apiOpenCL())
    return DKS_ERROR;
  else if (apiOpenMP())
    return DKS_ERROR;
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 				double hz_m0, double hz_m1, double hz_m2, int streamId) {
    return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, 
 				     hz_m0, hz_m1, hz_m2, streamId);
 }
 int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 				   int I, int J, int K, int streamId) {
  return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
 }
 int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);  
 }
 int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
  return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
 }
 int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 				   int numparticles, int numparams,
 				   int &numaddback, int &numdead) 
 {
  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
 }
 int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles) 
 {
  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
 }
 int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				      void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				      void *px_ptr, void *py_ptr, void *pz_ptr,
 				      void *par_ptr, int numparticles)
 {
    return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr, 
 					rx_ptr, ry_ptr, rz_ptr, 
 					px_ptr, py_ptr, pz_ptr,
 					par_ptr,  numparticles);
 }
 int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) 
 {
  return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
 }
 int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 					  void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 					  void *px_ptr, void *py_ptr, void *pz_ptr,
 					  void *par_ptr, int numparticles, int &numaddback) 
 {
  return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, 
 						       rx_ptr, ry_ptr, rz_ptr, 
 						       px_ptr, py_ptr, pz_ptr,
 						       par_ptr,  numparticles, numaddback));
 }
 int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 				      void *dt_ptr, double dt, double c, 
 				      bool usedt, int streamId) 
 {
  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
 }
 int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, 
 				      int npart, double c, int streamId) {
  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
 }
 int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				      void *bf_ptr, void *dt_ptr, double charge, double mass,
 				      int npart, double c, int streamId) 
 {
  return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr, 
 				      charge, mass, npart, c, streamId);
 }
 int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					       void *lastSec_ptr, void *orient_ptr, 
 					       int npart, int nsec, void *dt_ptr, double dt, 
 					       double c, bool usedt, int streamId)
 {
  return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
 					       npart, nsec, dt_ptr, dt, c, usedt, streamId);
 }
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@ -1,233 +0,0 @@
 #ifndef H_DKS_OPAL
 #define H_DKS_OPAL
 #include <iostream>
 #include "AutoTuning/DKSAutoTuning.h"
 #include "DKSBase.h"
 #include "DKSDefinitions.h"
 #include "Algorithms/GreensFunction.h"
 #include "Algorithms/CollimatorPhysics.h"
 #include "Algorithms/FFT.h"
 #ifdef DKS_AMD
 #include "OpenCL/OpenCLFFT.h"
 #include "OpenCL/OpenCLGreensFunction.h"
 #include "OpenCL/OpenCLCollimatorPhysics.h"
 #endif
 #ifdef DKS_CUDA
 #include "CUDA/CudaFFT.cuh"
 #include "CUDA/CudaGreensFunction.cuh"
 #include "CUDA/CudaCollimatorPhysics.cuh"
 #endif
 #ifdef DKS_MIC
 #include "MIC/MICFFT.h"
 #include "MIC/MICGreensFunction.hpp"
 #include "MIC/MICCollimatorPhysics.h"
 #endif
 class DKSOPAL : public DKSBase {
 private: 
  DKSFFT *dksfft;
  DKSCollimatorPhysics *dkscol;
  GreensFunction *dksgreens;
  int setupOPAL();
 public:
  DKSOPAL();
  DKSOPAL(const char* api_name, const char* device_name);
  ~DKSOPAL();
  int initDevice();
  ///////////////////////////////////////////////
  ///////Function library part of dksbase////////
  ///////////////////////////////////////////////
  /** 
   * Setup FFT function.
   * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
   * each fft will do its own setup according to fft size and dimensions.
   * TODO: opencl and mic implementations
   */
  int setupFFT(int ndim, int N[3]);
  //BENI:
  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
  //BENI:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
 /** 
   * Call complex-to-complex fft.
   * Executes in place complex to compelx fft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation
   */
  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex-to-complex ifft.
   * Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation.
   */
  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to complex ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size
   * TODO: mic implementation.
   */
  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call real to complex FFT.
   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations
   */
  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex to real iFFT.
   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations.
   */
  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize compelx to real ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size.
   * TODO: opencl and mic implementations.
   */
  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 			    int I, int J, int K, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
  /** 
   * Element by element multiplication.
   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
   * TODO: opencl and mic implementations.
   */
  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			    int numparticles, int numparams, 
 			    int &numaddback, int &numdead);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
   */
  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
 			       void *par_ptr, int numparticles);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles, int &numaddback);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 			       void *dt_ptr, double dt, double c, 
 			       bool usedt = false, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					void *lastSec_ptr, void *orient_ptr, 
 					int npart, int nsec, void *dt_ptr,
 					double dt, double c, bool usedt = false, 
 					int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, 
 			       int npart, double c, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			       void *bf_ptr, void *dt_ptr, double charge, 
 			       double mass, int npart, double c, int streamId = -1);
 };
 #endif
--- a/src/MIC/CMakeLists.txt
+++ b/src/MIC/CMakeLists.txt
@ -1,24 +1,19 @@
-SET (_SRCS MICBase.cpp)
+SET (_SRCS
-SET (_HDRS MICBase.h)
+  MICBase.cpp
  MICChiSquare.cpp
  MICFFT.cpp
  MICGreensFunction.cpp  
  MICCollimatorPhysics.cpp
  )
-IF (ENABLE_OPAL)
+SET (_HDRS
-  SET (_SRCS
+  MICBase.h
-    ${_SRCS}
+  MICChiSquare.h
-    MICChiSquare.cpp
+  MICFFT.h
-    MICFFT.cpp
+  MICCollimatorPhysics.h
-    MICGreensFunction.cpp  
+  MICGreensFunction.hpp    
-    MICCollimatorPhysics.cpp
+  MICMergeSort.h
-    )
+  )
  SET (_HDRS
    ${_HDRS}
    MICChiSquare.h
    MICFFT.h
    MICCollimatorPhysics.h
    MICGreensFunction.hpp    
    MICMergeSort.h
    )
 ENDIF (ENABLE_OPAL)
 #INCLUDE_DIRECTORIES (
 #  ${CMAKE_CURRENT_SOURCE_DIR}
--- a/src/MIC/MICBase.cpp
+++ b/src/MIC/MICBase.cpp
@ -18,28 +18,30 @@ int MICBase::mic_createRandStreams(int size) {
  int seed = time(NULL);
-  int numThreads = 0;
+#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
 #pragma offload target(mic:m_device_id) inout(numThreads)
  {
    //get the number of threads
    int numThreads;
 #pragma omp parallel
    numThreads = omp_get_num_threads();
  }
-  defaultRndStream =  mic_allocateMemory<VSLStreamStatePtr>(numThreads);
+    //if default rnd stream already allocated delete the array
-  VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
+    if (defaultRndSet == 1)    
-  maxThreads = numThreads; 
+      delete[] defaultRndStream;
-  
+
-#pragma offload target(mic:m_device_id) \
+    //allocate defaultRndStream array
-  in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN)        \
+    defaultRndStream = new VSLStreamStatePtr[numThreads];
-  in(seed)
+
  {
    //create stream states for each thread
 #pragma omp parallel for
    for (int i = 0; i < omp_get_num_threads(); i++)
-      vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
+      vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
  }
-  defaultRndSet = 1;
+    defaultRndSet = 1;
  }
  return DKS_SUCCESS;
 }
@ -47,8 +49,15 @@ int MICBase::mic_createRandStreams(int size) {
 //delete default rand streams
 int MICBase::mic_deleteRandStreams() {
-  //mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
+#pragma offload target(mic:m_device_id) inout(defaultRndSet)
-  return DKS_SUCCESS;
+  {
    if (defaultRndSet == 1) {
      delete[] defaultRndStream;
      defaultRndSet = -1;
    }
  }
  return DKS_ERROR;
 }
 //create a new signal for the mic
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@ -30,19 +30,14 @@ class MICBase {
 private:
  std::vector<int> micStreams;
  int maxThreads; 
 protected:
  int defaultRndSet;
 public:
-
+  VSLStreamStatePtr *defaultRndStream;
 //#pragma offload_attribute(push,target(mic))
  void *defaultRndStream; //VSLSStreamStatePtr
  void *testPtr;
 //#pragma offload_attribute(pop)
  int m_device_id;
  /* constructor */
@ -207,6 +202,7 @@ public:
 #pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
    {
    }
    return DKS_SUCCESS;
  }
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@ -292,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -338,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -368,23 +368,23 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
 }
-int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
+int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
 					    int numparticles, boll enableRutherfordScattering) 
 {
  //cast device memory pointers to appropriate types
  MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
  double *par = (double*) par_ptr;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(numparticles)
  {
 #pragma omp parallel 
    {
-      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
      //for loop trough particles if not checkhit set label to -2 and update R.x
@ -461,8 +461,6 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  int padding = numparticles % MIC_WIDTH;
  int totalpart = numparticles + padding;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
@ -473,16 +471,14 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  in(py:length(0) DKS_REUSE DKS_RETAIN)	\
  in(pz:length(0) DKS_REUSE DKS_RETAIN)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(totalpart)
  {
 #pragma omp parallel
    {
      //every thread gets its own rnd stream state
-      //VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
-      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
+
      #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -518,11 +514,9 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 	  double Eng = (sq - 1) * M_P;
 	  double dEdx = 0;
 	  if (label[i] == 0) {
 	    energyLoss(Eng, dEdx, par, randv, i - ii);
 	  }
 	  if (Eng > 1e-4 && dEdx < 0) {
 	    double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
@ -534,12 +528,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 	  if (Eng < 1e-4 || dEdx > 0)
 	    label[i] = -1;
-	      
+	        
 	} //end inner energy loss loop
      } //end outer energy loss loop
-      
+      } //end outer energy loss loop
      //vectorize coulomb scattering as much as possible
 #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -549,7 +542,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
    } //end omp parallel
  } //end offload
-   
+     
  return DKS_SUCCESS;
 }
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@ -26,7 +26,7 @@ typedef struct {
 } MIC_PART_SMALL;
-class MICCollimatorPhysics : public DKSCollimatorPhysics {
+class MICCollimatorPhysics : DKSAlogorithms{
 private:
@ -38,9 +38,10 @@ public:
    m_micbase = base;
  };
-  ~MICCollimatorPhysics() {  };
+  ~MICCollimatorPhysics() { };
-  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherfordScattering = true);
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/MIC/MICFFT.cpp
+++ b/src/MIC/MICFFT.cpp
@ -6,16 +6,13 @@
 MICFFT::MICFFT(MICBase *base) {
  m_micbase = base;
  m_fftsetup = false;
 }
 MICFFT::~MICFFT() {
  if (m_fftsetup) {
 #pragma offload target(mic:0)
-    {
+  {
-      DftiFreeDescriptor(&FFTHandle_m);
+    DftiFreeDescriptor(&FFTHandle_m);
-      DftiFreeDescriptor(&handle);
+    DftiFreeDescriptor(&handle);
    }
  }
 }
@ -38,7 +35,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
  }
-  m_fftsetup = true;
+
  return DKS_SUCCESS;
 }
 //BENI:
@ -125,8 +122,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
 }
 //execute iFFT
-int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
+int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
-  return executeFFT(mem_ptr, ndim, N, -1, false);
+  return mic_executeFFT(mem_ptr, ndim, N, -1, false);
 }
 //execute REAL->COMPLEX FFT
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@ -7,14 +7,13 @@
 #include <offload.h>
 #include <mkl_dfti.h>
-#include "../Algorithms/FFT.h"
+#include "../Algorithm/DKSFFT.h"
 #include "MICBase.h"
 class MICFFT : public DKSFFT {
 private:
  bool m_fftsetup;
  MICBase *m_micbase;
  /// Internal FFT object for performing serial FFTs.
@ -75,18 +74,6 @@ public:
  /* normalize IFFT on MIC */
  int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
  /**
   * Info: destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT() { return DKS_SUCCESS; }
  /*
    Info: execute normalize for complex to real iFFT
    Return: success or error code
  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
 };
 #endif
--- a/src/MIC/MICGreensFunction.cpp
+++ b/src/MIC/MICGreensFunction.cpp
@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
  }
 */
-int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
+int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
-				      double hr_m0, double hr_m1, double hr_m2, int streamId) 
+					  double hr_m1, double hr_m2) 
 {
-  double *tmp_ptr = (double*) tmpgreen;
+  double *tmp_ptr = (double*) tmp_ptr_;
 #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
  {
    std::memset(tmp_ptr,0,I*J*K);
@ -173,14 +173,12 @@ return 0;
 */
 //CUDA similar version:
-int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
+int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
-					int streamId) 
+  double *tmpgreen = (double*) tmp_ptr_;
-{
+  double *mem_ptr = (double*) mem_ptr_;
  double *tmpgreen_ptr = (double*) tmpgreen;
  double *mem_ptr = (double*) rho2_m;
  // the actual integration
-#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); 
    std::memset(mem_ptr,0,II*JJ*KK);
@ -199,27 +197,27 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
 	  tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
 	  if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+	    tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
 	  if (i+1 < NI_tmp)
-	    tmp1 = tmpgreen_ptr[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  if (j+1 < NJ_tmp)
-	    tmp2 = tmpgreen_ptr[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  if (k+1 < NK_tmp)
-	    tmp3 = tmpgreen_ptr[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
 	  if (i+1 < NI_tmp && j+1 < NJ_tmp)
-	    tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
+	    tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
 	  if (i+1 < NI_tmp && k+1 < NK_tmp)
-	    tmp5 = tmpgreen_ptr[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
 	  if (j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp6 = tmpgreen_ptr[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
-	  tmp7 = tmpgreen_ptr[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	  tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
@ -236,8 +234,8 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
-int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
+int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
-  double *mem_ptr = (double*) rho2_m;	
+  double *mem_ptr = (double*) mem_ptr_;	
 #pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
@ -283,11 +281,11 @@ int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int str
 }
 /*multiply complex fields*/
-int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
+int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
  //	  double *mem_ptr1 = (double*) mem_ptr1_;
  //	  double *mem_ptr2 = (double*) mem_ptr2_;
-  _Complex double *mem_ptr1 = (_Complex double *) ptr1;
+  _Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
-  _Complex double *mem_ptr2 = (_Complex double *) ptr2;
+  _Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
 #pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
  {
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@ -9,13 +9,12 @@
 #include <offload.h>
 #include <mkl_dfti.h>
 #include "../Algorithms/GreensFunction.h"
 #include "MICBase.h"
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
-class MICGreensFunction : public GreensFunction {
+class MICGreensFunction {
 private:
  MICBase *m_micbase;
@ -29,18 +28,16 @@ public:
  ~MICGreensFunction();
  /* compute greens integral analytically */
-  int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
+  int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
 		     double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
  /* perform the actual integration */
-  int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K, 
+  int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
 				int stremaId = -1);
  /* Mirror rho-Field */
-  int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
+  int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
  /*multiply complex fields*/
-  int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
+  int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
 };
--- a/src/OpenCL/CMakeLists.txt
+++ b/src/OpenCL/CMakeLists.txt
@ -1,39 +1,31 @@
-#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
+SET (_SRCS
-
+	OpenCLBase.cpp
-SET (_HDRS OpenCLBase.h)
+	OpenCLFFT.cpp
-SET (_SRCS OpenCLBase.cpp)
+	OpenCLChiSquare.cpp
-SET (_KERNELS "")
+	OpenCLCollimatorPhysics.cpp
-
+	OpenCLChiSquareRuntime.cpp
-IF (ENABLE_MUSR)
+  )
-  SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
+
-  SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
+SET (_HDRS
-  SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
+	OpenCLBase.h
-ENDIF (ENABLE_MUSR)
+	OpenCLFFT.h
-
+	OpenCLChiSquare.h
-IF (ENABLE_AMD AND ENABLE_OPAL)
+	OpenCLCollimatorPhysics.h
-  SET (_SRCS
+	OpenCLChiSquareRuntime.h
-    ${_SRCS}
+  )
-    OpenCLFFT.cpp
+
-    OpenCLCollimatorPhysics.cpp
+#INCLUDE_DIRECTORIES (
-    OpenCLGreensFunction.cpp
+#  ${CMAKE_CURRENT_SOURCE_DIR}
-    )
+#)
-
+
-  SET (_HDRS
+SET (_KERNELS
-    ${_HDRS}
+  OpenCLKernels/OpenCLChiSquare.cl
-    OpenCLFFT.h
+  OpenCLKernels/OpenCLFFT.cl
-    OpenCLCollimatorPhysics.h
+  OpenCLKernels/OpenCLFFTStockham.cl
-    OpenCLGreensFunction.h
+  OpenCLKernels/OpenCLTranspose.cl
-    )
+  OpenCLKernels/OpenCLCollimatorPhysics.cl
-
+  OpenCLKernels/OpenCLChiSquareRuntime.cl
  SET (_KERNELS
    ${_KERNELS}
    OpenCLKernels/OpenCLFFT.cl
    OpenCLKernels/OpenCLFFTStockham.cl
    OpenCLKernels/OpenCLTranspose.cl
    OpenCLKernels/OpenCLCollimatorPhysics.cl
    OpenCLKernels/OpenCLGreensFunction.cl
  )
 ENDIF (ENABLE_AMD AND ENABLE_OPAL)
 ADD_SOURCES (${_SRCS})
 ADD_HEADERS (${_HDRS})
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@ -7,13 +7,21 @@ cl_device_id OpenCLBase::m_device_id = NULL;
 cl_event OpenCLBase::m_last_event = NULL;
 OpenCLBase::OpenCLBase() {
  //m_context = NULL;
  //m_command_queue = NULL;
  m_program = NULL;
  m_kernel = NULL;
  //m_device_id = NULL;
  //m_platform_id = NULL;
  m_kernel_file = NULL;
  m_last_event = NULL;
  //m_events = new cl_event[500];
  //m_num_events = 0;
  defaultRndSet = 0;
 }
 OpenCLBase::~OpenCLBase() {
@ -33,11 +41,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
  ocl_loadKernel(kernel_file);
  delete[] kernel_file;
-  
+
  //allocate memory for rand states
  int ierr;
  defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
-  
+
  //exec kernel
  int seed = 0;
  ocl_createKernel("initRand");
@ -47,34 +55,13 @@ int OpenCLBase::ocl_createRndStates(int size) {
  size_t work_items = size;
  size_t work_group_size = 1;
  ocl_executeKernel(1, &work_items, &work_group_size);
  defaultRndSet = 1;
  return DKS_SUCCESS;
 }
-int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
+  return OCL_SUCCESS;
  //load kernel
  char * kernel_file = new char[500];
  kernel_file[0] = '\0';
  strcat(kernel_file, OPENCL_KERNELS);
  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
  ocl_loadKernel(kernel_file);
  delete[] kernel_file;
  //set kernel variables
  cl_mem tmp_data = (cl_mem) mem_ptr;
  ocl_createKernel("createRandoms");
  ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
  ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
  ocl_setKernelArg(2, sizeof(int), &size);
  size_t work_size = 128;
  size_t work_items = (size % work_size + 1) * work_size;
  ocl_executeKernel(1, &work_items, &work_size);
  return DKS_SUCCESS;
 }
 /* destroy rnd states */
@ -83,7 +70,7 @@ int OpenCLBase::ocl_deleteRndStates() {
  ocl_freeMemory(defaultRndState);
  defaultRndSet = 0;
-  return DKS_SUCCESS;
+  return OCL_SUCCESS;
 }
@ -441,8 +428,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  int ierr;
  //create program from kernel
-  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, 
+  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
 					NULL, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
    return DKS_ERROR;
@ -452,7 +438,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
  /*
-    check if compiling kernel source succeded, if failed return error code
+    check if compileng kernel source succeded, if failed return error code
    if in debug mode get compilation info and print program build log witch
    will give indication what made the compilation fail
  */
@ -461,8 +447,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
    //get build status
    cl_build_status status;
-    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, 
+    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
 			  sizeof(cl_build_status), &status, NULL);
    //get log size
    size_t log_size;
@ -628,12 +613,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
    }
  }
-  if (ierr != DKS_SUCCESS) {
+  if (ierr != OCL_SUCCESS) {
    DEBUG_MSG("Failed to build kernel file " << kernel_file);
-    return DKS_ERROR;
+    return OCL_ERROR;
  }
-  return DKS_SUCCESS;
+  return OCL_SUCCESS;
 }
 //compile kernel form source code provided
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@ -30,10 +30,13 @@
 #include <CL/cl_ext.h>
 #endif
 #include "../DKSDefinitions.h"
 /* struct for random number state */
 typedef struct {
  double s10;
  double s11;
  double s12;
@ -42,12 +45,16 @@ typedef struct {
  double s22;
  double z;
  bool gen;
 } RNDState;
 class OpenCLBase {
 private:
  static cl_context m_context;
  static cl_command_queue m_command_queue;
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;
@ -111,9 +118,6 @@ protected:
 public:
  static cl_context m_context;
  static cl_command_queue m_command_queue;
  /*
    constructor
@ -131,11 +135,6 @@ public:
  */
  int ocl_createRndStates(int size);
  /* Create an array of random numbers on the device
   *
   */
  int ocl_createRandomNumbers(void *mem_ptr, int size);
  /*
    Destroy rnd states
    Return: success or error code
@ -196,7 +195,7 @@ public:
    Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);
-
+	
  /*
    Name: allocateMemory
    Info: allocate memory on device
@ -204,20 +203,6 @@ public:
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
  /** Zero OpenCL memory buffer
   *  Set all the elemetns in the device array to zero
   */
  template <typename T>
  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
    cl_int ierr;
    ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset, 
 			       sizeof(T)*size, 0, nullptr, nullptr);
    if (ierr != CL_SUCCESS)
      return DKS_ERROR;
    return DKS_SUCCESS;
  }
  /*
    Name: writeData
    Info: write data to device memory (needs ptr to mem object)
--- a/src/OpenCL/OpenCLCollimatorPhysics.cpp
+++ b/src/OpenCL/OpenCLCollimatorPhysics.cpp
@ -34,7 +34,7 @@ TODO:
 2. boost.compute sort for user defined structure crashes
 */
 int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-					       int numparticles) 
+					       int numparticles, bool enableRutherfordScattering) 
 {
  /*
  //set number of total threads, and number threads per block
--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@ -52,7 +52,8 @@ public:
  }
  /* execute degrader code on device */
-  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherfordScattering = true);
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/OpenCL/OpenCLFFT.cpp
+++ b/src/OpenCL/OpenCLFFT.cpp
@ -31,6 +31,7 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
    return OCL_ERROR;
  //execute kernel
  for (int step = 1; step < N; step <<= 1) {
@ -88,78 +89,26 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
  call fft execution on device for every dimension
 */
 int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
-
+  int ierr;
-  int dkserr = DKS_SUCCESS;
+	
  cl_int ierr;
  cl_mem inout = (cl_mem)data;
  int n = N[0];
-  if (forward)
+  for (int dim = 0; dim < ndim; dim++) {
-    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
+    ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
-				0, NULL, NULL, &inout, NULL, NULL);
+    if (ierr != OCL_SUCCESS) {
-  else
+      DEBUG_MSG("Error executing bit reverse");
-    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
+      return OCL_ERROR;
-				0, NULL, NULL, &inout, NULL, NULL);  
+    }
-  if (ierr != OCL_SUCCESS) {
+    ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
-    dkserr = DKS_ERROR;
+    if (ierr != OCL_SUCCESS) {
-    DEBUG_MSG("Error executing cfFFT\n");
+      DEBUG_MSG("Error executing fft reverse");
-    if (ierr == CLFFT_INVALID_PLAN)
+      return OCL_ERROR;
-      std::cout << "Invlalid plan" << std::endl;
+    }
    else 
      std::cout << "CLFFT error" << std::endl;
  }
-  return dkserr;
+  return OCL_SUCCESS;
 }
 /*
  call rcfft execution on device for every dimension
 */
 int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
  int dkserr = DKS_SUCCESS;
  cl_int ierr;
  cl_mem real_in = (cl_mem)real_ptr;
  cl_mem comp_out = (cl_mem)comp_ptr;
  ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
 			       0, NULL, NULL, &real_in, &comp_out, NULL);
  if (ierr != OCL_SUCCESS) {
    dkserr = DKS_ERROR;
    DEBUG_MSG("Error executing cfFFT\n");
    if (ierr == CLFFT_INVALID_PLAN)
      std::cout << "Invlalid plan" << std::endl;
    else 
      std::cout << "CLFFT error" << std::endl;
  }
  return dkserr;
 }
 /*
  call rcfft execution on device for every dimension
 */
 int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
  int dkserr = DKS_SUCCESS;
  cl_int ierr;
  cl_mem real_in = (cl_mem)real_ptr;
  cl_mem comp_out = (cl_mem)comp_ptr;
  ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
 				0, NULL, NULL, &comp_out, &real_in, NULL);
  if (ierr != OCL_SUCCESS) {
    dkserr = DKS_ERROR;
    DEBUG_MSG("Error executing cfFFT\n");
    if (ierr == CLFFT_INVALID_PLAN)
      std::cout << "Invlalid plan" << std::endl;
    else 
      std::cout << "CLFFT error" << std::endl;
  }
  return dkserr;
 }
 /*
@ -171,11 +120,10 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
 }
 /*
-  call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
+  call kernel to normalize fft
 */
 int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
 /*
  cl_mem inout = (cl_mem)data;
  int n = N[0];
@ -202,175 +150,132 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
    DEBUG_MSG("Error executing kernel");
    return OCL_ERROR;
  }
-*/	
+	
  return OCL_SUCCESS;
 }
-int OpenCLFFT::setupFFT(int ndim, int N[3]) {
+int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
  int ierr;
  int size = sizeof(cl_double2)*pow(N,ndim);
  cl_mem mem_tmp;
  cl_mem mem_src = (cl_mem)src;
  cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
-  cl_int err;
+  //set the number of work items in each dimension
  size_t work_items[3];
  int p = 1;
  int threads = N / 2;
  int f = (forward) ? -1 : 1;
  //execute kernel
  int n = (int)log2(N);
  for (int i = 0; i < ndim; i++) {
-  clfftDim dim;
+    int dim = i+1;
-  if (ndim == 1)
+    p = 1;
-    dim = CLFFT_1D;
+    work_items[0] = (dim == 1) ? N/2 : N;
-  else if (ndim == 2)
+    work_items[1] = (dim == 2) ? N/2 : N;
-    dim = CLFFT_2D;
+    work_items[2] = (dim == 3) ? N/2 : N;
-  else 
+		
-    dim = CLFFT_3D;
+    //transpose array if calculating dimension larger than 1
-  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
+    //if (dim > 1) 
    //	ocl_executeTranspose(mem_src, N, ndim, dim);
    //create kernel and set kernel arguments
    if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
      return OCL_ERROR;
    for (int t = 1; t <= log2(N); t++) {
      m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
      m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
      m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
      m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
      m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
      m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
      if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) 
 	return OCL_ERROR;
-  /* Create 3D fft plan*/
+      mem_tmp = mem_src;
-  err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
+      mem_src = mem_dst;
-
+      mem_dst = mem_tmp;
-  /* Set plan parameters */
+	
-  err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
+      p = 2*p;
  if (err != CL_SUCCESS)
    std::cout << "Error setting precision" << std::endl;
  err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
  if (err != CL_SUCCESS)
    std::cout << "Error setting layout" << std::endl;
  err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
  if (err != CL_SUCCESS)
    std::cout << "Error setting result location" << std::endl;
  /* Bake the plan */
  err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
  if (err != CL_SUCCESS) {
    DEBUG_MSG("Error creating Complex-to-complex plan");
    return DKS_ERROR;
  }
  return DKS_SUCCESS;
 }
 int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
  cl_int err;
  clfftDim dim;
  if (ndim == 1)
    dim = CLFFT_1D;
  else if (ndim == 2)
    dim = CLFFT_2D;
  else 
    dim = CLFFT_3D;
  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
  size_t half = (size_t)N[0] / 2 + 1;
  size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
  size_t clOutStride[3] = {1, half, half * N[1]};
  /* Create 3D fft plan*/
  err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
  /* Set plan parameters */
  err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
  err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
  err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
  err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
  err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
  /* Bake the plan */
  err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
  if (err != CL_SUCCESS) {
    DEBUG_MSG("Error creating Real-to-complex plan");
    return DKS_ERROR;
  }
  return DKS_SUCCESS;
 }
 int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
  cl_int err;
  clfftDim dim;
  if (ndim == 1)
    dim = CLFFT_1D;
  else if (ndim == 2)
    dim = CLFFT_2D;
  else
    dim = CLFFT_3D;
  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
  size_t half = (size_t)N[0] / 2 + 1;
  size_t clInStride[3] = {1, half, half * N[1]};
  size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
  /* Create 3D fft plan*/
  err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
  /* Set plan parameters */
  err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
  err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
  err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
  err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
  err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
  /* Bake the plan */
  err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
  if (err != CL_SUCCESS) {
    DEBUG_MSG("Error creating Complex-to-real plan");
    return DKS_ERROR;
  }
  return DKS_SUCCESS;
 }
 int OpenCLFFT::destroyFFT() {
  clfftDestroyPlan(&planHandleZ2Z);
  clfftDestroyPlan(&planHandleD2Z);
  clfftDestroyPlan(&planHandleZ2D);
  clfftTeardown();
  return DKS_SUCCESS;
 }
 void OpenCLFFT::printError(clfftStatus err) {
  if (err != CL_SUCCESS) {
    std::cout << "Error creating default plan " << err <<  std::endl;
    switch(err) {
    case CLFFT_BUGCHECK: 
      std::cout << "bugcheck" << std::endl; 
      break;
    case CLFFT_NOTIMPLEMENTED: 
      std::cout << "not implemented" << std::endl; 
      break;
    case CLFFT_TRANSPOSED_NOTIMPLEMENTED: 
      std::cout << "transposed not implemented" << std::endl; 
      break;
    case CLFFT_FILE_NOT_FOUND: 
      std::cout << "file not found" << std::endl; 
      break;
    case CLFFT_FILE_CREATE_FAILURE: 
      std::cout << "file create failure" << std::endl; 
      break;
    case CLFFT_VERSION_MISMATCH: 
      std::cout << "version missmatch" << std::endl; 
      break;
    case CLFFT_INVALID_PLAN: 
      std::cout << "invalid plan" << std::endl; 
      break;
    case CLFFT_DEVICE_NO_DOUBLE: 
      std::cout << "no double" << std::endl; 
      break;
    case CLFFT_DEVICE_MISMATCH: 
      std::cout << "device missmatch" << std::endl; 
      break;
    case CLFFT_ENDSTATUS: 
      std::cout << "end status" << std::endl; 
      break;
    default: 
      std::cout << "other: " << err << std::endl;
      break;
    }
    //transpose array back if calculating dimension larger than 1
    //if (dim > 1)
    //	ocl_executeTranspose(mem_src, N, ndim, dim);
  }	
  if (ndim*n % 2 == 1) {
    m_oclbase->ocl_copyData(mem_src, mem_dst, size);
    mem_tmp = mem_src;
    mem_src = mem_dst;
    mem_dst = mem_tmp;
  }
  m_oclbase->ocl_freeMemory(mem_dst);
  return OCL_SUCCESS;
 }
 int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
  cl_mem mem_src = (cl_mem)src;
  size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
  size_t work_group_size[3] = {(size_t)N/2, 1, 1};
  m_oclbase->ocl_createKernel("fft_batch3D");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
  m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
  m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
  m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
  for (int dim = 1; dim < ndim+1; dim++) {
    m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
    m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
  }
  return OCL_SUCCESS;
 }
 int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
  cl_mem mem_src = (cl_mem)src;
  if (ndim == 1)
    return OCL_SUCCESS;
  size_t work_items[3];
  work_items[0] = N[0];
  work_items[1] = N[1];
  work_items[2] = 1;
  size_t work_group_size[3];
  work_group_size[0] = N[0];
  work_group_size[1] = N[1];
  work_group_size[2] = 1;
  size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
  m_oclbase->ocl_createKernel("transpose");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
  m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
  m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
  m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
  m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
  return OCL_SUCCESS;
 }
 /*
--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@ -20,19 +20,12 @@
 #include "../Algorithms/FFT.h"
 #include "OpenCLBase.h"
 #include "clFFT.h"
 class OpenCLFFT : public DKSFFT {
 private:
  OpenCLBase *m_oclbase;
  clfftSetupData fftSetup;
  clfftPlanHandle planHandleZ2Z;
  clfftPlanHandle planHandleD2Z;
  clfftPlanHandle planHandleZ2D;
  /*
    Info: call fft kernels to execute FFT of the given domain,
    data - devevice memory ptr, cdim - current dim to transform, 
@ -49,31 +42,15 @@ private:
  */
  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
  /** Get clfftStatus and print the corresponding error message.
   *  clfftStatus is returned from all clFFT library functions, print error displays the
   *  corresponding error message. If "other" is printed then error code corresponds to 
   *  OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
   *  be checked to determine the reason for the error.
   */
  void printError(clfftStatus err);
 public:
  /* constructor - currently does nothing*/
  OpenCLFFT(OpenCLBase *base) {
    m_oclbase = base;
    /* Set up fft */
    cl_int err;
    err = clfftInitSetupData(&fftSetup);
    err = clfftSetup(&fftSetup);
    if (err != CL_SUCCESS)
      DEBUG_MSG("Error seting up clFFT");
  }
  /* destructor - currently does nothing*/
-  ~OpenCLFFT() { destroyFFT(); }
+  ~OpenCLFFT() { }
  /*
    Info: execute forward fft function with data set on device
@ -100,23 +77,35 @@ public:
    Info: set FFT size
    Return: success or error code
  */
-  int setupFFT(int ndim, int N[3]);
+  int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
-  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
-  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
-  int destroyFFT();
+  int destroyFFT() { return DKS_SUCCESS; }
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-		   int streamId = -1);
+				int streamId = -1)
    {
      return DKS_ERROR;
    }
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-		   int streamId = -1);
+				int streamId = -1)
    {
      return DKS_ERROR;
    }
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
    {
      return DKS_ERROR;
    }
  int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
  int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
  int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
  //void printData3DN4(cl_double2* &data, int N);
 };
--- a/src/OpenCL/OpenCLGreensFunction.cpp
+++ b/src/OpenCL/OpenCLGreensFunction.cpp
@ -1,181 +0,0 @@
 #include "OpenCLGreensFunction.h"
 #define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
 OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
  m_base = base;
  base_create = false;
 }
 OpenCLGreensFunction::OpenCLGreensFunction() {
  m_base = new OpenCLBase();
  base_create = true;
 }
 OpenCLGreensFunction::~OpenCLGreensFunction() {
  if (base_create)
    delete m_base;
 }
 int OpenCLGreensFunction::buildProgram() {
  char *kernel_file = new char[500];
  kernel_file[0] = '\0';
  strcat(kernel_file, OPENCL_KERNELS);
  strcat(kernel_file, GREENS_KERNEL);
  return m_base->ocl_loadKernel(kernel_file);
 }
 int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
 					 double hr_m0, double hr_m1, double hr_m2, 
 					 int streamId)
 {
  int ierr = DKS_SUCCESS;
  //compile opencl program from source
  buildProgram();
  //cast the input data ptr to cl_mem
  cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
  //set the work item size
  size_t work_size = 128;
  size_t work_items = I * J * K;
  if (work_items % work_size > 0) 
    work_items = (work_items / work_size + 1) * work_size;
  //create kernel
  ierr = m_base->ocl_createKernel("kernelTmpgreen");
  //set kernel parameters
  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
  m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
  m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
  m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
  m_base->ocl_setKernelArg(4, sizeof(int), &I);
  m_base->ocl_setKernelArg(5, sizeof(int), &J);
  m_base->ocl_setKernelArg(6, sizeof(int), &K);
  //execute kernel
  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
  return ierr;
 }
 int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
 						    int K, int streamId)
 {
  int ierr = DKS_SUCCESS;
  //compile opencl program from source
  buildProgram();
  //cast the input data ptr to cl_mem
  cl_mem rho2_ptr = (cl_mem)rho2_m;
  cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
  int NI = 2*(I - 1);
  int NJ = 2*(J - 1);
  //set the work item size
  size_t work_size = 128;
  size_t work_items = I * J * K;
  if (work_items % work_size > 0) 
    work_items = (work_items / work_size + 1) * work_size;
  //create kernel
  ierr = m_base->ocl_createKernel("kernelIntegration");
  //set kernel parameters
  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
  m_base->ocl_setKernelArg(2, sizeof(int), &NI);
  m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
  m_base->ocl_setKernelArg(4, sizeof(int), &I);
  m_base->ocl_setKernelArg(5, sizeof(int), &J);
  m_base->ocl_setKernelArg(6, sizeof(int), &K);
  //execute kernel
  double zero = 0.0;
  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
  m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
  return ierr;
 }
 int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) 
 {
  int ierr = DKS_SUCCESS;
  //compile opencl program from source
  buildProgram();
  //cast the input data ptr to cl_mem
  cl_mem rho2_ptr = (cl_mem)rho2_m;
  int NI = I + 1;
  int NJ = J + 1;
  int NK = K + 1;
  int I2 = 2*I;
  int J2 = 2*J;
  int K2 = 2*K;
  int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
  //set the work item size
  size_t work_size = 128;
  size_t work_items = NI * NJ * NK;
  if (work_items % work_size > 0) 
    work_items = (work_items / work_size + 1) * work_size;
  //create kernel
  ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
  //set kernel parameters
  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
  m_base->ocl_setKernelArg(1, sizeof(int), &I2);
  m_base->ocl_setKernelArg(2, sizeof(int), &J2);
  m_base->ocl_setKernelArg(3, sizeof(int), &K2);
  m_base->ocl_setKernelArg(4, sizeof(int), &NI);
  m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
  m_base->ocl_setKernelArg(6, sizeof(int), &NK);
  m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
  //execute kernel
  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
  return ierr;
 }
 int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
 {
    int ierr = DKS_SUCCESS;
  //compile opencl program from source
  buildProgram();
  //cast the input data ptr to cl_mem
  cl_mem mem_ptr1 = (cl_mem) ptr1;
  cl_mem mem_ptr2 = (cl_mem) ptr2;
  //set the work item size
  size_t work_size = 128;
  size_t work_items = size;
  if (work_items % work_size > 0) 
    work_items = (work_items / work_size + 1) * work_size;
  //create kernel
  ierr = m_base->ocl_createKernel("multiplyComplexFields");
  //set kernel parameters
  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
  m_base->ocl_setKernelArg(2, sizeof(int), &size);
  //execute kernel
  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
  return ierr;
 }
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@ -1,63 +0,0 @@
 #ifndef H_OPENCL_GREENSFUNCTION
 #define H_OPENCL_GREENSFUNCTION
 #include <iostream>
 #include <cmath>
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"
 class OpenCLGreensFunction : public GreensFunction {
 private:
  bool base_create;
  OpenCLBase *m_base;
 public:
  /** Constructor with OpenCLBase argument */
  OpenCLGreensFunction(OpenCLBase *base);
  /** Default constructor */
  OpenCLGreensFunction();
  /** Destructor */
  ~OpenCLGreensFunction();
  /** Load OpenCL kernel file containing greens function kernels.
   *  m_base takes the kernel file and compiles the OpenCL programm.
   */
  int buildProgram();
  /**
    Info: calc itegral on device memory (taken from OPAL src code)
    Return: success or error code
  */
  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
 		       double hr_m0, double hr_m1, double hr_m2, 
 		       int streamId = -1);
  /**
    Info: integration of rho2_m field (taken from OPAL src code)
    Return: success or error code
  */
  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
  /**
    Info: mirror rho field (taken from OPAL src code)
    Return: succes or error code
  */
  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
  /**
    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
    Return: success or error code
  */
  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
 };
 #endif
--- a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@ -1,4 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #pragma OPENCL EXTENSION 
 /******Random numbers********/
@ -87,14 +89,13 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
  if (id < N) {
    RNDState tmp;
-    int tmp_seed = 2*id;// * 0x100000000ULL;
+    int tmp_seed = id;// * 0x100000000ULL;
    tmp.s10 = 12345 + tmp_seed;
    tmp.s11 = 12345 + tmp_seed;
-    tmp.s12 = 12345 + tmp_seed;
+    tmp.s12 = 123 + tmp_seed;
    tmp.s20 = 12345 + tmp_seed;
    tmp.s21 = 12345 + tmp_seed;
-    tmp.s22 = 12345 + tmp_seed;
+    tmp.s22 = 123 + tmp_seed;
    tmp.z = 0;
    tmp.gen = true;
@ -104,19 +105,6 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
 }
 /* create random numbers and fill an array */
 __kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
  int idx = get_global_id(0);
  if (idx < size) {
    RNDState s = states[idx];
    data[idx] = rand_uniform(&s);
    states[idx] = s;
  }
 }
 /**********Degrader**********/
 enum PARAMS { POSITION, 
--- a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
@ -1,170 +0,0 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 /** compute the greens integral analytically */
 __kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
 			     int NI, int NJ, int NK)
 {
  int tid = get_local_size(0);
  int id = get_global_id(0);
  if (id < NI * NJ * NK) {
    int i = id % NI;
    int k = id / (NI * NJ);
    int j = (id - k * NI * NJ) / NI;
    double cellVolume = hr_m0 * hr_m1 * hr_m2;
    double vv0 = i * hr_m0 - hr_m0 / 2;
    double vv1 = j * hr_m1 - hr_m1 / 2;
    double vv2 = k * hr_m2 - hr_m2 / 2;
    double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
    double tmpgrn  = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
    tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
    tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
    tmpgrn = tmpgrn / 2;
    tmpgrn += vv1 * vv2 * log(vv0 + r);
    tmpgrn += vv0 * vv2 * log(vv1 + r);
    tmpgrn += vv0 * vv1 * log(vv2 + r);
    tmpgreen[id] = tmpgrn / cellVolume;
  }
 }
 /** perform the actual integration */
 __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen, 
 				int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) 
 {
  int tid = get_local_id(0);
  int id = get_global_id(0);
  int ni = NI;
  int nj = NJ;
  double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
    tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
    tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
    if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
      tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
    if (i+1 < NI_tmp)
      tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
    if (j+1 < NJ_tmp)
      tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
    if (k+1 < NK_tmp)
      tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
    if (i+1 < NI_tmp && j+1 < NJ_tmp)
      tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
    if (i+1 < NI_tmp && k+1 < NK_tmp)
      tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
    if (j+1 < NJ_tmp && k+1 < NK_tmp)
      tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
    tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
    rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
  }
 }
 /** miror rho-field */
 __kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
  rho2_m[0] = rho2_m[NI*NJ];
 }
 __kernel void kernelMirroredRhoField(__global double *rho2_m, 
 				     int NI, int NJ, int NK, 
 				     int NI_tmp, int NJ_tmp, int NK_tmp,
 				     int size) 
 {
  int tid = get_local_id(0);
  int id = get_global_id(0);
  if (id == 0)
   rho2_m[0] = rho2_m[NI * NJ];
  barrier(CLK_GLOBAL_MEM_FENCE);
  int id1, id2, id3, id4, id5, id6, id7, id8;
  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
    int ri = NI - i;
    int rj = NJ - j;
    int rk = NK - k;
    id1 = k * NI * NJ + j * NI + i;
    id2 = k * NI * NJ + j * NI + ri;
    id3 = k * NI * NJ + rj * NI + i;
    id4 = k * NI * NJ + rj * NI + ri;
    id5 = rk * NI * NJ + j * NI + i;
    id6 = rk * NI * NJ + j * NI + ri;
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;
    double data = 0.0;
    if (id1 < size)
      data = rho2_m[id1];
    if (i != 0 && id2 < size) rho2_m[id2] = data;
    if (j != 0 && id3 < size) rho2_m[id3] = data;
    if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
    if (k != 0 && id5 < size) rho2_m[id5] = data;
    if (k !=  0 && i != 0 && id6 < size) rho2_m[id6] = data;
    if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
    if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;     
  }
 }
 /** multiply complex fields */
 double2 ComplexMul(double2 a, double2 b) {
  double2 c;
  c.x = a.x * b.x - a.y * b.y;
  c.y = a.x * b.y + a.y * b.x;
  return c;
 }
 __kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2, 
 				    int size) 
 {
  int idx = get_global_id(0);
  if (idx < size)
    ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
 }
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #ADD_EXECUTABLE(testFFT testFFT.cpp)
 #ADD_EXECUTABLE(testMIC testMIC.cpp)
 #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
-ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
+#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
-ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
+#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
 #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
 #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,11 +22,10 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 #ADD_EXECUTABLE(testGather testGather.cpp)
 #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
 #ADD_EXECUTABLE(testTranspose testTranspose.cpp)
 ADD_EXECUTABLE(testRandom testRandom.cpp)
 ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
-ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
+#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
 #ADD_EXECUTABLE(testPush testPush.cpp)
-ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
 #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -39,8 +38,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #TARGET_LINK_LIBRARIES(testFFT dks)
 #TARGET_LINK_LIBRARIES(testMIC dks)
 #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
-TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFT3D dks)
-TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
 #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -54,11 +53,10 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testGather dks)
 #TARGET_LINK_LIBRARIES(testGatherAsync dks)
 #TARGET_LINK_LIBRARIES(testTranspose dks)
-TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
-TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
 TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testPush dks)
-TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
 #TARGET_LINK_LIBRARIES(testIntegration dks)
 #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
@ -83,4 +81,4 @@ TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}
 #IF (NOT CUDA_VERSION VERSION_LESS "7.0")
  #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
  #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
-#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
+#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
--- a/test/testCollimatorPhysicsSoA.cpp
+++ b/test/testCollimatorPhysicsSoA.cpp
@ -129,9 +129,7 @@ int main(int argc, char *argv[]) {
  //init random
  base.callInitRandoms(numpart);
  //**test collimator physics and sort***//
  void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
  //allocate memory for particles
@ -212,8 +210,8 @@ int main(int argc, char *argv[]) {
  base.freeMemory<double>(pz_ptr, numpart);
  base.freeMemory<double>(param_ptr, 12);
-  
+
-    /*  
+  /*  
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++) {
    std::cout <<  p.label[i] << "\t" << p.rx[i] 
--- a/test/testFFT3D.cpp
+++ b/test/testFFT3D.cpp
@ -1,7 +1,6 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
 #include <string>
 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -19,30 +18,22 @@ int main(int argc, char *argv[]) {
  int N = 16;
  char *api_name = new char[10];
  char *device_name = new char[10];
-
+  if (argc == 2) {
-  for (int i = 1; i < argc; i++) {
+    N = atoi(argv[1]);
-    if (argv[i] == string("-cuda")) {
+    strcpy(api_name, "Cuda");
-      strcpy(api_name, "Cuda");
+    strcpy(device_name, "-gpu");
-      strcpy(device_name, "-gpu");
+  } else if (argc == 3) {
-    } 
+    N = atoi(argv[1]);
-
+    strcpy(api_name, argv[2]);
-    if (argv[i] == string("-opencl")) {
+    strcpy(device_name, "-gpu");
-      strcpy(api_name, "OpenCL");
+  } else if (argc == 4) {
-      strcpy(device_name, "-gpu");
+    N = atoi(argv[1]);
-    } 
+    strcpy(api_name, argv[2]);
-
+    strcpy(device_name, argv[3]);
-    if (argv[i] == string("-mic")) {
+  } else {
-      strcpy(api_name, "OpenMP");
+    N = 16;
-      strcpy(device_name, "-mic");
+    strcpy(api_name, "OpenCL");
-    } 
+    strcpy(device_name, "-gpu");
    if (argv[i] == string("-cpu")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
    }
    if (argv[i] == string("-N"))
      N = atoi(argv[i+1]);
  }
  cout << "Use api: " << api_name << ", " << device_name << endl;
@ -83,16 +74,9 @@ int main(int argc, char *argv[]) {
  /* write data to device */	
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
  if (N < 5)
    printData3DN4(cdata, N, 3);
  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);
  if (N < 5) {
    base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
    printData3DN4(cfft, N, 3);
  }
  /* execute ifft */	
  base.callIFFT(mem_ptr, 3, dimsize);
@ -102,9 +86,7 @@ int main(int argc, char *argv[]) {
  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
-  if (N < 5)
+	
    printData3DN4(cifft, N, 3);	
  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
@ -148,7 +130,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
 	if (a < 10e-5 && a > -10e-5)
 	  a = 0;
-	cout << "(" << d << "," << a << ") ";
+	cout << d << "; " << a << "\t";
      }
    }
    cout << endl;
@ -175,5 +157,3 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
 }
--- a/test/testFFT3DRC.cpp
+++ b/test/testFFT3DRC.cpp
@ -1,8 +1,6 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
 #include <fstream>
 #include <iomanip>
 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -10,53 +8,54 @@
 using namespace std;
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
-void initData(double *data, int dimsize[3], int dim);
+void initData(double *data, int dimsize[3]);
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, 
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
 		char *api_name, char *device_name, char *file_name);
 void printHelp();
 void printData3DN4(complex<double>* &data, int N, int dim);
 void printData3DN4(double* &data, int N, int dim);
 double precision(double a) {
  //if (a < 1e-10)
  //  return 0.0;
  //else
    return a;
 }
 int main(int argc, char *argv[]) {
  int N1 = 8;
  int N2 = 8;
  int N3 = 8;
  int dim = 3;
-  int loop = 0;
+  int loop = 10;
  char *api_name = new char[10];
  char *device_name = new char[10];
  char *file_name = new char[50];
-  if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
+  if ( readParams(argc, argv, N1, N2, N3, loop) )
    return 0;
-  cout << "Use api: " << api_name << ", " << device_name << endl;
+  int dimsize[3] = {N3, N2, N1};
  int dimsize[3] = {N1, N2, N3};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
  double *rdata = new double[sizereal];
  double *outdata = new double[sizereal];
  complex<double> *cfft = new complex<double>[sizecomp];
-  initData(rdata, dimsize, dim);
+
  for (int i=0; i<sizecomp; ++i) {
    cfft[i].real() = 7.;
    cfft[i].imag() = 3.33;
  }
  initData(rdata, dimsize);
  /* init DKSBase */
  cout << "Init device and set function" << endl;
 #ifdef DKS_MIC
  DKSBase base;
-  base.setAPI(api_name, strlen(api_name));
+  base.setAPI("OpenMP", 6);
-  base.setDevice(device_name, strlen(device_name));
+  base.setDevice("-mic", 4);
  base.initDevice();
  base.setupFFTRC(dim, dimsize);
  /* setup backward fft (COMPLEX->REAL) */
  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
 #endif
 #ifdef DKS_CUDA
  DKSBase base;
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();
  base.setupFFT(dim, dimsize);
 #endif
  // allocate memory on device
  int ierr;
@ -68,59 +67,69 @@ int main(int argc, char *argv[]) {
  // execute one run before starting the timers
  base.writeData<double>(real_ptr, rdata, sizereal);
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
  base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
  base.readData<double>(real_res_ptr, outdata, sizereal);
  //timer for total loop time, FFT and IFFT calls
  struct timeval timeStart, timeEnd;
  struct timeval timeFFTStart[loop], timeFFTEnd[loop];
  struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
  gettimeofday(&timeStart, NULL);
  for (int i=0; i<loop; ++i){
    // write data to device
    base.writeData<double>(real_ptr, rdata, sizereal);
    // execute rcfft
    gettimeofday(&timeFFTStart[i], NULL);
    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&timeFFTEnd[i], NULL);
    // execute crfft
    gettimeofday(&timeIFFTStart[i], NULL);
    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&timeIFFTEnd[i], NULL);
    //normalize
 #ifdef DKS_CUDA
    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
 #endif
    // read IFFT data from device
    base.readData<double>(real_res_ptr, outdata, sizereal);
  ofstream myfile;
  myfile.open(file_name);
  myfile<< "in\tout\treal\timag\n";
  for (int i = 0; i < sizereal; i++) {
    //myfile << precision(rdata[i]) << "\t";
    //myfile << precision(outdata[i]) << "\t";
    if (i < sizecomp) {
      myfile << precision(cfft[i].real()) << "\t";
      myfile << precision(cfft[i].imag());
    }
    myfile << "\n";
  }
-  myfile.close();
+  gettimeofday(&timeEnd, NULL);
 /*
  if (dim == 2) {
    for (int i = 0; i < N2; i++) {
      for (int j = 0; j < N1; j++) {
 	cout << rdata[i*N1 + j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }
  if (dim == 2) {
    for (int i = 0; i < N2; i++) {
      for (int j = 0; j < N1 / 2 + 1; j++) {
 	cout << cfft[i*(N1 / 2 + 1)  + j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }
 */
  // free device memory
  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);
  base.freeMemory<double>(real_res_ptr, sizereal);
  // compare in and out data to see if we get back the same results
  cout << "comp" << endl;
  compareData(rdata, outdata, N1, N2, N3, dim);
-  cout << "done" << endl;
+
  //calculate seconds for total time and fft times
  double tfft = 0;
  double tifft = 0;
  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + 
 		  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
  for (int i = 0; i < loop; i++) {
    tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + 
 	      (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
    tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + 
 	      (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
  }
  //print timing results
  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
 	    << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
 	    << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
 	    << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
 	    << "\n\n";
  return 0;
 }
@ -128,10 +137,10 @@ int main(int argc, char *argv[]) {
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  int id;
  double sum = 0;
-  for (int i = 0; i < NK; i++) {
+  for (int i = 0; i < NI; i++) {
    for (int j = 0; j < NJ; j++) {
-      for (int k = 0; k < NI; k++) {
+      for (int k = 0; k < NK; k++) {
-	id = i*NI*NJ + j*NI + k;
+	id = k*NI*NJ + j*NI + i;
 	sum += fabs(data1[id] - data2[id]);
      }
    }
@ -139,21 +148,13 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
  std::cout << "RC <--> CR diff: " << sum << std::endl;
 }
-void initData(double *data, int dimsize[3], int dim) {
+void initData(double *data, int dimsize[3]) {
-  if (dim == 3) {
+  for (int i = 0; i < dimsize[2]; i++) {
    for (int i = 0; i < dimsize[2]; i++)
      for (int j = 0; j < dimsize[1]; j++) 
 	for (int k = 0; k < dimsize[0]; k++) 
 	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
  } else if (dim == 2) {
    for (int j = 0; j < dimsize[1]; j++) {
      for (int k = 0; k < dimsize[0]; k++) {
-	data[j*dimsize[0] + k] = sin(k);
+	data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
      }
    }
  } else {
    for (int k = 0; k < dimsize[0]; k++) 
      data[k] = sin(k);
  }
 }
@ -172,17 +173,10 @@ void printHelp() {
  std::cout << std::endl;
 }
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
 		char *api_name, char *device_name, char *file_name) 
 {
  for (int i = 1; i < argc; i++) {
    if ( argv[i] == std::string("-dim")) {
      dim = atoi(argv[i + 1]);
      i++;
    }
    if ( argv[i] == std::string("-grid") ) {
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
@ -199,72 +193,7 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, in
      printHelp();
      return true;
    }
    if (argv[i] == string("-cuda")) {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
      strcpy(file_name, "cuda_fft.dat");
    } 
    if (argv[i] == string("-opencl")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
      strcpy(file_name, "opencl_fft.dat");
    } 
    if (argv[i] == string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
      strcpy(file_name, "openmp_fft.dat");
    } 
    if (argv[i] == string("-cpu")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
      strcpy(file_name, "opencl_cpu_fft.dat");
    }
  }
  return false;
 }
 void printData3DN4(complex<double>* &data, int N, int dim) {
  for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
      for (int k = 0; k < N/2 + 1; k++) {
 	double d = data[i*N*N + j*N + k].real();
 	double a = data[i*N*N + j*N + k].imag();
 	if (d < 10e-5 && d > -10e-5)
 	  d = 0;
 	if (a < 10e-5 && a > -10e-5)
 	  a = 0;
 	cout << "(" << d << "," << a << ") ";
      }
    }
    cout << endl;
  }
  cout << endl;
 }
 void printData3DN4(double* &data, int N, int dim) {
  for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
      for (int k = 0; k < N; k++) {
 	double d = data[i*N*N + j*N + k];
 	if (d < 10e-5 && d > -10e-5)
 	  d = 0;
 	cout << d << " ";
      }
    }
    cout << endl;
  }
  cout << endl;
 }
--- a/test/testFFTSolver_MIC.cpp
+++ b/test/testFFTSolver_MIC.cpp
@ -1,4 +1,5 @@
 #include <iostream>
 //#include <mpi.h>
 #include <string.h>
 #include "DKSBase.h"
@ -10,265 +11,309 @@ using namespace std;
 void printData3D(double* data, int N, int NI, const char *message = "") {
-  if (strcmp(message, "") != 0)
+	if (strcmp(message, "") != 0)
-    cout << message;
+		cout << message;
-  for (int i = 0; i < NI; i++) {
+	for (int i = 0; i < NI; i++) {
-    for (int j = 0; j < N; j++) {
+		for (int j = 0; j < N; j++) {
-      for (int k = 0; k < N; k++) {
+			for (int k = 0; k < N; k++) {
-	cout << data[i*N*N + j*N + k] << "\t";
+				cout << data[i*N*N + j*N + k] << "\t";
-      }
+			}
-      cout << endl;
+			cout << endl;
-    }
+		}
-    cout << endl;
+		cout << endl;
-  }
+	}
 }
 void initData(double *data, int N) {
-  for (int i = 0; i < N/4 + 1; i++) {
+	for (int i = 0; i < N/4 + 1; i++) {
-    for (int j = 0; j < N/2 + 1; j++) {
+		for (int j = 0; j < N/2 + 1; j++) {
-      for (int k = 0; k < N/2 + 1; k++) {
+			for (int k = 0; k < N/2 + 1; k++) {
-	data[i*N*N + j*N + k] = k+1;
+				data[i*N*N + j*N + k] = k+1;
-      }
+			}
-    }
+		}
-  }
+	}
 }
 void initData2(double *data, int N) {
-  for (int i = 0; i < N; i++)
+	for (int i = 0; i < N; i++)
-    data[i] = i;
+		data[i] = i;
 }
 void initComplex( complex<double> *d, int N) {
-  for (int i = 0; i < N; i++) {
+	for (int i = 0; i < N; i++) {
-    d[i] = complex<double>(2, 0);
+		d[i] = complex<double>(2, 0);
-  }
+	}
 }
 void printComplex(complex<double> *d, int N) {
-  for (int i = 0; i < N; i++)
+	for (int i = 0; i < N; i++)
-    cout << d[i] << "\t";
+		cout << d[i] << "\t";
-  cout << endl;
+	cout << endl;
 }
 void printDouble(double *d, int N) {
  for (int i = 0; i < N; i++)
    cout << d[i] << ", ";
  cout << endl;
 }
 void initMirror(double *data, int n1, int n2, int n3) {
-  int d = 1;
+	int d = 1;
-  for (int i = 0; i < n3; i++) {
+	for (int i = 0; i < n3; i++) {
-    for (int j = 0; j < n2; j++) {
+		for (int j = 0; j < n2; j++) {
-      for (int k = 0; k < n1; k++) {
+			for (int k = 0; k < n1; k++) {
-	if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
+				if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
-	  data[i * n2 * n1 + j * n1 + k] = d++;
+					data[i * n2 * n1 + j * n1 + k] = d++;
-	else
+				else
-	  data[i * n2 * n1 + j * n1 + k] = 0;
+					data[i * n2 * n1 + j * n1 + k] = 0;
-      }
+			}
-    }
+		}
-  }
+	}
 }
 void printDiv(int c) {
-  for (int i = 0; i < c; i++)
+	for (int i = 0; i < c; i++)
-    cout << "-";
+		cout << "-";
-  cout << endl;
+	cout << endl;
 }
 void printMirror(double *data, int n1, int n2, int n3) {
-  printDiv(75);
+	printDiv(75);
-  for (int i = 0; i < n3; i++) {
+	for (int i = 0; i < n3; i++) {
-    for (int j = 0; j < n2; j++) {
+		for (int j = 0; j < n2; j++) {
-      for (int k = 0; k < n1; k++) {
+			for (int k = 0; k < n1; k++) {
-	cout << data[i * n2 * n1 + j * n1 + k] << "\t";
+				cout << data[i * n2 * n1 + j * n1 + k] << "\t";
-      }
+			}
-      cout << endl;
+			cout << endl;
-    }
+		}
-    cout << endl;
+		cout << endl;
-  }
+	}
-  cout << endl;
+	cout << endl;
 }
 double sumData(double *data, int datasize) {
-  double sum = 0;
+	double sum = 0;
-  for (int i = 0; i < datasize; i++)
+	for (int i = 0; i < datasize; i++)
-    sum += data[i];
+		sum += data[i];
-  return sum;
+	return sum;
 }
 int main(int argc, char *argv[]) {
-  char *api_name = new char[10];
+	/* mpi init */
-  char *device_name = new char[10];
+	//int rank, nprocs;
 	//MPI_Init(&argc, &argv);
 	//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-  for (int i = 1; i < argc; i++) {
+	/*
-    if (argv[i] == string("-cuda")) {
+	   if (nprocs != 8) {
-      strcpy(api_name, "Cuda");
+	   cout << "example was set to run with 8 processes" << endl;
-      strcpy(device_name, "-gpu");
+	   cout << "exit..." << endl;
-    } 
+	   return 0;
 	   }
 	   */
-    if (argv[i] == string("-opencl")) {
+	/* set domain size */
-      strcpy(api_name, "OpenCL");
+	int NG[3] = {64, 64, 32};
-      strcpy(device_name, "-gpu");
+	int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
-    } 
+	int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
 	int sizerho = NG[0] * NG[1] * NG[2];
 	int sizegreen = ng[0] * ng[1] * ng[2];
 	int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
 	int id[3];
-    if (argv[i] == string("-mic")) {
+	//id[0] = 0;
-      strcpy(api_name, "OpenMP");
+	//id[1] = NL[1] * (rank % 4);
-      strcpy(device_name, "-mic");
+	//id[2] = NL[2] * (rank / 4);
    } 
-    if (argv[i] == string("-cpu")) {
+	/* print some messages bout the example in the begginig */
-      strcpy(api_name, "OpenCL");
+	cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
-      strcpy(device_name, "-cpu");
+	//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
-    }
+	cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
-  }
+	//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
 	int tmp[3];
 	/*  for (int p = 1; p < nprocs; p++) {
 		MPI_Status mpistatus;
 		MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
 		cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
 		}*/
 	// } else {
 	//   MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
 	// }
-  cout << "Use api: " << api_name << ", " << device_name << endl;
+	/* dks init and create 2 streams */
 	int dkserr;
 	//int streamGreens, streamFFT;
 #ifdef DKS_MIC
 	DKSBase base;
 	base.setAPI("OpenMP", 6);
 	base.setDevice("-mic", 4);
 	base.initDevice();
 #endif
-  /* set domain size */
+#ifdef DKS_CUDA
-  int NG[3] = {64, 64, 32};
+	DKSBase base;
-  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+	base.setAPI("Cuda", 4);
-  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+	base.setDevice("-gpu", 4);
-  int sizerho = NG[0] * NG[1] * NG[2];
+	base.initDevice();
-  int sizegreen = ng[0] * ng[1] * ng[2];
+#endif
  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
-  /* print some messages bout the example in the begginig */
+	//base.createStream(streamFFT);
-  cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+	//if (rank == 0) {
-  cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+	//  base.createStream(streamGreens);
 	base.setupFFT(3, NG);
 	//}
-  /* dks init and create 2 streams */
+	/* allocate memory and init rho field */
-  int dkserr;
+	double *rho = new double[sizerho];
-  DKSBase base;
+	double *rho_out = new double[sizerho];
-  base.setAPI(api_name, strlen(api_name));
+	//double *green_out = new double[sizegreen];
-  base.setDevice(device_name, strlen(device_name));
+	initMirror(rho, NL[0], NL[1], NL[2]);
  base.initDevice();
  base.setupFFT(3, NG);
-  /* allocate memory and init rho field */
+	/*
-  double *rho = new double[sizerho];
+	   allocate memory on device for 
-  double *rho_out = new double[sizerho];
+	   - rho field
-  //double *green_out = new double[sizegreen];
+	   - rho FFT
-  double *mirror_out = new double[sizerho];
+	   - tmpgreen
-  //initMirror(rho, NL[0], NL[1], NL[2]);
+	   - greens integral
-  initMirror(rho, NG[0], NG[1], NG[2]);
+	   - greens integral FFT
 	   */
 	void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
 	// if (rank == 0) {
 	tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
 	rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
 	grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
 	rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
 	grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
 	/* } else {
 	   grntr_ptr = NULL;
 	   rho2_ptr = NULL;
 	   grn_ptr = NULL;
 	   rho2tr_ptr = NULL;
 	   tmpgreen_ptr = NULL;
 	   }*/
  /*
    allocate memory on device for 
    - rho field
    - rho FFT
    - tmpgreen
    - greens integral
    - greens integral FFT
  */
  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
  tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
  rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
  grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
  rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
  grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-  /* =================================================*/
+	/* send and receive pointer to allocated memory on device */
-  /* =================================================*/
+	/*
-  /* =====loop trough fftpoison solver iterations=====*/
+	   if (rank == 0) {
-  /* =================================================*/
+	   for (int p = 1; p < nprocs; p++)
-  /* =================================================*/
+	   base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
 	   } else {
 	   rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
 	   }
 	   MPI_Barrier(MPI_COMM_WORLD);
 	   */
  double old_sum = 0;
-  int hr_m[3] = {1, 1, 1};
+	/* =================================================*/
-  base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
+	/* =================================================*/
 	/* =====loop trough fftpoison solver iterations=====*/
 	/* =================================================*/
 	/* =================================================*/
-  /* calculate greens integral on gpu */
+	double old_sum = 0;
-  base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+	double tmp_sum = 0;
 	for (int l = 0; l < 100; l++) {
 		//MPI_Barrier(MPI_COMM_WORLD);
 		/* on node 0, calculate tmpgreen on gpu */
 		int hr_m[3] = {1, 1, 1};
 		//if (rank == 0)
 		base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
 				hr_m[0], hr_m[1], hr_m[2]);
-  /* mirror the field */
+		/* calculate greens integral on gpu */
-  base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+		//if (rank == 0)
-  /*
+		base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
  base.readData<double>(grn_ptr, mirror_out, sizerho);
  for (int i = 0; i < sizerho; i++)
    cout << mirror_out[i] << " ";
  cout << endl << endl;
-  for (int i = 0; i < sizerho; i++)
+		/* mirror the field */
-    cout << rho[i] << " ";
+		//if (rank == 0)
-  cout << endl << endl;
+		base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
  */
  /* transfer rho field to device */
  base.writeData<double>(rho2_ptr, rho, sizerho);
  /* get FFT of rho field */
  base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-  /* get FFT of mirrored greens integral */
+		/* get FFT of mirrored greens integral */
-  base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+		//if (rank == 0) 
 		base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
-  /* multiply both FFTs */
+		/* transfer rho field to device */
-  base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+		//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
 		base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
 		//MPI_Barrier(MPI_COMM_WORLD);
-  /*
+		/* get FFT of rho field */
-  complex<double> *crho = new complex<double>[sizecomp];
+		//if (rank == 0) {
-  complex<double> *cgre = new complex<double>[sizecomp];
+		//base.syncDevice();
-  base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
+		base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-  base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
+		//}
-  for (int i = 0; i < sizecomp; i++)
+		/* multiply both FFTs */
-    cout << cgre[i].real() << " ";
+		//if (rank == 0)
-  cout << endl << endl;
+		base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
 		//MPI_Barrier(MPI_COMM_WORLD);
-  for (int i = 0; i < sizecomp; i++)
+		/* inverse fft and transfer data back */
-    cout << crho[i].real() << " ";
+		/* 
-  cout << endl << endl;
+		   multiple device syncs and mpi barriers are used to make sure data 
-  
+		   transfer is started when results are ready and progam moves on 
-  delete[] crho;
+		   only when data transfer is finished
-  delete[] cgre;
+		   */
-  */
+		//if (rank == 0) {
 		base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
 		//base.syncDevice();
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
 		base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//base.syncDevice();
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//cout << "result: " << sumData(rho_out, sizerho) << endl;
 		if (l == 0) { 
 			old_sum = sumData(rho_out, sizerho);
 		} else {
 			tmp_sum = sumData(rho_out, sizerho);
 			if (old_sum != tmp_sum) {
 				cout << "diff in iteration: " << l << endl;
 			}
 		}
 		/*} else {
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  }
 		  */
  /* inverse fft and transfer data back */
  /* 
     multiple device syncs and mpi barriers are used to make sure data 
     transfer is started when results are ready and progam moves on 
       only when data transfer is finished
  */
  base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
  base.readData<double> (rho2_ptr, rho_out, sizerho);
  for (int i = 0; i < 10; i++)
    cout << rho_out[i] << " ";
  cout << endl;
  old_sum = sumData(rho_out, sizerho);
 	}
 /* =================================================*/  
 /* =================================================*/
 /* ==========end fftpoison solver test run==========*/
 /* =================================================*/
 /* =================================================*/
  base.freeMemory<double>(tmpgreen_ptr, sizegreen);
  base.freeMemory<double>(grn_ptr, sizerho);
  base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
  base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
  base.freeMemory<double>(rho2_ptr, sizerho);
-  delete[] rho_out;
+
-  delete[] rho;
+/* free memory on device */
-  delete[] mirror_out;
+//if (rank == 0) {
-  cout << "Final sum: " << old_sum << endl;
+base.freeMemory<double>(tmpgreen_ptr, sizegreen);
 base.freeMemory<double>(grn_ptr, sizerho);
 base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
 base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
 //MPI_Barrier(MPI_COMM_WORLD);
 base.freeMemory<double>(rho2_ptr, sizerho);
 cout << "Final sum: " << old_sum << endl;
 /*} else {
  base.closeHandle(rho2_ptr);
  MPI_Barrier(MPI_COMM_WORLD);
  }*/
 //MPI_Finalize();
 }
--- a/test/testRandom.cpp
+++ b/test/testRandom.cpp
@ -1,81 +0,0 @@
 #include <iostream>
 #include <string>
 #include <vector>
 #include <sys/time.h>
 #include "DKSBase.h"
 using namespace std;
 int main(int argc, char *argv[]) {
  int size = 10;
  bool apiSet = false;
  char *api_name = new char[10];
  char *device_name = new char[10];
  for (int i = 1; i < argc; i++) {
    if (argv[i] == string("-cuda")) {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
      apiSet = true;
    }
    if (argv[i] == string("-opencl")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
      apiSet = true;
    }
    if (argv[i] == string("-N")) {
      size = atoi(argv[i+1]);
      i++;
    }
  }
  if (!apiSet) {
    strcpy(api_name, "Cuda");
    strcpy(device_name, "-gpu");
  }
  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of randoms: " << size << endl;
  //init dks
  int ierr;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(api_name));
  base.initDevice();
  base.callInitRandoms(size);
  //create host vector to store results
  double *host_data = new double[size];
  //create device vector
  void *device_data = base.allocateMemory<double>(size, ierr);
  for (int i = 0; i < 5; i++) {
    //fill device vector with random values
    base.callCreateRandomNumbers(device_data, size);
    //read device vector
    base.readData<double>(device_data, host_data, size);
    //print host data
    for (int i = 0; i < size; i++)
      cout << host_data[i] << " ";
    cout << endl;
  }
  //free device vector
  base.freeMemory<double>(device_data, size);
  //free host data
  delete[] host_data;
  return 0;
 }
Author	SHA1	Message	Date
Uldis Locans	24f394c693	add enableRutherfordScattering option to OPALs collimatorPhysics GPU version	2017-04-24 10:44:41 +02:00
Uldis Locans	8f00d2a593	Allow other applications check for DKS version	2017-04-05 17:00:52 +02:00
Uldis Locans	1606b641d4	add seed to random number initialization	2017-03-17 10:43:41 +01:00