add enableRutherfordScattering option to OPALs collimatorPhysics GPU version

Allow other applications to check the DKS version
add seed to random number initialization
2017-04-24 10:44:41 +02:00 · 2017-04-05 17:00:52 +02:00 · 2017-03-17 10:43:41 +01:00
79 changed files with 2270 additions and 6541 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,8 +1,10 @@
 CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
-SET (DKS_VERSION_MINOR 1)
+SET (DKS_VERSION_MINOR 0)
-SET (DKS_VERSION_PATCH 4)
+SET (DKS_VERSION_PATCH 2)
 SET (PACKAGE \"dks\")
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -10,9 +12,6 @@ SET (PACKAGE_NAME \"DKS\")
 SET (PACKAGE_TARNAME \"dks\")
 SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
 SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 #turn on CMake's rpath handling when configuring on Apple platforms
 if (APPLE)
  set (CMAKE_MACOSX_RPATH TRUE)
 endif ()
 #get compiler name
 #STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
@ -31,60 +30,24 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
 set (BOOSTROOT $ENV{BOOST_DIR})
 SET (Boost_USE_STATIC_LIBS OFF)
 SET (Boost_USE_STATIC_RUNTIME OFF)
-#FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
+FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
 FIND_PACKAGE(Boost 1.41 REQUIRED)
 IF (Boost_FOUND)
  MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
  MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
  MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
-  #MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
+  MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
  INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
 ENDIF (Boost_FOUND)
 #include OPAL, musrfit or pet kernels
 OPTION(DKS_FULL "Compile DKS with full library" OFF)
 OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
 OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
 OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
 #DKS_FULL is a shortcut that switches on every optional kernel module
 IF (DKS_FULL)
  FOREACH (dks_module OPAL MUSR PET)
   SET (ENABLE_${dks_module} ON)
  ENDFOREACH ()
 ENDIF ()
 #find clFFT
 OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
 #AMD support: locate clFFT, expose its headers/libraries and define DKS_AMD
 IF (ENABLE_AMD)
  SET (clFFT_USE_STATIC_LIBS OFF)
  #clFFT is searched in the locations named by the CLFFT_PREFIX, CLFFT_DIR and CLFFT environment variables
  FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
  MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
  MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
  INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
  #NOTE(review): CLFFT_LIBRARIES is reported above as the library, but is used here as a link directory - confirm
  LINK_DIRECTORIES (${CLFFT_LIBRARIES})
  #clRNG lookup, currently disabled
  #SET (clRNG_USE_STATIC_LIBS OFF)
  #FIND_PACKAGE(clRng REQUIRED HINTS $ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
  #MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
  #MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
  #INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
  #LINK_DIRECTORIES (${CLRNG_LIBRARIES})
  #find_package(PkgConfig)
  #pkg_check_modules(clRng REQUIRED)
  #the DKS_AMD define is passed to every C++ compile via the global flags
  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
 ENDIF (ENABLE_AMD)
 #enable UQTK
 OPTION (USE_UQTK "Use UQTK" OFF)
 #intel icpc compiler specific flags
 IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
  #for intel compiler turn on openmp and opencl
-  OPTION (USE_OPENCL "Use OpenCL" OFF)
+  OPTION (USE_OPENCL "Use OpenCL" ON)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" ON)
@ -115,30 +78,18 @@ IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
 ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
 #gnu compiler specific flags
-IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
+IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
-  OPTION (USE_OPENCL "Use OpenCL" OFF)
+  OPTION (USE_OPENCL "Use OpenCL" ON)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" OFF)
-  OPTION (STATIC_CUDA "Link static cuda libraries" OFF)
+  
-
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
  IF (ENABLE_MUSR)
    SET (USE_OPENCL ON)
  ENDIF (ENABLE_MUSR)
  #dont set openmp flag for apple devices
  #only the GNU compiler gets -fopenmp; any other compiler reaching this point
  #is configured with the same flags minus OpenMP.
  #The compiler id is quoted so the comparison stays a plain string test, and
  #the else/endif are left without the legacy condition argument (the original
  #else argument was malformed: "$CMAKE_C_COMPILER_ID}").
  IF ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
  ELSE ()
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -std=c++11 -D__wsu")
  ENDIF ()
  FIND_PACKAGE(CUDA)
  IF (CUDA_FOUND)
    SET (USE_CUDA ON)
    OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -148,27 +99,20 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
-    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
-    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
+    SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
-
+    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}  -DDEBUG -std=c++11 -D__wsu")    
-    IF (NOT STATIC_CUDA)
+    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
    ELSE (NOT STATIC_CUDA) 
      SET (CUDA_SEPARABLE_COMPILATION ON)
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
      SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
    ENDIF (NOT STATIC_CUDA)
    #if cuda version >= 7.0 add runtime compilation flags
-    IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
+    IF (NOT CUDA_VERSION VERSION_LESS "7.0")
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
-    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
+    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
    MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
    SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
    #set(CUDA_SEPARABLE_COMPILATION ON)
    SET(BUILD_SHARED_LIBS OFF)
  ENDIF (CUDA_FOUND)
@ -178,9 +122,6 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE(STATUS "CUDA not found, looking for OpenCL")
    FIND_PACKAGE(OpenCL)
    MESSAGE("after FIND_PACKAGE(OpenCL): version: ${OpenCL_VERSION_STRING}")
    MESSAGE("after FIND_PACKAGE(OpenCL): inc dir: ${OpenCL_INCLUDE_DIR}")
    MESSAGE("after FIND_PACKAGE(OpenCL): lib dir: ${OpenCL_LIBRARY}")
    IF (OpenCL_FOUND)
      MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
      MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
@ -198,9 +139,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
  ENDIF(APPLE AND NOT CUDA_FOUND)
  #if cuda found set cuda opencl flags
-  IF (CUDA_FOUND AND USE_OPENCL)
+  IF (CUDA_FOUND)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
-  ENDIF (CUDA_FOUND AND USE_OPENCL)
+  ENDIF (CUDA_FOUND)
  #if cuda not found but amd opencl found set opencl flags
  IF (NOT CUDA_FOUND AND OpenCL_FOUND)
@ -212,7 +153,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
  ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
-ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
+ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
 SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
 MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
@ -244,3 +185,4 @@ INSTALL (
  DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
  RENAME ${PROJECT_NAME}ConfigVersion.cmake
  )
--- a/2356
+++ b/2356
--- a/ReadMe.first
+++ b/ReadMe.first
@ -1,7 +1,7 @@
 ##################################################################
 #
 # Name:		Dynamic Kernel Scheduler
-# Version:	1.1
+# Version:	1.0
 # Author: 	Uldis Locans
 # Contacts:	locans.uldis@psi.ch
 #
@ -29,30 +29,30 @@ Intel MIC compilers (optional)
 ######Source######
 https://gitlab.psi.ch/uldis_l/DKS
 ######Changes from DKS-1.0.x version######
 DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
 By default only the DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary option needs to be enabled.
 Supported options are:
 -DENABLE_OPAL option should be enabled if DKS will be used for OPAL
 -DENABLE_MUSR option should be enabled if DKS will be used for musrfit
 -DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
 See install instructions for more details on how to enable the necessary options in DKS
 ######Install######
 #consult https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install instructions
 #clone DKS
 git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
-#switch to the desired version (OPTIONAL)
+#set compilers to use
-git checkout DKS-1.1.0
+#supported c++ compilers: g++, icpc, mpicxx with g++
 #supported c compilers: gcc, icc, mpicc with gcc
 export CXX_COMPILER=cpp_compiler_name
 export CC_COMPILER=c_compiler_name
-#configure installation in build directory
+#set dks root directory
-#enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
+cd DKS
-CXX=<c++ compiler> CC=<c compiler> -DCMAKE_INSTALL_PREFIX=<install dir> <path to DKS source> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1] 
+export DKS_ROOT=$PWD
 #set build directory
 mkdir $DKS_BUILD_DIR
 cd $DKS_BUILD_DIR
 #set install directory
 export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/
 CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
 #install DKS
 make
 make install
--- a/auto-tuning/CMakeLists.txt
+++ b/auto-tuning/CMakeLists.txt
@ -2,32 +2,18 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #chi square kernel tests
-IF (ENABLE_MUSR)
+ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
-  ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
  TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})
-  ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
+ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
-  TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
  IF (USE_UQTK)
    ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
  ENDIF (USE_UQTK)
  #test to verify search functions
  ADD_EXECUTABLE(testSearch testSearch.cpp)
  TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
 ENDIF (ENABLE_MUSR)
 IF (ENABLE_OPAL)
  ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
  TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
  ADD_EXECUTABLE(testPushKick testPushKick.cpp)
  TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
 ENDIF(ENABLE_OPAL)
 ADD_EXECUTABLE(testFFT testFFT.cpp)
 TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})
 IF (USE_UQTK)
  ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
  TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
 ENDIF (USE_UQTK)
 #TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
 #test to verify search functions
 ADD_EXECUTABLE(testSearch testSearch.cpp)
 TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
--- a/auto-tuning/testChiSquareRT.cpp
+++ b/auto-tuning/testChiSquareRT.cpp
@ -292,9 +292,6 @@ int runTest(const char *api_name, const char *device_name, bool autotune, bool m
      //set autotuning on/off
      if (autotune)
 	dksbase.setAutoTuningOn();
      //check kernel
      dksbase.checkMuSRKernels(1);
      //tmp values to store results and tmp values for time steps and start time
      double result_gpu = 0.0;
@ -376,11 +373,11 @@ int main(int argc, char* argv[]) {
  }
-  int numPlatforms = 3;
+  int numPlatforms = 2;
  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
-  for (int i = 2; i < numPlatforms; i++) {
+  for (int i = 0; i < numPlatforms; i++) {
    runTest(api[i], device[i], autotune, mlh, asym);
  }
--- a/auto-tuning/testChiSquareRTRandom.cpp
+++ b/auto-tuning/testChiSquareRTRandom.cpp
@ -392,10 +392,7 @@ int main(int argc, char *argv[]) {
  dksbase.setAPI(api_name);
  dksbase.setDevice(device_name);
  std::cout << "Init device" << std::endl;
  dksbase.initDevice();
  std::cout << "Init chi square" << std::endl;
  dksbase.initChiSquare(Ndata, np, nf, nm);
  dksbase.writeParams(p, np);
@ -404,24 +401,20 @@ int main(int argc, char *argv[]) {
  dksbase.callSetConsts(N0, TAU, BKG);
  std::cout << "Compile program" << std::endl;
  dksbase.callCompileProgram(sfunc);
  dksbase.checkMuSRKernels(1);
  if (autotune) 
    dksbase.setAutoTuningOn();
-  //std::cout << "Get operations" << std::endl;
+  int oper = 0;
-  //int oper = 0;
+  dksbase.getOperations(oper);
  //dksbase.getOperations(oper);
  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of params: " << np << endl;
  cout << "Number of maps: " << nm << endl;
  cout << "Number of predefined functions: " << nfunc << endl;
-  //cout << "Number of ptx instructions: " << oper << endl;
+  cout << "Number of ptx instructions: " << oper << endl;
  cout << "------------------------------------------------------------" << endl;
  cout << sfunc << endl;
  cout << "------------------------------------------------------------" << endl;
--- a/auto-tuning/testCollimatorPhysics.cpp
+++ b/auto-tuning/testCollimatorPhysics.cpp
@ -1,161 +0,0 @@
 #include <iostream>
 #include <vector>
 #include <string>
 #include "DKSOPAL.h"
 /**
  * Host-side particle record used by this test.
  * NOTE(review): records are copied verbatim to the device, so the member
  * order presumably has to match the device-side layout - confirm before reordering.
  */
 struct PART {
   int label;         // status label of the particle
   unsigned localID;  // id given to the particle when it is created
   double Rincol[3];  // presumably the position components - TODO confirm
   double Pincol[3];  // presumably the momentum components - TODO confirm
 };
 /**
  * Create one test particle.
  * The label is 0, the local id is d, and both vectors are zero except for
  * their third component (Rincol[2] = 0.02, Pincol[2] = 3.9920183237269791e-01).
  */
 PART initPartSmall(int d) {
   PART particle;
   particle.label = 0;
   particle.localID = d;
   //the first two components of both vectors start at zero
   for (int axis = 0; axis < 2; ++axis) {
     particle.Rincol[axis] = 0.0;
     particle.Pincol[axis] = 0.0;
   }
   particle.Rincol[2] = 0.02;
   particle.Pincol[2] = 3.9920183237269791e-01;
   return particle;
 }
 /** Write all members of one particle to stdout on a single line. */
 void printPart(PART p) {
   const double *r = p.Rincol;
   const double *m = p.Pincol;
   std::cout << "label: " << p.label << ", "
             << "localid: " << p.localID << ","
             << "Rincol: " << r[0] << ", " << r[1] << ", " << r[2] << ", "
             << "Pincol: " << m[0] << ", " << m[1] << ", " << m[2]
             << std::endl;
 }
 /** Fill the first N entries of p with test particles whose local id equals their index. */
 void initParts(PART *p, int N) {
   int idx = 0;
   while (idx < N) {
     p[idx] = initPartSmall(idx);
     ++idx;
   }
 }
 /** Print the first N particles of p, one per line, followed by an empty line. */
 void printParts(PART *p, int N) {
   int idx = 0;
   while (idx < N) {
     printPart(p[idx]);
     ++idx;
   }
   std::cout << std::endl;
 }
 /**
  * Store the twelve test parameters in data[0..11].
  * data must provide room for at least twelve doubles.
  */
 void initParams(double *data) {
   static const double kParams[12] = {
     0.0,                     //2.0000000000000000e-02
     1.0,                     //1.0000000000000000e-02
     2.2100000000000000e+00,
     6.0000000000000000e+00,
     1.2010700000000000e+01,
     2.6010000000000000e+00,
     1.7010000000000000e+03,
     1.2790000000000000e+03,
     1.6379999999999999e-02,
     1.9321266968325795e-01,
     7.9000000000000000e+01,
     1.0000000000000002e-12
   };
   for (int slot = 0; slot < 12; ++slot)
     data[slot] = kParams[slot];
 }
 int main(int argc, char *argv[]) {
  int loop = 10;
  int numpart = 1e5;
  char *api_name = new char[10];
  char *device_name = new char[10];
  strcpy(api_name, "Cuda");
  strcpy(device_name, "-gpu");
  for (int i = 1; i < argc; i++) {
    if (argv[i] == std::string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    }
    if (argv[i] == std::string("-npart")) {
      numpart = atoi(argv[i+1]);
      i++;
    }
    if (argv[i] == std::string("-loop")) {
      loop = atoi(argv[i+1]);
      i++;
    }
  }
  std::cout << "=========================BEGIN TEST=========================" << std::endl;
  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
  std::cout << "Number of particles: " << numpart << std::endl;
  std::cout << "Number of loops: " << loop << std::endl;
  std::cout << "------------------------------------------------------------" << std::endl;
  //init part vector to test mc
  PART *parts = new PART[numpart];
  initParts(parts, numpart);
  double *params = new double[12];
  initParams(params);
  //init dks
  int ierr;
  DKSOPAL base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(api_name));
  ierr = base.initDevice();
  if (ierr != DKS_SUCCESS)
    std::cout << "Error with init device!" << std::endl;
  //init random
  base.callInitRandoms(numpart);
  //**test collimator physics and sort***//
  void *part_ptr, *param_ptr;
  //allocate memory for particles
  part_ptr = base.allocateMemory<PART>(numpart, ierr);
  param_ptr = base.allocateMemory<double>(12, ierr);
  //transfer data to device
  base.writeData<PART>(part_ptr, parts, numpart);
  base.writeData<double>(param_ptr, params, 12);
  int numaddback;
  base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
  base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);  
  base.syncDevice();
  //read data from device
  base.readData<PART>(part_ptr, parts, numpart);
  //free memory
  base.freeMemory<PART>(part_ptr, numpart);
  base.freeMemory<double>(param_ptr, 12);  
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++) {
    std::cout << parts[i].label << "\t" 
 	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
 	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
 	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
 	      << std::endl;
  }
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++) {
    std::cout << parts[i].label << "\t" 
 	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
 	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
 	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
 	      << std::endl;
  }
  return 0;
 }
--- a/auto-tuning/testFFT.cpp
+++ b/auto-tuning/testFFT.cpp
@ -1,214 +0,0 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
 #include "Utility/TimeStamp.h"
 #include "DKSFFT.h"
 using namespace std;
 void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
 void compareData(double* data1, double *data2, int N, int dim);
 void initData(complex<double> *data, int dimsize[3], int dim);
 void initData(double *data, int dimsize[3], int dim);
 bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim, 
 		char *api_name, char *device_name);
 void printHelp();
 /**
  * FFT round-trip test.
  *
  * Runs a real-to-complex followed by a complex-to-real transform, then a
  * complex forward/inverse transform, and prints the summed absolute
  * difference between the input and the normalised round-trip result.
  * Grid size, dimension and backend are taken from the command line by readParams.
  */
 int main(int argc, char *argv[]) {
   int ierr;
   int N1 = 8;
   int N2 = 8;
   int N3 = 8;
   int dim = 3;
   //readParams only writes the names when a backend flag is present, so start
   //from valid defaults instead of uninitialised buffers
   char api_name[10] = "Cuda";
   char device_name[10] = "-gpu";
   if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
     return 0;
   cout << "Use api: " << api_name << ", " << device_name << endl;
   int dimsize[3] = {N1, N2, N3};
   int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
   int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
   double *rdata = new double[sizereal];
   double *ordata = new double[sizereal];
   complex<double> *cdata = new complex<double>[sizereal];
   complex<double> *codata = new complex<double>[sizereal];
   initData(rdata, dimsize, 3);
   initData(cdata, dimsize, 3);
   /* init DKSBase */
   cout << "Init device and set function" << endl;
   DKSFFT base;
   base.setAPI(api_name, strlen(api_name));
   base.setDevice(device_name, strlen(device_name));
   cout << "init device" << endl;
   base.initDevice();
   cout << "setup fft" << endl;
   base.setupFFT(dim, dimsize);
   //Test RC FFT -> CR FFT
   void *real_ptr, *comp_ptr, *res_ptr;
   cout << "allocate memory" << endl;
   real_ptr = base.allocateMemory<double>(sizereal, ierr);
   res_ptr = base.allocateMemory<double>(sizereal, ierr);
   comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
   cout << "write data" << endl;
   base.writeData<double>(real_ptr, rdata, sizereal);
   cout << "perform fft" << endl;
   base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
   base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
   base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
   cout << "read data" << endl;
   base.readData<double>(res_ptr, ordata, sizereal);
   compareData(rdata, ordata, N1, 3);
   base.freeMemory<double>(real_ptr, sizereal);
   base.freeMemory<double>(res_ptr, sizereal);
   base.freeMemory< complex<double> >(comp_ptr, sizecomp);
   //Test CC FFT
   void *mem_ptr;
   mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
   base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
   base.callFFT(mem_ptr, 3, dimsize);
   base.callIFFT(mem_ptr, 3, dimsize);
   base.callNormalizeFFT(mem_ptr, 3, dimsize);
   base.readData< complex<double> >(mem_ptr, codata, sizereal);
   compareData(cdata, codata, N1, 3);
   base.freeMemory< complex<double> > (mem_ptr, sizereal);
   delete[] rdata;
   delete[] ordata;
   delete[] cdata;
   delete[] codata;
 }
 /**
  * Print the summed absolute difference of the real and imaginary parts of two
  * complex arrays. N elements are visited along every dimension that is in
  * use (dim), a single one along the others.
  */
 void compareData(std::complex<double>* data1, std::complex<double>* data2, int N, int dim) {
   const int ni = (dim > 2) ? N : 1;
   const int nj = (dim > 1) ? N : 1;
   const int nk = N;
   double total = 0;
   for (int i = 0; i < ni; i++) {
     const int planeBase = i*ni*ni;
     for (int j = 0; j < nj; j++) {
       const int rowBase = planeBase + j*nj;
       for (int k = 0; k < nk; k++) {
         const std::complex<double> &lhs = data1[rowBase + k];
         const std::complex<double> &rhs = data2[rowBase + k];
         total += std::fabs(lhs.real() - rhs.real());
         total += std::fabs(lhs.imag() - rhs.imag());
       }
     }
   }
   std::cout << "Size " << N << " CC <--> CC diff: " << total << std::endl;
 }
 /**
  * Print the summed absolute difference of two real arrays. N elements are
  * visited along every dimension that is in use (dim), a single one along
  * the others.
  */
 void compareData(double* data1, double* data2, int N, int dim) {
   const int ni = (dim > 2) ? N : 1;
   const int nj = (dim > 1) ? N : 1;
   const int nk = N;
   double total = 0;
   for (int i = 0; i < ni; i++) {
     const int planeBase = i*ni*ni;
     for (int j = 0; j < nj; j++) {
       const int rowBase = planeBase + j*nj;
       for (int k = 0; k < nk; k++)
         total += std::fabs(data1[rowBase + k] - data2[rowBase + k]);
     }
   }
   std::cout << "Size " << N << " RC <--> CR diff: " << total << std::endl;
 }
 /**
  * Fill a complex grid with (sin(k), 0), k being the index along the first
  * (fastest) dimension. dim selects how many entries of dimsize are used:
  * 3 -> all three, 2 -> the first two, anything else -> only the first.
  */
 void initData(std::complex<double> *data, int dimsize[3], int dim) {
   const int nx = dimsize[0];
   const int ny = (dim == 3 || dim == 2) ? dimsize[1] : 1;
   const int nz = (dim == 3) ? dimsize[2] : 1;
   for (int plane = 0; plane < nz; ++plane) {
     for (int row = 0; row < ny; ++row) {
       for (int col = 0; col < nx; ++col) {
         data[plane*ny*nx + row*nx + col] =
           std::complex<double>(std::sin(static_cast<double>(col)), 0.0);
       }
     }
   }
 }
 /**
  * Fill a real grid with sin(k), k being the index along the first (fastest)
  * dimension. dim selects how many entries of dimsize are used:
  * 3 -> all three, 2 -> the first two, anything else -> only the first.
  */
 void initData(double *data, int dimsize[3], int dim) {
   const int nx = dimsize[0];
   const int ny = (dim == 3 || dim == 2) ? dimsize[1] : 1;
   const int nz = (dim == 3) ? dimsize[2] : 1;
   for (int plane = 0; plane < nz; ++plane) {
     for (int row = 0; row < ny; ++row) {
       for (int col = 0; col < nx; ++col)
         data[plane*ny*nx + row*nx + col] = std::sin(static_cast<double>(col));
     }
   }
 }
 /**
  * Parse the command line of the FFT test.
  *
  *   -dim <d>            number of dimensions, stored in dim
  *   -grid <n1> <n2> <n3> grid size, stored in N1, N2, N3
  *   -cuda | -opencl | -mic | -cpu   backend, stored in api_name / device_name
  *
  * api_name and device_name must have room for the longest name written
  * here ("OpenCL" / "OpenMP" plus terminator); they are left untouched when
  * no backend flag is given. Unknown arguments are ignored.
  *
  * Returns false when parsing succeeded and true when the caller should stop,
  * which happens when a flag is missing its value(s).
  */
 bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
                 char *api_name, char *device_name)
 {
   for (int i = 1; i < argc; i++) {
     const std::string arg(argv[i]);
     if (arg == "-dim") {
       //the value has to exist before it is read
       if (i + 1 >= argc) {
         std::cerr << "Missing value after -dim" << std::endl;
         return true;
       }
       dim = atoi(argv[i + 1]);
       i++;
     } else if (arg == "-grid") {
       if (i + 3 >= argc) {
         std::cerr << "Missing values after -grid" << std::endl;
         return true;
       }
       N1 = atoi(argv[i + 1]);
       N2 = atoi(argv[i + 2]);
       N3 = atoi(argv[i + 3]);
       i += 3;
     } else if (arg == "-cuda") {
       strcpy(api_name, "Cuda");
       strcpy(device_name, "-gpu");
     } else if (arg == "-opencl") {
       strcpy(api_name, "OpenCL");
       strcpy(device_name, "-gpu");
     } else if (arg == "-mic") {
       strcpy(api_name, "OpenMP");
       strcpy(device_name, "-mic");
     } else if (arg == "-cpu") {
       strcpy(api_name, "OpenCL");
       strcpy(device_name, "-cpu");
     }
   }
   return false;
 }
--- a/auto-tuning/testPushKick.cpp
+++ b/auto-tuning/testPushKick.cpp
@ -1,132 +0,0 @@
 #include <iostream>
 #include <vector>
 #include <string>
 #include "DKSOPAL.h"
 #include <vector_types.h>
 #include "cuda_runtime.h"
 /** Fill the x, y and z members of the first N entries with rand() scaled by RAND_MAX. */
 void initData(double3 *data, int N) {
   int idx = 0;
   while (idx < N) {
     double3 &item = data[idx];
     //keep the draw order x, y, z
     item.x = static_cast<double>(rand()) / RAND_MAX;
     item.y = static_cast<double>(rand()) / RAND_MAX;
     item.z = static_cast<double>(rand()) / RAND_MAX;
     ++idx;
   }
 }
 /** Set the first N entries of data to the time step 0.00001. */
 void initDt(double *data, int N) {
   const double step = 0.00001;
   int idx = 0;
   while (idx < N) {
     data[idx] = step;
     ++idx;
   }
 }
 int main(int argc, char *argv[]) {
  int loop = 10;
  int numpart = 1e5;
  char *api_name = new char[10];
  char *device_name = new char[10];
  strcpy(api_name, "Cuda");
  strcpy(device_name, "-gpu");
  for (int i = 1; i < argc; i++) {
    if (argv[i] == std::string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    }
    if (argv[i] == std::string("-npart")) {
      numpart = atoi(argv[i+1]);
      i++;
    }
    if (argv[i] == std::string("-loop")) {
      loop = atoi(argv[i+1]);
      i++;
    }
  }
  std::cout << "=========================BEGIN TEST=========================" << std::endl;
  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
  std::cout << "Number of particles: " << numpart << std::endl;
  std::cout << "Number of loops: " << loop << std::endl;
  std::cout << "------------------------------------------------------------" << std::endl;
  int ierr;
  DKSOPAL dksbase;
  dksbase.setAPI(api_name, strlen(api_name));
  dksbase.setDevice(device_name, strlen(api_name));
  ierr = dksbase.initDevice();
  if (ierr != DKS_SUCCESS)
    std::cout << "Error with init device!" << std::endl;
  double3 *R = new double3[numpart];
  double3 *P = new double3[numpart];
  double3 *Ef = new double3[numpart];
  double3 *Bf = new double3[numpart];
  double *dt = new double[numpart];
  initData(R, numpart);
  initData(P, numpart);
  initData(Ef, numpart);
  initData(Bf, numpart);
  initDt(dt, numpart);
  void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
  r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
  dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
  dksbase.writeData<double3>(r_ptr, R, numpart);
  dksbase.writeData<double3>(p_ptr, P, numpart);
  dksbase.writeData<double3>(ef_ptr, Ef, numpart);
  dksbase.writeData<double3>(bf_ptr, Bf, numpart);
  dksbase.writeData<double>(dt_ptr, dt, numpart);
  for (int i = 0; i < loop; ++i)
    dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std::cout << "============" << std::endl;
  dksbase.readData<double3>(r_ptr, R, numpart);
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  std:: cout << "..." << std::endl;
  for (int i = numpart - 10; i < numpart; i++)
    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
  dksbase.freeMemory<double3>(r_ptr, numpart);
  dksbase.freeMemory<double3>(p_ptr, numpart);
  dksbase.freeMemory<double3>(ef_ptr, numpart);
  dksbase.freeMemory<double3>(bf_ptr, numpart);
  dksbase.freeMemory<double>(dt_ptr, numpart);
  delete[] R;
  delete[] P;
  delete[] Ef;
  delete[] Bf;
  delete[] dt;
 }
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -3,7 +3,5 @@ SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
 SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 SET(${PROJECT_NAME}_LIBRARY "dks")
 SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
 SET(DKS_CUDA_STATIC ${STATIC_CUDA})
 SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
 SET(DKS_VERSION ${DKS_VERSION})
-SET(DKS_VERSION_STR ${DKS_VERSION_STR})
+SET(DKS_VERSION_STR ${DKS_VERSION_STR})
--- a/doc/refman.pdf
+++ b/doc/refman.pdf
--- a/src/Algorithms/CMakeLists.txt
+++ b/src/Algorithms/CMakeLists.txt
@ -6,7 +6,6 @@ SET (_HDRS
 	ImageReconstruction.h
 	CollimatorPhysics.h
 	FFT.h
 	GreensFunction.h
  )
 ADD_SOURCES (${_SRCS})
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@ -15,9 +15,6 @@
 class DKSBaseMuSR;
 /** 
 * Interface to implement ChiSquareRuntime class for musrfit.
 */
 class ChiSquareRuntime {
  friend class DKSBaseMuSR;
@ -66,54 +63,23 @@ public:
  /** Default constructor */
  //ChiSquareRuntime();
-  /** Default destructor. */
+  /** Default destructor */
  virtual ~ChiSquareRuntime() { };
  /**
   * Compile GPU program generated at runtime.
   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;
  /**
   * Launche the compiled chiSquare kernel.
   */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;
  /** 
   * Write the parameter values to the GPU.
   */
  virtual int writeParams(const double *params, int numparams) = 0;
  /**
   * Write the function values to the GPU.
   */
  virtual int writeFunc(const double *func, int numfunc) = 0;
  /**
   * Write map values to the GPU.
   */
  virtual int writeMap(const int *map, int nummap) = 0;
  /**
   * Allocate temporary memory needed for the chi square calculations on the device.
   */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
  /**
   * Free device memory allocated for chi square calculations.
   */
  virtual int freeChiSquare() = 0;
  /**
   * Check if available device can run the chi square GPU code.
   */
  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
-  /** 
+  /** Set N0, tau and bgk values to use for the kernel.
   * Set N0, tau and bgk values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -125,8 +91,7 @@ public:
    return DKS_SUCCESS;
  }
-  /** 
+  /** Set alpha and beta values to use for the kernel.
   * Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -136,9 +101,8 @@ public:
    return DKS_SUCCESS;
  }
-  /** 
+  /** Set number of blocks and threads.
-   * Set number of blocks and threads.
+   *  Used to set parameters obtained from auto-tuning
   * Used to set parameters obtained from auto-tuning
   */
  int setKernelParams(int numBlocks, int blockSize) {
    int ierr = DKS_ERROR;
@ -154,9 +118,8 @@ public:
    return ierr;
  }
-  /** 
+  /** Get the number of operations in compiled kernel.
-   * Get the number of operations in compiled kernel.
+   *  Count the number of operation in the ptx file for the compiled program.
   * Count the number of operation in the ptx file for the compiled program.
   */
  int getOperations(int &oper) {
--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@ -5,10 +5,10 @@
 #include <string>
 #include "../DKSDefinitions.h"
-/**
+class DKSBaseMuSR;
- * Interface to impelment particle matter interaction for OPAL.
+
 */
 class DKSCollimatorPhysics {
  friend class DKSBaseMuSR;
 protected:
@ -18,61 +18,25 @@ protected:
 public:
  virtual ~DKSCollimatorPhysics() { }
-
+  
  /** 
   * Execute collimator physics kernel.
   *
   */  
  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
-				bool enableRutherforScattering = true) = 0;
+				bool enableRutherfordScattering = true) = 0;
  /** 
   * Special case CollimatorPhysics kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
  /** 
   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
  /** 
   * Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;
  /** 
   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;
  /** 
   * BorisPusher kick function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 
  /** 
   * BorisPusher push function with transformto function from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@ -6,21 +6,12 @@
 #include "../DKSDefinitions.h"
-/**
+class DKSFFT {
 * Abstract class defining methods for DKS FFT class.
 * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
 */
 class BaseFFT {
 protected:
  int defaultN[3];
  int defaultNdim;
  /**
   * Check if FFT plan is created for the needed dimension and FFT size.
   * Returns true if the plan has been created and false if no plan for specified dimension
   * and size exists.
   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
@ -31,59 +22,20 @@ protected:
 public:
-  virtual ~BaseFFT() { }
+  virtual ~DKSFFT() { }
  /** Setup FFT - init FFT library used by chosen device. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
  /** Setup real to complex FFT - init FFT library used by chosen device. */
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
  /** Setup complex to real FFT - init FFT library used by chosen device. */
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
  /** Clean up. */
  virtual int destroyFFT() = 0;
  /** 
   * Execute C2C FFT.
   * mem_ptr - memory ptr on the device for complex data.
   * Performs in place FFT.
   */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
  /** 
   * Execute inverse C2C FFT.
   * mem_ptr - memory ptr on the device for complex data.
   * Performs in place FFT.
   */
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  /**
   * Normalize the FFT or IFFT.
   * mem_ptr - memory to complex data.
   */
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  /** 
   * Execute R2C FFT.
   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
   * results for the FFT are stored as complex numbers.
   */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
  /** 
   * Execute C2R FFT.
   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
   */
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
  /**
   * Normalize CR FFT.
   */
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
 };
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@ -1,32 +0,0 @@
 #ifndef H_GREENSFUNCTION
 #define H_GREENSFUNCTION
 #include <iostream>
 #include <cmath>
 /**
 * Interface to implement Greens function calculations for OPAL.
 * Every method is pure virtual, returns an int status code and takes an
 * optional streamId argument that defaults to -1.
 * NOTE(review): presumably the status is DKS_SUCCESS / DKS_ERROR and streamId = -1
 * selects the default stream, as elsewhere in DKS -- confirm in the implementations.
 */
 class GreensFunction {
 public:
  virtual ~GreensFunction() { }
  /**
   * Calculate the Greens integral, as defined in OPAL.
   * tmpgreen is an opaque pointer to the buffer the integral works on; I, J, K, NI, NJ
   * are integer sizes and hr_m0..hr_m2 are double parameters.
   * NOTE(review): presumably I/J/K/NI/NJ are grid dimensions and hr_m* the mesh
   * spacings used by OPAL -- confirm against the OPAL solver.
   */
  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
  /** Integration of rho2_m using tmpgreen, see OPAL for more details. */
  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;
  /** Mirror the rho2_m field. */
  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
  /**
   * Multiply two complex fields from device memory.
   * NOTE(review): the method name is misspelled ("Compelx"); it is part of the
   * public interface, so it is left unchanged here.
   */
  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
 };
 #endif
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@ -5,22 +5,17 @@
 #define BLOCK_SIZE 128
 /** Struct to hold the voxel position for a PET image as three float coordinates. */
 struct VoxelPosition {
  float x;  // x coordinate of the voxel
  float y;  // y coordinate of the voxel
  float z;  // z coordinate of the voxel
 };
 /**
 * Struct that holds the pair of detectors that registered an event.
 * Both members are declared as 16-bit unsigned bit-fields.
 */
 struct ListEvent {
  unsigned detA : 16;  // detector A of the pair (16-bit field)
  unsigned detB : 16;  // detector B of the pair (16-bit field)
 };
 /**
 * Interface to implement PET image reconstruction.
 */
 class ImageReconstruction {
 protected:
@ -30,8 +25,7 @@ public:
  virtual ~ImageReconstruction() { }
-  /** 
+  /** Caluclate source.
   * Caluclate source.
   *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
   *  that are inside this sphere. All the sphere used have the same diameter.
   */
@ -39,8 +33,7 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;
-  /** 
+  /** Calculate background.
   * Calculate background.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -49,8 +42,7 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;
-  /** 
+  /** Caluclate source using differente sources.
   * Caluclate source using differente sources.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * each sphere is given by *diameter array.
@ -60,7 +52,7 @@ public:
 			       int total_sources, int start = 0) = 0;
  /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
   * smaller sphere.
@ -69,8 +61,7 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;
-  /** 
+  /** Generate normalization.
   * Generate normalization.
   * Goes through detector pairs and, if a detector pair crosses the image, launches a separate kernel
   * that updates voxel values in the image on the slope between these two detectors.
   */
@ -78,16 +69,14 @@ public:
 				 void *det_position, int total_det) = 0; 
-  /** 
+  /** Calculate forward projection.
   * Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;
-  /** 
+  /** Calculate backward projection.
   * Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
@ -95,29 +84,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;
-  /** 
+  /** Set the voxel dimensins on device.
-   *Set the voxel dimensins on device. 
+   * 
   */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
-  /** 
+  /** Set the image edge variables on the device.
-   * Set the image edge variables on the device.
+   * 
   */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
-  /** 
+  /** Set the image edge1 on the device.
-   * Set the image edge1 on the device.
+   * 
   */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
-  /** 
+  /** Set the minimum crystan in one ring values on the device.
-   * Set the minimum crystan in one ring values on the device.
+   * 
   */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;
-  /** 
+  /** Set all other required parameters for reconstruction.
-   * Set all other required parameters for reconstruction.
+   * 
   */
  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@ -18,17 +18,6 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 /** 
 * DKS autotuning class, allows auto-tuning of a defined function.
 * Executes the defined function for auto-tuning and searches for optimal parameters to improve
 * the function execution time. The function that is auto-tuned, its parameters and their ranges
 * need to be set. Includes multiple search methods that search the parameter space to find
 * the optimal solution.
 *  1) exhaustive search
 *  2) line search
 *  3) hill climbing
 *  4) simulated annealing
 */
 class DKSAutoTuning {
 private:
@ -47,13 +36,12 @@ private:
  int loops_m;
-  /** Update parameters from a state. */
+  /** Update parameters from a state */
  int setParameterValues(States states);
-  /** 
+  /** Evaluate the function and set execution time 
-   * Evaluate the function and set execution time 
+   *  Returns DKS_ERROR if errors occured during function execution. 
-   * Returns DKS_ERROR if errors occured during function execution. 
+   *  Returns DKS_SUCCESS if function executed as planned. 
   * Returns DKS_SUCCESS if function executed as planned. 
   */
  int evaluateFunction(double &value);
@ -62,13 +50,12 @@ public:
  /** Constructor */
  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
-  /** Destructor. */
+  /** Destructor */
  ~DKSAutoTuning();
-  /** 
+  /** Set function to auto tune.
-   * Set function to auto tune.
+   *  Caller of setFunction is responsible to bind the correct parameters 
-   * Caller of setFunction is responsible to bind the correct parameters 
+   *  to the function with std::bind.
   * to the function with std::bind.
   */
  void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
    f_m = f;
@ -76,21 +63,15 @@ public:
    evaluate_time_m = evaluate_time;
  }
  /** 
   * Set function to auto tune.
   * Caller of setFunction is responsible to bind the correct parameters 
   * to the function with std::bind.
   */
  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
    fd_m = f;
    function_name_m = name;
    evaluate_time_m = evaluate_time;
  }
-  /** 
+  /** Set parameter for auto tuning.
-   * Set parameter for auto tuning.
+   *  Provide a pointer to a parameter that will be changed during auto-tuning
-   * Provide a pointer to a parameter that will be changed during auto-tuning
+   *  and a min-max value for this element
   * and a min-max value for this element
   */
  template <typename T1>
  void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@ -104,9 +85,9 @@ public:
  /** Perform exaustive search evaluating all the parameter configurations */
  void exaustiveSearch();
-  /**
+  /** Perform auto-tuning.
-   * Perform line-search auto-tuning by variying parameters one at a time.
+   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
-   * After one parameter is auto-tuned the next on is varied
+   *  parameters constant.
   */
  void lineSearch();  
--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@ -4,7 +4,6 @@
 #include <iostream>
 #include <cmath>
 /** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {
  friend class DKSBaseMuSR;
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@ -1,3 +1,9 @@
 /** Class to save and load DKS autotunning configs.
 * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
 * Uses boost xml_parser to read and write the xml file and boost property tree to store
 * the xml content.
 */
 #ifndef DKS_CONFIG
 #define DKS_CONFIG
@ -5,7 +11,7 @@
 #include <boost/optional/optional.hpp>
 #include <boost/property_tree/xml_parser.hpp>
 #include <boost/foreach.hpp>
-//#include <boost/filesystem.hpp>
+#include <boost/filesystem.hpp>
 #include <string>
 #include <iostream>
 #include <cstdlib>
@ -18,18 +24,11 @@
 #include "../DKSDefinitions.h"
 namespace pt = boost::property_tree;
-//namespace fs = boost::filesystem;
+namespace fs = boost::filesystem;
 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";
 /** Class to save and load DKS autotunning configs.
 * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
 * Uses boost xml_parser to read and write the xml file and boost property tree to store
 * the xml content.
 * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
 * so the auto-tuning has no effect.
 */
 class DKSConfig {
 private:
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@ -9,9 +9,6 @@
 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
 /** 
 * Parameter class allows to change the searchable parameters during the auto-tuning.
 */
 class Parameter {
 private:
@ -67,10 +64,6 @@ public:
 };
 /**
 * Struct to hold an auto-tuning state.
 * Holds the current value, min, max and the step by which a state can change.
 */ 
 struct State {
  double value;
  double min;
@ -81,12 +74,6 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 /** 
 * Used by auto-tuning search algorithms to move between parameter configurations.
 * Allows moving from one parameter state to another, getting neighboring states,
 * moving to neighboring states and saving state information. Print functions are available
 * for debugging purposes, to follow how the algorithm moves between states.
 */
 class DKSSearchStates {
 private:
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -35,29 +35,13 @@ ENDMACRO ()
 SET (DKS_BASEDIR_HDRS
  DKSBase.h
  DKSDefinitions.h
  DKSFFT.h
  )
 SET (DKS_BASEDIR_SRCS
  DKSBase.cpp
  DKSFFT.cpp
  )
-#add opal to DKS if enable_opal is set
+IF (USE_CUDA OR USE_OPENCL)
 # When ENABLE_OPAL is set, append the OPAL interface header (DKSOPAL.h) and
 # source (DKSOPAL.cpp) to the base-directory header and source lists.
 IF (ENABLE_OPAL)
  # Keep the headers collected so far and add DKSOPAL.h.
  SET (DKS_BASEDIR_HDRS
    ${DKS_BASEDIR_HDRS}
    DKSOPAL.h
    )
  # Keep the sources collected so far and add DKSOPAL.cpp.
  SET (DKS_BASEDIR_SRCS
    ${DKS_BASEDIR_SRCS}
    DKSOPAL.cpp
    )
 ENDIF (ENABLE_OPAL)
 #and musrt to DKS if cuda or opencl is used and enable_musr is set
 IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
   SET (DKS_BASEDIR_HDRS
       ${DKS_BASEDIR_HDRS}
       DKSBaseMuSR.h
@ -67,10 +51,9 @@ IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
       ${DKS_BASEDIR_SRCS}
       DKSBaseMuSR.cpp
       )
-ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
+ENDIF (USE_CUDA OR USE_OPENCL)
-#add image reconstruction to DKS if cuda is used and enable_pet is set
+IF (USE_CUDA)
 IF (USE_CUDA AND ENABLE_PET)
  SET (DKS_BASEDIR_HDRS
    ${DKS_BASEDIR_HDRS}
    DKSImageReconstruction.h
@ -80,7 +63,7 @@ IF (USE_CUDA AND ENABLE_PET)
    ${DKS_BASEDIR_SRCS}
    DKSImageReconstruction.cpp
    )
-ENDIF (USE_CUDA AND ENABLE_PET)
+ENDIF (USE_CUDA)
 ADD_HEADERS (${DKS_BASEDIR_HDRS})
 ADD_SOURCES (${DKS_BASEDIR_SRCS})
@ -112,18 +95,26 @@ IF (USE_CUDA)
  CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
  CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
-  TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
+  IF (USE_UQTK)
-  TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
+    TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  #TARGET_LINK_LIBRARIES(dks)
+    TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  #TARGET_LINK_LIBRARIES(dksshared)
+  ELSE (USE_UQTK)
    TARGET_LINK_LIBRARIES(dks cudadevrt)
    TARGET_LINK_LIBRARIES(dksshared cudadevrt)
  ENDIF (USE_UQTK)
 ELSE (USE_CUDA)
  MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
  ADD_LIBRARY(dks ${DKS_SRCS})
  ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
-  TARGET_LINK_LIBRARIES(dks)
+  IF (USE_UQTK)
-  TARGET_LINK_LIBRARIES(dksshared)
+    TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
    TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
  ELSE (USE_UQTK)
    TARGET_LINK_LIBRARIES(dks)
    TARGET_LINK_LIBRARIES(dksshared)
  ENDIF(USE_UQTK)
 ENDIF (USE_CUDA)
--- a/src/CUDA/CMakeLists.txt
+++ b/src/CUDA/CMakeLists.txt
@ -1,27 +1,35 @@
-SET (_HDRS CudaBase.cuh CudaFFT.cuh)
+SET (_HDRS
-SET (_SRCS CudaBase.cu CudaFFT.cu)
+	CudaBase.cuh
 	CudaFFT.cuh
 	CudaGreensFunction.cuh
 	CudaChiSquare.cuh
 	CudaCollimatorPhysics.cuh
 	CudaImageReconstruction.cuh
 	CudaChiSquareRuntime.cuh
  )
 SET (_SRCS
 	CudaBase.cu
 	CudaFFT.cu
 	CudaGreensFunction.cu
 	CudaChiSquare.cu
 	CudaCollimatorPhysics.cu
 	CudaImageReconstruction.cu
 	CudaChiSquareRuntime.cu
 )
-IF (ENABLE_OPAL)
+#INCLUDE_DIRECTORIES (
-  SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
+#  ${CMAKE_CURRENT_SOURCE_DIR}
-  SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
+#)
 ENDIF (ENABLE_OPAL)
 IF (ENABLE_MUSR)
  SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
  SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
  SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
 ENDIF (ENABLE_MUSR)
 IF (ENABLE_PET)
  SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
  SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
 ENDIF (ENABLE_PET)
 MESSAGE (STATUS "CUDA headers: ${_HDRS}")
 ADD_SOURCES(${_SRCS})
 ADD_HEADERS(${_HDRS})
 INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
 SET (_KERNELS
  NVRTCKernels/CudaChiSquareKernel.cu
  )
 INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -13,13 +13,6 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
 }
 // Kernel: writes one value drawn with curand_uniform_double into each of the first
 // `size` entries of `data`. Thread idx uses its own generator state state[idx];
 // threads whose index is >= size do nothing.
 // NOTE(review): assumes `state` holds at least `size` initialised curandState
 // entries -- confirm against the caller.
 __global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
  // Global thread index, computed from the x dimension of the launch only.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard: the grid may contain more threads than there are elements.
  if (idx < size)
    data[idx] = curand_uniform_double(&state[idx]);
 }
 //=====================================//
 //==========Private functions==========//
@ -55,7 +48,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
  int threads = 128;
  int blocks = size / threads + 1;
-  if (seed == -1) 
+  if (seed == -1)
    seed = time(NULL);
  //std::cout << "sizeof: " << sizeof(curandState) << std::endl;
@ -76,15 +69,6 @@ int CudaBase::cuda_deleteCurandStates() {
  return DKS_SUCCESS;
 }
 // Launches kernelCreateRandNumbers to fill `size` doubles at mem_ptr using the
 // states in defaultRndState. mem_ptr is cast to double* without any checking.
 // Always returns DKS_SUCCESS: the result of the kernel launch is not inspected.
 // NOTE(review): presumably defaultRndState must already hold at least `size` states
 // (see cuda_createCurandStates) -- confirm against callers.
 int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
  int threads = BLOCK_SIZE;
  // One extra block so that sizes that are not a multiple of `threads` are covered;
  // the kernel itself bounds-checks against `size`.
  int blocks = size / threads + 1;
  kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
  return DKS_SUCCESS;
 }
 // Returns the defaultRndState pointer held by this CudaBase instance.
 // No copy is made and no validity check is performed on the pointer.
 curandState* CudaBase::cuda_getCurandStates() {
  return defaultRndState;
 }
@ -342,3 +326,62 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
  return DKS_SUCCESS;
 }
 /*
  Info: allcate memory and write data (push)
  Return: pointer to memory object
 */
 /*
  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
  void * mem_ptr;
  mem_ptr = cuda_allocateMemory(size, ierr);
  if (ierr == DKS_SUCCESS)
  ierr = cuda_writeData(mem_ptr, in_data, size);
  return mem_ptr;
  }
 */
 /*
  Info: read data and free memory (pull)
  Return: success or error code
 */
 /*
  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
  ierr = cuda_readData(mem_ptr, out_data, size);
  if (ierr == DKS_SUCCESS)
  ierr = cuda_freeMemory(mem_ptr);	
  else
  return DKS_ERROR;
  if (ierr == DKS_SUCCESS)	
  return DKS_SUCCESS;
  else
  return DKS_ERROR;
  }
 */
 /*
  Info: execute function.
  Placeholder: only prints "Execute function" to stdout; no device work is done.
  Return: always DKS_SUCCESS
 */
 int CudaBase::cuda_executeFunction() {
  std::cout << "Execute function" << std::endl;
  return DKS_SUCCESS;
 }
 /*
  Info: clean up.
  Placeholder: only prints "clean up" to stdout; nothing is released here.
  Return: always DKS_SUCCESS
 */
 int CudaBase::cuda_cleanUp() {
  std::cout << "clean up" << std::endl;
  return DKS_SUCCESS;
 }
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@ -12,15 +12,9 @@
 #include <cufft.h>
 #include <cublas_v2.h>
 #include <curand_kernel.h>
 #include <nvToolsExt.h>
 #include <time.h>
 #define BLOCK_SIZE 128
 /**
 * CUDA base class handles device setup and basic communication with the device.
 * Handles device setup, memory management, data transfers and stream setup for
 * asynchronous data transfers and kernel executions.
 */
 class CudaBase {
 private:
@ -45,7 +39,6 @@ public:
   * Init cuda random number (cuRand) states.
   * Create an array of type curandState  with "size" elements on the GPU
   * and create a curandState with different seed for each array entry.
   * If no seed is given create a seed based on current time.
   * Return success or error code
   */
  int cuda_createCurandStates(int size, int seed = -1);
@ -53,17 +46,12 @@ public:
  /**
   * Delete curandState.
   * Delete curandState array on the GPU and free memory.
-   * Return success or error code
+   *  Return success or error code
   */
  int cuda_deleteCurandStates();
-  /** 
+  /** Get a pointer to curand states
-   * Create 'size' random numbers on the device and save in mem_ptr array.
+   *
   */
  int cuda_createRandomNumbers(void *mem_ptr, int size);
  /** 
   * Get a pointer to curand states.
   */
  curandState* cuda_getCurandStates();
@ -80,98 +68,93 @@ public:
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);
  /**
-   * delete cuda stream.
+   * delete cuda stream
   * success or error code
   */
  int cuda_deleteStream(int id);
  /**
-   * delete all streams.
+   * delete all streams
   * success or error code
   */
  int cuda_deleteStreams();
  /**
-   * set stream to use.
+   * set stream to use
   * success or error code
   */
  int cuda_setStream(int id);
  /**
-   * get stream that is used.
+   * Info: get stream that is used
-   * Return: return id of curretn stream
+   *  Return: return id of curretn stream
   */
  int cuda_getStreamId();
  /**
-   * reset to default stream.
+   * Info: reset to default stream
   * Return: success or error code
   */
  int cuda_defaultStream();
  /**
-   * get number of streams.
+   * Info: get number of streams
   * Return: success or error code
   */
  int cuda_numberOfStreams();
  /**
-   * get stream.
+   * Info: get stream
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);
  /**
-   * Get default cublass handle.
+   * Get default cublass handle
   */
  cublasHandle_t cuda_getCublas();
  /**
-   * get information on cuda devices.
+   * Info: get information on cuda devices
   * Return: success or error code
   */
  int cuda_getDevices();
-  /** 
+  /** Get CUDA device count.
-   * Get CUDA device count.
+   *  Sets the number of devices on the platform that can use CUDA.
-   * Sets the number of devices on the platform that can use CUDA.
+   *  Returns DKS_SUCCESS
   */
  int cuda_getDeviceCount(int &ndev);
-  /** 
+  /** Get the name of the device.
-   * Get the name of the device.
+   *  QUery the device properties of the used device and set the string device_name
   * QUery the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);
-  /** 
+  /** Set CUDA device to use.
-   * Set CUDA device to use.
+   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
   * If device passed in is larger than the number of devices use 
   * the default:0 and return DKS_ERROR 
   */
  int cuda_setDevice(int device);
-  /** 
+  /** Get unique devices
-   * Get unique devices.
+   *  Get array of indeces with the unique CUDA devices available on the paltform
   * Get array of indeces with the unique CUDA devices available on the paltform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);
  /**
-   * Initialize connection to the device.
+   * Info: init device
   * Only needed when runtime compilation is used.
   * Return: success or error code
   */
  int cuda_setUp();
  /**
-   * Allocate memory on cuda device.
+   * Info: allocate memory on cuda device
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);
  /**
-   * Allocate host memory in pinned memory
+   * Info: allocate host memory in pinned memory
   * Return: success or error code
   */
  template<typename T>
@ -185,43 +168,7 @@ public:
  }		
  /** 
-   * Zero CUDA memory.
+   * Info: write data to memory
   * Set all the elements of the array on the device to zero.
   */
  // Synchronous zero-fill: sets `size` elements of type T to zero, starting `offset`
  // elements past mem_ptr. Returns DKS_ERROR if cudaMemset fails, DKS_SUCCESS otherwise.
  template<typename T>
  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
    cudaError cerror;
    // `mem_ptr + offset` is pointer arithmetic in units of T; the byte count is
    // sizeof(T) * size.
    cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error zeroing cuda memory!\n");
      return DKS_ERROR;
    }
    return DKS_SUCCESS;
  }
  /** 
   * Zero CUDA memory async.
   * Sets `size` elements of type T, starting `offset` elements past mem_ptr, to zero
   * with cudaMemsetAsync on the stream returned by cuda_getStream(streamId).
   * Returns DKS_ERROR if streamId is not below cuda_numberOfStreams() or if
   * cudaMemsetAsync fails, DKS_SUCCESS otherwise.
   * NOTE(review): the default streamId = -1 passes the bounds check and is forwarded
   * to cuda_getStream(-1) -- confirm that cuda_getStream handles -1.
   */
  template<typename T>
  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
    int dkserror = DKS_SUCCESS;
    cudaError cerror;
    // Only an upper bound is checked here; negative ids are not rejected.
    if (streamId < cuda_numberOfStreams()) {
 	cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size, 
 				 cuda_getStream(streamId));
 	if (cerror != cudaSuccess)
 	  dkserror = DKS_ERROR;
    } else
      dkserror = DKS_ERROR;
    return dkserror;
  }
  /** 
   * Write data to memory
   * Return: success or error code
   */
  template<typename T>
@ -238,7 +185,7 @@ public:
  }
  /**
-   * Write data assynchonuously
+   * Info: write data assynchonuously
   * Return: success or error code
   */
  template<typename T>
@ -270,7 +217,7 @@ public:
  }
  /**
-   * Read data from memory
+   * Info: read data from memory
   * Return: success or error code
   */
  template<typename T>
@ -287,7 +234,7 @@ public:
  }
  /**
-   * Read data async from device memory
+   * Info: read data async from device memory
   * Return: success or error code
   */
  template<typename T>
@ -319,19 +266,19 @@ public:
  }
  /**
-   * Free memory on device
+   * Info: free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);
  /**
-   * Free page locked memory on host
+   * Info: free page locked memory on host
   * Return: success or error code
   */
  int cuda_freeHostMemory(void * mem_ptr);
  /**
-   * Allcate memory and write data (push)
+   * Info: allcate memory and write data (push)
   * Return: pointer to memory object
   */
  template<typename T>
@ -347,7 +294,7 @@ public:
  }
  /**
-   * Read data and free memory (pull)
+   * Info: read data and free memory (pull)
   * Return: success or error code
   */
  template<typename T>
@ -365,10 +312,21 @@ public:
    else
      return DKS_ERROR;
  }
  /**
   * Info: execute function
   * Return: success or error code
   */
  int cuda_executeFunction();
  /**
   * Info: clean up
   * Return: success or error code
   */
  int cuda_cleanUp();
  /**
-   * Sync cuda device.
+   * Info: sync cuda device
   * Waits till all the tasks on the GPU are finished.
   * Return: success or error code
   */
  int cuda_syncDevice() {
@ -377,7 +335,7 @@ public:
  }
  /**
-   * Page-lock host memory.
+   * Page-lock host memory
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
@ -391,7 +349,7 @@ public:
  }
  /**
-   * Release page locked memory.
+   * Release page locked memory
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
@ -404,7 +362,7 @@ public:
  }
  /**
-   * Print device memory info (total, used, avail)
+   * Info: print device memory info (total, used, avail)
   * Return: success or error code
   */
  int cuda_memInfo() {
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@ -8,7 +8,6 @@
 #include "CudaBase.cuh"
 /** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {
 private:
--- a/src/CUDA/CudaChiSquareRuntime.cu
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@ -86,19 +86,15 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  //create program
  nvrtcProgram prog;
-//  std::cout << cudaProg.c_str() << std::endl;
+  //std::cout << cudaProg.c_str() << std::endl;
-  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+  nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
  if (createResult != NVRTC_SUCCESS) {
    DEBUG_MSG("Program creation failed!");
    return DKS_ERROR;
  }
  //compile program
-  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+  const char *opts[] = {"-fmad=false", ""};
-  int numopts = 2;
+  int numopts = 1;
  if (mlh) {
-    opts[2] = "-DMLH";
+    opts[1] = "-DMLH";
-    numopts = 3;
+    numopts = 2;
  }
  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
@ -122,11 +118,7 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  if (ptx_m != NULL)
    delete[] ptx_m;
  size_t ptxSize; 
-  nvrtcResult ptxSizeResult = nvrtcGetPTXSize(prog, &ptxSize);
+  nvrtcGetPTXSize(prog, &ptxSize); 
  if (ptxSizeResult != NVRTC_SUCCESS) {
    DEBUG_MSG("PTX get size error!");
    return DKS_ERROR;
  }
  ptx_m = new char[ptxSize]; 
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);  
@ -135,26 +127,10 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
    return DKS_ERROR;
  }
  // add some additional diagnostics
  const int buffer_size = 8192;
  CUjit_option options[3];
  void* values[3];
  char error_log[buffer_size];
  int err;
  options[0] = CU_JIT_ERROR_LOG_BUFFER;
  values[0]  = (void*)error_log;
  options[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  values[1]  = (void*)buffer_size;
  options[2] = CU_JIT_TARGET_FROM_CUCONTEXT;
  values[2]  = 0;
  //load module from ptx
-  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 3, options, values); 
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); 
  if (loadResult != CUDA_SUCCESS) {
-    const char *err_msg;
+    DEBUG_MSG("Load module from ptx failed!");
    cuGetErrorString(loadResult, &err_msg);
    std::string msg = "Load module from ptx failed! (" + std::to_string(loadResult) + ") : " + err_msg;
    DEBUG_MSG(msg);
    DEBUG_MSG(error_log);
    return DKS_ERROR;
  }
--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@ -15,10 +15,6 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
 const std::string cudaFunctFooter = "}\n";
 /**
 * CUDA implementation of ChiSquareRuntime class.
 * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
 */
 class CudaChiSquareRuntime : public ChiSquareRuntime{
 private:
@ -33,72 +29,65 @@ private:
  cublasHandle_t defaultCublasRT;
-  /** 
+  /** Setup to init device
-   * Setup to init device.
+   *  Create context and init device for RT compilation
   * Create context and init device for RT compilation
   */
  void setUpContext();
-  /** 
+  /** Private function to add function to kernel string
-   * Private function to add function to kernel string.
+   *
   */
  std::string buildProgram(std::string function);
 public:
-  /** 
+  /** Constructor with CudaBase argument
-   * Constructor with CudaBase argument
+   *
   */
  CudaChiSquareRuntime(CudaBase *base);
-  /** 
+  /** Default constructor init cuda device
-   * Default constructor init cuda device
+   *
   */
  CudaChiSquareRuntime();
-  /** 
+  /** Default destructor
-   * Default destructor.
+   *
   */
  ~CudaChiSquareRuntime();
-  /** 
+  /** Compile program and save ptx.
   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);
-  /** 
+  /** Launch selected kernel
   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable.
+   * Result is put in &result variable
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);
-  /** 
+  /** Write params to device.
   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 
-  /** 
+  /** Write functions to device.
   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);
-  /** 
+  /** Write maps to device.
   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);
-  /** 
+  /** Allocate temporary memory needed for chi square.
   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -107,16 +96,14 @@ public:
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
-  /** 
+  /** Free temporary memory allocated for chi square.
   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();
-  /** 
+  /** Check if CUDA device is able to run the chi square kernel.
-   * Check if CUDA device is able to run the chi square kernel.
+   *  Redundant - all new CUDA devices that support RT compilation will also support 
-   * Redundant - all new CUDA devices that support RT compilation will also support 
+   *  double precision, there are no other requirements to run chi square on GPU
   * double precision, there are no other requirements to run chi square on GPU
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
    return DKS_SUCCESS;
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@ -1,16 +1,16 @@
 #include "CudaCollimatorPhysics.cuh"
-//constants used in OPAL
+//#define M_P 0.93827231e+00
 #define M_P 0.93827204e+00
 #define C 299792458.0
 #define PI 3.14159265358979323846
 #define AVO 6.022e23
 #define R_E 2.81794092e-15
 //#define eM_E 0.51099906e-03
 #define eM_E 0.51099892e-03
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
 //parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@ -26,53 +26,14 @@
 #define LOWENERGY_THR 12
 #define BLOCK_SIZE 128
-#define NUMPAR 13
+#define NUMPAR 12
 /**
 * CUDA device function for calculating dot product.
 * Returns d1.x*d2.x + d1.y*d2.y + d1.z*d2.z; neither argument is written to.
 * NOTE(review): both parameters are non-const references, so callers cannot
 * pass temporaries or const vectors.
 */
 __device__ inline double dot(double3 &d1, double3 &d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }
 /**
 * CUDA device function to calculate the cross product lhs x rhs.
 * Neither argument is written to; the result is returned by value.
 */
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  // components are listed in x, y, z order
  return make_double3(lhs.y * rhs.z - lhs.z * rhs.y,
                      lhs.z * rhs.x - lhs.x * rhs.z,
                      lhs.x * rhs.y - lhs.y * rhs.x);
 }
 /**
 * CUDA device function to calculate arbitrary rotation.
 * Rotates the vector Rorg about the axis W by the angle Theta (radians) using
 * Rodrigues' rotation formula and returns the rotated vector by value.
 *
 * Side effect: W is passed by non-const reference and is normalised in place,
 * so the caller's W has unit length after the call. Rorg is not modified.
 * A zero-length W makes the normalisation divide by zero.
 */
 __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
  double c=cos(Theta);
  double s=sin(Theta);
  // normalise the rotation axis (written back into the caller's W)
  double dotW = sqrt(dot(W,W));
  W.x = W.x / dotW;
  W.y = W.y / dotW;
  W.z = W.z / dotW;
  // (W . Rorg) * (1 - cos) : weight of the component parallel to the axis
  double dotWR = dot(W, Rorg) * (1.0 - c);
  double3 crossW = cross(W, Rorg);
  double3 tmp;
  // Rorg*cos + (W x Rorg)*sin + W*(W . Rorg)*(1 - cos)
  tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
  tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
  tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
  return tmp;
 } 
 /**
 * CUDA device function to check if particle is still in material.
 * z - particle position, par - parameter array. Particle is considered inside the
 * material if z is > material starting position and z < material starting position - mat size.
 */
 __device__ inline bool checkHit(double &z, double *par) {
  /* check if particle is in the degrader material */
@ -81,11 +42,6 @@ __device__ inline bool checkHit(double &z, double *par) {
 }
 /**
 * CUDA device function to calculate energyLoss for one particle.
 * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
 * algorithm are available in the OPAL user guide.
 */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {
@ -130,53 +86,49 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
 }
-/**
+__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
- * CUDA device function for rotation in 2 dimensions.
+			   double &normP, double &thetacou, double &deltas, int coord,
 * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
 * "Passage of particles through matter"
 */
 __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &plane, 
 			   double &betaGamma, double &thetacou, double &deltas, int coord,
 			   double *par) 
 {
-  // Calculate the angle between the px and pz momenta to change from beam coordinate to lab coordinate
+  double Psixz;
-  const double Psi = atan2(px, pz);
+  double pxz;
  const double pxz = sqrt(px*px + pz*pz);
  const double cosPsi = cos(Psi);
  const double sinPsi = sin(Psi);
  const double cosTheta = cos(thetacou);
  const double sinTheta = sin(thetacou);
-  // Apply the rotation about the random angle thetacou & change from beam
+  if (px>=0 && pz>=0)
-  // coordinate system to the lab coordinate system using Psixz (2 dimensions)
+    Psixz = atan(px/pz);
-  x += deltas * px / betaGamma + plane * cosPsi;
+  else if (px>0 && pz<0)
-  z -= plane * sinPsi;
+      Psixz = atan(px/pz) + PI;
  else if (px<0 && pz>0)
    Psixz = atan(px/pz) + 2*PI;
  else
    Psixz = atan(px/pz) + PI;
-  if (coord == 1) {
+  pxz = sqrt(px*px + pz*pz);
-    z += deltas * pz / betaGamma;
+
  if(coord==1) {
    x = x + deltas * px/normP + xplane*cos(Psixz);
    z = z - xplane * sin(Psixz);
  }
-  px = pxz * (cosPsi * sinTheta + sinPsi * cosTheta);
+  if(coord==2) {
-  pz = pxz * (-sinPsi * sinTheta + cosPsi * cosTheta);
+    x = x + deltas * px/normP + xplane*cos(Psixz);
    z = z - xplane * sin(Psixz) + deltas * pz / normP;
  }
  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
-/**
+__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, 
- * CUDA device function to calculate Coulomb scattering for one particle.
+				   double* par, bool enableRutherfordScattering) 
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * For details on the algorithm see OPAL user guide.
 */
 __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
 				   bool enableRutherfordScattering) 
 {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
  double gamma = (Eng + M_P) / M_P;
  double normP = sqrt(dot(P, P));
  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
  double betaGamma = sqrt(dot(P, P));
  double deltas = par[DT_M] * beta * C;
  double mass = M_P * 1e9; // in eV
-  double theta0 = 13.6e6 / (beta * betaGamma * mass) * 
+  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * 
    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
  // x-direction: See Physical Review, "Multiple Scattering"
@ -191,9 +143,19 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
  }
  //__syncthreads();  
-  //double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+
-  double xplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
+  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
-  Rot(P.x, P.z, R.x, R.z, xplane, betaGamma, thetacou, deltas, 0, par);
+  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
  if( (P2 < 0.0047) && enableRutherfordScattering) {
    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
    if(P4 > 0.5)
      thetaru = -thetaru;
    Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
  }
  // y-direction: See Physical Review, "Multiple Scattering"
  z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
@ -206,43 +168,24 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
    thetacou = z2 * theta0;
  }
-  //double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  //__syncthreads();
  double yplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
  Rot(P.y,P.z,R.y,R.z, yplane, betaGamma, thetacou, deltas, 1, par);
-  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
  P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
  if( (P2 < 0.0047) && enableRutherfordScattering) {
    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    //double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
-    double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
+    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    double phiru = 2.0 * M_PI * curand_uniform_double(&state);
+    if(P4 > 0.5)
-    double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
+      thetaru = -thetaru;
-    double3 W,X;
+    Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
    double dotP = sqrt(dot(P,P));
    X.x = cos(phiru)*sin(thetaru) * dotP;
    X.y = sin(phiru)*sin(thetaru) * dotP;
    X.z = cos(thetaru) * dotP;
    W.x = -P.y;
    W.y = P.x;
    W.z = 0.0;
    P = ArbitraryRotation(W, X, th0);
  }
 }
 /**
 * CUDA kernel that performs one step in particle movement through matter.
 * One thread is launched for each particle in the simulation. The kernel checks if the particle
 * is still in the material, performs energy loss calculations and Coulomb scattering, and marks
 * particles that are exiting the material.
 * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
 * @param[in] *par array of material properties, always constant size - 13
 * @param[in] *state array holding cuRand states to preserve states between kernel launches
 * @param[in] numparticles number of particles in the simulation
 * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
 */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
 					int numparticles, bool enableRutherfordScattering)
@ -252,63 +195,51 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;
-  //transfer params and particle positions to shared memory
+  //transfer params to shared memory
  //R is kept in shared memory in order to reduce register pressure for the kernel
  extern __shared__ double smem[];
  double *p = (double*)smem;
-  double3 *R = (double3*)&smem[NUMPAR]; 
+  double3 *R = (double3*)&smem[NUMPAR];
-  curandState s; //each tread gets its own cuRand state for random number generation
+  curandState s; 
  double3 P;
  //load parameters to shared memory
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];
  //sync threads to ensure that parameters are finished loading
  __syncthreads();
  //there might be some empty threads that do no work
  if (idx < numparticles) {
-    s = state[idx]; //load cuRand state to local memory
+    s = state[idx];
-    R[tid] = data[idx].Rincol; //load position to shared memory
+    R[tid] = data[idx].Rincol;
-    P = data[idx].Pincol; //load momentum to local memory
+    P = data[idx].Pincol;
    bool pdead = false;  
    volatile double sq = sqrt(1.0 + dot(P, P));
    double Eng;
    //check if particle is still in the material
    if (checkHit(R[tid].z, p)) {      
      //calculate enery loss
      Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);
      //check if particle is not dead
      if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));
 	//caluclate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
 	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
 	//update particle momentum
 	data[idx].Pincol = P;
      } else {
 	//mark particle as dead (-1)
 	data[idx].label = -1;
      }
-      
+    
      //update cuRand state
      state[idx] = s;
    } else {
-      //particle exits material - drift and mark as exiting (-2)
+    
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -316,25 +247,14 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
    }
    //update particle position
    data[idx].Rincol = R[tid];
  }
 }
-/**
+__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
- * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
+					 curandState *state, int numparticles,  
- * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
+					 bool enableRutherfordScattering)
 * Deprecated - GPU version does not use SoA.
 * @param[in] data structure of arrays containing particle data
 * @param[in] *par array of material properties, always constant size - 13
 * @param[in] *state array holding cuRand states to preserve states between kernel launches
 * @param[in] numparticles number of particles in the simulation
 * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
 */
 __global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
 					   curandState *state, int numparticles,
 					   bool enableRutherfordScattering)
 {
  //get global id and thread id
@ -393,32 +313,92 @@ __global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
 }
-/**
+
 * Device function to switch off unitless positions.
 */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
  // scale every component of a by c, in place
  a.x = a.x * c;
  a.y = a.y * c;
  a.z = a.z * c;
 }
 /**
 * Device function to switch on unitless positions:
 * divides every component of a by c, in place.
 */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x = a.x / c;
  a.y = a.y / c;
  a.z = a.z / c;
 }
 // Switch to unitless positions with a common factor dtc: R and X of every
 // particle are divided by dtc. One thread handles one particle; threads whose
 // index is >= npart do nothing.
 __global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;
  // work on local copies, then store them back
  double3 pos = gR[idx];
  double3 ref = gX[idx];
  unitlessOn(pos, dtc);
  unitlessOn(ref, dtc);
  gR[idx] = pos;
  gX[idx] = ref;
 }
 // Switch to unitless positions with a per-particle time step: R and X of
 // particle idx are divided by gdt[idx] * c.
 //   gR, gX - device arrays of npart vectors, updated in place
 //   gdt    - device array of npart time steps
 //   c      - factor multiplied with each time step
 //   npart  - number of particles; threads with idx >= npart do nothing
 __global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    double dt = gdt[idx];
    // Switching TO unitless means dividing (unitlessOn), exactly like the
    // scalar-dtc overload; unitlessOff would multiply and make this kernel
    // indistinguishable from kernelSwitchOffUnitlessPositions.
    unitlessOn(R, dt*c);
    unitlessOn(X, dt*c);
    gR[idx] = R;
    gX[idx] = X;
  }
 }
 // Switch off unitless positions with a common factor dtc: R and X of every
 // particle are multiplied by dtc. One thread handles one particle; threads
 // whose index is >= npart do nothing.
 __global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;
  // work on local copies, then store them back
  double3 pos = gR[idx];
  double3 ref = gX[idx];
  unitlessOff(pos, dtc);
  unitlessOff(ref, dtc);
  gR[idx] = pos;
  gX[idx] = ref;
 }
 // Switch off unitless positions with a per-particle time step: R and X of
 // particle idx are multiplied by gdt[idx] * c. Threads whose index is >= npart
 // do nothing.
 __global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;
  // work on local copies, then store them back
  double3 pos = gR[idx];
  double3 ref = gX[idx];
  double step = gdt[idx];
  unitlessOff(pos, step*c);
  unitlessOff(ref, step*c);
  gR[idx] = pos;
  gX[idx] = ref;
 }
 /**
 * CUDA kernel to perform particle push.
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] npart number of particles
 * @param[in] dtc dt*c
 */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  //get global id and thread id
@ -446,15 +426,8 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  }
 }
-/**
+
- * CUDA kernel to perform particle push.
+__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] *gdt array of time steps for each particle
 * @param[in] npart number of particles
 * @param[in] c speed of light
 */
 __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
  //get global id and thread id
  volatile int tid = threadIdx.x;
@ -480,61 +453,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
  }
 }
-/** 
+//TODO: kernel for push with switch off unitless positions with dt[i]*c
 * CUDA kernel to perform particle kick.
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] *gEf 
 * @param[in] *gBf
 * @param[in] *gdt array of time steps for each particle
 * @param[in] npart number of particles
 * @param[in] c speed of light
 */
 // Momentum kick for one particle per thread: half electric kick, magnetic
 // rotation built from the auxiliary vectors t and s, second half electric kick.
 // Only gP is written. gR is part of the signature but is not read by this
 // kernel. Threads with idx >= npart do nothing.
 __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
                            double3 *gBf, double *gdt, double charge,
                            double mass, int npart, double c)
 {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 P = gP[idx];
    double3 Ef = gEf[idx];
    double3 Bf = gBf[idx];
    double dt = gdt[idx];
    // common factor of both electric half kicks
    const double halfKick = 0.5 * dt * charge * c / mass;
    // first half kick by the electric field
    P.x += halfKick * Ef.x;
    P.y += halfKick * Ef.y;
    P.z += halfKick * Ef.z;
    // gamma is evaluated after the first half kick
    double gamma = sqrt(1.0 + dot(P, P));
    const double tScale = 0.5 * dt * charge * c * c / (gamma * mass);
    double3 t, w, s;
    t.x = tScale * Bf.x;
    t.y = tScale * Bf.y;
    t.z = tScale * Bf.z;
    // w = P + P x t
    double3 crossPt = cross(P, t);
    w.x = P.x + crossPt.x;
    w.y = P.y + crossPt.y;
    w.z = P.z + crossPt.z;
    // s = 2 t / (1 + t.t)
    const double sScale = 2.0 / (1.0 + dot(t, t));
    s.x = sScale * t.x;
    s.y = sScale * t.y;
    s.z = sScale * t.z;
    // P += w x s completes the magnetic rotation
    double3 crossws = cross(w, s);
    P.x += crossws.x;
    P.y += crossws.y;
    P.z += crossws.z;
    // second half kick by the electric field
    P.x += halfKick * Ef.x;
    P.y += halfKick * Ef.y;
    P.z += halfKick * Ef.z;
    gP[idx] = P;
  }
 }
 __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
@ -639,7 +558,64 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
 }
-int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
+struct compare_particle
 {
  // Functor over CUDA_PART offering a binary ordering and a unary predicate,
  // both based on the particle label.
  // threshold: limit used by the unary predicate; 0 until set_threshold() is called.
  int threshold;
  compare_particle() {
    threshold = 0;
  }
  // NOTE(review): the constructor and set_threshold() carry no __host__ __device__
  // qualifiers, so presumably they are meant to be called from host code only.
  void set_threshold(int t) {
    threshold = t;
  }
  // Ordering: true when p1.label is greater than p2.label (descending by label).
  __host__ __device__
  bool operator()(CUDA_PART p1, CUDA_PART p2) {
    return p1.label > p2.label;
  }
  // Predicate: true when the particle's label is below the threshold.
  __host__  __device__
  bool operator()(CUDA_PART p1) {
    return p1.label < threshold;
  }
 };
 // Functor over CUDA_PART_SMALL: a binary ordering (descending by label) and a
 // unary predicate (label below threshold). The threshold starts at 0.
 struct compare_particle_small
 {
  int threshold;
  compare_particle_small() : threshold(0) {}
  void set_threshold(int t) { threshold = t; }
  // ordering: p1 goes before p2 when its label is the larger one
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    return p2.label < p1.label;
  }
  // predicate: label lies below the configured threshold
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1) {
    return threshold > p1.label;
  }
 };
 // Unary predicate: true for negative integers.
 struct less_then
 {
  __host__ __device__
  bool operator()(int x) { return 0 > x; }
 };
 int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 					     bool enableRutherfordScattering)
 {
@ -701,12 +677,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
    }
  } else {
    if (streamId == -1) {
-      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, 
+      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-				      (double*)dt_ptr, npart, c);
+				      (double*)dt_ptr, c);
    } else {
      cudaStream_t cs = m_base->cuda_getStream(streamId);
-      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
+      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-					      (double*)dt_ptr, npart, c);
+					      (double*)dt_ptr, c);
    }
  }
@ -714,29 +690,6 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
  return DKS_SUCCESS;
 }
 int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 						void *bf_ptr, void *dt_ptr, double charge,
 						double mass, int npart,
 						double c, int streamId) 
 {
  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;
  //call kernel
  if (streamId == -1) {
    kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
 				    (double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
 					    (double3*)ef_ptr, (double3*)bf_ptr, 
 					    (double*)dt_ptr, charge, mass,  npart, c);
  }
  return DKS_SUCCESS;
 }
 int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 							 void *lastSec_ptr, void *orient_ptr, 
 							 int npart, int nsec, 
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@ -20,8 +20,7 @@
 #include "CudaBase.cuh"
 /**
- * Structure for storing particle on GPU or MIC as AoS.
+ * Structure for storing particle on GPU
 * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
 */
 typedef struct __align__(16) {
  int label;
@ -38,10 +37,7 @@ typedef struct __align__(16) {
 } CUDA_PART;
 /**
- * Structure for storing particle on GPU as AoS
+ * Structure for storing particle on GPU
 * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
 * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
 * on the host side.
 */
 typedef struct {
  int label;
@ -51,8 +47,7 @@ typedef struct {
 } CUDA_PART_SMALL;
 /**
- * Structure for storing particle on GPU as SoA.
+ * Structure for storing particle on GPU
 * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
 */
 typedef struct {
  int *label;
@ -70,9 +65,6 @@ typedef struct {
 /**
 * Structure for storing particle on GPU
 * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
 * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
 * on the host side.
 */
 typedef struct {
  int *label;
@ -81,39 +73,11 @@ typedef struct {
  double3 *Pincol;
 } CUDA_PART2_SMALL;
-/** 
+/** CudaCollimatorPhysics class.
 * Operator used in thrust sort to compare particles by label.
 * Used to move dead particles to the end of array, since they have label -1 or -2.
 */
 struct compare_particle_small
 {
  // limit for the unary predicate, 0 by default
  int threshold;
  compare_particle_small() : threshold(0) {}
  void set_threshold(int t) { threshold = t; }
  // binary form: orders particles by descending label
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    const bool firstIsLarger = p1.label > p2.label;
    return firstIsLarger;
  }
  // unary form: selects particles whose label is below the threshold
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1) {
    const bool below = p1.label < threshold;
    return below;
  }
 };
 /** 
 * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Contains kernels that execute CollimatorPhysics functions from OPAL.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation
 */
-class CudaCollimatorPhysics : public DKSCollimatorPhysics {
+class CudaCollimatorPhysics : public DKSCollimatorPhysics{
 private:
@ -122,44 +86,32 @@ private:
 public:
-  /** 
+  /** Constructor with CudaBase argument
-   * Constructor with CudaBase as argument.
+   *
   * Create a new instance of the CudaCollimatorPhysics using existing CudaBase object.
   */
  CudaCollimatorPhysics(CudaBase *base) {
    // the base is borrowed, so the destructor must not delete it
    base_create = false;
    m_base = base;
  }
-  /** 
+  /** Constructor - empty. */
   * Empty constructor.
   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
   */
  CudaCollimatorPhysics() {
    // this instance owns the CudaBase it allocates here; the destructor frees it
    base_create = true;
    m_base = new CudaBase();
  }
-  /** 
+  /** Destructor - empty */
   * Destructor.
   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
   */
  ~CudaCollimatorPhysics() {
    // a borrowed CudaBase stays alive; only the self-created one is freed
    if (!base_create)
      return;
    delete m_base;
  };
-  /** 
+  /** Execute collimator physics kernel.
   * Execute collimator physics kernel.
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-			int numpartices, bool enableRutherforScattering = true);
+			int numpartices, bool enableRutherfordScattering = true);
  /** 
   * Special case CollimatorPhysics kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@ -168,17 +120,12 @@ public:
      return DKS_ERROR;
    }
-  /** 
+  /** Sort particle array on GPU.
   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /** 
   * Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@ -187,25 +134,14 @@ public:
      return DKS_ERROR;
    }
-  /** 
+  /** BorisPusher push function for integration from OPAL.
   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracker documentation in OPAL
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);
-  /** 
+  /** BorisPusher push function with transformto function form OPAL
   * BorisPusher kick function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 
  /** 
   * BorisPusher push function with transformto function form OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@ -10,11 +10,7 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"
-/**
+class CudaFFT : public DKSFFT{
 * Cuda FFT class based on BaseFFT interface.
 * Uses cuFFT library to perform FFTs on nvidias GPUs.
 */
 class CudaFFT : public BaseFFT {
 private:
@ -38,7 +34,7 @@ public:
  ~CudaFFT();
  /**
-   * Init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);
@ -46,21 +42,45 @@ public:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
  /**
-   * Destroy default FFT plans
+   * Info: destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();
  /*
    Info: execute complex to complex double precision fft using cufft library
    Return: success or error code
  */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
  /*
    Info: execute ifft 
    Return: success or error code
  */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute normalize using cuda kernel for complex to complex iFFT
    Return: success or error code
  */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute real to complex double precision FFT
    Return: success or error code
  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute complex to real double precision FFT
    Return: success or error code
  */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute normalize for complex to real iFFT
    Return: success or error code
  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
 };
--- a/src/CUDA/CudaGreensFunction.cu
+++ b/src/CUDA/CudaGreensFunction.cu
@ -189,11 +189,12 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
      tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
    tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
-    
+  
    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
-
+  
    rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
  }
 }
@ -272,20 +273,28 @@ __global__ void mirroredRhoField(double *rho2_m,
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;
    double data = rho2_m[id1];
-    if (i != 0) rho2_m[id2] = data;
+    if (i != 0)
      rho2_m[id2] = data;
-    if (j != 0) rho2_m[id3] = data;
+    if (j != 0)
      rho2_m[id3] = data;
-    if (i != 0 && j != 0) rho2_m[id4] = data;
+    if (i != 0 && j != 0)
      rho2_m[id4] = data;
-    if (k != 0) rho2_m[id5] = data;
+    if (k != 0) 
      rho2_m[id5] = data;
-    if (k !=  0 && i != 0) rho2_m[id6] = data;
+    if (k !=  0 && i != 0)
      rho2_m[id6] = data;
-    if (k!= 0 && j != 0) rho2_m[id7] = data;
+    if (k!= 0 && j != 0)
      rho2_m[id7] = data;
-    if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
+    if (k != 0 && j != 0 & i != 0)
      rho2_m[id8] = data;
  }
@ -354,9 +363,9 @@ CudaGreensFunction::~CudaGreensFunction() {
    delete m_base;
 }
-int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-				       double hr_m0, double hr_m1, double hr_m2,
+					    double hr_m0, double hr_m1, double hr_m2,
-				       int streamId)
+					    int streamId)
 {
  int thread = 128;
@ -364,7 +373,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
  //if no stream specified use default stream
  if (streamId == -1) {
-    kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
    return DKS_SUCCESS;
  }
@ -372,7 +381,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
    return DKS_SUCCESS;
  }
@ -380,17 +389,15 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
 }
-int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, 
+int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, 
-						  int I, int J, int K,
+						       int I, int J, int K,
-						  int streamId) 
+						       int streamId) 
 {
  int thread = 128;
  int block = (I * J * K / thread) + 1;
  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
  if (streamId == -1) {
    m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
    kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, 
 					    2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -399,7 +406,6 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
    kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, 
 						  2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -409,22 +415,22 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
  return DKS_ERROR;
 }
-int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
+int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  int thread = 128;
  int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
  if (streamId == -1) {
-    mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread >>>( (double *) rho2_m,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
+    mirroredRhoField<<< block, thread >>>( (double *) mem_ptr,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
    return DKS_SUCCESS;
  }
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
+    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
    return DKS_SUCCESS;
  }
@ -434,13 +440,13 @@ int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int st
  return DKS_ERROR;
 }
-int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, 
+int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, 
-					      int size, int streamId) {
+						   int size, int streamId) {
  int threads = 128;
  int blocks = size / threads + 1;
  int datasize = 2 * threads * sizeof(cuDoubleComplex);
-  
+
  if (streamId == -1) {
    multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1, 
 							     (cuDoubleComplex*)ptr2, 
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@ -2,18 +2,17 @@
 #define H_CUDA_GREENSFUNCTION
 #include <iostream>
-#include <cmath>
+#include <math.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuComplex.h>
 #include "cublas_v2.h"
-#include "../Algorithms/GreensFunction.h"
+
 #include "CudaBase.cuh"
-/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
+class CudaGreensFunction {
 class CudaGreensFunction : public GreensFunction{
 private:
@ -31,32 +30,32 @@ public:
  /* destructor */
  ~CudaGreensFunction();
-  /**
+  /*
    Info: calc integral on device memory (taken from OPAL src code)
    Return: success or error code
  */
-  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+  int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-		       double hr_m0, double hr_m1, double hr_m2, 
+			  double hr_m0, double hr_m1, double hr_m2, 
-		       int streamId = -1);
+			  int streamId = -1);
-  /**
+  /*
    Info: integration of rho2_m field (taken from OPAL src code)
    Return: success or error code
  */
-  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
+  int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
-				  int streamId = -1);
+				     int streamId = -1);
-  /**
+  /*
    Info: mirror rho field (taken from OPAL src code)
    Return: success or error code
  */
-  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
+  int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
-  /**
+  /*
    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
    Return: success or error code
  */
-  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
+  int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
 };
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@ -10,7 +10,6 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"
 /** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {
 private:
--- a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@ -83,56 +83,6 @@ __device__ double ifld(double t, double alpha, double phi, double nu, double lam
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }
 /**
  * Evaluate the "ifgk" theory term at time t.
  * NOTE(review): presumably an internal-field Gaussian Kubo-Toyabe function --
  * confirm against the host fitting code.
  *
  * The value is (1-alpha)*exp(-rateL) + alpha*osc*exp(-0.5*sigma^2*t^2), where
  *   rateL = 1.0 if beta < 0.001, otherwise pow(lambda*t, beta)
  *   osc   = 1 - sigma^2*t^2                              if nu < 0.01
  *   osc   = cos(wt) - sigma^2*t/(TWO_PI*nu) * sin(wt)    otherwise, wt = TWO_PI*nu*t
  *
  * t      - time at which the term is evaluated
  * alpha  - weight of the oscillating part; (1 - alpha) weights exp(-rateL)
  * nu     - frequency; values below 0.01 select the non-oscillating expression
  * sigma  - width used in the damping factor exp(-0.5*sigma^2*t^2)
  * lambda - rate of the exp(-rateL) part; a negative value makes the function return 0.0
  * beta   - exponent applied to lambda*t
  *
  * Returns the evaluated term, or 0.0 when lambda < 0.
  */
 __device__ double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
  double wt = TWO_PI*nu*t;
  double rate2 = sigma*sigma*t*t;
  double rateL = 0.0;
  double result = 0.0;
  // reject negative lambda (lambda == 0 is accepted)
  if (lambda < 0.0)
    return 0.0;
  if (beta < 0.001) {
    rateL = 1.0;
  } else {
    rateL = pow(lambda*t, beta);
  }
  if (nu < 0.01) {
    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
  } else {
    // sigma^2*t^2/wt is written as sigma^2*t/(TWO_PI*nu): identical for t != 0,
    // but it no longer evaluates 0/0 = NaN at t == 0. nu >= 0.01 in this branch,
    // so the divisor cannot vanish. Same form as the a/(TWO_PI*nu) factor in ifll.
    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t/(TWO_PI*nu)*sin(wt))*exp(-0.5*rate2);
  }
  return result;
 }
 /**
  * Evaluate the "ifll" theory term at time t.
  * NOTE(review): presumably the Lorentzian counterpart of ifgk -- confirm
  * against the host fitting code.
  *
  * The value is (1-alpha)*exp(-rateL) + alpha*osc*exp(-a*t), where
  *   rateL = 1.0 if beta < 0.001, otherwise pow(lambda*t, beta)
  *   osc   = 1 - a*t                                if nu < 0.01
  *   osc   = cos(wt) - a/(TWO_PI*nu) * sin(wt)      otherwise, wt = TWO_PI*nu*t
  *
  * Returns the evaluated term, or 0.0 when lambda < 0.
  */
 __device__ double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
  // a negative lambda short-circuits to zero
  if (lambda < 0.0)
    return 0.0;

  const double at = a*t;
  // for a (near) vanishing beta the exponent of the first term is pinned to 1
  const double rateL = (beta < 0.001) ? 1.0 : pow(lambda*t, beta);

  // low-frequency limit: no oscillation
  if (nu < 0.01)
    return (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);

  const double wt = TWO_PI*nu*t;
  return (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
 }
 /**
  * Bessel term: j0 evaluated at TWO_PI*nu*t + DEG_TO_RAD*phi.
  * phi is scaled by DEG_TO_RAD before use (so it is presumably given in degrees).
  */
 __device__ double b(double t, double phi, double nu) {
  double arg = TWO_PI*nu*t + DEG_TO_RAD*phi;
  return j0(arg);
 }
--- a/src/DKSBase.cpp
+++ b/src/DKSBase.cpp
@ -103,14 +103,25 @@ DKSBase::DKSBase() {
 #ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
 #endif
 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif
 #ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
 #endif
 }
@ -127,14 +138,25 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
 #ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
 #endif
 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif
 #ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
 #endif
 }
@ -151,16 +173,27 @@ DKSBase::~DKSBase() {
  if (m_function_name != NULL)
    delete[] m_function_name;
 #ifdef DKS_CUDA
  delete cfft;
  delete cgreens;
  delete cchi;
  delete ccol;
  delete cbase;
 #endif
 #ifdef DKS_OPENCL
  delete oclfft;
  delete oclchi;
  delete oclcol;
  delete oclbase;
 #endif
 #ifdef DKS_MIC
  delete micfft;
  delete miccol;
  delete micgreens;
  delete micchi;
  delete micbase;
 #endif
@ -274,45 +307,38 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
    return DKS_ERROR;
 }
-int DKSBase::setupDevice() {
+/*
-
+  init device
-  int ierr = DKS_ERROR;
+*/
 int DKSBase::initDevice() {
  //if api is not set default is OpenCL
  if (!m_api_set) {
    setDevice("-gpu", 4);
    setAPI(API_OPENCL, 6);
-    ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+    return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
  } else {
    if (apiOpenCL()) {
      if (!m_device_set) {
 	setDevice("-gpu", 4);
 	setAPI(API_OPENCL, 6);
-	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+	return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
      } else {
 	setAPI(API_OPENCL, 6);
-	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
+	return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
      }
    } else if (apiCuda()) {
      setDevice("-gpu", 4);
      setAPI(API_CUDA, 4);			
-      ierr = CUDA_SAFECALL(DKS_SUCCESS);
+      return CUDA_SAFECALL(DKS_SUCCESS);
    } else if (apiOpenMP()) {
      setDevice("-mic", 4);
      setAPI(API_OPENMP, 6);
-      ierr = MIC_SAFECALL(DKS_SUCCESS);
+      return MIC_SAFECALL(DKS_SUCCESS);
    }
  }
-  return ierr;
+  return DKS_ERROR;
 }
 /*
  init device
 */
 int DKSBase::initDevice() {
  return setupDevice();
 }
 /* 
@ -430,15 +456,358 @@ int DKSBase::syncDevice() {
  return DKS_ERROR;
 }
 /* setup fft plans to reuse if multiple ffts of same size are needed */
 int DKSBase::setupFFT(int ndim, int N[3]) {
-int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
+  if (apiCuda()) {
-  if (apiCuda())
+    return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
-    return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
+  } else if (apiOpenMP()) {
-  if (apiOpenCL())
+    //micbase.mic_setupFFT(ndim, N);
-    return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
+    //BENI: setting up RC and CR transformations on MIC
    int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
    int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
    if (ierr1 != DKS_SUCCESS)
      return ierr1;
    if (ierr2 != DKS_SUCCESS)
      return ierr2;
    return DKS_SUCCESS;
  }
  return DKS_ERROR;
 }
 //BENI:
 /**
  * Prepare a real-to-complex FFT on the selected API.
  * CUDA:   forwards to cfft->setupFFT(ndim, N); scale is not used on this path.
  * OpenMP: forwards to micfft->setupFFTRC(ndim, N, scale).
  * Returns the value produced by the matching *_SAFECALL wrapper, or DKS_ERROR
  * when neither API is active.
  */
 int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
  int ierr = DKS_ERROR;
  if (apiCuda()) {
    ierr = CUDA_SAFECALL(cfft->setupFFT(ndim, N));
  } else if (apiOpenMP()) {
    ierr = MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
  }
  return ierr;
 }
 //BENI:
 /**
  * Prepare a complex-to-real FFT on the selected API.
  * CUDA:   forwards to cfft->setupFFT(ndim, N); scale is not used on this path.
  * OpenMP: forwards to micfft->setupFFTCR(ndim, N, scale).
  * Returns the value produced by the matching *_SAFECALL wrapper, or DKS_ERROR
  * when neither API is active.
  */
 int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
  int ierr = DKS_ERROR;
  if (apiCuda()) {
    ierr = CUDA_SAFECALL(cfft->setupFFT(ndim, N));
  } else if (apiOpenMP()) {
    ierr = MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
  }
  return ierr;
 }
 /**
  * Dispatch an FFT on data_ptr to the selected API.
  * OpenCL: loads OpenCLFFT.cl first and fails with DKS_ERROR if that does not succeed.
  * CUDA:   the only path that receives streamId.
  * OpenMP: forwards to the MIC FFT object.
  * Returns DKS_ERROR (after a debug message) when no API matches.
  */
 int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    // kernel file must be loaded before the FFT can be executed
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
  }
  if (apiCuda()) {
    return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch an inverse FFT on data_ptr to the selected API.
  * OpenCL: loads OpenCLFFT.cl first and fails with DKS_ERROR if that does not succeed.
  * CUDA:   the only path that receives streamId.
  * OpenMP: forwards to the MIC FFT object.
  * Returns DKS_ERROR (after a debug message) when no API matches.
  */
 int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    // kernel file must be loaded before the inverse FFT can be executed
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
  }
  if (apiCuda()) {
    return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the FFT normalization of data_ptr to the selected API.
  * OpenCL: loads OpenCLFFT.cl first and fails with DKS_ERROR if that does not succeed.
  * CUDA:   the only path that receives streamId.
  * OpenMP: forwards to the MIC FFT object.
  * Returns DKS_ERROR (after a debug message) when no API matches.
  */
 int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiOpenCL()) {
    // kernel file must be loaded before the normalization kernel can run
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
  }
  if (apiCuda()) {
    return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch a real-to-complex FFT (real_ptr -> comp_ptr) to the selected API.
  * Implemented for CUDA (receives streamId) and OpenMP/MIC (no stream argument).
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch a complex-to-real FFT to the selected API.
  * Implemented for CUDA (receives streamId) and OpenMP/MIC (no stream argument).
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
  }
  if (apiOpenMP()) {
    // NOTE(review): the MIC call takes (comp_ptr, real_ptr), the reverse of the
    // CUDA argument order -- confirm this matches MICFFT::executeCRFFT
    return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
  }
  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
 /**
  * Normalize the result of a complex-to-real iFFT; only CUDA has an implementation.
  * For every other API a debug message is emitted and DKS_SUCCESS is returned.
  * NOTE(review): siblings return DKS_ERROR on the fallthrough path; the success
  * code here may be deliberate (e.g. a backend that scales inside the transform)
  * -- confirm before changing.
  */
 int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
  if (!apiCuda()) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_SUCCESS;
  }
  return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
 }
 /**
  * Run the transpose kernel on mem_ptr; only OpenCL has an implementation.
  * OpenCLTranspose.cl is loaded first; a failed load yields DKS_ERROR.
  * For every other API a debug message is emitted and DKS_ERROR is returned.
  */
 int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
  if (!apiOpenCL()) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }
  if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") != DKS_SUCCESS)
    return DKS_ERROR;
  return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
 }
 /**
  * Dispatch the Greens integral calculation on tmp_ptr to the selected API.
  * CUDA:   receives all arguments, including NI, NJ and streamId.
  * OpenMP: receives tmp_ptr, I, J, K and the three hz_m values only.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
                                 double hz_m0, double hz_m1, double hz_m2, int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
                                                      hz_m0, hz_m1, hz_m2, streamId) );
  }
  if (apiOpenMP()) {
    //BENI:
    return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the Greens function integration (tmp_ptr -> mem_ptr) to the selected API.
  * Implemented for CUDA (receives streamId) and OpenMP/MIC (no stream argument).
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
                                    int I, int J, int K, int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the rho-field mirroring of mem_ptr to the selected API.
  * Implemented for CUDA (receives streamId) and OpenMP/MIC (no stream argument).
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the element-wise multiplication of two complex fields to the selected API.
  * Implemented for CUDA (receives streamId) and OpenMP/MIC (no stream argument).
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the PHistoTFFcn chi-square evaluation to the selected API.
  * CUDA:   forwards all arguments to cchi->cuda_PHistoTFFcn.
  * OpenCL: loads OpenCLChiSquare.cl first (DKS_ERROR if that fails), then forwards
  *         the same arguments to oclchi->ocl_PHistoTFFcn.
  * result is passed by reference to the backend call.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
                              double fTimeResolution, double fRebin,
                              int sensors, int length, int numpar, double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
                                                fTimeResolution, fRebin,
                                                sensors, length, numpar,
                                                result));
  }
  if (apiOpenCL()) {
    // kernel file must be loaded before the chi-square kernel can run
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
                                                   fTimeResolution, fRebin,
                                                   sensors, length, numpar, result));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the single-Gauss TF chi-square evaluation to the selected API.
  * CUDA:   forwards all arguments to cchi->cuda_singleGaussTF.
  * OpenCL: loads OpenCLChiSquare.cl first (DKS_ERROR if that fails), then forwards
  *         the same arguments to oclchi->ocl_singleGaussTF.
  * result is passed by reference to the backend call.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                                double fTimeResolution, double fRebin, double fGoodBinOffset,
                                int sensors, int length, int numpar,
                                double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
                                                  fTimeResolution, fRebin, fGoodBinOffset,
                                                  sensors, length, numpar,
                                                  result));
  }
  if (apiOpenCL()) {
    // kernel file must be loaded before the chi-square kernel can run
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
                                                     fTimeResolution, fRebin, fGoodBinOffset,
                                                     sensors, length, numpar, result));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the double-Lorentz TF chi-square evaluation to the selected API.
  * CUDA:   forwards all arguments to cchi->cuda_doubleLorentzTF.
  * OpenCL: loads OpenCLChiSquare.cl first (DKS_ERROR if that fails), then forwards
  *         the same arguments to oclchi->ocl_doubleLorentzTF.
  * result is passed by reference to the backend call.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                                  double fTimeResolution, double fRebin, double fGoodBinOffset,
                                  int sensors, int length, int numpar,
                                  double &result)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
                                                    fTimeResolution, fRebin, fGoodBinOffset,
                                                    sensors, length, numpar,
                                                    result));
  }
  if (apiOpenCL()) {
    // kernel file must be loaded before the chi-square kernel can run
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
                                                       fTimeResolution, fRebin, fGoodBinOffset,
                                                       sensors, length, numpar, result));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the collimator physics step to the selected API.
  * All three backends (CUDA, OpenCL, OpenMP/MIC) receive only mem_ptr, par_ptr
  * and numparticles. numparams, numaddback and numdead are accepted but neither
  * read nor written in this function.
  * OpenCL loads OpenCLCollimatorPhysics.cl first and returns DKS_ERROR if that fails.
  * Returns DKS_ERROR (after a debug message) when no API matches.
  */
 int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
                                    int numparticles, int numparams,
                                    int &numaddback, int &numdead)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }
  if (apiOpenCL()) {
    // kernel file must be loaded before the collimator kernel can run
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the collimator physics step, with the Rutherford scattering switch.
  * CUDA:   receives enableRutherfordScattering in addition to the data pointers.
  * OpenMP: receives mem_ptr, par_ptr and numparticles only; the flag is not forwarded.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
                                     bool enableRutherfordScattering)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles,
                                                  enableRutherfordScattering) );
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the collimator physics step for particle data passed as separate
  * arrays (label, localID, r{x,y,z}, p{x,y,z}); only OpenMP/MIC has an implementation.
  * For every other API a debug message is emitted and DKS_ERROR is returned.
  */
 int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                                       void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                       void *px_ptr, void *py_ptr, void *pz_ptr,
                                       void *par_ptr, int numparticles)
 {
  if (!apiOpenMP()) {
    DEBUG_MSG("No implementation for selceted platform");
    return DKS_ERROR;
  }
  return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
                                                    rx_ptr, ry_ptr, rz_ptr,
                                                    px_ptr, py_ptr, pz_ptr,
                                                    par_ptr,  numparticles) );
 }
 /**
  * Dispatch the collimator physics particle sort to the selected API.
  * Implemented for CUDA and OpenMP/MIC; numaddback is passed by reference to
  * the backend call.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the collimator physics particle sort for particle data passed as
  * separate arrays; only OpenMP/MIC has an implementation.
  * numaddback is passed by reference to the backend call.
  * For every other API a debug message is emitted and DKS_ERROR is returned.
  */
 int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                                           void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                           void *px_ptr, void *py_ptr, void *pz_ptr,
                                           void *par_ptr, int numparticles, int &numaddback)
 {
  if (!apiOpenMP()) {
    DEBUG_MSG("No implementation for selceted platform");
    return DKS_ERROR;
  }
  return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
                                                       rx_ptr, ry_ptr, rz_ptr,
                                                       px_ptr, py_ptr, pz_ptr,
                                                       par_ptr,  numparticles, numaddback));
 }
 int DKSBase::callInitRandoms(int size, int seed) {
  if (apiCuda()) 
@ -452,3 +821,43 @@ int DKSBase::callInitRandoms(int size, int seed) {
  return DKS_ERROR;
 }
 /**
  * Dispatch the ParallelTTracker push step to the selected API.
  * CUDA and OpenMP/MIC receive the identical argument list, including streamId.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
                                       void *dt_ptr, double dt, double c,
                                       bool usedt, int streamId)
 {
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
                                                    usedt, streamId));
  }
  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
                                                     c, usedt, streamId));
  }
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
 /**
  * Dispatch the ParallelTTracker push-with-transform step to the selected API.
  * CUDA and OpenMP/MIC receive the identical argument list, including streamId.
  * Returns DKS_ERROR (after a debug message) for any other API.
  */
 int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
                                                void *lastSec_ptr, void *orient_ptr,
                                                int npart, int nsec, void *dt_ptr, double dt,
                                                double c, bool usedt, int streamId)
 {
  if (apiCuda())
    return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
                                                             lastSec_ptr, orient_ptr,
                                                             npart, nsec, dt_ptr, dt,
                                                             c, usedt, streamId));
  if (apiOpenMP())
    return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
                                                              lastSec_ptr, orient_ptr,
                                                              npart, nsec, dt_ptr, dt,
                                                              c, usedt, streamId));
  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
 }
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@ -1,3 +1,11 @@
 /** DKSBase class.
 * DKSBase.h
 * Author: Uldis Locans
 * Date: 15.09.2014
 * Base class of Dynamic Kernel Scheduler that handles the function calls
 * from host application to DKS
 */
 #ifndef H_DKS_BASE
 #define H_DKS_BASE
@ -21,24 +29,34 @@
 #endif
 #include "OpenCL/OpenCLBase.h"
 #include "OpenCL/OpenCLFFT.h"
 #include "OpenCL/OpenCLChiSquare.h"
 #include "OpenCL/OpenCLCollimatorPhysics.h"
 #endif
 #ifdef DKS_CUDA
 #include "CUDA/CudaBase.cuh"
 #include "CUDA/CudaFFT.cuh"
 #include "CUDA/CudaGreensFunction.cuh"
 #include "CUDA/CudaChiSquare.cuh"
 #include "CUDA/CudaCollimatorPhysics.cuh"
 #include "nvToolsExt.h"
 #endif
 #ifdef DKS_MIC
 #include "MIC/MICBase.h"
 #include "MIC/MICChiSquare.h"
 #include "MIC/MICFFT.h"
 #include "MIC/MICCollimatorPhysics.h"
 #include "MIC/MICGreensFunction.hpp"
 #endif
 #include "Algorithms/CollimatorPhysics.h"
 #include "Algorithms/FFT.h"
 #include "AutoTuning/DKSConfig.h"
-/** 
+/** DKSBase class for handling function calls to DKS library */
 * API for handling communication function calls to DKS library.
 * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
  * memory management, data transfer and other basic communication functions between
 * the host and device.
 */
 class DKSBase {
 private:
@ -55,14 +73,25 @@ private:
 #ifdef DKS_OPENCL	
  OpenCLBase *oclbase;
  OpenCLFFT *oclfft;
  OpenCLChiSquare *oclchi;
  OpenCLCollimatorPhysics *oclcol;
 #endif
 #ifdef DKS_CUDA
  CudaBase *cbase;
  CudaFFT *cfft;
  CudaGreensFunction *cgreens;
  CudaChiSquare *cchi;
  CudaCollimatorPhysics *ccol;
 #endif
 #ifdef DKS_MIC
  MICBase *micbase;
  MICFFT *micfft;
  MICCollimatorPhysics *miccol;
  MICGreensFunction *micgreens;
  MICChiSquare *micchi;
 #endif
 protected:
@ -71,7 +100,7 @@ protected:
  DKSConfig dksconfig;
  /** 
-   * Check if current API is set to OpenCL.
+   * Check if current API is set to OpenCL
    * Return true/false whether the current API is OpenCL
   */
  bool apiOpenCL();
@ -88,11 +117,11 @@ protected:
   */
  bool apiOpenMP();
-  /** Check if device is GPU. */
+  /** Check if device is GPU */
  bool deviceGPU();
-  /** Check if device is CPU. */
+  /** Check if device is CPU */
  bool deviceCPU();
-  /** Check if device is MIC. */
+  /** Check if device is MIC */
  bool deviceMIC();
  /**
@ -110,12 +139,6 @@ protected:
  }
 #endif
 #ifdef DKS_MIC
  MICBase *getMICBase() {
    return micbase;
  }
 #endif
   /** Call OpenCL base to load specified kernel file.
   *
   */
@ -131,7 +154,6 @@ protected:
    return device_name;
  }
 public:
  /** 
@ -151,11 +173,6 @@ public:
   */
  ~DKSBase();
  /** Function to initialize objects based on the device used.
   *  
   */
  int setupDevice();
  /** Turn on auto tuning */
  void setAutoTuningOn() { m_auto_tuning = true; }
@ -388,7 +405,7 @@ public:
    } else if (apiOpenMP()) {
 #ifdef DKS_MIC
      void * mem_ptr = NULL;
-      mem_ptr = micbase->mic_allocateMemory<T>(elements);	
+      mem_ptr = micbase.mic_allocateMemory<T>(elements);	
      return mem_ptr;
 #endif
    }
@ -481,7 +498,7 @@ public:
      return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
+      return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
    } 
@ -515,7 +532,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
+      return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
    } 
    return DKS_ERROR;
@ -815,7 +832,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
+      return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
    } 
    return DKS_ERROR;
@ -843,7 +860,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements, 
+      return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements, 
 						       streamId, offset));
    }
@ -863,32 +880,229 @@ public:
    else if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
    else if (apiOpenMP())
-      return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
+      return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
    return DKS_ERROR;
  }
-  /**
+
-   * Create random numbers on the device and fille mem_data array
+  ///////////////////////////////////////////////
  ///////Function library part of dksbase////////
  ///////////////////////////////////////////////
  /** 
   * Setup FFT function.
   * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
   * each fft will do its own setup according to fft size and dimensions.
   * TODO: opencl and mic implementations
   */
-  int callCreateRandomNumbers(void *mem_ptr, int size);
+  int setupFFT(int ndim, int N[3]);
  //BENI:
  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
  //BENI:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
  /** 
   * Call complex-to-complex fft.
   * Executes in place complex to complex fft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation
   */
  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex-to-complex ifft.
   * Executes in place complex to complex ifft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation.
   */
  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to complex ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size
   * TODO: mic implementation.
   */
  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call real to complex FFT.
   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations
   */
  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex to real iFFT.
   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations.
   */
  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to real ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size.
   * TODO: opencl and mic implementations.
   */
  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
  /**
   * Transpose 2D and 3D arrays, OpenCL implementation
   * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose 
   */
  int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 			    int I, int J, int K, int streamId = -1);
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * TODO: opencl and mic implementations.
   */
  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
  /** 
   * Element by element multiplication.
   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
   * TODO: opencl and mic implementations.
   */
  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
  /** 
   * Chi square for parameter fitting on device.
   * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for 
   * intermediate results. Chi square results are put in &results
   */
  int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
 		      double fTimeResolution, double fRebin,
 		      int sensors, int length, int numpar, double &result);
  /** 
   * max-log-likelihood for parameter fitting on device.
   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
   * mem_par - pointer to parameter set, mem_results - pointer for 
   * intermediate results. Chi square results are put in &results.
   * TODO: opencl and mic implementations.
   */
  int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 			double fTimeResolution, double fRebin, double fGoodBinOffser,
 			int sensors, int length, int numpar,
 			double &result);
  /** 
   * max-log-likelihood for parameter fitting on device.
   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
   * mem_par - pointer to parameter set, mem_results - pointer for 
   * intermediate results. Chi square results are put in &results.
   * TODO: opencl and mic implementations.
   */
  int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
 			  double fTimeResolution, double fRebin, double fGoodBinOffser,
 			  int sensors, int length, int numpar,
 			  double &result);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			    int numparticles, int numparams, 
 			    int &numaddback, int &numdead);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles, 
 			     bool enableRutherfordScattering = true);
  /** 
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
   */
  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
 			       void *par_ptr, int numparticles);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /**
   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles, int &numaddback);
  /** 
   * Init random number states and save for reuse on device.
   * If seed is -1, a random seed based on current time is taken.
   * TODO: opencl and mic implementations.
   */
  int callInitRandoms(int size, int seed = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 			       void *dt_ptr, double dt, double c, 
 			       bool usedt = false, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					void *lastSec_ptr, void *orient_ptr, 
 					int npart, int nsec, void *dt_ptr,
 					double dt, double c, bool usedt = false, 
 					int streamId = -1);
  /**
   * Print memory information on device (total, used, available)
   * TODO: opencl and mic implementation
   */
  int callMemInfo() {
    #ifdef DKS_CUDA
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_memInfo());
-    #endif
+
    return DKS_ERROR;
  }
@ -897,12 +1111,10 @@ public:
   * Used for debugging and timing purposes only.
   */
  void oclEventInfo() {
    #ifdef DKS_OPENCL
    if (apiOpenCL())
      return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
    #endif
  }
  }
  /** 
   * Test function to profile opencl kernel calls.
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@ -24,7 +24,6 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
  //if we are not auto tuning and the size of the problem has changed find the new parameters
  //from autotuning config file
  if (!isAutoTuningOn() && length != chiSquareSize_m) {
    /*
    int numBlocks, blockSize;
    std::string device_name;
    getDeviceName(device_name);
@ -34,8 +33,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
 				 length, "BlockSize", blockSize);
    chiSq->setKernelParams(numBlocks, blockSize);
-    std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
+    //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
-    */
+
    chiSquareSize_m = length;
  } 
--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@ -8,7 +8,6 @@
 #include "AutoTuning/DKSAutoTuningTester.h"
 #include "DKSBase.h"
 #include "DKSFFT.h"
 #include "Algorithms/ChiSquareRuntime.h"
@ -20,12 +19,7 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif
-/**
+class DKSBaseMuSR : public DKSBase {
 * API to handle musrfit calls to DKS library.
 * Using ChiSquareRuntime interface allows to call chi square functions on the 
 * GPU or CPU using CUDA or OpenCL.
 */
 class DKSBaseMuSR : public DKSFFT {
 private:
--- a/src/DKSDefinitions.h
+++ b/src/DKSDefinitions.h
@ -62,12 +62,6 @@
 #define OPENCL_SAFEINIT(x) ( NULL )
 #endif
 #ifdef DKS_AMD
 #define OPENCL_SAFEINIT_AMD(x) ( x )
 #else
 #define OPENCL_SAFEINIT_AMD(x) ( NULL )
 #endif
 #ifdef DKS_MIC
 #define MIC_SAFEINIT(x) ( x )
 #else
--- a/src/DKSFFT.cpp
+++ b/src/DKSFFT.cpp
@ -1,147 +0,0 @@
 #include "DKSFFT.h"
 /** Construct a DKSFFT with no FFT backend attached yet. */
 DKSFFT::DKSFFT()
   : dksfft(nullptr)
 {
 }
 /** Release the FFT backend object held in dksfft (deleting a null pointer is a no-op). */
 DKSFFT::~DKSFFT() {
  delete dksfft;
 }
 /**
  * Set up FFT plans so they can be reused when several FFTs of the same size are needed.
  * Creates the backend object for the selected API and forwards ndim and N to it.
  * A backend created by an earlier call is released first.
  * Returns DKS_SUCCESS on success; DKS_ERROR (or the backend's error code on the
  * OpenMP path) otherwise, including when no backend object could be created.
  */
 int DKSFFT::setupFFT(int ndim, int N[3]) {
  // A repeated call must not leak the backend created by the previous one.
  delete dksfft;
  dksfft = nullptr;

  if (apiCuda()) {
    dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
    // NOTE(review): presumably NULL when CUDA support is compiled out, like the
    // OpenCL *_SAFEINIT macros in DKSDefinitions.h - never dereference it blindly.
    if (dksfft == nullptr)
      return DKS_ERROR;
    return dksfft->setupFFT(ndim, N);
  } else if (apiOpenCL()) {
    // OPENCL_SAFEINIT_AMD expands to NULL when DKS_AMD is not defined.
    dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
    if (dksfft == nullptr)
      return DKS_ERROR;
    // All three setups are attempted even if an earlier one fails.
    int ierr1 = dksfft->setupFFT(ndim, N);
    int ierr2 = dksfft->setupFFTRC(ndim, N);
    int ierr3 = dksfft->setupFFTCR(ndim, N);
    if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
      return DKS_ERROR;
    return DKS_SUCCESS;
  } else if (apiOpenMP()) {
    //BENI: setting up RC and CR transformations on MIC
    dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
    if (dksfft == nullptr)
      return DKS_ERROR;
    int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
    // Form the grid size in double: the int product overflows for large grids.
    const double gridSize = static_cast<double>(N[0]) * N[1] * N[2];
    int ierr2 = dksfft->setupFFTCR(ndim, N, 1./gridSize);
    if (ierr1 != DKS_SUCCESS)
      return ierr1;
    if (ierr2 != DKS_SUCCESS)
      return ierr2;
    return DKS_SUCCESS;
  }

  return DKS_ERROR;
 }
 /**
  * BENI: set up the real-to-complex transformation on the existing FFT backend.
  * scale is forwarded only on the OpenMP path; the CUDA path calls the backend's
  * generic setupFFT. Returns DKS_ERROR when no backend exists or no API matches.
  */
 int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
  // No backend object yet (setupFFT not called or backend unavailable).
  if (dksfft == nullptr)
    return DKS_ERROR;

  if (apiCuda())
    return dksfft->setupFFT(ndim, N);
  if (apiOpenCL())
    return dksfft->setupFFTRC(ndim, N);
  if (apiOpenMP())
    return dksfft->setupFFTRC(ndim, N, scale);

  return DKS_ERROR;
 }
 /**
  * BENI: set up the complex-to-real transformation on the existing FFT backend.
  * scale is forwarded only on the OpenMP path; the CUDA path calls the backend's
  * generic setupFFT. Returns DKS_ERROR when no backend exists or no API matches.
  */
 int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
  // No backend object yet (setupFFT not called or backend unavailable).
  if (dksfft == nullptr)
    return DKS_ERROR;

  if (apiCuda())
    return dksfft->setupFFT(ndim, N);
  if (apiOpenCL())
    return dksfft->setupFFTCR(ndim, N);
  if (apiOpenMP())
    return dksfft->setupFFTCR(ndim, N, scale);

  return DKS_ERROR;
 }
 /**
  * Run a complex-to-complex FFT on data_ptr using the backend of the selected API.
  * streamId is passed on only for CUDA. Returns DKS_ERROR when the API has no
  * implementation or when no FFT backend has been created yet.
  */
 int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  const bool noStreamApi = apiOpenCL() || apiOpenMP();
  if (!noStreamApi && !apiCuda()) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }

  // Guard against use before setupFFT() or with the backend compiled out.
  if (dksfft == nullptr) {
    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
    return DKS_ERROR;
  }

  if (noStreamApi)
    return dksfft->executeFFT(data_ptr, ndim, dimsize);
  return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
 }
 /**
  * Run a complex-to-complex inverse FFT on data_ptr using the backend of the selected API.
  * streamId is passed on only for CUDA. Returns DKS_ERROR when the API has no
  * implementation or when no FFT backend has been created yet.
  */
 int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  const bool noStreamApi = apiOpenCL() || apiOpenMP();
  if (!noStreamApi && !apiCuda()) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }

  // Guard against use before setupFFT() or with the backend compiled out.
  if (dksfft == nullptr) {
    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
    return DKS_ERROR;
  }

  if (noStreamApi)
    return dksfft->executeIFFT(data_ptr, ndim, dimsize);
  return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
 }
 /**
  * Normalize the result of a complex-to-complex inverse FFT on the selected platform.
  * On OpenCL the kernel file OpenCL/OpenCLKernels/OpenCLFFT.cl is loaded first.
  * streamId is passed on only for CUDA. Returns DKS_ERROR when the API has no
  * implementation, the kernel cannot be loaded, or no FFT backend exists yet.
  */
 int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
  if (!apiOpenCL() && !apiCuda() && !apiOpenMP()) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }

  // Guard against use before setupFFT() or with the backend compiled out.
  if (dksfft == nullptr) {
    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
    return DKS_ERROR;
  }

  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
  }

  if (apiCuda())
    return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);

  // Remaining case: OpenMP.
  return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
 }
 /**
  * Run a real-to-complex FFT from real_ptr into comp_ptr on the selected platform.
  * streamId is passed on only for CUDA. Returns DKS_ERROR when the API has no
  * implementation or when no FFT backend has been created yet.
  */
 int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  const bool useCuda = apiCuda();
  if (!useCuda && !(apiOpenCL() || apiOpenMP())) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }

  // Guard against use before setupFFT() or with the backend compiled out.
  if (dksfft == nullptr) {
    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
    return DKS_ERROR;
  }

  if (useCuda)
    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
  return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
 }
 /**
  * Run a complex-to-real FFT from comp_ptr into real_ptr on the selected platform.
  * streamId is passed on only for CUDA. Returns DKS_ERROR when the API has no
  * implementation or when no FFT backend has been created yet.
  */
 int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
  const bool useCuda = apiCuda();
  if (!useCuda && !(apiOpenCL() || apiOpenMP())) {
    DEBUG_MSG("No implementation for selected platform");
    return DKS_ERROR;
  }

  // Guard against use before setupFFT() or with the backend compiled out.
  if (dksfft == nullptr) {
    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
    return DKS_ERROR;
  }

  if (useCuda)
    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
  return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
 }
 /**
  * Normalize the result of a complex-to-real inverse FFT.
  * Only the CUDA path calls into the backend; OpenCL and OpenMP return DKS_ERROR.
  * Also returns DKS_ERROR when no FFT backend has been created yet.
  */
 int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
  if (apiCuda()) {
    // Guard against use before setupFFT() or with the backend compiled out.
    if (dksfft == nullptr) {
      DEBUG_MSG("FFT backend not initialized, call setupFFT first");
      return DKS_ERROR;
    }
    return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
  }

  // No normalization is provided for these two APIs.
  if (apiOpenCL() || apiOpenMP())
    return DKS_ERROR;

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
 }
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@ -1,112 +0,0 @@
 #ifndef H_DKSBASE_FFT
 #define H_DKSBASE_FFT
 #include <iostream>
 #include "AutoTuning/DKSAutoTuning.h"
 #include "DKSBase.h"
 #include "DKSDefinitions.h"
 #include "Algorithms/GreensFunction.h"
 #include "Algorithms/CollimatorPhysics.h"
 #include "Algorithms/FFT.h"
 #ifdef DKS_AMD
 #include "OpenCL/OpenCLFFT.h"
 #endif
 #ifdef DKS_CUDA
 #include "CUDA/CudaFFT.cuh"
 #endif
 #ifdef DKS_MIC
 #include "MIC/MICFFT.h"
 #endif
 /**
 * API to handle calls to DKSFFT.
 * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
 */
 class DKSFFT : public DKSBase {
 private:
  // Backend object that performs the FFT calls; created in setupFFT, deleted in the destructor.
  BaseFFT *dksfft;
  // NOTE(review): no definition of initFFT() is visible in DKSFFT.cpp - confirm it is still needed.
  int initFFT();
 public:
  DKSFFT();
  ~DKSFFT();
  /** 
   * Setup FFT function.
   * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
   * each fft will do its own setup according to fft size and dimensions.
   * TODO: opencl and mic implementations
   */
  int setupFFT(int ndim, int N[3]);
  //BENI: set up the real-to-complex transformation; scale is forwarded to the backend on the OpenMP path only
  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
  //BENI: set up the complex-to-real transformation; scale is forwarded to the backend on the OpenMP path only
  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
  /** 
   * Call complex-to-complex fft.
   * Executes in place complex to complex fft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation
   */
  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex-to-complex ifft.
   * Executes in place complex to complex ifft on the device on data pointed by data_ptr.
   * stream id can be specified to use other streams than default.
   * TODO: mic implementation.
   */
  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to complex ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size
   * TODO: mic implementation.
   */
  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call real to complex FFT.
   * Executes out of place real to complex fft, real_ptr points to real data, comp_ptr - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations
   */
  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Call complex to real iFFT.
   * Executes out of place complex to real ifft, real_ptr points to real data, comp_ptr - points
   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
   * TODO: opencl and mic implementations.
   */
  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
  /** 
   * Normalize complex to real ifft.
   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
   * fft size.
   * TODO: opencl and mic implementations.
   */
  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
 };
 #endif
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@ -10,9 +10,6 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif
 /**
 * API to handle PET image reconstruction calls.
 */
 class DKSImageRecon : public DKSBase {
 private:
@ -25,88 +22,87 @@ public:
  ~DKSImageRecon();
-  /** 
+  /** Image reconstruction analysis calculate source.
-   * Image reconstruction analysis calculate source.
+   * 
   *
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);
-  /** 
+  /** Image reconstruction analysis calculate source.
-   * Image reconstruction analysis calculate source.
+   * 
   *
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);
-  /** 
+  /** Image reconstruction analysis calculate source.
-   * Image reconstruction analysis calculate source.
+   * 
   *
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);
-  /** 
+  /** Image reconstruction analysis calculate source.
-   * Image reconstruction analysis calculate source.
+   * 
   *
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);
-  /** 
+  /** Image reconstruction - generate normalization.
-   * Image reconstruction - generate normalization.
+   * 
   */
  int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);
-  /** 
+  /** Image reconstruction - forward correction.
-   * Image reconstruction - forward correction.
+   * 
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);
-  /** 
+  /** Image reconstruction - backward projection.
-   * Image reconstruction - backward projection.
+   * 
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);
-  /** 
+  /** Set the voxel dimensions on device.
   * Set the voxel dimensions on device.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
-  /** 
+  /** Set the image edge.
   * Set the image edge.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);
-  /** 
+  /** Set the image edge1.
   * Set the image edge1.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
-  /** 
+  /** Set the minimum crystal in one ring values.
   * Set the minimum crystal in one ring values.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
-  /** 
+  /** Set all other required parameters for reconstruction.
   * Set all other required parameters for reconstruction.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
--- a/src/DKSMainPage.dox
+++ b/src/DKSMainPage.dox
@ -1,32 +0,0 @@
 /** 
 \mainpage
 <P>
 <B>
 The aim of DKS is to allow the creation of fast fine tuned kernels using device specific frameworks such as CUDA, OpenCL, OpenACC and OpenMP and accelerator libraries such as Thrust, Nvidia CUDA libraries, Intel MKL or others. On top of that, DKS allows the easy use of these kernels in host applications without providing any device or framework specific details. This approach facilitates the integration of different types of devices in the existing applications with minimal code changes and  makes the device and the host code a lot more manageable.
 </B>
 <P>
 The main parts of DKS are:
 <ul>
 	<li>DKSBase - provides the basic communication functions between host application and hardware accelerators including memory management, data transfer and synchronization.</li>
 	<li>DKSOPAL - provides functions for Object Oriented Particle Accelerator library to offload FFTPoisson calculations and particle matter interaction using Monte Carlo simulations to GPU and Intel MIC</li>
 	<li>DKSBaseMuSR - provides functions to perform parameter fitting for musrfit on the GPU</li>
 	<li>DKSImageRecon - provides functions to perform PET image reconstruction on the GPU</li>
 	<li>DKSFFT - provides functions to perform FFT on the GPU and Intel MIC</li>
 </ul>
 <P>
 <B>
 Developed by
 Uldis Locans
 </B>
 <P>
 For further information contact: locans.uldis@psi.ch - Uldis Locans
 <P>
 <P>
 <a href="https://gitlab.psi.ch/uldis_l/DKS">DKS on gitlab</a><br>
 */
--- a/src/DKSOPAL.cpp
+++ b/src/DKSOPAL.cpp
@ -1,162 +0,0 @@
 #include "DKSOPAL.h"
 /** Construct without selecting an API or device; no helper objects exist yet. */
 DKSOPAL::DKSOPAL()
   : dkscol(nullptr), dksgreens(nullptr)
 {
 }

 /**
  * Construct and select the API and device by name.
  * Both names must be valid NUL-terminated strings (their length is taken with strlen).
  * The helper pointers start out null so the destructor never deletes an
  * indeterminate pointer when initDevice() was not called.
  */
 DKSOPAL::DKSOPAL(const char* api_name, const char* device_name)
   : dkscol(nullptr), dksgreens(nullptr)
 {
  setAPI(api_name, strlen(api_name));
  setDevice(device_name, strlen(device_name));
 }

 /** Release the collimator-physics and Greens-function helpers, if any were created. */
 DKSOPAL::~DKSOPAL() {
  delete dkscol;
  delete dksgreens;
 }

 /**
  * Create the collimator-physics and Greens-function helpers for the selected API.
  * Helpers from an earlier call are released first, so repeated initDevice() calls
  * do not leak them.
  * Returns the status produced by the matching *_SAFECALL macro, or DKS_ERROR when
  * no API matches.
  * NOTE(review): OPENCL_SAFEINIT_AMD expands to NULL when DKS_AMD is undefined, so
  * on the OpenCL path both helpers can remain NULL while the status comes from
  * OPENCL_SAFECALL( DKS_SUCCESS ) - confirm whether that should be reported as an error.
  */
 int DKSOPAL::setupOPAL() {
  // Drop helpers from a previous setup before creating new ones.
  delete dkscol;
  delete dksgreens;
  dkscol = nullptr;
  dksgreens = nullptr;

  int ierr = DKS_ERROR;
  if (apiOpenCL()) {
    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
    //TODO: only enable if AMD libraries are available
    dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
    dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
  } else if (apiCuda()) {
    ierr = CUDA_SAFECALL( DKS_SUCCESS );
    dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
    dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
  } else if (apiOpenMP()) {
    ierr = MIC_SAFECALL( DKS_SUCCESS );
    dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
    dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
  }

  return ierr;
 }
 /**
  * Initialize the device and, if that succeeds, the OPAL helper objects.
  * Returns the setupDevice() status on failure, otherwise the setupOPAL() status.
  */
 int DKSOPAL::initDevice() {
  const int deviceStatus = setupDevice();
  if (deviceStatus != DKS_SUCCESS)
    return deviceStatus;
  return setupOPAL();
 }
 int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 				double hz_m0, double hz_m1, double hz_m2, int streamId) {
    return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, 
 				     hz_m0, hz_m1, hz_m2, streamId);
 }
 int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 				   int I, int J, int K, int streamId) {
  return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
 }
 int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);  
 }
 int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
  return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
 }
 int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 				   int numparticles, int numparams,
 				   int &numaddback, int &numdead, 
 				   bool enableRutherforScattering) 
 {
  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
 }
 int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
 				    bool enableRutherforScattering) 
 {
  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
 }
 int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				      void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				      void *px_ptr, void *py_ptr, void *pz_ptr,
 				      void *par_ptr, int numparticles)
 {
    return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr, 
 					rx_ptr, ry_ptr, rz_ptr, 
 					px_ptr, py_ptr, pz_ptr,
 					par_ptr,  numparticles);
 }
 int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) 
 {
  return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
 }
 int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 					  void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 					  void *px_ptr, void *py_ptr, void *pz_ptr,
 					  void *par_ptr, int numparticles, int &numaddback) 
 {
  return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, 
 						       rx_ptr, ry_ptr, rz_ptr, 
 						       px_ptr, py_ptr, pz_ptr,
 						       par_ptr,  numparticles, numaddback));
 }
 int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 				      void *dt_ptr, double dt, double c, 
 				      bool usedt, int streamId) 
 {
  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
 }
 int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, 
 				      int npart, double c, int streamId) {
  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
 }
 int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				      void *bf_ptr, void *dt_ptr, double charge, double mass,
 				      int npart, double c, int streamId) 
 {
  return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr, 
 				      charge, mass, npart, c, streamId);
 }
 int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					       void *lastSec_ptr, void *orient_ptr, 
 					       int npart, int nsec, void *dt_ptr, double dt, 
 					       double c, bool usedt, int streamId)
 {
  return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
 					       npart, nsec, dt_ptr, dt, c, usedt, streamId);
 }
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@ -1,175 +0,0 @@
 #ifndef H_DKS_OPAL
 #define H_DKS_OPAL
 #include <iostream>
 #include "AutoTuning/DKSAutoTuning.h"
 #include "DKSBase.h"
 #include "DKSFFT.h"
 #include "DKSDefinitions.h"
 #include "Algorithms/GreensFunction.h"
 #include "Algorithms/CollimatorPhysics.h"
 #include "Algorithms/FFT.h"
 #ifdef DKS_AMD
 #include "OpenCL/OpenCLFFT.h"
 #include "OpenCL/OpenCLGreensFunction.h"
 #include "OpenCL/OpenCLCollimatorPhysics.h"
 #endif
 #ifdef DKS_CUDA
 #include "CUDA/CudaFFT.cuh"
 #include "CUDA/CudaGreensFunction.cuh"
 #include "CUDA/CudaCollimatorPhysics.cuh"
 #endif
 #ifdef DKS_MIC
 #include "MIC/MICFFT.h"
 #include "MIC/MICGreensFunction.hpp"
 #include "MIC/MICCollimatorPhysics.h"
 #endif
 /**
 * API to handle OPAL calls to DKS library.
 * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
 * functions.
 */
 class DKSOPAL : public DKSFFT {
 private: 
  /** Collimator physics backend; the SoA/Sort collimator and ParallelTTracker wrappers forward to it. */
  DKSCollimatorPhysics *dkscol;
  /** Greens function backend. NOTE(review): presumably used by the callGreens* and callMirrorRhoField wrappers - confirm. */
  GreensFunction *dksgreens;
  /** NOTE(review): presumably creates dkscol and dksgreens for the selected API - implementation not visible here. */
  int setupOPAL();
 public:
  /** Default constructor. */
  DKSOPAL();
  /** Construct with an API name and a device name. NOTE(review): accepted values are not visible here. */
  DKSOPAL(const char* api_name, const char* device_name);
  ~DKSOPAL();
  /** Initialize the device. Returns an int status code (presumably DKS_SUCCESS or DKS_ERROR - confirm). */
  int initDevice();
  ///////////////////////////////////////////////
  ///////Function library part of dksbase////////
  ///////////////////////////////////////////////
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
   * streamId defaults to -1.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
  /** 
   * Integration step of the greens function from OPAL FFTPoissonsolver.cpp on device.
   * NOTE(review): presumably combines the values in tmp_ptr (filled by callGreensIntegral)
   * into mem_ptr - confirm against the backend implementation.
   * For specifics check OPAL docs.
   * streamId defaults to -1.
   * TODO: opencl and mic implementations.
   */
  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
 			    int I, int J, int K, int streamId = -1);
  /** 
   * Mirror the rho field held in mem_ptr on device, as used by OPAL FFTPoissonsolver.cpp.
   * NOTE(review): the exact mirroring scheme is not visible here - check the backend.
   * For specifics check OPAL docs.
   * streamId defaults to -1.
   * TODO: opencl and mic implementations.
   */
  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
  /** 
   * Element by element multiplication.
   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
   * streamId defaults to -1.
   * TODO: opencl and mic implementations.
   */
  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
  /** 
   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * numaddback and numdead are taken by reference (presumably output counters - confirm).
   * enableRutherfordScattering defaults to true.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			    int numparticles, int numparams, 
 			    int &numaddback, int &numdead,
 			    bool enableRutherfordScattering = true);
  /** 
   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * Variant without the numparams, numaddback and numdead arguments.
   * enableRutherfordScattering defaults to true.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles, 
 			     bool enableRutherfordScattering = true);
  /** 
   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions:
   * particle data is passed as separate arrays (label, local ID, rx/ry/rz, px/py/pz).
   */
  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
 			       void *par_ptr, int numparticles);
  /**
   * Sort step belonging to the degrader Monte Carlo code from OPAL
   * classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * numaddback is taken by reference (presumably an output counter - confirm).
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /**
   * Sort step for particles stored as separate arrays (label, local ID, rx/ry/rz, px/py/pz),
   * belonging to the degrader Monte Carlo code from OPAL
   * classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
   * numaddback is taken by reference (presumably an output counter - confirm).
   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
   * TODO: opencl and mic implementations.
   */
  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles, int &numaddback);
  /**
   * Integration code from ParallelTTracker from OPAL (push step).
   * usedt defaults to false and streamId to -1.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
 			       void *dt_ptr, double dt, double c, 
 			       bool usedt = false, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL (push step with transform).
   * Takes additional lastSec_ptr, orient_ptr and nsec arguments compared to the plain push.
   * usedt defaults to false and streamId to -1.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 					void *lastSec_ptr, void *orient_ptr, 
 					int npart, int nsec, void *dt_ptr,
 					double dt, double c, bool usedt = false, 
 					int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL (push step).
   * Overload without the scalar dt and usedt arguments; streamId defaults to -1.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, 
 			       int npart, double c, int streamId = -1);
  /**
   * Integration code from ParallelTTracker from OPAL (kick step).
   * Takes field pointers ef_ptr and bf_ptr plus charge and mass; streamId defaults to -1.
   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
   */
  int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			       void *bf_ptr, void *dt_ptr, double charge, 
 			       double mass, int npart, double c, int streamId = -1);
 };
 #endif
--- a/src/MIC/CMakeLists.txt
+++ b/src/MIC/CMakeLists.txt
@ -1,22 +1,19 @@
-SET (_SRCS MICBase.cpp MICFFT.cpp)
+SET (_SRCS
-SET (_HDRS MICBase.h MICFFT.h)
+  MICBase.cpp
  MICChiSquare.cpp
  MICFFT.cpp
  MICGreensFunction.cpp  
  MICCollimatorPhysics.cpp
  )
-IF (ENABLE_OPAL)
+SET (_HDRS
-  SET (_SRCS
+  MICBase.h
-    ${_SRCS}
+  MICChiSquare.h
-    MICChiSquare.cpp
+  MICFFT.h
-    MICGreensFunction.cpp  
+  MICCollimatorPhysics.h
-    MICCollimatorPhysics.cpp
+  MICGreensFunction.hpp    
-    )
+  MICMergeSort.h
-
+  )
  SET (_HDRS
    ${_HDRS}
    MICChiSquare.h
    MICCollimatorPhysics.h
    MICGreensFunction.hpp    
    MICMergeSort.h
    )
 ENDIF (ENABLE_OPAL)
 #INCLUDE_DIRECTORIES (
 #  ${CMAKE_CURRENT_SOURCE_DIR}
--- a/src/MIC/MICBase.cpp
+++ b/src/MIC/MICBase.cpp
@ -18,28 +18,30 @@ int MICBase::mic_createRandStreams(int size) {
  int seed = time(NULL);
-  int numThreads = 0;
+#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
 #pragma offload target(mic:m_device_id) inout(numThreads)
  {
    //get the number of threads
    int numThreads;
 #pragma omp parallel
    numThreads = omp_get_num_threads();
  }
-  defaultRndStream =  mic_allocateMemory<VSLStreamStatePtr>(numThreads);
+    //if default rnd stream already allocated delete the array
-  VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
+    if (defaultRndSet == 1)    
-  maxThreads = numThreads; 
+      delete[] defaultRndStream;
-  
+
-#pragma offload target(mic:m_device_id) \
+    //allocate defaultRndStream array
-  in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN)        \
+    defaultRndStream = new VSLStreamStatePtr[numThreads];
-  in(seed)
+
  {
    //create stream states for each thread
 #pragma omp parallel for
    for (int i = 0; i < omp_get_num_threads(); i++)
-      vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
+      vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
  }
-  defaultRndSet = 1;
+    defaultRndSet = 1;
  }
  return DKS_SUCCESS;
 }
@ -47,8 +49,15 @@ int MICBase::mic_createRandStreams(int size) {
 //delete default rand streams
 int MICBase::mic_deleteRandStreams() {
-  //mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
+#pragma offload target(mic:m_device_id) inout(defaultRndSet)
-  return DKS_SUCCESS;
+  {
    if (defaultRndSet == 1) {
      delete[] defaultRndStream;
      defaultRndSet = -1;
    }
  }
  return DKS_ERROR;
 }
 //create a new signal for the mic
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@ -26,82 +26,72 @@
 #define MIC_WIDTH 128
 /** MIC Base class handles device setup and basic communication with the device.
 * Handles device setup, memory management and data transfers.
 */
 class MICBase {
 private:
  std::vector<int> micStreams;
  int maxThreads; 
 protected:
  int defaultRndSet;
 public:
-
+  VSLStreamStatePtr *defaultRndStream;
 //#pragma offload_attribute(push,target(mic))
  void *defaultRndStream; //VSLSStreamStatePtr
  void *testPtr;
 //#pragma offload_attribute(pop)
  int m_device_id;
-  /** constructor */
+  /* constructor */
  MICBase();
-  /** destructor */
+  /* destructor */
  ~MICBase();
-  /**
+  /*
-   * Create MKL rand streams for each thread
+    Info: create MKL rand streams for each thread
-   *  Return: success or error code
+    Return: success or error code
-   */
+  */
  int mic_createRandStreams(int size);
-  /**
+  /*
-   * Delete MKL rand streams
+    Info: delete MKL rand streams
-   * Return: succes or error code
+    Return: succes or error code
-   */
+  */
  int mic_deleteRandStreams();
-  /**
+  /*
-   * Create a new signal for the mic.
+    Info: create a new signal for the mic
-   * Signals can be used for assynchronous data transfers.
+    Return: success or error code
-   * Return: success or error code
+  */
   */
  int mic_createStream(int & streamId);
-  /**
+  /*
-   * Info: get the signal from the vector.
+    Info: get the signal from the vector
-   * Return: mic signal
+    Return: mic signal
  */
  int& mic_getStream(int id);
-  /**
+  /*
-   * Info: delete streams.
+    Info: delete streams
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  int mic_deleteStreams();
-  /**
+  /*
-   * Info: set device id.
+    Info: set device id
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  int mic_setDeviceId(int id);
-  /**
+  /*
-   * Info: get mic devices.
+    Info: get mic devices
-   * Prints information about mic devices.
+    Return: success or error code
-   * Return: success or error code
+  */
   */
  int mic_getDevices();
-  /**
+  /*
-   * Allocate memory on MIC device.
+    Info: allocate memory on MIC device
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  void * mic_allocateMemory(int size) {
@ -114,10 +104,10 @@ public:
    return tmp;
  }
-  /**
+  /*
-   * Transfer data to device.
+    Info: transfer data to device
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -128,10 +118,10 @@ public:
    return DKS_SUCCESS;
  }
-  /**
+  /*
-   * Write data to device, non-blocking.
+    Info: write data to device, non-blocking
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
  {
@ -144,10 +134,10 @@ public:
  }
-  /**
+  /*
-   * Read data from device
+    Info: read data from device
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -159,10 +149,10 @@ public:
    return DKS_SUCCESS;
  }
-  /**
+  /*
-   * Read data from device waiting for signal
+    Info: read data from device waiting for signal
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
 			int streamId = -1, int offset = 0) {
@ -177,10 +167,10 @@ public:
  }
-  /**
+  /* 
-   * Wait till all the signals are complete
+     Info: wait till all the signals are complete
-   * Return siccess or error code
+     Return siccess or error code
-   */
+  */
  int mic_syncDevice() {
    //empty offload to wait for all the signals to finish and launch a new empy signal
@ -198,10 +188,10 @@ public:
  }
-  /**
+  /*
-   * Free memory on device
+    Info: free memory on device
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  int mic_freeMemory(void * data_ptr, int size) {
@ -212,13 +202,14 @@ public:
 #pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
    {
    }
    return DKS_SUCCESS;
  }
-  /**
+  /*
-   * Allocate memory and write data to device
+    Info: allocate memory and write data to device
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  template<typename T>
  void * mic_pushData(const void * data, int size) {
    T* tmp_ptr = new T[size];
@ -232,10 +223,10 @@ public:
  return tmp_ptr;
 }
-  /**
+/*
-   * Read data and free memory on device
+  Info: read data and free memory on device
-   * Return: success or erro code
+  Return: success or erro code
-   */
+*/
  template<typename T>
  int mic_pullData(void * data_ptr, void * result, int size) {
    T* tmp_ptr = (T*)data_ptr;
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@ -14,9 +14,6 @@
 #include <offload.h>
 #include "MICBase.h"
 /** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
 * Not complete and untested because of the poor performance of first MIC devices.
 */
 class MICChiSquare {
  MICBase *m_micbase;
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@ -22,34 +22,22 @@
 #define I_M 10
 #define DT_M 11
 /**
 * MIC device function for calculating dot product.
 */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }
 /**
 * MIC device function for calculating dot product.
 */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
  return (dx * dx + dy * dy + dz * dz);
 }
 /**
 * MIC device function to check if particle is still in material.
 */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
  return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }
 /**
 * MIC device function to calculate arbitrary rotation.
 */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@ -82,14 +70,6 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
 /**
 * MIC device function to calculate Coulomb scattering for one particle.
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * Uses AoS to store particle positions and momentum, parallelized using OpenMP.
 * For details on the algorithm see OPAL user guide.
 * Deprecated in favor of SoA data layout.
 */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -156,19 +136,11 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
 }
 /**
 * MIC device function to calculate Coulomb scattering for one particle.
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * Uses SoA to store particle positions and momentum, parallelized using OpenMP.
 * For details on the algorithm see OPAL user guide.
 */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, 
+void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
 		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {
-
+ 
  //arrays for temporary storage, each core proceses MIC_WIDTH particles
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -180,7 +152,6 @@ void coulombScat(double *rx, double *ry, double *rz,
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
  //simd instruction tells the compiler its safe to vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -220,7 +191,6 @@ void coulombScat(double *rx, double *ry, double *rz,
    }
  }
  //vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
@ -232,6 +202,7 @@ void coulombScat(double *rx, double *ry, double *rz,
    }
  }
  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -310,11 +281,6 @@ void coulombScat(double *rx, double *ry, double *rz,
 }
 /**
 * MIC device function to calculate energyLoss for one particle.
 * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
 * algorithm are available in the OPAL user guide.
 */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
@ -326,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -362,11 +328,6 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
    pdead = 1;
 }
 /**
 * MIC device function to calculate energyLoss for one particle.
 * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
 * algorithm are available in the OPAL user guide.
 */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
@ -377,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -407,29 +368,26 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
 }
-int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
+int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-					    bool enableRutherforScattering) 
+					    int numparticles, boll enableRutherfordScattering) 
 {
  //cast device memory pointers to appropriate types
  MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
  double *par = (double*) par_ptr;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
     the memory allocation and data transfer need to be handled beforehand */
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(numparticles)
  {
 #pragma omp parallel 
    {
-      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
      //for loop trough particles if not checkhit set label to -2 and update R.x
 #pragma omp for simd
      for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@ -489,7 +447,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {
-  //cast device memory pointers to appropriate types
+
  int *label = (int*)label_ptr;
  unsigned *localID = (unsigned*)localID_ptr;
  double *rx = (double*)rx_ptr;
@ -503,10 +461,6 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  int padding = numparticles % MIC_WIDTH;
  int totalpart = numparticles + padding;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
     the memory allocation and data transfer need to be handled beforehand */
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
@ -517,16 +471,14 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  in(py:length(0) DKS_REUSE DKS_RETAIN)	\
  in(pz:length(0) DKS_REUSE DKS_RETAIN)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(totalpart)
  {
 #pragma omp parallel
    {
      //every thread gets its own rnd stream state
-      //VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
-      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
+
      #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -562,11 +514,9 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 	  double Eng = (sq - 1) * M_P;
 	  double dEdx = 0;
 	  if (label[i] == 0) {
 	    energyLoss(Eng, dEdx, par, randv, i - ii);
 	  }
 	  if (Eng > 1e-4 && dEdx < 0) {
 	    double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
@ -578,12 +528,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 	  if (Eng < 1e-4 || dEdx > 0)
 	    label[i] = -1;
-	      
+	        
 	} //end inner energy loss loop
      } //end outer energy loss loop
-      
+      } //end outer energy loss loop
      //vectorize coulomb scattering as much as possible
 #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -593,7 +542,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
    } //end omp parallel
  } //end offload
-   
+     
  return DKS_SUCCESS;
 }
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@ -26,13 +26,7 @@ typedef struct {
 } MIC_PART_SMALL;
-/**
+class MICCollimatorPhysics : DKSAlogorithms{
 * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Implements OPAL's collimator physics class for particle matter interactions using OpenMP
 * and offload mode targeting Intel Xeon Phi processors.
 * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
 */
 class MICCollimatorPhysics : public DKSCollimatorPhysics {
 private:
@ -44,10 +38,10 @@ public:
    m_micbase = base;
  };
-  ~MICCollimatorPhysics() {  };
+  ~MICCollimatorPhysics() { };
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
-			bool enableRutherforScattering = true);
+			bool enableRutherfordScattering = true);
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/MIC/MICFFT.cpp
+++ b/src/MIC/MICFFT.cpp
@ -6,16 +6,13 @@
 MICFFT::MICFFT(MICBase *base) {
  m_micbase = base;
  m_fftsetup = false;
 }
 MICFFT::~MICFFT() {
  if (m_fftsetup) {
 #pragma offload target(mic:0)
-    {
+  {
-      DftiFreeDescriptor(&FFTHandle_m);
+    DftiFreeDescriptor(&FFTHandle_m);
-      DftiFreeDescriptor(&handle);
+    DftiFreeDescriptor(&handle);
    }
  }
 }
@ -38,7 +35,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
  }
-  m_fftsetup = true;
+
  return DKS_SUCCESS;
 }
 //BENI:
@ -125,8 +122,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
 }
 //execute iFFT
-int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
+int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
-  return executeFFT(mem_ptr, ndim, N, -1, false);
+  return mic_executeFFT(mem_ptr, ndim, N, -1, false);
 }
 //execute REAL->COMPLEX FFT
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@ -7,18 +7,13 @@
 #include <offload.h>
 #include <mkl_dfti.h>
-#include "../Algorithms/FFT.h"
+#include "../Algorithm/DKSFFT.h"
 #include "MICBase.h"
-/** 
+class MICFFT : public DKSFFT {
 * MIC FFT based on BaseFFT interface.
 * uses MKL library to offload FFT on Intel Xeon Phi devices.
 */
 class MICFFT : public BaseFFT {
 private:
  bool m_fftsetup;
  MICBase *m_micbase;
  /// Internal FFT object for performing serial FFTs.
@ -79,18 +74,6 @@ public:
  /* normalize IFFT on MIC */
  int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
  /**
   * Info: destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT() { return DKS_SUCCESS; }
  /*
    Info: execute normalize for complex to real iFFT
    Return: success or error code
  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
 };
 #endif
--- a/src/MIC/MICGreensFunction.cpp
+++ b/src/MIC/MICGreensFunction.cpp
@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
  }
 */
-int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
+int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
-				      double hr_m0, double hr_m1, double hr_m2, int streamId) 
+					  double hr_m1, double hr_m2) 
 {
-  double *tmp_ptr = (double*) tmpgreen;
+  double *tmp_ptr = (double*) tmp_ptr_;
 #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
  {
    std::memset(tmp_ptr,0,I*J*K);
@ -173,14 +173,12 @@ return 0;
 */
 //CUDA similar version:
-int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
+int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
-					int streamId) 
+  double *tmpgreen = (double*) tmp_ptr_;
-{
+  double *mem_ptr = (double*) mem_ptr_;
  double *tmpgreen_ptr = (double*) tmpgreen;
  double *mem_ptr = (double*) rho2_m;
  // the actual integration
-#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); 
    std::memset(mem_ptr,0,II*JJ*KK);
@ -199,27 +197,27 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
 	  tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
 	  if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+	    tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
 	  if (i+1 < NI_tmp)
-	    tmp1 = tmpgreen_ptr[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  if (j+1 < NJ_tmp)
-	    tmp2 = tmpgreen_ptr[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  if (k+1 < NK_tmp)
-	    tmp3 = tmpgreen_ptr[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
 	  if (i+1 < NI_tmp && j+1 < NJ_tmp)
-	    tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
+	    tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
 	  if (i+1 < NI_tmp && k+1 < NK_tmp)
-	    tmp5 = tmpgreen_ptr[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
 	  if (j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp6 = tmpgreen_ptr[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
-	  tmp7 = tmpgreen_ptr[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	  tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
 	  double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
@ -236,8 +234,8 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
-int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
+int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
-  double *mem_ptr = (double*) rho2_m;	
+  double *mem_ptr = (double*) mem_ptr_;	
 #pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
@ -283,11 +281,11 @@ int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int str
 }
 /*multiply complex fields*/
-int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
+int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
  //	  double *mem_ptr1 = (double*) mem_ptr1_;
  //	  double *mem_ptr2 = (double*) mem_ptr2_;
-  _Complex double *mem_ptr1 = (_Complex double *) ptr1;
+  _Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
-  _Complex double *mem_ptr2 = (_Complex double *) ptr2;
+  _Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
 #pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
  {
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@ -9,14 +9,12 @@
 #include <offload.h>
 #include <mkl_dfti.h>
 #include "../Algorithms/GreensFunction.h"
 #include "MICBase.h"
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
-/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
+class MICGreensFunction {
 class MICGreensFunction : public GreensFunction {
 private:
  MICBase *m_micbase;
@ -30,18 +28,16 @@ public:
  ~MICGreensFunction();
  /* compute greens integral analytically */
-  int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
+  int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
 		     double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
  /* perform the actual integration */
-  int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K, 
+  int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
 				int stremaId = -1);
  /* Mirror rho-Field */
-  int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
+  int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
  /*multiply complex fields*/
-  int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
+  int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
 };
--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@ -71,10 +71,6 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  return p;
 }
 /**
 * Merge sort implementation for intel MIC.
 * Parallelized over all the MIC cores using OpenMP tasks.
 */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
@ -88,9 +84,6 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
  }
 }
 /**
 * Quicksort algorithm, developed for use on Intel MIC devices.
 */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
@ -107,10 +100,6 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
 }
 /** 
 * Insertion sort of @p list, developed for use on Intel MIC.
 * Used by quick_sort to sort small lists.
 */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
--- a/src/OpenCL/CMakeLists.txt
+++ b/src/OpenCL/CMakeLists.txt
@ -1,53 +1,31 @@
-#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
+SET (_SRCS
-
+	OpenCLBase.cpp
-SET (_HDRS OpenCLBase.h)
+	OpenCLFFT.cpp
-SET (_SRCS OpenCLBase.cpp)
+	OpenCLChiSquare.cpp
-SET (_KERNELS "")
+	OpenCLCollimatorPhysics.cpp
-
+	OpenCLChiSquareRuntime.cpp
 IF (ENABLE_AMD)
  SET (_SRCS
    ${_SRCS}
    OpenCLFFT.cpp
    )
  SET (_HDRS
    ${_HDRS}
    OpenCLFFT.h
    )
  SET (_KERNELS
    ${_KERNELS}
    OpenCLKernels/OpenCLFFT.cl
    OpenCLKernels/OpenCLFFTStockham.cl
    OpenCLKernels/OpenCLTranspose.cl
  )
 ENDIF (ENABLE_AMD)
-IF (ENABLE_MUSR)
+SET (_HDRS
-  SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
+	OpenCLBase.h
-  SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
+	OpenCLFFT.h
-  SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
+	OpenCLChiSquare.h
-ENDIF (ENABLE_MUSR)
+	OpenCLCollimatorPhysics.h
-
+	OpenCLChiSquareRuntime.h
-IF (ENABLE_AMD AND ENABLE_OPAL)
+  )
-  SET (_SRCS
+
-    ${_SRCS}
+#INCLUDE_DIRECTORIES (
-    OpenCLCollimatorPhysics.cpp
+#  ${CMAKE_CURRENT_SOURCE_DIR}
-    OpenCLGreensFunction.cpp
+#)
-    )
+
-
+SET (_KERNELS
-  SET (_HDRS
+  OpenCLKernels/OpenCLChiSquare.cl
-    ${_HDRS}
+  OpenCLKernels/OpenCLFFT.cl
-    OpenCLCollimatorPhysics.h
+  OpenCLKernels/OpenCLFFTStockham.cl
-    OpenCLGreensFunction.h
+  OpenCLKernels/OpenCLTranspose.cl
-    )
+  OpenCLKernels/OpenCLCollimatorPhysics.cl
-
+  OpenCLKernels/OpenCLChiSquareRuntime.cl
  SET (_KERNELS
    ${_KERNELS}
    OpenCLKernels/OpenCLCollimatorPhysics.cl
    OpenCLKernels/OpenCLGreensFunction.cl
  )
 ENDIF (ENABLE_AMD AND ENABLE_OPAL)
 ADD_SOURCES (${_SRCS})
 ADD_HEADERS (${_HDRS})
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@ -7,13 +7,21 @@ cl_device_id OpenCLBase::m_device_id = NULL;
 cl_event OpenCLBase::m_last_event = NULL;
 /*
   Constructor: resets the program, kernel and kernel-file handles to NULL and
   marks the default random number states as not created (defaultRndSet = 0).
   The context, command queue, device id and platform id are deliberately not
   reset here (see the commented-out assignments below).
   NOTE: m_last_event is a static member, so constructing a new OpenCLBase also
   clears the last recorded event for every existing instance.
 */
 OpenCLBase::OpenCLBase() {
  //m_context = NULL;
  //m_command_queue = NULL;
  m_program = NULL;
  m_kernel = NULL;
  //m_device_id = NULL;
  //m_platform_id = NULL;
  m_kernel_file = NULL;
  //static member: this assignment affects all OpenCLBase instances
  m_last_event = NULL;
  //m_events = new cl_event[500];
  //m_num_events = 0;
  //no random number states have been allocated yet
  defaultRndSet = 0;
 }
 OpenCLBase::~OpenCLBase() {
@ -33,11 +41,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
  ocl_loadKernel(kernel_file);
  delete[] kernel_file;
-  
+
  //allocate memory for rand states
  int ierr;
  defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
-  
+
  //exec kernel
  int seed = 0;
  ocl_createKernel("initRand");
@ -47,34 +55,13 @@ int OpenCLBase::ocl_createRndStates(int size) {
  size_t work_items = size;
  size_t work_group_size = 1;
  ocl_executeKernel(1, &work_items, &work_group_size);
  defaultRndSet = 1;
  return DKS_SUCCESS;
 }
-int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
+  return OCL_SUCCESS;
  //load kernel
  char * kernel_file = new char[500];
  kernel_file[0] = '\0';
  strcat(kernel_file, OPENCL_KERNELS);
  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
  ocl_loadKernel(kernel_file);
  delete[] kernel_file;
  //set kernel variables
  cl_mem tmp_data = (cl_mem) mem_ptr;
  ocl_createKernel("createRandoms");
  ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
  ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
  ocl_setKernelArg(2, sizeof(int), &size);
  size_t work_size = 128;
  size_t work_items = (size % work_size + 1) * work_size;
  ocl_executeKernel(1, &work_items, &work_size);
  return DKS_SUCCESS;
 }
 /* destroy rnd states */
@ -83,7 +70,7 @@ int OpenCLBase::ocl_deleteRndStates() {
  ocl_freeMemory(defaultRndState);
  defaultRndSet = 0;
-  return DKS_SUCCESS;
+  return OCL_SUCCESS;
 }
@ -441,8 +428,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  int ierr;
  //create program from kernel
-  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, 
+  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
 					NULL, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
    return DKS_ERROR;
@ -452,7 +438,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
  /*
-    check if compiling kernel source succeded, if failed return error code
+    check if compileng kernel source succeded, if failed return error code
    if in debug mode get compilation info and print program build log which
    will give indication what made the compilation fail
  */
@ -461,8 +447,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
    //get build status
    cl_build_status status;
-    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, 
+    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
 			  sizeof(cl_build_status), &status, NULL);
    //get log size
    size_t log_size;
@ -628,12 +613,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
    }
  }
-  if (ierr != DKS_SUCCESS) {
+  if (ierr != OCL_SUCCESS) {
    DEBUG_MSG("Failed to build kernel file " << kernel_file);
-    return DKS_ERROR;
+    return OCL_ERROR;
  }
-  return DKS_SUCCESS;
+  return OCL_SUCCESS;
 }
 //compile kernel from source code provided
@ -675,14 +660,17 @@ cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
 /*
  write data specified by in_data to device memory, device memory space defined by cl_mem
 */
-int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, 
+int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) {
 			      size_t offset, int blocking) 
 {
  cl_int ierr;
  //std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl;
  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event);
  //m_events[m_num_events] = m_last_event;
  m_events.push_back(m_last_event);
  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
 			      in_data, 0, NULL, NULL);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
@ -713,11 +701,6 @@ int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
 */
 int OpenCLBase::ocl_createKernel(const char* kernel_name) {
  cl_int ierr;
  //release the old kernel
  if (m_kernel != NULL)
    clReleaseKernel(m_kernel);
  //create a new kernel
  m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
@ -745,20 +728,24 @@ int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
  optional: work_group_size - can specify how work items are divided in work groups, 
  if left NULL OpenCL implementation handles this part.
 */
-int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, 
+int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) {
-				  const size_t *work_group_size) 
+  cl_int ierr;
-{
+		
-  cl_int ierr;	
+  cl_event tmp_event;
-
+  if (m_last_event == NULL) {
-  ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, 
+    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
-				work_items, work_group_size, 
+				  0, NULL, &tmp_event);
-				0, NULL, NULL);
+  } else {
    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
 				  1, &m_last_event, &tmp_event);
  }
  if (ierr != CL_SUCCESS)
-    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr 
+    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
-	      << " work items: " << *work_items << ", " 
+		
-	      << " work group: " << *work_group_size);
+  m_last_event = tmp_event;
-
+  m_events.push_back(m_last_event);
  return ierr;
 }
@ -766,13 +753,12 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items,
  read data from device, mem_ptr points to data on device out_data points to memory in host
  blocking specifies whether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
 */
-int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, 
+int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) {
 			     size_t offset, int blocking) 
 {
  cl_int ierr;
  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event);
-  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
+  m_events.push_back(m_last_event);
   			     out_data, 0, NULL, NULL);
  if (ierr != CL_SUCCESS)
    DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
@ -936,27 +922,22 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  if (ierr != DKS_SUCCESS)
    return ierr;
-  /* get device properties */
+  //get device properties
  //maximum number of work-items in a work group supported by device
  size_t max_group_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
  //maximum local memory size per work group
  cl_ulong local_mem_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
  //get the supported extensions
  size_t ext_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
  char *ext = new char[ext_size];
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
-  /* get kernel properties */
+  //get kernel properties
  //get max work group size that can be used for this kernel
  size_t kernel_group_size;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, 
 			   sizeof(size_t), &kernel_group_size, 0);
  threadsPerBlock = kernel_group_size;
  //get max local memory size that can be used for this kernel
  cl_ulong kernel_local_mem;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
 			   sizeof(cl_ulong), &kernel_local_mem, 0);
@ -965,18 +946,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
-  std::cout << "Work group size: max for device " << max_group_size << " > "
+  std::cout << "Work groups: device limit " << max_group_size << ", "
-	    << "max for kernel " << kernel_group_size << " > "
+	    << "kernel limit " << kernel_group_size << ", "
 	    << "required " << work_group_size << std::endl;
  std::cout << "Local memory: device limit " << local_mem_size << std::endl;
-  std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
+  
-  std::cout << std::endl << "Available extensions: " << ext << std::endl;
+  std::cout << "Available extensions: " << ext << std::endl;
-  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;   
+  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
  return DKS_SUCCESS;
 }
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@ -1,3 +1,16 @@
 /*
  Name: OpenCLBase
  Author: Uldis Locans
  Info: OpenCL base class to handle all the common details associated 
  with kernel launch on OpenCL device
  Date: 2014.09.18
 */
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE
@ -17,10 +30,13 @@
 #include <CL/cl_ext.h>
 #endif
 #include "../DKSDefinitions.h"
-/** struct for random number state. */
+/* struct for random number state */
 typedef struct {
  double s10;
  double s11;
  double s12;
@ -29,292 +45,250 @@ typedef struct {
  double s22;
  double z;
  bool gen;
 } RNDState;
 /**
 * OpenCL base class to handle device setup and basic communication with the device.
 * Handles initialization of OpenCL device, memory management, data transfer and kernel launch.
 * The OpenCL kernels are located in separate files in OpenCLKernels folder, the OpenCLBase
 * class contains methods to read the kernel files, compile the kernel codes and launch kernels
 * from the compiled codes. Which kernel file needs to be loaded for the specific function is
 * handled by the base class that is launching the kernel.
 */
 class OpenCLBase {
 private:
  static cl_context m_context;
  static cl_command_queue m_command_queue;
  //variables containing OpenCL device and platform ids
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;
  //variables containing compiled OpenCL program and kernel
  cl_context_properties m_context_properties[3];
  cl_program m_program;
  cl_kernel m_kernel;
  //variables for tracking OpenCL events
  static cl_event m_last_event;
  cl_int m_num_events;
  std::vector<cl_event> m_events;
  //currently loaded kernel file
  char * m_kernel_file;
  //type of device used by OpenCL
  cl_device_type m_device_type;
-  /**
+  /*
-   * Get all available OpenCL platforms.
+    Name: getPlatforms
-   * Get all avaialble platforms and save in m_platform_ids, save number of platforms
+    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
-   *  Return: success or error code
+    Return: success or error code
-   */
+  */
  int ocl_getPlatforms();
-  /**
+  /*
-   * Get first available OpenCL device of specified type.
+    Name: getDevice
-   * Get first avaialble devices and save device id and platform id for this device, 
+    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
-   * device name: (-gpu, -mic, -cpu)
+    ReturnL success or error code
-   *  ReturnL success or error code
+  */
   */
  int ocl_getDevice(const char* device_name);
-  /**
+  /*
-   * Get cl_device_type from the specified device name.
+    Name getDeviceType
-   * get device type from device name (-gpu, -cpu, -mic)
+    Info: get device type from device name (-gpu, -cpu, -mic)
-   *  Return: success or error code
+    Return: success or error code
-   */
+  */
  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
-  /**
+  /*
-   * Create OpenCL context with specified device.
+    Name: createContext
-   *  Return: success or error code
+    Info: create context with specified device
-   */
+    Return: success or error code
  */
  int ocl_createContext();
-  /**
+  /*
-   * Build program from specified kernel file.
+    Name: buildProgram
-   * Return: success or error code.
+    Info: build program from specified kernel file
    Return: success or error code
  */
  int ocl_buildProgram(const char* kernel_file);
-  /** 
+  /** Compile program from kernel source string
-   * Compile program from kernel source string.
+   *
   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
   * OpenCL program, that can be then executed on the device.
   * opts is a string specifying additional compiler flags.
   */
  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
 protected:
  //memory for random number states
  int defaultRndSet;
  cl_mem defaultRndState;
 public:
  //OpenCL context and command queue
  static cl_context m_context;
  static cl_command_queue m_command_queue; 
-  /**
+  /*
-   * constructor
+    constructor
-   */
+  */
  OpenCLBase();
-  /**
+  /*
-   * destructor
+    destructor
-   */
+  */
  ~OpenCLBase();
-  /**
+  /*
-   * Allocate memory for size random number states and init the rnd states.
+    Create RND states
-   * Uses AMD clRng library for random numbers. 
+    Return: success or error code
-   * This library is only compatible with AMD devices.
+  */
   */
  int ocl_createRndStates(int size);
-  /** 
+  /*
-   * Create an array of random numbers on the device.
+    Destroy rnd states
-   * Filles hte mem_ptr with random numbers.
+    Return: success or error code
-   */
+  */
  int ocl_createRandomNumbers(void *mem_ptr, int size);
  /**
   * Destroy rnd states and free device memory.
   * Return: success or error code
   */
  int ocl_deleteRndStates();
-  /**
+  /*
-   * Prints info about all the available platforms and devices.
+    Name: getAllDevices
-   * Can be used for information purposes to see what devices are available on the system.
+    Info: get all available devices
-   * ReturnL success or error code.
+    ReturnL success or error code
  */
  int ocl_getAllDevices();
-  /** 
+  /** Get the OpenCL device count for the set type of device
-   * Get the OpenCL device count for the set type of device.
+   *
   * Device count is set in ndev parameter, returns success or error code.
   */
  int ocl_getDeviceCount(int &ndev);
-  /** 
+  /** Get the name of the device used
   * Get the name of the device currently in use.
   */
  int ocl_getDeviceName(std::string &device_name);
-  /** 
+  /** Set the device to use for OpenCL kernels.
-   * Set the device to use for OpenCL kernels.
+   *  device id to use is passed as integer.
   * Device id to use is passed as integer.
   */
  int ocl_setDevice(int device);
-  /** 
+  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
-   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
+   *  Used when GPUs of different types might be pressent on the system.
   * Used when GPUs of different types might be present on the system.
   */
  int ocl_getUniqueDevices(std::vector<int> &devices);
-  /**
+  /*
-   * Initialize OpenCL connection with a device of specified type.
+    Name: setUp
-   * Find if specified device is avaialble, creates a contex and command queue.
+    Info: set up opencl resources
-   * Returns success or error code.
+    Return: success or error code
-   */
+  */
  int ocl_setUp(const char* device_name);
-  /**
+  /*
-   * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
+    Name: loadKernel
-   * Load and compile opencl kernel file if it has changed.
+    Info: load and compile opencl kernel file if it has changed
-   * Return: success or error code
+    Return: success or error code
  */
  int ocl_loadKernel(const char* kernel_file);
-  /** 
+  /** Build program from kernel source.
   * Build program from kernel source.
   * Builds a program from source code provided in kernel_source.
   * If compilation fails will return DKS_ERROR
   */
  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
-  /**
+  /*
-   * Allocate memory on the device.
+    Name: allocateMemory
-   * Return: return pointer to memory
+    Info: allocate memory on device
    Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);
-
+	
-  /**
+  /*
-   * Allocate memory of specific type on device.
+    Name: allocateMemory
-   * The availabel types are cl_mem_flags type listed in OpenCL documentation:
+    Info: allocate memory on device
-   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
+    Return: return pointer to memory
   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
-  /** 
+  /*
-   * Zero OpenCL memory buffer.
+    Name: writeData
-   * Set all the elemetns in the device array to zero.
+    Info: write data to device memory (needs ptr to mem object)
-   */
+    Return: success or error code
-  template <typename T>
+  */
  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
    cl_int ierr;
    ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset, 
 			       sizeof(T)*size, 0, nullptr, nullptr);
    if (ierr != CL_SUCCESS)
      return DKS_ERROR;
    return DKS_SUCCESS;
  }
  /**
   * Write data to device memory (needs ptr to mem object)
   * Return: success or error code
   */
  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
-  /** 
+  /*
-   * Copy data from one buffer on the device to another
+    Name: copyData
-   * Return: success or error code
+    Info: copy data from one buffer on the device to another
-   */
+    Return: success or error code
  */
  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
-  /** 
+  /*
-   * Create kernel from compiled OpenCL program.
+    Name: createKernel
-   * Return: success or error code
+    Info: create kernel from program
-   */
+    Return: success or error code
  */
  int ocl_createKernel(const char* kernel_name);
-  /**
+  /*
-   * Set argiments for the kernel that will be launched.
+    Name: setKernelArgs
-   * Return: success or error code
+    Info: set opencl kernel arguments
-   */
+    Return: success or error code
  */
  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
-  /**
+  /*
-   * Execute selected kernel.
+    Name: executeKernel
-   * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
+    Info: execute selected kernel (needs kernel parameters)
-   * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
+    Return: success or error code
   * kernel arguments must be set.
   * Return: success or error code
  */
  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
-  /**
+  /*
-   * Read data from device (needs pointer to mem object).
+    Name: readData
-   * Return: success or error code
+    Info: read data from device (needs pointer to mem object)
-   */
+    Return: success or error code
  */
  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
-  /**
+  /*
-   * Free device memory (needs ptr to mem object).
+    Name: freeMemory
-   *  Return: success or error code
+    Info: free device memory (needs ptr to mem object)
-   */
+    Return: success or error code
  */
  int ocl_freeMemory(cl_mem mem_ptr);
-  /**
+  /*
-   * Free opencl resources.
+    Name: cleanUp
-   * Deletes the kernel, compiled program, command queue and colese the connection
+    Info: free opencl resources
-   * to device by releasing the context.
+    Return: success or error code
-   * Return: success or error code
+  */
   */
  int ocl_cleanUp();
-  /**
+  /*
-   * Print info of currently selected device.
+    Name: deviceInfo
-   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
+    Info: print device info (mostly for debugging purposes)
-   * Return: success or error code
+    Return: success or error code
-   */
+  */
  int ocl_deviceInfo(bool verbose = true);
-  /* 
+  /* Check OpenCL kernel.
-   * Check OpenCL kernel.
+   * Query device and check if it can run the kernel with required parameters
   * Query device and check if it can run the kernel with required parameters.
   * Also check the available OpenCL extensions - useful for checking the supported device
   * features, like double precision.
   */
  int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
-  /**
+  /*
-   * Clear the event list.
+    Name: clearEvents
-   * Events can be used for timing and synchronization purposes.
+    Info: clear saved events (for debuging purposes)
-   */
+    Return: nothing
  */
  void ocl_clearEvents();
-  /**
+  /*
-   * print information about kernel timings from event list.
+    Name: eventInfo
-   * for debuging purposes
+    Info: print information about kernel timings (for debuging purposes)
-   */
+    Return: nothing
  */
  void ocl_eventInfo();
-  /**
+  /*
-   * Return current command queue.
+    Return current command queue
-   */
+  */
  cl_command_queue ocl_getQueue() { return m_command_queue; }
 };
--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
-/** Deprecated, SimpleFit implementation of ChiSquare. */
+
 class OpenCLChiSquare {
 private:
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -42,7 +42,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  if (!fp)
    DEBUG_MSG("Can't open kernel file" << kernel_file);
-  //get file size and allocate memory
+  //get file size and allocate memory	
  fseek(fp, 0, SEEK_END);
  fsize = ftell(fp);
  kernel_source = new char[fsize+1];
@ -52,7 +52,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  fread(kernel_source, 1, sizeof(char)*fsize, fp);
  kernel_source[fsize] = '\0';
  fclose(fp);
-
+  
  std::string kernel_string (kernel_source);
  return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
@ -76,9 +76,10 @@ int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {
 double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
  int ierr;
-  //calc number of threads per workgroup and nr of work groups
+  //calc number of thread sper workgroup and nr of work groups
-  size_t work_size_sum = (size_t)blockSize_m;
+  size_t work_size_sum = 128;
  /*
  size_t work_items = (size_t)length;
@ -86,7 +87,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
    work_items = (length / work_size_sum + 1) * work_size_sum;
  int work_groups = length / work_size_sum + 1;
  */
-
+  
  size_t work_items = 80 * work_size_sum;
  int work_groups = 80;
@ -95,19 +96,20 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
  double *partial_sums = new double[work_groups];
  tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
-
+  
  //execute sum kernel
  //ocl_createKernel("parallelReductionSum");
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
-  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); 
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
-  //read partial sums and free temp memory
+  //read partial sums and free temp mempry
  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);
-
+  
  //sumup partial sums on the host
  double result = 0;
  for (int i = 0; i < work_groups; i++)
@ -139,7 +141,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
  //set work item size
  size_t work_items;
  size_t work_size = (size_t)blockSize_m;
  if (numBlocks_m < 0)
    work_items = (size_t)length;
  else
@ -156,7 +157,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
      return ierr;
    //set kernel args
    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -172,23 +172,20 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
    m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
-    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
-    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
    m_oclbase->ocl_setKernelArg(16, sizeof(double)*num, NULL);
    num = nummap; if (num == 0) num = 1;
    m_oclbase->ocl_setKernelArg(17, sizeof(int)*num, NULL);
    if (ierr != DKS_SUCCESS)
      return ierr;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    //create kernel
    ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
    if (ierr != DKS_SUCCESS)
      return ierr;
    //set kernel args
    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -203,12 +200,9 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
-    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(14, sizeof(double)*num, NULL);
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
-    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
    num = nummap; if (num == 0) num = 1;
    m_oclbase->ocl_setKernelArg(16, sizeof(int)*num, NULL);
    if (ierr != DKS_SUCCESS)
      return ierr;
@ -232,7 +226,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }
 /*
   Copy fit parameters from host memory to the device.
   params    - host array holding the parameter values
   numparams - number of doubles to copy from params
   The destination is the device buffer mem_param_m; the transfer starts at the
   beginning of that buffer and uses ocl_writeData's default offset and blocking
   arguments.
   Return: the status code returned by ocl_writeData.
 */
 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
  //write params to the device buffer holding the fit parameters
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
  return ierr;
 }
@ -242,7 +235,6 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  if (numfunc == 0)
    return DKS_SUCCESS;
  //write function values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
  return ierr;
 }
@ -251,12 +243,11 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  if (nummap == 0)
    return DKS_SUCCESS;
  //write map values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
  return ierr;
 }
-int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
+int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, 
 					  int size_func, int size_map)
 {
@ -266,7 +257,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
    freeChiSquare();
  }
-  //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
+  //allocate temporary memory
  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
  if (size_func == 0)
@ -286,12 +277,12 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
  int ierr = DKS_ERROR;
  if (initDone_m) {
-    //free GPU memory
+    //free memory
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
-
+    
    initDone_m = false;
  }
@ -317,13 +308,9 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
    return DKS_ERROR;
  }
-  //check the GPU kernel
+  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
  ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
  if (threadsPerBlock < blockSize_m) {
    std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
    blockSize_m = threadsPerBlock;
  }
  return ierr;
 }
--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@ -17,54 +17,44 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
 const std::string openclFunctFooter = "}\n";
 /**
 * OpenCL implementation of ChiSquareRuntime class.
 * Implements ChiSquareRuntime interface to allow musrfit to target devices that
 * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
 */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {
 private:
  OpenCLBase *m_oclbase;
-  /** 
+  /** Private function to add user defined function to kernel string
-   * Private function to add user defined function to kernel string.
+   *
   */
  std::string buildProgram(std::string function);
  /**
   * Launch parallel reduction kernel to calculate the sum of data array
   */
  double calculateSum(cl_mem data, int length);
 public:
-  /** 
+  /** Constructor wiht openclbase argument
-   * Constructor wiht openclbase argument.
+   *
   */
  OpenCLChiSquareRuntime(OpenCLBase *base);
-  /** 
+  /** Default constructor
-   * Default constructor
+   *
   */
  OpenCLChiSquareRuntime();
-  /** 
+  /** Default destructor
-   * Default destructor
+   *
   */
  ~OpenCLChiSquareRuntime();
-  /** 
+    /** Compile program and save ptx.
   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);
-  /** 
+  /** Launch selected kernel
   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
   * Result is put in &result variable
   */
@ -74,26 +64,22 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);
-  /** 
+  /** Write params to device.
   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 
-  /** 
+  /** Write functions to device.
   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);
-  /** 
+  /** Write maps to device.
   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);
-  /** 
+  /** Allocate temporary memory needed for chi square.
   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -101,16 +87,14 @@ public:
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
-  /** 
+  /** Free temporary memory allocated for chi square.
   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();
-  /** 
+  /** Check MuSR kernels for necessary resources.
   * Check MuSR kernels for necessary resources.
   * Query device properties to get if sufficient resources are
-   * available to run the kernels. Also checks if double precission is enabled on the device.
+   * available to run the kernels
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock);
--- a/src/OpenCL/OpenCLCollimatorPhysics.cpp
+++ b/src/OpenCL/OpenCLCollimatorPhysics.cpp
@ -34,7 +34,7 @@ TODO:
 2. boost.compute sort for user defined structure crashes
 */
 int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-					       int numparticles, bool enableRutherforScattering) 
+					       int numparticles, bool enableRutherfordScattering) 
 {
  /*
  //set number of total threads, and number threads per block
--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@ -17,16 +17,12 @@
 #include "boost/compute/core.hpp"
 */
 /**
  * Three component (x, y, z) double precision structure for use in OpenCL code.
  */
 typedef struct {
   double x, y, z;
 } Double3;
 /**
 * Structure for storing particles in OpenCL code.
 */
 typedef struct {
  int label;
  unsigned localID;
@ -39,10 +35,6 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
 /**
 * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Implements CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
 */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
 private:
@ -50,22 +42,18 @@ private:
 public:
-  /** 
+  /* constructor */
   * Constructor with OpenCLBase as argument.
   * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
   */
  OpenCLCollimatorPhysics(OpenCLBase *base) { 
    m_oclbase = base;
  }
-  /** 
+  /* destructor */
   * Destructor.
   */
  ~OpenCLCollimatorPhysics() { 
  }
  /* execute degrader code on device */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
-			bool enableRutherforScattering = true);
+			bool enableRutherfordScattering = true);
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/OpenCL/OpenCLFFT.cpp
+++ b/src/OpenCL/OpenCLFFT.cpp
@ -31,6 +31,7 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
    return OCL_ERROR;
  //execute kernel
  for (int step = 1; step < N; step <<= 1) {
@ -88,78 +89,26 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
  call fft execution on device for every dimension
 */
 int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
-
+  int ierr;
-  int dkserr = DKS_SUCCESS;
+	
  cl_int ierr;
  cl_mem inout = (cl_mem)data;
  int n = N[0];
-  if (forward)
+  for (int dim = 0; dim < ndim; dim++) {
-    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
+    ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
-				0, NULL, NULL, &inout, NULL, NULL);
+    if (ierr != OCL_SUCCESS) {
-  else
+      DEBUG_MSG("Error executing bit reverse");
-    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
+      return OCL_ERROR;
-				0, NULL, NULL, &inout, NULL, NULL);  
+    }
-  if (ierr != OCL_SUCCESS) {
+    ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
-    dkserr = DKS_ERROR;
+    if (ierr != OCL_SUCCESS) {
-    DEBUG_MSG("Error executing cfFFT\n");
+      DEBUG_MSG("Error executing fft reverse");
-    if (ierr == CLFFT_INVALID_PLAN)
+      return OCL_ERROR;
-      std::cout << "Invlalid plan" << std::endl;
+    }
    else 
      std::cout << "CLFFT error" << std::endl;
  }
-  return dkserr;
+  return OCL_SUCCESS;
 }
 /*
  call rcfft execution on device for every dimension
 */
 int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
  int dkserr = DKS_SUCCESS;
  cl_int ierr;
  cl_mem real_in = (cl_mem)real_ptr;
  cl_mem comp_out = (cl_mem)comp_ptr;
  ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
 			       0, NULL, NULL, &real_in, &comp_out, NULL);
  if (ierr != OCL_SUCCESS) {
    dkserr = DKS_ERROR;
    DEBUG_MSG("Error executing cfFFT\n");
    if (ierr == CLFFT_INVALID_PLAN)
      std::cout << "Invlalid plan" << std::endl;
    else 
      std::cout << "CLFFT error" << std::endl;
  }
  return dkserr;
 }
 /*
  call rcfft execution on device for every dimension
 */
 int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
  int dkserr = DKS_SUCCESS;
  cl_int ierr;
  cl_mem real_in = (cl_mem)real_ptr;
  cl_mem comp_out = (cl_mem)comp_ptr;
  ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
 				0, NULL, NULL, &comp_out, &real_in, NULL);
  if (ierr != OCL_SUCCESS) {
    dkserr = DKS_ERROR;
    DEBUG_MSG("Error executing cfFFT\n");
    if (ierr == CLFFT_INVALID_PLAN)
      std::cout << "Invlalid plan" << std::endl;
    else 
      std::cout << "CLFFT error" << std::endl;
  }
  return dkserr;
 }
 /*
@ -171,11 +120,10 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
 }
 /*
-  call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
+  call kernel to normalize fft
 */
 int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
 /*
  cl_mem inout = (cl_mem)data;
  int n = N[0];
@ -202,175 +150,132 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
    DEBUG_MSG("Error executing kernel");
    return OCL_ERROR;
  }
-*/	
+	
  return OCL_SUCCESS;
 }
-int OpenCLFFT::setupFFT(int ndim, int N[3]) {
+int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
  int ierr;
  int size = sizeof(cl_double2)*pow(N,ndim);
  cl_mem mem_tmp;
  cl_mem mem_src = (cl_mem)src;
  cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
-  cl_int err;
+  //set the number of work items in each dimension
  size_t work_items[3];
  int p = 1;
  int threads = N / 2;
  int f = (forward) ? -1 : 1;
  //execute kernel
  int n = (int)log2(N);
  for (int i = 0; i < ndim; i++) {
-  clfftDim dim;
+    int dim = i+1;
-  if (ndim == 1)
+    p = 1;
-    dim = CLFFT_1D;
+    work_items[0] = (dim == 1) ? N/2 : N;
-  else if (ndim == 2)
+    work_items[1] = (dim == 2) ? N/2 : N;
-    dim = CLFFT_2D;
+    work_items[2] = (dim == 3) ? N/2 : N;
-  else 
+		
-    dim = CLFFT_3D;
+    //transpose array if calculating dimension larger than 1
-  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
+    //if (dim > 1) 
    //	ocl_executeTranspose(mem_src, N, ndim, dim);
    //create kernel and set kernel arguments
    if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
      return OCL_ERROR;
    for (int t = 1; t <= log2(N); t++) {
      m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
      m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
      m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
      m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
      m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
      m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
      if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) 
 	return OCL_ERROR;
-  /* Create 3D fft plan*/
+      mem_tmp = mem_src;
-  err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
+      mem_src = mem_dst;
-
+      mem_dst = mem_tmp;
-  /* Set plan parameters */
+	
-  err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
+      p = 2*p;
  if (err != CL_SUCCESS)
    std::cout << "Error setting precision" << std::endl;
  err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
  if (err != CL_SUCCESS)
    std::cout << "Error setting layout" << std::endl;
  err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
  if (err != CL_SUCCESS)
    std::cout << "Error setting result location" << std::endl;
  /* Bake the plan */
  err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
  if (err != CL_SUCCESS) {
    DEBUG_MSG("Error creating Complex-to-complex plan");
    return DKS_ERROR;
  }
  return DKS_SUCCESS;
 }
 /*
   Create, configure and bake the clFFT real-to-complex plan (planHandleD2Z).
   ndim selects a 1D or 2D plan, any other value selects a 3D plan.
   N holds the transform length for each dimension, all three entries are read.
   scale is not used by this function.
   Return: DKS_SUCCESS, or DKS_ERROR if any of the clFFT calls fails
 */
 int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {

   clfftDim dim;
   if (ndim == 1)
     dim = CLFFT_1D;
   else if (ndim == 2)
     dim = CLFFT_2D;
   else
     dim = CLFFT_3D;

   size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};

   //the complex side of the transform uses N[0]/2 + 1 elements along the first dimension
   size_t half = (size_t)N[0] / 2 + 1;
   size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
   size_t clOutStride[3] = {1, half, half * N[1]};

   /* Create 3D fft plan*/
   cl_int err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);

   /* Set plan parameters, each step is only executed if all previous steps succeeded
      so that the first error is the one that gets reported */
   if (err == CL_SUCCESS)
     err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
   if (err == CL_SUCCESS)
     err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
   if (err == CL_SUCCESS)
     err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
   if (err == CL_SUCCESS)
     err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
   if (err == CL_SUCCESS)
     err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);

   /* Bake the plan */
   if (err == CL_SUCCESS)
     err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);

   if (err != CL_SUCCESS) {
     DEBUG_MSG("Error creating Real-to-complex plan");
     return DKS_ERROR;
   }

   return DKS_SUCCESS;
 }
 /*
   Create, configure and bake the clFFT complex-to-real plan (planHandleZ2D).
   ndim selects a 1D or 2D plan, any other value selects a 3D plan.
   N holds the transform length for each dimension, all three entries are read.
   scale is not used by this function.
   Return: DKS_SUCCESS, or DKS_ERROR if any of the clFFT calls fails
 */
 int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {

   clfftDim dim;
   if (ndim == 1)
     dim = CLFFT_1D;
   else if (ndim == 2)
     dim = CLFFT_2D;
   else
     dim = CLFFT_3D;

   size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};

   //the complex side of the transform uses N[0]/2 + 1 elements along the first dimension
   size_t half = (size_t)N[0] / 2 + 1;
   size_t clInStride[3] = {1, half, half * N[1]};
   size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};

   /* Create 3D fft plan*/
   cl_int err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);

   /* Set plan parameters, each step is only executed if all previous steps succeeded
      so that the first error is the one that gets reported */
   if (err == CL_SUCCESS)
     err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
   if (err == CL_SUCCESS)
     err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
   if (err == CL_SUCCESS)
     err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
   if (err == CL_SUCCESS)
     err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
   if (err == CL_SUCCESS)
     err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);

   /* Bake the plan */
   if (err == CL_SUCCESS)
     err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);

   if (err != CL_SUCCESS) {
     DEBUG_MSG("Error creating Complex-to-real plan");
     return DKS_ERROR;
   }

   return DKS_SUCCESS;
 }
 int OpenCLFFT::destroyFFT() {
  clfftDestroyPlan(&planHandleZ2Z);
  clfftDestroyPlan(&planHandleD2Z);
  clfftDestroyPlan(&planHandleZ2D);
  clfftTeardown();
  return DKS_SUCCESS;
 }
 void OpenCLFFT::printError(clfftStatus err) {
  if (err != CL_SUCCESS) {
    std::cout << "Error creating default plan " << err <<  std::endl;
    switch(err) {
    case CLFFT_BUGCHECK: 
      std::cout << "bugcheck" << std::endl; 
      break;
    case CLFFT_NOTIMPLEMENTED: 
      std::cout << "not implemented" << std::endl; 
      break;
    case CLFFT_TRANSPOSED_NOTIMPLEMENTED: 
      std::cout << "transposed not implemented" << std::endl; 
      break;
    case CLFFT_FILE_NOT_FOUND: 
      std::cout << "file not found" << std::endl; 
      break;
    case CLFFT_FILE_CREATE_FAILURE: 
      std::cout << "file create failure" << std::endl; 
      break;
    case CLFFT_VERSION_MISMATCH: 
      std::cout << "version missmatch" << std::endl; 
      break;
    case CLFFT_INVALID_PLAN: 
      std::cout << "invalid plan" << std::endl; 
      break;
    case CLFFT_DEVICE_NO_DOUBLE: 
      std::cout << "no double" << std::endl; 
      break;
    case CLFFT_DEVICE_MISMATCH: 
      std::cout << "device missmatch" << std::endl; 
      break;
    case CLFFT_ENDSTATUS: 
      std::cout << "end status" << std::endl; 
      break;
    default: 
      std::cout << "other: " << err << std::endl;
      break;
    }
    //transpose array back if calculating dimension larger than 1
    //if (dim > 1)
    //	ocl_executeTranspose(mem_src, N, ndim, dim);
  }	
  if (ndim*n % 2 == 1) {
    m_oclbase->ocl_copyData(mem_src, mem_dst, size);
    mem_tmp = mem_src;
    mem_src = mem_dst;
    mem_dst = mem_tmp;
  }
  m_oclbase->ocl_freeMemory(mem_dst);
  return OCL_SUCCESS;
 }
 /*
   Run the "fft_batch3D" kernel on src once for every dimension from 1 to ndim.
   The kernel is launched with N/2 x N x N work items in work groups of N/2 x 1 x 1.
   Kernel arguments 1-3 are set with a size and a NULL pointer.
   forward is not passed to the kernel by this function.
   Return: OCL_SUCCESS, or the code returned by the first OpenCLBase call that fails
 */
 int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {

   cl_mem mem_src = (cl_mem)src;
   size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
   size_t work_group_size[3] = {(size_t)N/2, 1, 1};

   //without a kernel there is nothing to configure or launch
   int ierr = m_oclbase->ocl_createKernel("fft_batch3D");
   if (ierr != OCL_SUCCESS)
     return ierr;

   //set the arguments that stay the same for every dimension, stop at the first failure
   ierr = m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
   if (ierr != OCL_SUCCESS)
     return ierr;

   //transform one dimension per kernel launch
   for (int dim = 1; dim < ndim+1; dim++) {
     ierr = m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
     if (ierr != OCL_SUCCESS)
       return ierr;

     ierr = m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
     if (ierr != OCL_SUCCESS)
       return ierr;
   }

   return OCL_SUCCESS;
 }
 /*
   Run the "transpose" kernel with src used as both the first and the second kernel argument.
   The kernel is launched with N[0] x N[1] x 1 work items in a single work group of the
   same size. Nothing is done when ndim is 1. dim is not used by this function.
   Return: OCL_SUCCESS, or the code returned by the first OpenCLBase call that fails
 */
 int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {

   cl_mem mem_src = (cl_mem)src;

   if (ndim == 1)
     return OCL_SUCCESS;

   size_t work_items[3];
   work_items[0] = N[0];
   work_items[1] = N[1];
   work_items[2] = 1;

   size_t work_group_size[3];
   work_group_size[0] = N[0];
   work_group_size[1] = N[1];
   work_group_size[2] = 1;

   size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];

   //without a kernel there is nothing to configure or launch
   int ierr = m_oclbase->ocl_createKernel("transpose");
   if (ierr != OCL_SUCCESS)
     return ierr;

   //set the kernel arguments, stop at the first failure
   ierr = m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
   if (ierr == OCL_SUCCESS)
     ierr = m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
   if (ierr != OCL_SUCCESS)
     return ierr;

   ierr = m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
   if (ierr != OCL_SUCCESS)
     return ierr;

   return OCL_SUCCESS;
 }
 /*
--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@ -1,3 +1,14 @@
 /*
  Name: OpenCLFFT
  Author: Uldis Locans
  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
  Data: 19.09.2014
 */
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT
@ -9,25 +20,12 @@
 #include "../Algorithms/FFT.h"
 #include "OpenCLBase.h"
-#include "clFFT.h"
+class OpenCLFFT : public DKSFFT {
 /**
 * OpenCL FFT class based on BaseFFT interface.
 * Uses clFFT library to perform FFTs on AMD gpus.
 * clFFT library works also on nvida GPUs and other devices that
 * support OpenCL.
 */
 class OpenCLFFT : public BaseFFT {
 private:
  OpenCLBase *m_oclbase;
  clfftSetupData fftSetup;
  clfftPlanHandle planHandleZ2Z;
  clfftPlanHandle planHandleD2Z;
  clfftPlanHandle planHandleZ2D;
  /*
    Info: call fft kernels to execute FFT of the given domain,
    data - devevice memory ptr, cdim - current dim to transform, 
@ -44,31 +42,15 @@ private:
  */
  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
  /** Get clfftStatus and print the corresponding error message.
   *  clfftStatus is returned from all clFFT library functions, print error displays the
   *  corresponding error message. If "other" is printed then error code corresponds to 
   *  OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
   *  be checked to determine the reason for the error.
   */
  void printError(clfftStatus err);
 public:
  /* constructor - currently does nothing*/
  OpenCLFFT(OpenCLBase *base) {
    m_oclbase = base;
    /* Set up fft */
    cl_int err;
    err = clfftInitSetupData(&fftSetup);
    err = clfftSetup(&fftSetup);
    if (err != CL_SUCCESS)
      DEBUG_MSG("Error seting up clFFT");
  }
  /* destructor - currently does nothing*/
-  ~OpenCLFFT() { destroyFFT(); }
+  ~OpenCLFFT() { }
  /*
    Info: execute forward fft function with data set on device
@ -95,22 +77,35 @@ public:
    Info: set FFT size
    Return: success or error code
  */
-  int setupFFT(int ndim, int N[3]);
+  int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
-  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
-  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
-  int destroyFFT();
+  int destroyFFT() { return DKS_SUCCESS; }
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-		   int streamId = -1);
+				int streamId = -1)
    {
      return DKS_ERROR;
    }
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-		   int streamId = -1);
+				int streamId = -1)
-  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
+    {
-    return DKS_ERROR;
+      return DKS_ERROR;
-  }
+    }
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
    {
      return DKS_ERROR;
    }
  int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
  int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
  int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
  //void printData3DN4(cl_double2* &data, int N);
 };
--- a/src/OpenCL/OpenCLGreensFunction.cpp
+++ b/src/OpenCL/OpenCLGreensFunction.cpp
@ -1,181 +0,0 @@
 #include "OpenCLGreensFunction.h"
 #define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
 /** Use an existing OpenCLBase; it is not deleted by the destructor. */
 OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base)
   : base_create(false), m_base(base)
 {
 }
 /** Create a new OpenCLBase; base_create marks it for deletion in the destructor. */
 OpenCLGreensFunction::OpenCLGreensFunction()
   : base_create(true), m_base(new OpenCLBase())
 {
 }
 /** Delete m_base only when base_create is set. */
 OpenCLGreensFunction::~OpenCLGreensFunction() {
   //an OpenCLBase handed in from outside is left untouched
   if (!base_create)
     return;

   delete m_base;
 }
 /**
  * Build the kernel file path (OPENCL_KERNELS followed by GREENS_KERNEL) and pass it
  * to m_base->ocl_loadKernel().
  * Return: the value returned by ocl_loadKernel
  */
 int OpenCLGreensFunction::buildProgram() {
   //the path lives on the stack, the heap buffer used before was never released
   //NOTE(review): assumes ocl_loadKernel does not keep the pointer after it returns - confirm
   char kernel_file[500];
   kernel_file[0] = '\0';

   //bounded appends: an over-long path is truncated instead of overflowing the buffer
   strncat(kernel_file, OPENCL_KERNELS, sizeof(kernel_file) - 1);
   strncat(kernel_file, GREENS_KERNEL, sizeof(kernel_file) - strlen(kernel_file) - 1);

   return m_base->ocl_loadKernel(kernel_file);
 }
 /**
  * Launch the "kernelTmpgreen" kernel on the tmpgreen device buffer.
  * The kernel receives tmpgreen, hr_m0, hr_m1, hr_m2 and I, J, K. It is started with
  * I * J * K work items, rounded up to a multiple of the work group size of 128.
  * NI, NJ and streamId are not used by this function.
  * Return: DKS_SUCCESS, or the code returned by the first OpenCLBase call that fails
  */
 int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
                                          double hr_m0, double hr_m1, double hr_m2,
                                          int streamId)
 {
   //compile opencl program from source
   //NOTE(review): the result is not checked, presumably a failed build shows up as a
   //failure of ocl_createKernel below - confirm
   buildProgram();

   //cast the input data ptr to cl_mem
   cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;

   //set the work item size
   size_t work_size = 128;
   size_t work_items = I * J * K;
   if (work_items % work_size > 0)
     work_items = (work_items / work_size + 1) * work_size;

   //create kernel, there is nothing to launch if this fails
   int ierr = m_base->ocl_createKernel("kernelTmpgreen");
   if (ierr != DKS_SUCCESS)
     return ierr;

   //set kernel parameters, stop at the first one that is rejected
   ierr = m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(4, sizeof(int), &I);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(5, sizeof(int), &J);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(6, sizeof(int), &K);
   if (ierr != DKS_SUCCESS)
     return ierr;

   //execute kernel
   ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);

   return ierr;
 }
 /**
  * Launch the "kernelIntegration" kernel with the rho2_m and tmpgreen device buffers.
  * The kernel receives NI = 2*(I-1), NJ = 2*(J-1) and I, J, K. It is started with
  * I * J * K work items, rounded up to a multiple of the work group size of 128.
  * Before the launch ocl_fillMemory is called on rho2_m with the value 0.0 and a size
  * of 2*(I-1) * 2*(J-1) * 2*(K-1). streamId is not used by this function.
  * Return: DKS_SUCCESS, or the code returned by the first checked OpenCLBase call that fails
  */
 int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
                                                     int K, int streamId)
 {
   //compile opencl program from source
   //NOTE(review): the result is not checked, presumably a failed build shows up as a
   //failure of ocl_createKernel below - confirm
   buildProgram();

   //cast the input data ptr to cl_mem
   cl_mem rho2_ptr = (cl_mem)rho2_m;
   cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;

   int NI = 2*(I - 1);
   int NJ = 2*(J - 1);

   //set the work item size
   size_t work_size = 128;
   size_t work_items = I * J * K;
   if (work_items % work_size > 0)
     work_items = (work_items / work_size + 1) * work_size;

   //create kernel, there is nothing to launch if this fails
   int ierr = m_base->ocl_createKernel("kernelIntegration");
   if (ierr != DKS_SUCCESS)
     return ierr;

   //set kernel parameters, stop at the first one that is rejected
   ierr = m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(2, sizeof(int), &NI);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(4, sizeof(int), &I);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(5, sizeof(int), &J);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(6, sizeof(int), &K);
   if (ierr != DKS_SUCCESS)
     return ierr;

   //clear rho2_m and execute kernel
   //NOTE(review): the result of ocl_fillMemory is left unchecked as before - confirm its return type
   double zero = 0.0;
   int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
   m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
   ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);

   return ierr;
 }
 /**
  * Launch the "kernelMirroredRhoField" kernel on the rho2_m device buffer.
  * The kernel receives 2*I, 2*J, 2*K, then I+1, J+1, K+1 and the field size
  * ((I-1)*2) * ((J-1)*2) * ((K-1)*2). It is started with (I+1)*(J+1)*(K+1) work items,
  * rounded up to a multiple of the work group size of 128.
  * streamId is not used by this function.
  * Return: DKS_SUCCESS, or the code returned by the first OpenCLBase call that fails
  */
 int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
 {
   //compile opencl program from source
   //NOTE(review): the result is not checked, presumably a failed build shows up as a
   //failure of ocl_createKernel below - confirm
   buildProgram();

   //cast the input data ptr to cl_mem
   cl_mem rho2_ptr = (cl_mem)rho2_m;

   int NI = I + 1;
   int NJ = J + 1;
   int NK = K + 1;

   int I2 = 2*I;
   int J2 = 2*J;
   int K2 = 2*K;

   int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );

   //set the work item size
   size_t work_size = 128;
   size_t work_items = NI * NJ * NK;
   if (work_items % work_size > 0)
     work_items = (work_items / work_size + 1) * work_size;

   //create kernel, there is nothing to launch if this fails
   int ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
   if (ierr != DKS_SUCCESS)
     return ierr;

   //set kernel parameters, stop at the first one that is rejected
   ierr = m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(1, sizeof(int), &I2);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(2, sizeof(int), &J2);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(3, sizeof(int), &K2);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(4, sizeof(int), &NI);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(6, sizeof(int), &NK);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
   if (ierr != DKS_SUCCESS)
     return ierr;

   //execute kernel
   ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);

   return ierr;
 }
 /**
  * Launch the "multiplyComplexFields" kernel with the two device buffers ptr1 and ptr2
  * and the element count size. It is started with size work items, rounded up to a
  * multiple of the work group size of 128. streamId is not used by this function.
  * Return: DKS_SUCCESS, or the code returned by the first OpenCLBase call that fails
  */
 int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
 {
   //compile opencl program from source
   //NOTE(review): the result is not checked, presumably a failed build shows up as a
   //failure of ocl_createKernel below - confirm
   buildProgram();

   //cast the input data ptr to cl_mem
   cl_mem mem_ptr1 = (cl_mem) ptr1;
   cl_mem mem_ptr2 = (cl_mem) ptr2;

   //set the work item size
   size_t work_size = 128;
   size_t work_items = size;
   if (work_items % work_size > 0)
     work_items = (work_items / work_size + 1) * work_size;

   //create kernel, there is nothing to launch if this fails
   int ierr = m_base->ocl_createKernel("multiplyComplexFields");
   if (ierr != DKS_SUCCESS)
     return ierr;

   //set kernel parameters, stop at the first one that is rejected
   ierr = m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
   if (ierr == DKS_SUCCESS)
     ierr = m_base->ocl_setKernelArg(2, sizeof(int), &size);
   if (ierr != DKS_SUCCESS)
     return ierr;

   //execute kernel
   ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);

   return ierr;
 }
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@ -1,64 +0,0 @@
 #ifndef H_OPENCL_GREENSFUNCTION
 #define H_OPENCL_GREENSFUNCTION
 #include <iostream>
 #include <cmath>
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"
 /** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class OpenCLGreensFunction : public GreensFunction {
 private:
   // true when m_base was allocated by the default constructor and is deleted by the destructor
   bool base_create;
   // OpenCLBase used to load kernels, set kernel arguments and launch them
   OpenCLBase *m_base;
 public:
   /** Constructor with OpenCLBase argument; the given base is not deleted by the destructor. */
   OpenCLGreensFunction(OpenCLBase *base);
   /** Default constructor; creates its own OpenCLBase. */
   OpenCLGreensFunction();
   /** Destructor; deletes the OpenCLBase only if the default constructor created it. */
   ~OpenCLGreensFunction();
   /** Load OpenCL kernel file containing greens function kernels.
    *  m_base takes the kernel file and compiles the OpenCL program.
    */
   int buildProgram();
   /**
     Info: calc integral on device memory (taken from OPAL src code).
     Return: success or error code
   */
   int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
 		       double hr_m0, double hr_m1, double hr_m2, 
 		       int streamId = -1);
   /**
     Info: integration of rho2_m field (taken from OPAL src code).
     Return: success or error code
   */
   int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
   /**
     Info: mirror rho field (taken from OPAL src code).
     Return: success or error code
   */
   int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
   /**
     Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
     Return: success or error code
   */
   int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
 };
 #endif
--- a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@ -106,56 +106,6 @@ double ifld(double t, double alpha, double phi, double nu, double lambdaT, doubl
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }
 /* Returns 0.0 for negative lambda. Otherwise returns the sum of a (1-alpha) weighted
    exp(-rateL) term and an alpha weighted term with a gaussian exp(-0.5*sigma^2*t^2)
    envelope, where rateL is 1.0 for beta < 0.001 and (lambda*t)^beta otherwise.
    For nu < 0.01 the alpha term is evaluated without the cos/sin factors. */
 double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
   double wt = TWO_PI*nu*t;
   double rate2 = sigma*sigma*t*t;
   double rateL = 0.0;
   double result = 0.0;
   // negative lambda is rejected
   if (lambda < 0.0)
     return 0.0;
   if (beta < 0.001) {
     rateL = 1.0;
   } else {
     rateL = pow(lambda*t, beta);
   }
   if (nu < 0.01) {
     result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
   } else {
     // sigma^2*t^2/wt is written as sigma^2*t/(TWO_PI*nu): the same value for t != 0,
     // but without the 0/0 (NaN) that the original form produced at t == 0
     result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t/(TWO_PI*nu)*sin(wt))*exp(-0.5*rate2);
   }
   return result;
 }
 /* Returns 0.0 for negative lambda. Otherwise returns the sum of a (1-alpha) weighted
    exp(-rateL) term and an alpha weighted term with an exp(-a*t) envelope, where rateL
    is 1.0 for beta < 0.001 and (lambda*t)^beta otherwise.
    For nu < 0.01 the alpha term is evaluated without the cos/sin factors. */
 double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
   // negative lambda is rejected
   if (lambda < 0.0)
     return 0.0;

   double rateL = (beta < 0.001) ? 1.0 : pow(lambda*t, beta);
   double at = a*t;

   if (nu < 0.01)
     return (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);

   double wt = TWO_PI*nu*t;
   return (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
 }
 /* Returns bessj0 evaluated at TWO_PI*nu*t + DEG_TO_RAD*phi. */
 double b(double t, double phi, double nu) {
   double arg = TWO_PI*nu*t + DEG_TO_RAD*phi;
   return bessj0(arg);
 }
--- a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@ -1,4 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #pragma OPENCL EXTENSION 
 /******Random numbers********/
@ -87,14 +89,13 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
  if (id < N) {
    RNDState tmp;
-    int tmp_seed = 2*id;// * 0x100000000ULL;
+    int tmp_seed = id;// * 0x100000000ULL;
    tmp.s10 = 12345 + tmp_seed;
    tmp.s11 = 12345 + tmp_seed;
-    tmp.s12 = 12345 + tmp_seed;
+    tmp.s12 = 123 + tmp_seed;
    tmp.s20 = 12345 + tmp_seed;
    tmp.s21 = 12345 + tmp_seed;
-    tmp.s22 = 12345 + tmp_seed;
+    tmp.s22 = 123 + tmp_seed;
    tmp.z = 0;
    tmp.gen = true;
@ -104,19 +105,6 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
 }
 /* store one value returned by rand_uniform per generator state in data */
 __kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
   int gid = get_global_id(0);

   //work items with an index of size or above do nothing
   if (gid >= size)
     return;

   //advance a private copy of the state and write it back afterwards
   RNDState rng = states[gid];
   data[gid] = rand_uniform(&rng);
   states[gid] = rng;
 }
 /**********Degrader**********/
 enum PARAMS { POSITION, 
--- a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
@ -1,170 +0,0 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 /**
  * Compute the Green's integral analytically, one grid point per work-item.
  *
  * tmpgreen          output array of NI*NJ*NK values, index = i + j*NI + k*NI*NJ
  * hr_m0/1/2         cell sizes along the three axes
  * NI, NJ, NK        grid extents
  *
  * Work-items whose global id is outside the grid do nothing.
  */
 __kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
                              int NI, int NJ, int NK)
 {
  int id = get_global_id(0);
  if (id < NI * NJ * NK) {
    // unpack the linear id into (i, j, k) with i running fastest
    int i = id % NI;
    int k = id / (NI * NJ);
    int j = (id - k * NI * NJ) / NI;
    double cellVolume = hr_m0 * hr_m1 * hr_m2;
    // evaluation point: grid coordinate shifted back by half a cell
    double vv0 = i * hr_m0 - hr_m0 / 2;
    double vv1 = j * hr_m1 - hr_m1 / 2;
    double vv2 = k * hr_m2 - hr_m2 / 2;
    double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
    // atan contributions (halved below), followed by the log contributions
    double tmpgrn  = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
    tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
    tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
    tmpgrn = tmpgrn / 2;
    tmpgrn += vv1 * vv2 * log(vv0 + r);
    tmpgrn += vv0 * vv2 * log(vv1 + r);
    tmpgrn += vv0 * vv1 * log(vv2 + r);
    tmpgreen[id] = tmpgrn / cellVolume;
  }
 }
 /**
  * Perform the actual integration: combine the eight corner values of
  * tmpgreen around each grid point into rho2_m.
  *
  * rho2_m                   output, written at i + j*NI + k*NI*NJ
  * tmpgreen                 input of NI_tmp*NJ_tmp*NK_tmp values, index = i + j*NI_tmp + k*NI_tmp*NJ_tmp
  * NI, NJ                   row/plane strides of rho2_m
  * NI_tmp, NJ_tmp, NK_tmp   extents of tmpgreen
  *
  * Corners that would fall outside tmpgreen contribute zero.
  */
 __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
                                 int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
 {
  int id = get_global_id(0);
  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    // unpack the linear id into (i, j, k) with i running fastest
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
    double tmp0 = 0, tmp1 = 0, tmp2 = 0, tmp3 = 0;
    double tmp4 = 0, tmp5 = 0, tmp6 = 0, tmp7 = 0;
    if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
      tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
    if (i+1 < NI_tmp)
      tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
    if (j+1 < NJ_tmp)
      tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
    if (k+1 < NK_tmp)
      tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
    if (i+1 < NI_tmp && j+1 < NJ_tmp)
      tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
    if (i+1 < NI_tmp && k+1 < NK_tmp)
      tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
    if (j+1 < NJ_tmp && k+1 < NK_tmp)
      tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
    // the point itself is always inside the grid
    tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
    // the destination uses the (NI, NJ) strides, not the tmpgreen strides
    rho2_m[i + j*NI +  k*NI*NJ] = tmp_rho;
  }
 }
 /**
  * Mirror rho-field, first step: overwrite element 0 of rho2_m with the
  * element at offset NI*NJ. Every work-item that runs this kernel performs
  * the same copy.
  */
 __kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
  const int source = NI*NJ;
  rho2_m[0] = rho2_m[source];
 }
 /**
  * Mirror the computed part of rho2_m into the rest of the array.
  *
  * rho2_m                   field of NI x NJ x NK values, index = i + j*NI + k*NI*NJ
  * NI, NJ, NK               extents of the full field
  * NI_tmp, NJ_tmp, NK_tmp   extents of the region that is used as the source
  * size                     number of elements in rho2_m; indices >= size are skipped
  *
  * Each work-item reads the value at (i, j, k) and copies it to the up to
  * seven reflected positions (NI-i, NJ-j, NK-k in every combination). A
  * coordinate equal to 0 is not reflected along that axis.
  */
 __kernel void kernelMirroredRhoField(__global double *rho2_m,
                                      int NI, int NJ, int NK,
                                      int NI_tmp, int NJ_tmp, int NK_tmp,
                                      int size)
 {
  int id = get_global_id(0);
  if (id == 0)
   rho2_m[0] = rho2_m[NI * NJ];
  // NOTE(review): barrier() synchronizes only the work-items of one work-group,
  // so other work-groups are not guaranteed to see the write above -- confirm
  // this is acceptable or run kernelMirroredRhoField0 as a separate launch.
  barrier(CLK_GLOBAL_MEM_FENCE);
  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    // unpack the linear id into (i, j, k) with i running fastest
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
    // reflected coordinates
    int ri = NI - i;
    int rj = NJ - j;
    int rk = NK - k;
    int id1 = k * NI * NJ + j * NI + i;
    int id2 = k * NI * NJ + j * NI + ri;
    int id3 = k * NI * NJ + rj * NI + i;
    int id4 = k * NI * NJ + rj * NI + ri;
    int id5 = rk * NI * NJ + j * NI + i;
    int id6 = rk * NI * NJ + j * NI + ri;
    int id7 = rk * NI * NJ + rj * NI + i;
    int id8 = rk * NI * NJ + rj * NI + ri;
    double data = 0.0;
    if (id1 < size)
      data = rho2_m[id1];
    if (i != 0 && id2 < size) rho2_m[id2] = data;
    if (j != 0 && id3 < size) rho2_m[id3] = data;
    if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
    if (k != 0 && id5 < size) rho2_m[id5] = data;
    if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
    if (k != 0 && j != 0 && id7 < size) rho2_m[id7] = data;
    // logical && throughout; the original mixed in a bitwise & here
    if (k != 0 && j != 0 && i != 0 && id8 < size) rho2_m[id8] = data;
  }
 }
 /**
  * Multiply two complex numbers stored as double2 (x = real part, y = imaginary part):
  * (a.x + i a.y) * (b.x + i b.y).
  */
 double2 ComplexMul(double2 a, double2 b) {
  return (double2)(a.x * b.x - a.y * b.y,
                   a.x * b.y + a.y * b.x);
 }
 /**
  * Element-wise complex product of two fields, stored in place in ptr1:
  * ptr1[n] = ptr1[n] * ptr2[n] for every n < size. One element per work-item;
  * work-items beyond size do nothing.
  */
 __kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
                                     int size)
 {
  const int gid = get_global_id(0);
  if (gid >= size)
    return;
  ptr1[gid] = ComplexMul(ptr1[gid], ptr2[gid]);
 }
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@ -5,10 +5,6 @@
 #include <string>
 #include <sys/time.h>
 /**
 * Custom timer class.
 * Allows inserting timers in the code to measure function execution times.
 */
 class DKSTimer {
 private:
@ -21,45 +17,39 @@ private:
 public:
-  /** Init DKSTimer by seting timer to zero. */
+  /** Init DKSTimer by seting timer to zero  */
  DKSTimer();
  ~DKSTimer();
-  /** 
+  /** Init the timer
-   * Init the timer.
+   *  Set the name for timer and clear all values
   * Set the name for timer and clear all values
   */
  void init(std::string n);
-  /** 
+  /** Start the timer.
-   * Start the timer.
+   *  Get the curret time with gettimeofday and save in timeStart
   * Get the curret time with gettimeofday and save in timeStart
   */
  void start();
-  /** 
+  /** Stop the timer 
-   * Stop the timer.
+   *  Get the curretn time with gettimeofday and save in timeEnd
-   * Get the curretn time with gettimeofday and save in timeEnd
+   *  Calculate elapsed time by timeEnd - timeStart and add to timervalue
   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
   */
  void stop();
-  /** 
+  /** Reset timervalue to zero.
-   * Reset timervalue to zero.
+   *  Set timervalue, timeStart and timeEnd to zero
   * Set timervalue, timeStart and timeEnd to zero
   */
  void reset();
-  /** 
+  /** Return elapsed time in seconds.
-   * Return elapsed time in seconds.
+   *  Return the value of timervalue
   * Return the value of timervalue
   */
  double gettime();
-  /** 
+  /** Print timer.
-   * Print timer.
+   *  Print the elapsed time of the timer
   * Print the elapsed time of the timer
   */
  void print();
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #ADD_EXECUTABLE(testFFT testFFT.cpp)
 #ADD_EXECUTABLE(testMIC testMIC.cpp)
 #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
-ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
+#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
-ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
+#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
 #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
 #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,11 +22,10 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 #ADD_EXECUTABLE(testGather testGather.cpp)
 #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
 #ADD_EXECUTABLE(testTranspose testTranspose.cpp)
 ADD_EXECUTABLE(testRandom testRandom.cpp)
 ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
-ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
+#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
 #ADD_EXECUTABLE(testPush testPush.cpp)
-ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
 #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -39,8 +38,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #TARGET_LINK_LIBRARIES(testFFT dks)
 #TARGET_LINK_LIBRARIES(testMIC dks)
 #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
-TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFT3D dks)
-TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
 #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -54,11 +53,10 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testGather dks)
 #TARGET_LINK_LIBRARIES(testGatherAsync dks)
 #TARGET_LINK_LIBRARIES(testTranspose dks)
-TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
-TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
 TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testPush dks)
-TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
+#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
 #TARGET_LINK_LIBRARIES(testIntegration dks)
 #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
@ -83,4 +81,4 @@ TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
 #IF (NOT CUDA_VERSION VERSION_LESS "7.0")
  #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
  #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
-#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
+#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
--- a/test/testCollimatorPhysicsSoA.cpp
+++ b/test/testCollimatorPhysicsSoA.cpp
@ -129,9 +129,7 @@ int main(int argc, char *argv[]) {
  //init random
  base.callInitRandoms(numpart);
  //**test collimator physics and sort***//
  void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
  //allocate memory for particles
@ -212,8 +210,8 @@ int main(int argc, char *argv[]) {
  base.freeMemory<double>(pz_ptr, numpart);
  base.freeMemory<double>(param_ptr, 12);
-  
+
-    /*  
+  /*  
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++) {
    std::cout <<  p.label[i] << "\t" << p.rx[i] 
--- a/test/testFFT3D.cpp
+++ b/test/testFFT3D.cpp
@ -1,7 +1,6 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
 #include <string>
 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -19,30 +18,22 @@ int main(int argc, char *argv[]) {
  int N = 16;
  char *api_name = new char[10];
  char *device_name = new char[10];
-
+  if (argc == 2) {
-  for (int i = 1; i < argc; i++) {
+    N = atoi(argv[1]);
-    if (argv[i] == string("-cuda")) {
+    strcpy(api_name, "Cuda");
-      strcpy(api_name, "Cuda");
+    strcpy(device_name, "-gpu");
-      strcpy(device_name, "-gpu");
+  } else if (argc == 3) {
-    } 
+    N = atoi(argv[1]);
-
+    strcpy(api_name, argv[2]);
-    if (argv[i] == string("-opencl")) {
+    strcpy(device_name, "-gpu");
-      strcpy(api_name, "OpenCL");
+  } else if (argc == 4) {
-      strcpy(device_name, "-gpu");
+    N = atoi(argv[1]);
-    } 
+    strcpy(api_name, argv[2]);
-
+    strcpy(device_name, argv[3]);
-    if (argv[i] == string("-mic")) {
+  } else {
-      strcpy(api_name, "OpenMP");
+    N = 16;
-      strcpy(device_name, "-mic");
+    strcpy(api_name, "OpenCL");
-    } 
+    strcpy(device_name, "-gpu");
    if (argv[i] == string("-cpu")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
    }
    if (argv[i] == string("-N"))
      N = atoi(argv[i+1]);
  }
  cout << "Use api: " << api_name << ", " << device_name << endl;
@ -83,16 +74,9 @@ int main(int argc, char *argv[]) {
  /* write data to device */	
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
  if (N < 5)
    printData3DN4(cdata, N, 3);
  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);
  if (N < 5) {
    base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
    printData3DN4(cfft, N, 3);
  }
  /* execute ifft */	
  base.callIFFT(mem_ptr, 3, dimsize);
@ -102,9 +86,7 @@ int main(int argc, char *argv[]) {
  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
-  if (N < 5)
+	
    printData3DN4(cifft, N, 3);	
  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
@ -148,7 +130,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
 	if (a < 10e-5 && a > -10e-5)
 	  a = 0;
-	cout << "(" << d << "," << a << ") ";
+	cout << d << "; " << a << "\t";
      }
    }
    cout << endl;
@ -175,5 +157,3 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
 }
--- a/test/testFFT3DRC.cpp
+++ b/test/testFFT3DRC.cpp
@ -1,8 +1,6 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
 #include <fstream>
 #include <iomanip>
 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -10,53 +8,54 @@
 using namespace std;
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
-void initData(double *data, int dimsize[3], int dim);
+void initData(double *data, int dimsize[3]);
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, 
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
 		char *api_name, char *device_name, char *file_name);
 void printHelp();
 void printData3DN4(complex<double>* &data, int N, int dim);
 void printData3DN4(double* &data, int N, int dim);
 /*
  * Hook applied to every value before it is written out. Currently an
  * identity: a cutoff that mapped values below 1e-10 to 0.0 used to live
  * here and is disabled.
  */
 double precision(double a) {
  const double value = a;
  return value;
 }
 int main(int argc, char *argv[]) {
  int N1 = 8;
  int N2 = 8;
  int N3 = 8;
  int dim = 3;
-  int loop = 0;
+  int loop = 10;
  char *api_name = new char[10];
  char *device_name = new char[10];
  char *file_name = new char[50];
-  if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
+  if ( readParams(argc, argv, N1, N2, N3, loop) )
    return 0;
-  cout << "Use api: " << api_name << ", " << device_name << endl;
+  int dimsize[3] = {N3, N2, N1};
  int dimsize[3] = {N1, N2, N3};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
  double *rdata = new double[sizereal];
  double *outdata = new double[sizereal];
  complex<double> *cfft = new complex<double>[sizecomp];
-  initData(rdata, dimsize, dim);
+
  for (int i=0; i<sizecomp; ++i) {
    cfft[i].real() = 7.;
    cfft[i].imag() = 3.33;
  }
  initData(rdata, dimsize);
  /* init DKSBase */
  cout << "Init device and set function" << endl;
 #ifdef DKS_MIC
  DKSBase base;
-  base.setAPI(api_name, strlen(api_name));
+  base.setAPI("OpenMP", 6);
-  base.setDevice(device_name, strlen(device_name));
+  base.setDevice("-mic", 4);
  base.initDevice();
  base.setupFFTRC(dim, dimsize);
  /* setup backward fft (COMPLEX->REAL) */
  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
 #endif
 #ifdef DKS_CUDA
  DKSBase base;
  base.setAPI("Cuda", 4);
  base.setDevice("-gpu", 4);
  base.initDevice();
  base.setupFFT(dim, dimsize);
 #endif
  // allocate memory on device
  int ierr;
@ -68,59 +67,69 @@ int main(int argc, char *argv[]) {
  // execute one run before starting the timers
  base.writeData<double>(real_ptr, rdata, sizereal);
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
  base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
  base.readData<double>(real_res_ptr, outdata, sizereal);
  //timer for total loop time, FFT and IFFT calls
  struct timeval timeStart, timeEnd;
  struct timeval timeFFTStart[loop], timeFFTEnd[loop];
  struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
  gettimeofday(&timeStart, NULL);
  for (int i=0; i<loop; ++i){
    // write data to device
    base.writeData<double>(real_ptr, rdata, sizereal);
    // execute rcfft
    gettimeofday(&timeFFTStart[i], NULL);
    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&timeFFTEnd[i], NULL);
    // execute crfft
    gettimeofday(&timeIFFTStart[i], NULL);
    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
    gettimeofday(&timeIFFTEnd[i], NULL);
    //normalize
 #ifdef DKS_CUDA
    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
 #endif
    // read IFFT data from device
    base.readData<double>(real_res_ptr, outdata, sizereal);
  ofstream myfile;
  myfile.open(file_name);
  myfile<< "in\tout\treal\timag\n";
  for (int i = 0; i < sizereal; i++) {
    //myfile << precision(rdata[i]) << "\t";
    //myfile << precision(outdata[i]) << "\t";
    if (i < sizecomp) {
      myfile << precision(cfft[i].real()) << "\t";
      myfile << precision(cfft[i].imag());
    }
    myfile << "\n";
  }
-  myfile.close();
+  gettimeofday(&timeEnd, NULL);
 /*
  if (dim == 2) {
    for (int i = 0; i < N2; i++) {
      for (int j = 0; j < N1; j++) {
 	cout << rdata[i*N1 + j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }
  if (dim == 2) {
    for (int i = 0; i < N2; i++) {
      for (int j = 0; j < N1 / 2 + 1; j++) {
 	cout << cfft[i*(N1 / 2 + 1)  + j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }
 */
  // free device memory
  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);
  base.freeMemory<double>(real_res_ptr, sizereal);
  // compare in and out data to see if we get back the same results
  cout << "comp" << endl;
  compareData(rdata, outdata, N1, N2, N3, dim);
-  cout << "done" << endl;
+
  //calculate seconds for total time and fft times
  double tfft = 0;
  double tifft = 0;
  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + 
 		  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
  for (int i = 0; i < loop; i++) {
    tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + 
 	      (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
    tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + 
 	      (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
  }
  //print timing results
  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
 	    << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
 	    << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
 	    << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
 	    << "\n\n";
  return 0;
 }
@ -128,10 +137,10 @@ int main(int argc, char *argv[]) {
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  int id;
  double sum = 0;
-  for (int i = 0; i < NK; i++) {
+  for (int i = 0; i < NI; i++) {
    for (int j = 0; j < NJ; j++) {
-      for (int k = 0; k < NI; k++) {
+      for (int k = 0; k < NK; k++) {
-	id = i*NI*NJ + j*NI + k;
+	id = k*NI*NJ + j*NI + i;
 	sum += fabs(data1[id] - data2[id]);
      }
    }
@ -139,21 +148,13 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
  std::cout << "RC <--> CR diff: " << sum << std::endl;
 }
-void initData(double *data, int dimsize[3], int dim) {
+void initData(double *data, int dimsize[3]) {
-  if (dim == 3) {
+  for (int i = 0; i < dimsize[2]; i++) {
    for (int i = 0; i < dimsize[2]; i++)
      for (int j = 0; j < dimsize[1]; j++) 
 	for (int k = 0; k < dimsize[0]; k++) 
 	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
  } else if (dim == 2) {
    for (int j = 0; j < dimsize[1]; j++) {
      for (int k = 0; k < dimsize[0]; k++) {
-	data[j*dimsize[0] + k] = sin(k);
+	data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
      }
    }
  } else {
    for (int k = 0; k < dimsize[0]; k++) 
      data[k] = sin(k);
  }
 }
@ -172,17 +173,10 @@ void printHelp() {
  std::cout << std::endl;
 }
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
 		char *api_name, char *device_name, char *file_name) 
 {
  for (int i = 1; i < argc; i++) {
    if ( argv[i] == std::string("-dim")) {
      dim = atoi(argv[i + 1]);
      i++;
    }
    if ( argv[i] == std::string("-grid") ) {
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
@ -199,72 +193,7 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, in
      printHelp();
      return true;
    }
    if (argv[i] == string("-cuda")) {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
      strcpy(file_name, "cuda_fft.dat");
    } 
    if (argv[i] == string("-opencl")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
      strcpy(file_name, "opencl_fft.dat");
    } 
    if (argv[i] == string("-mic")) {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
      strcpy(file_name, "openmp_fft.dat");
    } 
    if (argv[i] == string("-cpu")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
      strcpy(file_name, "opencl_cpu_fft.dat");
    }
  }
  return false;
 }
 /*
  * Print a complex N x N x (N/2+1) slab to stdout as "(re,im) " pairs.
  * One output line is written per j (middle index); each line holds all i
  * (slow index) and k (fast index) entries read from data[i*N*N + j*N + k].
  * Components with magnitude below 10e-5 are printed as 0. dim is unused.
  */
 void printData3DN4(complex<double>* &data, int N, int dim) {
  const int half = N/2 + 1;
  for (int row = 0; row < N; row++) {
    for (int plane = 0; plane < N; plane++) {
      for (int col = 0; col < half; col++) {
 	const int idx = plane*N*N + row*N + col;
 	double re = data[idx].real();
 	double im = data[idx].imag();
 	// flush numerical noise to exactly zero for readable output
 	re = (re < 10e-5 && re > -10e-5) ? 0 : re;
 	im = (im < 10e-5 && im > -10e-5) ? 0 : im;
 	cout << "(" << re << "," << im << ") ";
      }
    }
    cout << endl;
  }
  cout << endl;
 }
 /*
  * Print a real N x N x N cube to stdout, values separated by a space.
  * One output line is written per j (middle index); each line holds all i
  * (slow index) and k (fast index) entries read from data[i*N*N + j*N + k].
  * Values with magnitude below 10e-5 are printed as 0. dim is unused.
  */
 void printData3DN4(double* &data, int N, int dim) {
  for (int row = 0; row < N; row++) {
    for (int plane = 0; plane < N; plane++) {
      for (int col = 0; col < N; col++) {
 	double value = data[plane*N*N + row*N + col];
 	// flush numerical noise to exactly zero for readable output
 	value = (value < 10e-5 && value > -10e-5) ? 0 : value;
 	cout << value << " ";
      }
    }
    cout << endl;
  }
  cout << endl;
 }
--- a/test/testFFTSolver_MIC.cpp
+++ b/test/testFFTSolver_MIC.cpp
@ -1,4 +1,5 @@
 #include <iostream>
 //#include <mpi.h>
 #include <string.h>
 #include "DKSBase.h"
@ -10,265 +11,309 @@ using namespace std;
 void printData3D(double* data, int N, int NI, const char *message = "") {
-  if (strcmp(message, "") != 0)
+	if (strcmp(message, "") != 0)
-    cout << message;
+		cout << message;
-  for (int i = 0; i < NI; i++) {
+	for (int i = 0; i < NI; i++) {
-    for (int j = 0; j < N; j++) {
+		for (int j = 0; j < N; j++) {
-      for (int k = 0; k < N; k++) {
+			for (int k = 0; k < N; k++) {
-	cout << data[i*N*N + j*N + k] << "\t";
+				cout << data[i*N*N + j*N + k] << "\t";
-      }
+			}
-      cout << endl;
+			cout << endl;
-    }
+		}
-    cout << endl;
+		cout << endl;
-  }
+	}
 }
 void initData(double *data, int N) {
-  for (int i = 0; i < N/4 + 1; i++) {
+	for (int i = 0; i < N/4 + 1; i++) {
-    for (int j = 0; j < N/2 + 1; j++) {
+		for (int j = 0; j < N/2 + 1; j++) {
-      for (int k = 0; k < N/2 + 1; k++) {
+			for (int k = 0; k < N/2 + 1; k++) {
-	data[i*N*N + j*N + k] = k+1;
+				data[i*N*N + j*N + k] = k+1;
-      }
+			}
-    }
+		}
-  }
+	}
 }
 void initData2(double *data, int N) {
-  for (int i = 0; i < N; i++)
+	for (int i = 0; i < N; i++)
-    data[i] = i;
+		data[i] = i;
 }
 void initComplex( complex<double> *d, int N) {
-  for (int i = 0; i < N; i++) {
+	for (int i = 0; i < N; i++) {
-    d[i] = complex<double>(2, 0);
+		d[i] = complex<double>(2, 0);
-  }
+	}
 }
 void printComplex(complex<double> *d, int N) {
-  for (int i = 0; i < N; i++)
+	for (int i = 0; i < N; i++)
-    cout << d[i] << "\t";
+		cout << d[i] << "\t";
-  cout << endl;
+	cout << endl;
 }
 /* Print the first N values of d on one line, each followed by ", ". */
 void printDouble(double *d, int N) {
  int pos = 0;
  while (pos < N) {
    cout << d[pos] << ", ";
    ++pos;
  }
  cout << endl;
 }
 void initMirror(double *data, int n1, int n2, int n3) {
-  int d = 1;
+	int d = 1;
-  for (int i = 0; i < n3; i++) {
+	for (int i = 0; i < n3; i++) {
-    for (int j = 0; j < n2; j++) {
+		for (int j = 0; j < n2; j++) {
-      for (int k = 0; k < n1; k++) {
+			for (int k = 0; k < n1; k++) {
-	if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
+				if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
-	  data[i * n2 * n1 + j * n1 + k] = d++;
+					data[i * n2 * n1 + j * n1 + k] = d++;
-	else
+				else
-	  data[i * n2 * n1 + j * n1 + k] = 0;
+					data[i * n2 * n1 + j * n1 + k] = 0;
-      }
+			}
-    }
+		}
-  }
+	}
 }
 void printDiv(int c) {
-  for (int i = 0; i < c; i++)
+	for (int i = 0; i < c; i++)
-    cout << "-";
+		cout << "-";
-  cout << endl;
+	cout << endl;
 }
 void printMirror(double *data, int n1, int n2, int n3) {
-  printDiv(75);
+	printDiv(75);
-  for (int i = 0; i < n3; i++) {
+	for (int i = 0; i < n3; i++) {
-    for (int j = 0; j < n2; j++) {
+		for (int j = 0; j < n2; j++) {
-      for (int k = 0; k < n1; k++) {
+			for (int k = 0; k < n1; k++) {
-	cout << data[i * n2 * n1 + j * n1 + k] << "\t";
+				cout << data[i * n2 * n1 + j * n1 + k] << "\t";
-      }
+			}
-      cout << endl;
+			cout << endl;
-    }
+		}
-    cout << endl;
+		cout << endl;
-  }
+	}
-  cout << endl;
+	cout << endl;
 }
 double sumData(double *data, int datasize) {
-  double sum = 0;
+	double sum = 0;
-  for (int i = 0; i < datasize; i++)
+	for (int i = 0; i < datasize; i++)
-    sum += data[i];
+		sum += data[i];
-  return sum;
+	return sum;
 }
 int main(int argc, char *argv[]) {
-  char *api_name = new char[10];
+	/* mpi init */
-  char *device_name = new char[10];
+	//int rank, nprocs;
 	//MPI_Init(&argc, &argv);
 	//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-  for (int i = 1; i < argc; i++) {
+	/*
-    if (argv[i] == string("-cuda")) {
+	   if (nprocs != 8) {
-      strcpy(api_name, "Cuda");
+	   cout << "example was set to run with 8 processes" << endl;
-      strcpy(device_name, "-gpu");
+	   cout << "exit..." << endl;
-    } 
+	   return 0;
 	   }
 	   */
-    if (argv[i] == string("-opencl")) {
+	/* set domain size */
-      strcpy(api_name, "OpenCL");
+	int NG[3] = {64, 64, 32};
-      strcpy(device_name, "-gpu");
+	int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
-    } 
+	int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
 	int sizerho = NG[0] * NG[1] * NG[2];
 	int sizegreen = ng[0] * ng[1] * ng[2];
 	int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
 	int id[3];
-    if (argv[i] == string("-mic")) {
+	//id[0] = 0;
-      strcpy(api_name, "OpenMP");
+	//id[1] = NL[1] * (rank % 4);
-      strcpy(device_name, "-mic");
+	//id[2] = NL[2] * (rank / 4);
    } 
-    if (argv[i] == string("-cpu")) {
+	/* print some messages bout the example in the begginig */
-      strcpy(api_name, "OpenCL");
+	cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
-      strcpy(device_name, "-cpu");
+	//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
-    }
+	cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
-  }
+	//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
 	int tmp[3];
 	/*  for (int p = 1; p < nprocs; p++) {
 		MPI_Status mpistatus;
 		MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
 		cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
 		}*/
 	// } else {
 	//   MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
 	// }
-  cout << "Use api: " << api_name << ", " << device_name << endl;
+	/* dks init and create 2 streams */
 	int dkserr;
 	//int streamGreens, streamFFT;
 #ifdef DKS_MIC
 	DKSBase base;
 	base.setAPI("OpenMP", 6);
 	base.setDevice("-mic", 4);
 	base.initDevice();
 #endif
-  /* set domain size */
+#ifdef DKS_CUDA
-  int NG[3] = {64, 64, 32};
+	DKSBase base;
-  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+	base.setAPI("Cuda", 4);
-  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+	base.setDevice("-gpu", 4);
-  int sizerho = NG[0] * NG[1] * NG[2];
+	base.initDevice();
-  int sizegreen = ng[0] * ng[1] * ng[2];
+#endif
  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
-  /* print some messages bout the example in the begginig */
+	//base.createStream(streamFFT);
-  cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+	//if (rank == 0) {
-  cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+	//  base.createStream(streamGreens);
 	base.setupFFT(3, NG);
 	//}
-  /* dks init and create 2 streams */
+	/* allocate memory and init rho field */
-  int dkserr;
+	double *rho = new double[sizerho];
-  DKSBase base;
+	double *rho_out = new double[sizerho];
-  base.setAPI(api_name, strlen(api_name));
+	//double *green_out = new double[sizegreen];
-  base.setDevice(device_name, strlen(device_name));
+	initMirror(rho, NL[0], NL[1], NL[2]);
  base.initDevice();
  base.setupFFT(3, NG);
-  /* allocate memory and init rho field */
+	/*
-  double *rho = new double[sizerho];
+	   allocate memory on device for 
-  double *rho_out = new double[sizerho];
+	   - rho field
-  //double *green_out = new double[sizegreen];
+	   - rho FFT
-  double *mirror_out = new double[sizerho];
+	   - tmpgreen
-  //initMirror(rho, NL[0], NL[1], NL[2]);
+	   - greens integral
-  initMirror(rho, NG[0], NG[1], NG[2]);
+	   - greens integral FFT
 	   */
 	void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
 	// if (rank == 0) {
 	tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
 	rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
 	grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
 	rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
 	grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
 	/* } else {
 	   grntr_ptr = NULL;
 	   rho2_ptr = NULL;
 	   grn_ptr = NULL;
 	   rho2tr_ptr = NULL;
 	   tmpgreen_ptr = NULL;
 	   }*/
  /*
    allocate memory on device for 
    - rho field
    - rho FFT
    - tmpgreen
    - greens integral
    - greens integral FFT
  */
  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
  tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
  rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
  grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
  rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
  grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-  /* =================================================*/
+	/* send and receive pointer to allocated memory on device */
-  /* =================================================*/
+	/*
-  /* =====loop trough fftpoison solver iterations=====*/
+	   if (rank == 0) {
-  /* =================================================*/
+	   for (int p = 1; p < nprocs; p++)
-  /* =================================================*/
+	   base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
 	   } else {
 	   rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
 	   }
 	   MPI_Barrier(MPI_COMM_WORLD);
 	   */
  double old_sum = 0;
-  int hr_m[3] = {1, 1, 1};
+	/* =================================================*/
-  base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
+	/* =================================================*/
 	/* =====loop trough fftpoison solver iterations=====*/
 	/* =================================================*/
 	/* =================================================*/
-  /* calculate greens integral on gpu */
+	double old_sum = 0;
-  base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+	double tmp_sum = 0;
 	for (int l = 0; l < 100; l++) {
 		//MPI_Barrier(MPI_COMM_WORLD);
 		/* on node 0, calculate tmpgreen on gpu */
 		int hr_m[3] = {1, 1, 1};
 		//if (rank == 0)
 		base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
 				hr_m[0], hr_m[1], hr_m[2]);
-  /* mirror the field */
+		/* calculate greens integral on gpu */
-  base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+		//if (rank == 0)
-  /*
+		base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
  base.readData<double>(grn_ptr, mirror_out, sizerho);
  for (int i = 0; i < sizerho; i++)
    cout << mirror_out[i] << " ";
  cout << endl << endl;
-  for (int i = 0; i < sizerho; i++)
+		/* mirror the field */
-    cout << rho[i] << " ";
+		//if (rank == 0)
-  cout << endl << endl;
+		base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
  */
  /* transfer rho field to device */
  base.writeData<double>(rho2_ptr, rho, sizerho);
  /* get FFT of rho field */
  base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-  /* get FFT of mirrored greens integral */
+		/* get FFT of mirrored greens integral */
-  base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+		//if (rank == 0) 
 		base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
-  /* multiply both FFTs */
+		/* transfer rho field to device */
-  base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+		//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
 		base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
 		//MPI_Barrier(MPI_COMM_WORLD);
-  /*
+		/* get FFT of rho field */
-  complex<double> *crho = new complex<double>[sizecomp];
+		//if (rank == 0) {
-  complex<double> *cgre = new complex<double>[sizecomp];
+		//base.syncDevice();
-  base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
+		base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-  base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
+		//}
-  for (int i = 0; i < sizecomp; i++)
+		/* multiply both FFTs */
-    cout << cgre[i].real() << " ";
+		//if (rank == 0)
-  cout << endl << endl;
+		base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
 		//MPI_Barrier(MPI_COMM_WORLD);
-  for (int i = 0; i < sizecomp; i++)
+		/* inverse fft and transfer data back */
-    cout << crho[i].real() << " ";
+		/* 
-  cout << endl << endl;
+		   multiple device syncs and mpi barriers are used to make sure data 
-  
+		   transfer is started when results are ready and progam moves on 
-  delete[] crho;
+		   only when data transfer is finished
-  delete[] cgre;
+		   */
-  */
+		//if (rank == 0) {
 		base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
 		//base.syncDevice();
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
 		base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//base.syncDevice();
 		//MPI_Barrier(MPI_COMM_WORLD);
 		//cout << "result: " << sumData(rho_out, sizerho) << endl;
 		if (l == 0) { 
 			old_sum = sumData(rho_out, sizerho);
 		} else {
 			tmp_sum = sumData(rho_out, sizerho);
 			if (old_sum != tmp_sum) {
 				cout << "diff in iteration: " << l << endl;
 			}
 		}
 		/*} else {
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  MPI_Barrier(MPI_COMM_WORLD);
 		  }
 		  */
  /* inverse fft and transfer data back */
  /* 
     multiple device syncs and mpi barriers are used to make sure data 
     transfer is started when results are ready and progam moves on 
       only when data transfer is finished
  */
  base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
  base.readData<double> (rho2_ptr, rho_out, sizerho);
  for (int i = 0; i < 10; i++)
    cout << rho_out[i] << " ";
  cout << endl;
  old_sum = sumData(rho_out, sizerho);
 	}
 /* =================================================*/  
 /* =================================================*/
 /* ==========end fftpoison solver test run==========*/
 /* =================================================*/
 /* =================================================*/
  base.freeMemory<double>(tmpgreen_ptr, sizegreen);
  base.freeMemory<double>(grn_ptr, sizerho);
  base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
  base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
  base.freeMemory<double>(rho2_ptr, sizerho);
-  delete[] rho_out;
+
-  delete[] rho;
+/* free memory on device */
-  delete[] mirror_out;
+//if (rank == 0) {
-  cout << "Final sum: " << old_sum << endl;
+base.freeMemory<double>(tmpgreen_ptr, sizegreen);
 base.freeMemory<double>(grn_ptr, sizerho);
 base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
 base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
 //MPI_Barrier(MPI_COMM_WORLD);
 base.freeMemory<double>(rho2_ptr, sizerho);
 cout << "Final sum: " << old_sum << endl;
 /*} else {
  base.closeHandle(rho2_ptr);
  MPI_Barrier(MPI_COMM_WORLD);
  }*/
 //MPI_Finalize();
 }
--- a/test/testRandom.cpp
+++ b/test/testRandom.cpp
@ -1,81 +0,0 @@
 #include <iostream>
 #include <string>
 #include <vector>
 #include <sys/time.h>
 #include "DKSBase.h"
 using namespace std;
 int main(int argc, char *argv[]) {
  int size = 10;
  bool apiSet = false;
  char *api_name = new char[10];
  char *device_name = new char[10];
  for (int i = 1; i < argc; i++) {
    if (argv[i] == string("-cuda")) {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
      apiSet = true;
    }
    if (argv[i] == string("-opencl")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
      apiSet = true;
    }
    if (argv[i] == string("-N")) {
      size = atoi(argv[i+1]);
      i++;
    }
  }
  if (!apiSet) {
    strcpy(api_name, "Cuda");
    strcpy(device_name, "-gpu");
  }
  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of randoms: " << size << endl;
  //init dks
  int ierr;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(api_name));
  base.initDevice();
  base.callInitRandoms(size);
  //create host vector to store results
  double *host_data = new double[size];
  //create device vector
  void *device_data = base.allocateMemory<double>(size, ierr);
  for (int i = 0; i < 5; i++) {
    //fill device vector with random values
    base.callCreateRandomNumbers(device_data, size);
    //read device vector
    base.readData<double>(device_data, host_data, size);
    //print host data
    for (int i = 0; i < size; i++)
      cout << host_data[i] << " ";
    cout << endl;
  }
  //free device vector
  base.freeMemory<double>(device_data, size);
  //free host data
  delete[] host_data;
  return 0;
 }
Author	SHA1	Message	Date
Uldis Locans	24f394c693	add enableRutherfordScattering option to OPALs collimatorPhysics GPU version	2017-04-24 10:44:41 +02:00
Uldis Locans	8f00d2a593	Allow other applications check for DKS version	2017-04-05 17:00:52 +02:00
Uldis Locans	1606b641d4	add seed to random number initialization	2017-03-17 10:43:41 +01:00