changed version to 1.1.4

adapted for CUDA 11
added the two new muSR functions ifgk and ifll (CUDA/OpenCL).
2020-06-09 13:04:48 +02:00 · 2020-06-09 12:55:55 +02:00 · 2019-01-22 14:10:02 +01:00 · 2018-12-11 11:35:32 +01:00 · 2017-09-19 13:36:42 +02:00 · 2017-08-21 14:16:57 +02:00
80 changed files with 6586 additions and 2274 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,16 +1,18 @@
 CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
-SET (DKS_VERSION_MINOR 0.1)
+SET (DKS_VERSION_MINOR 1)
+SET (DKS_VERSION_PATCH 4)
+set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
-SET (PACKAGE_BUGREPORT \"locagoons.uldis@psi.ch\")
+SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
 SET (PACKAGE_NAME \"DKS\")
-SET (PACKAGE_STRING \"DKS\ 1.0.1\")
 SET (PACKAGE_TARNAME \"dks\")
-SET (PACKAGE_VERSION \"1.0.1\")
-SET (VERSION \"1.0.1\")
-
+SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
 SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+if (APPLE)
+  SET (CMAKE_MACOSX_RPATH TRUE)
+endif (APPLE)

 #get compiler name
 #STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
@ -29,24 +31,60 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
 set (BOOSTROOT $ENV{BOOST_DIR})
 SET (Boost_USE_STATIC_LIBS OFF)
 SET (Boost_USE_STATIC_RUNTIME OFF)
-FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
+#FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
+FIND_PACKAGE(Boost 1.41 REQUIRED)
 IF (Boost_FOUND)
+  MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
  MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
  MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
-  MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
+  #MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
  INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
 ENDIF (Boost_FOUND)

+#include OPAL, musrfit or pet kernels
+OPTION(DKS_FULL "Compile DKS with full library" OFF)
+OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
+OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
+OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
+
+IF (DKS_FULL)
+  SET(ENABLE_OPAL ON)
+  SET(ENABLE_MUSR ON)
+  SET(ENABLE_PET ON)
+ENDIF(DKS_FULL)
+
+#find clFFT
+OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
+IF (ENABLE_AMD)
+  SET (clFFT_USE_STATIC_LIBS OFF)
+  FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
+  MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
+  MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
+  INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
+  LINK_DIRECTORIES (${CLFFT_LIBRARIES})
+
+  #find clRNG
+  #SET (clRNG_USE_STATIC_LIBS OFF)
+  #FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
+  #MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
+  #MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
+  #INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
+  #LINK_DIRECTORIES (${CLRNG_LIBRARIES})
+  #find_package(PkgConfig)
+  #pkg_check_modules(clRng REQUIRED)
+
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
+ENDIF (ENABLE_AMD)
+
 #enable UQTK
 OPTION (USE_UQTK "Use UQTK" OFF)

-
 #intel icpc compiler specific flags
 IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)

  #for intel compiler turn on openmp and opencl
-  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_OPENCL "Use OpenCL" OFF)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" ON)
  
@ -77,18 +115,30 @@ IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
 ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)

 #gnu copmpiler specific flags
-IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
+IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
  
  
-  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_OPENCL "Use OpenCL" OFF)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" OFF)
-  
-  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
+  OPTION (STATIC_CUDA "Link static cuda libraries" OFF)
+
+  IF (ENABLE_MUSR)
+    SET (USE_OPENCL ON)
+  ENDIF (ENABLE_MUSR)
+
+  #do not set the openmp flag for apple devices: only the GNU compiler gets -fopenmp,
+  #the other compiler accepted by the enclosing IF (AppleClang) gets the same flags without it
+  IF ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
+  ELSE ()
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -std=c++11 -D__wsu")
+  ENDIF ()
+    

  FIND_PACKAGE(CUDA)
  IF (CUDA_FOUND)
    SET (USE_CUDA ON)
+    OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -98,20 +148,27 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

-    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
-    SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
-    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}  -DDEBUG -std=c++11 -D__wsu")    
-    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
+    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
+
+    IF (NOT STATIC_CUDA)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
+      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
+    ELSE (NOT STATIC_CUDA) 
+      SET (CUDA_SEPARABLE_COMPILATION ON)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
+      SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
+      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
+    ENDIF (NOT STATIC_CUDA)

    #if cuda version >= 7.0 add runtime commpilation flags
-    IF (NOT CUDA_VERSION VERSION_LESS "7.0")
+    IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
-    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
+    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
    
    MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
    
    SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
-    #set(CUDA_SEPARABLE_COMPILATION ON)
    SET(BUILD_SHARED_LIBS OFF)

  ENDIF (CUDA_FOUND)
@ -121,6 +178,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE(STATUS "CUDA not found, looking for OpenCL")

    FIND_PACKAGE(OpenCL)
+    MESSAGE("after FIND_PACKAGE(OpenCL): version: ${OpenCL_VERSION_STRING}")
+    MESSAGE("after FIND_PACKAGE(OpenCL): inc dir: ${OpenCL_INCLUDE_DIR}")
+    MESSAGE("after FIND_PACKAGE(OpenCL): lib dir: ${OpenCL_LIBRARY}")
    IF (OpenCL_FOUND)
      MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
      MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
@ -138,9 +198,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
  ENDIF(APPLE AND NOT CUDA_FOUND)

  #if cuda found set cuda opencl flags
-  IF (CUDA_FOUND)
+  IF (CUDA_FOUND AND USE_OPENCL)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
-  ENDIF (CUDA_FOUND)
+  ENDIF (CUDA_FOUND AND USE_OPENCL)

  #if cuda not found but amd opencl found set opencl flags
  IF (NOT CUDA_FOUND AND OpenCL_FOUND)
@ -152,7 +212,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
  ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")

-ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
+ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)

 SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
 MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
@ -169,9 +229,18 @@ ADD_SUBDIRECTORY (auto-tuning)
 CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake )

+CONFIGURE_FILE (${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion_install.cmake @ONLY)
+
 ### install files ###
 INSTALL (
  FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake
  DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
  RENAME ${PROJECT_NAME}Config.cmake
  )
+
+INSTALL (
+  FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion_install.cmake
+  DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
+  RENAME ${PROJECT_NAME}ConfigVersion.cmake
+  )
--- a/2356
+++ b/2356
--- a/ReadMe.first
+++ b/ReadMe.first
@ -1,7 +1,7 @@
 ##################################################################
 #
 # Name:		Dynamic Kernel Scheduler
-# Version:	1.0
+# Version:	1.1
 # Author: 	Uldis Locans
 # Contacts:	locans.uldis@psi.ch
 #
@ -29,30 +29,30 @@ Intel MIC compilers (optional)
 ######Source######
 https://gitlab.psi.ch/uldis_l/DKS

+######Changes from DKS-1.0.x version######
+DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
+By default only DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary option needs to be enabled.
+Supported options are:
+-DENABLE_OPAL option should be enabled if DKS will be used for OPAL
+-DENABLE_MUSR option should be enabled if DKS will be used for musrfit
+-DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
+
+See install instructions for more details on how to enable the necessary options in DKS
+
 ######Install######
+#consult https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install instructions

 #clone DKS
 git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS

-#set compilers to use
-#supported c++ compilers: g++, icpc, mpicxx whith g++
-#supported c compilers: gcc, icc, mpicc whith gcc
-export CXX_COMPILER=cpp_compiler_name
-export CC_COMPILER=c_compiler_name
+#switch to the desired version (OPTIONAL)
+git checkout DKS-1.1.0

-#set dks root directory directory
-cd DKS
-export DKS_ROOT = $PWD
-
-#set build directory
-mkdir $DKS_BUILD_DIR
-cd $DKS_BUILD_DIR
-
-#set install directory
-export DKS_INSTALL_DIR = $DKS_BUILD_DIR #default is /usr/local/
-
-CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
+#configure installation in build directory
+#enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
+CXX=<c++ compiler> CC=<c compiler> cmake -DCMAKE_INSTALL_PREFIX=<install dir> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1] <path to DKS source>

+#install DKS
 make
 make install

--- a/auto-tuning/CMakeLists.txt
+++ b/auto-tuning/CMakeLists.txt
@ -2,18 +2,32 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )

 #chi square kernel tests
-ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
-TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
+IF (ENABLE_MUSR)
+  ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+  TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})

-ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
-TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
+  ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
+  TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})
+
+  IF (USE_UQTK)
+    ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
+    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
+  ENDIF (USE_UQTK)
+
+  #test to verify search functions
+  ADD_EXECUTABLE(testSearch testSearch.cpp)
+  TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
+ENDIF (ENABLE_MUSR)
+
+IF (ENABLE_OPAL)
+  ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
+  TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
+
+  ADD_EXECUTABLE(testPushKick testPushKick.cpp)
+  TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
+ENDIF(ENABLE_OPAL)
+
+ADD_EXECUTABLE(testFFT testFFT.cpp)
+TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})

-IF (USE_UQTK)
-  ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
-  TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-ENDIF (USE_UQTK)
-#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})

-#test to verify search functions
-ADD_EXECUTABLE(testSearch testSearch.cpp)
-TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
--- a/auto-tuning/testChiSquareRT.cpp
+++ b/auto-tuning/testChiSquareRT.cpp
@ -292,6 +292,9 @@ int runTest(const char *api_name, const char *device_name, bool autotune, bool m
      //set autotuning on/off
      if (autotune)
 	dksbase.setAutoTuningOn();
+
+      //check kernel
+      dksbase.checkMuSRKernels(1);
      
      //tmp values to store results and tmp values for time steps and start time
      double result_gpu = 0.0;
@ -373,11 +376,11 @@ int main(int argc, char* argv[]) {

  }

-  int numPlatforms = 2;
+  int numPlatforms = 3;
  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};

-  for (int i = 0; i < numPlatforms; i++) {
+  for (int i = 2; i < numPlatforms; i++) {
    runTest(api[i], device[i], autotune, mlh, asym);
  }

--- a/auto-tuning/testChiSquareRTRandom.cpp
+++ b/auto-tuning/testChiSquareRTRandom.cpp
@ -392,7 +392,10 @@ int main(int argc, char *argv[]) {
  dksbase.setAPI(api_name);
  dksbase.setDevice(device_name);

+  std::cout << "Init device" << std::endl;
  dksbase.initDevice();
+  
+  std::cout << "Init chi square" << std::endl;
  dksbase.initChiSquare(Ndata, np, nf, nm);

  dksbase.writeParams(p, np);
@ -401,20 +404,24 @@ int main(int argc, char *argv[]) {

  dksbase.callSetConsts(N0, TAU, BKG);

+  std::cout << "Compile program" << std::endl;
  dksbase.callCompileProgram(sfunc);
+  
+  dksbase.checkMuSRKernels(1);

  if (autotune) 
    dksbase.setAutoTuningOn();

-  int oper = 0;
-  dksbase.getOperations(oper);
+  //std::cout << "Get operations" << std::endl;
+  //int oper = 0;
+  //dksbase.getOperations(oper);

  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of params: " << np << endl;
  cout << "Number of maps: " << nm << endl;
  cout << "Number of predefined functions: " << nfunc << endl;
-  cout << "Number of ptx instructions: " << oper << endl;
+  //cout << "Number of ptx instructions: " << oper << endl;
  cout << "------------------------------------------------------------" << endl;
  cout << sfunc << endl;
  cout << "------------------------------------------------------------" << endl;
--- a/auto-tuning/testCollimatorPhysics.cpp
+++ b/auto-tuning/testCollimatorPhysics.cpp
@ -0,0 +1,161 @@
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "DKSOPAL.h"
+
+typedef struct {
+  int label;
+  unsigned localID;
+  double Rincol[3];
+  double Pincol[3];
+} PART;
+
+PART initPartSmall(int d) {
+
+  PART p;
+  p.label = 0;
+  p.localID = d;
+
+  p.Rincol[0] = 0.0;
+  p.Rincol[1] = 0.0;
+  p.Rincol[2] = 0.02;
+
+  p.Pincol[0] = 0.0;
+  p.Pincol[1] = 0.0;
+  p.Pincol[2] = 3.9920183237269791e-01;
+
+  return p;
+}
+
+void printPart(PART p) {
+  std::cout << "label: " << p.label << ", ";
+  std::cout << "localid: " << p.localID << ",";
+  std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
+  std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
+  std::cout << std::endl;
+}
+
+void initParts(PART *p, int N) {
+  for (int i = 0; i < N; i++)
+    p[i] = initPartSmall(i);
+}
+
+void printParts(PART *p, int N) {
+  for (int i = 0; i < N; i++)
+    printPart(p[i]);
+  std::cout << std::endl;
+}
+
+void initParams(double *data) {
+  data[0]  = 0.0;//2.0000000000000000e-02;
+  data[1]  = 1.0;//1.0000000000000000e-02;	
+  data[2]  = 2.2100000000000000e+00;
+  data[3]  = 6.0000000000000000e+00;	
+  data[4]  = 1.2010700000000000e+01;	
+  data[5]  = 2.6010000000000000e+00;	
+  data[6]  = 1.7010000000000000e+03;	
+  data[7]  = 1.2790000000000000e+03;	
+  data[8]  = 1.6379999999999999e-02;	
+  data[9]  = 1.9321266968325795e-01;	
+  data[10] = 7.9000000000000000e+01;	
+  data[11] = 1.0000000000000002e-12;
+}
+
+//Test driver for the OPAL collimator physics kernels: initializes numpart particles on the
+//host, runs callCollimatorPhysics2 and callCollimatorPhysicsSort on the selected device and
+//prints the first and last particles read back from the device.
+//Options: -mic (use OpenMP on the MIC), -npart <n> (number of particles), -loop <n>.
+int main(int argc, char *argv[]) {
+
+  int loop = 10;
+  int numpart = 1e5;
+  //fixed-size buffers, the longest name stored here is "OpenMP"
+  char api_name[10] = "Cuda";
+  char device_name[10] = "-gpu";
+
+  //options that take a value are only consumed when the value is really present
+  for (int i = 1; i < argc; i++) {
+
+    if (argv[i] == std::string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (argv[i] == std::string("-npart") && i + 1 < argc) {
+      numpart = atoi(argv[i+1]);
+      i++;
+    } else if (argv[i] == std::string("-loop") && i + 1 < argc) {
+      loop = atoi(argv[i+1]);
+      i++;
+    }
+
+  }
+
+  //a non-positive particle count can not be allocated or printed
+  if (numpart <= 0) {
+    std::cout << "Number of particles must be positive!" << std::endl;
+    return 1;
+  }
+
+  std::cout << "=========================BEGIN TEST=========================" << std::endl;
+  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
+  std::cout << "Number of particles: " << numpart << std::endl;
+  std::cout << "Number of loops: " << loop << std::endl;
+  std::cout << "------------------------------------------------------------" << std::endl;
+
+  //init part vector to test mc
+  PART *parts = new PART[numpart];
+  initParts(parts, numpart);
+
+  double *params = new double[12];
+  initParams(params);
+  
+  //init dks
+  int ierr;
+  DKSOPAL base;
+  base.setAPI(api_name, strlen(api_name));
+  //the length argument must describe device_name, not api_name
+  base.setDevice(device_name, strlen(device_name));
+  ierr = base.initDevice();
+  if (ierr != DKS_SUCCESS)
+    std::cout << "Error with init device!" << std::endl;
+
+  //init random
+  base.callInitRandoms(numpart);
+
+  //**test collimator physics and sort***//
+  void *part_ptr, *param_ptr;
+
+  //allocate memory for particles
+  part_ptr = base.allocateMemory<PART>(numpart, ierr);
+  param_ptr = base.allocateMemory<double>(12, ierr);
+
+  //transfer data to device
+  base.writeData<PART>(part_ptr, parts, numpart);
+  base.writeData<double>(param_ptr, params, 12);
+
+  int numaddback = 0;
+  base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
+  base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);  
+  base.syncDevice();
+
+  //read data from device
+  base.readData<PART>(part_ptr, parts, numpart);
+
+  //free memory
+  base.freeMemory<PART>(part_ptr, numpart);
+  base.freeMemory<double>(param_ptr, 12);  
+
+  //print at most 10 particles from each end so small runs stay inside the array
+  const int nshow = (numpart < 10) ? numpart : 10;
+
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < nshow; i++) {
+    std::cout << parts[i].label << "\t" 
+              << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
+              << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+              << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+              << std::endl;
+  }
+
+  std::cout << "..." << std::endl;
+  
+  for (int i = numpart - nshow; i < numpart; i++) {
+    std::cout << parts[i].label << "\t" 
+              << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
+              << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+              << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+              << std::endl;
+  }
+
+  //release host buffers
+  delete[] parts;
+  delete[] params;
+
+  return 0;
+}
--- a/auto-tuning/testFFT.cpp
+++ b/auto-tuning/testFFT.cpp
@ -0,0 +1,214 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSFFT.h"
+
+using namespace std;
+
+void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
+void compareData(double* data1, double *data2, int N, int dim);
+
+void initData(complex<double> *data, int dimsize[3], int dim);
+void initData(double *data, int dimsize[3], int dim);
+
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim, 
+		char *api_name, char *device_name);
+
+void printHelp();
+
+//Test driver for DKSFFT: runs a real-to-complex / complex-to-real round trip and a
+//complex forward / inverse round trip on the selected device and prints the summed
+//absolute difference between the input and the round-tripped data.
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 8;
+  int dim = 3;
+  //default backend used when no -cuda/-opencl/-mic/-cpu option is given, so the names
+  //are never read uninitialized; the longest name readParams writes is "OpenCL"
+  char api_name[10] = "Cuda";
+  char device_name[10] = "-gpu";
+
+  if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
+    return 0;
+
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N1, N2, N3};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
+
+  double *rdata = new double[sizereal];
+  double *ordata = new double[sizereal];
+  complex<double> *cdata = new complex<double>[sizereal];
+  complex<double> *codata = new complex<double>[sizereal];
+
+  //NOTE(review): data init, compare and the CC transforms below use a fixed dimension
+  //of 3 while setupFFT and the RC/CR transforms use the -dim value - confirm intended
+  initData(rdata, dimsize, 3);
+  initData(cdata, dimsize, 3);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  DKSFFT base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  cout << "init device" << endl;
+  base.initDevice();
+  cout << "setup fft" << endl;
+  base.setupFFT(dim, dimsize);
+
+  //Test RC FFT -> CR FFT
+  void *real_ptr, *comp_ptr, *res_ptr;
+  cout << "allocate memory" << endl;
+  real_ptr = base.allocateMemory<double>(sizereal, ierr);
+  res_ptr = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+  cout << "write data" << endl;
+  base.writeData<double>(real_ptr, rdata, sizereal);
+
+  cout << "perform fft" << endl;
+  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
+  base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
+
+  cout << "read data" << endl;
+  base.readData<double>(res_ptr, ordata, sizereal);
+
+  compareData(rdata, ordata, N1, 3);
+
+  base.freeMemory<double>(real_ptr, sizereal);
+  base.freeMemory<double>(res_ptr, sizereal);
+  base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+
+  //Test CC FFT
+  void *mem_ptr;
+  mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
+  base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
+  base.callFFT(mem_ptr, 3, dimsize);
+  base.callIFFT(mem_ptr, 3, dimsize);
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+  base.readData< complex<double> >(mem_ptr, codata, sizereal);
+
+  compareData(cdata, codata, N1, 3);
+
+  base.freeMemory< complex<double> > (mem_ptr, sizereal);
+
+  delete[] rdata;
+  delete[] ordata;
+  delete[] cdata;
+  delete[] codata;
+
+  return 0;
+}
+
+//Print the summed absolute difference of the real and imaginary parts of data1 and data2.
+//N is used as the extent of every active dimension and dim selects how many of the
+//loops run over N, so the flat index below only addresses the data correctly when all
+//active dimensions have the same size N.
+void compareData(complex<double>* data1, complex<double>* data2, int N, int dim) {
+  int ni, nj, nk, id;
+  ni = (dim > 2) ? N : 1;
+  nj = (dim > 1) ? N : 1;
+  nk = N;
+  double sum = 0;
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < nk; k++) {
+	id = i*ni*ni + j*nj + k;
+	sum += fabs(data1[id].real() - data2[id].real());
+	sum += fabs(data1[id].imag() - data2[id].imag());
+      }
+    }
+  }
+  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
+}
+
+//Print the summed absolute difference between the real arrays data1 and data2.
+//N is used as the extent of every active dimension and dim selects how many of the
+//loops run over N, so the flat index below only addresses the data correctly when all
+//active dimensions have the same size N.
+void compareData(double* data1, double* data2, int N, int dim) {
+  int ni, nj, nk, id;
+  ni = (dim > 2) ? N : 1;
+  nj = (dim > 1) ? N : 1;
+  nk = N;
+  double sum = 0;
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < nk; k++) {
+	id = i*ni*ni + j*nj + k;
+	sum += fabs(data1[id] - data2[id]);
+      }
+    }
+  }
+  cout << "Size " << N << " RC <--> CR diff: " << sum << endl;
+}
+
+void initData(complex<double> *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
+  } else if (dim == 2) {
+    for (int j = 0; j < dimsize[1]; j++) {
+      for (int k = 0; k < dimsize[0]; k++) {
+	data[j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
+      }
+    }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = complex<double>(sin(k), 0.0);
+  }
+}
+
+void initData(double *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
+  } else if (dim == 2) {
+    for (int j = 0; j < dimsize[1]; j++) {
+      for (int k = 0; k < dimsize[0]; k++) {
+	data[j*dimsize[0] + k] = sin(k);
+      }
+    }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = sin(k);
+  }
+}
+
+//Parse the command line into the FFT test settings.
+//  -dim <d>            number of FFT dimensions, stored in dim
+//  -grid <n1> <n2> <n3> grid extents, stored in N1, N2, N3
+//  -cuda | -opencl | -mic | -cpu   select api_name / device_name
+//api_name and device_name must point to buffers that can hold the names written here
+//(the longest is "OpenCL"); they are left untouched when no backend option is given.
+//An option whose values are missing at the end of the command line is ignored.
+//Always returns false, i.e. the caller should continue with the test.
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
+		char *api_name, char *device_name) 
+{
+
+  for (int i = 1; i < argc; i++) {
+
+    const string arg(argv[i]);
+
+    if (arg == "-dim" && i + 1 < argc) {
+      //one value follows the option
+      dim = atoi(argv[i + 1]);
+      i++;
+    } else if (arg == "-grid" && i + 3 < argc) {
+      //three values follow the option
+      N1 = atoi(argv[i + 1]);
+      N2 = atoi(argv[i + 2]);
+      N3 = atoi(argv[i + 3]);
+      i += 3;
+    } else if (arg == "-cuda") {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } else if (arg == "-opencl") {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } else if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (arg == "-cpu") {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+  }
+
+  return false;
+}
+
--- a/auto-tuning/testPushKick.cpp
+++ b/auto-tuning/testPushKick.cpp
@ -0,0 +1,132 @@
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "DKSOPAL.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+//Fill the first N entries of data with random x, y, z components,
+//each computed as rand() / RAND_MAX.
+void initData(double3 *data, int N) {
+  int idx = 0;
+  while (idx < N) {
+    double3 &v = data[idx];
+    //keep the x, y, z draw order so the random sequence is unchanged
+    v.x = (double)rand() / RAND_MAX;
+    v.y = (double)rand() / RAND_MAX;
+    v.z = (double)rand() / RAND_MAX;
+    ++idx;
+  }
+}
+
+//Set the first N entries of data to the constant time step 0.00001.
+void initDt(double *data, int N) {
+  const double step = 0.00001;
+  int idx = 0;
+  while (idx < N) {
+    data[idx] = step;
+    ++idx;
+  }
+}
+
+//Test driver for the OPAL push kernel: fills R, P, Ef, Bf and dt on the host, copies them
+//to the device, calls callParallelTTrackerPush loop times and prints the first and last
+//entries of R before and after reading the result back from the device.
+//Options: -mic (use OpenMP on the MIC), -npart <n> (number of particles), -loop <n>.
+int main(int argc, char *argv[]) {
+
+  int loop = 10;
+  int numpart = 1e5;
+  //fixed-size buffers, the longest name stored here is "OpenMP"
+  char api_name[10] = "Cuda";
+  char device_name[10] = "-gpu";
+
+  //options that take a value are only consumed when the value is really present
+  for (int i = 1; i < argc; i++) {
+
+    if (argv[i] == std::string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (argv[i] == std::string("-npart") && i + 1 < argc) {
+      numpart = atoi(argv[i+1]);
+      i++;
+    } else if (argv[i] == std::string("-loop") && i + 1 < argc) {
+      loop = atoi(argv[i+1]);
+      i++;
+    }
+
+  }
+
+  //a non-positive particle count can not be allocated or printed
+  if (numpart <= 0) {
+    std::cout << "Number of particles must be positive!" << std::endl;
+    return 1;
+  }
+
+  std::cout << "=========================BEGIN TEST=========================" << std::endl;
+  std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
+  std::cout << "Number of particles: " << numpart << std::endl;
+  std::cout << "Number of loops: " << loop << std::endl;
+  std::cout << "------------------------------------------------------------" << std::endl;
+  
+  int ierr;
+  DKSOPAL dksbase;
+  dksbase.setAPI(api_name, strlen(api_name));
+  //the length argument must describe device_name, not api_name
+  dksbase.setDevice(device_name, strlen(device_name));
+  ierr = dksbase.initDevice();
+  if (ierr != DKS_SUCCESS)
+    std::cout << "Error with init device!" << std::endl;
+
+  double3 *R = new double3[numpart];
+  double3 *P = new double3[numpart];
+  double3 *Ef = new double3[numpart];
+  double3 *Bf = new double3[numpart];
+  double *dt = new double[numpart];
+  
+  initData(R, numpart);
+  initData(P, numpart);
+  initData(Ef, numpart);
+  initData(Bf, numpart);
+  initDt(dt, numpart);
+
+  void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
+  
+  r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
+  p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
+  ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
+  bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
+  dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
+
+  dksbase.writeData<double3>(r_ptr, R, numpart);
+  dksbase.writeData<double3>(p_ptr, P, numpart);
+  dksbase.writeData<double3>(ef_ptr, Ef, numpart);
+  dksbase.writeData<double3>(bf_ptr, Bf, numpart);
+  dksbase.writeData<double>(dt_ptr, dt, numpart);
+
+  for (int i = 0; i < loop; ++i)
+    dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
+
+  //print at most 10 entries from each end so small runs stay inside the array
+  const int nshow = (numpart < 10) ? numpart : 10;
+
+  //host copy of R before the device result is read back
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < nshow; i++)
+    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
+
+  std::cout << "..." << std::endl;
+  
+  for (int i = numpart - nshow; i < numpart; i++)
+    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
+
+  std::cout << "============" << std::endl;
+
+  dksbase.readData<double3>(r_ptr, R, numpart);
+
+  //R as read back from the device
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < nshow; i++)
+    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
+
+  std::cout << "..." << std::endl;
+  
+  for (int i = numpart - nshow; i < numpart; i++)
+    std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
+
+  dksbase.freeMemory<double3>(r_ptr, numpart);
+  dksbase.freeMemory<double3>(p_ptr, numpart);
+  dksbase.freeMemory<double3>(ef_ptr, numpart);
+  dksbase.freeMemory<double3>(bf_ptr, numpart);
+  dksbase.freeMemory<double>(dt_ptr, numpart);
+
+  delete[] R;
+  delete[] P;
+  delete[] Ef;
+  delete[] Bf;
+  delete[] dt;
+
+  return 0;
+}
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -2,4 +2,8 @@ SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
 SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 SET(${PROJECT_NAME}_LIBRARY "dks")
-SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
+SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
+SET(DKS_CUDA_STATIC ${STATIC_CUDA})
+SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
+SET(DKS_VERSION ${DKS_VERSION})
+SET(DKS_VERSION_STR ${DKS_VERSION_STR})
--- a/cmake/DKSConfigVersion.cmake.in
+++ b/cmake/DKSConfigVersion.cmake.in
@ -0,0 +1,13 @@
+# Version file evaluated by find_package(DKS [version]).
+# Policy: a requested version is accepted only if its major and minor components equal
+# the installed ones and its patch component is not newer than the installed patch.
+set(PACKAGE_VERSION @DKS_VERSION@)
+
+if("${PACKAGE_FIND_VERSION}" STREQUAL "")
+  # no version was requested, so the installed version is acceptable
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+elseif("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "@DKS_VERSION_MAJOR@" AND "${PACKAGE_FIND_VERSION_MINOR}" EQUAL "@DKS_VERSION_MINOR@")
+  if("${PACKAGE_FIND_VERSION_PATCH}" EQUAL "@DKS_VERSION_PATCH@")
+    # an exact match is also a compatible one
+    set(PACKAGE_VERSION_EXACT TRUE)
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  elseif("${PACKAGE_FIND_VERSION_PATCH}" LESS "@DKS_VERSION_PATCH@")
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  else()
+    # requested patch level is newer than the installed one
+    set(PACKAGE_VERSION_UNSUITABLE TRUE)
+  endif()
+else()
+  set(PACKAGE_VERSION_UNSUITABLE TRUE)
+endif()
--- a/doc/refman.pdf
+++ b/doc/refman.pdf
--- a/src/Algorithms/CMakeLists.txt
+++ b/src/Algorithms/CMakeLists.txt
@ -6,6 +6,7 @@ SET (_HDRS
 	ImageReconstruction.h
 	CollimatorPhysics.h
 	FFT.h
+	GreensFunction.h
  )

 ADD_SOURCES (${_SRCS})
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@ -15,6 +15,9 @@

 class DKSBaseMuSR;

+/** 
+ * Interface to implement ChiSquareRuntime class for musrfit.
+ */
 class ChiSquareRuntime {
  friend class DKSBaseMuSR;

@ -63,23 +66,54 @@ public:
  /** Default constructor */
  //ChiSquareRuntime();

-  /** Default destructor */
+  /** Default destructor. */
  virtual ~ChiSquareRuntime() { };

+  /**
+   * Compile the GPU program generated at runtime.
+   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;
+
+  /**
+   * Launch the compiled chiSquare kernel.
+   */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;

+  /** 
+   * Write the parameter values to the GPU.
+   */
  virtual int writeParams(const double *params, int numparams) = 0;
+
+  /**
+   * Write the function values to the GPU.
+   */
  virtual int writeFunc(const double *func, int numfunc) = 0;
+
+  /**
+   * Write map values to the GPU.
+   */
  virtual int writeMap(const int *map, int nummap) = 0;
+
+  /**
+   * Allocate temporary memory needed for the chi square calculations on the device.
+   */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
+
+  /**
+   * Free device memory allocated for chi square calculations.
+   */
  virtual int freeChiSquare() = 0;
+
+  /**
+   * Check if available device can run the chi square GPU code.
+   */
  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;

-  /** Set N0, tau and bgk values to use for the kernel.
+  /** 
+   * Set N0, tau and bgk values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -91,7 +125,8 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set alpha and beta values to use for the kernel.
+  /** 
+   * Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -101,8 +136,9 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set number of blocks and threads.
-   *  Used to set parameters obtained from auto-tuning
+  /** 
+   * Set number of blocks and threads.
+   * Used to set parameters obtained from auto-tuning
   */
  int setKernelParams(int numBlocks, int blockSize) {
    int ierr = DKS_ERROR;
@ -118,8 +154,9 @@ public:
    return ierr;
  }

-  /** Get the number of operations in compiled kernel.
-   *  Count the number of operation in the ptx file for the compiled program.
+  /** 
+   * Get the number of operations in compiled kernel.
+   * Count the number of operation in the ptx file for the compiled program.
   */
  int getOperations(int &oper) {

--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@ -5,10 +5,10 @@
 #include <string>
 #include "../DKSDefinitions.h"

-class DKSBaseMuSR;
-
+/**
+ * Interface to implement particle matter interaction for OPAL.
+ */
 class DKSCollimatorPhysics {
-  friend class DKSBaseMuSR;

 protected:

@ -18,24 +18,61 @@ protected:
 public:
  
  virtual ~DKSCollimatorPhysics() { }
-  
-  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;

+  /** 
+   * Execute collimator physics kernel.
+   *
+   */  
+  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
+				bool enableRutherforScattering = true) = 0;
+
+  /** 
+   * Special case CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
  
+  /** 
+   * Sort particle array on GPU.
+   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
+   * array so these particles are at the end of array
+   */
  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * BorisPusher push function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   */
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;

+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   */
+  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
+				   void *bf_ptr, void *dt_ptr, double charge,
+				   double mass, int npart, double c, int streamId = -1) = 0; 
+
+  /** 
+   * BorisPusher push function with transformTo function from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   */
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@ -6,12 +6,21 @@

 #include "../DKSDefinitions.h"

-class DKSFFT {
+/**
+ * Abstract class defining methods for DKS FFT class.
+ * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
+ */
+class BaseFFT {

 protected:
  int defaultN[3];
  int defaultNdim;

+  /**
+   * Check if FFT plan is created for the needed dimension and FFT size.
+   * Returns true if the plan has been created and false if no plan for specified dimension
+   * and size exists.
+   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
@ -22,20 +31,59 @@ protected:

 public:

-  virtual ~DKSFFT() { }
+  virtual ~BaseFFT() { }

+  /** Setup FFT - init FFT library used by chosen device. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
+
+  /** Setup real to complex FFT - init FFT library used by chosen device. */
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Setup complex to real FFT - init FFT library used by chosen device. */
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Clean up. */
  virtual int destroyFFT() = 0;
+
+  /** 
+   * Execute C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
+
+  /** 
+   * Execute inverse C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /**
+   * Normalize the FFT or IFFT.
+   * mem_ptr - memory to complex data.
+   */
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /** 
+   * Execute R2C FFT.
+   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
+   * results for the FFT are stored as complex numbers.
+   */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /** 
+   * Execute C2R FFT.
+   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
+   */
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /**
+   * Normalize CR FFT.
+   */
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;

 };
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@ -0,0 +1,32 @@
+#ifndef H_GREENSFUNCTION
+#define H_GREENSFUNCTION
+
+#include <iostream>
+#include <cmath>
+
+/**
+ * Pure-virtual interface for the Green's function calculations used by OPAL.
+ */
+class GreensFunction {
+
+public:
+  
+  virtual ~GreensFunction() { }
+
+  /** Calculate the Green's integral, as defined in OPAL. */
+  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
+			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
+
+  /** Integration of rho2_m, see OPAL for more details. */
+  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
+					int streamId = -1) = 0;
+
+  /** Mirror the rho2_m field. */
+  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
+
+  /** Multiply two complex fields from device memory ("Compelx" is the established name). */
+  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
+
+};
+
+#endif
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@ -5,17 +5,22 @@

 #define BLOCK_SIZE 128

+/** Struct to hold voxel position for PET image. */
 struct VoxelPosition {
  float x;
  float y;
  float z;
 };

+/** Struct that holds a pair of detectors that registered an event. */
 struct ListEvent {
  unsigned detA : 16;
  unsigned detB : 16;
 };

+/**
+ * Interface to implement PET image reconstruction.
+ */
 class ImageReconstruction {

 protected:
@ -25,7 +30,8 @@ public:

  virtual ~ImageReconstruction() { }
  
-  /** Caluclate source.
+  /** 
+   * Calculate source.
   *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
   *  that are inside this sphere. All the sphere used have the same diameter.
   */
@ -33,7 +39,8 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;

-  /** Calculate background.
+  /** 
+   * Calculate background.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;

-  /** Caluclate source using differente sources.
+  /** 
+   * Calculate source using different sources.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * each sphere is given by *diameter array.
@ -52,7 +60,7 @@ public:
 			       int total_sources, int start = 0) = 0;

  /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
   * smaller sphere.
@ -61,7 +69,8 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;

-  /** Generate normalization.
+  /** 
+   * Generate normalization.
   * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
   * that updates voxel values in the image on the slope between these two detectors.
   */
@ -69,14 +78,16 @@ public:
 				 void *det_position, int total_det) = 0; 


-  /** Calculate forward projection.
+  /** 
+   * Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;

-  /** Calculate backward projection.
+  /** 
+   * Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
@ -84,29 +95,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;

-  /** Set the voxel dimensins on device.
-   * 
+  /** 
+   * Set the voxel dimensions on device.
   */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;

-  /** Set the image edge variables on the device.
-   * 
+  /** 
+   * Set the image edge variables on the device.
   */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;

-  /** Set the image edge1 on the device.
-   * 
+  /** 
+   * Set the image edge1 on the device.
   */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;

-  /** Set the minimum crystan in one ring values on the device.
-   * 
+  /** 
+   * Set the minimum crystal in one ring values on the device.
   */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;

-  /** Set all other required parameters for reconstruction.
-   * 
+  /** 
+   * Set all other required parameters for reconstruction.
   */
  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@ -18,6 +18,17 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * DKS autotuning class, allows to auto-tune the defined function.
+ * Executes the defined function for auto-tuning and searches for optimal parameters to improve
+ * the function execution time. The function that is auto-tuned, parameters and the ranges
+ * need to be set. Includes multiple search methods that search the parameter space to find
+ * the optimal solution.
+ *  1) exhaustive search
+ *  2) line search
+ *  3) hill climbing
+ *  4) simulated annealing
+ */
 class DKSAutoTuning {

 private:
@ -36,12 +47,13 @@ private:

  int loops_m;

-  /** Update parameters from a state */
+  /** Update parameters from a state. */
  int setParameterValues(States states);

-  /** Evaluate the function and set execution time 
-   *  Returns DKS_ERROR if errors occured during function execution. 
-   *  Returns DKS_SUCCESS if function executed as planned. 
+  /** 
+   * Evaluate the function and set execution time 
+   * Returns DKS_ERROR if errors occurred during function execution.
+   * Returns DKS_SUCCESS if function executed as planned. 
   */
  int evaluateFunction(double &value);

@ -50,12 +62,13 @@ public:
  /** Constructor */
  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);

-  /** Destructor */
+  /** Destructor. */
  ~DKSAutoTuning();

-  /** Set function to auto tune.
-   *  Caller of setFunction is responsible to bind the correct parameters 
-   *  to the function with std::bind.
+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
   */
  void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
    f_m = f;
@ -63,15 +76,21 @@ public:
    evaluate_time_m = evaluate_time;
  }

+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
+   */
  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
    fd_m = f;
    function_name_m = name;
    evaluate_time_m = evaluate_time;
  }

-  /** Set parameter for auto tuning.
-   *  Provide a pointer to a parameter that will be changed during auto-tuning
-   *  and a min-max value for this element
+  /** 
+   * Set parameter for auto tuning.
+   * Provide a pointer to a parameter that will be changed during auto-tuning
+   * and a min-max value for this element
   */
  template <typename T1>
  void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@ -85,9 +104,9 @@ public:
  /** Perform exaustive search evaluating all the parameter configurations */
  void exaustiveSearch();

-  /** Perform auto-tuning.
-   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
-   *  parameters constant.
+  /**
+   * Perform line-search auto-tuning by varying parameters one at a time.
+   * After one parameter is auto-tuned the next one is varied.
   */
  void lineSearch();  

--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@ -4,6 +4,7 @@
 #include <iostream>
 #include <cmath>

+/** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {

  friend class DKSBaseMuSR;
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@ -1,9 +1,3 @@
-/** Class to save and load DKS autotunning configs.
- * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
- * Uses boost xml_parser to read and write the xml file and boost property tree to store
- * the xml content.
- */
-
 #ifndef DKS_CONFIG
 #define DKS_CONFIG

@ -11,7 +5,7 @@
 #include <boost/optional/optional.hpp>
 #include <boost/property_tree/xml_parser.hpp>
 #include <boost/foreach.hpp>
-#include <boost/filesystem.hpp>
+//#include <boost/filesystem.hpp>
 #include <string>
 #include <iostream>
 #include <cstdlib>
@ -24,11 +18,18 @@
 #include "../DKSDefinitions.h"

 namespace pt = boost::property_tree;
-namespace fs = boost::filesystem;
+//namespace fs = boost::filesystem;

 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";

+/** Class to save and load DKS autotuning configs.
+ * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
+ * Uses boost xml_parser to read and write the xml file and boost property tree to store
+ * the xml content.
+ * TODO: needs an update. boost::filesystem is disabled at the moment, so no configuration
+ * file is saved and the auto-tuning has no effect.
+ */
 class DKSConfig {

 private:
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@ -9,6 +9,9 @@

 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

+/** 
+ * Parameter class allows to change the searchable parameters during the auto-tuning.
+ */
 class Parameter {

 private:
@ -64,6 +67,10 @@ public:

 };

+/**
+ * Struct to hold an auto-tuning state.
+ * Holds the current value, min, max and the step by which a state can change.
+ */ 
 struct State {
  double value;
  double min;
@ -74,6 +81,12 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * Used by auto-tuning search algorithms to move between parameter configurations.
+ * Allows to move from one parameter state to another, get neighboring states,
+ * move to neighboring states and save state information. Print functions are available
+ * for debugging purposes, to follow how the algorithm moves between states.
+ */
 class DKSSearchStates {

 private:
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -35,13 +35,29 @@ ENDMACRO ()
 SET (DKS_BASEDIR_HDRS
  DKSBase.h
  DKSDefinitions.h
+  DKSFFT.h
  )

 SET (DKS_BASEDIR_SRCS
  DKSBase.cpp
+  DKSFFT.cpp
  )

-IF (USE_CUDA OR USE_OPENCL)
+#add opal to DKS if enable_opal is set
+IF (ENABLE_OPAL)
+  SET (DKS_BASEDIR_HDRS
+    ${DKS_BASEDIR_HDRS}
+    DKSOPAL.h
+    )
+
+  SET (DKS_BASEDIR_SRCS
+    ${DKS_BASEDIR_SRCS}
+    DKSOPAL.cpp
+    )
+ENDIF (ENABLE_OPAL)
+
+#add musr to DKS if cuda or opencl is used and enable_musr is set
+IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
   SET (DKS_BASEDIR_HDRS
       ${DKS_BASEDIR_HDRS}
       DKSBaseMuSR.h
@ -51,9 +67,10 @@ IF (USE_CUDA OR USE_OPENCL)
       ${DKS_BASEDIR_SRCS}
       DKSBaseMuSR.cpp
       )
-ENDIF (USE_CUDA OR USE_OPENCL)
+ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)

-IF (USE_CUDA)
+#add image reconstruction to DKS if cuda is used and enable_pet is set
+IF (USE_CUDA AND ENABLE_PET)
  SET (DKS_BASEDIR_HDRS
    ${DKS_BASEDIR_HDRS}
    DKSImageReconstruction.h
@ -63,7 +80,7 @@ IF (USE_CUDA)
    ${DKS_BASEDIR_SRCS}
    DKSImageReconstruction.cpp
    )
-ENDIF (USE_CUDA)
+ENDIF (USE_CUDA AND ENABLE_PET)

 ADD_HEADERS (${DKS_BASEDIR_HDRS})
 ADD_SOURCES (${DKS_BASEDIR_SRCS})
@ -95,26 +112,18 @@ IF (USE_CUDA)
  CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
  CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})

-  IF (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-    TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  ELSE (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks cudadevrt)
-    TARGET_LINK_LIBRARIES(dksshared cudadevrt)
-  ENDIF (USE_UQTK)
+  TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
+  TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
+  #TARGET_LINK_LIBRARIES(dks)
+  #TARGET_LINK_LIBRARIES(dksshared)

 ELSE (USE_CUDA)
  MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
  ADD_LIBRARY(dks ${DKS_SRCS})
  ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})

-  IF (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-    TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  ELSE (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks)
-    TARGET_LINK_LIBRARIES(dksshared)
-  ENDIF(USE_UQTK)
+  TARGET_LINK_LIBRARIES(dks)
+  TARGET_LINK_LIBRARIES(dksshared)

 ENDIF (USE_CUDA)

--- a/src/CUDA/CMakeLists.txt
+++ b/src/CUDA/CMakeLists.txt
@ -1,35 +1,27 @@
-SET (_HDRS
-	CudaBase.cuh
-	CudaFFT.cuh
-	CudaGreensFunction.cuh
-	CudaChiSquare.cuh
-	CudaCollimatorPhysics.cuh
-	CudaImageReconstruction.cuh
-	CudaChiSquareRuntime.cuh
-  )
-  
-SET (_SRCS
-	CudaBase.cu
-	CudaFFT.cu
-	CudaGreensFunction.cu
-	CudaChiSquare.cu
-	CudaCollimatorPhysics.cu
-	CudaImageReconstruction.cu
-	CudaChiSquareRuntime.cu
-)
+SET (_HDRS CudaBase.cuh CudaFFT.cuh)
+SET (_SRCS CudaBase.cu CudaFFT.cu)

-#INCLUDE_DIRECTORIES (
-#  ${CMAKE_CURRENT_SOURCE_DIR}
-#)
+IF (ENABLE_OPAL)
+  SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
+  SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
+ENDIF (ENABLE_OPAL)
+
+IF (ENABLE_MUSR)
+  SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
+  SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
+  SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
+ENDIF (ENABLE_MUSR)
+
+IF (ENABLE_PET)
+  SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
+  SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
+ENDIF (ENABLE_PET)
+
+MESSAGE (STATUS "CUDA headers: ${_HDRS}")

 ADD_SOURCES(${_SRCS})
 ADD_HEADERS(${_HDRS})

 INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
-
-SET (_KERNELS
-  NVRTCKernels/CudaChiSquareKernel.cu
-  )
-
 INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)

--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -13,6 +13,13 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {

 }

+__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
+  // One thread per output element: data[i] gets a uniform double drawn from state[i].
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= size) return;  // threads past the end of the array have nothing to do
+  data[tid] = curand_uniform_double(&state[tid]);
+}
+

 //=====================================//
 //==========Private functions==========//
@ -41,14 +48,15 @@ CudaBase::~CudaBase() {
 /*
  create curandStates
 */
-int CudaBase::cuda_createCurandStates(int size) {
+int CudaBase::cuda_createCurandStates(int size, int seed) {

  if (defaultRndSet == 1)
    cuda_deleteCurandStates();

  int threads = 128;
  int blocks = size / threads + 1;
-  int seed = time(NULL);
+  if (seed == -1) 
+    seed = time(NULL);

  //std::cout << "sizeof: " << sizeof(curandState) << std::endl;
  cudaMalloc(&defaultRndState, sizeof(curandState)*size);
@ -68,6 +76,15 @@ int CudaBase::cuda_deleteCurandStates() {
  return DKS_SUCCESS;
 }

+int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
+  // The kernel reads defaultRndState; refuse to launch if no states were created (flag assumed 1 only then).
+  if (defaultRndSet != 1)
+    return DKS_ERROR;
+  int blocks = size / BLOCK_SIZE + 1;  // one extra block covers the remainder
+  kernelCreateRandNumbers<<<blocks, BLOCK_SIZE>>>(defaultRndState, (double *)mem_ptr, size);
+  return DKS_SUCCESS;
+}
+
 curandState* CudaBase::cuda_getCurandStates() {
  return defaultRndState;
 }
@ -325,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
 	
  return DKS_SUCCESS;
 }
-		
-/*
-  Info: allcate memory and write data (push)
-  Return: pointer to memory object
-*/
-/*
-  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
-
-  void * mem_ptr;
-  mem_ptr = cuda_allocateMemory(size, ierr);
-	
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_writeData(mem_ptr, in_data, size);
-		
-  return mem_ptr;
-  }
-*/
-		
-/*
-  Info: read data and free memory (pull)
-  Return: success or error code
-*/
-/*
-  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
-
-  ierr = cuda_readData(mem_ptr, out_data, size);
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_freeMemory(mem_ptr);	
-  else
-  return DKS_ERROR;
-	
-	
-  if (ierr == DKS_SUCCESS)	
-  return DKS_SUCCESS;
-  else
-  return DKS_ERROR;
-  }
-*/
-
-/*
-  Info: execute function
-  Return: success or error code
-*/
-int CudaBase::cuda_executeFunction() {
-
-  std::cout << "Execute function" << std::endl;
-  return DKS_SUCCESS;
-}
-		
-/*
-  Info: clean up
-  Return: success or error code
-*/
-int CudaBase::cuda_cleanUp() {
-
-  std::cout << "clean up" << std::endl;
-  return DKS_SUCCESS;
-	
-}
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@ -12,9 +12,15 @@
 #include <cufft.h>
 #include <cublas_v2.h>
 #include <curand_kernel.h>
-#include <nvToolsExt.h>
 #include <time.h>

+#define BLOCK_SIZE 128
+
+/**
+ * CUDA base class handles device setup and basic communication with the device.
+ * Handles device setup, memory management, data transfers and stream setup for
+ * asynchronous data transfers and kernel executions.
+ */
 class CudaBase {

 private:
@ -39,19 +45,25 @@ public:
   * Init cuda random number (cuRand) states.
   * Create an array of type curandState  with "size" elements on the GPU
   * and create a curandState with different seed for each array entry.
+   * If no seed is given create a seed based on current time.
   * Return success or error code
   */
-  int cuda_createCurandStates(int size);
+  int cuda_createCurandStates(int size, int seed = -1);

  /**
   * Delete curandState.
   * Delete curandState array on the GPU and free memory.
-   *  Return success or error code
+   * Return success or error code
   */
  int cuda_deleteCurandStates();

-  /** Get a pointer to curand states
-   *
+  /** 
+   * Create 'size' random numbers on the device and save in mem_ptr array.
+   */
+  int cuda_createRandomNumbers(void *mem_ptr, int size);
+
+  /** 
+   * Get a pointer to curand states.
   */
  curandState* cuda_getCurandStates();
 	
@ -68,93 +80,98 @@ public:
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);

  /**
-   * delete cuda stream
+   * delete cuda stream.
   * success or error code
   */
  int cuda_deleteStream(int id);

  /**
-   * delete all streams
+   * delete all streams.
   * success or error code
   */
  int cuda_deleteStreams();

  /**
-   * set stream to use
+   * set stream to use.
   * success or error code
   */
  int cuda_setStream(int id);

  /**
-   * Info: get stream that is used
-   *  Return: return id of curretn stream
+   * get stream that is used.
+   * Return: id of current stream
   */
  int cuda_getStreamId();

  /**
-   * Info: reset to default stream
+   * reset to default stream.
   * Return: success or error code
   */
  int cuda_defaultStream();

  /**
-   * Info: get number of streams
+   * get number of streams.
   * Return: success or error code
   */
  int cuda_numberOfStreams();

  /**
-   * Info: get stream
+   * get stream.
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);

  /**
-   * Get default cublass handle
+   * Get default cuBLAS handle.
   */
  cublasHandle_t cuda_getCublas();

  /**
-   * Info: get information on cuda devices
+   * get information on cuda devices.
   * Return: success or error code
   */
  int cuda_getDevices();

-  /** Get CUDA device count.
-   *  Sets the number of devices on the platform that can use CUDA.
-   *  Returns DKS_SUCCESS
+  /** 
+   * Get CUDA device count.
+   * Sets the number of devices on the platform that can use CUDA.
   */
  int cuda_getDeviceCount(int &ndev);

-  /** Get the name of the device.
-   *  QUery the device properties of the used device and set the string device_name
+  /** 
+   * Get the name of the device.
+   * Query the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);

-  /** Set CUDA device to use.
-   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
+  /** 
+   * Set CUDA device to use.
+   * If device passed in is larger than the number of devices use 
+   * the default:0 and return DKS_ERROR 
   */
  int cuda_setDevice(int device);

-  /** Get unique devices
-   *  Get array of indeces with the unique CUDA devices available on the paltform
+  /** 
+   * Get unique devices.
+   * Get array of indices with the unique CUDA devices available on the platform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);
 	
  /**
-   * Info: init device
+   * Initialize connection to the device.
+   * Only needed when runtime compilation is used.
   * Return: success or error code
   */
  int cuda_setUp();
 	
  /**
-   * Info: allocate memory on cuda device
+   * Allocate memory on cuda device.
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);

  /**
-   * Info: allocate host memory in pinned memory
+   * Allocate host memory in pinned memory
   * Return: success or error code
   */
  template<typename T>
@ -168,7 +185,43 @@ public:
  }		

  /** 
-   * Info: write data to memory
+   * Zero CUDA memory.
+   * Set all the elements of the array on the device to zero.
+   */
+  template<typename T>
+  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
+    // size and offset count elements of T; cudaMemset itself works in bytes.
+    const size_t nbytes = sizeof(T) * size;
+    const cudaError status = cudaMemset(mem_ptr + offset, 0, nbytes);
+    if (status == cudaSuccess)
+      return DKS_SUCCESS;
+
+    DEBUG_MSG("Error zeroing cuda memory!\n");
+    return DKS_ERROR;
+  }
+
+  /** 
+   * Zero CUDA memory async.
+   * Set size elements of the device array, starting at element offset, to zero on a stream.
+   * A negative streamId (default -1) uses the default stream; unknown ids return DKS_ERROR.
+   */
+  template<typename T>
+  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
+    // Ids at or beyond the number of created streams cannot be looked up.
+    if (streamId >= cuda_numberOfStreams())
+      return DKS_ERROR;
+    // Do not pass a negative id to cuda_getStream (presumably an index into the stream
+    // list - confirm); 0 is the CUDA default stream handle.
+    cudaStream_t stream = (streamId < 0) ? 0 : cuda_getStream(streamId);
+
+    cudaError cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size, stream);
+    if (cerror != cudaSuccess)
+      return DKS_ERROR;
+    return DKS_SUCCESS;
+  }
+
+  /** 
+   * Write data to memory
   * Retrun: success or error code
   */
  template<typename T>
@ -185,7 +238,7 @@ public:
  }

  /**
-   * Info: write data assynchonuously
+   * Write data asynchronously
   * Return: success or error code
   */
  template<typename T>
@ -217,7 +270,7 @@ public:
  }
 		
  /**
-   * Info: read data from memory
+   * Read data from memory
   * Return: success or error code
   */
  template<typename T>
@ -234,7 +287,7 @@ public:
  }

  /**
-   * Info: read data async from device memory
+   * Read data async from device memory
   * Return: success or error code
   */
  template<typename T>
@ -266,19 +319,19 @@ public:
  }
 		
  /**
-   * Info: free memory on device
+   * Free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);
 	
  /**
-   * Info: free page locked memory on host
+   * Free page locked memory on host
   * Return: success or erro code
   */
  int cuda_freeHostMemory(void * mem_ptr);
 	
  /**
-   * Info: allcate memory and write data (push)
+   * Allocate memory and write data (push)
   * Return: pointer to memory object
   */
  template<typename T>
@ -294,7 +347,7 @@ public:
  }
 		
  /**
-   * Info: read data and free memory (pull)
+   * Read data and free memory (pull)
   * Return: success or error code
   */
  template<typename T>
@ -312,21 +365,10 @@ public:
    else
      return DKS_ERROR;
  }
-
-  /**
-   * Info: execute function
-   * Return: success or error code
-   */
-  int cuda_executeFunction();
-		
-  /**
-   * Info: clean up
-   * Return: success or error code
-   */
-  int cuda_cleanUp();
  
  /**
-   * Info: sync cuda device
+   * Sync cuda device.
+   * Waits till all the tasks on the GPU are finished.
   * Return: success or error code
   */
  int cuda_syncDevice() {
@ -335,7 +377,7 @@ public:
  }

  /**
-   * Page-lock host memory
+   * Page-lock host memory.
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
@ -349,7 +391,7 @@ public:
  }

  /**
-   * Release page locked memory
+   * Release page locked memory.
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
@ -362,7 +404,7 @@ public:
  }

  /**
-   * Info: print device memory info (total, used, avail)
+   * Print device memory info (total, used, avail)
   * Return: success or error code
   */
  int cuda_memInfo() {
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@ -8,6 +8,7 @@

 #include "CudaBase.cuh"

+/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {

 private:
--- a/src/CUDA/CudaChiSquareRuntime.cu
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@ -86,15 +86,19 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {

  //create program
  nvrtcProgram prog;
-  //std::cout << cudaProg.c_str() << std::endl;
-  nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+//  std::cout << cudaProg.c_str() << std::endl;
+  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+  if (createResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Program creation failed!");
+    return DKS_ERROR;
+  }

  //compile program
-  const char *opts[] = {"-fmad=false", ""};
-  int numopts = 1;
+  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+  int numopts = 2;
  if (mlh) {
-    opts[1] = "-DMLH";
-    numopts = 2;
+    opts[2] = "-DMLH";
+    numopts = 3;
  }

  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
@ -118,7 +122,11 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  if (ptx_m != NULL)
    delete[] ptx_m;
  size_t ptxSize; 
-  nvrtcGetPTXSize(prog, &ptxSize); 
+  nvrtcResult ptxSizeResult = nvrtcGetPTXSize(prog, &ptxSize);
+  if (ptxSizeResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("PTX get size error!");
+    return DKS_ERROR;
+  }
  ptx_m = new char[ptxSize]; 
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);  
  
@ -127,10 +135,26 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
    return DKS_ERROR;
  }

+  // add some additional diagnostics
+  const int buffer_size = 8192;
+  CUjit_option options[3];
+  void* values[3];
+  char error_log[buffer_size];
+  int err;
+  options[0] = CU_JIT_ERROR_LOG_BUFFER;
+  values[0]  = (void*)error_log;
+  options[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  values[1]  = (void*)buffer_size;
+  options[2] = CU_JIT_TARGET_FROM_CUCONTEXT;
+  values[2]  = 0;
  //load module from ptx
-  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); 
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 3, options, values); 
  if (loadResult != CUDA_SUCCESS) {
-    DEBUG_MSG("Load module from ptx failed!");
+    const char *err_msg;
+    cuGetErrorString(loadResult, &err_msg);
+    std::string msg = "Load module from ptx failed! (" + std::to_string(loadResult) + ") : " + err_msg;
+    DEBUG_MSG(msg);
+    DEBUG_MSG(error_log);
    return DKS_ERROR;
  }

--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double

 const std::string cudaFunctFooter = "}\n";

+/**
+ * CUDA implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
+ */
 class CudaChiSquareRuntime : public ChiSquareRuntime{

 private:
@ -29,65 +33,72 @@ private:

  cublasHandle_t defaultCublasRT;

-  /** Setup to init device
-   *  Create context and init device for RT compilation
+  /** 
+   * Setup to init device.
+   * Create context and init device for RT compilation
   */
  void setUpContext();

-  /** Private function to add function to kernel string
-   *
+  /** 
+   * Private function to add function to kernel string.
   */
  std::string buildProgram(std::string function);

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase argument
   */
  CudaChiSquareRuntime(CudaBase *base);

-  /** Default constructor init cuda device
-   *
+  /** 
+   * Default constructor init cuda device
   */
  CudaChiSquareRuntime();
  
-  /** Default destructor
-   *
+  /** 
+   * Default destructor.
   */
  ~CudaChiSquareRuntime();

-  /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable
+   * Result is put in &result variable.
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -96,14 +107,16 @@ public:
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);


-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check if CUDA device is able to run the chi square kernel.
-   *  Redundant - all new CUDA devices that support RT compilation will also support 
-   *  double precision, there are no other requirements to run chi square on GPU
+  /** 
+   * Check if CUDA device is able to run the chi square kernel.
+   * Redundant - all new CUDA devices that support RT compilation will also support 
+   * double precision, there are no other requirements to run chi square on GPU
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
    return DKS_SUCCESS;
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@ -1,16 +1,16 @@
 #include "CudaCollimatorPhysics.cuh"

-//#define M_P 0.93827231e+00
+//constants used in OPAL
 #define M_P 0.93827204e+00
 #define C 299792458.0
 #define PI 3.14159265358979323846
 #define AVO 6.022e23
 #define R_E 2.81794092e-15
-//#define eM_E 0.51099906e-03
 #define eM_E 0.51099892e-03
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7

+//parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@ -23,16 +23,56 @@
 #define X0_M 9
 #define I_M 10
 #define DT_M 11
+#define LOWENERGY_THR 12

 #define BLOCK_SIZE 128
-#define NUMPAR 12
+#define NUMPAR 13

+/**
+ * CUDA device function for calculating dot product.
+ */
 __device__ inline double dot(double3 &d1, double3 &d2) {

  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);

 }

+/**
+ * CUDA device function to calculate cross product.
+ */
+__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
+  double3 tmp;
+  tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
+  tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
+  tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
+  return tmp;
+}
+
+/**
+ * CUDA device function to calculate arbitrary rotation.
+ */
+__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
+  double c=cos(Theta);
+  double s=sin(Theta);
+  double dotW = sqrt(dot(W,W));
+  W.x = W.x / dotW;
+  W.y = W.y / dotW;
+  W.z = W.z / dotW;
+
+  double dotWR = dot(W, Rorg) * (1.0 - c);
+  double3 crossW = cross(W, Rorg);
+  double3 tmp;
+  tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
+  tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
+  tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
+  return tmp;
+} 
+
+/**
+ * CUDA device function to check if particle is still in material.
+ * z - particle position, par - parameter array. Particle is considered inside the
+ * material if z is > material starting position and z < material starting position - mat size.
+ */
 __device__ inline bool checkHit(double &z, double *par) {

  /* check if particle is in the degrader material */
@ -41,6 +81,11 @@ __device__ inline bool checkHit(double &z, double *par) {
 }


+/**
+ * CUDA device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
+ * algorithm are available in the OPAL user guide.
+ */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {

@ -81,51 +126,57 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
    Eng = Eng + delta_E / 1E3;
  }

-  pdead = ((Eng<1E-4) || (dEdx>0));
+  pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );

 }

-__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
-			   double &normP, double &thetacou, double &deltas, int coord,
+/**
+ * CUDA device function for rotation in 2 dimensions.
+ * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
+ * "Passage of particles through matter"
+ */
+__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &plane, 
+			   double &betaGamma, double &thetacou, double &deltas, int coord,
 			   double *par) 
 {
-  double Psixz;
-  double pxz;
+  // Calculate the angle between the px and pz momenta to change from beam coordinate to lab coordinate
+  const double Psi = atan2(px, pz);
+  const double pxz = sqrt(px*px + pz*pz);
+  const double cosPsi = cos(Psi);
+  const double sinPsi = sin(Psi);
+  const double cosTheta = cos(thetacou);
+  const double sinTheta = sin(thetacou);

-  if (px>=0 && pz>=0)
-    Psixz = atan(px/pz);
-  else if (px>0 && pz<0)
-      Psixz = atan(px/pz) + PI;
-  else if (px<0 && pz>0)
-    Psixz = atan(px/pz) + 2*PI;
-  else
-    Psixz = atan(px/pz) + PI;
+  // Apply the rotation about the random angle thetacou & change from beam
+  // coordinate system to the lab coordinate system using Psi (2 dimensions)
+  x += deltas * px / betaGamma + plane * cosPsi;
+  z -= plane * sinPsi;

-  pxz = sqrt(px*px + pz*pz);
-
-  if(coord==1) {
-    x = x + deltas * px/normP + xplane*cos(Psixz);
-    z = z - xplane * sin(Psixz);
+  if (coord == 1) {
+    z += deltas * pz / betaGamma;
  }

-  if(coord==2) {
-    x = x + deltas * px/normP + xplane*cos(Psixz);
-    z = z - xplane * sin(Psixz) + deltas * pz / normP;
-  }
-
-  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
-  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+  px = pxz * (cosPsi * sinTheta + sinPsi * cosTheta);
+  pz = pxz * (-sinPsi * sinTheta + cosPsi * cosTheta);
 }

-__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
+/**
+ * CUDA device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * For details on the algorithm see OPAL user guide.
+ */
+__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
+				   bool enableRutherfordScattering) 
+{

  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
  double gamma = (Eng + M_P) / M_P;
-  double normP = sqrt(dot(P, P));
  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  double betaGamma = sqrt(dot(P, P));
  double deltas = par[DT_M] * beta * C;
+  double mass = M_P * 1e9; // in eV

-  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * 
+  double theta0 = 13.6e6 / (beta * betaGamma * mass) * 
    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));

  // x-direction: See Physical Review, "Multiple Scattering"
@ -140,19 +191,9 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
  }

  //__syncthreads();  
-
-  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
-  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
-
-  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-  if(P2 < 0.0047) {
-    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
-    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    if(P4 > 0.5)
-      thetaru = -thetaru;
-    Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
-  }
+  //double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  double xplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
+  Rot(P.x, P.z, R.x, R.z, xplane, betaGamma, thetacou, deltas, 0, par);

  // y-direction: See Physical Review, "Multiple Scattering"
  z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
@ -165,78 +206,109 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
    thetacou = z2 * theta0;
  }

-  //__syncthreads();
+  //double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  double yplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
+  Rot(P.y,P.z,R.y,R.z, yplane, betaGamma, thetacou, deltas, 1, par);

-  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
-  Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
-
-  P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-  if(P2 < 0.0047) {
+  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+  if( (P2 < 0.0047) && enableRutherfordScattering) {
    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
-    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
-    if(P4 > 0.5)
-      thetaru = -thetaru;
-    Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
+    //double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
+    double phiru = 2.0 * M_PI * curand_uniform_double(&state);
+    double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
+    double3 W,X;
+    
+    double dotP = sqrt(dot(P,P));
+    X.x = cos(phiru)*sin(thetaru) * dotP;
+    X.y = sin(phiru)*sin(thetaru) * dotP;
+    X.z = cos(thetaru) * dotP;
+    W.x = -P.y;
+    W.y = P.x;
+    W.z = 0.0;
+    P = ArbitraryRotation(W, X, th0);
  }

 }


+/**
+ * CUDA kernel that performs one step in particle movement through matter.
+ * One thread is launched for each particle in the simulation. The kernel checks if the particle
+ * is still in the material, performs energy loss calculations and Coulomb scattering, and marks
+ * particles that are exiting the material.
+ * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
-					int numparticles)
+					int numparticles, bool enableRutherfordScattering)
 {

  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

-  //transfer params to shared memory
+  //transfer params and particle positions to shared memory
+  //R is kept in shared memory in order to reduce register pressure for the kernel
  extern __shared__ double smem[];
  double *p = (double*)smem;
-  double3 *R = (double3*)&smem[NUMPAR];
+  double3 *R = (double3*)&smem[NUMPAR]; 

-  curandState s; 
+  curandState s; //each tread gets its own cuRand state for random number generation
  double3 P;

+  //load parameters to shared memory
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

+  //sync threads to ensure that parameters are finished loading
  __syncthreads();

+  //there might be some empty threads that do no work
  if (idx < numparticles) {
-    s = state[idx];
-    R[tid] = data[idx].Rincol;
-    P = data[idx].Pincol;
+    s = state[idx]; //load cuRand state to local memory
+    R[tid] = data[idx].Rincol; //load position to shared memory
+    P = data[idx].Pincol; //load momentum to local memory

    bool pdead = false;  
    volatile double sq = sqrt(1.0 + dot(P, P));

    double Eng;
    
+    //check if particle is still in the material
    if (checkHit(R[tid].z, p)) {      

+      //calculate energy loss
      Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);

+      //check if particle is not dead
      if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));

+	//calculate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
-	coulombScat(R[tid], P, s, p); 

+	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
+
+	//update particle momentum
 	data[idx].Pincol = P;
      } else {
+	//mark particle as dead (-1)
 	data[idx].label = -1;
      }
-    
+      
+      //update cuRand state
      state[idx] = s;
    } else {
-    
+      //particle exits material - drift and mark as exiting (-2)
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -244,13 +316,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
      
    }
 
+    //update particle position
    data[idx].Rincol = R[tid];
  }

 }

-__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
-					 curandState *state, int numparticles)
+/**
+ * CUDA kernel that performs one step in particle movement through matter using SoA particles.
+ * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
+ * Deprecated - GPU version does not use SoA.
+ * @param[in] data structure of arrays containing particle data
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
+__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
+					   curandState *state, int numparticles,
+					   bool enableRutherfordScattering)
 {

  //get global id and thread id
@ -288,7 +372,7 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
    	P.x = P.x * ptot / sq;
    	P.y = P.y * ptot / sq;
    	P.z = P.z * ptot / sq;
-    	coulombScat(R[tid], P, s, p); 
+    	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
      
    	data.Pincol[idx] = P;
    } else {
@ -309,92 +393,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,

 }

-
+/**
+ * Device function to switch off unitless positions.
+ */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x *= c;
  a.y *= c;
  a.z *= c;
 }

+/**
+ * Device function to switch on unitless positions.
+ */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x /= c;
  a.y /= c;
  a.z /= c;
 }

-//swithch to unitless positions with dtc
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOn(R, dtc);
-    unitlessOn(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//swithc to unitless positions with dt*c
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-
-//swithc off unitless positions with dtc
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOff(R, dtc);
-    unitlessOff(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//switch off unitelss positions with dt*c
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-

+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] npart number of particles
+ * @param[in] dtc dt*c
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {

  //get global id and thread id
@ -422,8 +446,15 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  }
 }

-
-__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
+__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {

  //get global id and thread id
  volatile int tid = threadIdx.x;
@ -449,7 +480,61 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, dou
  }
 }

-//TODO: kernel for push with switch off unitless positions with dt[i]*c
+/** 
+ * CUDA kernel to perform particle kick.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gEf array of electric field values, one per particle
+ * @param[in] *gBf array of magnetic field values, one per particle
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
+__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
+			   double3 *gBf, double *gdt, double charge, 
+			   double mass, int npart, double c)
+{
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  if (idx < npart) {
+    double3 R = gR[idx];
+    double3 P = gP[idx];
+    double3 Ef = gEf[idx];
+    double3 Bf = gBf[idx];
+    double dt = gdt[idx];
+
+    P.x += 0.5 * dt * charge * c / mass * Ef.x;
+    P.y += 0.5 * dt * charge * c / mass * Ef.y;
+    P.z += 0.5 * dt * charge * c / mass * Ef.z;
+
+    double gamma = sqrt(1.0 + dot(P, P));
+    double3 t, w, s;
+    t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
+    t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
+    t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
+    
+    double3 crossPt = cross(P, t);
+    w.x = P.x + crossPt.x;
+    w.y = P.y + crossPt.y;
+    w.z = P.z + crossPt.z;
+
+    s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
+    s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
+    s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
+
+    double3 crossws = cross(w, s);
+    P.x += crossws.x;
+    P.y += crossws.y;
+    P.z += crossws.z;
+
+    P.x += 0.5 * dt * charge * c / mass * Ef.x;
+    P.y += 0.5 * dt * charge * c / mass * Ef.y;
+    P.z += 0.5 * dt * charge * c / mass * Ef.z;
+
+    gP[idx] = P;
+  }
+}

 __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {

@ -554,64 +639,8 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection

 }

-struct compare_particle
-{
-  int threshold;
-
-  compare_particle() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART p1, CUDA_PART p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct compare_particle_small
-{
-  int threshold;
-
-  compare_particle_small() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART_SMALL p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct less_then
-{
-  __host__ __device__
-  bool operator()(int x)
-  {
-    return x < 0;
-  }
-};
-
-int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
+int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
+					     bool enableRutherfordScattering)
 {

  int threads = BLOCK_SIZE;
@ -624,7 +653,8 @@ int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int n
  kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr, 
 							  (double*)par_ptr,
 							  m_base->cuda_getCurandStates(),
-							  numparticles);
+							  numparticles,
+							  enableRutherfordScattering);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
@ -671,12 +701,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
    }
  } else {
    if (streamId == -1) {
-      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-				      (double*)dt_ptr, c);
+      kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, 
+				      (double*)dt_ptr, npart, c);
    } else {
      cudaStream_t cs = m_base->cuda_getStream(streamId);
-      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, 
-					      (double*)dt_ptr, c);
+      kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
+					      (double*)dt_ptr, npart, c);
    }
  }

@ -684,6 +714,29 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
  return DKS_SUCCESS;
 }

+int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
+						void *bf_ptr, void *dt_ptr, double charge,
+						double mass, int npart,
+						double c, int streamId) 
+{
+
+  int threads = BLOCK_SIZE;
+  int blocks = npart / threads + 1;
+
+  //call kernel
+  if (streamId == -1) {
+    kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
+				    (double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
+  } else {
+    cudaStream_t cs = m_base->cuda_getStream(streamId);
+    kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, 
+					    (double3*)ef_ptr, (double3*)bf_ptr, 
+					    (double*)dt_ptr, charge, mass,  npart, c);
+  }
+
+  return DKS_SUCCESS;
+}
+
 int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
 							 void *lastSec_ptr, void *orient_ptr, 
 							 int npart, int nsec, 
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@ -20,7 +20,8 @@
 #include "CudaBase.cuh"

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU or MIC as AoS.
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
 */
 typedef struct __align__(16) {
  int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
 } CUDA_PART;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as AoS
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int label;
@ -47,7 +51,8 @@ typedef struct {
 } CUDA_PART_SMALL;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as SoA.
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
 */
 typedef struct {
  int *label;
@ -65,6 +70,9 @@ typedef struct {

 /**
 * Structure for storing particle on GPU
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int *label;
@ -73,11 +81,39 @@ typedef struct {
  double3 *Pincol;
 } CUDA_PART2_SMALL;

-/** CudaCollimatorPhysics class.
- * Contains kerenls that execute CollimatorPhysics functions form OPAL.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation
+/** 
+ * Operator used in thrust sort to compare particles by label.
+ * Used to move dead particles to the end of array, since they have label -1 or -2.
 */
-class CudaCollimatorPhysics : public DKSCollimatorPhysics{
+struct compare_particle_small
+{
+  int threshold;
+
+  compare_particle_small() {
+    threshold = 0;
+  }
+
+  void set_threshold(int t) {
+    threshold = t;
+  }
+
+  __host__ __device__
+  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
+    return p1.label > p2.label;
+  }
+
+  __host__  __device__
+  bool operator()(CUDA_PART_SMALL p1) {
+    return p1.label < threshold;
+  }
+};
+
+/** 
+ * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Contains kernels that execute CollimatorPhysics functions from OPAL.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
+class CudaCollimatorPhysics : public DKSCollimatorPhysics {

 private:

@ -86,32 +122,44 @@ private:

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase as argument.
+   * Create a new instance of CudaCollimatorPhysics using an existing CudaBase object.
   */
  CudaCollimatorPhysics(CudaBase *base) {
    m_base = base;
    base_create = false;
  }

-  /** Constructor - empty. */
+  /** 
+   * Empty constructor.
+   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
+   */
  CudaCollimatorPhysics() { 
    m_base = new CudaBase();
    base_create = true;
  }

-  /** Destructor - empty */
+  /** 
+   * Destructor.
+   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
+   */
  ~CudaCollimatorPhysics() { 
    if (base_create)
      delete m_base;
  };

-  /** Execute collimator physics kernel.
+  /** 
+   * Execute collimator physics kernel.
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-			int numpartices);
+			int numpartices, bool enableRutherforScattering = true);

+  /** 
+   * Special case CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
      return DKS_ERROR;
    }

-  /** Sort particle array on GPU.
+  /** 
+   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  
+  /** 
+   * Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,14 +187,25 @@ public:
      return DKS_ERROR;
    }

-  /** BorisPusher push function for integration from OPAL.
+  /** 
+   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);

-  /** BorisPusher push function with transformto function form OPAL
+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   */
+  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
+			   void *bf_ptr, void *dt_ptr, double charge, double mass,
+			   int npart, double c, int streamId = -1); 
+
+  /** 
+   * BorisPusher push function with transformto function from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@ -10,7 +10,11 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"

-class CudaFFT : public DKSFFT{
+/**
+ * Cuda FFT class based on BaseFFT interface.
+ * Uses the cuFFT library to perform FFTs on NVIDIA GPUs.
+ */
+class CudaFFT : public BaseFFT {

 private:

@ -34,7 +38,7 @@ public:
  ~CudaFFT();
 		
  /**
-   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Init cufftPlans which can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }

  /**
-   * Info: destroy default FFT plans
+   * Destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();

-  /*
-    Info: execute complex to complex double precision fft using cufft library
-    Return: success or error code
-  */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
 		
-  /*
-    Info: execute ifft 
-    Return: success or error code
-  */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute normalize using cuda kernel for complex to complex iFFT
-    Return: success or error code
-  */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute real to complex double precision FFT
-    Return: success or error code
-  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: exectue complex to real double precision FFT
-    Return: success or error code
-  */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);

-  /*
-    Info: execute normalize for complex to real iFFT
-    Return: success or error code
-  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);

 };
--- a/src/CUDA/CudaGreensFunction.cu
+++ b/src/CUDA/CudaGreensFunction.cu
@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
      tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
  
    tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
-  
+    
    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
-  
+
    rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
  }
-
 }


@ -273,28 +272,20 @@ __global__ void mirroredRhoField(double *rho2_m,
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;
    
-    
    double data = rho2_m[id1];
-    if (i != 0)
-      rho2_m[id2] = data;
+    if (i != 0) rho2_m[id2] = data;
    
-    if (j != 0)
-      rho2_m[id3] = data;
+    if (j != 0) rho2_m[id3] = data;

-    if (i != 0 && j != 0)
-      rho2_m[id4] = data;
+    if (i != 0 && j != 0) rho2_m[id4] = data;
    
-    if (k != 0) 
-      rho2_m[id5] = data;
+    if (k != 0) rho2_m[id5] = data;
    
-    if (k !=  0 && i != 0)
-      rho2_m[id6] = data;
+    if (k !=  0 && i != 0) rho2_m[id6] = data;
    
-    if (k!= 0 && j != 0)
-      rho2_m[id7] = data;
+    if (k!= 0 && j != 0) rho2_m[id7] = data;
    
-    if (k != 0 && j != 0 & i != 0)
-      rho2_m[id8] = data;
+    if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
      
  }

@ -363,9 +354,9 @@ CudaGreensFunction::~CudaGreensFunction() {
    delete m_base;
 }

-int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-					    double hr_m0, double hr_m1, double hr_m2,
-					    int streamId)
+int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+				       double hr_m0, double hr_m1, double hr_m2,
+				       int streamId)
 {
  
  int thread = 128;
@ -373,7 +364,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i

  //if no stream specified use default stream
  if (streamId == -1) {
-    kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);

    return DKS_SUCCESS;
  }
@ -381,7 +372,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
  
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
+    kernelTmpgreen_2<<< block, thread, 0,  cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
    return DKS_SUCCESS;
  }
  
@ -389,15 +380,17 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
  
 }

-int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, 
-						       int I, int J, int K,
-						       int streamId) 
+int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, 
+						  int I, int J, int K,
+						  int streamId) 
 {
  
  int thread = 128;
  int block = (I * J * K / thread) + 1;
+  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);

  if (streamId == -1) {
+    m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
    kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, 
 					    2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -406,6 +399,7 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
  
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
+    m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
    kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, 
 						  2*(I - 1), 2*(J - 1), I, J, K);
    return DKS_SUCCESS;
@ -415,22 +409,22 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
  return DKS_ERROR;
 }

-int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
+int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
  
  int thread = 128;
  int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
  
  if (streamId == -1) {
-    mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread >>>( (double *) mem_ptr,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
+    mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField<<< block, thread >>>( (double *) rho2_m,  2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
    return DKS_SUCCESS;
  }
  
  
  if (streamId < m_base->cuda_numberOfStreams()) {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
-    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I,  2*J);
-    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
+    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I,  2*J);
+    mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
    
    return DKS_SUCCESS;
  }
@ -440,13 +434,13 @@ int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K,
  return DKS_ERROR;
 }

-int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, 
-						   int size, int streamId) {
+int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, 
+					      int size, int streamId) {
  
  int threads = 128;
  int blocks = size / threads + 1;
  int datasize = 2 * threads * sizeof(cuDoubleComplex);
-
+  
  if (streamId == -1) {
    multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1, 
 							     (cuDoubleComplex*)ptr2, 
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@ -2,17 +2,18 @@
 #define H_CUDA_GREENSFUNCTION

 #include <iostream>
-#include <math.h>
+#include <cmath>

 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuComplex.h>
 #include "cublas_v2.h"

-
+#include "../Algorithms/GreensFunction.h"
 #include "CudaBase.cuh"

-class CudaGreensFunction {
+/** CUDA implementation of the GreensFunction calculation for OPAL's Poisson solver. */
+class CudaGreensFunction : public GreensFunction{

 private:
 	
@ -30,32 +31,32 @@ public:
  /* destructor */
  ~CudaGreensFunction();
 		
-  /*
+  /**
    Info: calc itegral on device memory (taken from OPAL src code)
    Return: success or error code
  */
-  int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
-			  double hr_m0, double hr_m1, double hr_m2, 
-			  int streamId = -1);
+  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+		       double hr_m0, double hr_m1, double hr_m2, 
+		       int streamId = -1);
 		
-  /*
+  /**
    Info: integration of rho2_m field (taken from OPAL src code)
    Return: success or error code
  */
-  int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
-				     int streamId = -1);
+  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
+				  int streamId = -1);
 		
-  /*
+  /**
    Info: mirror rho field (taken from OPAL src code)
    Return: succes or error code
  */
-  int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
+  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);

-  /*
+  /**
    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
    Return: success or error code
  */
-  int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
+  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);


 };
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@ -10,6 +10,7 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"

+/** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {

 private:
--- a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@ -83,6 +83,56 @@ __device__ double ifld(double t, double alpha, double phi, double nu, double lam
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+/**
+ * muSR theory function ifgk evaluated at time t:
+ *   (1-alpha)*exp(-(lambda*t)^beta)
+ *     + alpha*(cos(wt) - sigma^2*t/(TWO_PI*nu)*sin(wt))*exp(-0.5*(sigma*t)^2),
+ * with wt = TWO_PI*nu*t. For nu < 0.01 the bracket is replaced by
+ * (1 - (sigma*t)^2); for beta < 0.001 the exponent (lambda*t)^beta is
+ * replaced by 1. Returns 0 if lambda < 0.
+ */
+__device__ double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  if (lambda < 0.0)
+    return 0.0;
+
+  const double wt = TWO_PI*nu*t;
+  const double rate2 = sigma*sigma*t*t;
+  const double rateL = (beta < 0.001) ? 1.0 : pow(lambda*t, beta);
+
+  if (nu < 0.01)
+    return (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+
+  // sigma^2*t^2/wt is written as sigma^2*t/(TWO_PI*nu): the same value for
+  // t != 0, but finite at t = 0 where the former evaluates 0/0 = NaN.
+  // nu is not below 0.01 on this path, so the divisor is never zero.
+  return (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t/(TWO_PI*nu)*sin(wt))*exp(-0.5*rate2);
+}
+
+/**
+ * muSR theory function ifll evaluated at time t:
+ *   (1-alpha)*exp(-(lambda*t)^beta)
+ *     + alpha*(cos(wt) - a/(TWO_PI*nu)*sin(wt))*exp(-a*t), wt = TWO_PI*nu*t.
+ * For nu < 0.01 the bracket is replaced by (1 - a*t); for beta < 0.001
+ * the exponent (lambda*t)^beta is replaced by 1. Returns 0 if lambda < 0.
+ */
+__device__ double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  // a negative lambda is rejected up front
+  if (lambda < 0.0)
+    return 0.0;
+
+  const double phase = TWO_PI*nu*t;
+  const double damping = a*t;
+
+  // exponent of the (1-alpha) decay term
+  const double stretched = (beta < 0.001) ? 1.0 : pow(lambda*t, beta);
+
+  // below nu = 0.01 the form without the trigonometric terms is used
+  if (nu < 0.01)
+    return (1.0-alpha)*exp(-stretched) + alpha*(1.0-damping)*exp(-damping);
+
+  return (1.0-alpha)*exp(-stretched) + alpha*(cos(phase)-a/(TWO_PI*nu)*sin(phase))*exp(-damping);
+}
+
 __device__ double b(double t, double phi, double nu) {
  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
--- a/src/DKSBase.cpp
+++ b/src/DKSBase.cpp
@ -103,25 +103,14 @@ DKSBase::DKSBase() {

 #ifdef DKS_CUDA
  cbase = new CudaBase();
-  cfft = new CudaFFT(cbase);
-  cgreens = new CudaGreensFunction(cbase);
-  cchi = new CudaChiSquare(cbase);
-  ccol = new CudaCollimatorPhysics(cbase);
 #endif

 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
-  oclfft = new OpenCLFFT(oclbase);
-  oclchi = new OpenCLChiSquare(oclbase);
-  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif

 #ifdef DKS_MIC
  micbase = new MICBase();
-  micfft = new MICFFT(micbase);
-  miccol = new MICCollimatorPhysics(micbase);
-  micgreens = new MICGreensFunction(micbase);
-  micchi = new MICChiSquare(micbase);
 #endif

 }
@ -138,25 +127,14 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {

 #ifdef DKS_CUDA
  cbase = new CudaBase();
-  cfft = new CudaFFT(cbase);
-  cgreens = new CudaGreensFunction(cbase);
-  cchi = new CudaChiSquare(cbase);
-  ccol = new CudaCollimatorPhysics(cbase);
 #endif

 #ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
-  oclfft = new OpenCLFFT(oclbase);
-  oclchi = new OpenCLChiSquare(oclbase);
-  oclcol = new OpenCLCollimatorPhysics(oclbase);
 #endif

 #ifdef DKS_MIC
  micbase = new MICBase();
-  micfft = new MICFFT(micbase);
-  miccol = new MICCollimatorPhysics(micbase);
-  micgreens = new MICGreensFunction(micbase);
-  micchi = new MICChiSquare(micbase);
 #endif

 }
@ -173,27 +151,16 @@ DKSBase::~DKSBase() {
  if (m_function_name != NULL)
    delete[] m_function_name;

- 
 #ifdef DKS_CUDA
-  delete cfft;
-  delete cgreens;
-  delete cchi;
-  delete ccol;
  delete cbase;
 #endif

 #ifdef DKS_OPENCL
-  delete oclfft;
-  delete oclchi;
-  delete oclcol;
  delete oclbase;
 #endif

+
 #ifdef DKS_MIC
-  delete micfft;
-  delete miccol;
-  delete micgreens;
-  delete micchi;
  delete micbase;
 #endif

@ -307,38 +274,45 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
    return DKS_ERROR;
 }

-/*
-  init device
-*/
-int DKSBase::initDevice() {
+int DKSBase::setupDevice() {
+
+  int ierr = DKS_ERROR;

  //if api is not set default is OpenCL
  if (!m_api_set) {
    setDevice("-gpu", 4);
    setAPI(API_OPENCL, 6);
-    return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+    ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
  } else {
    if (apiOpenCL()) {
      if (!m_device_set) {
 	setDevice("-gpu", 4);
 	setAPI(API_OPENCL, 6);
-	return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
      } else {
 	setAPI(API_OPENCL, 6);
-	return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
+	ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
      }
    } else if (apiCuda()) {
      setDevice("-gpu", 4);
      setAPI(API_CUDA, 4);			
-      return CUDA_SAFECALL(DKS_SUCCESS);
+      ierr = CUDA_SAFECALL(DKS_SUCCESS);
    } else if (apiOpenMP()) {
      setDevice("-mic", 4);
      setAPI(API_OPENMP, 6);
-      return MIC_SAFECALL(DKS_SUCCESS);
+      ierr = MIC_SAFECALL(DKS_SUCCESS);
    }
  }

-  return DKS_ERROR;
+  return ierr;
+
+}
+
+/*
+  init device
+*/
+int DKSBase::initDevice() {
+  return setupDevice();
 }

 /* 
@ -456,360 +430,19 @@ int DKSBase::syncDevice() {
  return DKS_ERROR;
 }

-/* setup fft plans to reuse if multiple ffts of same size are needed */
-int DKSBase::setupFFT(int ndim, int N[3]) {
-
-  if (apiCuda()) {
-    return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
-  } else if (apiOpenMP()) {
-    //micbase.mic_setupFFT(ndim, N);
-    //BENI: setting up RC and CR transformations on MIC
-    int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
-    int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
-    if (ierr1 != DKS_SUCCESS)
-      return ierr1;
-    if (ierr2 != DKS_SUCCESS)
-      return ierr2;
-    return DKS_SUCCESS;
-  }
-
-  return DKS_ERROR;
-
-}
-//BENI:
-int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {

+int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
  if (apiCuda())
-    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
+    return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
+  if (apiOpenCL())
+    return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));

  return DKS_ERROR;
-
 }

-//BENI:
-int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
-
-  if (apiCuda())
-    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
-
-  return DKS_ERROR;
-
-}
-
-/* call OpenCL FFT function for selected platform */
-int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiOpenCL()) {
-    //load kernel and execute
-    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
-      return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
-    else
-      return DKS_ERROR;
-  } else if (apiCuda()) {
-    return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
-  } else if (apiOpenMP()) {
-    return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
-  }
-   
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call OpenCL IFFT function for selected platform */
-int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiOpenCL()) {
-    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
-      return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
-    else
-      return DKS_ERROR;
-  } else if (apiCuda()) {
-    return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
-  } else if (apiOpenMP()) {
-    return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
-  }
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call normalize FFT function for selected platform */
-int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiOpenCL()) {
-    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
-      return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
-    else 
-      return DKS_ERROR;
-  } else if (apiCuda()) {
-    return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
-  } else if (apiOpenMP()) {
-    return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
-  }
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call real to complex FFT */
-int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiCuda())
-    return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
-  else if (apiOpenMP())
-    return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call complex to real FFT */
-int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiCuda())
-    return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
-  else if (apiOpenMP())
-    return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* normalize complex to real iFFT */
-int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiCuda())
-    return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_SUCCESS;
-}
-
-/* normalize complex to real iFFT */
-int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
-  if (apiOpenCL()) {
-    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
-      return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
-    else
-      return DKS_ERROR;
-  }
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-
-}
-
-int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
-				double hz_m0, double hz_m1, double hz_m2, int streamId) {
-
-  if (apiCuda()) {
-    return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ, 
-						      hz_m0, hz_m1, hz_m2, streamId) );
-  } else if (apiOpenMP()) {
-    //BENI:
-    return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
-  } 
-
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
-				   int I, int J, int K, int streamId) {
-
-  if (apiCuda())
-    return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
-  
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
-
+int DKSBase::callInitRandoms(int size, int seed) {
  if (apiCuda()) 
-    return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
-  
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
-
-  if (apiCuda())
-    return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
-
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-
-int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
-			     double fTimeResolution, double fRebin,
-			     int sensors, int length, int numpar, double &result)
-{
-
-  if (apiCuda()) {
-    return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
-						fTimeResolution, fRebin,
-						sensors, length, numpar, 
-						result));
-  } else if (apiOpenCL()) {
-
-    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
-      return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
-						     fTimeResolution, fRebin,
-						     sensors, length, numpar, result));
-    else
-      return DKS_ERROR;
-  }
-   
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-
-}
-
-int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
-			       double fTimeResolution, double fRebin, double fGoodBinOffset,
-			       int sensors, int length, int numpar,
-			       double &result)
-{
-  if (apiCuda()) {
-    return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
-						  fTimeResolution, fRebin, fGoodBinOffset,
-						  sensors, length, numpar,
-						  result));
-  } else if (apiOpenCL()) {
-    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
-      return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
-						       fTimeResolution, fRebin, fGoodBinOffset,
-						       sensors, length, numpar, result));
-    else
-      return DKS_ERROR;
-  }
-   
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
- 
-}
-
-int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
-				 double fTimeResolution, double fRebin, double fGoodBinOffset,
-				 int sensors, int length, int numpar,
-				 double &result)
-{
-  if (apiCuda()) {
-    return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
-						    fTimeResolution, fRebin, fGoodBinOffset,
-						    sensors, length, numpar,
-						    result));
-  } else if (apiOpenCL()) {
-    
-    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
-      return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
-							 fTimeResolution, fRebin, fGoodBinOffset,
-							 sensors, length, numpar, result));
-    else
-      return DKS_ERROR;
-  }
-  
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-  
-}
-
-int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
-				   int numparticles, int numparams,
-				   int &numaddback, int &numdead) 
-{
-
-  if (apiCuda()) {
-    return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
-  } else if (apiOpenCL()) {
-    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
-      return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
-    else
-      return DKS_ERROR;
-
-  } else if (apiOpenMP()) {
-    return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
-  } 
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-
-}
-
-
-int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles) 
-{
-
-  if (apiCuda())
-    return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
-  else if (apiOpenMP())
-    return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
-
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
-				      void *rx_ptr, void *ry_ptr, void *rz_ptr, 
-				      void *px_ptr, void *py_ptr, void *pz_ptr,
-				      void *par_ptr, int numparticles)
-{
-
-  if (apiOpenMP()) {
-    return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr, 
-						      rx_ptr, ry_ptr, rz_ptr, 
-						      px_ptr, py_ptr, pz_ptr,
-						      par_ptr,  numparticles) );
-  }
-
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-
-int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) 
-{
-
-  if (apiCuda())
-    return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
-   
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-}
-
-int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
-					  void *rx_ptr, void *ry_ptr, void *rz_ptr, 
-					  void *px_ptr, void *py_ptr, void *pz_ptr,
-					  void *par_ptr, int numparticles, int &numaddback) 
-{
-
-  if (apiOpenMP()) {
-    return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, 
-							 rx_ptr, ry_ptr, rz_ptr, 
-							 px_ptr, py_ptr, pz_ptr,
-							 par_ptr,  numparticles, numaddback));
-  }
-  
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-
-}
-
-
-int DKSBase::callInitRandoms(int size) {
-  if (apiCuda()) 
-    return CUDA_SAFECALL(cbase->cuda_createCurandStates(size));
+    return CUDA_SAFECALL(cbase->cuda_createCurandStates(size, seed));
  else if (apiOpenCL())
    return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size));
  else if (apiOpenMP())
@ -819,43 +452,3 @@ int DKSBase::callInitRandoms(int size) {
  return DKS_ERROR;
  
 }
-
-int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
-				      void *dt_ptr, double dt, double c, 
-				      bool usedt, int streamId) 
-{
-
-  if (apiCuda()) 
-    return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, 
-						    usedt, streamId));
-  else if (apiOpenMP())
-    return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, 
-						     c, usedt, streamId));
-   
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-  
-}
-
-int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
-					       void *lastSec_ptr, void *orient_ptr, 
-					       int npart, int nsec, void *dt_ptr, double dt, 
-					       double c, bool usedt, int streamId)
-{
-
-  if (apiCuda()) {
-    return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, 
-							     lastSec_ptr, orient_ptr,
-							     npart, nsec, dt_ptr, dt, 
-							     c, usedt, streamId));
-  } else if (apiOpenMP()) {
-    return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, 
-							      lastSec_ptr, orient_ptr,
-							      npart, nsec, dt_ptr, dt, 
-							      c, usedt, streamId));
-  } 
-    
-  DEBUG_MSG("No implementation for selceted platform");
-  return DKS_ERROR;
-  
-}
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@ -1,11 +1,3 @@
-/** DKSBase class.
- * DKSBase.h
- * Author: Uldis Locans
- * Date: 15.09.2014
- * Base class of Dynamic Kernel Scheduler that handles the function calls
- * from host application to DKS
- */
-
 #ifndef H_DKS_BASE
 #define H_DKS_BASE

@ -29,34 +21,24 @@
 #endif

 #include "OpenCL/OpenCLBase.h"
-#include "OpenCL/OpenCLFFT.h"
-#include "OpenCL/OpenCLChiSquare.h"
-#include "OpenCL/OpenCLCollimatorPhysics.h"
 #endif

 #ifdef DKS_CUDA
 #include "CUDA/CudaBase.cuh"
-#include "CUDA/CudaFFT.cuh"
-#include "CUDA/CudaGreensFunction.cuh"
-#include "CUDA/CudaChiSquare.cuh"
-#include "CUDA/CudaCollimatorPhysics.cuh"
-#include "nvToolsExt.h"
 #endif

 #ifdef DKS_MIC
 #include "MIC/MICBase.h"
-#include "MIC/MICChiSquare.h"
-#include "MIC/MICFFT.h"
-#include "MIC/MICCollimatorPhysics.h"
-#include "MIC/MICGreensFunction.hpp"
 #endif

-#include "Algorithms/CollimatorPhysics.h"
-#include "Algorithms/FFT.h"
-
 #include "AutoTuning/DKSConfig.h"

-/** DKSBase class for handling function calls to DKS library */
+/** 
+ * API for handling communication function calls to DKS library.
+ * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
+ * memory management, data transfer and other basic communication functions between
+ * the host and device.
+ */
 class DKSBase {

 private:
@ -73,25 +55,14 @@ private:

 #ifdef DKS_OPENCL	
  OpenCLBase *oclbase;
-  OpenCLFFT *oclfft;
-  OpenCLChiSquare *oclchi;
-  OpenCLCollimatorPhysics *oclcol;
 #endif

 #ifdef DKS_CUDA
  CudaBase *cbase;
-  CudaFFT *cfft;
-  CudaGreensFunction *cgreens;
-  CudaChiSquare *cchi;
-  CudaCollimatorPhysics *ccol;
 #endif

 #ifdef DKS_MIC
  MICBase *micbase;
-  MICFFT *micfft;
-  MICCollimatorPhysics *miccol;
-  MICGreensFunction *micgreens;
-  MICChiSquare *micchi;
 #endif

 protected:
@ -100,7 +71,7 @@ protected:
  DKSConfig dksconfig;

  /** 
-   * Check if current API is set to OpenCL
+   * Check if current API is set to OpenCL.
   * Return true/false wether current api is opencl
   */
  bool apiOpenCL();
@ -117,11 +88,11 @@ protected:
   */
  bool apiOpenMP();

-  /** Check if device is GPU */
+  /** Check if device is GPU. */
  bool deviceGPU();
-  /** Check if device is CPU */
+  /** Check if device is CPU. */
  bool deviceCPU();
-  /** Check if device is MIC */
+  /** Check if device is MIC. */
  bool deviceMIC();

  /**
@ -139,6 +110,12 @@ protected:
  }
 #endif

+#ifdef DKS_MIC
+  /** Return the MICBase pointer held by this object; only compiled when DKS_MIC is defined. */
+  MICBase *getMICBase() {
+    return micbase;
+  }
+#endif
+
  /** Call OpenCL base to load specified kenrel file.
   *
   */
@ -154,6 +131,7 @@ protected:
    return device_name;
  }

+
 public:

  /** 
@ -173,6 +151,11 @@ public:
   */
  ~DKSBase();

+  /** Function to initialize objects based on the device used.
+   *  
+   */
+  int setupDevice();
+
  /** Turn on auto tuning */
  void setAutoTuningOn() { m_auto_tuning = true; }

@ -405,7 +388,7 @@ public:
    } else if (apiOpenMP()) {
 #ifdef DKS_MIC
      void * mem_ptr = NULL;
-      mem_ptr = micbase.mic_allocateMemory<T>(elements);	
+      mem_ptr = micbase->mic_allocateMemory<T>(elements);	
      return mem_ptr;
 #endif
    }
@ -498,7 +481,7 @@ public:
      return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));

    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
+      return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));

    } 
      
@ -532,7 +515,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
+      return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
    } 
    
    return DKS_ERROR;
@ -832,7 +815,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
+      return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
    } 
    
    return DKS_ERROR;
@ -860,7 +843,7 @@ public:
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
    } else if (apiOpenMP()) {
-      return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements, 
+      return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements, 
 						       streamId, offset));
    }

@ -880,228 +863,32 @@ public:
    else if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
    else if (apiOpenMP())
-      return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
+      return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));

    return DKS_ERROR;
  }

-
-  ///////////////////////////////////////////////
-  ///////Function library part of dksbase////////
-  ///////////////////////////////////////////////
-
-  /** 
-   * Setup FFT function.
-   * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
-   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
-   * each fft will do its own setup according to fft size and dimensions.
-   * TODO: opencl and mic implementations
-   */
-  int setupFFT(int ndim, int N[3]);
-  //BENI:
-  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
-  //BENI:
-  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
-
-  /** 
-   * Call complex-to-complex fft.
-   * Executes in place complex to compelx fft on the device on data pointed by data_ptr.
-   * stream id can be specified to use other streams than default.
-   * TODO: mic implementation
-   */
-  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call complex-to-complex ifft.
-   * Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
-   * stream id can be specified to use other streams than default.
-   * TODO: mic implementation.
-   */
-  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Normalize complex to complex ifft.
-   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
-   * fft size
-   * TODO: mic implementation.
-   */
-  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call real to complex FFT.
-   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
-   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
-   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
-   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
-   * TODO: opencl and mic implementations
-   */
-  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call complex to real iFFT.
-   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
-   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
-   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
-   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
-   * TODO: opencl and mic implementations.
-   */
-  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Normalize compelx to real ifft.
-   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
-   * fft size.
-   * TODO: opencl and mic implementations.
-   */
-  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
-
  /**
-   * Transpose 2D and 3D arrays, OpenCL implementation
-   * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose 
+   * Create random numbers on the device and fill the mem_ptr array
   */
-  int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
-
-  /** 
-   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
-   * For specifics check OPAL docs.
-   * TODO: opencl and mic implementations.
-   */
-  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
-			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
-
-  /** 
-   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
-   * For specifics check OPAL docs.
-   * TODO: opencl and mic implementations.
-   */
-  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
-			    int I, int J, int K, int streamId = -1);
-
-  /** 
-   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
-   * For specifics check OPAL docs.
-   * TODO: opencl and mic implementations.
-   */
-  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
-
-  /** 
-   * Element by element multiplication.
-   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
-   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
-   * TODO: opencl and mic implementations.
-   */
-  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
-
-  /** 
-   * Chi square for parameter fitting on device.
-   * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for 
-   * intermediate results. Chi square results are put in &results
-   */
-  int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
-		      double fTimeResolution, double fRebin,
-		      int sensors, int length, int numpar, double &result);
-
-  /** 
-   * max-log-likelihood for parameter fitting on device.
-   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
-   * mem_par - pointer to parameter set, mem_results - pointer for 
-   * intermediate results. Chi square results are put in &results.
-   * TODO: opencl and mic implementations.
-   */
-  int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
-			double fTimeResolution, double fRebin, double fGoodBinOffser,
-			int sensors, int length, int numpar,
-			double &result);
-
-  /** 
-   * max-log-likelihood for parameter fitting on device.
-   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
-   * mem_par - pointer to parameter set, mem_results - pointer for 
-   * intermediate results. Chi square results are put in &results.
-   * TODO: opencl and mic implementations.
-   */
-  int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
-			  double fTimeResolution, double fRebin, double fGoodBinOffser,
-			  int sensors, int length, int numpar,
-			  double &result);
-
-  /** 
-   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
-   * TODO: opencl and mic implementations.
-   */
-  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
-			    int numparticles, int numparams, 
-			    int &numaddback, int &numdead);
-
-
-  
-  /** 
-   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
-   * TODO: opencl and mic implementations.
-   */
-  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
-
-  /** 
-   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
-   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
-   */
-  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
-			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
-			       void *px_ptr, void *py_ptr, void *pz_ptr,
-			       void *par_ptr, int numparticles);
-
-  /**
-   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
-   * TODO: opencl and mic implementations.
-   */
-  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
-
-  /**
-   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
-   * TODO: opencl and mic implementations.
-   */
-  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
-				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
-				   void *px_ptr, void *py_ptr, void *pz_ptr,
-				   void *par_ptr, int numparticles, int &numaddback);
+  int callCreateRandomNumbers(void *mem_ptr, int size);

  /** 
   * Init random number states and save for reuse on device.
+   * If seed is -1, a random seed based on current time is taken.
   * TODO: opencl and mic implementations.
   */
-  int callInitRandoms(int size);
-
-  /**
-   * Integration code from ParallelTTracker from OPAL.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
-   */
-  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
-			       void *dt_ptr, double dt, double c, 
-			       bool usedt = false, int streamId = -1);
-
-  /**
-   * Integration code from ParallelTTracker from OPAL.
-   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
-   */
-  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
-					void *lastSec_ptr, void *orient_ptr, 
-					int npart, int nsec, void *dt_ptr,
-					double dt, double c, bool usedt = false, 
-					int streamId = -1);
+  int callInitRandoms(int size, int seed = -1);

  /**
   * Print memory information on device (total, used, available)
   * TODO: opencl and mic imlementation
   */
  int callMemInfo() {
+    #ifdef DKS_CUDA
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_memInfo());
-
+    #endif
    return DKS_ERROR;
  }

@ -1110,11 +897,13 @@ public:
   * Used for debuging and timing purposes only.
   */
  void oclEventInfo() {
+    #ifdef DKS_OPENCL
    if (apiOpenCL())
      return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
-
+    #endif
  }

+
  /** 
   * Test function to profile opencl kernel calls.
   * Used for debuging and timing purposes only.
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
  //if we are not auto tuning and the size of the problem has changed find the new parameters
  //from autotuning config file
  if (!isAutoTuningOn() && length != chiSquareSize_m) {
+    /*
    int numBlocks, blockSize;
    std::string device_name;
    getDeviceName(device_name);
@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
 				 length, "BlockSize", blockSize);
    chiSq->setKernelParams(numBlocks, blockSize);
    
-    //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
-
+    std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
+    */
    chiSquareSize_m = length;
  } 

--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@ -8,6 +8,7 @@
 #include "AutoTuning/DKSAutoTuningTester.h"

 #include "DKSBase.h"
+#include "DKSFFT.h"

 #include "Algorithms/ChiSquareRuntime.h"

@ -19,7 +20,12 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif

-class DKSBaseMuSR : public DKSBase {
+/**
+ * API to handle musrfit calls to DKS library.
+ * Using ChiSquareRuntime interface allows to call chi square functions on the 
+ * GPU or CPU using CUDA or OpenCL.
+ */
+class DKSBaseMuSR : public DKSFFT {

 private:

--- a/src/DKSDefinitions.h
+++ b/src/DKSDefinitions.h
@ -62,6 +62,12 @@
 #define OPENCL_SAFEINIT(x) ( NULL )
 #endif

+#ifdef DKS_AMD
+#define OPENCL_SAFEINIT_AMD(x) ( x )
+#else
+#define OPENCL_SAFEINIT_AMD(x) ( NULL )
+#endif
+
 #ifdef DKS_MIC
 #define MIC_SAFEINIT(x) ( x )
 #else
--- a/src/DKSFFT.cpp
+++ b/src/DKSFFT.cpp
@ -0,0 +1,147 @@
+#include "DKSFFT.h"
+
+DKSFFT::DKSFFT()
+  : dksfft(nullptr) // no FFT backend until setupFFT() creates one
+{}
+
+// Release the FFT backend object if one was created; deleting nullptr is a no-op.
+// NOTE(review): deleting through BaseFFT* assumes BaseFFT has a virtual destructor - confirm.
+DKSFFT::~DKSFFT() {
+  delete dksfft;
+}
+
+/*
+ * Create the FFT backend for the active API and set up fft plans to reuse if
+ * multiple ffts of the same size are needed.
+ * ndim - number of dimensions, N - size of each dimension.
+ * Returns DKS_SUCCESS on success. Returns an error code if a setup step fails,
+ * if the backend for the active API is not compiled in, or if no API is selected.
+ */
+int DKSFFT::setupFFT(int ndim, int N[3]) {
+
+  if (apiCuda()) {
+    // drop the backend of a previous setup call instead of leaking it
+    delete dksfft;
+    dksfft = nullptr;
+    dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
+    if (dksfft == nullptr)
+      return DKS_ERROR;
+    return dksfft->setupFFT(ndim, N);
+  } else if (apiOpenCL()) {
+    delete dksfft;
+    dksfft = nullptr;
+    // OPENCL_SAFEINIT_AMD evaluates to NULL when DKS_AMD is not defined
+    dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
+    if (dksfft == nullptr)
+      return DKS_ERROR;
+    // all three plans are always attempted; any failure is reported as DKS_ERROR
+    int ierr1 = dksfft->setupFFT(ndim, N);
+    int ierr2 = dksfft->setupFFTRC(ndim, N);
+    int ierr3 = dksfft->setupFFTCR(ndim, N);
+    if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
+      return DKS_ERROR;
+
+    return DKS_SUCCESS;
+  } else if (apiOpenMP()) {
+    //micbase.mic_setupFFT(ndim, N);
+    //BENI: setting up RC and CR transformations on MIC
+    delete dksfft;
+    dksfft = nullptr;
+    dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
+    if (dksfft == nullptr)
+      return DKS_ERROR;
+    int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
+    // form the element count in double so large grids cannot overflow int
+    int ierr2 = dksfft->setupFFTCR(ndim, N, 1./((double)N[0]*N[1]*N[2]));
+    if (ierr1 != DKS_SUCCESS)
+      return ierr1;
+    if (ierr2 != DKS_SUCCESS)
+      return ierr2;
+    return DKS_SUCCESS;
+  }
+
+  return DKS_ERROR;
+
+}
+//BENI:
+/*
+ * Set up the real-to-complex transform on the backend created by setupFFT.
+ * scale is only forwarded on the OpenMP (MIC) path; the CUDA path calls the
+ * backend's setupFFT. Returns DKS_ERROR if no backend exists or no API matches.
+ */
+int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr)
+    return DKS_ERROR;
+
+  if (apiCuda())
+    return dksfft->setupFFT(ndim, N);
+  else if (apiOpenCL())
+    return dksfft->setupFFTRC(ndim, N);
+  else if (apiOpenMP())
+    return dksfft->setupFFTRC(ndim, N, scale);
+
+  return DKS_ERROR;
+
+}
+
+//BENI:
+/*
+ * Set up the complex-to-real transform on the backend created by setupFFT.
+ * scale is only forwarded on the OpenMP (MIC) path; the CUDA path calls the
+ * backend's setupFFT. Returns DKS_ERROR if no backend exists or no API matches.
+ */
+int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr)
+    return DKS_ERROR;
+
+  if (apiCuda())
+    return dksfft->setupFFT(ndim, N);
+  else if (apiOpenCL())
+    return dksfft->setupFFTCR(ndim, N);
+  else if (apiOpenMP())
+    return dksfft->setupFFTCR(ndim, N, scale);
+
+  return DKS_ERROR;
+
+}
+
+/*
+ * call complex-to-complex FFT for the selected platform.
+ * streamId is only passed on to the CUDA backend.
+ * Returns DKS_ERROR if setupFFT has not created a backend or no API matches.
+ */
+int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr) {
+    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+    return DKS_ERROR;
+  }
+
+  if (apiOpenCL() || apiOpenMP()) 
+    return dksfft->executeFFT(data_ptr, ndim, dimsize);
+  else if (apiCuda())
+    return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/*
+ * call complex-to-complex inverse FFT for the selected platform.
+ * streamId is only passed on to the CUDA backend.
+ * Returns DKS_ERROR if setupFFT has not created a backend or no API matches.
+ */
+int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr) {
+    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+    return DKS_ERROR;
+  }
+
+  if (apiOpenCL() || apiOpenMP())
+    return dksfft->executeIFFT(data_ptr, ndim, dimsize);
+  else if (apiCuda()) 
+    return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/*
+ * call normalize FFT function for selected platform.
+ * On OpenCL the FFT kernel file is loaded first and a load failure is reported
+ * as DKS_ERROR. streamId is only passed on to the CUDA backend.
+ * Returns DKS_ERROR if setupFFT has not created a backend or no API matches.
+ */
+int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr) {
+    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+    return DKS_ERROR;
+  }
+
+  if (apiOpenCL()) {
+    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
+      return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
+    else 
+      return DKS_ERROR;
+  } else if (apiCuda()) {
+    return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
+  } else if (apiOpenMP()) {
+    return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/*
+ * call real to complex FFT.
+ * streamId is only passed on to the CUDA backend.
+ * Returns DKS_ERROR if setupFFT has not created a backend or no API matches.
+ */
+int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr) {
+    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+    return DKS_ERROR;
+  }
+
+  if (apiCuda())
+    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
+  else if (apiOpenCL() || apiOpenMP())
+    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/*
+ * call complex to real FFT.
+ * streamId is only passed on to the CUDA backend.
+ * Returns DKS_ERROR if setupFFT has not created a backend or no API matches.
+ */
+int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+
+  // dksfft stays nullptr until setupFFT has created a backend
+  if (dksfft == nullptr) {
+    DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+    return DKS_ERROR;
+  }
+
+  if (apiCuda())
+    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
+  else if (apiOpenCL() || apiOpenMP())
+    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/*
+ * normalize complex to real iFFT.
+ * Only the CUDA path is implemented; the OpenCL and OpenMP paths return DKS_ERROR.
+ */
+int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
+  if (apiCuda()) {
+    // dksfft stays nullptr until setupFFT has created a backend
+    if (dksfft == nullptr) {
+      DEBUG_MSG("FFT backend not initialized, call setupFFT first");
+      return DKS_ERROR;
+    }
+    return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
+  }
+  else if (apiOpenCL())
+    return DKS_ERROR;
+  else if (apiOpenMP())
+    return DKS_ERROR;
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@ -0,0 +1,112 @@
+#ifndef H_DKSBASE_FFT
+#define H_DKSBASE_FFT
+
+#include <iostream>
+#include "AutoTuning/DKSAutoTuning.h"
+
+#include "DKSBase.h"
+
+#include "DKSDefinitions.h"
+
+#include "Algorithms/GreensFunction.h"
+#include "Algorithms/CollimatorPhysics.h"
+#include "Algorithms/FFT.h"
+
+#ifdef DKS_AMD
+#include "OpenCL/OpenCLFFT.h"
+#endif
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaFFT.cuh"
+#endif
+
+#ifdef DKS_MIC
+#include "MIC/MICFFT.h"
+#endif
+
+/**
+ * API to handle calls to DKSFFT.
+ * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
+ */
+class DKSFFT : public DKSBase {
+
+private:
+
+  BaseFFT *dksfft; ///< FFT backend; nullptr until setupFFT creates it, deleted in the destructor.
+
+  int initFFT(); // NOTE(review): no definition is visible in DKSFFT.cpp - confirm this is still needed.
+  
+public:
+  
+  DKSFFT();
+  ~DKSFFT();
+
+  /** 
+   * Setup FFT function.
+   * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
+   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
+   * each fft will do its own setup according to fft size and dimensions.
+   * TODO: opencl and mic implementations
+   */
+  int setupFFT(int ndim, int N[3]);
+  //BENI:
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  //BENI:
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+
+  /** 
+   * Call complex-to-complex fft.
+   * Executes in place complex to complex fft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation
+   */
+  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex-to-complex ifft.
+   * Executes in place complex to complex ifft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation.
+   */
+  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize complex to complex ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size
+   * TODO: mic implementation.
+   */
+  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call real to complex FFT.
+   * Executes out of place real to complex fft, real_ptr points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations
+   */
+  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex to real iFFT.
+   * Executes out of place complex to real ifft, real_ptr points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations.
+   */
+  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize complex to real ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size.
+   * TODO: opencl and mic implementations.
+   */
+  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+
+};
+
+#endif
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@ -10,6 +10,9 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif

+/**
+ * API to handle PET image reconstruction calls.
+ */
 class DKSImageRecon : public DKSBase {

 private:
@ -22,87 +25,88 @@ public:

  ~DKSImageRecon();

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analysis calculate source.
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analysis calculate source.
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);


-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analysis calculate source.
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analysis calculate source.
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);

-  /** Image reconstruction - generate normalization.
-   * 
+  /** 
+   * Image reconstruction - generate normalization.
   */
  int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);

-  /** Image reconstruction - forward correction.
-   * 
+  /** 
+   * Image reconstruction - forward correction.
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);

-  /** Image reconstruction - backward projection.
-   * 
+  /** 
+   * Image reconstruction - backward projection.
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);

-  /** Set the voxel dimensins on device.
+  /** 
+   * Set the voxel dimensions on device.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);

-  /** Set the image edge.
+  /** 
+   * Set the image edge.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);

-  /** Set the image edge1.
+  /** 
+   * Set the image edge1.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);

-  /** Set the minimum crystan in one ring values.
+  /** 
+   * Set the minimum crystal in one ring values.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);

-  /** Set all other required parameters for reconstruction.
+  /** 
+   * Set all other required parameters for reconstruction.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
--- a/src/DKSMainPage.dox
+++ b/src/DKSMainPage.dox
@ -0,0 +1,32 @@
+/** 
+\mainpage
+
+<P>
+<B>
+The aim of DKS is to allow the creation of fast fine tuned kernels using device specific frameworks such as CUDA, OpenCL, OpenACC and OpenMP and accelerator libraries such as Thrust, Nvidia CUDA libraries, Intel MKL or others. On top of that, DKS allows the easy use of these kernels in host applications without providing any device or framework specific details. This approach facilitates the integration of different types of devices in the existing applications with minimal code changes and  makes the device and the host code a lot more manageable.
+</B>
+<P>
+
+The main parts of DKS are:
+<ul>
+	<li>DKSBase - provides the basic communication functions between host application and hardware accelerators including memory management, data transfer and synchronization.</li>
+	<li>DKSOPAL - provides functions for Object Oriented Particle Accelerator library to offload FFTPoisson calculations and particle matter interaction using Monte Carlo simulations to GPU and Intel MIC</li>
+	<li>DKSBaseMuSR - provides functions to perform parameter fitting for musrfit on the GPU</li>
+	<li>DKSImageRecon - provides functions to perform PET image reconstruction on the GPU</li>
+	<li>DKSFFT - provides functions to perform FFT on the GPU and Intel MIC</li>
+</ul>
+
+<P>
+<B>
+Developed by
+Uldis Locans
+</B>
+
+<P>
+For further information contact: locans.uldis@psi.ch - Uldis Locans
+<P>
+
+<P>
+<a href="https://gitlab.psi.ch/uldis_l/DKS">DKS on gitlab</a><br>
+
+*/
--- a/src/DKSOPAL.cpp
+++ b/src/DKSOPAL.cpp
@ -0,0 +1,162 @@
+#include "DKSOPAL.h"
+
+/**
+ * Default constructor.
+ * No algorithm objects exist yet; they are created by setupOPAL(), which is
+ * called from initDevice().
+ */
+DKSOPAL::DKSOPAL()
+  : dkscol(nullptr),
+    dksgreens(nullptr)
+{
+}
+
+/**
+ * Construct and select the API and the device by name.
+ * The algorithm pointers start out null so that the destructor stays well
+ * defined even if initDevice() is never called or setupOPAL() assigns nothing.
+ */
+DKSOPAL::DKSOPAL(const char* api_name, const char* device_name)
+  : dkscol(nullptr),
+    dksgreens(nullptr)
+{
+  setAPI(api_name, strlen(api_name));
+  setDevice(device_name, strlen(device_name));
+}
+
+/** Release the algorithm objects; deleting a null pointer is a no-op. */
+DKSOPAL::~DKSOPAL() {
+  delete dkscol;
+  delete dksgreens;
+}
+
+/**
+ * Create the collimator physics and greens function objects for the selected API.
+ * The APIs are checked in the order OpenCL, CUDA, OpenMP (MIC); the *_SAFECALL and
+ * *_SAFEINIT macros are defined elsewhere and decide what is actually evaluated.
+ * Return: result of the matching *_SAFECALL macro, DKS_ERROR if no API matches.
+ */
+int DKSOPAL::setupOPAL() {
+
+  if (apiOpenCL()) {
+    const int status = OPENCL_SAFECALL( DKS_SUCCESS );
+    //TODO: only enable if AMD libraries are available
+    dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
+    dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
+    return status;
+  }
+
+  if (apiCuda()) {
+    const int status = CUDA_SAFECALL( DKS_SUCCESS );
+    dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
+    dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
+    return status;
+  }
+
+  if (apiOpenMP()) {
+    const int status = MIC_SAFECALL( DKS_SUCCESS );
+    dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
+    dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
+    return status;
+  }
+
+  //none of the known APIs is active
+  return DKS_ERROR;
+}
+
+/**
+ * Set up the device and, only if that succeeded, the OPAL algorithm objects.
+ * Return: error code of setupDevice() on failure, otherwise the result of setupOPAL().
+ */
+int DKSOPAL::initDevice() {
+  const int deviceStatus = setupDevice();
+  if (deviceStatus != DKS_SUCCESS)
+    return deviceStatus;
+
+  return setupOPAL();
+}
+
+/**
+ * Forward to greensIntegral() of the greens function object.
+ * Return: DKS_ERROR if no greens function object has been created (initDevice()
+ * not called, or setupOPAL() assigned none), otherwise the forwarded return code.
+ */
+int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
+                                double hz_m0, double hz_m1, double hz_m2, int streamId) {
+
+  //dksgreens is null until setupOPAL() has assigned it
+  if (dksgreens == nullptr)
+    return DKS_ERROR;
+
+  return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
+                                   hz_m0, hz_m1, hz_m2, streamId);
+}
+
+/**
+ * Forward to integrationGreensFunction() of the greens function object.
+ * Return: DKS_ERROR if no greens function object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
+                                   int I, int J, int K, int streamId) {
+
+  //dksgreens is null until setupOPAL() has assigned it
+  if (dksgreens == nullptr)
+    return DKS_ERROR;
+
+  return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
+}
+
+/**
+ * Forward to mirrorRhoField() of the greens function object.
+ * Return: DKS_ERROR if no greens function object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
+
+  //dksgreens is null until setupOPAL() has assigned it
+  if (dksgreens == nullptr)
+    return DKS_ERROR;
+
+  return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
+}
+
+/**
+ * Forward to multiplyCompelxFields() of the greens function object
+ * (the callee name is spelled that way in the GreensFunction interface).
+ * Return: DKS_ERROR if no greens function object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
+
+  //dksgreens is null until setupOPAL() has assigned it
+  if (dksgreens == nullptr)
+    return DKS_ERROR;
+
+  return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
+}
+
+/**
+ * Forward to CollimatorPhysics() of the collimator physics object.
+ * numparams, numaddback and numdead are part of the signature but are neither
+ * read nor written here.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
+                                   int numparticles, int numparams,
+                                   int &numaddback, int &numdead,
+                                   bool enableRutherforScattering)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
+
+}
+
+
+/**
+ * Forward to CollimatorPhysics() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
+                                    bool enableRutherforScattering)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
+
+}
+
+/**
+ * Forward to CollimatorPhysicsSoA() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
+                                      void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                                      void *px_ptr, void *py_ptr, void *pz_ptr,
+                                      void *par_ptr, int numparticles)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
+                                      rx_ptr, ry_ptr, rz_ptr,
+                                      px_ptr, py_ptr, pz_ptr,
+                                      par_ptr, numparticles);
+
+}
+
+
+/**
+ * Forward to CollimatorPhysicsSort() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
+
+}
+
+/**
+ * Forward to CollimatorPhysicsSortSoA() of the collimator physics object.
+ * The call is wrapped in MIC_SAFECALL (defined elsewhere), unlike the other wrappers;
+ * NOTE(review): presumably only the MIC backend implements it - confirm.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise whatever MIC_SAFECALL yields for the forwarded call.
+ */
+int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
+                                          void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                                          void *px_ptr, void *py_ptr, void *pz_ptr,
+                                          void *par_ptr, int numparticles, int &numaddback)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
+                                                       rx_ptr, ry_ptr, rz_ptr,
+                                                       px_ptr, py_ptr, pz_ptr,
+                                                       par_ptr, numparticles, numaddback));
+
+}
+
+
+/**
+ * Forward to ParallelTTrackerPush() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
+                                      void *dt_ptr, double dt, double c,
+                                      bool usedt, int streamId)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
+
+}
+
+/**
+ * Overload of the push that forwards to ParallelTTrackerPush() with dt = 0 and
+ * usedt = true, so the time steps come from dt_ptr.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
+                                      int npart, double c, int streamId) {
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
+
+}
+
+/**
+ * Forward to ParallelTTrackerKick() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
+                                      void *bf_ptr, void *dt_ptr, double charge, double mass,
+                                      int npart, double c, int streamId)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
+                                      charge, mass, npart, c, streamId);
+
+}
+
+/**
+ * Forward to ParallelTTrackerPushTransform() of the collimator physics object.
+ * Return: DKS_ERROR if no collimator physics object has been created,
+ * otherwise the forwarded return code.
+ */
+int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
+                                               void *lastSec_ptr, void *orient_ptr,
+                                               int npart, int nsec, void *dt_ptr, double dt,
+                                               double c, bool usedt, int streamId)
+{
+
+  //dkscol is null until setupOPAL() has assigned it
+  if (dkscol == nullptr)
+    return DKS_ERROR;
+
+  return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
+                                               npart, nsec, dt_ptr, dt, c, usedt, streamId);
+
+}
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@ -0,0 +1,175 @@
+#ifndef H_DKS_OPAL
+#define H_DKS_OPAL
+
+#include <iostream>
+#include "AutoTuning/DKSAutoTuning.h"
+
+#include "DKSBase.h"
+#include "DKSFFT.h"
+
+#include "DKSDefinitions.h"
+
+#include "Algorithms/GreensFunction.h"
+#include "Algorithms/CollimatorPhysics.h"
+#include "Algorithms/FFT.h"
+
+
+#ifdef DKS_AMD
+#include "OpenCL/OpenCLFFT.h"
+#include "OpenCL/OpenCLGreensFunction.h"
+#include "OpenCL/OpenCLCollimatorPhysics.h"
+#endif
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaFFT.cuh"
+#include "CUDA/CudaGreensFunction.cuh"
+#include "CUDA/CudaCollimatorPhysics.cuh"
+
+#endif
+
+#ifdef DKS_MIC
+#include "MIC/MICFFT.h"
+#include "MIC/MICGreensFunction.hpp"
+#include "MIC/MICCollimatorPhysics.h"
+#endif
+
+/**
+ * API to handle OPAL calls to DKS library.
+ * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
+ * functions.
+ */
+class DKSOPAL : public DKSFFT {
+
+private: 
+
+  DKSCollimatorPhysics *dkscol;
+  GreensFunction *dksgreens;
+
+  int setupOPAL();
+
+public:
+  
+  DKSOPAL();
+
+  DKSOPAL(const char* api_name, const char* device_name);
+
+  ~DKSOPAL();
+
+  int initDevice();
+
+  ///////////////////////////////////////////////
+  ///////Function library part of dksbase////////
+  ///////////////////////////////////////////////
+
+  /** 
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
+			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
+
+  /** 
+   * Integration of the greens function (GreensFunction::integrationGreensFunction) from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
+			    int I, int J, int K, int streamId = -1);
+
+  /** 
+   * Mirroring of the rho field (GreensFunction::mirrorRhoField) from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
+
+  /** 
+   * Element by element multiplication.
+   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
+   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
+   * TODO: opencl and mic implementations.
+   */
+  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
+
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
+			    int numparticles, int numparams, 
+			    int &numaddback, int &numdead,
+			    bool enableRutherfordScattering = true);
+  
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles, 
+			     bool enableRutherfordScattering = true);
+
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
+   */
+  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			       void *px_ptr, void *py_ptr, void *pz_ptr,
+			       void *par_ptr, int numparticles);
+
+  /**
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
+
+  /**
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+				   void *px_ptr, void *py_ptr, void *pz_ptr,
+				   void *par_ptr, int numparticles, int &numaddback);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
+			       void *dt_ptr, double dt, double c, 
+			       bool usedt = false, int streamId = -1);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
+					void *lastSec_ptr, void *orient_ptr, 
+					int npart, int nsec, void *dt_ptr,
+					double dt, double c, bool usedt = false, 
+					int streamId = -1);
+  /**
+   * Integration code from ParallelTTracker from OPAL; overload that calls the push with dt = 0 and usedt = true.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr, 
+			       int npart, double c, int streamId = -1);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
+			       void *bf_ptr, void *dt_ptr, double charge, 
+			       double mass, int npart, double c, int streamId = -1);
+
+
+};
+
+#endif
--- a/src/MIC/CMakeLists.txt
+++ b/src/MIC/CMakeLists.txt
@ -1,19 +1,22 @@
-SET (_SRCS
-  MICBase.cpp
-  MICChiSquare.cpp
-  MICFFT.cpp
-  MICGreensFunction.cpp  
-  MICCollimatorPhysics.cpp
-  )
+SET (_SRCS MICBase.cpp MICFFT.cpp)
+SET (_HDRS MICBase.h MICFFT.h)

-SET (_HDRS
-  MICBase.h
-  MICChiSquare.h
-  MICFFT.h
-  MICCollimatorPhysics.h
-  MICGreensFunction.hpp    
-  MICMergeSort.h
-  )
+IF (ENABLE_OPAL)
+  # OPAL-only sources, added on top of the always-built MICBase/MICFFT
+  LIST (APPEND _SRCS
+    MICChiSquare.cpp
+    MICGreensFunction.cpp
+    MICCollimatorPhysics.cpp
+    )
+
+  # headers belonging to the OPAL-only sources
+  LIST (APPEND _HDRS
+    MICChiSquare.h
+    MICCollimatorPhysics.h
+    MICGreensFunction.hpp
+    MICMergeSort.h
+    )
+ENDIF (ENABLE_OPAL)

 #INCLUDE_DIRECTORIES (
 #  ${CMAKE_CURRENT_SOURCE_DIR}
--- a/src/MIC/MICBase.cpp
+++ b/src/MIC/MICBase.cpp
@ -18,30 +18,28 @@ int MICBase::mic_createRandStreams(int size) {

  int seed = time(NULL);

-#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
+  int numThreads = 0;
+#pragma offload target(mic:m_device_id) inout(numThreads)
  {
-
-    //get the number of threads
-    int numThreads;
-
 #pragma omp parallel
    numThreads = omp_get_num_threads();
+  }

-    //if default rnd stream already allocated delete the array
-    if (defaultRndSet == 1)    
-      delete[] defaultRndStream;
-
-    //allocate defaultRndStream array
-    defaultRndStream = new VSLStreamStatePtr[numThreads];
-
+  defaultRndStream =  mic_allocateMemory<VSLStreamStatePtr>(numThreads);
+  VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
+  maxThreads = numThreads; 
+  
+#pragma offload target(mic:m_device_id) \
+  in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN)        \
+  in(seed)
+  {
    //create stream states for each thread
 #pragma omp parallel for
    for (int i = 0; i < omp_get_num_threads(); i++)
-      vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
-
-    defaultRndSet = 1;
+      vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
  }
-  
+
+  defaultRndSet = 1;
  return DKS_SUCCESS;

 }
@ -49,15 +47,8 @@ int MICBase::mic_createRandStreams(int size) {
 //delete default rand streams
 int MICBase::mic_deleteRandStreams() {

-#pragma offload target(mic:m_device_id) inout(defaultRndSet)
-  {
-    if (defaultRndSet == 1) {
-      delete[] defaultRndStream;
-      defaultRndSet = -1;
-    }
-  }
-
-  return DKS_ERROR;
+  //mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
+  return DKS_SUCCESS;
 }

 //create a new signal for the mic
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@ -26,72 +26,82 @@

 #define MIC_WIDTH 128

+/** MIC Base class handles device setup and basic communication with the device.
+ * Handles device setup, memory management and data transfers.
+ */
 class MICBase {

 private:
  std::vector<int> micStreams;
+  int maxThreads; 

 protected:

-
  int defaultRndSet;
-
 public:
-  VSLStreamStatePtr *defaultRndStream;
+
+//#pragma offload_attribute(push,target(mic))
+  void *defaultRndStream; //VSLSStreamStatePtr
+  void *testPtr;
+
+//#pragma offload_attribute(pop)
+
  int m_device_id;

-  /* constructor */
+  /** constructor */
  MICBase();

-  /* destructor */
+  /** destructor */
  ~MICBase();

-  /*
-    Info: create MKL rand streams for each thread
-    Return: success or error code
-  */
+  /**
+   * Create MKL rand streams for each thread
+   *  Return: success or error code
+   */
  int mic_createRandStreams(int size);

-  /*
-    Info: delete MKL rand streams
-    Return: succes or error code
-  */
+  /**
+   * Delete MKL rand streams
+   * Return: success or error code
+   */
  int mic_deleteRandStreams();

-  /*
-    Info: create a new signal for the mic
-    Return: success or error code
-  */
+  /**
+   * Create a new signal for the mic.
+   * Signals can be used for asynchronous data transfers.
+   * Return: success or error code
+   */
  int mic_createStream(int & streamId);

-  /*
-    Info: get the signal from the vector
-    Return: mic signal
+  /**
+   * Info: get the signal from the vector.
+   * Return: mic signal
  */
  int& mic_getStream(int id);

-  /*
-    Info: delete streams
-    Return: success or error code
-  */
+  /**
+   * Info: delete streams.
+   * Return: success or error code
+   */
  int mic_deleteStreams();

-  /*
-    Info: set device id
-    Return: success or error code
-  */
+  /**
+   * Info: set device id.
+   * Return: success or error code
+   */
  int mic_setDeviceId(int id);

-  /*
-    Info: get mic devices
-    Return: success or error code
-  */
+  /**
+   * Info: get mic devices.
+   * Prints information about mic devices.
+   * Return: success or error code
+   */
  int mic_getDevices();

-  /*
-    Info: allocate memory on MIC device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory on MIC device.
+   * Return: success or error code
+   */
  template<typename T>
  void * mic_allocateMemory(int size) {

@ -104,10 +114,10 @@ public:
    return tmp;
  }

-  /*
-    Info: transfer data to device
-    Return: success or error code
-  */
+  /**
+   * Transfer data to device.
+   * Return: success or error code
+   */
  template<typename T>
  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -118,10 +128,10 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: write data to device, non-blocking
-    Return: success or error code
-  */
+  /**
+   * Write data to device, non-blocking.
+   * Return: success or error code
+   */
  template<typename T>
  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
  {
@ -134,10 +144,10 @@ public:
  }
  

-  /*
-    Info: read data from device
-    Return: success or error code
-  */
+  /**
+   * Read data from device
+   * Return: success or error code
+   */
  template<typename T>
  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -149,10 +159,10 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: read data from device waiting for signal
-    Return: success or error code
-  */
+  /**
+   * Read data from device waiting for signal
+   * Return: success or error code
+   */
  template<typename T>
  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
 			int streamId = -1, int offset = 0) {
@ -167,10 +177,10 @@ public:

  }

-  /* 
-     Info: wait till all the signals are complete
-     Return siccess or error code
-  */
+  /**
+   * Wait till all the signals are complete
+   * Return: success or error code
+   */
  int mic_syncDevice() {
    
    //empty offload to wait for all the signals to finish and launch a new empy signal
@ -188,10 +198,10 @@ public:

  }

-  /*
-    Info: free memory on device
-    Return: success or error code
-  */
+  /**
+   * Free memory on device
+   * Return: success or error code
+   */
  template<typename T>
  int mic_freeMemory(void * data_ptr, int size) {

@ -202,14 +212,13 @@ public:
 #pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
    {
    }
-
    return DKS_SUCCESS;
  }

-  /*
-    Info: allocate memory and write data to device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory and write data to device
+   * Return: success or error code
+   */
  template<typename T>
  void * mic_pushData(const void * data, int size) {
    T* tmp_ptr = new T[size];
@ -223,10 +232,10 @@ public:
  return tmp_ptr;
 }

-/*
-  Info: read data and free memory on device
-  Return: success or erro code
-*/
+  /**
+   * Read data and free memory on device
+   * Return: success or error code
+   */
  template<typename T>
  int mic_pullData(void * data_ptr, void * result, int size) {
    T* tmp_ptr = (T*)data_ptr;
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@ -14,6 +14,9 @@
 #include <offload.h>
 #include "MICBase.h"

+/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
+ * Not complete and untested because of the poor performance of first MIC devices.
+ */
 class MICChiSquare {

  MICBase *m_micbase;
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@ -22,22 +22,34 @@
 #define I_M 10
 #define DT_M 11

+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }

+/**
+ * MIC device function for calculating the dot product of the vector (dx, dy, dz) with itself.
+ */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
  return (dx * dx + dy * dy + dz * dz);
 }

+/**
+ * MIC device function to check if particle is still in material.
+ */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
  return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }


+/**
+ * MIC device function to calculate arbitrary rotation.
+ */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }

+
+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses AoS to store particle positions and momentum, parallelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ * Deprecated in favor of the SoA data layout.
+ */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr

 }

+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses SoA to store particle positions and momentum, parallelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+void coulombScat(double *rx, double *ry, double *rz, 
+		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {
- 
+
+  //arrays for temporary storage, each core processes MIC_WIDTH particles
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

+  //simd instruction tells the compiler its safe to vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
+  //vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
-  
  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
+ * algorithm are available in the OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {

@ -292,7 +326,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)

  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 

  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
    pdead = 1;
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
+ * algorithm are available in the OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

@ -338,7 +377,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
-  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); 
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 

  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073; 
@ -368,24 +407,29 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

 }

-int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
+int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
+					    bool enableRutherforScattering) 
+{

  //cast device memory pointers to appropriate types
  MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
  double *par = (double*) par_ptr;
+  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;

+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data transfer need to be handled before */
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(numparticles)
  {
 	
 #pragma omp parallel 
    {
-      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
      
      //for loop trough particles if not checkhit set label to -2 and update R.x
-
 #pragma omp for simd
      for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@ -445,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {


-
+  //cast device memory pointers to appropriate types
  int *label = (int*)label_ptr;
  unsigned *localID = (unsigned*)localID_ptr;
  double *rx = (double*)rx_ptr;
@ -459,6 +503,10 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  int padding = numparticles % MIC_WIDTH;
  int totalpart = numparticles + padding;

+  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
+
+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data transfer need to be handled before */
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
@ -469,14 +517,16 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  in(py:length(0) DKS_REUSE DKS_RETAIN)	\
  in(pz:length(0) DKS_REUSE DKS_RETAIN)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
  in(totalpart)
  {

+
 #pragma omp parallel
    {
      //every thread gets its own rnd stream state
-      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
-
+      //VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
      
      #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -512,9 +562,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 	  double Eng = (sq - 1) * M_P;
 	  double dEdx = 0;
 	  
+	    
 	  if (label[i] == 0) {
 	    energyLoss(Eng, dEdx, par, randv, i - ii);
 	  }
+	  
 	    
 	  if (Eng > 1e-4 && dEdx < 0) {
 	    double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
@ -526,11 +578,12 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt

 	  if (Eng < 1e-4 || dEdx > 0)
 	    label[i] = -1;
-	        
+	      
 	} //end inner energy loss loop
-
-      } //end outer energy loss loop
 	
+      } //end outer energy loss loop
+
+      
      //vectorize coulomb scattering as much as possible
 #pragma omp for nowait
      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -540,7 +593,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
    } //end omp parallel
      
  } //end offload
-     
+   
  return DKS_SUCCESS;
 }

--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@ -26,7 +26,13 @@ typedef struct {
 } MIC_PART_SMALL;


-class MICCollimatorPhysics : DKSAlogorithms{
+/**
+ * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implements OPAL's collimator physics class for particle-matter interactions using OpenMP
+ * and offload mode targeting Intel Xeon Phi processors.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
+class MICCollimatorPhysics : public DKSCollimatorPhysics {

 private:

@ -38,9 +44,10 @@ public:
    m_micbase = base;
  };

-  ~MICCollimatorPhysics() { };
+  ~MICCollimatorPhysics() {  };

-  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
+			bool enableRutherforScattering = true);

  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/MIC/MICFFT.cpp
+++ b/src/MIC/MICFFT.cpp
@ -6,13 +6,16 @@

 MICFFT::MICFFT(MICBase *base) {
  m_micbase = base;
+  m_fftsetup = false;
 }

 MICFFT::~MICFFT() {
+  if (m_fftsetup) {
 #pragma offload target(mic:0)
-  {
-    DftiFreeDescriptor(&FFTHandle_m);
-    DftiFreeDescriptor(&handle);
+    {
+      DftiFreeDescriptor(&FFTHandle_m);
+      DftiFreeDescriptor(&handle);
+    }
  }
 }

@ -35,7 +38,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {

  }

-
+  m_fftsetup = true;
  return DKS_SUCCESS;
 }
 //BENI:
@ -122,8 +125,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
 }

 //execute iFFT
-int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
-  return mic_executeFFT(mem_ptr, ndim, N, -1, false);
+int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
+  //inverse transform = executeFFT with forward == false; pass the caller's
+  //streamId on instead of always forcing the default stream (-1)
+  return executeFFT(mem_ptr, ndim, N, streamId, false);
 }

 //execute REAL->COMPLEX FFT
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@ -7,13 +7,18 @@
 #include <offload.h>
 #include <mkl_dfti.h>

-#include "../Algorithm/DKSFFT.h"
+#include "../Algorithms/FFT.h"
 #include "MICBase.h"

-class MICFFT : public DKSFFT {
+/** 
+ * MIC FFT based on BaseFFT interface.
+ * uses MKL library to offload FFT on Intel Xeon Phi devices.
+ */
+class MICFFT : public BaseFFT {

 private:

+  bool m_fftsetup;
  MICBase *m_micbase;

  /// Internal FFT object for performing serial FFTs.
@ -74,6 +79,18 @@ public:
  /* normalize IFFT on MIC */
  int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);

+  /**
+   * Info: destroy default FFT plans (no-op in this class: always returns DKS_SUCCESS)
+   * Return: success or error code
+   */
+  int destroyFFT() { return DKS_SUCCESS; }
+
+  /*
+    Info: execute normalize for complex to real iFFT (not implemented here: always returns DKS_SUCCESS)
+    Return: success or error code
+  */
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
+
 };

 #endif
--- a/src/MIC/MICGreensFunction.cpp
+++ b/src/MIC/MICGreensFunction.cpp
@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
  }
 */

-int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
-					  double hr_m1, double hr_m2) 
+int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
+				      double hr_m0, double hr_m1, double hr_m2, int streamId) 
 {

-  double *tmp_ptr = (double*) tmp_ptr_;
+  double *tmp_ptr = (double*) tmpgreen;
 #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
  {
    std::memset(tmp_ptr,0,I*J*K);
@ -173,12 +173,14 @@ return 0;
 */

 //CUDA similar version:
-int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
-  double *tmpgreen = (double*) tmp_ptr_;
-  double *mem_ptr = (double*) mem_ptr_;
+int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
+					int streamId) 
+{
+  double *tmpgreen_ptr = (double*) tmpgreen;
+  double *mem_ptr = (double*) rho2_m;

  // the actual integration
-#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); 
    std::memset(mem_ptr,0,II*JJ*KK);
@ -197,27 +199,27 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp
 	  tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;

 	  if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+	    tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];

 	  if (i+1 < NI_tmp)
-	    tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp1 = tmpgreen_ptr[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];

 	  if (j+1 < NJ_tmp)
-	    tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
+	    tmp2 = tmpgreen_ptr[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];

 	  if (k+1 < NK_tmp)
-	    tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp3 = tmpgreen_ptr[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  

 	  if (i+1 < NI_tmp && j+1 < NJ_tmp)
-	    tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
+	    tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  

 	  if (i+1 < NI_tmp && k+1 < NK_tmp)
-	    tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp5 = tmpgreen_ptr[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  

 	  if (j+1 < NJ_tmp && k+1 < NK_tmp)
-	    tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+	    tmp6 = tmpgreen_ptr[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  

-	  tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+	  tmp7 = tmpgreen_ptr[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];

 	  double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;

@ -234,8 +236,8 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp



-int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
-  double *mem_ptr = (double*) mem_ptr_;	
+int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
+  double *mem_ptr = (double*) rho2_m;	

 #pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
@ -281,11 +283,11 @@ int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K)
 }

 /*multiply complex fields*/
-int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
+int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId) { //parameter list must match the declaration in MICGreensFunction.hpp
  //	  double *mem_ptr1 = (double*) mem_ptr1_;
  //	  double *mem_ptr2 = (double*) mem_ptr2_;
-  _Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
-  _Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
+  _Complex double *mem_ptr1 = (_Complex double *) ptr1;
+  _Complex double *mem_ptr2 = (_Complex double *) ptr2;

 #pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
  {
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@ -9,12 +9,14 @@
 #include <offload.h>
 #include <mkl_dfti.h>

+#include "../Algorithms/GreensFunction.h"
 #include "MICBase.h"

 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

-class MICGreensFunction {
+/** OpenMP offload implementation of the GreensFunction calculation for OPAL's Poisson solver. */
+class MICGreensFunction : public GreensFunction {

 private:
  MICBase *m_micbase;
@ -28,16 +30,18 @@ public:
  ~MICGreensFunction();

  /* compute greens integral analytically */
-  int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
+  int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
+		     double hr_m0, double hr_m1, double hr_m2, int streamId = -1);

  /* perform the actual integration */
-  int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
+  int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K, 
+				int streamId = -1);

  /* Mirror rho-Field */
-  int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
+  int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);

  /*multiply complex fields*/
-  int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
+  int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);

 };

--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  return p;
 }

+/**
+ * Merge sort implementation for intel MIC.
+ * Parallelized over all the MIC cores using OpenMP tasks.
+ */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {

@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
  }
 }

+/**
+ * Quicksort algorithm, developed for use on Intel MIC devices.
+ */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

 }

+/** 
+ * Insertion sort of @p list, developed for use on Intel MIC.
+ * Used by quick_sort to sort small lists.
+ */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

--- a/src/OpenCL/CMakeLists.txt
+++ b/src/OpenCL/CMakeLists.txt
@ -1,31 +1,53 @@
-SET (_SRCS
-	OpenCLBase.cpp
-	OpenCLFFT.cpp
-	OpenCLChiSquare.cpp
-	OpenCLCollimatorPhysics.cpp
-	OpenCLChiSquareRuntime.cpp
-  )
+# don't include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG are not found

-SET (_HDRS
-	OpenCLBase.h
-	OpenCLFFT.h
-	OpenCLChiSquare.h
-	OpenCLCollimatorPhysics.h
-	OpenCLChiSquareRuntime.h
-  )
+SET (_HDRS OpenCLBase.h)
+SET (_SRCS OpenCLBase.cpp)
+SET (_KERNELS "")

-#INCLUDE_DIRECTORIES (
-#  ${CMAKE_CURRENT_SOURCE_DIR}
-#)
+IF (ENABLE_AMD)
+  SET (_SRCS
+    ${_SRCS}
+    OpenCLFFT.cpp
+    )

-SET (_KERNELS
-  OpenCLKernels/OpenCLChiSquare.cl
-  OpenCLKernels/OpenCLFFT.cl
-  OpenCLKernels/OpenCLFFTStockham.cl
-  OpenCLKernels/OpenCLTranspose.cl
-  OpenCLKernels/OpenCLCollimatorPhysics.cl
-  OpenCLKernels/OpenCLChiSquareRuntime.cl
+  SET (_HDRS
+    ${_HDRS}
+    OpenCLFFT.h
+    )
+
+  SET (_KERNELS
+    ${_KERNELS}
+    OpenCLKernels/OpenCLFFT.cl
+    OpenCLKernels/OpenCLFFTStockham.cl
+    OpenCLKernels/OpenCLTranspose.cl
  )
+ENDIF (ENABLE_AMD)
+
+IF (ENABLE_MUSR)
+  SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
+  SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
+  SET (_KERNELS ${_KERNELS} OpenCLKernels/OpenCLChiSquareRuntime.cl) # append, so kernels collected above are kept
+ENDIF (ENABLE_MUSR)
+
+IF (ENABLE_AMD AND ENABLE_OPAL)
+  SET (_SRCS
+    ${_SRCS}
+    OpenCLCollimatorPhysics.cpp
+    OpenCLGreensFunction.cpp
+    )
+
+  SET (_HDRS
+    ${_HDRS}
+    OpenCLCollimatorPhysics.h
+    OpenCLGreensFunction.h
+    )
+
+  SET (_KERNELS
+    ${_KERNELS}
+    OpenCLKernels/OpenCLCollimatorPhysics.cl
+    OpenCLKernels/OpenCLGreensFunction.cl
+  )
+ENDIF (ENABLE_AMD AND ENABLE_OPAL)

 ADD_SOURCES (${_SRCS})
 ADD_HEADERS (${_HDRS})
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@ -7,21 +7,13 @@ cl_device_id OpenCLBase::m_device_id = NULL;
 cl_event OpenCLBase::m_last_event = NULL;

 OpenCLBase::OpenCLBase() {
-  //m_context = NULL;
-  //m_command_queue = NULL;
  m_program = NULL;
  m_kernel = NULL;
-  //m_device_id = NULL;
-  //m_platform_id = NULL;
  m_kernel_file = NULL;
 	
  m_last_event = NULL;
 	
-  //m_events = new cl_event[500];
-  //m_num_events = 0;
-
  defaultRndSet = 0;
-
 }

 OpenCLBase::~OpenCLBase() {
@ -41,11 +33,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
  ocl_loadKernel(kernel_file);
  delete[] kernel_file;
-
+  
  //allocate memory for rand states
  int ierr;
  defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
-
+  
  //exec kernel
  int seed = 0;
  ocl_createKernel("initRand");
@ -55,13 +47,34 @@ int OpenCLBase::ocl_createRndStates(int size) {
  
  size_t work_items = size;
  size_t work_group_size = 1;
-
  ocl_executeKernel(1, &work_items, &work_group_size);
-
  defaultRndSet = 1;
+  
+  return DKS_SUCCESS;
+}

-  return OCL_SUCCESS;
+int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
+  //load the kernel file that contains the createRandoms kernel
+  char * kernel_file = new char[500];
+  kernel_file[0] = '\0';
+  strcat(kernel_file, OPENCL_KERNELS);
+  strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
+  ocl_loadKernel(kernel_file);
+  delete[] kernel_file;

+  //set kernel variables: rnd states (see ocl_createRndStates), output buffer, element count
+  cl_mem tmp_data = (cl_mem) mem_ptr;
+
+  ocl_createKernel("createRandoms");
+  ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
+  ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
+  ocl_setKernelArg(2, sizeof(int), &size);
+  //round the global size up to a multiple of the work group size so that all size elements are covered
+  size_t work_size = 128;
+  size_t work_items = (size / work_size + 1) * work_size;
+  ocl_executeKernel(1, &work_items, &work_size);
+
+  return DKS_SUCCESS;
 }

 /* destroy rnd states */
@ -70,7 +83,7 @@ int OpenCLBase::ocl_deleteRndStates() {
  ocl_freeMemory(defaultRndState);
  defaultRndSet = 0;

-  return OCL_SUCCESS;
+  return DKS_SUCCESS;

 }

@ -428,7 +441,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  int ierr;

  //create program from kernel
-  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
+  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, 
+					NULL, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
    return DKS_ERROR;
@ -438,7 +452,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
  ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
 	
  /*
-    check if compileng kernel source succeded, if failed return error code
+    check if compiling kernel source succeeded, if failed return error code
    if in debug mode get compilation info and print program build log witch
    will give indication what made the compilation fail
  */
@ -447,7 +461,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
 		
    //get build status
    cl_build_status status;
-    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
+    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, 
+			  sizeof(cl_build_status), &status, NULL);

    //get log size
    size_t log_size;
@ -613,12 +628,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
    }
  }
 	
-  if (ierr != OCL_SUCCESS) {
+  if (ierr != DKS_SUCCESS) {
    DEBUG_MSG("Failed to build kernel file " << kernel_file);
-    return OCL_ERROR;
+    return DKS_ERROR;
  }
 	
-  return OCL_SUCCESS;
+  return DKS_SUCCESS;
 }

 //compile kernel form source code provided
@ -660,17 +675,14 @@ cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
 /*
  write data specified by in_data to device memory, device memory space defined by cl_mem
 */
-int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) {
+int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, 
+			      size_t offset, int blocking) 
+{

  cl_int ierr;
-	
-	
-  //std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl;
-  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event);
-	
-  //m_events[m_num_events] = m_last_event;
-  m_events.push_back(m_last_event);

+  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
+			      in_data, 0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
@ -701,6 +713,11 @@ int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
 */
 int OpenCLBase::ocl_createKernel(const char* kernel_name) {
  cl_int ierr;
+
+  //release the old kernel
+  if (m_kernel != NULL)
+    clReleaseKernel(m_kernel);
+  //create a new kernel
  m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
@ -728,24 +745,20 @@ int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
  optional: work_group_size - can specify how work items are divided in work groups, 
  if left NULL OpenCL implementation handles this part.
 */
-int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) {
-  cl_int ierr;
-		
-  cl_event tmp_event;
-  if (m_last_event == NULL) {
-    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
-				  0, NULL, &tmp_event);
-  } else {
-    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
-				  1, &m_last_event, &tmp_event);
-  }
+int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, 
+				  const size_t *work_group_size) 
+{
+  cl_int ierr;	
+
+  ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, 
+				work_items, work_group_size, 
+				0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS)
-    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
-		
-  m_last_event = tmp_event;
-  m_events.push_back(m_last_event);
-	
+    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr
+	      << " work items: " << *work_items << ", "
+	      << " work group: " << (work_group_size != NULL ? *work_group_size : (size_t)0)); //work_group_size may be NULL (left to OpenCL)
+
  return ierr;
 }

@ -753,12 +766,13 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const
  read data from device, mem_ptr points to data on device out_data points to memory in host
  blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
 */
-int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) {
+int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, 
+			     size_t offset, int blocking) 
+{
  cl_int ierr;
-	
-  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event);

-  m_events.push_back(m_last_event);
+  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
+   			     out_data, 0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS)
    DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
@ -922,22 +936,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  if (ierr != DKS_SUCCESS)
    return ierr;

-  //get device properties
+  /* get device properties */
+  //maximum number of work-items in a work group supported by device
  size_t max_group_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
+  //maximum local memory size per work group
  cl_ulong local_mem_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
+  //get the supported extensions
  size_t ext_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
  char *ext = new char[ext_size];
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);

-  //get kernel properties
+  /* get kernel properties */
+  //get max work group size that can be used for this kernel
  size_t kernel_group_size;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, 
 			   sizeof(size_t), &kernel_group_size, 0);
  threadsPerBlock = kernel_group_size;

+  //get max local memory size that can be used for this kernel
  cl_ulong kernel_local_mem;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
 			   sizeof(cl_ulong), &kernel_local_mem, 0);
@ -946,18 +965,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;


-  std::cout << "Work groups: device limit " << max_group_size << ", "
-	    << "kernel limit " << kernel_group_size << ", "
+  std::cout << "Work group size: max for device " << max_group_size << " > "
+	    << "max for kernel " << kernel_group_size << " > "
 	    << "required " << work_group_size << std::endl;
  

  std::cout << "Local memory: device limit " << local_mem_size << std::endl;
-  
+  std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
  

-  std::cout << "Available extensions: " << ext << std::endl;
+  std::cout << std::endl << "Available extensions: " << ext << std::endl;

-  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
+  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;   

  return DKS_SUCCESS;
 }
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@ -1,16 +1,3 @@
-/*
-
-  Name: OpenCLBase
-
-  Author: Uldis Locans
-
-  Info: OpenCL base class to handle all the common details associated 
-  with kernel launch on OpenCL device
-
-  Date: 2014.09.18
-
-*/
-
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE

@ -30,13 +17,10 @@
 #include <CL/cl_ext.h>
 #endif

-
-
 #include "../DKSDefinitions.h"

-/* struct for random number state */
+/** struct for random number state. */
 typedef struct {
-
  double s10;
  double s11;
  double s12;
@ -45,250 +29,292 @@ typedef struct {
  double s22;
  double z;
  bool gen;
-
 } RNDState;

+/**
+ * OpenCL base class to handle device setup and basic communication with the device.
+ * Handles initialization of the OpenCL device, memory management, data transfer and kernel launch.
+ * The OpenCL kernels are located in separate files in the OpenCLKernels folder; the OpenCLBase
+ * class contains methods to read the kernel files, compile the kernel code and launch kernels
+ * from the compiled code. Which kernel file needs to be loaded for a specific function is
+ * handled by the class that is launching the kernel.
+ */
 class OpenCLBase {

 private:
-	
-  static cl_context m_context;
-  static cl_command_queue m_command_queue;

+  //variables containing OpenCL device and platform ids
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;

+  //variables containing compiled OpenCL program and kernel
  cl_context_properties m_context_properties[3];
  cl_program m_program;
  cl_kernel m_kernel;
 	
+  //variables for tracking OpenCL events
  static cl_event m_last_event;
  cl_int m_num_events;
  std::vector<cl_event> m_events;
 	
+  //currently loaded kernel file
  char * m_kernel_file;

+  //type of device used by OpenCL
  cl_device_type m_device_type;
 	
-  /*
-    Name: getPlatforms
-    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
-    Return: success or error code
-  */
+  /**
+   * Get all available OpenCL platforms.
+   * Get all available platforms and save in m_platform_ids, save number of platforms.
+   * Return: success or error code
+   */
  int ocl_getPlatforms();
 	
 	
-  /*
-    Name: getDevice
-    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
-    ReturnL success or error code
-  */
+  /**
+   * Get first available OpenCL device of specified type.
+   * Get first available device and save device id and platform id for this device,
+   * device name: (-gpu, -mic, -cpu)
+   * Return: success or error code
+   */
  int ocl_getDevice(const char* device_name);
 	
-  /*
-    Name getDeviceType
-    Info: get device type from device name (-gpu, -cpu, -mic)
-    Return: success or error code
-  */
+  /**
+   * Get cl_device_type from the specified device name.
+   * get device type from device name (-gpu, -cpu, -mic)
+   *  Return: success or error code
+   */
  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
 	
-  /*
-    Name: createContext
-    Info: create context with specified device
-    Return: success or error code
-  */
+  /**
+   * Create OpenCL context with specified device.
+   *  Return: success or error code
+   */
  int ocl_createContext();
 	
-  /*
-    Name: buildProgram
-    Info: build program from specified kernel file
-    Return: success or error code
+  /**
+   * Build program from specified kernel file.
+   * Return: success or error code.
  */
  int ocl_buildProgram(const char* kernel_file);

-  /** Compile program from kernel source string
-   *
+  /** 
+   * Compile program from kernel source string.
+   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
+   * OpenCL program, that can be then executed on the device.
+   * opts is a string specifying additional compiler flags.
   */
  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);

 protected:

+  //memory for random number states
  int defaultRndSet;
  cl_mem defaultRndState;
 	
 	
 public:
+
+  //OpenCL context and command queue
+  static cl_context m_context;
+  static cl_command_queue m_command_queue; 
    
-  /*
-    constructor
-  */
+  /**
+   * constructor
+   */
  OpenCLBase();
    
-  /*
-    destructor
-  */
+  /**
+   * destructor
+   */
  ~OpenCLBase();
    
-  /*
-    Create RND states
-    Return: success or error code
-  */
+  /**
+   * Allocate memory for size random number states and init the rnd states.
+   * Uses AMD clRng library for random numbers. 
+   * This library is only compatible with AMD devices.
+   */
  int ocl_createRndStates(int size);

-  /*
-    Destroy rnd states
-    Return: success or error code
-  */
+  /** 
+   * Create an array of random numbers on the device.
+   * Fills mem_ptr with random numbers.
+   */
+  int ocl_createRandomNumbers(void *mem_ptr, int size);
+
+  /**
+   * Destroy rnd states and free device memory.
+   * Return: success or error code
+   */
  int ocl_deleteRndStates();


-  /*
-    Name: getAllDevices
-    Info: get all available devices
-    ReturnL success or error code
+  /**
+   * Prints info about all the available platforms and devices.
+   * Can be used for information purposes to see what devices are available on the system.
+   * Return: success or error code.
  */
  int ocl_getAllDevices();

-  /** Get the OpenCL device count for the set type of device
-   *
+  /** 
+   * Get the OpenCL device count for the set type of device.
+   * Device count is set in ndev parameter, returns success or error code.
   */
  int ocl_getDeviceCount(int &ndev);

-  /** Get the name of the device used
+  /** 
+   * Get the name of the device currently in use.
   */
  int ocl_getDeviceName(std::string &device_name);

-  /** Set the device to use for OpenCL kernels.
-   *  device id to use is passed as integer.
+  /** 
+   * Set the device to use for OpenCL kernels.
+   * Device id to use is passed as integer.
   */
  int ocl_setDevice(int device);

-  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
-   *  Used when GPUs of different types might be pressent on the system.
+  /** 
+   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
+   * Used when GPUs of different types might be present on the system.
   */
  int ocl_getUniqueDevices(std::vector<int> &devices);
    
-  /*
-    Name: setUp
-    Info: set up opencl resources
-    Return: success or error code
-  */
+  /**
+   * Initialize OpenCL connection with a device of specified type.
+   * Finds out if the specified device is available, creates a context and command queue.
+   * Returns success or error code.
+   */
  int ocl_setUp(const char* device_name);
 	
-  /*
-    Name: loadKernel
-    Info: load and compile opencl kernel file if it has changed
-    Return: success or error code
+  /**
+   * Given an OpenCL kernel file name, loads the content and compiles the OpenCL code.
+   * Load and compile opencl kernel file if it has changed.
+   * Return: success or error code
  */
  int ocl_loadKernel(const char* kernel_file);


-  /** Build program from kernel source.
+  /** 
+   * Build program from kernel source.
   * Builds a program from source code provided in kernel_source.
   * If compilation fails will return DKS_ERROR
   */
  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
 	
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory on the device.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);
-	
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+
+  /**
+   * Allocate memory of specific type on device.
+   * The available types are the cl_mem_flags values listed in the OpenCL documentation:
+   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
+   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
 	
-  /*
-    Name: writeData
-    Info: write data to device memory (needs ptr to mem object)
-    Return: success or error code
-  */
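+  /** 
+   * Fill OpenCL memory buffer with a value (pass value = 0 to zero the buffer).
+   * Sets size elements of type T in the device array to value; offset is passed to clEnqueueFillBuffer as is.
+   */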
+  template <typename T>
+  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
+  
+    cl_int ierr;
+    ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset, 
+			       sizeof(T)*size, 0, nullptr, nullptr);
+    if (ierr != CL_SUCCESS)
+      return DKS_ERROR;
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Write data to device memory (needs ptr to mem object)
+   * Return: success or error code
+   */
  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: copyData
-    Info: copy data from one buffer on the device to another
-    Return: success or error code
-  */
+  /** 
+   * Copy data from one buffer on the device to another
+   * Return: success or error code
+   */
  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
 	
-  /*
-    Name: createKernel
-    Info: create kernel from program
-    Return: success or error code
-  */
+  /** 
+   * Create kernel from compiled OpenCL program.
+   * Return: success or error code
+   */
  int ocl_createKernel(const char* kernel_name);
 	
-  /*
-    Name: setKernelArgs
-    Info: set opencl kernel arguments
-    Return: success or error code
-  */
+  /**
+   * Set arguments for the kernel that will be launched.
+   * Return: success or error code
+   */
  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
 	
-  /*
-    Name: executeKernel
-    Info: execute selected kernel (needs kernel parameters)
-    Return: success or error code
+  /**
+   * Execute selected kernel.
+   * Before a kernel can be executed the program must be built, the kernel must be created
+   * with ocl_createKernel (the kernel name must exist in the compiled source), and the
+   * necessary kernel arguments must be set.
+   * Return: success or error code
  */
  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
 	
-  /*
-    Name: readData
-    Info: read data from device (needs pointer to mem object)
-    Return: success or error code
-  */
+  /**
+   * Read data from device (needs pointer to mem object).
+   * Return: success or error code
+   */
  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: freeMemory
-    Info: free device memory (needs ptr to mem object)
-    Return: success or error code
-  */
+  /**
+   * Free device memory (needs ptr to mem object).
+   *  Return: success or error code
+   */
  int ocl_freeMemory(cl_mem mem_ptr);
 	
-  /*
-    Name: cleanUp
-    Info: free opencl resources
-    Return: success or error code
-  */
+  /**
+   * Free opencl resources.
+   * Deletes the kernel, compiled program, command queue and closes the connection
+   * to device by releasing the context.
+   * Return: success or error code
+   */
  int ocl_cleanUp();
 	
-  /*
-    Name: deviceInfo
-    Info: print device info (mostly for debugging purposes)
-    Return: success or error code
-  */
+  /**
+   * Print info of currently selected device.
+   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
+   * Return: success or error code
+   */
  int ocl_deviceInfo(bool verbose = true);

-  /* Check OpenCL kernel.
-   * Query device and check if it can run the kernel with required parameters
+  /**
+   * Check OpenCL kernel.
+   * Query device and check if it can run the kernel with required parameters.
+   * Also check the available OpenCL extensions - useful for checking the supported device
+   * features, like double precision.
   */
  int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
 	
-  /*
-    Name: clearEvents
-    Info: clear saved events (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * Clear the event list.
+   * Events can be used for timing and synchronization purposes.
+   */
  void ocl_clearEvents();

-  /*
-    Name: eventInfo
-    Info: print information about kernel timings (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * print information about kernel timings from event list.
+   * for debugging purposes
+   */
  void ocl_eventInfo();

-  /*
-    Return current command queue
-  */
+  /**
+   * Return current command queue.
+   */
  cl_command_queue ocl_getQueue() { return m_command_queue; }
 };

--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

-
+/** Deprecated, SimpleFit implementation of ChiSquare. */
 class OpenCLChiSquare {

 private:
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -42,7 +42,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  if (!fp)
    DEBUG_MSG("Can't open kernel file" << kernel_file);

-  //get file size and allocate memory	
+  //get file size and allocate memory
  fseek(fp, 0, SEEK_END);
  fsize = ftell(fp);
  kernel_source = new char[fsize+1];
@ -52,7 +52,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  fread(kernel_source, 1, sizeof(char)*fsize, fp);
  kernel_source[fsize] = '\0';
  fclose(fp);
-  
+
  std::string kernel_string (kernel_source);
  return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;

@ -76,10 +76,9 @@ int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {

 double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

-  
  int ierr;
-  //calc number of thread sper workgroup and nr of work groups
-  size_t work_size_sum = 128;
+  //calc number of threads per workgroup and nr of work groups
+  size_t work_size_sum = (size_t)blockSize_m;

  /*
  size_t work_items = (size_t)length;
@ -87,7 +86,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
    work_items = (length / work_size_sum + 1) * work_size_sum;
  int work_groups = length / work_size_sum + 1;
  */
-  
+
  size_t work_items = 80 * work_size_sum;
  int work_groups = 80;

@ -96,20 +95,19 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

  double *partial_sums = new double[work_groups];
  tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
-  
+
  //execute sum kernel
-  //ocl_createKernel("parallelReductionSum");
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
-  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); 
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
  
-  //read partial sums and free temp mempry
+  //read partial sums and free temp memory
  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);
-  
+
  //sumup partial sums on the host
  double result = 0;
  for (int i = 0; i < work_groups; i++)
@ -141,6 +139,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
  //set work item size
  size_t work_items;
  size_t work_size = (size_t)blockSize_m;
+
  if (numBlocks_m < 0)
    work_items = (size_t)length;
  else
@ -157,6 +156,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -172,20 +172,23 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
    m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(17, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    //create kernel
    ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
-
    if (ierr != DKS_SUCCESS)
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -200,9 +203,12 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
-    m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(14, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
@ -226,6 +232,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }

 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
+  //write params to gpu
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
  return ierr;
 }
@ -235,6 +242,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  if (numfunc == 0)
    return DKS_SUCCESS;

+  //write function values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
  return ierr;
 }
@ -243,11 +251,12 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  if (nummap == 0)
    return DKS_SUCCESS;

+  //write map values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
  return ierr;
 }

-int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, 
+int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
 					  int size_func, int size_map)
 {

@ -257,7 +266,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
    freeChiSquare();
  }

-  //allocate temporary memory
+  //allocate temporary memory, memory is allocated for the data set, parameters, functions and maps
  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
  if (size_func == 0)
@ -277,12 +286,12 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
  int ierr = DKS_ERROR;
  if (initDone_m) {

-    //free memory
+    //free GPU memory
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
-    
+
    initDone_m = false;
  }

@ -308,9 +317,13 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
    return DKS_ERROR;
  }

-  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
+  //check the GPU kernel
+  ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
+  if (threadsPerBlock < blockSize_m) {
+    std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
+    blockSize_m = threadsPerBlock;
+  }

  return ierr;

 }
-
--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *

 const std::string openclFunctFooter = "}\n";

+/**
+ * OpenCL implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to target devices that
+ * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
+ */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {

 private:

  OpenCLBase *m_oclbase;

-  /** Private function to add user defined function to kernel string
-   *
+  /** 
+   * Private function to add user defined function to kernel string.
   */
  std::string buildProgram(std::string function);

+  /**
+   * Launch parallel reduction kernel to calculate the sum of data array
+   */
  double calculateSum(cl_mem data, int length);

 public:

-  /** Constructor wiht openclbase argument
-   *
+  /** 
+   * Constructor with OpenCLBase argument.
   */
  OpenCLChiSquareRuntime(OpenCLBase *base);
  
-  /** Default constructor
-   *
+  /** 
+   * Default constructor
   */
  OpenCLChiSquareRuntime();

-  /** Default destructor
-   *
+  /** 
+   * Default destructor
   */
  ~OpenCLChiSquareRuntime();

-    /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
   * Result is put in &result variable
   */
@ -64,22 +74,26 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -87,14 +101,16 @@ public:
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);

-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check MuSR kernels for necessary resources.
+  /** 
+   * Check MuSR kernels for necessary resources.
   * Query device properties to get if sufficient resources are
-   * available to run the kernels
+   * available to run the kernels. Also checks if double precision is enabled on the device.
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock);

--- a/src/OpenCL/OpenCLCollimatorPhysics.cpp
+++ b/src/OpenCL/OpenCLCollimatorPhysics.cpp
@ -34,7 +34,7 @@ TODO:
 2. boost.compute sort for user defined structure crashes
 */
 int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
-					       int numparticles) 
+					       int numparticles, bool enableRutherforScattering) 
 {
  /*
  //set number of total threads, and number threads per block
--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@ -17,12 +17,16 @@
 #include "boost/compute/core.hpp"
 */

+/** Double3 structure for use in OpenCL code. */
 typedef struct {
  double x;
  double y;
  double z;
 } Double3;

+/**
+ * Structure for storing particles in OpenCL code.
+ */
 typedef struct {
  int label;
  unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));

+/**
+ * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implements CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
+ */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {

 private:
@ -42,17 +50,22 @@ private:

 public:

-  /* constructor */
+  /** 
+   * Constructor with OpenCLBase as argument.
+   * Create a new instance of the OpenCLCollimatorPhysics using existing OpenCLBase object.
+   */
  OpenCLCollimatorPhysics(OpenCLBase *base) { 
    m_oclbase = base;
  }

-  /* destructor */
+  /** 
+   * Destructor.
+   */
  ~OpenCLCollimatorPhysics() { 
  }

-  /* execute degrader code on device */
-  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
+			bool enableRutherforScattering = true);

  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
--- a/src/OpenCL/OpenCLFFT.cpp
+++ b/src/OpenCL/OpenCLFFT.cpp
@ -31,7 +31,6 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
 	
  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
    return OCL_ERROR;
-
 	
  //execute kernel
  for (int step = 1; step < N; step <<= 1) {
@ -89,26 +88,78 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
  call fft execution on device for every dimension
 */
 int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
-  int ierr;
-	
+
+  int dkserr = DKS_SUCCESS;
+  cl_int ierr;
  cl_mem inout = (cl_mem)data;
-  int n = N[0];

-  for (int dim = 0; dim < ndim; dim++) {
-    ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
-    if (ierr != OCL_SUCCESS) {
-      DEBUG_MSG("Error executing bit reverse");
-      return OCL_ERROR;
-    }
+  if (forward)
+    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
+				0, NULL, NULL, &inout, NULL, NULL);
+  else
+    ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
+				0, NULL, NULL, &inout, NULL, NULL);  

-    ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
-    if (ierr != OCL_SUCCESS) {
-      DEBUG_MSG("Error executing fft reverse");
-      return OCL_ERROR;
-    }
+  if (ierr != OCL_SUCCESS) {
+    dkserr = DKS_ERROR;
+    DEBUG_MSG("Error executing cfFFT\n");
+    if (ierr == CLFFT_INVALID_PLAN)
+      std::cout << "Invlalid plan" << std::endl;
+    else 
+      std::cout << "CLFFT error" << std::endl;
  }

-  return OCL_SUCCESS;
+  return dkserr;
+}
+
+/*
+  call rcfft execution on device for every dimension
+*/
+int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
+
+  int dkserr = DKS_SUCCESS;
+  cl_int ierr;
+  cl_mem real_in = (cl_mem)real_ptr;
+  cl_mem comp_out = (cl_mem)comp_ptr;
+
+  ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
+			       0, NULL, NULL, &real_in, &comp_out, NULL);
+  
+  if (ierr != OCL_SUCCESS) {
+    dkserr = DKS_ERROR;
+    DEBUG_MSG("Error executing cfFFT\n");
+    if (ierr == CLFFT_INVALID_PLAN)
+      std::cout << "Invlalid plan" << std::endl;
+    else 
+      std::cout << "CLFFT error" << std::endl;
+  }
+
+  return dkserr;
+}
+
+/*
+  call crfft (complex to real) execution on device using the clFFT Z2D plan
+*/
+int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
+
+  int dkserr = DKS_SUCCESS;
+  cl_int ierr;
+  cl_mem real_in = (cl_mem)real_ptr;
+  cl_mem comp_out = (cl_mem)comp_ptr;
+
+  ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, 
+				0, NULL, NULL, &comp_out, &real_in, NULL);
+  
+  if (ierr != OCL_SUCCESS) {
+    dkserr = DKS_ERROR;
+    DEBUG_MSG("Error executing cfFFT\n");
+    if (ierr == CLFFT_INVALID_PLAN)
+      std::cout << "Invlalid plan" << std::endl;
+    else 
+      std::cout << "CLFFT error" << std::endl;
+  }
+
+  return dkserr;
 }
 	
 /*
@ -120,10 +171,11 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
 }
 	
 /*
-  call kernel to normalize fft
+  call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
 */
 int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {

+/*
  cl_mem inout = (cl_mem)data;

  int n = N[0];
@ -150,132 +202,175 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
    DEBUG_MSG("Error executing kernel");
    return OCL_ERROR;
  }
-	
+*/	
  return OCL_SUCCESS;
 }

-int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
-	
-  int ierr;
-  int size = sizeof(cl_double2)*pow(N,ndim);
-	
-  cl_mem mem_tmp;
-  cl_mem mem_src = (cl_mem)src;
-  cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
+int OpenCLFFT::setupFFT(int ndim, int N[3]) {

-  //set the number of work items in each dimension
-  size_t work_items[3];
-  int p = 1;
-  int threads = N / 2;
-  int f = (forward) ? -1 : 1;
-	
-  //execute kernel
-  int n = (int)log2(N);
-  for (int i = 0; i < ndim; i++) {
+  cl_int err;

-    int dim = i+1;
-    p = 1;
-    work_items[0] = (dim == 1) ? N/2 : N;
-    work_items[1] = (dim == 2) ? N/2 : N;
-    work_items[2] = (dim == 3) ? N/2 : N;
-		
-    //transpose array if calculating dimension larger than 1
-    //if (dim > 1) 
-    //	ocl_executeTranspose(mem_src, N, ndim, dim);
-		
-    //create kernel and set kernel arguments
-    if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
-      return OCL_ERROR;
-			
-    for (int t = 1; t <= log2(N); t++) {
-		
-      m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
-      m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
-      m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
-      m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
-      m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
-      m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
-		
-      if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) 
-	return OCL_ERROR;
-
-      mem_tmp = mem_src;
-      mem_src = mem_dst;
-      mem_dst = mem_tmp;
-	
-      p = 2*p;
-    }
-		
-    //transpose array back if calculating dimension larger than 1
-    //if (dim > 1)
-    //	ocl_executeTranspose(mem_src, N, ndim, dim);
-  }	
-
-  if (ndim*n % 2 == 1) {
-    m_oclbase->ocl_copyData(mem_src, mem_dst, size);
-    mem_tmp = mem_src;
-    mem_src = mem_dst;
-    mem_dst = mem_tmp;
-  }
-
-  m_oclbase->ocl_freeMemory(mem_dst);
-		
-  return OCL_SUCCESS;
-	
-}
-
-int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
-
-  cl_mem mem_src = (cl_mem)src;
-	
-  size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
-  size_t work_group_size[3] = {(size_t)N/2, 1, 1};
-	
-  m_oclbase->ocl_createKernel("fft_batch3D");
-	
-  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
-  m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
-  m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
-  m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
-  m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
-	
-	
-  for (int dim = 1; dim < ndim+1; dim++) {
-    m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
-    m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
-  }
-	
-  return OCL_SUCCESS;
-}
-
-int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
-	
-  cl_mem mem_src = (cl_mem)src;
-	
+  clfftDim dim;
  if (ndim == 1)
-    return OCL_SUCCESS;
-		
-  size_t work_items[3];
-  work_items[0] = N[0];
-  work_items[1] = N[1];
-  work_items[2] = 1;
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else 
+    dim = CLFFT_3D;
+  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};

-  size_t work_group_size[3];
-  work_group_size[0] = N[0];
-  work_group_size[1] = N[1];
-  work_group_size[2] = 1;
+  /* Create fft plan for the selected dimension (1D, 2D or 3D) */
+  err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);

-  size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
-	
-  m_oclbase->ocl_createKernel("transpose");
-  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
-  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
-  m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
-  m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
-  m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
-  m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
+  /* Set plan parameters */
+  err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
+  if (err != CL_SUCCESS)
+    std::cout << "Error setting precision" << std::endl;
+  err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
+  if (err != CL_SUCCESS)
+    std::cout << "Error setting layout" << std::endl;
+  err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
+  if (err != CL_SUCCESS)
+    std::cout << "Error setting result location" << std::endl;
+  /* Bake the plan */
+  err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
+
+  if (err != CL_SUCCESS) {
+    DEBUG_MSG("Error creating Complex-to-complex plan");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
+  /* Build and bake the clFFT real-to-complex (D2Z) plan; scale is not used here. */
+  cl_int err;
+  clfftDim dim;
+  if (ndim == 1)
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else
+    dim = CLFFT_3D;
+
+  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
+
+  /* complex side uses N[0]/2 + 1 elements along the first dimension */
+  size_t half = (size_t)N[0] / 2 + 1;
+  size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
+  size_t clOutStride[3] = {1, half, half * N[1]};
+
+  /* Create the plan; every later step runs only while err is still CL_SUCCESS */
+  err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
+
+  /* Set plan parameters */
+  if (err == CL_SUCCESS) err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
+  if (err == CL_SUCCESS) err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
+  if (err == CL_SUCCESS) err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
+  if (err == CL_SUCCESS) err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
+  if (err == CL_SUCCESS) err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
+
+  /* Bake the plan */
+  if (err == CL_SUCCESS) err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
+
+  if (err != CL_SUCCESS) {
+    DEBUG_MSG("Error creating Real-to-complex plan");
+    return DKS_ERROR;
+  }
+  return DKS_SUCCESS;
+}
+
+int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
+  /* Build and bake the clFFT complex-to-real (Z2D) plan; scale is not used here. */
+  cl_int err;
+  clfftDim dim;
+  if (ndim == 1)
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else
+    dim = CLFFT_3D;
+
+  size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
+
+  /* complex side uses N[0]/2 + 1 elements along the first dimension */
+  size_t half = (size_t)N[0] / 2 + 1;
+  size_t clInStride[3] = {1, half, half * N[1]};
+  size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
+
+  /* Create the plan; every later step runs only while err is still CL_SUCCESS */
+  err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
+
+  /* Set plan parameters */
+  if (err == CL_SUCCESS) err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
+  if (err == CL_SUCCESS) err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
+  if (err == CL_SUCCESS) err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
+  if (err == CL_SUCCESS) err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
+  if (err == CL_SUCCESS) err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
+
+  /* Bake the plan */
+  if (err == CL_SUCCESS) err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
+
+  if (err != CL_SUCCESS) {
+    DEBUG_MSG("Error creating Complex-to-real plan");
+    return DKS_ERROR;
+  }
+  return DKS_SUCCESS;
+}
+
+int OpenCLFFT::destroyFFT() {
+  clfftDestroyPlan(&planHandleZ2Z);
+  clfftDestroyPlan(&planHandleD2Z);
+  clfftDestroyPlan(&planHandleZ2D);
+
+  clfftTeardown();
+
+  return DKS_SUCCESS;
+}
+
+
+void OpenCLFFT::printError(clfftStatus err) {
+
+  if (err != CL_SUCCESS) {
+    std::cout << "Error creating default plan " << err <<  std::endl;
+    switch(err) {
+    case CLFFT_BUGCHECK: 
+      std::cout << "bugcheck" << std::endl; 
+      break;
+    case CLFFT_NOTIMPLEMENTED: 
+      std::cout << "not implemented" << std::endl; 
+      break;
+    case CLFFT_TRANSPOSED_NOTIMPLEMENTED: 
+      std::cout << "transposed not implemented" << std::endl; 
+      break;
+    case CLFFT_FILE_NOT_FOUND: 
+      std::cout << "file not found" << std::endl; 
+      break;
+    case CLFFT_FILE_CREATE_FAILURE: 
+      std::cout << "file create failure" << std::endl; 
+      break;
+    case CLFFT_VERSION_MISMATCH: 
+      std::cout << "version missmatch" << std::endl; 
+      break;
+    case CLFFT_INVALID_PLAN: 
+      std::cout << "invalid plan" << std::endl; 
+      break;
+    case CLFFT_DEVICE_NO_DOUBLE: 
+      std::cout << "no double" << std::endl; 
+      break;
+    case CLFFT_DEVICE_MISMATCH: 
+      std::cout << "device missmatch" << std::endl; 
+      break;
+    case CLFFT_ENDSTATUS: 
+      std::cout << "end status" << std::endl; 
+      break;
+    default: 
+      std::cout << "other: " << err << std::endl;
+      break;
+    }
+  }

-  return OCL_SUCCESS;
 }

 /*
--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@ -1,14 +1,3 @@
-/*
-
-  Name: OpenCLFFT
-
-  Author: Uldis Locans
-
-  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
-
-  Data: 19.09.2014
-
-*/
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT

@ -20,12 +9,25 @@
 #include "../Algorithms/FFT.h"
 #include "OpenCLBase.h"

-class OpenCLFFT : public DKSFFT {
+#include "clFFT.h"
+
+/**
+ * OpenCL FFT class based on BaseFFT interface.
+ * Uses the clFFT library to perform FFTs on AMD GPUs.
+ * The clFFT library also works on NVIDIA GPUs and other devices that
+ * support OpenCL.
+ */
+class OpenCLFFT : public BaseFFT {

 private:

  OpenCLBase *m_oclbase;

+  clfftSetupData fftSetup;
+  clfftPlanHandle planHandleZ2Z;
+  clfftPlanHandle planHandleD2Z;
+  clfftPlanHandle planHandleZ2D;
+
  /*
    Info: call fft kernels to execute FFT of the given domain,
    data - devevice memory ptr, cdim - current dim to transform, 
@ -42,15 +44,31 @@ private:
  */
  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);

+  /** Get clfftStatus and print the corresponding error message.
+   *  clfftStatus is returned from all clFFT library functions, print error displays the
+   *  corresponding error message. If "other" is printed then error code corresponds to 
+   *  OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
+   *  be checked to determine the reason for the error.
+   */
+  void printError(clfftStatus err);
+  
 public:

  /* constructor - currently does nothing*/
  OpenCLFFT(OpenCLBase *base) {
    m_oclbase = base;
+
+    /* Set up fft */
+    cl_int err;
+    err = clfftInitSetupData(&fftSetup);
+    err = clfftSetup(&fftSetup);
+    
+    if (err != CL_SUCCESS)
+      DEBUG_MSG("Error seting up clFFT");
  }
 	
  /* destructor - currently does nothing*/
-  ~OpenCLFFT() { }
+  ~OpenCLFFT() { destroyFFT(); }
 	
  /*
    Info: execute forward fft function with data set on device
@ -77,35 +95,22 @@ public:
    Info: set FFT size
    Return: success or error code
  */
-  int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
+  int setupFFT(int ndim, int N[3]);

-  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);

-  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);

-  int destroyFFT() { return DKS_SUCCESS; }
+  int destroyFFT();
 	
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-				int streamId = -1)
-    {
-      return DKS_ERROR;
-    }
+		   int streamId = -1);
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
-				int streamId = -1)
-    {
-      return DKS_ERROR;
-    }
-  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
-    {
-      return DKS_ERROR;
-    }
+		   int streamId = -1);
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
+    return DKS_ERROR;
+  }

-  int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
-
-  int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
-
-  int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
-	
  //void printData3DN4(cl_double2* &data, int N);

 };
--- a/src/OpenCL/OpenCLGreensFunction.cpp
+++ b/src/OpenCL/OpenCLGreensFunction.cpp
@ -0,0 +1,181 @@
+#include "OpenCLGreensFunction.h"
+#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
+
+OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
+  m_base = base;
+  base_create = false;
+}
+
+OpenCLGreensFunction::OpenCLGreensFunction() {
+  m_base = new OpenCLBase();
+  base_create = true;
+}
+
+OpenCLGreensFunction::~OpenCLGreensFunction() {
+  if (base_create)
+    delete m_base;
+}
+
+int OpenCLGreensFunction::buildProgram() {
+  //stack buffer instead of a leaked new[]; assumes ocl_loadKernel does not keep the pointer
+  char kernel_file[500] = "";
+  strcat(kernel_file, OPENCL_KERNELS);
+  strcat(kernel_file, GREENS_KERNEL);
+
+  return m_base->ocl_loadKernel(kernel_file);
+}
+
+int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+					 double hr_m0, double hr_m1, double hr_m2, 
+					 int streamId)
+{
+  int ierr = DKS_SUCCESS;
+
+  //compile opencl program from source
+  buildProgram();
+
+  //cast the input data ptr to cl_mem
+  cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
+  
+  //set the work item size
+  size_t work_size = 128;
+  size_t work_items = I * J * K;
+  if (work_items % work_size > 0) 
+    work_items = (work_items / work_size + 1) * work_size;
+
+  //create kernel
+  ierr = m_base->ocl_createKernel("kernelTmpgreen");
+
+  //set kernel parameters
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
+  m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
+  m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
+  m_base->ocl_setKernelArg(4, sizeof(int), &I);
+  m_base->ocl_setKernelArg(5, sizeof(int), &J);
+  m_base->ocl_setKernelArg(6, sizeof(int), &K);
+  
+  //execute kernel
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
+
+  return ierr;
+}
+		
+int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
+						    int K, int streamId)
+{
+  int ierr = DKS_SUCCESS;
+
+  //compile opencl program from source
+  buildProgram();
+
+  //cast the input data ptr to cl_mem
+  cl_mem rho2_ptr = (cl_mem)rho2_m;
+  cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
+  int NI = 2*(I - 1);
+  int NJ = 2*(J - 1);
+
+  //set the work item size
+  size_t work_size = 128;
+  size_t work_items = I * J * K;
+  if (work_items % work_size > 0) 
+    work_items = (work_items / work_size + 1) * work_size;
+
+  //create kernel
+  ierr = m_base->ocl_createKernel("kernelIntegration");
+
+  //set kernel parameters
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
+  m_base->ocl_setKernelArg(2, sizeof(int), &NI);
+  m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
+  m_base->ocl_setKernelArg(4, sizeof(int), &I);
+  m_base->ocl_setKernelArg(5, sizeof(int), &J);
+  m_base->ocl_setKernelArg(6, sizeof(int), &K);
+  
+  //execute kernel
+  double zero = 0.0;
+  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
+  m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
+
+  return ierr;
+
+}
+		
+ 
+int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) 
+{
+  int ierr = DKS_SUCCESS;
+
+  //compile opencl program from source
+  buildProgram();
+
+  //cast the input data ptr to cl_mem
+  cl_mem rho2_ptr = (cl_mem)rho2_m;
+  int NI = I + 1;
+  int NJ = J + 1;
+  int NK = K + 1;
+  int I2 = 2*I;
+  int J2 = 2*J;
+  int K2 = 2*K;
+
+  int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
+
+  //set the work item size
+  size_t work_size = 128;
+  size_t work_items = NI * NJ * NK;
+  if (work_items % work_size > 0) 
+    work_items = (work_items / work_size + 1) * work_size;
+
+  //create kernel
+  ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
+
+  //set kernel parameters
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(int), &I2);
+  m_base->ocl_setKernelArg(2, sizeof(int), &J2);
+  m_base->ocl_setKernelArg(3, sizeof(int), &K2);
+  m_base->ocl_setKernelArg(4, sizeof(int), &NI);
+  m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
+  m_base->ocl_setKernelArg(6, sizeof(int), &NK);
+  m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
+
+  //execute kernel
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
+
+  return ierr;
+}
+
+
+int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
+{
+    int ierr = DKS_SUCCESS;
+
+  //compile opencl program from source
+  buildProgram();
+
+  //cast the input data ptr to cl_mem
+  cl_mem mem_ptr1 = (cl_mem) ptr1;
+  cl_mem mem_ptr2 = (cl_mem) ptr2;
+
+  //set the work item size
+  size_t work_size = 128;
+  size_t work_items = size;
+  if (work_items % work_size > 0) 
+    work_items = (work_items / work_size + 1) * work_size;
+
+  //create kernel
+  ierr = m_base->ocl_createKernel("multiplyComplexFields");
+
+  //set kernel parameters
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
+  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
+  m_base->ocl_setKernelArg(2, sizeof(int), &size);
+
+  //execute kernel
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
+
+  return ierr;
+
+}
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@ -0,0 +1,64 @@
+#ifndef H_OPENCL_GREENSFUNCTION
+#define H_OPENCL_GREENSFUNCTION
+
+#include <iostream>
+#include <cmath>
+
+#include "../Algorithms/GreensFunction.h"
+#include "OpenCLBase.h"
+
+/**
+ * OpenCL implementation of the GreensFunction calculation for OPAL's Poisson solver.
+ * All device work is delegated to an OpenCLBase object (m_base).
+ */
+class OpenCLGreensFunction : public GreensFunction {
+
+private:
+
+  // NOTE(review): presumably records whether m_base was created by this object
+  // (default constructor) rather than handed in by the caller -- confirm in the .cpp
+  bool base_create;
+  // OpenCL backend used to create kernels, set their arguments and launch them
+  OpenCLBase *m_base;
+
+public:
+
+  /** Constructor with OpenCLBase argument */
+  OpenCLGreensFunction(OpenCLBase *base);
+
+  /** Default constructor */
+  OpenCLGreensFunction();
+
+  /** Destructor */
+  ~OpenCLGreensFunction();
+
+  /** Load the OpenCL kernel file containing the Green's function kernels.
+   *  m_base takes the kernel file and compiles the OpenCL program.
+   */
+  int buildProgram();
+
+  /**
+    Info: calculate the integral on device memory (taken from OPAL src code).
+    tmpgreen is passed as an opaque device pointer.
+    Return: success or error code
+  */
+  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
+		       double hr_m0, double hr_m1, double hr_m2, 
+		       int streamId = -1);
+		
+  /**
+    Info: integration of rho2_m field (taken from OPAL src code).
+    rho2_m and tmpgreen are passed as opaque device pointers.
+    Return: success or error code
+  */
+  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
+				  int streamId = -1);
+		
+  /**
+    Info: mirror rho field (taken from OPAL src code).
+    Launches the kernelMirroredRhoField kernel on rho2_m.
+    Return: success or error code
+  */
+  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
+
+  /**
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
+    size is the number of complex elements handed to the multiplyComplexFields kernel.
+    Return: success or error code
+  */
+  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
+
+};
+
+
+#endif
--- a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@ -106,6 +106,56 @@ double ifld(double t, double alpha, double phi, double nu, double lambdaT, doubl
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+/*
+ * muSR theory function ifgk evaluated at time t:
+ *   (1-alpha)*exp(-rateL)
+ *     + alpha*(cos(wt) - sigma^2*t/(2*pi*nu)*sin(wt))*exp(-sigma^2*t^2/2)
+ * with wt = TWO_PI*nu*t and rateL = (lambda*t)^beta.
+ * rateL is fixed to 1 when beta < 0.001; for nu < 0.01 the bracket is
+ * replaced by (1 - sigma^2*t^2). Returns 0 when lambda is negative.
+ */
+double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double rate2 = sigma*sigma*t*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // negative lambda is rejected (lambda == 0 is still accepted)
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+  } else {
+    // sigma^2*t^2/wt is written as sigma^2*t/(2*pi*nu): same value, but no 0/0 (NaN)
+    // at t == 0; nu >= 0.01 in this branch, so the divisor cannot vanish
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t/(TWO_PI*nu)*sin(wt))*exp(-0.5*rate2);
+  }
+
+  return result;
+}
+
+/*
+ * muSR theory function ifll evaluated at time t:
+ *   (1-alpha)*exp(-rateL) + alpha*(cos(wt) - a/(2*pi*nu)*sin(wt))*exp(-a*t)
+ * with wt = TWO_PI*nu*t and rateL = (lambda*t)^beta.
+ * rateL is fixed to 1 when beta < 0.001; for nu < 0.01 the bracket is
+ * replaced by (1 - a*t). Returns 0 when lambda is negative.
+ */
+double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  // negative lambda is rejected before anything is evaluated
+  if (lambda < 0.0)
+    return 0.0;
+
+  const double at = a*t;
+  const double rateL = (beta < 0.001) ? 1.0 : pow(lambda*t, beta);
+
+  // below the frequency threshold the non-oscillating form is used
+  if (nu < 0.01)
+    return (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
+
+  const double wt = TWO_PI*nu*t;
+  return (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
+}
+
 double b(double t, double phi, double nu) {
  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
--- a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@ -1,6 +1,4 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#pragma OPENCL EXTENSION 
-

 /******Random numbers********/

@ -89,13 +87,14 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {

  if (id < N) {
    RNDState tmp;
-    int tmp_seed = id;// * 0x100000000ULL;
+    int tmp_seed = 2*id;// * 0x100000000ULL;
    tmp.s10 = 12345 + tmp_seed;
    tmp.s11 = 12345 + tmp_seed;
-    tmp.s12 = 123 + tmp_seed;
+    tmp.s12 = 12345 + tmp_seed;
    tmp.s20 = 12345 + tmp_seed;
    tmp.s21 = 12345 + tmp_seed;
-    tmp.s22 = 123 + tmp_seed;
+    tmp.s22 = 12345 + tmp_seed;
+

    tmp.z = 0;
    tmp.gen = true;
@ -105,6 +104,19 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {

 }

+/* fill data[0..size) with one rand_uniform() value per work-item */
+__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
+
+  const int gid = get_global_id(0);
+
+  /* work-items past the end of the arrays have nothing to do */
+  if (gid >= size)
+    return;
+
+  /* draw from a private copy of the generator state, then store the advanced state back */
+  RNDState local_state = states[gid];
+  data[gid] = rand_uniform(&local_state);
+  states[gid] = local_state;
+
+}
+

 /**********Degrader**********/
 enum PARAMS { POSITION, 
--- a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
@ -0,0 +1,170 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/**
+ * Compute the greens integral analytically.
+ * One work-item handles one element of tmpgreen; the linear id is unfolded
+ * into (i, j, k) with i running fastest over NI, then j over NJ, then k.
+ * hr_m0, hr_m1, hr_m2 are the cell sizes along i, j and k.
+ */
+__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
+			     int NI, int NJ, int NK)
+{
+
+  int id = get_global_id(0);
+
+  if (id < NI * NJ * NK) {
+    int i = id % NI;
+    int k = id / (NI * NJ);
+    int j = (id - k * NI * NJ) / NI;
+
+    double cellVolume = hr_m0 * hr_m1 * hr_m2;
+
+    //coordinates shifted by half a cell
+    double vv0 = i * hr_m0 - hr_m0 / 2;
+    double vv1 = j * hr_m1 - hr_m1 / 2;
+    double vv2 = k * hr_m2 - hr_m2 / 2;
+
+    double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+    //atan terms (halved below), then the log terms
+    double tmpgrn  = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+    tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+    tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+
+    tmpgrn = tmpgrn / 2;
+
+    tmpgrn += vv1 * vv2 * log(vv0 + r);
+    tmpgrn += vv0 * vv2 * log(vv1 + r);
+    tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+    tmpgreen[id] = tmpgrn / cellVolume;
+
+  }
+
+}
+
+/**
+ * Perform the actual integration.
+ * For every element (i, j, k) of tmpgreen (extents NI_tmp x NJ_tmp x NK_tmp)
+ * the eight corner values at i/i+1, j/j+1, k/k+1 are combined as
+ *   t(i+1,j+1,k+1) + t(i+1,j,k) + t(i,j+1,k) + t(i,j,k+1)
+ *   - t(i+1,j+1,k) - t(i+1,j,k+1) - t(i,j+1,k+1) - t(i,j,k)
+ * where a corner outside tmpgreen counts as 0. The result is written to
+ * rho2_m, which is indexed with the extents NI and NJ.
+ */
+__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen, 
+				int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) 
+{
+
+  int id = get_global_id(0);
+
+  if (id < NI_tmp * NJ_tmp * NK_tmp) {
+    int i = id % NI_tmp;
+    int k = id / (NI_tmp * NJ_tmp);
+    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
+
+    //strides of tmpgreen for one step in j and in k, and the index of (i, j, k)
+    int sj = NI_tmp;
+    int sk = NI_tmp * NJ_tmp;
+    int c  = i + j * sj + k * sk;
+
+    //is the +1 neighbour still inside tmpgreen?
+    bool has_i = (i+1 < NI_tmp);
+    bool has_j = (j+1 < NJ_tmp);
+    bool has_k = (k+1 < NK_tmp);
+
+    //only the selected operand of ?: is evaluated, so nothing is read out of range
+    double tmp0 = (has_i && has_j && has_k) ? tmpgreen[c + 1 + sj + sk] : 0;
+    double tmp1 = has_i                     ? tmpgreen[c + 1]           : 0;
+    double tmp2 = has_j                     ? tmpgreen[c + sj]          : 0;
+    double tmp3 = has_k                     ? tmpgreen[c + sk]          : 0;
+    double tmp4 = (has_i && has_j)          ? tmpgreen[c + 1 + sj]      : 0;
+    double tmp5 = (has_i && has_k)          ? tmpgreen[c + 1 + sk]      : 0;
+    double tmp6 = (has_j && has_k)          ? tmpgreen[c + sj + sk]     : 0;
+    double tmp7 = tmpgreen[c];
+
+    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
+
+    rho2_m[i + j*NI +  k*NI*NJ] = tmp_rho;
+  }
+}
+
+/** mirror rho-field: copy the element at linear index NI*NJ into element 0 */
+__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
+  const int src = NI*NJ;
+  rho2_m[0] = rho2_m[src];
+}
+
+/**
+ * Mirror the rho field.
+ * Work-item 0 first copies rho2_m[NI*NJ] into rho2_m[0]. Then every work-item
+ * with id < NI_tmp*NJ_tmp*NK_tmp reads element (i, j, k) and copies it to the
+ * positions reflected in i, j and/or k (ri = NI - i, rj = NJ - j, rk = NK - k);
+ * a reflection along an axis is skipped when the index on that axis is 0.
+ *
+ * NI, NJ, NK              extents used to index rho2_m
+ * NI_tmp, NJ_tmp, NK_tmp  extents of the region that is read
+ * size                    indices >= size are neither read nor written
+ */
+__kernel void kernelMirroredRhoField(__global double *rho2_m, 
+				     int NI, int NJ, int NK, 
+				     int NI_tmp, int NJ_tmp, int NK_tmp,
+				     int size) 
+{
+
+  int id = get_global_id(0);
+
+  if (id == 0)
+   rho2_m[0] = rho2_m[NI * NJ];
+
+  //note: barrier() only synchronizes the work-items of one work-group;
+  //it stays outside of any condition so that every work-item reaches it
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  int id1, id2, id3, id4, id5, id6, id7, id8;
+
+  if (id < NI_tmp * NJ_tmp * NK_tmp) {
+    int i = id % NI_tmp;
+    int k = id / (NI_tmp * NJ_tmp);
+    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
+
+    //reflected coordinates
+    int ri = NI - i;
+    int rj = NJ - j;
+    int rk = NK - k;
+
+    //source index (id1) and the seven reflected target indices
+    id1 = k * NI * NJ + j * NI + i;
+    id2 = k * NI * NJ + j * NI + ri;
+    id3 = k * NI * NJ + rj * NI + i;
+    id4 = k * NI * NJ + rj * NI + ri;
+
+    id5 = rk * NI * NJ + j * NI + i;
+    id6 = rk * NI * NJ + j * NI + ri;
+    id7 = rk * NI * NJ + rj * NI + i;
+    id8 = rk * NI * NJ + rj * NI + ri;
+
+    double data = 0.0;
+    if (id1 < size)
+      data = rho2_m[id1];
+
+    if (i != 0 && id2 < size) rho2_m[id2] = data;
+
+    if (j != 0 && id3 < size) rho2_m[id3] = data;
+
+    if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
+
+    if (k != 0 && id5 < size) rho2_m[id5] = data;
+
+    if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
+
+    if (k != 0 && j != 0 && id7 < size) rho2_m[id7] = data;
+
+    //logical && throughout (the original mixed in a bitwise &)
+    if (k != 0 && j != 0 && i != 0 && id8 < size) rho2_m[id8] = data;
+  }
+
+}
+
+/** product of two complex numbers stored as double2 (x = real part, y = imaginary part) */
+double2 ComplexMul(double2 a, double2 b) {
+  const double re = a.x * b.x - a.y * b.y;
+  const double im = a.x * b.y + a.y * b.x;
+
+  return (double2)(re, im);
+}
+
+/** element-wise complex product: ptr1[i] = ptr1[i] * ptr2[i] for every i < size */
+__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2, 
+				    int size) 
+{
+
+  const int gid = get_global_id(0);
+
+  //work-items past the end of the fields have nothing to do
+  if (gid >= size)
+    return;
+
+  ptr1[gid] = ComplexMul(ptr1[gid], ptr2[gid]);
+
+}
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@ -5,6 +5,10 @@
 #include <string>
 #include <sys/time.h>

+/**
+ * Custom timer class.
+ * Allows inserting timers in the code to measure function execution times.
+ */
 class DKSTimer {

 private:
@ -17,39 +21,45 @@ private:

 public:

-  /** Init DKSTimer by seting timer to zero  */
+  /** Init DKSTimer by setting timer to zero. */
  DKSTimer();

  ~DKSTimer();

-  /** Init the timer
-   *  Set the name for timer and clear all values
+  /** 
+   * Init the timer.
+   * Set the name for timer and clear all values
   */
  void init(std::string n);

-  /** Start the timer.
-   *  Get the curret time with gettimeofday and save in timeStart
+  /** 
+   * Start the timer.
+   * Get the current time with gettimeofday and save in timeStart
   */
  void start();

-  /** Stop the timer 
-   *  Get the curretn time with gettimeofday and save in timeEnd
-   *  Calculate elapsed time by timeEnd - timeStart and add to timervalue
+  /** 
+   * Stop the timer.
+   * Get the current time with gettimeofday and save in timeEnd
+   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
   */
  void stop();

-  /** Reset timervalue to zero.
-   *  Set timervalue, timeStart and timeEnd to zero
+  /** 
+   * Reset timervalue to zero.
+   * Set timervalue, timeStart and timeEnd to zero
   */
  void reset();

-  /** Return elapsed time in seconds.
-   *  Return the value of timervalue
+  /** 
+   * Return elapsed time in seconds.
+   * Return the value of timervalue
   */
  double gettime();

-  /** Print timer.
-   *  Print the elapsed time of the timer
+  /** 
+   * Print timer.
+   * Print the elapsed time of the timer
   */
  void print();
     
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #ADD_EXECUTABLE(testFFT testFFT.cpp)
 #ADD_EXECUTABLE(testMIC testMIC.cpp)
 #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
-#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
-#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
+ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
+ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
 #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
 #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,10 +22,11 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #ADD_EXECUTABLE(testGather testGather.cpp)
 #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
 #ADD_EXECUTABLE(testTranspose testTranspose.cpp)
+ADD_EXECUTABLE(testRandom testRandom.cpp)
 ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
-#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
+ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
 #ADD_EXECUTABLE(testPush testPush.cpp)
-#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
 #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)

@ -38,8 +39,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
 #TARGET_LINK_LIBRARIES(testFFT dks)
 #TARGET_LINK_LIBRARIES(testMIC dks)
 #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
-#TARGET_LINK_LIBRARIES(testFFT3D dks)
-#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
+TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
 #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -53,10 +54,11 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
 #TARGET_LINK_LIBRARIES(testGather dks)
 #TARGET_LINK_LIBRARIES(testGatherAsync dks)
 #TARGET_LINK_LIBRARIES(testTranspose dks)
-TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
-#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
+TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testPush dks)
-#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
+TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testIntegration dks)
 #TARGET_LINK_LIBRARIES(testImageReconstruction dks)

@ -81,4 +83,4 @@ TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
 #IF (NOT CUDA_VERSION VERSION_LESS "7.0")
  #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
  #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
-#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
+#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
--- a/test/testCollimatorPhysicsSoA.cpp
+++ b/test/testCollimatorPhysicsSoA.cpp
@ -129,7 +129,9 @@ int main(int argc, char *argv[]) {
  //init random
  base.callInitRandoms(numpart);

+  
  //**test collimator physics and sort***//
+  
  void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;

  //allocate memory for particles
@ -210,8 +212,8 @@ int main(int argc, char *argv[]) {
  base.freeMemory<double>(pz_ptr, numpart);

  base.freeMemory<double>(param_ptr, 12);
-
-  /*  
+  
+    /*  
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < 10; i++) {
    std::cout <<  p.label[i] << "\t" << p.rx[i] 
--- a/test/testFFT3D.cpp
+++ b/test/testFFT3D.cpp
@ -1,6 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
+#include <string>

 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -18,22 +19,30 @@ int main(int argc, char *argv[]) {
  int N = 16;
  char *api_name = new char[10];
  char *device_name = new char[10];
-  if (argc == 2) {
-    N = atoi(argv[1]);
-    strcpy(api_name, "Cuda");
-    strcpy(device_name, "-gpu");
-  } else if (argc == 3) {
-    N = atoi(argv[1]);
-    strcpy(api_name, argv[2]);
-    strcpy(device_name, "-gpu");
-  } else if (argc == 4) {
-    N = atoi(argv[1]);
-    strcpy(api_name, argv[2]);
-    strcpy(device_name, argv[3]);
-  } else {
-    N = 16;
-    strcpy(api_name, "OpenCL");
-    strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } 
+
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } 
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } 
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+
+    if (argv[i] == string("-N") && i + 1 < argc) //a trailing -N has no value: keep the default N
+      N = atoi(argv[i+1]);
   }

  cout << "Use api: " << api_name << ", " << device_name << endl;
@ -74,9 +83,16 @@ int main(int argc, char *argv[]) {
 	
  /* write data to device */	
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+  if (N < 5)
+    printData3DN4(cdata, N, 3);
+

  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);
+  if (N < 5) {
+    base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
+    printData3DN4(cfft, N, 3);
+  }
 	
  /* execute ifft */	
  base.callIFFT(mem_ptr, 3, dimsize);
@ -86,7 +102,9 @@ int main(int argc, char *argv[]) {
 	
  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
-	
+  if (N < 5)
+    printData3DN4(cifft, N, 3);	
+
  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
 	
@ -130,7 +148,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
 	if (a < 10e-5 && a > -10e-5)
 	  a = 0;
 					
-	cout << d << "; " << a << "\t";
+	cout << "(" << d << "," << a << ") ";
      }
    }
    cout << endl;
@ -157,3 +175,5 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
 }

+
+
--- a/test/testFFT3DRC.cpp
+++ b/test/testFFT3DRC.cpp
@ -1,6 +1,8 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
+#include <fstream>
+#include <iomanip>

 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@ -8,54 +10,53 @@
 using namespace std;

 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
-void initData(double *data, int dimsize[3]);
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
+void initData(double *data, int dimsize[3], int dim);
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, 
+		char *api_name, char *device_name, char *file_name);
 void printHelp();

+void printData3DN4(complex<double>* &data, int N, int dim);
+void printData3DN4(double* &data, int N, int dim);
+
+double precision(double a) {
+  //if (a < 1e-10)
+  //  return 0.0;
+  //else
+    return a;
+}
+
 int main(int argc, char *argv[]) {

  int N1 = 8;
  int N2 = 8;
  int N3 = 8;
  int dim = 3;
-  int loop = 10;
+  int loop = 0;
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+  char *file_name = new char[50];

-  if ( readParams(argc, argv, N1, N2, N3, loop) )
+  if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
    return 0;

-  int dimsize[3] = {N3, N2, N1};
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N1, N2, N3};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];

  double *rdata = new double[sizereal];
  double *outdata = new double[sizereal];
  complex<double> *cfft = new complex<double>[sizecomp];
-
-  for (int i=0; i<sizecomp; ++i) {
-    cfft[i].real() = 7.;
-    cfft[i].imag() = 3.33;
-  }
-  initData(rdata, dimsize);
+  initData(rdata, dimsize, dim);

  /* init DKSBase */
  cout << "Init device and set function" << endl;
-#ifdef DKS_MIC
  DKSBase base;
-  base.setAPI("OpenMP", 6);
-  base.setDevice("-mic", 4);
-  base.initDevice();
-  base.setupFFTRC(dim, dimsize);
-  /* setup backward fft (COMPLEX->REAL) */
-  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
-#endif
-
-#ifdef DKS_CUDA
-  DKSBase base;
-  base.setAPI("Cuda", 4);
-  base.setDevice("-gpu", 4);
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
  base.initDevice();
  base.setupFFT(dim, dimsize);
-#endif

  // allocate memory on device
  int ierr;
@ -67,69 +68,59 @@ int main(int argc, char *argv[]) {
  // execute one run before starting the timers
  base.writeData<double>(real_ptr, rdata, sizereal);
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
+  base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
  base.readData<double>(real_res_ptr, outdata, sizereal);

-  //timer for total loop time, FFT and IFFT calls
-  struct timeval timeStart, timeEnd;
-  struct timeval timeFFTStart[loop], timeFFTEnd[loop];
-  struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
-
-  gettimeofday(&timeStart, NULL);
-  for (int i=0; i<loop; ++i){
-
-    // write data to device
-    base.writeData<double>(real_ptr, rdata, sizereal);
-
-    // execute rcfft
-    gettimeofday(&timeFFTStart[i], NULL);
-    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
-    gettimeofday(&timeFFTEnd[i], NULL);
-
-    // execute crfft
-    gettimeofday(&timeIFFTStart[i], NULL);
-    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
-    gettimeofday(&timeIFFTEnd[i], NULL);
-
-    //normalize
-#ifdef DKS_CUDA
-    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
-#endif
-
-    // read IFFT data from device
-    base.readData<double>(real_res_ptr, outdata, sizereal);

+  
+  ofstream myfile;
+  myfile.open(file_name);
+  myfile<< "in\tout\treal\timag\n";
+  for (int i = 0; i < sizereal; i++) {
+    //myfile << precision(rdata[i]) << "\t";
+    //myfile << precision(outdata[i]) << "\t";
+    if (i < sizecomp) {
+      myfile << precision(cfft[i].real()) << "\t";
+      myfile << precision(cfft[i].imag());
+    }
+    myfile << "\n";
  }
-  gettimeofday(&timeEnd, NULL);
+  myfile.close();
+  

+/*
+  if (dim == 2) {
+    for (int i = 0; i < N2; i++) {
+      for (int j = 0; j < N1; j++) {
+	cout << rdata[i*N1 + j] << " ";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+
+
+  if (dim == 2) {
+    for (int i = 0; i < N2; i++) {
+      for (int j = 0; j < N1 / 2 + 1; j++) {
+	cout << cfft[i*(N1 / 2 + 1)  + j] << " ";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+*/
  // free device memory
  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);
  base.freeMemory<double>(real_res_ptr, sizereal);

  // compare in and out data to see if we get back the same results
+  cout << "comp" << endl;
  compareData(rdata, outdata, N1, N2, N3, dim);
-
-  //calculate seconds for total time and fft times
-  double tfft = 0;
-  double tifft = 0;
-  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + 
-		  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
-
-  for (int i = 0; i < loop; i++) {
-    tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + 
-	      (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
-
-    tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + 
-	      (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
-  }
-
-  //print timing results
-  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
-	    << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
-	    << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
-	    << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
-	    << "\n\n";
+  cout << "done" << endl;

  return 0;
 }
@ -137,10 +128,10 @@ int main(int argc, char *argv[]) {
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  int id;
  double sum = 0;
-  for (int i = 0; i < NI; i++) {
+  for (int i = 0; i < NK; i++) {
    for (int j = 0; j < NJ; j++) {
-      for (int k = 0; k < NK; k++) {
-	id = k*NI*NJ + j*NI + i;
+      for (int k = 0; k < NI; k++) {
+	id = i*NI*NJ + j*NI + k;
 	sum += fabs(data1[id] - data2[id]);
      }
    }
@ -148,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
  std::cout << "RC <--> CR diff: " << sum << std::endl;
 }

-void initData(double *data, int dimsize[3]) {
-  for (int i = 0; i < dimsize[2]; i++) {
+void initData(double *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
+  } else if (dim == 2) {
    for (int j = 0; j < dimsize[1]; j++) {
      for (int k = 0; k < dimsize[0]; k++) {
-	data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
+	data[j*dimsize[0] + k] = sin(k);
      }
    }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = sin(k);
  }
 }

@ -173,10 +172,17 @@ void printHelp() {
  std::cout << std::endl;
 }

-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
+		char *api_name, char *device_name, char *file_name) 
+{

  for (int i = 1; i < argc; i++) {

+    if ( argv[i] == std::string("-dim")) {
+      dim = atoi(argv[i + 1]);
+      i++;
+    }
+
    if ( argv[i] == std::string("-grid") ) {
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
@ -193,7 +199,72 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
      printHelp();
      return true;
    }
+
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+      strcpy(file_name, "cuda_fft.dat");
+    } 
+
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+      strcpy(file_name, "opencl_fft.dat");
+    } 
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+      strcpy(file_name, "openmp_fft.dat");
+    } 
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+      strcpy(file_name, "opencl_cpu_fft.dat");
+    }
  }

  return false;
 }
+
+/**
+ * Print complex data as "(real,imag) " entries.
+ * For every j all (i, k) entries with k < N/2 + 1 are written on one line,
+ * read from data[i*N*N + j*N + k]. Parts with magnitude below 10e-5 are
+ * printed as 0. The dim argument is not used.
+ */
+void printData3DN4(complex<double>* &data, int N, int dim) {
+
+  const int half = N/2 + 1;
+
+  for (int row = 0; row < N; row++) {
+    for (int slab = 0; slab < N; slab++) {
+      const int offset = slab*N*N + row*N;
+      for (int col = 0; col < half; col++) {
+	double re = data[offset + col].real();
+	double im = data[offset + col].imag();
+
+	//suppress numerical noise around zero
+	if (-10e-5 < re && re < 10e-5)
+	  re = 0;
+	if (-10e-5 < im && im < 10e-5)
+	  im = 0;
+
+	cout << "(" << re << "," << im << ") ";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+
+}
+
+/**
+ * Print real data separated by blanks.
+ * For every j all (i, k) entries are written on one line, read from
+ * data[i*N*N + j*N + k]. Values with magnitude below 10e-5 are printed
+ * as 0. The dim argument is not used.
+ */
+void printData3DN4(double* &data, int N, int dim) {
+
+  for (int row = 0; row < N; row++) {
+    for (int slab = 0; slab < N; slab++) {
+      const int offset = slab*N*N + row*N;
+      for (int col = 0; col < N; col++) {
+	double value = data[offset + col];
+
+	//suppress numerical noise around zero
+	if (-10e-5 < value && value < 10e-5)
+	  value = 0;
+
+	cout << value << " ";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+
+}
--- a/test/testFFTSolver_MIC.cpp
+++ b/test/testFFTSolver_MIC.cpp
@ -1,5 +1,4 @@
 #include <iostream>
-//#include <mpi.h>
 #include <string.h>

 #include "DKSBase.h"
@ -11,309 +10,265 @@ using namespace std;


 void printData3D(double* data, int N, int NI, const char *message = "") {
-	if (strcmp(message, "") != 0)
-		cout << message;
+  if (strcmp(message, "") != 0)
+    cout << message;

-	for (int i = 0; i < NI; i++) {
-		for (int j = 0; j < N; j++) {
-			for (int k = 0; k < N; k++) {
-				cout << data[i*N*N + j*N + k] << "\t";
-			}
-			cout << endl;
-		}
-		cout << endl;
-	}
+  for (int i = 0; i < NI; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+	cout << data[i*N*N + j*N + k] << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }

 }

 void initData(double *data, int N) {

-	for (int i = 0; i < N/4 + 1; i++) {
-		for (int j = 0; j < N/2 + 1; j++) {
-			for (int k = 0; k < N/2 + 1; k++) {
-				data[i*N*N + j*N + k] = k+1;
-			}
-		}
-	}
+  for (int i = 0; i < N/4 + 1; i++) {
+    for (int j = 0; j < N/2 + 1; j++) {
+      for (int k = 0; k < N/2 + 1; k++) {
+	data[i*N*N + j*N + k] = k+1;
+      }
+    }
+  }
 }

 void initData2(double *data, int N) {
-	for (int i = 0; i < N; i++)
-		data[i] = i;
+  for (int i = 0; i < N; i++)
+    data[i] = i;
 }

 void initComplex( complex<double> *d, int N) {

-	for (int i = 0; i < N; i++) {
-		d[i] = complex<double>(2, 0);
-	}
+  for (int i = 0; i < N; i++) {
+    d[i] = complex<double>(2, 0);
+  }

 }

 void printComplex(complex<double> *d, int N) {

-	for (int i = 0; i < N; i++)
-		cout << d[i] << "\t";
-	cout << endl;
+  for (int i = 0; i < N; i++)
+    cout << d[i] << "\t";
+  cout << endl;
+
+}
+
+void printDouble(double *d, int N) {
+
+  for (int i = 0; i < N; i++)
+    cout << d[i] << ", ";
+  cout << endl;

 }

 void initMirror(double *data, int n1, int n2, int n3) {
-	int d = 1;
-	for (int i = 0; i < n3; i++) {
-		for (int j = 0; j < n2; j++) {
-			for (int k = 0; k < n1; k++) {
-				if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
-					data[i * n2 * n1 + j * n1 + k] = d++;
-				else
-					data[i * n2 * n1 + j * n1 + k] = 0;
-			}
-		}
-	}
+  int d = 1;
+  for (int i = 0; i < n3; i++) {
+    for (int j = 0; j < n2; j++) {
+      for (int k = 0; k < n1; k++) {
+	if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
+	  data[i * n2 * n1 + j * n1 + k] = d++;
+	else
+	  data[i * n2 * n1 + j * n1 + k] = 0;
+      }
+    }
+  }
 }

 void printDiv(int c) {
-	for (int i = 0; i < c; i++)
-		cout << "-";
-	cout << endl;
+  for (int i = 0; i < c; i++)
+    cout << "-";
+  cout << endl;

 }

 void printMirror(double *data, int n1, int n2, int n3) {

-	printDiv(75);
-	for (int i = 0; i < n3; i++) {
-		for (int j = 0; j < n2; j++) {
-			for (int k = 0; k < n1; k++) {
-				cout << data[i * n2 * n1 + j * n1 + k] << "\t";
-			}
-			cout << endl;
-		}
-		cout << endl;
-	}
-	cout << endl;
+  printDiv(75);
+  for (int i = 0; i < n3; i++) {
+    for (int j = 0; j < n2; j++) {
+      for (int k = 0; k < n1; k++) {
+	cout << data[i * n2 * n1 + j * n1 + k] << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
 }

 double sumData(double *data, int datasize) {

-	double sum = 0;
-	for (int i = 0; i < datasize; i++)
-		sum += data[i];
+  double sum = 0;
+  for (int i = 0; i < datasize; i++)
+    sum += data[i];

-	return sum;
+  return sum;
 }

 int main(int argc, char *argv[]) {

-	/* mpi init */
-	//int rank, nprocs;
-	//MPI_Init(&argc, &argv);
-	//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  char *api_name = new char[10];
+  char *device_name = new char[10];

-	/*
-	   if (nprocs != 8) {
-	   cout << "example was set to run with 8 processes" << endl;
-	   cout << "exit..." << endl;
-	   return 0;
-	   }
-	   */
+  for (int i = 1; i < argc; i++) {
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } 

-	/* set domain size */
-	int NG[3] = {64, 64, 32};
-	int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
-	int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
-	int sizerho = NG[0] * NG[1] * NG[2];
-	int sizegreen = ng[0] * ng[1] * ng[2];
-	int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
-	int id[3];
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } 

-	//id[0] = 0;
-	//id[1] = NL[1] * (rank % 4);
-	//id[2] = NL[2] * (rank / 4);
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } 

-	/* print some messages bout the example in the begginig */
-	cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
-	//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
-	cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
-	//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
-	int tmp[3];
-	/*  for (int p = 1; p < nprocs; p++) {
-		MPI_Status mpistatus;
-		MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
-		cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
-		}*/
-	// } else {
-	//   MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
-	// }
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+  }

-	/* dks init and create 2 streams */
-	int dkserr;
-	//int streamGreens, streamFFT;
-#ifdef DKS_MIC
-	DKSBase base;
-	base.setAPI("OpenMP", 6);
-	base.setDevice("-mic", 4);
-	base.initDevice();
-#endif
+  cout << "Use api: " << api_name << ", " << device_name << endl;

-#ifdef DKS_CUDA
-	DKSBase base;
-	base.setAPI("Cuda", 4);
-	base.setDevice("-gpu", 4);
-	base.initDevice();
-#endif
+  /* set domain size */
+  int NG[3] = {64, 64, 32};
+  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+  int sizerho = NG[0] * NG[1] * NG[2];
+  int sizegreen = ng[0] * ng[1] * ng[2];
+  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;

-	//base.createStream(streamFFT);
-	//if (rank == 0) {
-	//  base.createStream(streamGreens);
-	base.setupFFT(3, NG);
-	//}
+  /* print some messages bout the example in the begginig */
+  cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+  cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;

-	/* allocate memory and init rho field */
-	double *rho = new double[sizerho];
-	double *rho_out = new double[sizerho];
-	//double *green_out = new double[sizegreen];
-	initMirror(rho, NL[0], NL[1], NL[2]);
+  /* dks init and create 2 streams */
+  int dkserr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, NG);

-	/*
-	   allocate memory on device for 
-	   - rho field
-	   - rho FFT
-	   - tmpgreen
-	   - greens integral
-	   - greens integral FFT
-	   */
-	void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
-	// if (rank == 0) {
-	tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
-	rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
-	grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
-	rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-	grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-	/* } else {
-	   grntr_ptr = NULL;
-	   rho2_ptr = NULL;
-	   grn_ptr = NULL;
-	   rho2tr_ptr = NULL;
-	   tmpgreen_ptr = NULL;
-	   }*/
+  /* allocate memory and init rho field */
+  double *rho = new double[sizerho];
+  double *rho_out = new double[sizerho];
+  //double *green_out = new double[sizegreen];
+  double *mirror_out = new double[sizerho];
+  //initMirror(rho, NL[0], NL[1], NL[2]);
+  initMirror(rho, NG[0], NG[1], NG[2]);

+  /*
+    allocate memory on device for 
+    - rho field
+    - rho FFT
+    - tmpgreen
+    - greens integral
+    - greens integral FFT
+  */
+  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+  tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+  rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+  grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);

-	/* send and receive pointer to allocated memory on device */
-	/*
-	   if (rank == 0) {
-	   for (int p = 1; p < nprocs; p++)
-	   base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
-	   } else {
-	   rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
-	   }
-	   MPI_Barrier(MPI_COMM_WORLD);
-	   */
+  /* =================================================*/
+  /* =================================================*/
+  /* =====loop trough fftpoison solver iterations=====*/
+  /* =================================================*/
+  /* =================================================*/

+  double old_sum = 0;

-	/* =================================================*/
-	/* =================================================*/
-	/* =====loop trough fftpoison solver iterations=====*/
-	/* =================================================*/
-	/* =================================================*/
+  int hr_m[3] = {1, 1, 1};
+  base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);

-	double old_sum = 0;
-	double tmp_sum = 0;
-	for (int l = 0; l < 100; l++) {
-		//MPI_Barrier(MPI_COMM_WORLD);
-		/* on node 0, calculate tmpgreen on gpu */
-		int hr_m[3] = {1, 1, 1};
-		//if (rank == 0)
-		base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
-				hr_m[0], hr_m[1], hr_m[2]);
+  /* calculate greens integral on gpu */
+  base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);

-		/* calculate greens integral on gpu */
-		//if (rank == 0)
-		base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+  /* mirror the field */
+  base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+  /*
+  base.readData<double>(grn_ptr, mirror_out, sizerho);
+  for (int i = 0; i < sizerho; i++)
+    cout << mirror_out[i] << " ";
+  cout << endl << endl;

-		/* mirror the field */
-		//if (rank == 0)
-		base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+  for (int i = 0; i < sizerho; i++)
+    cout << rho[i] << " ";
+  cout << endl << endl;
+  */
+  /* transfer rho field to device */
+  base.writeData<double>(rho2_ptr, rho, sizerho);

+  /* get FFT of rho field */
+  base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);

-		/* get FFT of mirrored greens integral */
-		//if (rank == 0) 
-		base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+  /* get FFT of mirrored greens integral */
+  base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);

-		/* transfer rho field to device */
-		//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
-		base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
-		//MPI_Barrier(MPI_COMM_WORLD);
+  /* multiply both FFTs */
+  base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);

-		/* get FFT of rho field */
-		//if (rank == 0) {
-		//base.syncDevice();
-		base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-		//}
+  /*
+  complex<double> *crho = new complex<double>[sizecomp];
+  complex<double> *cgre = new complex<double>[sizecomp];
+  base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
+  base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);

-		/* multiply both FFTs */
-		//if (rank == 0)
-		base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
-		//MPI_Barrier(MPI_COMM_WORLD);
+  for (int i = 0; i < sizecomp; i++)
+    cout << cgre[i].real() << " ";
+  cout << endl << endl;

-		/* inverse fft and transfer data back */
-		/* 
-		   multiple device syncs and mpi barriers are used to make sure data 
-		   transfer is started when results are ready and progam moves on 
-		   only when data transfer is finished
-		   */
-		//if (rank == 0) {
-		base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
-		//base.syncDevice();
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
-		base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//base.syncDevice();
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//cout << "result: " << sumData(rho_out, sizerho) << endl;
-		if (l == 0) { 
-			old_sum = sumData(rho_out, sizerho);
-		} else {
-			tmp_sum = sumData(rho_out, sizerho);
-			if (old_sum != tmp_sum) {
-				cout << "diff in iteration: " << l << endl;
-			}
-		}
-		/*} else {
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  }
-		  */
+  for (int i = 0; i < sizecomp; i++)
+    cout << crho[i].real() << " ";
+  cout << endl << endl;
+  
+  delete[] crho;
+  delete[] cgre;
+  */

+  /* inverse fft and transfer data back */
+  /* 
+     multiple device syncs and mpi barriers are used to make sure data 
+     transfer is started when results are ready and progam moves on 
+       only when data transfer is finished
+  */
+  base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+  
+  base.readData<double> (rho2_ptr, rho_out, sizerho);
+  
+  for (int i = 0; i < 10; i++)
+    cout << rho_out[i] << " ";
+  cout << endl;
+  
+  old_sum = sumData(rho_out, sizerho);

-	}
 /* =================================================*/  
 /* =================================================*/
 /* ==========end fftpoison solver test run==========*/
 /* =================================================*/
 /* =================================================*/

+  base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+  base.freeMemory<double>(grn_ptr, sizerho);
+  base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+  base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+  base.freeMemory<double>(rho2_ptr, sizerho);

-
-/* free memory on device */
-//if (rank == 0) {
-base.freeMemory<double>(tmpgreen_ptr, sizegreen);
-base.freeMemory<double>(grn_ptr, sizerho);
-base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
-base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
-//MPI_Barrier(MPI_COMM_WORLD);
-base.freeMemory<double>(rho2_ptr, sizerho);
-cout << "Final sum: " << old_sum << endl;
-/*} else {
-  base.closeHandle(rho2_ptr);
-  MPI_Barrier(MPI_COMM_WORLD);
-  }*/
-
-//MPI_Finalize();
-
+  delete[] rho_out;
+  delete[] rho;
+  delete[] mirror_out;
+  cout << "Final sum: " << old_sum << endl;

 }
--- a/test/testRandom.cpp
+++ b/test/testRandom.cpp
@ -0,0 +1,81 @@
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+/**
+ * Test program for the DKS random number calls.
+ *
+ * Command line options:
+ *   -cuda     select the "Cuda" API with device "-gpu" (this is the default)
+ *   -opencl   select the "OpenCL" API with device "-gpu"
+ *   -N <n>    number of random values requested per iteration (default 10)
+ *
+ * The device vector is filled with random values five times; after each
+ * fill the values are read back to the host and printed on one line.
+ *
+ * Returns 0 on completion and 1 if the command line is invalid.
+ */
+int main(int argc, char *argv[]) {
+
+  int size = 10;
+
+  //fixed-size stack buffers, preloaded with the default API / device;
+  //every name copied into them below fits, and there is nothing to free
+  char api_name[10] = "Cuda";
+  char device_name[10] = "-gpu";
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);
+
+    if (arg == "-cuda") {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } else if (arg == "-opencl") {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } else if (arg == "-N") {
+      //the value is the next argument; never read past the end of argv
+      if (i + 1 >= argc) {
+        cerr << "Option -N requires a value" << endl;
+        return 1;
+      }
+      size = atoi(argv[++i]);
+    }
+
+  }
+
+  //atoi yields 0 for non-numeric input; a non-positive size cannot be used
+  //to allocate the host and device vectors below
+  if (size <= 0) {
+    cerr << "Number of randoms must be a positive integer" << endl;
+    return 1;
+  }
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of randoms: " << size << endl;
+
+  //init dks
+  //NOTE(review): ierr is set by allocateMemory but not checked here;
+  //confirm the DKS error convention before adding a check
+  int ierr = 0;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  //pass the length of the device name itself (previously the API name length was used)
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.callInitRandoms(size);
+
+  //create host vector to store results
+  double *host_data = new double[size];
+
+  //create device vector
+  void *device_data = base.allocateMemory<double>(size, ierr);
+
+  for (int i = 0; i < 5; i++) {
+    //fill device vector with random values
+    base.callCreateRandomNumbers(device_data, size);
+
+    //read device vector
+    base.readData<double>(device_data, host_data, size);
+
+    //print host data (distinct index name, so the iteration counter is not shadowed)
+    for (int j = 0; j < size; j++)
+      cout << host_data[j] << " ";
+    cout << endl;
+  }
+
+  //free device vector
+  base.freeMemory<double>(device_data, size);
+
+  //free host data
+  delete[] host_data;
+
+  return 0;
+}
Author	SHA1	Message	Date
Andreas Suter	9381b14b87	changed version to 1.1.4	2020-06-09 13:04:48 +02:00
Andreas Suter	43cb9020c4	adapted for CUDA 11	2020-06-09 12:55:55 +02:00
Andreas Suter	3d946f666b	added the two new muSR functions ifgk and ifll (CUDA/OpenCL).	2019-01-22 14:10:02 +01:00
Andreas Suter	e6021eb6e3	Set kernel argument size to a value > 0 For the case that map or fun is not used in the msr-file, the corresponding call to the setKernelArg still needs a 2nd argument > 0, otherwise macOS crashes.	2018-12-11 11:35:32 +01:00
Uldis Locans	24604042e7	Coulomb scattering and rot updated in cuda version	2017-09-19 13:36:42 +02:00
Andreas Suter	5c9048a308	added a macOS specific flag which deals with path variables of shared libs	2017-08-21 14:16:57 +02:00
Uldis Locans	ba701a5744	remove opencl event tracking since it causes memory leak	2017-08-18 14:11:35 +02:00
Uldis Locans	74b12ebdc0	release old OpenCL kernel before creating a new one	2017-08-18 13:13:55 +02:00
Uldis Locans	6d14df5b32	update chiSquare test functions to check the created kernel before execution	2017-08-17 16:57:30 +02:00
Uldis Locans	79833cf7f5	update work item size correctly for devices where supported size is smaller than DKS default	2017-08-17 16:56:57 +02:00
Uldis Locans	ce3491740c	remove openmp flags for apple machines	2017-08-17 16:54:07 +02:00
Uldis Locans	fc6d2ccd4a	update doxygen HTML output	2017-08-10 15:50:25 +02:00
Uldis Locans	d1774b84e8	added doxygen files	2017-08-10 14:58:12 +02:00
Uldis Locans	ccc4329bef	updated documentation	2017-08-10 14:57:48 +02:00
Uldis Locans	7ca93a3a49	dont link with boost filesystem	2017-06-07 11:10:38 +02:00
Uldis Locans	aa14065994	Merge branch 'master' of gitlab.psi.ch:uldis_l/DKS	2017-06-06 14:24:20 +02:00
Uldis Locans	50ecb31042	remove any OpenCL calls when DKS compiled without OpenCL	2017-06-06 14:23:24 +02:00
Uldis Locans	3d130aa01f	write CUDA libraries that are needed for linking in config file	2017-06-06 14:22:34 +02:00
Uldis Locans	5071ea5741	add option to link DKS with static cuda libraries	2017-06-06 14:22:00 +02:00
Uldis Locans	efe5f0db38	remove nvToolsExt for manual profiling	2017-05-30 11:07:22 +02:00
Uldis Locans	1d420504cc	fix FFT test when OPAL and/or Musr modules are not compiled	2017-05-30 11:05:12 +02:00
Uldis Locans	cc59f550ab	move patch version forward	2017-05-29 13:29:08 +02:00
Uldis Locans	d20fea2caa	readme updated	2017-05-29 13:28:23 +02:00
Uldis Locans	8b7d824b3a	updated documentation	2017-05-29 13:21:34 +02:00
Uldis Locans	2c9fe4ea6f	fix for seperate FFT	2017-05-29 13:18:37 +02:00
Uldis Locans	e32f9aaff2	include FFT in DKSOPAL and DKSBaseMuSR	2017-05-29 12:49:32 +02:00
Uldis Locans	f3527969cb	seperate FFT from DKSOPAL	2017-05-29 09:39:25 +02:00
Uldis Locans	cadd258668	update DKS version	2017-04-27 18:15:48 +02:00
Uldis Locans	a94ed9f3b8	Update CUDA particle matter interaction according to latest changes in opal	2017-04-27 18:15:35 +02:00
Uldis Locans	61919ae53c	add enablerutherford scattering flag to opal collimatorphysics	2017-04-27 10:54:02 +02:00
Uldis Locans	432d2f5e4e	Correct patch and minor versions set when installing dks	2017-04-05 17:28:13 +02:00
Uldis Locans	6e57cf5580	DKSConfigVersion file provided to allow cmake find_package to determine if compatible DKS version is installed	2017-03-15 15:18:58 +01:00
Uldis Locans	9e9a20c4af	add tests for collimator physics and kick/push	2017-03-15 10:02:00 +01:00
Uldis Locans	9b9910e9f9	add seed to random number initialization	2017-03-15 10:01:28 +01:00
Uldis Locans	244ad0b230	OPALs greens function added to algorithms	2017-03-15 10:00:40 +01:00
Uldis Locans	9734157ad8	push and kick for OPALs BorisPusher	2017-03-15 09:59:33 +01:00
Uldis Locans	24f9c9dbf4	allow to enalbe wich parts of DKS to compile trough cmake flags	2017-02-28 15:11:59 +01:00
Uldis Locans	47fe8e8e52	allow to enalbe wich parts of DKS to compile trough cmake flags	2017-02-28 15:07:22 +01:00
Uldis Locans	eee9dfd89e	seperate OPAL DKS functions from base	2017-02-28 15:06:45 +01:00
Uldis Locans	7c7c2e240b	dksbase use Algoruthms base class for fft, colimator physics and greens function	2017-02-15 09:00:55 +01:00
Uldis Locans	e9d411235c	merge dks-1.0.0-branch (fix conflicts)	2017-02-07 09:57:04 +01:00
Uldis Locans	b5c5da29b2	add function to generate list of random numbers with cuda and opencl on the device	2016-12-09 13:43:09 +01:00
Uldis Locans	3a74d6cdee	init randoms - each thread gets same seed but different sequence	2016-11-28 16:23:29 +01:00
Uldis Locans	b3a12e02a8	OpenCL FFT using clfft and tests	2016-11-21 16:14:11 +01:00
Uldis Locans	4432d32480	OpenCL greens function calculation for OPAL	2016-11-17 18:03:28 +01:00
Uldis Locans	63a008d111	Greens function calculation for OPAL rewriten with abstract base class	2016-11-17 18:02:48 +01:00
Uldis Locans	87cdf52f07	Collimator physics for MIC fix	2016-11-16 18:12:17 +01:00
Uldis Locans	027bdc01f5	FFT for OpenCL using clFFT library	2016-11-16 18:12:00 +01:00