Coulomb scattering and rot updated in cuda version

added a macOS specific flag which deals with path variables of shared libs
remove opencl event tracking since it causes memory leak
2017-09-19 13:36:42 +02:00 · 2017-08-21 14:16:57 +02:00 · 2017-08-18 14:11:35 +02:00 · 2017-08-18 13:13:55 +02:00 · 2017-08-17 16:57:30 +02:00 · 2017-08-17 16:56:57 +02:00
57 changed files with 4019 additions and 1053 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
 SET (DKS_VERSION_MINOR 1)
-SET (DKS_VERSION_PATCH 1)
+SET (DKS_VERSION_PATCH 2)
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -10,6 +10,9 @@ SET (PACKAGE_NAME \"DKS\")
 SET (PACKAGE_TARNAME \"dks\")
 SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
 SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+if (APPLE)
+  SET (CMAKE_MACOSX_RPATH TRUE)
+endif (APPLE)

 #get compiler name
 #STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
@ -28,11 +31,13 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
 set (BOOSTROOT $ENV{BOOST_DIR})
 SET (Boost_USE_STATIC_LIBS OFF)
 SET (Boost_USE_STATIC_RUNTIME OFF)
-FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
+#FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
+FIND_PACKAGE(Boost 1.41 REQUIRED)
 IF (Boost_FOUND)
+  MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
  MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
  MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
-  MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
+  #MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
  INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
 ENDIF (Boost_FOUND)
@ -79,7 +84,7 @@ OPTION (USE_UQTK "Use UQTK" OFF)
 IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)

  #for intel compiler turn on openmp and opencl
-  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_OPENCL "Use OpenCL" OFF)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" ON)
  
@ -110,18 +115,30 @@ IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
 ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)

 #gnu copmpiler specific flags
-IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
+IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
  
  
-  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_OPENCL "Use OpenCL" OFF)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" OFF)
+  OPTION (STATIC_CUDA "Link static cuda libraries" OFF)

+  IF (ENABLE_MUSR)
+    SET (USE_OPENCL ON)
+  ENDIF (ENABLE_MUSR)
+
+  #dont set openmp flag for apple devices
+  IF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
+  ELSE ($CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -std=c++11 -D__wsu") 
+  ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+    

  FIND_PACKAGE(CUDA)
  IF (CUDA_FOUND)
    SET (USE_CUDA ON)
+    OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -131,20 +148,27 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

-    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
-    SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
-    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}  -DDEBUG -std=c++11 -D__wsu")    
-    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
+    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
+
+    IF (NOT STATIC_CUDA)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
+      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
+    ELSE (NOT STATIC_CUDA) 
+      SET (CUDA_SEPARABLE_COMPILATION ON)
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
+      SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
+      SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
+    ENDIF (NOT STATIC_CUDA)

    #if cuda version >= 7.0 add runtime commpilation flags
-    IF (NOT CUDA_VERSION VERSION_LESS "7.0")
+    IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
-    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
+    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
    
    MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
    
    SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
-    #set(CUDA_SEPARABLE_COMPILATION ON)
    SET(BUILD_SHARED_LIBS OFF)

  ENDIF (CUDA_FOUND)
@ -154,6 +178,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE(STATUS "CUDA not found, looking for OpenCL")

    FIND_PACKAGE(OpenCL)
+    MESSAGE("after FIND_PACKAGE(OpenCL): version: ${OpenCL_VERSION_STRING}")
+    MESSAGE("after FIND_PACKAGE(OpenCL): inc dir: ${OpenCL_INCLUDE_DIR}")
+    MESSAGE("after FIND_PACKAGE(OpenCL): lib dir: ${OpenCL_LIBRARY}")
    IF (OpenCL_FOUND)
      MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
      MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
@ -171,9 +198,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
  ENDIF(APPLE AND NOT CUDA_FOUND)

  #if cuda found set cuda opencl flags
-  IF (CUDA_FOUND)
+  IF (CUDA_FOUND AND USE_OPENCL)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
-  ENDIF (CUDA_FOUND)
+  ENDIF (CUDA_FOUND AND USE_OPENCL)

  #if cuda not found but amd opencl found set opencl flags
  IF (NOT CUDA_FOUND AND OpenCL_FOUND)
@ -185,7 +212,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
  ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")

-ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
+ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)

 SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
 MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
--- a/2356
+++ b/2356
--- a/ReadMe.first
+++ b/ReadMe.first
@ -29,30 +29,30 @@ Intel MIC compilers (optional)
 ######Source######
 https://gitlab.psi.ch/uldis_l/DKS

+######Changes from DKS-1.0.x version######
+DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
+By default only DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary otion needs to be enabled.
+Supported options are:
+-DENABLE_OPAL option should be enabled if DKS will be used for OPAL
+-DENABLE_MUSR option should be enable if DKS will be used for musrfit
+-DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
+
+See install instructions for more details on how to enable the necessary options in DKS
+
 ######Install######
+#consult the https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install isntructions

 #clone DKS
 git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS

-#set compilers to use
-#supported c++ compilers: g++, icpc, mpicxx whith g++
-#supported c compilers: gcc, icc, mpicc whith gcc
-export CXX_COMPILER=cpp_compiler_name
-export CC_COMPILER=c_compiler_name
+#switch to the desired version (OPTIONAL)
+git checkout DKS-1.1.0

-#set dks root directory directory
-cd DKS
-export DKS_ROOT = $PWD
-
-#set build directory
-mkdir $DKS_BUILD_DIR
-cd $DKS_BUILD_DIR
-
-#set install directory
-export DKS_INSTALL_DIR = $DKS_BUILD_DIR #default is /usr/local/
-
-CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
+#configure installation in build directory
+#enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
+CXX=<c++ compiler> CC=<c compiler> -DCMAKE_INSTALL_PREFIX=<install dir> <path to DKS source> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1] 

+#install DKS
 make
 make install

--- a/auto-tuning/CMakeLists.txt
+++ b/auto-tuning/CMakeLists.txt
@ -4,28 +4,30 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
 #chi square kernel tests
 IF (ENABLE_MUSR)
  ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
-  TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+  TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})

  ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
-  TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+  TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})

  IF (USE_UQTK)
    ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
-    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
+    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
  ENDIF (USE_UQTK)
-  #TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})

  #test to verify search functions
  ADD_EXECUTABLE(testSearch testSearch.cpp)
-  TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+  TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
 ENDIF (ENABLE_MUSR)

 IF (ENABLE_OPAL)
  ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
-  TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+  TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})

  ADD_EXECUTABLE(testPushKick testPushKick.cpp)
-  TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+  TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
 ENDIF(ENABLE_OPAL)

+ADD_EXECUTABLE(testFFT testFFT.cpp)
+TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})
+

--- a/auto-tuning/testChiSquareRT.cpp
+++ b/auto-tuning/testChiSquareRT.cpp
@ -293,6 +293,9 @@ int runTest(const char *api_name, const char *device_name, bool autotune, bool m
      if (autotune)
 	dksbase.setAutoTuningOn();

+      //check kernel
+      dksbase.checkMuSRKernels(1);
+      
      //tmp values to store results and tmp values for time steps and start time
      double result_gpu = 0.0;
      double result_cpu = 0.0;
@ -373,11 +376,11 @@ int main(int argc, char* argv[]) {

  }

-  int numPlatforms = 2;
+  int numPlatforms = 3;
  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};

-  for (int i = 0; i < numPlatforms; i++) {
+  for (int i = 2; i < numPlatforms; i++) {
    runTest(api[i], device[i], autotune, mlh, asym);
  }

--- a/auto-tuning/testChiSquareRTRandom.cpp
+++ b/auto-tuning/testChiSquareRTRandom.cpp
@ -392,7 +392,10 @@ int main(int argc, char *argv[]) {
  dksbase.setAPI(api_name);
  dksbase.setDevice(device_name);

+  std::cout << "Init device" << std::endl;
  dksbase.initDevice();
+  
+  std::cout << "Init chi square" << std::endl;
  dksbase.initChiSquare(Ndata, np, nf, nm);

  dksbase.writeParams(p, np);
@ -401,20 +404,24 @@ int main(int argc, char *argv[]) {

  dksbase.callSetConsts(N0, TAU, BKG);

+  std::cout << "Compile program" << std::endl;
  dksbase.callCompileProgram(sfunc);
  
+  dksbase.checkMuSRKernels(1);
+
  if (autotune) 
    dksbase.setAutoTuningOn();

-  int oper = 0;
-  dksbase.getOperations(oper);
+  //std::cout << "Get operations" << std::endl;
+  //int oper = 0;
+  //dksbase.getOperations(oper);

  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Number of params: " << np << endl;
  cout << "Number of maps: " << nm << endl;
  cout << "Number of predefined functions: " << nfunc << endl;
-  cout << "Number of ptx instructions: " << oper << endl;
+  //cout << "Number of ptx instructions: " << oper << endl;
  cout << "------------------------------------------------------------" << endl;
  cout << sfunc << endl;
  cout << "------------------------------------------------------------" << endl;
--- a/auto-tuning/testFFT.cpp
+++ b/auto-tuning/testFFT.cpp
@ -0,0 +1,214 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSFFT.h"
+
+using namespace std;
+
+void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
+void compareData(double* data1, double *data2, int N, int dim);
+
+void initData(complex<double> *data, int dimsize[3], int dim);
+void initData(double *data, int dimsize[3], int dim);
+
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim, 
+		char *api_name, char *device_name);
+
+void printHelp();
+
+int main(int argc, char *argv[]) {
+
+  int ierr;
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 8;
+  int dim = 3;
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+
+  if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
+    return 0;
+
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N1, N2, N3};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
+
+  double *rdata = new double[sizereal];
+  double *ordata = new double[sizereal];
+  complex<double> *cdata = new complex<double>[sizereal];
+  complex<double> *codata = new complex<double>[sizereal];
+
+  initData(rdata, dimsize, 3);
+  initData(cdata, dimsize, 3);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  DKSFFT base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  cout << "init device" << endl;
+  base.initDevice();
+  cout << "setup fft" << endl;
+  base.setupFFT(dim, dimsize);
+
+  //Test RC FFT -> CR FFT
+  void *real_ptr, *comp_ptr, *res_ptr;
+  cout << "allocate memory" << endl;
+  real_ptr = base.allocateMemory<double>(sizereal, ierr);
+  res_ptr = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+  cout << "write data" << endl;
+  base.writeData<double>(real_ptr, rdata, sizereal);
+
+  cout << "perform fft" << endl;
+  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
+  base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
+
+  cout << "read data" << endl;
+  base.readData<double>(res_ptr, ordata, sizereal);
+
+  compareData(rdata, ordata, N1, 3);
+
+  base.freeMemory<double>(real_ptr, sizereal);
+  base.freeMemory<double>(res_ptr, sizereal);
+  base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+
+  //Test CC FFT
+  void *mem_ptr;
+  mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
+  base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
+  base.callFFT(mem_ptr, 3, dimsize);
+  base.callIFFT(mem_ptr, 3, dimsize);
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+  base.readData< complex<double> >(mem_ptr, codata, sizereal);
+
+  compareData(cdata, codata, N1, 3);
+
+  base.freeMemory< complex<double> > (mem_ptr, sizereal);
+
+  delete[] rdata;
+  delete[] ordata;
+  delete[] cdata;
+  delete[] codata;
+
+}
+
+void compareData(complex<double>* data1, complex<double>* data2, int N, int dim) {
+  int ni, nj, nk, id;
+  ni = (dim > 2) ? N : 1;
+  nj = (dim > 1) ? N : 1;
+  nk = N;
+  double sum = 0;
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < nk; k++) {
+	id = i*ni*ni + j*nj + k;
+	sum += fabs(data1[id].real() - data2[id].real());
+	sum += fabs(data1[id].imag() - data2[id].imag());
+      }
+    }
+  }
+  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
+}
+
+void compareData(double* data1, double* data2, int N, int dim) {
+  int ni, nj, nk, id;
+  ni = (dim > 2) ? N : 1;
+  nj = (dim > 1) ? N : 1;
+  nk = N;
+  double sum = 0;
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < nk; k++) {
+	id = i*ni*ni + j*nj + k;
+	sum += fabs(data1[id] - data2[id]);
+      }
+    }
+  }
+  cout << "Size " << N << " RC <--> CR diff: " << sum << endl;
+}
+
+void initData(complex<double> *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
+  } else if (dim == 2) {
+    for (int j = 0; j < dimsize[1]; j++) {
+      for (int k = 0; k < dimsize[0]; k++) {
+	data[j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
+      }
+    }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = complex<double>(sin(k), 0.0);
+  }
+}
+
+void initData(double *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
+  } else if (dim == 2) {
+    for (int j = 0; j < dimsize[1]; j++) {
+      for (int k = 0; k < dimsize[0]; k++) {
+	data[j*dimsize[0] + k] = sin(k);
+      }
+    }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = sin(k);
+  }
+}
+
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
+		char *api_name, char *device_name) 
+{
+
+  for (int i = 1; i < argc; i++) {
+
+    if ( argv[i] == std::string("-dim")) {
+      dim = atoi(argv[i + 1]);
+      i++;
+    }
+
+    if ( argv[i] == std::string("-grid") ) {
+      N1 = atoi(argv[i + 1]);
+      N2 = atoi(argv[i + 2]);
+      N3 = atoi(argv[i + 3]);
+      i += 3;
+    }
+
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } 
+
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } 
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } 
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+  }
+
+  return false;
+}
+
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -3,5 +3,7 @@ SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
 SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 SET(${PROJECT_NAME}_LIBRARY "dks")
 SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
+SET(DKS_CUDA_STATIC ${STATIC_CUDA})
+SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
 SET(DKS_VERSION ${DKS_VERSION})
 SET(DKS_VERSION_STR ${DKS_VERSION_STR})
--- a/doc/refman.pdf
+++ b/doc/refman.pdf
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@ -15,6 +15,9 @@

 class DKSBaseMuSR;

+/** 
+ * Interface to implement ChiSquareRuntime class for musrfit.
+ */
 class ChiSquareRuntime {
  friend class DKSBaseMuSR;

@ -63,23 +66,54 @@ public:
  /** Default constructor */
  //ChiSquareRuntime();

-  /** Default destructor */
+  /** Default destructor. */
  virtual ~ChiSquareRuntime() { };

+  /**
+   * Compile GPU programm generated at runtime.
+   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;
+
+  /**
+   * Launche the compiled chiSquare kernel.
+   */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;

+  /** 
+   * Write the parameter values to the GPU.
+   */
  virtual int writeParams(const double *params, int numparams) = 0;
+
+  /**
+   * Write the function values to the GPU.
+   */
  virtual int writeFunc(const double *func, int numfunc) = 0;
+
+  /**
+   * Write map values to the GPU.
+   */
  virtual int writeMap(const int *map, int nummap) = 0;
+
+  /**
+   * Allocate temporary memory needed for the chi square calucaltios on the device.
+   */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
+
+  /**
+   * Free device memory allocated for chi square calculations.
+   */
  virtual int freeChiSquare() = 0;
+
+  /**
+   * Check if available device can run the chi square GPU code.
+   */
  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;

-  /** Set N0, tau and bgk values to use for the kernel.
+  /** 
+   * Set N0, tau and bgk values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -91,7 +125,8 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set alpha and beta values to use for the kernel.
+  /** 
+   * Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -101,7 +136,8 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set number of blocks and threads.
+  /** 
+   * Set number of blocks and threads.
   * Used to set parameters obtained from auto-tuning
   */
  int setKernelParams(int numBlocks, int blockSize) {
@ -118,7 +154,8 @@ public:
    return ierr;
  }

-  /** Get the number of operations in compiled kernel.
+  /** 
+   * Get the number of operations in compiled kernel.
   * Count the number of operation in the ptx file for the compiled program.
   */
  int getOperations(int &oper) {
--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@ -5,6 +5,9 @@
 #include <string>
 #include "../DKSDefinitions.h"

+/**
+ * Interface to impelment particle matter interaction for OPAL.
+ */
 class DKSCollimatorPhysics {

 protected:
@ -16,28 +19,60 @@ public:
  
  virtual ~DKSCollimatorPhysics() { }

+  /** 
+   * Execute collimator physics kernel.
+   *
+   */  
  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
 				bool enableRutherforScattering = true) = 0;

+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
  
+  /** 
+   * Sort particle array on GPU.
+   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
+   * array so these particles are at the end of array
+   */
  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * BorisPusher push function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;

+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 

+  /** 
+   * BorisPusher push function with transformto function form OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@ -6,12 +6,21 @@

 #include "../DKSDefinitions.h"

-class DKSFFT {
+/**
+ * Abstract class defining methods for DKS FFT class.
+ * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
+ */
+class BaseFFT {

 protected:
  int defaultN[3];
  int defaultNdim;

+  /**
+   * Check if FFT plan is created for the needed dimension and FFT size.
+   * Returns true if the plan has been created and false if no plan for specified dimension
+   * and size exists.
+   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
@ -22,20 +31,59 @@ protected:

 public:

-  virtual ~DKSFFT() { }
+  virtual ~BaseFFT() { }

+  /** Setup FFT - init FFT library used by chosen device. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
+
+  /** Setup real to complex FFT - init FFT library used by chosen device. */
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Clean up. */
  virtual int destroyFFT() = 0;
+
+  /** 
+   * Exectute C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
+
+  /** 
+   * Exectute inverse C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /**
+   * Normalize the FFT or IFFT.
+   * mem_ptr - memory to complex data.
+   */
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /** 
+   * Exectute R2C FFT.
+   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
+   * results for the FFT are stored as complex numbers.
+   */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /** 
+   * Exectute C2R FFT.
+   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
+   */
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /**
+   * Normalize CR FFT.
+   */
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;

 };
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@ -4,24 +4,27 @@
 #include <iostream>
 #include <cmath>

+/**
+ * Interface to implement Greens function calculations for OPAL.
+ */
 class GreensFunction {

 public:
  
  virtual ~GreensFunction() { }

-  /** calc greens integral, as defined in OPAL */
+  /** calc greens integral, as defined in OPAL. */
  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;

-  /** integration if rho2_m, see OPAL for more details */
+  /** integration if rho2_m, see OPAL for more details. */
  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;

-  /** mirror rho2_m field */
+  /** mirror rho2_m field. */
  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;

-  /** multiply two complex fields from device memory */
+  /** multiply two complex fields from device memory. */
  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;

 };
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@ -5,17 +5,22 @@

 #define BLOCK_SIZE 128

+/** Struct to hold voxel position for PET image. */
 struct VoxelPosition {
  float x;
  float y;
  float z;
 };

+/** Struct that holds pair of detectors that registered an envent. */
 struct ListEvent {
  unsigned detA : 16;
  unsigned detB : 16;
 };

+/**
+ * Interface to implement PET image reconstruction.
+ */
 class ImageReconstruction {

 protected:
@ -25,7 +30,8 @@ public:

  virtual ~ImageReconstruction() { }
  
-  /** Caluclate source.
+  /** 
+   * Caluclate source.
   *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
   *  that are inside this sphere. All the sphere used have the same diameter.
   */
@ -33,7 +39,8 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;

-  /** Calculate background.
+  /** 
+   * Calculate background.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;

-  /** Caluclate source using differente sources.
+  /** 
+   * Caluclate source using differente sources.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * each sphere is given by *diameter array.
@ -52,7 +60,7 @@ public:
 			       int total_sources, int start = 0) = 0;

  /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
   * smaller sphere.
@ -61,7 +69,8 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;

-  /** Generate normalization.
+  /** 
+   * Generate normalization.
   * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
   * that updates voxel values in the image on the slope between these two detectors.
   */
@ -69,14 +78,16 @@ public:
 				 void *det_position, int total_det) = 0; 


-  /** Calculate forward projection.
+  /** 
+   * Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;

-  /** Calculate backward projection.
+  /** 
+   * Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
@ -84,29 +95,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;

-  /** Set the voxel dimensins on device.
-   * 
+  /** 
+   *Set the voxel dimensins on device. 
   */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;

-  /** Set the image edge variables on the device.
-   * 
+  /** 
+   * Set the image edge variables on the device.
   */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;

-  /** Set the image edge1 on the device.
-   * 
+  /** 
+   * Set the image edge1 on the device.
   */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;

-  /** Set the minimum crystan in one ring values on the device.
-   * 
+  /** 
+   * Set the minimum crystan in one ring values on the device.
   */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;

-  /** Set all other required parameters for reconstruction.
-   * 
+  /** 
+   * Set all other required parameters for reconstruction.
   */
  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@ -18,6 +18,17 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * DKS autotuning class, allows to auto-tune the defince function.
+ * Executes the defined function for auto-tuning and searches for optimal parameters to improve
+ * the function execution time. The function that is auto-tuned, parameters and the ranges
+ * need to be set. Includes multiple search methods, that searches the parameter space to finde 
+ * the optimal solution.
+ *  1) exaustive search
+ *  2) line search
+ *  3) hill climbimg
+ *  4) simulated annealing
+ */
 class DKSAutoTuning {

 private:
@ -36,10 +47,11 @@ private:

  int loops_m;

-  /** Update parameters from a state */
+  /** Update parameters from a state. */
  int setParameterValues(States states);

-  /** Evaluate the function and set execution time 
+  /** 
+   * Evaluate the function and set execution time 
   * Returns DKS_ERROR if errors occured during function execution. 
   * Returns DKS_SUCCESS if function executed as planned. 
   */
@ -50,10 +62,11 @@ public:
  /** Constructor */
  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);

-  /** Destructor */
+  /** Destructor. */
  ~DKSAutoTuning();

-  /** Set function to auto tune.
+  /** 
+   * Set function to auto tune.
   * Caller of setFunction is responsible to bind the correct parameters 
   * to the function with std::bind.
   */
@ -63,13 +76,19 @@ public:
    evaluate_time_m = evaluate_time;
  }

+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
+   */
  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
    fd_m = f;
    function_name_m = name;
    evaluate_time_m = evaluate_time;
  }

-  /** Set parameter for auto tuning.
+  /** 
+   * Set parameter for auto tuning.
   * Provide a pointer to a parameter that will be changed during auto-tuning
   * and a min-max value for this element
   */
@ -85,9 +104,9 @@ public:
  /** Perform exaustive search evaluating all the parameter configurations */
  void exaustiveSearch();

-  /** Perform auto-tuning.
-   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
-   *  parameters constant.
+  /**
+   * Perform line-search auto-tuning by variying parameters one at a time.
+   * After one parameter is auto-tuned the next on is varied
   */
  void lineSearch();  

--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@ -4,6 +4,7 @@
 #include <iostream>
 #include <cmath>

+/** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {

  friend class DKSBaseMuSR;
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@ -1,9 +1,3 @@
-/** Class to save and load DKS autotunning configs.
- * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
- * Uses boost xml_parser to read and write the xml file and boost property tree to store
- * the xml content.
- */
-
 #ifndef DKS_CONFIG
 #define DKS_CONFIG

@ -11,7 +5,7 @@
 #include <boost/optional/optional.hpp>
 #include <boost/property_tree/xml_parser.hpp>
 #include <boost/foreach.hpp>
-#include <boost/filesystem.hpp>
+//#include <boost/filesystem.hpp>
 #include <string>
 #include <iostream>
 #include <cstdlib>
@ -24,11 +18,18 @@
 #include "../DKSDefinitions.h"

 namespace pt = boost::property_tree;
-namespace fs = boost::filesystem;
+//namespace fs = boost::filesystem;

 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";

+/** Class to save and load DKS autotunning configs.
+ * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
+ * Uses boost xml_parser to read and write the xml file and boost property tree to store
+ * the xml content.
+ * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
+ * so the auto-tuning has no effect.
+ */
 class DKSConfig {

 private:
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@ -9,6 +9,9 @@

 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

+/** 
+ * Parameter class allows to change the searchable parameters during the auto-tuning.
+ */
 class Parameter {

 private:
@ -64,6 +67,10 @@ public:

 };

+/**
+ * Struct to hold a auto-tuning state.
+ * Holds the current value, min, max and a step to witch a state can change.
+ */ 
 struct State {
  double value;
  double min;
@ -74,6 +81,12 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * Used by auto-tuning search algorithms to move between parameter configurations.
+ * Allows to move from one parameter stat to another, get neighboring states, 
+ * move to neighboring states and save state information. Print functions are available
+ * for debugging purposes, to follow how algorithm muves between sates.
+ */
 class DKSSearchStates {

 private:
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -35,12 +35,12 @@ ENDMACRO ()
 SET (DKS_BASEDIR_HDRS
  DKSBase.h
  DKSDefinitions.h
-  DKSOPAL.h
+  DKSFFT.h
  )

 SET (DKS_BASEDIR_SRCS
  DKSBase.cpp
-  DKSOPAL.cpp
+  DKSFFT.cpp
  )

 #add opal to DKS if enable_opal is set
@ -112,26 +112,18 @@ IF (USE_CUDA)
  CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
  CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})

-  IF (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-    TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  ELSE (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks cudadevrt)
-    TARGET_LINK_LIBRARIES(dksshared cudadevrt)
-  ENDIF (USE_UQTK)
+  TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
+  TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
+  #TARGET_LINK_LIBRARIES(dks)
+  #TARGET_LINK_LIBRARIES(dksshared)

 ELSE (USE_CUDA)
  MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
  ADD_LIBRARY(dks ${DKS_SRCS})
  ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})

-  IF (USE_UQTK)
-    TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-    TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
-  ELSE (USE_UQTK)
  TARGET_LINK_LIBRARIES(dks)
  TARGET_LINK_LIBRARIES(dksshared)
-  ENDIF(USE_UQTK)

 ENDIF (USE_CUDA)

--- a/src/CUDA/CMakeLists.txt
+++ b/src/CUDA/CMakeLists.txt
@ -1,9 +1,9 @@
-SET (_HDRS CudaBase.cuh)
-SET (_SRCS CudaBase.cu)
+SET (_HDRS CudaBase.cuh CudaFFT.cuh)
+SET (_SRCS CudaBase.cu CudaFFT.cu)

 IF (ENABLE_OPAL)
-  SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
-  SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu)
+  SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
+  SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
 ENDIF (ENABLE_OPAL)

 IF (ENABLE_MUSR)
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
 	
  return DKS_SUCCESS;
 }
-		
-/*
-  Info: allcate memory and write data (push)
-  Return: pointer to memory object
-*/
-/*
-  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
-
-  void * mem_ptr;
-  mem_ptr = cuda_allocateMemory(size, ierr);
-	
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_writeData(mem_ptr, in_data, size);
-		
-  return mem_ptr;
-  }
-*/
-		
-/*
-  Info: read data and free memory (pull)
-  Return: success or error code
-*/
-/*
-  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
-
-  ierr = cuda_readData(mem_ptr, out_data, size);
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_freeMemory(mem_ptr);	
-  else
-  return DKS_ERROR;
-	
-	
-  if (ierr == DKS_SUCCESS)	
-  return DKS_SUCCESS;
-  else
-  return DKS_ERROR;
-  }
-*/
-
-/*
-  Info: execute function
-  Return: success or error code
-*/
-int CudaBase::cuda_executeFunction() {
-
-  std::cout << "Execute function" << std::endl;
-  return DKS_SUCCESS;
-}
-		
-/*
-  Info: clean up
-  Return: success or error code
-*/
-int CudaBase::cuda_cleanUp() {
-
-  std::cout << "clean up" << std::endl;
-  return DKS_SUCCESS;
-	
-}
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@ -12,11 +12,15 @@
 #include <cufft.h>
 #include <cublas_v2.h>
 #include <curand_kernel.h>
-#include <nvToolsExt.h>
 #include <time.h>

 #define BLOCK_SIZE 128

+/**
+ * CUDA base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement, data transfers and stream setup for 
+ * asynchronous data transfers and kernel executions.
+ */
 class CudaBase {

 private:
@ -53,13 +57,13 @@ public:
   */
  int cuda_deleteCurandStates();

-  /** Create 'size' random numbers on the device and save in mem_ptr array
-   *
+  /** 
+   * Create 'size' random numbers on the device and save in mem_ptr array.
   */
  int cuda_createRandomNumbers(void *mem_ptr, int size);

-  /** Get a pointer to curand states
-   *
+  /** 
+   * Get a pointer to curand states.
   */
  curandState* cuda_getCurandStates();
 	
@ -76,93 +80,98 @@ public:
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);

  /**
-   * delete cuda stream
+   * delete cuda stream.
   * success or error code
   */
  int cuda_deleteStream(int id);

  /**
-   * delete all streams
+   * delete all streams.
   * success or error code
   */
  int cuda_deleteStreams();

  /**
-   * set stream to use
+   * set stream to use.
   * success or error code
   */
  int cuda_setStream(int id);

  /**
-   * Info: get stream that is used
+   * get stream that is used.
   * Return: return id of curretn stream
   */
  int cuda_getStreamId();

  /**
-   * Info: reset to default stream
+   * reset to default stream.
   * Return: success or error code
   */
  int cuda_defaultStream();

  /**
-   * Info: get number of streams
+   * get number of streams.
   * Return: success or error code
   */
  int cuda_numberOfStreams();

  /**
-   * Info: get stream
+   * get stream.
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);

  /**
-   * Get default cublass handle
+   * Get default cublass handle.
   */
  cublasHandle_t cuda_getCublas();

  /**
-   * Info: get information on cuda devices
+   * get information on cuda devices.
   * Return: success or error code
   */
  int cuda_getDevices();

-  /** Get CUDA device count.
+  /** 
+   * Get CUDA device count.
   * Sets the number of devices on the platform that can use CUDA.
-   *  Returns DKS_SUCCESS
   */
  int cuda_getDeviceCount(int &ndev);

-  /** Get the name of the device.
+  /** 
+   * Get the name of the device.
   * QUery the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);

-  /** Set CUDA device to use.
-   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
+  /** 
+   * Set CUDA device to use.
+   * If device passed in is larger than the number of devices use 
+   * the default:0 and return DKS_ERROR 
   */
  int cuda_setDevice(int device);

-  /** Get unique devices
+  /** 
+   * Get unique devices.
   * Get array of indeces with the unique CUDA devices available on the paltform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);
 	
  /**
-   * Info: init device
+   * Initialize connection to the device.
+   * Only needed when runtime compilation is used.
   * Return: success or error code
   */
  int cuda_setUp();
 	
  /**
-   * Info: allocate memory on cuda device
+   * Allocate memory on cuda device.
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);

  /**
-   * Info: allocate host memory in pinned memory
+   * Allocate host memory in pinned memory
   * Return: success or error code
   */
  template<typename T>
@ -175,7 +184,8 @@ public:
    return DKS_SUCCESS;
  }		

-  /** Zero CUDA memory.
+  /** 
+   * Zero CUDA memory.
   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
@ -190,7 +200,8 @@ public:
    return DKS_SUCCESS;
  }

-  /** Zero CUDA memory.
+  /** 
+   * Zero CUDA memory async.
   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
@ -210,7 +221,7 @@ public:
  }

  /** 
-   * Info: write data to memory
+   * Write data to memory
   * Retrun: success or error code
   */
  template<typename T>
@ -227,7 +238,7 @@ public:
  }

  /**
-   * Info: write data assynchonuously
+   * Write data assynchonuously
   * Return: success or error code
   */
  template<typename T>
@ -259,7 +270,7 @@ public:
  }
 		
  /**
-   * Info: read data from memory
+   * Read data from memory
   * Return: success or error code
   */
  template<typename T>
@ -276,7 +287,7 @@ public:
  }

  /**
-   * Info: read data async from device memory
+   * Read data async from device memory
   * Return: success or error code
   */
  template<typename T>
@ -308,19 +319,19 @@ public:
  }
 		
  /**
-   * Info: free memory on device
+   * Free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);
 	
  /**
-   * Info: free page locked memory on host
+   * Free page locked memory on host
   * Return: success or erro code
   */
  int cuda_freeHostMemory(void * mem_ptr);
 	
  /**
-   * Info: allcate memory and write data (push)
+   * Allcate memory and write data (push)
   * Return: pointer to memory object
   */
  template<typename T>
@ -336,7 +347,7 @@ public:
  }
 		
  /**
-   * Info: read data and free memory (pull)
+   * Read data and free memory (pull)
   * Return: success or error code
   */
  template<typename T>
@ -356,19 +367,8 @@ public:
  }
  
  /**
-   * Info: execute function
-   * Return: success or error code
-   */
-  int cuda_executeFunction();
-		
-  /**
-   * Info: clean up
-   * Return: success or error code
-   */
-  int cuda_cleanUp();
-  
-  /**
-   * Info: sync cuda device
+   * Sync cuda device.
+   * Waits till all the tasks on the GPU are finished.
   * Return: success or error code
   */
  int cuda_syncDevice() {
@ -377,7 +377,7 @@ public:
  }

  /**
-   * Page-lock host memory
+   * Page-lock host memory.
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
@ -391,7 +391,7 @@ public:
  }

  /**
-   * Release page locked memory
+   * Release page locked memory.
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
@ -404,7 +404,7 @@ public:
  }

  /**
-   * Info: print device memory info (total, used, avail)
+   * Print device memory info (total, used, avail)
   * Return: success or error code
   */
  int cuda_memInfo() {
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@ -8,6 +8,7 @@

 #include "CudaBase.cuh"

+/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {

 private:
--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double

 const std::string cudaFunctFooter = "}\n";

+/**
+ * CUDA implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
+ */
 class CudaChiSquareRuntime : public ChiSquareRuntime{

 private:
@ -29,65 +33,72 @@ private:

  cublasHandle_t defaultCublasRT;

-  /** Setup to init device
+  /** 
+   * Setup to init device.
   * Create context and init device for RT compilation
   */
  void setUpContext();

-  /** Private function to add function to kernel string
-   *
+  /** 
+   * Private function to add function to kernel string.
   */
  std::string buildProgram(std::string function);

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase argument
   */
  CudaChiSquareRuntime(CudaBase *base);

-  /** Default constructor init cuda device
-   *
+  /** 
+   * Default constructor init cuda device
   */
  CudaChiSquareRuntime();
  
-  /** Default destructor
-   *
+  /** 
+   * Default destructor.
   */
  ~CudaChiSquareRuntime();

-  /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable
+   * Result is put in &result variable.
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -96,12 +107,14 @@ public:
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);


-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check if CUDA device is able to run the chi square kernel.
+  /** 
+   * Check if CUDA device is able to run the chi square kernel.
   * Redundant - all new CUDA devices that support RT compilation will also support 
   * double precision, there are no other requirements to run chi square on GPU
   */
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@ -1,16 +1,16 @@
 #include "CudaCollimatorPhysics.cuh"

-//#define M_P 0.93827231e+00
+//constants used in OPAL
 #define M_P 0.93827204e+00
 #define C 299792458.0
 #define PI 3.14159265358979323846
 #define AVO 6.022e23
 #define R_E 2.81794092e-15
-//#define eM_E 0.51099906e-03
 #define eM_E 0.51099892e-03
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7

+//parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@ -28,12 +28,18 @@
 #define BLOCK_SIZE 128
 #define NUMPAR 13

+/**
+ * CUDA device function for calculating dot product.
+ */
 __device__ inline double dot(double3 &d1, double3 &d2) {

  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);

 }

+/**
+ * CUDA devce function to calculate cross product.
+ */
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  double3 tmp;
  tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@ -42,6 +48,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  return tmp;
 }

+/**
+ * CUDA device function to calculate arbitrary rotation.
+ */
 __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
  double c=cos(Theta);
  double s=sin(Theta);
@ -59,6 +68,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
  return tmp;
 } 

+/**
+ * CUDA device function to check if particle is still in material.
+ * z - particle position, par - parameter array. Particle is considered inside the
+ * material if z is > material starting position and z < material starting position - mat size.
+ */
 __device__ inline bool checkHit(double &z, double *par) {

  /* check if particle is in the degrader material */
@ -67,6 +81,11 @@ __device__ inline bool checkHit(double &z, double *par) {
 }


+/**
+ * CUDA device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {

@ -111,51 +130,53 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,

 }

-__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
-			   double &normP, double &thetacou, double &deltas, int coord,
+/**
+ * CUDA device function for rotation in 2 dimensions.
+ * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
+ * "Passage of particles through matter"
+ */
+__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &plane, 
+			   double &betaGamma, double &thetacou, double &deltas, int coord,
 			   double *par) 
 {
-  double Psixz;
-  double pxz;
+  // Calculate the angle between the px and pz momenta to change from beam coordinate to lab coordinate
+  const double Psi = atan2(px, pz);
+  const double pxz = sqrt(px*px + pz*pz);
+  const double cosPsi = cos(Psi);
+  const double sinPsi = sin(Psi);
+  const double cosTheta = cos(thetacou);
+  const double sinTheta = sin(thetacou);

-  /*
-  if (px>=0 && pz>=0)
-    Psixz = atan(px/pz);
-  else if (px>0 && pz<0)
-      Psixz = atan(px/pz) + PI;
-  else if (px<0 && pz>0)
-    Psixz = atan(px/pz) + 2*PI;
-  else
-    Psixz = atan(px/pz) + PI;
-  */
-  Psixz = atan2(px, pz);
-  pxz = sqrt(px*px + pz*pz);
+  // Apply the rotation about the random angle thetacou & change from beam
+  // coordinate system to the lab coordinate system using Psixz (2 dimensions)
+  x += deltas * px / betaGamma + plane * cosPsi;
+  z -= plane * sinPsi;

  if (coord == 1) {
-    x = x + deltas * px/normP + xplane*cos(Psixz);
-    z = z - xplane * sin(Psixz);
+    z += deltas * pz / betaGamma;
  }

-  if(coord==2) {
-    x = x + deltas * px/normP + xplane*cos(Psixz);
-    z = z - xplane * sin(Psixz) + deltas * pz / normP;
-  }
-
-  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
-  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+  px = pxz * (cosPsi * sinTheta + sinPsi * cosTheta);
+  pz = pxz * (-sinPsi * sinTheta + cosPsi * cosTheta);
 }

+/**
+ * CUDA device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * For details on the algorithm see OPAL user guide.
+ */
 __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
 				   bool enableRutherfordScattering) 
 {

  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
  double gamma = (Eng + M_P) / M_P;
-  double normP = sqrt(dot(P, P));
  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  double betaGamma = sqrt(dot(P, P));
  double deltas = par[DT_M] * beta * C;
+  double mass = M_P * 1e9; // in eV

-  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * 
+  double theta0 = 13.6e6 / (beta * betaGamma * mass) * 
    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));

  // x-direction: See Physical Review, "Multiple Scattering"
@ -170,8 +191,9 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
  }

  //__syncthreads();  
-  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
-  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
+  //double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  double xplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
+  Rot(P.x, P.z, R.x, R.z, xplane, betaGamma, thetacou, deltas, 0, par);

  // y-direction: See Physical Review, "Multiple Scattering"
  z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
@ -184,10 +206,9 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
    thetacou = z2 * theta0;
  }

-  //__syncthreads();
-
-  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
-  Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
+  //double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  double yplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
+  Rot(P.y,P.z,R.y,R.z, yplane, betaGamma, thetacou, deltas, 1, par);

  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
  if( (P2 < 0.0047) && enableRutherfordScattering) {
@ -211,6 +232,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
 }


+/**
+ * CUDA kernel that performs one step in particle movement trough mater.
+ * One thread is launched for each particle in the simulation. The kernel checks if the particle
+ * is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
+ * particles that are exiting the material.
+ * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
 					int numparticles, bool enableRutherfordScattering)
@ -220,51 +252,63 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

-  //transfer params to shared memory
+  //transfer params and particle positions to shared memory
+  //R is kept in shared memory in order to reduce register pressure for the kernel
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double3 *R = (double3*)&smem[NUMPAR]; 

-  curandState s; 
+  curandState s; //each tread gets its own cuRand state for random number generation
  double3 P;

+  //load parameters to shared memory
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

+  //sync threads to ensure that parameters are finished loading
  __syncthreads();

+  //there might be some empty threads that do no work
  if (idx < numparticles) {
-    s = state[idx];
-    R[tid] = data[idx].Rincol;
-    P = data[idx].Pincol;
+    s = state[idx]; //load cuRand state to local memory
+    R[tid] = data[idx].Rincol; //load position to shared memory
+    P = data[idx].Pincol; //load momentum to local memory

    bool pdead = false;  
    volatile double sq = sqrt(1.0 + dot(P, P));

    double Eng;
    
+    //check if particle is still in the material
    if (checkHit(R[tid].z, p)) {      

+      //calculate enery loss
      Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);

+      //check if particle is not dead
      if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));

+	//caluclate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
+
 	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 

+	//update particle momentum
 	data[idx].Pincol = P;
      } else {
+	//mark particle as dead (-1)
 	data[idx].label = -1;
      }
      
+      //update cuRand state
      state[idx] = s;
    } else {
-    
+      //particle exits material - drift and mark as exiting (-2)
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -272,12 +316,23 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
      
    }
 
+    //update particle position
    data[idx].Rincol = R[tid];
  }

 }

-__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
+/**
+ * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
+ * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
+ * Deprecated - GPU version does not use SoA.
+ * @param[in] data structure of arrays containing particle data
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
+__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
 					   curandState *state, int numparticles,
 					   bool enableRutherfordScattering)
 {
@ -338,92 +393,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,

 }

-
+/**
+ * Device function to swich off unitless positions.
+ */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x *= c;
  a.y *= c;
  a.z *= c;
 }

+/**
+ * Device function to swich on unitless positions.
+ */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x /= c;
  a.y /= c;
  a.z /= c;
 }

-//swithch to unitless positions with dtc
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOn(R, dtc);
-    unitlessOn(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//swithc to unitless positions with dt*c
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-
-//swithc off unitless positions with dtc
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOff(R, dtc);
-    unitlessOff(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//switch off unitelss positions with dt*c
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-

+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] npart number of particles
+ * @param[in] dtc dt*c
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {

  //get global id and thread id
@ -451,7 +446,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  }
 }

-
+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {

  //get global id and thread id
@ -478,6 +480,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
  }
 }

+/** 
+ * CUDA kernel to perform particle kick.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gEf 
+ * @param[in] *gBf
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
 			   double3 *gBf, double *gdt, double charge, 
 			   double mass, int npart, double c)
@ -627,63 +639,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection

 }

-struct compare_particle
-{
-  int threshold;
-
-  compare_particle() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART p1, CUDA_PART p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct compare_particle_small
-{
-  int threshold;
-
-  compare_particle_small() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART_SMALL p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct less_then
-{
-  __host__ __device__
-  bool operator()(int x)
-  {
-    return x < 0;
-  }
-};
-
 int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
 					     bool enableRutherfordScattering)
 {
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@ -20,7 +20,8 @@
 #include "CudaBase.cuh"

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU or MIC as AoS.
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
 */
 typedef struct __align__(16) {
  int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
 } CUDA_PART;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as AoS
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int label;
@ -47,7 +51,8 @@ typedef struct {
 } CUDA_PART_SMALL;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as SoA.
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
 */
 typedef struct {
  int *label;
@ -65,6 +70,9 @@ typedef struct {

 /**
 * Structure for storing particle on GPU
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int *label;
@ -73,9 +81,37 @@ typedef struct {
  double3 *Pincol;
 } CUDA_PART2_SMALL;

-/** CudaCollimatorPhysics class.
+/** 
+ * Operator used in thrust sort to compare particles by label.
+ * Used to move dead particles to the end of array, since they have label -1 or -2.
+ */
+struct compare_particle_small
+{
+  int threshold;
+
+  compare_particle_small() {
+    threshold = 0;
+  }
+
+  void set_threshold(int t) {
+    threshold = t;
+  }
+
+  __host__ __device__
+  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
+    return p1.label > p2.label;
+  }
+
+  __host__  __device__
+  bool operator()(CUDA_PART_SMALL p1) {
+    return p1.label < threshold;
+  }
+};
+
+/** 
+ * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Contains kerenls that execute CollimatorPhysics functions form OPAL.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
 */
 class CudaCollimatorPhysics : public DKSCollimatorPhysics {

@ -86,32 +122,44 @@ private:

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase as argument.
+   * Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
   */
  CudaCollimatorPhysics(CudaBase *base) {
    m_base = base;
    base_create = false;
  }

-  /** Constructor - empty. */
+  /** 
+   * Empty constructor.
+   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
+   */
  CudaCollimatorPhysics() { 
    m_base = new CudaBase();
    base_create = true;
  }

-  /** Destructor - empty */
+  /** 
+   * Destructor.
+   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
+   */
  ~CudaCollimatorPhysics() { 
    if (base_create)
      delete m_base;
  };

-  /** Execute collimator physics kernel.
+  /** 
+   * Execute collimator physics kernel.
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			int numpartices, bool enableRutherforScattering = true);

+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
      return DKS_ERROR;
    }

-  /** Sort particle array on GPU.
+  /** 
+   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  
+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,18 +187,25 @@ public:
      return DKS_ERROR;
    }

-  /** BorisPusher push function for integration from OPAL.
+  /** 
+   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);

+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 

-  /** BorisPusher push function with transformto function form OPAL
+  /** 
+   * BorisPusher push function with transformto function form OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@ -10,7 +10,11 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"

-class CudaFFT : public DKSFFT{
+/**
+ * Cuda FFT class based on BaseFFT interface.
+ * Uses cuFFT library to perform FFTs on nvidias GPUs.
+ */
+class CudaFFT : public BaseFFT {

 private:

@ -34,7 +38,7 @@ public:
  ~CudaFFT();
 		
  /**
-   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Init cufftPlans witch can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }

  /**
-   * Info: destroy default FFT plans
+   * Destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();

-  /*
-    Info: execute complex to complex double precision fft using cufft library
-    Return: success or error code
-  */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
 		
-  /*
-    Info: execute ifft 
-    Return: success or error code
-  */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute normalize using cuda kernel for complex to complex iFFT
-    Return: success or error code
-  */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute real to complex double precision FFT
-    Return: success or error code
-  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: exectue complex to real double precision FFT
-    Return: success or error code
-  */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);

-  /*
-    Info: execute normalize for complex to real iFFT
-    Return: success or error code
-  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);

 };
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@ -12,6 +12,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "CudaBase.cuh"

+/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class CudaGreensFunction : public GreensFunction{

 private:
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@ -10,6 +10,7 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"

+/** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {

 private:
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@ -1,11 +1,3 @@
-/** DKSBase class.
- * DKSBase.h
- * Author: Uldis Locans
- * Date: 15.09.2014
- * Base class of Dynamic Kernel Scheduler that handles the function calls
- * from host application to DKS
- */
-
 #ifndef H_DKS_BASE
 #define H_DKS_BASE

@ -33,7 +25,6 @@

 #ifdef DKS_CUDA
 #include "CUDA/CudaBase.cuh"
-#include "nvToolsExt.h"
 #endif

 #ifdef DKS_MIC
@ -42,7 +33,12 @@

 #include "AutoTuning/DKSConfig.h"

-/** DKSBase class for handling function calls to DKS library */
+/** 
+ * API for handling communication function calls to DKS library.
+ * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
+ * memory manegement, data transfer and other basic communication functions between
+ * the host and device.
+ */
 class DKSBase {

 private:
@ -75,7 +71,7 @@ protected:
  DKSConfig dksconfig;

  /** 
-   * Check if current API is set to OpenCL
+   * Check if current API is set to OpenCL.
   * Return true/false wether current api is opencl
   */
  bool apiOpenCL();
@ -92,11 +88,11 @@ protected:
   */
  bool apiOpenMP();

-  /** Check if device is GPU */
+  /** Check if device is GPU. */
  bool deviceGPU();
-  /** Check if device is CPU */
+  /** Check if device is CPU. */
  bool deviceCPU();
-  /** Check if device is MIC */
+  /** Check if device is MIC. */
  bool deviceMIC();

  /**
@ -889,9 +885,10 @@ public:
   * TODO: opencl and mic imlementation
   */
  int callMemInfo() {
+    #ifdef DKS_CUDA
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_memInfo());
-
+    #endif
    return DKS_ERROR;
  }

@ -900,11 +897,13 @@ public:
   * Used for debuging and timing purposes only.
   */
  void oclEventInfo() {
+    #ifdef DKS_OPENCL
    if (apiOpenCL())
      return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
-
+    #endif
  }

+
  /** 
   * Test function to profile opencl kernel calls.
   * Used for debuging and timing purposes only.
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
  //if we are not auto tuning and the size of the problem has changed find the new parameters
  //from autotuning config file
  if (!isAutoTuningOn() && length != chiSquareSize_m) {
+    /*
    int numBlocks, blockSize;
    std::string device_name;
    getDeviceName(device_name);
@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
 				 length, "BlockSize", blockSize);
    chiSq->setKernelParams(numBlocks, blockSize);
    
-    //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
-
+    std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
+    */
    chiSquareSize_m = length;
  } 

--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@ -8,6 +8,7 @@
 #include "AutoTuning/DKSAutoTuningTester.h"

 #include "DKSBase.h"
+#include "DKSFFT.h"

 #include "Algorithms/ChiSquareRuntime.h"

@ -19,7 +20,12 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif

-class DKSBaseMuSR : public DKSBase {
+/**
+ * API to handle musrfit calls to DKS library.
+ * Using ChiSquareRuntime interface allows to call chi square functions on the 
+ * GPU or CPU using CUDA or OpenCL.
+ */
+class DKSBaseMuSR : public DKSFFT {

 private:

--- a/src/DKSFFT.cpp
+++ b/src/DKSFFT.cpp
@ -0,0 +1,147 @@
+#include "DKSFFT.h"
+
+DKSFFT::DKSFFT() {
+  dksfft = nullptr;
+}
+
+DKSFFT::~DKSFFT() {
+  delete dksfft;
+}
+
+/* setup fft plans to reuse if multiple ffts of same size are needed */
+int DKSFFT::setupFFT(int ndim, int N[3]) {
+
+  if (apiCuda()) {
+    dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
+    return dksfft->setupFFT(ndim, N);
+  } else if (apiOpenCL()) {
+    dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
+    int ierr1 = dksfft->setupFFT(ndim, N);
+    int ierr2 = dksfft->setupFFTRC(ndim, N);
+    int ierr3 = dksfft->setupFFTCR(ndim, N);
+    if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
+      return DKS_ERROR;
+
+    return DKS_SUCCESS;
+  } else if (apiOpenMP()) {
+    //micbase.mic_setupFFT(ndim, N);
+    //BENI: setting up RC and CR transformations on MIC
+    dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
+    int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
+    int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
+    if (ierr1 != DKS_SUCCESS)
+      return ierr1;
+    if (ierr2 != DKS_SUCCESS)
+      return ierr2;
+    return DKS_SUCCESS;
+  }
+
+  return DKS_ERROR;
+
+}
+//BENI:
+int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
+
+  if (apiCuda())
+    return dksfft->setupFFT(ndim, N);
+  if (apiOpenCL())
+    return dksfft->setupFFTRC(ndim, N);
+  else if (apiOpenMP())
+    return dksfft->setupFFTRC(ndim, N, scale);
+
+  return DKS_ERROR;
+
+}
+
+//BENI:
+int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
+
+  if (apiCuda())
+    return dksfft->setupFFT(ndim, N);
+  if (apiOpenCL())
+    return dksfft->setupFFTCR(ndim, N);
+  else if (apiOpenMP())
+    return dksfft->setupFFTCR(ndim, N, scale);
+
+  return DKS_ERROR;
+
+}
+
+/* call OpenCL FFT function for selected platform */
+int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiOpenCL() || apiOpenMP()) 
+    return dksfft->executeFFT(data_ptr, ndim, dimsize);
+  else if (apiCuda())
+    return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* call OpenCL IFFT function for selected platform */
+int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+  if (apiOpenCL() || apiOpenMP())
+      return dksfft->executeIFFT(data_ptr, ndim, dimsize);
+  else if (apiCuda()) 
+    return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* call normalize FFT function for selected platform */
+int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiOpenCL()) {
+    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
+      return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
+    else 
+      return DKS_ERROR;
+  } else if (apiCuda()) {
+    return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
+  } else if (apiOpenMP()) {
+    return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* call real to complex FFT */
+int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiCuda())
+    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
+  else if (apiOpenCL() || apiOpenMP())
+    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* call complex to real FFT */
+int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+  if (apiCuda())
+    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
+  else if (apiOpenCL() || apiOpenMP())
+    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* normalize complex to real iFFT */
+int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
+  if (apiCuda())
+    return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
+  else if (apiOpenCL())
+    return DKS_ERROR;
+  else if (apiOpenMP())
+    return DKS_ERROR;
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@ -0,0 +1,112 @@
+#ifndef H_DKSBASE_FFT
+#define H_DKSBASE_FFT
+
+#include <iostream>
+#include "AutoTuning/DKSAutoTuning.h"
+
+#include "DKSBase.h"
+
+#include "DKSDefinitions.h"
+
+#include "Algorithms/GreensFunction.h"
+#include "Algorithms/CollimatorPhysics.h"
+#include "Algorithms/FFT.h"
+
+#ifdef DKS_AMD
+#include "OpenCL/OpenCLFFT.h"
+#endif
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaFFT.cuh"
+#endif
+
+#ifdef DKS_MIC
+#include "MIC/MICFFT.h"
+#endif
+
+/**
+ * API to handel calls to DKSFFT.
+ * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
+ */
+class DKSFFT : public DKSBase {
+
+private:
+
+  BaseFFT *dksfft;
+
+  int initFFT();
+  
+public:
+  
+  DKSFFT();
+  ~DKSFFT();
+
+  /** 
+   * Setup FFT function.
+   * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
+   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
+   * each fft will do its own setup according to fft size and dimensions.
+   * TODO: opencl and mic implementations
+   */
+  int setupFFT(int ndim, int N[3]);
+  //BENI:
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  //BENI:
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+
+  /** 
+   * Call complex-to-complex fft.
+   * Executes in place complex to compelx fft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation
+   */
+  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex-to-complex ifft.
+   * Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation.
+   */
+  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize complex to complex ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size
+   * TODO: mic implementation.
+   */
+  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call real to complex FFT.
+   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
+   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations
+   */
+  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex to real iFFT.
+   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
+   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations.
+   */
+  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize compelx to real ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size.
+   * TODO: opencl and mic implementations.
+   */
+  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+
+};
+
+#endif
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@ -10,6 +10,9 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif

+/**
+ * API to handle PET image reconstruction calls.
+ */
 class DKSImageRecon : public DKSBase {

 private:
@ -22,87 +25,88 @@ public:

  ~DKSImageRecon();

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);


-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);

-  /** Image reconstruction - generate normalization.
-   * 
+  /** 
+   * Image reconstruction - generate normalization.
   */
  int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);

-  /** Image reconstruction - forward correction.
-   * 
+  /** 
+   * Image reconstruction - forward correction.
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);

-  /** Image reconstruction - backward projection.
-   * 
+  /** 
+   * Image reconstruction - backward projection.
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);

-  /** Set the voxel dimensins on device.
+  /** 
+   * Set the voxel dimensins on device.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);

-  /** Set the image edge.
+  /** 
+   * Set the image edge.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);

-  /** Set the image edge1.
+  /** 
+   * Set the image edge1.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);

-  /** Set the minimum crystan in one ring values.
+  /** 
+   * Set the minimum crystal in one ring values.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);

-  /** Set all other required parameters for reconstruction.
+  /** 
+   * Set all other required parameters for reconstruction.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
--- a/src/DKSMainPage.dox
+++ b/src/DKSMainPage.dox
@ -0,0 +1,32 @@
+/** 
+\mainpage
+
+<P>
+<B>
+The aim of DKS is to allow the creation of fast fine tuned kernels using device specific frameworks such as CUDA, OpenCL, OpenACC and OpenMP and accelerator libraries such as Thrust, Nvidia CUDA libraries, Intel MKL or others. On top of that, DKS allows the easy use of these kernels in host applications without providing any device or framework specific details. This approach facilitates the integration of different types of devices in the existing applications with minimal code changes and  makes the device and the host code a lot more manageable.
+</B>
+<P>
+
+The main parts of DKS are:
+<ul>
+	<li>DKSBase - provides the basic communication functions between host application and hardware accelerators including memory manegement, data transfer and synchronization.</li>
+	<li>DKSOPAL - provides functions for Object Oriented Particle Accelerator library to offload FFTPoisson calculations and particle matter interaction using Monte Carlo simulations to GPU and Intel MIC</li>
+	<li>DKSBaseMuSR - provides functions to perform parameter fitting for musrfit on the GPU</li>
+	<li>DKSImageRecon - provides functions to perform PET image reconstruction on the GPU</li>
+	<li>DKSFFT - provides functions to perform FFT on the GPU and Intel MIC</li>
+</ul>
+
+<P>
+<B>
+Developed by
+Uldis Locans
+</B>
+
+<P>
+For further information contact: locans.uldis@psi.ch - Uldis Locans
+<P>
+
+<P>
+<a href="https://gitlab.psi.ch/uldis_l/DKS">DKS on gitlab</a><br>
+
+*/
--- a/src/DKSOPAL.cpp
+++ b/src/DKSOPAL.cpp
@ -1,7 +1,6 @@
 #include "DKSOPAL.h"

 DKSOPAL::DKSOPAL() {
-  dksfft = nullptr;
  dkscol = nullptr;
  dksgreens = nullptr;
 }
@ -12,7 +11,6 @@ DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
 }

 DKSOPAL::~DKSOPAL() {
-  delete dksfft;
  delete dkscol;
  delete dksgreens;
 }
@ -22,17 +20,14 @@ int DKSOPAL::setupOPAL() {
  if (apiOpenCL()) {
    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
    //TODO: only enable if AMD libraries are available
-    dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
    dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
    dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
  } else if (apiCuda()) {
    ierr = CUDA_SAFECALL( DKS_SUCCESS );
-    dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
    dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
    dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
  } else if (apiOpenMP()) {
    ierr = MIC_SAFECALL( DKS_SUCCESS );
-    dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
    dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
    dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
  } else {
@ -50,139 +45,6 @@ int DKSOPAL::initDevice() {

 }

-/* setup fft plans to reuse if multiple ffts of same size are needed */
-int DKSOPAL::setupFFT(int ndim, int N[3]) {
-
-  if (apiCuda()) {
-    return dksfft->setupFFT(ndim, N);
-  } else if (apiOpenCL()) {
-    int ierr1 = dksfft->setupFFT(ndim, N);
-    int ierr2 = dksfft->setupFFTRC(ndim, N);
-    int ierr3 = dksfft->setupFFTCR(ndim, N);
-    if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
-      return DKS_ERROR;
-
-    return DKS_SUCCESS;
-  } else if (apiOpenMP()) {
-    //micbase.mic_setupFFT(ndim, N);
-    //BENI: setting up RC and CR transformations on MIC
-    int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
-    int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
-    if (ierr1 != DKS_SUCCESS)
-      return ierr1;
-    if (ierr2 != DKS_SUCCESS)
-      return ierr2;
-    return DKS_SUCCESS;
-  }
-
-  return DKS_ERROR;
-
-}
-//BENI:
-int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
-
-  if (apiCuda())
-    return dksfft->setupFFT(ndim, N);
-  if (apiOpenCL())
-    return dksfft->setupFFTRC(ndim, N);
-  else if (apiOpenMP())
-    return dksfft->setupFFTRC(ndim, N, scale);
-
-  return DKS_ERROR;
-
-}
-
-//BENI:
-int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
-
-  if (apiCuda())
-    return dksfft->setupFFT(ndim, N);
-  if (apiOpenCL())
-    return dksfft->setupFFTCR(ndim, N);
-  else if (apiOpenMP())
-    return dksfft->setupFFTCR(ndim, N, scale);
-
-  return DKS_ERROR;
-
-}
-
-/* call OpenCL FFT function for selected platform */
-int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiOpenCL() || apiOpenMP()) 
-    return dksfft->executeFFT(data_ptr, ndim, dimsize);
-  else if (apiCuda())
-    return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call OpenCL IFFT function for selected platform */
-int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiOpenCL() || apiOpenMP())
-      return dksfft->executeIFFT(data_ptr, ndim, dimsize);
-  else if (apiCuda()) 
-    return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call normalize FFT function for selected platform */
-int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiOpenCL()) {
-    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
-      return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
-    else 
-      return DKS_ERROR;
-  } else if (apiCuda()) {
-    return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
-  } else if (apiOpenMP()) {
-    return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
-  }
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call real to complex FFT */
-int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
-
-  if (apiCuda())
-    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
-  else if (apiOpenCL() || apiOpenMP())
-    return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* call complex to real FFT */
-int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiCuda())
-    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
-  else if (apiOpenCL() || apiOpenMP())
-    return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
-/* normalize complex to real iFFT */
-int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
-  if (apiCuda())
-    return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
-  else if (apiOpenCL())
-    return DKS_ERROR;
-  else if (apiOpenMP())
-    return DKS_ERROR;
-
-  DEBUG_MSG("No implementation for selected platform");
-  return DKS_ERROR;
-}
-
 int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
 				double hz_m0, double hz_m1, double hz_m2, int streamId) {

--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@ -5,6 +5,7 @@
 #include "AutoTuning/DKSAutoTuning.h"

 #include "DKSBase.h"
+#include "DKSFFT.h"

 #include "DKSDefinitions.h"

@ -32,11 +33,15 @@
 #include "MIC/MICCollimatorPhysics.h"
 #endif

-class DKSOPAL : public DKSBase {
+/**
+ * API to handle OPAL calls to DKS library.
+ * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
+ * functions.
+ */
+class DKSOPAL : public DKSFFT {

 private: 

-  DKSFFT *dksfft;
  DKSCollimatorPhysics *dkscol;
  GreensFunction *dksgreens;

@ -56,71 +61,6 @@ public:
  ///////Function library part of dksbase////////
  ///////////////////////////////////////////////

-  /** 
-   * Setup FFT function.
-   * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
-   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
-   * each fft will do its own setup according to fft size and dimensions.
-   * TODO: opencl and mic implementations
-   */
-  int setupFFT(int ndim, int N[3]);
-  //BENI:
-  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
-  //BENI:
-  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
-
-/** 
-   * Call complex-to-complex fft.
-   * Executes in place complex to compelx fft on the device on data pointed by data_ptr.
-   * stream id can be specified to use other streams than default.
-   * TODO: mic implementation
-   */
-  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call complex-to-complex ifft.
-   * Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
-   * stream id can be specified to use other streams than default.
-   * TODO: mic implementation.
-   */
-  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Normalize complex to complex ifft.
-   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
-   * fft size
-   * TODO: mic implementation.
-   */
-  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call real to complex FFT.
-   * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
-   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
-   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
-   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
-   * TODO: opencl and mic implementations
-   */
-  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Call complex to real iFFT.
-   * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
-   * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
-   * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
-   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
-   * TODO: opencl and mic implementations.
-   */
-  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
-
-  /** 
-   * Normalize compelx to real ifft.
-   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
-   * fft size.
-   * TODO: opencl and mic implementations.
-   */
-  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
-
  /** 
   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
   * For specifics check OPAL docs.
--- a/src/MIC/CMakeLists.txt
+++ b/src/MIC/CMakeLists.txt
@ -1,11 +1,10 @@
-SET (_SRCS MICBase.cpp)
-SET (_HDRS MICBase.h)
+SET (_SRCS MICBase.cpp MICFFT.cpp)
+SET (_HDRS MICBase.h MICFFT.h)

 IF (ENABLE_OPAL)
  SET (_SRCS
    ${_SRCS}
    MICChiSquare.cpp
-    MICFFT.cpp
    MICGreensFunction.cpp  
    MICCollimatorPhysics.cpp
    )
@ -13,7 +12,6 @@ IF (ENABLE_OPAL)
  SET (_HDRS
    ${_HDRS}
    MICChiSquare.h
-    MICFFT.h
    MICCollimatorPhysics.h
    MICGreensFunction.hpp    
    MICMergeSort.h
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@ -26,6 +26,9 @@

 #define MIC_WIDTH 128

+/** MIC Base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement and  data transfers.
+ */
 class MICBase {

 private:
@ -45,57 +48,59 @@ public:

  int m_device_id;

-  /* constructor */
+  /** constructor */
  MICBase();

-  /* destructor */
+  /** destructor */
  ~MICBase();

-  /*
-    Info: create MKL rand streams for each thread
-    Return: success or error code
+  /**
+   * Create MKL rand streams for each thread
+   *  Return: success or error code
   */
  int mic_createRandStreams(int size);

-  /*
-    Info: delete MKL rand streams
-    Return: succes or error code
+  /**
+   * Delete MKL rand streams
+   * Return: succes or error code
   */
  int mic_deleteRandStreams();

-  /*
-    Info: create a new signal for the mic
-    Return: success or error code
+  /**
+   * Create a new signal for the mic.
+   * Signals can be used for assynchronous data transfers.
+   * Return: success or error code
   */
  int mic_createStream(int & streamId);

-  /*
-    Info: get the signal from the vector
-    Return: mic signal
+  /**
+   * Info: get the signal from the vector.
+   * Return: mic signal
  */
  int& mic_getStream(int id);

-  /*
-    Info: delete streams
-    Return: success or error code
+  /**
+   * Info: delete streams.
+   * Return: success or error code
   */
  int mic_deleteStreams();

-  /*
-    Info: set device id
-    Return: success or error code
+  /**
+   * Info: set device id.
+   * Return: success or error code
   */
  int mic_setDeviceId(int id);

-  /*
-    Info: get mic devices
-    Return: success or error code
+  /**
+   * Info: get mic devices.
+   * Prints information about mic devices.
+   * Return: success or error code
   */
  int mic_getDevices();

-  /*
-    Info: allocate memory on MIC device
-    Return: success or error code
+  /**
+   * Allocate memory on MIC device.
+   * Return: success or error code
   */
  template<typename T>
  void * mic_allocateMemory(int size) {
@ -109,9 +114,9 @@ public:
    return tmp;
  }

-  /*
-    Info: transfer data to device
-    Return: success or error code
+  /**
+   * Transfer data to device.
+   * Return: success or error code
   */
  template<typename T>
  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
@ -123,9 +128,9 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: write data to device, non-blocking
-    Return: success or error code
+  /**
+   * Write data to device, non-blocking.
+   * Return: success or error code
   */
  template<typename T>
  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
@ -139,9 +144,9 @@ public:
  }
  

-  /*
-    Info: read data from device
-    Return: success or error code
+  /**
+   * Read data from device
+   * Return: success or error code
   */
  template<typename T>
  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
@ -154,9 +159,9 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: read data from device waiting for signal
-    Return: success or error code
+  /**
+   * Read data from device waiting for signal
+   * Return: success or error code
   */
  template<typename T>
  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
@ -172,9 +177,9 @@ public:

  }

-  /* 
-     Info: wait till all the signals are complete
-     Return siccess or error code
+  /**
+   * Wait till all the signals are complete
+   * Return siccess or error code
   */
  int mic_syncDevice() {
    
@ -193,9 +198,9 @@ public:

  }

-  /*
-    Info: free memory on device
-    Return: success or error code
+  /**
+   * Free memory on device
+   * Return: success or error code
   */
  template<typename T>
  int mic_freeMemory(void * data_ptr, int size) {
@ -210,9 +215,9 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: allocate memory and write data to device
-    Return: success or error code
+  /**
+   * Allocate memory and write data to device
+   * Return: success or error code
   */
  template<typename T>
  void * mic_pushData(const void * data, int size) {
@ -227,9 +232,9 @@ public:
  return tmp_ptr;
 }

-/*
-  Info: read data and free memory on device
-  Return: success or erro code
+  /**
+   * Read data and free memory on device
+   * Return: success or erro code
   */
  template<typename T>
  int mic_pullData(void * data_ptr, void * result, int size) {
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@ -14,6 +14,9 @@
 #include <offload.h>
 #include "MICBase.h"

+/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
+ * Not complete and untested because of the poor performance of first MIC devices.
+ */
 class MICChiSquare {

  MICBase *m_micbase;
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@ -22,22 +22,34 @@
 #define I_M 10
 #define DT_M 11

+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }

+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
  return (dx * dx + dy * dy + dz * dz);
 }

+/**
+ * MIC device function to check if particle is still in material.
+ */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
  return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }


+/**
+ * MIC device function to calculate arbitrary rotation.
+ */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }

+
+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses AoS to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ * Deprecated on favor of SoA data layout.
+ */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr

 }

+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses SoA to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+void coulombScat(double *rx, double *ry, double *rz, 
+		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {

+  //arrays for temporary storage, each core proceses MIC_WIDTH particles
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

+  //simd instruction tells the compiler its safe to vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
+  //vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
-  
  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {

@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
    pdead = 1;
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
  double *par = (double*) par_ptr;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;

+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
      
      //for loop trough particles if not checkhit set label to -2 and update R.x
-
 #pragma omp for simd
      for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {


-
+  //cast device memory pointers to appropriate types
  int *label = (int*)label_ptr;
  unsigned *localID = (unsigned*)localID_ptr;
  double *rx = (double*)rx_ptr;
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt

  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;

+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@ -26,6 +26,12 @@ typedef struct {
 } MIC_PART_SMALL;


+/**
+ * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes OPALs collimator physics class for particle matter interactions using OpenMP
+ * and offload mode targetomg Intel Xeon Phi processors.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
 class MICCollimatorPhysics : public DKSCollimatorPhysics {

 private:
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@ -10,7 +10,11 @@
 #include "../Algorithms/FFT.h"
 #include "MICBase.h"

-class MICFFT : public DKSFFT {
+/** 
+ * MIC FFT based on BaseFFT interface.
+ * uses MKL library to offload FFT on Intel Xeon Phi devices.
+ */
+class MICFFT : public BaseFFT {

 private:

--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@ -15,6 +15,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

+/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class MICGreensFunction : public GreensFunction {

 private:
--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  return p;
 }

+/**
+ * Merge sort implementation for intel MIC.
+ * Paralellized over all the MIC cores using OpenMP tasks.
+ */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {

@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
  }
 }

+/**
+ * Quicksort algorithm, developed for use on Intel MIC devices.
+ */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

 }

+/** 
+ * Insertion sort of @p list, developed for use on Intel MIC.
+ * Used by quick_sort to sort small lists.
+ */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

--- a/src/OpenCL/CMakeLists.txt
+++ b/src/OpenCL/CMakeLists.txt
@ -4,6 +4,25 @@ SET (_HDRS OpenCLBase.h)
 SET (_SRCS OpenCLBase.cpp)
 SET (_KERNELS "")

+IF (ENABLE_AMD)
+  SET (_SRCS
+    ${_SRCS}
+    OpenCLFFT.cpp
+    )
+
+  SET (_HDRS
+    ${_HDRS}
+    OpenCLFFT.h
+    )
+
+  SET (_KERNELS
+    ${_KERNELS}
+    OpenCLKernels/OpenCLFFT.cl
+    OpenCLKernels/OpenCLFFTStockham.cl
+    OpenCLKernels/OpenCLTranspose.cl
+  )
+ENDIF (ENABLE_AMD)
+
 IF (ENABLE_MUSR)
  SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
  SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
@ -13,23 +32,18 @@ ENDIF (ENABLE_MUSR)
 IF (ENABLE_AMD AND ENABLE_OPAL)
  SET (_SRCS
    ${_SRCS}
-    OpenCLFFT.cpp
    OpenCLCollimatorPhysics.cpp
    OpenCLGreensFunction.cpp
    )

  SET (_HDRS
    ${_HDRS}
-    OpenCLFFT.h
    OpenCLCollimatorPhysics.h
    OpenCLGreensFunction.h
    )

  SET (_KERNELS
    ${_KERNELS}
-    OpenCLKernels/OpenCLFFT.cl
-    OpenCLKernels/OpenCLFFTStockham.cl
-    OpenCLKernels/OpenCLTranspose.cl
    OpenCLKernels/OpenCLCollimatorPhysics.cl
    OpenCLKernels/OpenCLGreensFunction.cl
  )
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@ -675,17 +675,14 @@ cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
 /*
  write data specified by in_data to device memory, device memory space defined by cl_mem
 */
-int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) {
+int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, 
+			      size_t offset, int blocking) 
+{

  cl_int ierr;

-	
-  //std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl;
-  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event);
-	
-  //m_events[m_num_events] = m_last_event;
-  m_events.push_back(m_last_event);
-
+  ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
+			      in_data, 0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
@ -716,6 +713,11 @@ int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
 */
 int OpenCLBase::ocl_createKernel(const char* kernel_name) {
  cl_int ierr;
+
+  //release the old kernel
+  if (m_kernel != NULL)
+    clReleaseKernel(m_kernel);
+  //create a new kernel
  m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
  if (ierr != CL_SUCCESS) {
    DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
@ -743,23 +745,19 @@ int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
  optional: work_group_size - can specify how work items are divided in work groups, 
  if left NULL OpenCL implementation handles this part.
 */
-int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) {
+int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, 
+				  const size_t *work_group_size) 
+{
  cl_int ierr;	

-  cl_event tmp_event;
-  if (m_last_event == NULL) {
-    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
-				  0, NULL, &tmp_event);
-  } else {
-    ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 
-				  1, &m_last_event, &tmp_event);
-  }
+  ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, 
+				work_items, work_group_size, 
+				0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS)
-    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
-		
-  m_last_event = tmp_event;
-  m_events.push_back(m_last_event);
+    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr 
+	      << " work items: " << *work_items << ", " 
+	      << " work group: " << *work_group_size);

  return ierr;
 }
@ -768,12 +766,13 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const
  read data from device, mem_ptr points to data on device out_data points to memory in host
  blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
 */
-int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) {
+int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, 
+			     size_t offset, int blocking) 
+{
  cl_int ierr;

-  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event);
-
-  m_events.push_back(m_last_event);
+  ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, 
+   			     out_data, 0, NULL, NULL);
 	
  if (ierr != CL_SUCCESS)
    DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
@ -937,22 +936,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  if (ierr != DKS_SUCCESS)
    return ierr;

-  //get device properties
+  /* get device properties */
+  //maximum number of work-items in a work group supported by device
  size_t max_group_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
+  //maxumum local memory size per work group
  cl_ulong local_mem_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
+  //get the supported extensions
  size_t ext_size;
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
  char *ext = new char[ext_size];
  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);

-  //get kernel properties
+  /* get kernel properties */
+  //get max work group size that can be used for this kernel
  size_t kernel_group_size;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, 
 			   sizeof(size_t), &kernel_group_size, 0);
  threadsPerBlock = kernel_group_size;

+  //get max local memory size that can be used for this kernel
  cl_ulong kernel_local_mem;
  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
 			   sizeof(cl_ulong), &kernel_local_mem, 0);
@ -961,16 +965,16 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
  std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;


-  std::cout << "Work groups: device limit " << max_group_size << ", "
-	    << "kernel limit " << kernel_group_size << ", "
+  std::cout << "Work group size: max for device " << max_group_size << " > "
+	    << "max for kernel " << kernel_group_size << " > "
 	    << "required " << work_group_size << std::endl;
  

  std::cout << "Local memory: device limit " << local_mem_size << std::endl;
+  std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
  

-
-  std::cout << "Available extensions: " << ext << std::endl;
+  std::cout << std::endl << "Available extensions: " << ext << std::endl;

  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;   

--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@ -1,16 +1,3 @@
-/*
-
-  Name: OpenCLBase
-
-  Author: Uldis Locans
-
-  Info: OpenCL base class to handle all the common details associated 
-  with kernel launch on OpenCL device
-
-  Date: 2014.09.18
-
-*/
-
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE

@ -32,7 +19,7 @@

 #include "../DKSDefinitions.h"

-/* struct for random number state */
+/** struct for random number state. */
 typedef struct {
  double s10;
  double s11;
@ -44,168 +31,194 @@ typedef struct {
  bool gen;
 } RNDState;

+/**
+ * OpenCL base class to handle device setup and basic communication wiht the device.
+ * Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
+ * The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
+ * class contains methods to read the kernel files, compile the kernel codes and launch kernels
+ * from the compiled codes. Which kernel file needs to be loaded for the specif functin is 
+ * handled by the base class that is launching the kernel.
+ */
 class OpenCLBase {

 private:

+  //variables containig OpenCL device and platform ids
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;

+  //variables containit compiled OpenCL program and kernel
  cl_context_properties m_context_properties[3];
  cl_program m_program;
  cl_kernel m_kernel;
 	
+  //variables for tracking OpenCL events
  static cl_event m_last_event;
  cl_int m_num_events;
  std::vector<cl_event> m_events;
 	
+  //currently load kernel file
  char * m_kernel_file;

+  //type of device used by OpenCL
  cl_device_type m_device_type;
 	
-  /*
-    Name: getPlatforms
-    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
-    Return: success or error code
+  /**
+   * Get all available OpenCL platforms.
+   * Get all avaialble platforms and save in m_platform_ids, save number of platforms
+   *  Return: success or error code
   */
  int ocl_getPlatforms();
 	
 	
-  /*
-    Name: getDevice
-    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
-    ReturnL success or error code
+  /**
+   * Get first available OpenCL device of specified type.
+   * Get first avaialble devices and save device id and platform id for this device, 
+   * device name: (-gpu, -mic, -cpu)
+   *  ReturnL success or error code
   */
  int ocl_getDevice(const char* device_name);
 	
-  /*
-    Name getDeviceType
-    Info: get device type from device name (-gpu, -cpu, -mic)
-    Return: success or error code
+  /**
+   * Get cl_device_type from the specified device name.
+   * get device type from device name (-gpu, -cpu, -mic)
+   *  Return: success or error code
   */
  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
 	
-  /*
-    Name: createContext
-    Info: create context with specified device
-    Return: success or error code
+  /**
+   * Create OpenCL context with specified device.
+   *  Return: success or error code
   */
  int ocl_createContext();
 	
-  /*
-    Name: buildProgram
-    Info: build program from specified kernel file
-    Return: success or error code
+  /**
+   * Build program from specified kernel file.
+   * Return: success or error code.
  */
  int ocl_buildProgram(const char* kernel_file);

-  /** Compile program from kernel source string
-   *
+  /** 
+   * Compile program from kernel source string.
+   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
+   * OpenCL program, that can be then executed on the device.
+   * opts is a string specifiend additional compiler flags.
   */
  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);

 protected:

+  //memory for random number states
  int defaultRndSet;
  cl_mem defaultRndState;
 	
 	
 public:

+  //OpenCL context and commad queue
  static cl_context m_context;
  static cl_command_queue m_command_queue; 
    
-  /*
-    constructor
+  /**
+   * constructor
   */
  OpenCLBase();
    
-  /*
-    destructor
+  /**
+   * destructor
   */
  ~OpenCLBase();
    
-  /*
-    Create RND states
-    Return: success or error code
+  /**
+   * Allocate memory for size random number states and init the rnd states.
+   * Uses AMD clRng library for random numbers. 
+   * This library is only compatible with AMD devices.
   */
  int ocl_createRndStates(int size);

-  /* Create an array of random numbers on the device
-   *
+  /** 
+   * Create an array of random numbers on the device.
+   * Filles hte mem_ptr with random numbers.
   */
  int ocl_createRandomNumbers(void *mem_ptr, int size);

-  /*
-    Destroy rnd states
-    Return: success or error code
+  /**
+   * Destroy rnd states and free device memory.
+   * Return: success or error code
   */
  int ocl_deleteRndStates();


-  /*
-    Name: getAllDevices
-    Info: get all available devices
-    ReturnL success or error code
+  /**
+   * Prints info about all the available platforms and devices.
+   * Can be used for information purposes to see what devices are available on the system.
+   * ReturnL success or error code.
  */
  int ocl_getAllDevices();

-  /** Get the OpenCL device count for the set type of device
-   *
+  /** 
+   * Get the OpenCL device count for the set type of device.
+   * Device count is set in ndev parameter, returns success or error code.
   */
  int ocl_getDeviceCount(int &ndev);

-  /** Get the name of the device used
+  /** 
+   * Get the name of the device currently us use.
   */
  int ocl_getDeviceName(std::string &device_name);

-  /** Set the device to use for OpenCL kernels.
-   *  device id to use is passed as integer.
+  /** 
+   * Set the device to use for OpenCL kernels.
+   * Device id to use is passed as integer.
   */
  int ocl_setDevice(int device);

-  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
+  /** 
+   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
   * Used when GPUs of different types might be pressent on the system.
   */
  int ocl_getUniqueDevices(std::vector<int> &devices);
    
-  /*
-    Name: setUp
-    Info: set up opencl resources
-    Return: success or error code
+  /**
+   * Initialize OpenCL connection with a device of specified type.
+   * Find if specified device is avaialble, creates a contex and command queue.
+   * Returns success or error code.
   */
  int ocl_setUp(const char* device_name);
 	
-  /*
-    Name: loadKernel
-    Info: load and compile opencl kernel file if it has changed
-    Return: success or error code
+  /**
+   * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
+   * Load and compile opencl kernel file if it has changed.
+   * Return: success or error code
  */
  int ocl_loadKernel(const char* kernel_file);


-  /** Build program from kernel source.
+  /** 
+   * Build program from kernel source.
   * Builds a program from source code provided in kernel_source.
   * If compilation fails will return DKS_ERROR
   */
  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
 	
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory on the device.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);

-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory of specific type on device.
+   * The availabel types are cl_mem_flags type listed in OpenCL documentation:
+   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
+   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
 	
-  /** Zero OpenCL memory buffer
-   *  Set all the elemetns in the device array to zero
+  /** 
+   * Zero OpenCL memory buffer.
+   * Set all the elemetns in the device array to zero.
   */
  template <typename T>
  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@ -218,91 +231,89 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Name: writeData
-    Info: write data to device memory (needs ptr to mem object)
-    Return: success or error code
+  /**
+   * Write data to device memory (needs ptr to mem object)
+   * Return: success or error code
   */
  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: copyData
-    Info: copy data from one buffer on the device to another
-    Return: success or error code
+  /** 
+   * Copy data from one buffer on the device to another
+   * Return: success or error code
   */
  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
 	
-  /*
-    Name: createKernel
-    Info: create kernel from program
-    Return: success or error code
+  /** 
+   * Create kernel from compiled OpenCL program.
+   * Return: success or error code
   */
  int ocl_createKernel(const char* kernel_name);
 	
-  /*
-    Name: setKernelArgs
-    Info: set opencl kernel arguments
-    Return: success or error code
+  /**
+   * Set argiments for the kernel that will be launched.
+   * Return: success or error code
   */
  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
 	
-  /*
-    Name: executeKernel
-    Info: execute selected kernel (needs kernel parameters)
-    Return: success or error code
+  /**
+   * Execute selected kernel.
+   * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
+   * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
+   * kernel arguments must be set.
+   * Return: success or error code
  */
  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
 	
-  /*
-    Name: readData
-    Info: read data from device (needs pointer to mem object)
-    Return: success or error code
+  /**
+   * Read data from device (needs pointer to mem object).
+   * Return: success or error code
   */
  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: freeMemory
-    Info: free device memory (needs ptr to mem object)
-    Return: success or error code
+  /**
+   * Free device memory (needs ptr to mem object).
+   *  Return: success or error code
   */
  int ocl_freeMemory(cl_mem mem_ptr);
 	
-  /*
-    Name: cleanUp
-    Info: free opencl resources
-    Return: success or error code
+  /**
+   * Free opencl resources.
+   * Deletes the kernel, compiled program, command queue and colese the connection
+   * to device by releasing the context.
+   * Return: success or error code
   */
  int ocl_cleanUp();
 	
-  /*
-    Name: deviceInfo
-    Info: print device info (mostly for debugging purposes)
-    Return: success or error code
+  /**
+   * Print info of currently selected device.
+   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
+   * Return: success or error code
   */
  int ocl_deviceInfo(bool verbose = true);

-  /* Check OpenCL kernel.
-   * Query device and check if it can run the kernel with required parameters
+  /* 
+   * Check OpenCL kernel.
+   * Query device and check if it can run the kernel with required parameters.
+   * Also check the available OpenCL extensions - usefull for checking the supported device
+   * features, like double precission.
   */
  int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
 	
-  /*
-    Name: clearEvents
-    Info: clear saved events (for debuging purposes)
-    Return: nothing
+  /**
+   * Clear the event list.
+   * Events can be used for timing and synchronization purposes.
   */
  void ocl_clearEvents();

-  /*
-    Name: eventInfo
-    Info: print information about kernel timings (for debuging purposes)
-    Return: nothing
+  /**
+   * print information about kernel timings from event list.
+   * for debuging purposes
   */
  void ocl_eventInfo();

-  /*
-    Return current command queue
+  /**
+   * Return current command queue.
   */
  cl_command_queue ocl_getQueue() { return m_command_queue; }
 };
--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

-
+/** Deprecated, SimpleFit implementation of ChiSquare. */
 class OpenCLChiSquare {

 private:
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -79,7 +79,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
  
  int ierr;
  //calc number of threads per workgroup and nr of work groups
-  size_t work_size_sum = 128;
+  size_t work_size_sum = (size_t)blockSize_m;

  /*
  size_t work_items = (size_t)length;
@ -98,7 +98,6 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
  tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
  
  //execute sum kernel
-  //ocl_createKernel("parallelReductionSum");
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
@ -141,6 +140,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
  //set work item size
  size_t work_items;
  size_t work_size = (size_t)blockSize_m;
+
  if (numBlocks_m < 0)
    work_items = (size_t)length;
  else
@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }

 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
+  //write params to gpu
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
  return ierr;
 }
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  if (numfunc == 0)
    return DKS_SUCCESS;

+  //write function values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
  return ierr;
 }
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  if (nummap == 0)
    return DKS_SUCCESS;

+  //wrtie map values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
  return ierr;
 }
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
    freeChiSquare();
  }

-  //allocate temporary memory
+  //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
  if (size_func == 0)
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
  int ierr = DKS_ERROR;
  if (initDone_m) {

-    //free memory
+    //free GPU memory
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@ -308,7 +311,12 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
    return DKS_ERROR;
  }

-  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
+  //check the GPU kernel
+  ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
+  if (threadsPerBlock < blockSize_m) {
+    std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
+    blockSize_m = threadsPerBlock;
+  }

  return ierr;

--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *

 const std::string openclFunctFooter = "}\n";

+/**
+ * OpenCL implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to target devices that
+ * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
+ */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {

 private:

  OpenCLBase *m_oclbase;

-  /** Private function to add user defined function to kernel string
-   *
+  /** 
+   * Private function to add user defined function to kernel string.
   */
  std::string buildProgram(std::string function);

+  /**
+   * Launch parallel reduction kernel to calculate the sum of data array
+   */
  double calculateSum(cl_mem data, int length);

 public:

-  /** Constructor wiht openclbase argument
-   *
+  /** 
+   * Constructor wiht openclbase argument.
   */
  OpenCLChiSquareRuntime(OpenCLBase *base);
  
-  /** Default constructor
-   *
+  /** 
+   * Default constructor
   */
  OpenCLChiSquareRuntime();

-  /** Default destructor
-   *
+  /** 
+   * Default destructor
   */
  ~OpenCLChiSquareRuntime();

-    /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
   * Result is put in &result variable
   */
@ -64,22 +74,26 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -87,14 +101,16 @@ public:
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);

-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check MuSR kernels for necessary resources.
+  /** 
+   * Check MuSR kernels for necessary resources.
   * Query device properties to get if sufficient resources are
-   * available to run the kernels
+   * available to run the kernels. Also checks if double precission is enabled on the device.
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock);

--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@ -17,12 +17,16 @@
 #include "boost/compute/core.hpp"
 */

+/** Double3 structure for use in OpenCL code. */
 typedef struct {
  double x;
  double y;
  double z;
 } Double3;

+/**
+ * Structure for stroing particles in OpenCL code.
+ */
 typedef struct {
  int label;
  unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));

+/**
+ * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
+ */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {

 private:
@ -42,16 +50,20 @@ private:

 public:

-  /* constructor */
+  /** 
+   * Constructor with OpenCLBase as argument.
+   * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
+   */
  OpenCLCollimatorPhysics(OpenCLBase *base) { 
    m_oclbase = base;
  }

-  /* destructor */
+  /** 
+   * Destructor.
+   */
  ~OpenCLCollimatorPhysics() { 
  }

-  /* execute degrader code on device */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherforScattering = true);

--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@ -1,14 +1,3 @@
-/*
-
-  Name: OpenCLFFT
-
-  Author: Uldis Locans
-
-  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
-
-  Data: 19.09.2014
-
-*/
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT

@ -22,7 +11,13 @@

 #include "clFFT.h"

-class OpenCLFFT : public DKSFFT {
+/**
+ * OpenCL FFT class based on BaseFFT interface.
+ * Uses clFFT library to perform FFTs on AMD gpus.
+ * clFFT library works also on nvida GPUs and other devices that
+ * support OpenCL.
+ */
+class OpenCLFFT : public BaseFFT {

 private:

@ -112,8 +107,7 @@ public:
 		   int streamId = -1);
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 		   int streamId = -1);
-  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
-    {
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
    return DKS_ERROR;
  }

--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@ -7,6 +7,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"

+/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class OpenCLGreensFunction : public GreensFunction {

 private:
@ -31,7 +32,7 @@ public:
  int buildProgram();

  /**
-    Info: calc itegral on device memory (taken from OPAL src code)
+    Info: calc itegral on device memory (taken from OPAL src code).
    Return: success or error code
  */
  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
@ -39,20 +40,20 @@ public:
 		       int streamId = -1);
 		
  /**
-    Info: integration of rho2_m field (taken from OPAL src code)
+    Info: integration of rho2_m field (taken from OPAL src code).
    Return: success or error code
  */
  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
 		
  /**
-    Info: mirror rho field (taken from OPAL src code)
+    Info: mirror rho field (taken from OPAL src code).
    Return: succes or error code
  */
  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);

  /**
-    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
    Return: success or error code
  */
  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@ -5,6 +5,10 @@
 #include <string>
 #include <sys/time.h>

+/**
+ * Custom timer class.
+ * Allows to insert timers in the code to get function exectution times.
+ */
 class DKSTimer {

 private:
@ -17,38 +21,44 @@ private:

 public:

-  /** Init DKSTimer by seting timer to zero  */
+  /** Init DKSTimer by seting timer to zero. */
  DKSTimer();

  ~DKSTimer();

-  /** Init the timer
+  /** 
+   * Init the timer.
   * Set the name for timer and clear all values
   */
  void init(std::string n);

-  /** Start the timer.
+  /** 
+   * Start the timer.
   * Get the curret time with gettimeofday and save in timeStart
   */
  void start();

-  /** Stop the timer 
+  /** 
+   * Stop the timer.
   * Get the curretn time with gettimeofday and save in timeEnd
   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
   */
  void stop();

-  /** Reset timervalue to zero.
+  /** 
+   * Reset timervalue to zero.
   * Set timervalue, timeStart and timeEnd to zero
   */
  void reset();

-  /** Return elapsed time in seconds.
+  /** 
+   * Return elapsed time in seconds.
   * Return the value of timervalue
   */
  double gettime();

-  /** Print timer.
+  /** 
+   * Print timer.
   * Print the elapsed time of the timer
   */
  void print();
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -39,8 +39,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #TARGET_LINK_LIBRARIES(testFFT dks)
 #TARGET_LINK_LIBRARIES(testMIC dks)
 #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
-TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
-TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
 #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
 #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -54,11 +54,11 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testGather dks)
 #TARGET_LINK_LIBRARIES(testGatherAsync dks)
 #TARGET_LINK_LIBRARIES(testTranspose dks)
-TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
-TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
-TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testPush dks)
-TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
+TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testIntegration dks)
 #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
Author	SHA1	Message	Date
Uldis Locans	24604042e7	Coulomb scattering and rot updated in cuda version	2017-09-19 13:36:42 +02:00
Andreas Suter	5c9048a308	added a macOS specific flag which deals with path variables of shared libs	2017-08-21 14:16:57 +02:00
Uldis Locans	ba701a5744	remove opencl event tracking since it causes memory leak	2017-08-18 14:11:35 +02:00
Uldis Locans	74b12ebdc0	release old OpenCL kernel before creating a new one	2017-08-18 13:13:55 +02:00
Uldis Locans	6d14df5b32	update chiSquare test functions to check the created kernel before execution	2017-08-17 16:57:30 +02:00
Uldis Locans	79833cf7f5	update work item size correctly for devices where supported size is smaller than DKS default	2017-08-17 16:56:57 +02:00
Uldis Locans	ce3491740c	remove openmp flags for apple machines	2017-08-17 16:54:07 +02:00
Uldis Locans	fc6d2ccd4a	update doxygen HTML output	2017-08-10 15:50:25 +02:00
Uldis Locans	d1774b84e8	added doxygen files	2017-08-10 14:58:12 +02:00
Uldis Locans	ccc4329bef	updated documentation	2017-08-10 14:57:48 +02:00
Uldis Locans	7ca93a3a49	dont link with boost filesystem	2017-06-07 11:10:38 +02:00
Uldis Locans	aa14065994	Merge branch 'master' of gitlab.psi.ch:uldis_l/DKS	2017-06-06 14:24:20 +02:00
Uldis Locans	50ecb31042	remove any OpenCL calls when DKS compiled without OpenCL	2017-06-06 14:23:24 +02:00
Uldis Locans	3d130aa01f	write CUDA libraries that are needed for linking in config file	2017-06-06 14:22:34 +02:00
Uldis Locans	5071ea5741	add option to link DKS with static cuda libraries	2017-06-06 14:22:00 +02:00
Uldis Locans	efe5f0db38	remove nvToolsExt for manual profiling	2017-05-30 11:07:22 +02:00
Uldis Locans	1d420504cc	fix FFT test when OPAL and/or Musr modules are not compiled	2017-05-30 11:05:12 +02:00
Uldis Locans	cc59f550ab	move patch version forward	2017-05-29 13:29:08 +02:00
Uldis Locans	d20fea2caa	readme updated	2017-05-29 13:28:23 +02:00
Uldis Locans	8b7d824b3a	updated documentation	2017-05-29 13:21:34 +02:00
Uldis Locans	2c9fe4ea6f	fix for seperate FFT	2017-05-29 13:18:37 +02:00
Uldis Locans	e32f9aaff2	include FFT in DKSOPAL and DKSBaseMuSR	2017-05-29 12:49:32 +02:00
Uldis Locans	f3527969cb	seperate FFT from DKSOPAL	2017-05-29 09:39:25 +02:00