commit 4fa529aaeaa65f3e0af23649bcc893d693a0cc35
Author: Uldis Locans <uldis.locans@gmail.com>
Date:   Mon Oct 10 14:49:32 2016 +0200

    snapshot of svn

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9c08e39
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,174 @@
+CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
+PROJECT (DKS)
+SET (DKS_VERSION_MAJOR 1)
+SET (DKS_VERSION_MINOR 0.1) #minor and patch level combined; matches the 1.0.1 strings below
+SET (PACKAGE \"dks\") #the escaped quotes are stored as part of the value (here and below)
+SET (PACKAGE_BUGREPORT \"locagoons.uldis@psi.ch\")
+SET (PACKAGE_NAME \"DKS\")
+SET (PACKAGE_STRING \"DKS\ 1.0.1\")
+SET (PACKAGE_TARNAME \"dks\")
+SET (PACKAGE_VERSION \"1.0.1\")
+SET (VERSION \"1.0.1\")
+
+SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") #also search cmake/Modules of this source tree
+
+#get the compiler executable name (directory part stripped); it is compared against
+#the MPI wrapper names (mpicxx, mpiicpc) further down to decide whether -DDKS_MPI is set
+STRING (REGEX REPLACE ".*/" "" COMPILER_NAME ${CMAKE_CXX_COMPILER})
+MESSAGE (STATUS "Your compiler is: ${COMPILER_NAME}")
+MESSAGE (STATUS "Your compiler is: ${CMAKE_CXX_COMPILER}")
+
+MESSAGE (STATUS "C compiler: ${CMAKE_C_COMPILER_ID}")
+MESSAGE (STATUS "CXX compiler: ${CMAKE_CXX_COMPILER_ID}")
+
+#kernel files are expected under <install prefix>/include; the path is handed to the code as the OPENCL_KERNELS define
+SET (OPENCL_KERNELS -DOPENCL_KERNELS=\\"${CMAKE_INSTALL_PREFIX}/include/\\")
+MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
+
+#find boost (BOOST_DIR from the environment is handed to FindBoost as the BOOSTROOT hint)
+set (BOOSTROOT $ENV{BOOST_DIR})
+SET (Boost_USE_STATIC_LIBS OFF)
+SET (Boost_USE_STATIC_RUNTIME OFF)
+FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
+IF (Boost_FOUND) #REQUIRED above already stops the configuration when boost is missing
+  MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
+  MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
+  MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
+  INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
+  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
+ENDIF (Boost_FOUND)
+
+#optional UQTK support (default OFF): when ON, auto-tuning/CMakeLists.txt also builds testChiSquareRTUQTK
+OPTION (USE_UQTK "Use UQTK" OFF)
+
+
+#intel icpc compiler specific flags (expansions are quoted so an empty compiler id cannot break the IF)
+IF ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" OR USE_INTEL)
+
+  #for intel compiler turn on openmp and opencl
+  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_CUDA "Use CUDA" OFF)
+  OPTION (USE_MIC "Use intel MIC" ON)
+
+  #find xiar and xild and set flags for offload build on mic
+  FIND_PROGRAM(XIAR xiar)
+  IF(XIAR)
+    MESSAGE(STATUS "xiar found: ${XIAR}")
+    SET(CMAKE_AR "${XIAR}")
+  ENDIF()
+  MARK_AS_ADVANCED(XIAR)
+  SET(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")
+  SET(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")
+
+  FIND_PROGRAM(XILD xild)
+  IF(XILD)
+    SET(CMAKE_LINKER "${XILD}")
+  ENDIF()
+  MARK_AS_ADVANCED(XILD)
+
+  #set flags for openmp and opencl; flags supplied by the user are kept, as in the gnu branch
+  #TODO: check which opencl to use: nvidia, amd, intel, apple
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -offload -mkl -openmp -lOpenCL -lpthread -DDKS_MIC -DDKS_OPENCL -qopt-report=5 -qopt-report-phase=vec -std=c++11")
+
+  IF ("${COMPILER_NAME}" STREQUAL "mpicxx" OR "${COMPILER_NAME}" STREQUAL "mpiicpc")
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
+  ENDIF ()
+
+ENDIF ()
+
+#gnu compiler specific flags
+IF ( ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") AND NOT USE_INTEL)
+  #the expansions above are quoted so that an empty compiler id cannot break the IF
+
+  OPTION (USE_OPENCL "Use OpenCL" ON)
+  OPTION (USE_CUDA "Use CUDA" OFF)
+  OPTION (USE_MIC "Use intel MIC" OFF)
+
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
+
+  FIND_PACKAGE(CUDA)
+  IF (CUDA_FOUND)
+    SET (USE_CUDA ON)
+    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+
+    MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}")
+    MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
+
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
+    SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
+
+    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
+
+    #if cuda version >= 7.0 add runtime compilation flags
+    IF (NOT CUDA_VERSION VERSION_LESS "7.0")
+      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
+    ENDIF ()
+
+    MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
+
+    SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
+    #set(CUDA_SEPARABLE_COMPILATION ON)
+    SET(BUILD_SHARED_LIBS OFF)
+
+  ENDIF ()
+
+  IF (NOT CUDA_FOUND)
+    MESSAGE(STATUS "CUDA not found, looking for OpenCL")
+
+    FIND_PACKAGE(OpenCL)
+    IF (OpenCL_FOUND)
+      #OpenCL_LIBRARY is the library file itself; LINK_DIRECTORIES needs the folder that holds it
+      GET_FILENAME_COMPONENT(OpenCL_LIBRARY_DIR "${OpenCL_LIBRARY}" DIRECTORY)
+      MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
+      MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
+      MESSAGE(STATUS "OpenCL library dir: ${OpenCL_LIBRARY_DIR}")
+      INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR})
+      LINK_DIRECTORIES(${OpenCL_LIBRARY_DIR})
+    ENDIF ()
+  ENDIF ()
+
+  #if mac OS and no CUDA set apple opencl flags
+  IF (APPLE AND NOT CUDA_FOUND)
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -framework opencl -lpthread -DDKS_OPENCL")
+  ENDIF ()
+
+  #if cuda found set cuda opencl flags
+  IF (CUDA_FOUND)
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
+  ENDIF ()
+
+  #if cuda not found but opencl found set opencl flags (mac OS is handled by the framework flags above)
+  IF (NOT APPLE AND NOT CUDA_FOUND AND OpenCL_FOUND)
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
+  ENDIF ()
+
+  #if mpi compiler used set mpi flag
+  IF ("${COMPILER_NAME}" STREQUAL "mpicxx")
+    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
+  ENDIF ()
+
+ENDIF ()
+
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
+MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
+
+ADD_SUBDIRECTORY (src)
+
+IF (ENABLE_TESTS)
+  ADD_SUBDIRECTORY (test)
+ENDIF ()
+
+ADD_SUBDIRECTORY (auto-tuning)
+
+### write configure files ###
+CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake )
+
+### install files (relative destination: resolved against the install prefix, stays relocatable) ###
+INSTALL (
+  FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake
+  DESTINATION "lib/cmake/${PROJECT_NAME}"
+  RENAME ${PROJECT_NAME}Config.cmake
+  )
diff --git a/ReadMe.first b/ReadMe.first
new file mode 100644
index 0000000..e781b63
--- /dev/null
+++ b/ReadMe.first
@@ -0,0 +1,82 @@
+##################################################################
+#
+# Name:		Dynamic Kernel Scheduler
+# Version:	1.0
+# Author: 	Uldis Locans
+# Contacts:	locans.uldis@psi.ch
+#
+##################################################################
+
+Dynamic Kernel Scheduler is a library that provides a software layer between host application
+and hardware accelerators. DKS handles communication between host and device and schedules task
+execution using predefined algorithms written using CUDA and OpenCL for GPUs, and OpenMP with
+offload pragmas for IntelMIC. See DKSBase class documentation for full list of functions provided
+by DKS.
+
+#####Requirements#####
+
+OpenMPI (Cuda aware OpenMPI enabled for full compatibility)
+g++ or icpc compiler
+Cuda 7.0 or higher (optional)
+Nvidia or Intel OpenCL SDK (optional)
+Intel MIC compilers (optional)
+
+
+######Install######
+
+#check out DKS
+svn co svn+ssh://YOULOGIN@savannah02.psi.ch/repos/amas/users/adelmann/Ph.D-students/Locans/work/DKS/trunk DKS
+
+#set compilers to use
+#supported c++ compilers: g++, icpc, mpicxx with g++
+#supported c compilers: gcc, icc, mpicc with gcc
+export CXX_COMPILER=cpp_compiler_name
+export CC_COMPILER=c_compiler_name
+
+#set dks root directory
+cd DKS
+export DKS_ROOT=$PWD
+
+#set build directory (DKS_BUILD_DIR must already point to the desired build location)
+mkdir $DKS_BUILD_DIR
+cd $DKS_BUILD_DIR
+
+#set install directory
+export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/
+
+CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
+
+make
+make install
+
+
+######DKS usage######
+Make install copies the include files and library files to the $DKS_BUILD_DIR/build folder. The lib folder
+in the build directory contains libdks.a and libdksshared.so; one of these libraries can be used to link
+with DKS. All the necessary include files are located in $DKS_BUILD_DIR/build/include.
+
+Additional flags needed for CUDA and OpenCL mode:
+-lcudart -lcufft -lcublas -lnvToolsExt -lOpenCL -lnvrtc -lcuda -DDKS_CUDA -DDKS_OPENCL
+
+Additional flags needed for IntelMIC and OpenCL mode:
+-offload -mkl -openmp -lOpenCL -DDKS_MIC -DDKS_OPENCL
+
+Note: always run make install; at runtime OpenCL and CUDA search for the kernel files in the
+$DKS_INSTALL_DIR/build/include directory for runtime compilation.
+
+######Running DKS######
+
+#running with cuda
+#nvidia multi process service started for better CUDA and MPI execution
+
+#to start mps service (if multiple users use DKS start MPS as root)
+nvidia-cuda-mps-control -d
+#to stop mps service
+echo quit | nvidia-cuda-mps-control
+
+
+#running dks with MIC
+#Intel Manycore Platform Software Stack (mpss) service started
+
+#to start mpss
+service mpss start
diff --git a/auto-tuning/CMakeLists.txt b/auto-tuning/CMakeLists.txt
new file mode 100644
index 0000000..e3be789
--- /dev/null
+++ b/auto-tuning/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_SOURCE_DIR}/src)
+link_directories(${CMAKE_SOURCE_DIR}/src)
+
+#chi square kernel tests
+add_executable(testChiSquareRT testChiSquareRT.cpp)
+target_link_libraries(testChiSquareRT dks ${Boost_LIBRARIES})
+
+add_executable(testChiSquareRTRandom testChiSquareRTRandom.cpp)
+target_link_libraries(testChiSquareRTRandom dks ${Boost_LIBRARIES})
+
+#the UQTK based variant is only built on request, it links against the UQTK libraries
+if(USE_UQTK)
+  add_executable(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
+  target_link_libraries(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
+endif()
+
+#test to verify search functions
+add_executable(testSearch testSearch.cpp)
+target_link_libraries(testSearch dks ${Boost_LIBRARIES})
diff --git a/auto-tuning/testChiSquareRT.cpp b/auto-tuning/testChiSquareRT.cpp
new file mode 100644
index 0000000..01e4ae0
--- /dev/null
+++ b/auto-tuning/testChiSquareRT.cpp
@@ -0,0 +1,385 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+#define N0 0.25
+#define TAU 2.197019
+#define BKG 1.0
+
+#define ALPHA 1.0
+#define BETA 1.0
+
+using namespace std;
+
+void randData(double *data, int N, int scale = 1) { // fills data[0..N) with rand()/RAND_MAX times scale
+  for (int idx = 0; idx < N; ++idx)
+    data[idx] = (static_cast<double>(rand()) / RAND_MAX) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is always time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double t, double lamda) { // exponential: exp(-lamda*t)
+  return exp( -lamda*t );
+}
+
+double ge(double t, double lamda, double beta) { // exp(-(lamda*t)^beta)
+  return exp( -pow(lamda*t, beta) );
+}
+
+double sg(double t, double sigma) { // Gaussian: exp(-(sigma*t)^2 / 2)
+  return exp( -0.5 * pow(sigma*t, 2) );
+}
+
+double stg(double t, double sigma) { // 1/3 + 2/3*(1 - x)*exp(-x/2) with x = (sigma*t)^2
+  double sigmatsq = pow(sigma*t,2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+double sekt(double t, double lambda) { // 1/3 + 2/3*(1 - lambda*t)*exp(-lambda*t)
+  double lambdat = lambda*t;
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+double lgkt(double t, double lambda, double sigma) { // 1/3 + 2/3*(1 - l - s)*exp(-l - s/2), l = lambda*t, s = (sigma*t)^2
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+double skt(double t, double sigma, double beta) { // 1/3 + 2/3*(1 - x)*exp(-x/beta) with x = (sigma*t)^beta
+  if (beta < 1.0e-3) // guards the division by beta below; such small exponents yield 0
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+double spg(double t, double lambda, double gamma, double q) { // 1/3*exp(-rateL) + 2/3*(1 - lamt2q/rateT)*exp(-rateT)
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q; // (lambda*t)^2 * q
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma; // NOTE: gamma == 0 divides by zero
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q); // NOTE: rateT is 0 at t == 0, the quotient below is then 0/0
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+double rahf(double t, double nu, double lambda) { // sum of two damped terms built from nu*t and lambda*t
+  double nut  = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); // 2.44949 is presumably sqrt(6) - confirm
+}
+
+double tf(double t, double phi, double nu) { // cos(2*pi*nu*t + phi), phi is converted from degrees
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD * phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { // alpha-weighted damped cosine plus (1-alpha)-weighted exponential
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi; // phi is converted from degrees
+
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double b(double t, double phi, double nu) { // j0 (Bessel function) of 2*pi*nu*t + phi, phi converted from degrees
+  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { // same shape as ifld with j0 in place of cos
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi; // phi is converted from degrees
+
+  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double ab(double t, double sigma, double gamma) { // exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t))
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt)); // NOTE: gamma == 0 divides by zero
+}
+
+double snkzf(double t, double Delta0, double Rb) { // 1/3 + 2/3*aa^1.5*(1 - D*aa)*exp(-D*aa/2), D = (Delta0*t)^2
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); // aa = 1 / (1 + Rb^2 * D)
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double t, double phi, double nu, double Delta0, double Rb) { // sqrt(aa)*exp(-D*aa/2)*cos(2*pi*nu*t + phi)
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi; // phi is converted from degrees
+  double D0t2 = pow(Delta0*t, 2.0); // D = (Delta0*t)^2
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); // aa = 1 / (1 + Rb^2 * D)
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+double dnkzf(double t, double Delta0, double Rb, double nuc) { // sqrt(aa)*exp(-2*Delta0^2*theta*aa)
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); // NOTE: nuc == 0 divides by zero
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta); // aa = 1 / (1 + 4*(Rb*Delta0)^2*theta)
+
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) { // sqrt(aa)*exp(-Delta0^2*theta*aa)*cos(2*pi*nu*t + phi)
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi; // phi is converted from degrees
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); // NOTE: nuc == 0 divides by zero
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta); // aa = 1 / (1 + 2*(Rb*Delta0)^2*theta)
+
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+
+double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
+		double timeStart, double timeStep, bool mlh = false)
+{
+  // CPU reference for the single histogram fit: chi^2 sum, or the log likelihood sum when mlh is set
+  double sum = 0.0;
+  for (int bin = 0; bin < Ndata; bin++) {
+    const double t = timeStart + bin*timeStep;
+    const double meas = data[bin];
+    const double err = data[bin]; // the data value is also used as its error
+
+    // fixed theory p[0]*f[0]*sg(t, p[1])*tf(t, p[2], f[1]); Npar and Nfnc are not used here
+    const double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
+    const double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG;
+
+    if (!mlh) {
+      if (err != 0.0)
+        sum += ( (theo - meas) * (theo - meas) ) / (err * err);
+      else
+        sum += theo * theo;
+    } else {
+      if ((meas > 1.0e-9) && (fabs(theo) > 1.0e-9))
+        sum += 2.0 * ((theo - meas) + meas * log(meas / theo));
+      else
+        sum += 2.0 * (theo - meas);
+    }
+  }
+
+  return sum;
+}
+
+double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
+		    double timeStart, double timeStep, bool mlh = false)
+{
+  // CPU reference for the asymmetry fit; only the chi^2 branch adds anything to the sum
+  double sum = 0.0;
+  const double alphaBeta = ALPHA * BETA; // same value for every bin
+  for (int bin = 0; bin < Ndata; bin++) {
+    const double t = timeStart + bin*timeStep;
+    const double meas = data[bin];
+    const double err = data[bin]; // the data value is also used as its error
+
+    // fixed theory p[0]*f[0]*sg(t, p[1])*tf(t, p[2], f[1]); Npar and Nfnc are not used here
+    const double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
+    // asymmetry built from the theory value and the ALPHA / BETA constants
+    const double numer = (alphaBeta+1.0)*theoVal - (ALPHA-1.0);
+    const double denom = (ALPHA+1.0) - (alphaBeta-1.0)*theoVal;
+    const double theo = numer/denom;
+
+    if (mlh) {
+      sum += 0.0; //log max likelihood not defined here
+    } else if (err != 0.0) {
+      sum += ( (theo - meas) * (theo - meas) ) / (err * err);
+    } else {
+      sum += theo * theo;
+    }
+  }
+
+  return sum;
+}
+
+int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) {
+
+  int ierr;
+
+  /* 
+   * Histogram size used in tests. If autotune is on, run the kernels with sizes from 1e5 to 1e6.
+   * If autotune is off just run the test once (used for debugging to test the kernel) 
+   */ 
+  int Nstart = 1e5;
+  int Nstep = 1e5;
+  int Nend = (autotune) ? 1e6 : 1e5; 
+
+  //parameter, function and map sizes used in tests
+  int Npar = 66;
+  int Nfnc = 2;
+  int Nmap = 5;
+
+  //print test info
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Max log likelihood: " << std::boolalpha << mlh << endl;
+  cout << "Asymetry fit: " << std::boolalpha << asym << endl;
+
+  //set up DKS for the requested api / device; give up if the device cannot be initialised
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name);
+  dksbase.setDevice(device_name);
+  ierr = dksbase.initDevice();
+  if (ierr != DKS_SUCCESS) {
+    std::cout << "Device not supported!" << std::endl;
+    return DKS_ERROR;
+  }
+
+  //get the list of different devices
+  std::vector<int> devices;
+  dksbase.getDeviceList(devices);
+  std::cout << "Unique devices: " << devices.size() << std::endl;
+
+  //create the function string to use in test (same theory as the cpuChiSq reference functions)
+  string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])";
+  int map[5] = {0, 0, 1, 2, 1};
+
+  //run the tests on every listed device, for Ndata from Nstart to Nend in steps of Nstep
+  for (unsigned int device = 0; device < devices.size(); device++) {
+    for (int Ndata = Nstart; Ndata <= Nend; Ndata += Nstep) {
+
+      dksbase.setDefaultDevice(device);
+
+      std::cout << "Ndata: " << Ndata << std::endl;
+
+      //init the chi square calculations
+      dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap);
+
+      //create random arrays for data, parameter and function storage
+      double *data = new double[Ndata];  
+      double *par = new double[Npar];
+      double *fnc = new double[Nfnc];
+  
+      randData(data, Ndata);
+      randData(par, Npar);
+      randData(fnc, Nfnc, 100);
+
+      //allocate memory on device (NOTE: ierr is not checked after this call)
+      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+
+      //write data, params, functions and maps to the device
+      dksbase.writeData<double>(data_ptr, data, Ndata);
+      dksbase.writeParams(par, Npar);
+      dksbase.writeFunctions(fnc, Nfnc);
+      dksbase.writeMaps(map, Nmap);
+    
+      //set musrfit constants
+      dksbase.callSetConsts(N0, TAU, BKG);
+      dksbase.callSetConsts(ALPHA, BETA);
+
+      //compile the program created with the function string
+      dksbase.callCompileProgram(sFnc, mlh);
+
+      //set autotuning on/off
+      if (autotune)
+	dksbase.setAutoTuningOn();
+      
+      //tmp values to store results and tmp values for time steps and start time
+      double result_gpu = 0.0;
+      double result_cpu = 0.0;
+      double dt = 1e-12;
+      double ts = 1e-7;
+
+      //execute kernel on the device and execute the same function on the cpu;
+      //the same device buffer is passed for both pointer arguments, the first argument is 1 or 2 for the asym case
+      if (!asym) {
+	dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, Npar, Nfnc, 
+				    Nmap, ts, dt, result_gpu);
+	result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);   
+      } else {
+	dksbase.callLaunchChiSquare(2, data_ptr, data_ptr, Ndata, Npar, Nfnc, 
+				    Nmap, ts, dt, result_gpu);
+	result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);
+      }
+
+      //print both results so they can be compared by eye (no automatic check is done)
+      cout << "DKS: " << result_gpu << endl;
+      cout << "CPU: " << result_cpu << endl;
+
+      //free CPU and GPU memory
+      dksbase.freeMemory<double>(data_ptr, Ndata);
+      dksbase.freeChiSquare();
+  
+      delete[] data;
+      delete[] par;
+      delete[] fnc;
+      cout << "------------------------------------------------------------" << endl;
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+int main(int argc, char* argv[]) {
+
+  bool asym = false;
+  bool mlh = false;
+  bool autotune = false;
+
+  //backend chosen on the command line; std::string replaces the former
+  //new char[10] + strcpy buffers, which were never released.
+  //NOTE: the choice is parsed but not forwarded to runTest() below.
+  string api_name = "Cuda";
+  string device_name = "-gpu";
+
+  for (int i = 1; i < argc; ++i) {
+    const string arg(argv[i]);
+    if (arg == "-cuda") {
+      api_name = "Cuda";
+      device_name = "-gpu";
+    }
+    
+    if (arg == "-opencl") {
+      api_name = "OpenCL";
+      device_name = "-gpu";
+    }
+
+    if (arg == "-mic") {
+      api_name = "OpenCL";
+      device_name = "-mic";
+    }
+
+    if (arg == "-cpu") {
+      api_name = "OpenCL";
+      device_name = "-cpu";
+    }
+
+    if (arg == "-mlh")
+      mlh = true;
+
+    if (arg == "-asym")
+      asym = true;
+
+    if (arg == "-autotune")
+      autotune = true;
+
+  }
+
+  //only the first numPlatforms entries of the tables are run
+  int numPlatforms = 2;
+  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
+  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
+  for (int i = 0; i < numPlatforms; i++) {
+    runTest(api[i], device[i], autotune, mlh, asym);
+  }
+
+  return 0;
+}
diff --git a/auto-tuning/testChiSquareRTRandom.cpp b/auto-tuning/testChiSquareRTRandom.cpp
new file mode 100644
index 0000000..b9e9b53
--- /dev/null
+++ b/auto-tuning/testChiSquareRTRandom.cpp
@@ -0,0 +1,450 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+//#define N0 0.25
+#define N0 1e-10
+#define TAU 2.197019
+#define BKG 0.05
+
+using namespace std;
+
+typedef std::function<double()> doubleF;
+
+void randData(double *data, int N, int scale = 1) { // fills data[0..N) with rand()/RAND_MAX times scale
+  for (int idx = 0; idx < N; ++idx)
+    data[idx] = (static_cast<double>(rand()) / RAND_MAX) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is always time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double *t, double *lamda) { // exp(-lamda*t), arguments are read through the pointers
+  return exp( -(*lamda) * (*t) );
+}
+
+double ge(double *t, double *lamda, double *beta) { // exp(-(lamda*t)^beta)
+  return exp( -pow((*lamda) * (*t), (*beta)) );
+}
+
+double sg(double *t, double *sigma) { // exp(-(sigma*t)^2 / 2)
+  return exp( -0.5 * pow((*sigma) * (*t), 2) );
+}
+
+double stg(double *t, double *sigma) { // 1/3 + 2/3*(1 - x)*exp(-x/2) with x = (sigma*t)^2
+  const double sigT2 = pow((*sigma) * (*t), 2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigT2) * exp(-0.5 * sigT2);
+}
+
+double sekt(double *t, double *lambda) { // 1/3 + 2/3*(1 - lambda*t)*exp(-lambda*t)
+  const double lamT = (*lambda) * (*t);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lamT) * exp(-lamT);
+}
+
+double lgkt(double *t, double *lambda, double *sigma) { // 1/3 + 2/3*(1 - l - s)*exp(-l - s/2)
+  const double lamT = (*lambda) * (*t);          // l = lambda*t
+  const double sigT2 = pow((*sigma) * (*t), 2.0); // s = (sigma*t)^2
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lamT - sigT2) * exp(-lamT - 0.5*sigT2);
+}
+
+double skt(double *t, double *sigma, double *beta) { // 1/3 + 2/3*(1 - x)*exp(-x/beta), x = (sigma*t)^beta
+  if ((*beta) < 1.0e-3) // guards the division by beta below
+    return 0.0;
+  const double sigTb = pow((*sigma) * (*t), (*beta));
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigTb) * exp(-sigTb / (*beta));
+}
+
+double spg(double *t, double *lambda, double *gamma, double *q) { // 1/3*exp(-rL) + 2/3*(1 - lt2q/rT)*exp(-rT)
+  const double lamSq = (*lambda) * (*lambda);
+  const double lt2q = (*t) * (*t) * lamSq * (*q);
+  const double rateSq = 4.0 * lamSq * (1.0 - (*q)) * (*t) / (*gamma);
+  const double rL = sqrt(fabs(rateSq));
+  const double rT = sqrt(fabs(rateSq) + lt2q);
+
+  return (1.0/3.0)*exp(-rL) + (2.0/3.0)*(1.0 - lt2q / rT)*exp(-rT);
+}
+
+double rahf(double *t, double *nu, double *lambda) { // two damped terms built from nu*t and lambda*t
+  const double nuT = (*nu) * (*t);
+  const double nuTHalf = (*nu) * (*t) / 2.0;
+  const double lamT = (*lambda) * (*t);
+
+  return (1.0/6.0)*(1.0-nuTHalf)*exp(-nuTHalf) + (1.0/3.0)*(1.0-nuT/4.0)*exp(-0.25*(nuT+2.44949*lamT));
+}
+
+double tf(double *t, double *phi, double *nu) { // cos(2*pi*nu*t + phi), phi converted from degrees
+  const double omegaT = TWO_PI * (*nu) * (*t);
+  const double phase = DEG_TO_RAD * (*phi);
+
+  return cos(omegaT + phase);
+}
+
+double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  const double omegaT = TWO_PI * (*nu) * (*t);
+  const double phase = DEG_TO_RAD * (*phi); // phi converted from degrees
+
+  return (*alpha) * cos(omegaT + phase) * exp(-(*lambdaT) * (*t)) + (1.0 - (*alpha)) * exp(-(*lambdaL) * (*t));
+}
+
+double b(double *t, double *phi, double *nu) { // j0 of 2*pi*nu*t + phi, phi converted from degrees
+  return j0(TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi));
+}
+
+double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  const double omegaT = TWO_PI * (*nu) * (*t);
+  const double phase = DEG_TO_RAD * (*phi); // phi converted from degrees
+
+  return (*alpha) * j0(omegaT + phase) * exp(-(*lambdaT) * (*t)) + (1.0 - (*alpha)) * exp(-(*lambdaL) * (*t));
+}
+
+double ab(double *t, double *sigma, double *gamma) { // exp(-(sigma/gamma)^2 * (exp(-g) - 1 + g)), g = gamma*t
+  const double gamT = (*gamma) * (*t);
+
+  return exp(-pow((*sigma) / (*gamma), 2.0) * (exp(-gamT) - 1.0 + gamT));
+}
+
+double snkzf(double *t, double *Delta0, double *Rb) { // 1/3 + 2/3*a^1.5*(1 - D*a)*exp(-D*a/2)
+  const double dSq = pow((*Delta0) * (*t), 2.0);        // D = (Delta0*t)^2
+  const double a = 1.0 / (1.0 + pow((*Rb), 2.0) * dSq); // a = 1 / (1 + Rb^2 * D)
+
+  return (1.0/3.0) + (2.0/3.0)*pow(a,1.5)*(1.0-dSq*a)*exp(-0.5*dSq*a);
+}
+
+double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) { // sqrt(a)*exp(-D*a/2)*cos(wt + ph)
+  const double omegaT = TWO_PI * (*nu) * (*t);
+  const double phase = DEG_TO_RAD * (*phi);             // phi converted from degrees
+  const double dSq = pow((*Delta0) * (*t), 2.0);        // D = (Delta0*t)^2
+  const double a = 1.0 / (1.0 + pow((*Rb), 2.0) * dSq); // a = 1 / (1 + Rb^2 * D)
+
+  return sqrt(a)*exp(-0.5*dSq*a)*cos(omegaT+phase);
+}
+
+double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) { // sqrt(a)*exp(-2*Delta0^2*th*a)
+  const double nucT = (*nuc) * (*t);
+  const double th = (exp(-nucT) - 1.0 - nucT) / pow((*nuc), 2.0);
+  const double a = 1.0 / (1.0 + 4.0 * pow((*Rb) * (*Delta0), 2.0) * th);
+
+  return sqrt(a)*exp(-2.0 * (*Delta0) * (*Delta0) * th * a);
+}
+
+double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
+  const double omegaT = TWO_PI * (*nu) * (*t);
+  const double phase = DEG_TO_RAD * (*phi); // phi converted from degrees
+  const double nucT = (*nuc) * (*t);
+  const double th = (exp(-nucT) - 1.0 - nucT) / pow((*nuc), 2.0);
+  const double a = 1.0 / (1.0 + 2.0 * pow((*Rb) * (*Delta0), 2.0) * th);
+
+  return sqrt(a)*exp(-(*Delta0) * (*Delta0) * th * a)*cos(omegaT+phase);
+}
+
+double evalf(const std::vector< std::pair<int, doubleF> > &func) {
+  //signed sum of all terms: a sign flag of 1 subtracts the term, every other value adds it.
+  //func and its elements are taken by reference, nothing is copied per call
+  double result = 0.0;
+  for (const auto &f : func) {
+    switch (f.first) {
+    case 0: result += f.second(); break;
+    case 1: result -= f.second(); break;
+    default: result += f.second(); break;
+    }
+  }
+  return result;
+}
+
+double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
+  //CPU reference chi^2. The functions in func are evaluated through evalf(); *t is
+  //overwritten with the time of every bin and keeps the last value written.
+  const double tStart = *t;
+  double chisq = 0.0;
+
+  for (int bin = 0; bin < ndata; bin++) {
+    *t = tStart + bin*dt;
+    const double meas = data[bin];
+    const double err = data[bin]; //the data value is also used as its error
+    const double theory = evalf(func);
+    const double theo = N0 * exp(-(*t)/TAU) * (1.0 + theory) + BKG;
+    const double diff = theo - meas;
+
+    if (err != 0.0)
+      chisq += ( diff * diff ) / (err*err);
+    else
+      chisq += theo * theo;
+  }
+
+  return chisq;
+}
+
+//fill p[0..np) with random values between 0 and 1 (the length np is chosen by the caller)
+void randomParams(double *p, int np) {
+  for (int idx = 0; idx < np; ++idx)
+    p[idx] = static_cast<double>(rand()) / RAND_MAX;
+}
+
+//fill m[0..nm) with random indexes in the range 0 .. max-1
+void randomMaps(int *m, int nm, int max) {
+  for (int idx = 0; idx < nm; ++idx)
+    m[idx] = rand() % max;
+}
+
+int generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc, 
+			   double *t, double *p, int *m, int np, int nm) //np is not used in the body
+{
+
+  //nf defines the number of functions to generate (from 1 to 25)
+  int nf = rand() % 25 + 1;
+
+  for (int n = 0; n < nf; n++) {
+    std::string sf = "";
+    doubleF f;
+
+    int r = rand() % 18; //pick one of the 18 predefined functions (cases 0..17 below)
+
+    int id1 = rand() % nm; //five map indexes are always drawn, even if the picked function uses fewer
+    int id2 = rand() % nm;
+    int id3 = rand() % nm;
+    int id4 = rand() % nm;
+    int id5 = rand() % nm;
+
+    std::string p1 = "p[m[" + to_string(id1) + "]])"; //argument tails for 1..5 parameters, closing parenthesis included
+    std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
+    std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]])";
+    std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
+    std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
+
+    //bind the picked function to the time pointer t and to pointers into p (selected through the map m),
+    //and build the matching call text for the device program in sf
+    switch (r) {
+    case 0:
+      f = std::bind(se, t, &p[m[id1]]);
+      sf = "se(t," + p1;
+      break;
+    case 1:
+      f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ge(t," + p2;
+      break;
+    case 2:
+      f = std::bind(sg, t, &p[m[id1]]);
+      sf = "sg(t, " + p1;
+      break;
+    case 3:
+      f = std::bind(stg, t, &p[m[id1]]);
+      sf = "stg(t, " + p1;
+      break;
+    case 4:
+      f = std::bind(sekt, t, &p[m[id1]]);
+      sf = "sekt(t, " + p1;
+      break;
+    case 5:
+      f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "lgkt(t, " + p2;
+      break;
+    case 6:
+      f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "skt(t, " + p2;
+      break;
+    case 7:
+      f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "spg(t, " + p3;
+      break;
+    case 8:
+      f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "rahf(t, " + p2;
+      break;
+    case 9:
+      f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "tf(t, " + p2;
+      break;
+    case 10:
+      f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ifld(t, " + p5;
+      break;
+    case 11:
+      f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
+      sf = "b(t, " + p2;
+      break;
+    case 12: 
+      f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ib(t, " + p5;
+      break;
+    case 13:
+      f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ab(t, " + p2;
+      break;
+    case 14:
+      f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "snkzf(t, " + p2;
+      break;
+    case 15:
+      f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
+      sf = "snktf(t, " + p4;
+      break;
+    case 16: 
+      f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "dnkzf(t, " + p3;
+      break;
+    case 17: 
+      f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "dnktf(t, " + p5;
+      break;
+    }
+    
+
+    int sign = rand() % 2; //0: the term is added, 1: the term is subtracted
+    if (n == 0) sign = 0; //the first term is always added
+    func.push_back( std::make_pair(sign, f) );
+    if (n == 0)
+      sfunc = sf;
+    else {
+      switch(sign) {
+      case 0: sfunc += " + " + sf; break;
+      case 1: sfunc += " - " + sf; break;
+      default: sfunc += " + " + sf; break;
+      }
+
+    }
+  }
+
+  return nf; //number of generated terms
+}
+
+int main(int argc, char *argv[]) {
+  //seed with the wall clock so that every run generates a different theory function
+  srand(time(NULL));
+
+  int ierr;
+  int Ndata = 1e6;
+
+  bool autotune = false;
+
+  //api / device selection; std::string replaces the leaked new char[10] + strcpy buffers
+  string api_name = "Cuda";
+  string device_name = "-gpu";
+
+  for (int i = 1; i < argc; ++i) {
+    const string arg(argv[i]);
+
+    if (arg == "-cuda") {
+      api_name = "Cuda";
+      device_name = "-gpu";
+    }
+
+    if (arg == "-opencl") {
+      api_name = "OpenCL";
+      device_name = "-gpu";
+    }
+
+    if (arg == "-mic") {
+      api_name = "OpenCL";
+      device_name = "-mic";
+    }
+
+    if (arg == "-cpu") {
+      api_name = "OpenCL";
+      device_name = "-cpu";
+    }
+
+    if (arg == "-autotune") {
+      autotune = true;
+    }
+  }
+
+  //create a random number of parameters
+  int np = ( rand() % (1000 - 50) ) + 50;
+  int nm = ( rand() % (50 - 5) ) + 5;
+  int nf = ( rand() % (50 - 5) ) + 5;
+
+  int *m = new int[nm];
+  double *p = new double[np];
+  double *f = new double[nf];
+
+  randomParams(p, np);
+  randomMaps(m, nm, np);
+  randomParams(f, nf);
+
+  double dt = 1e-10;
+  double t = 1e-10;
+  std::vector< std::pair<int, doubleF> > func;
+  std::string sfunc;
+  int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm);
+
+  //create DKS base object, set and init device / framework
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name.c_str());
+  dksbase.setDevice(device_name.c_str());
+  if (dksbase.initDevice() != DKS_SUCCESS) {
+    cout << "Device not supported!" << endl;
+    delete[] m; delete[] p; delete[] f;
+    return DKS_ERROR;
+  }
+  dksbase.initChiSquare(Ndata, np, nf, nm);
+
+  dksbase.writeParams(p, np);
+  dksbase.writeFunctions(f, nf);
+  dksbase.writeMaps(m, nm);
+
+  dksbase.callSetConsts(N0, TAU, BKG);
+
+  dksbase.callCompileProgram(sfunc);
+
+  if (autotune) 
+    dksbase.setAutoTuningOn();
+
+  int oper = 0;
+  dksbase.getOperations(oper);
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of params: " << np << endl;
+  cout << "Number of maps: " << nm << endl;
+  cout << "Number of predefined functions: " << nfunc << endl;
+  cout << "Number of ptx instructions: " << oper << endl;
+  cout << "------------------------------------------------------------" << endl;
+  cout << sfunc << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //allocate memory on host and device
+  double *data = new double[Ndata];
+  randomParams(data, Ndata);
+  void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+  dksbase.writeData<double>(data_ptr, data, Ndata);
+
+  for (int N = 1e5; N < Ndata + 1; N += 1e5) {
+    double result_dks = 0.0, result_cpu = 0.0;
+
+    t = 1e-10; //cpuChiSq advances t through the pointer, so it is reset for every size
+
+    dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks);
+    result_cpu = cpuChiSq(data, func, N, &t, dt);
+
+    cout << "Npart: " << N << endl;
+    cout << "DKS: " << result_dks << endl;
+    cout << "CPU: " << result_cpu << endl;
+
+  }
+
+  dksbase.freeMemory<double>(data_ptr, Ndata);
+  dksbase.freeChiSquare();
+  delete[] data;
+  delete[] p;
+  delete[] f;
+  delete[] m;
+
+  return 0;
+}
diff --git a/auto-tuning/testChiSquareRTUQTK.cpp b/auto-tuning/testChiSquareRTUQTK.cpp
new file mode 100644
index 0000000..c8602fc
--- /dev/null
+++ b/auto-tuning/testChiSquareRTUQTK.cpp
@@ -0,0 +1,618 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include <cstdio>
+#include <stddef.h>
+#include <fstream>
+#include <math.h>
+#include <time.h>
+#include <getopt.h>
+#include <unistd.h>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#include "Array1D.h"
+#include "Array2D.h"
+#include "Array3D.h"
+#include "error_handlers.h"
+#include "PCSet.h"
+#include "fast_laplace.h"
+#include "uqtktools.h"
+#include "lreg.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+//#define N0 0.25
+#define N0 1e-10
+#define TAU 2.197019
+#define BKG 0.05
+
+using namespace std;
+
+typedef std::function<double()> doubleF;
+
+void randData(double *data, int N, int scale = 1) {
+  //each of the first N entries becomes a rand() sample divided by RAND_MAX, times scale
+  for (int k = 0; k < N; ++k) data[k] = ((double)rand() / RAND_MAX ) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is always time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double *t, double *lamda) { //exp(-lamda * t)
+  return exp(-(*lamda) * (*t));
+}
+//math func + math oper + memory loads
+//1 + 1 + 2
+
+
+double ge(double *t, double *lamda, double *beta) { //exp(-(lamda * t)^beta)
+  return exp(-pow((*lamda) * (*t), *beta));
+}
+//2 + 1 + 3
+
+double sg(double *t, double *sigma) { //exp(-(sigma * t)^2 / 2)
+  return exp(-0.5 * pow((*sigma) * (*t), 2));
+}
+//2 + 2 + 2
+
+double stg(double *t, double *sigma) {
+  const double x = pow((*sigma) * (*t), 2); //x = (sigma * t)^2
+  return (1.0/3.0) + (2.0/3.0) * (1.0 - x) * exp(-0.5 * x);
+}
+
+double sekt(double *t, double *lambda) {
+  //1/3 + 2/3 (1 - x) exp(-x) with x = lambda * t
+  const double x = (*lambda) * (*t);
+  return (1.0/3.0) + (2.0/3.0) * (1.0 - x) * exp(-x);
+}
+
+double lgkt(double *t, double *lambda, double *sigma) {
+  //1/3 + 2/3 (1 - a - b) exp(-a - b/2) with a = lambda * t and b = (sigma * t)^2
+  const double a = (*lambda) * (*t);
+  const double b = pow((*sigma) * (*t), 2.0);
+  return (1.0/3.0) + (2.0/3.0) * (1.0 - a - b) * exp(-a - 0.5 * b);
+}
+
+double skt(double *t, double *sigma, double *beta) {
+  //exponents below 1e-3 are not evaluated: the result is 0 in that case
+  if (*beta < 1.0e-3) return 0.0;
+  const double x = pow((*sigma) * (*t), *beta); //x = (sigma * t)^beta
+  const double damp = exp(-x / (*beta));
+  return (1.0/3.0) + (2.0/3.0) * (1.0 - x) * damp;
+}
+
+double spg(double *t, double *lambda, double *gamma, double *q) {
+  const double l2 = (*lambda) * (*lambda);
+  const double qTerm = (*t) * (*t) * l2 * (*q);
+  const double absRate = fabs(4.0 * l2 * (1.0 - *q) * (*t) / (*gamma));
+  const double slow = sqrt(absRate);          //rate of the 1/3 component
+  const double fast = sqrt(absRate + qTerm);  //rate of the 2/3 component
+  //1/3 exp(-slow) + 2/3 (1 - qTerm / fast) exp(-fast)
+  return (1.0/3.0) * exp(-slow) + (2.0/3.0) * (1.0 - qTerm / fast) * exp(-fast);
+}
+
+double rahf(double *t, double *nu, double *lambda) {
+  const double x = (*nu) * (*t);      //nu * t
+  const double half = x / 2.0;
+  const double y = (*lambda) * (*t);  //lambda * t
+  //1/6 (1 - x/2) exp(-x/2) + 1/3 (1 - x/4) exp(-(x + 2.44949 y) / 4)
+  return (1.0/6.0) * (1.0 - half) * exp(-half) + (1.0/3.0) * (1.0 - x / 4.0) * exp(-0.25 * (x + 2.44949 * y));
+}
+
+double tf(double *t, double *phi, double *nu) {
+  //cos(TWO_PI * nu * t + phi), phi is converted with DEG_TO_RAD
+  const double angle = TWO_PI * (*nu) * (*t);
+  const double shift = DEG_TO_RAD * (*phi);
+  return cos(angle + shift);
+}
+
+double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  const double arg = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
+  const double osc = (*alpha) * cos(arg) * exp(-(*lambdaT) * (*t));
+  const double relax = (1.0 - *alpha) * exp(-(*lambdaL) * (*t));
+  return osc + relax; //alpha-weighted damped cosine plus (1 - alpha)-weighted decay
+}
+
+double b(double *t, double *phi, double *nu) { //j0(TWO_PI * nu * t + DEG_TO_RAD * phi)
+  return j0(TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi));
+}
+
+double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  const double arg = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
+  const double osc = (*alpha) * j0(arg) * exp(-(*lambdaT) * (*t));
+  const double relax = (1.0 - *alpha) * exp(-(*lambdaL) * (*t));
+  return osc + relax; //alpha-weighted damped j0 term plus (1 - alpha)-weighted decay
+}
+
+double ab(double *t, double *sigma, double *gamma) {
+  //exp(-(sigma / gamma)^2 (exp(-x) - 1 + x)) with x = gamma * t
+  const double x = (*gamma) * (*t);
+  const double ratioSq = pow((*sigma) / (*gamma), 2.0);
+  return exp(-ratioSq * (exp(-x) - 1.0 + x));
+}
+
+double snkzf(double *t, double *Delta0, double *Rb) {
+  const double d2 = pow((*Delta0) * (*t), 2.0);             //(Delta0 * t)^2
+  const double a = 1.0 / (1.0 + pow(*Rb, 2.0) * d2);
+  //1/3 + 2/3 a^(3/2) (1 - d2 a) exp(-d2 a / 2)
+  return (1.0/3.0) + (2.0/3.0) * pow(a, 1.5) * (1.0 - d2 * a) * exp(-0.5 * d2 * a);
+}
+
+double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
+  const double arg = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
+  const double d2 = pow((*Delta0) * (*t), 2.0);
+  const double a = 1.0 / (1.0 + pow(*Rb, 2.0) * d2);
+  //sqrt(a) exp(-d2 a / 2) cos(arg)
+  const double envelope = sqrt(a) * exp(-0.5 * d2 * a);
+  return envelope * cos(arg);
+}
+
+double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta);
+  //NOTE(review): theta subtracts nuct while ab() uses (exp(-gt) - 1.0 + gt); confirm the sign against the device kernel
+  return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa);
+}
+
+double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta);
+  //NOTE(review): theta subtracts nuct while ab() uses (exp(-gt) - 1.0 + gt); confirm the sign against the device kernel
+  return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph);
+}
+
+//sum the terms of the theory function: terms tagged 1 are subtracted, all others are added;
+//the list is taken by const reference so no std::function is copied per evaluated data point
+double evalf(const std::vector< std::pair<int, doubleF> > &func) {
+  double result = 0.0;
+  for (const auto &term : func) {
+    const double value = term.second();
+    if (term.first == 1)
+      result -= value;
+    else
+      result += value;
+  }
+  return result;
+}
+
+// CPU reference chi^2. For sample i the shared time *t is set to ts + i*dt (the bound
+// theory functions read it through the pointer); data[i] is used as value and as error.
+double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
+  double result = 0.0;
+  const double ts = *t;
+
+  for (int i = 0; i < ndata; i++) {
+    *t = ts + i*dt;
+    double d = data[i];
+    double e = data[i];
+
+    double vf = evalf(func);
+    double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
+
+    if (e != 0.0)
+      result += ( (theo - d) * (theo - d) ) / (e * e);
+    else
+      result += theo * theo;
+  }
+  *t = ts; //restore the caller's start time so later launches that reuse t see the same value
+  return result;
+}
+
+//fill the first np entries of p with rand() / RAND_MAX; the array length is chosen by the caller
+void randomParams(double *p, int np) {
+  for (int k = 0; k < np; ++k)
+    p[k] = (double)rand() / RAND_MAX;
+}
+
+//fill the first nm entries of m with rand() % max; the array length is chosen by the caller
+void randomMaps(int *m, int nm, int max) {
+  for (int k = 0; k < nm; ++k)
+    m[k] = rand() % max;
+}
+
+void generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc, 
+			    double *t, double *p, int *m, int np, int nm, int nfunc) 
+{
+  //append nfunc random terms to func and write the matching expression text to sfunc; np is not used
+  for (int n = 0; n < nfunc; n++) {
+    std::string sf = "";
+    doubleF f;
+
+    int r = rand() % 18; //randomly choose one of the predefined functions to use
+
+    int id1 = rand() % nm; //randomly select parameters to use in the function
+    int id2 = rand() % nm;
+    int id3 = rand() % nm;
+    int id4 = rand() % nm;
+    int id5 = rand() % nm;
+    //argument-list texts (each already ends with the closing bracket) for 1 to 5 parameters
+    std::string p1 = "p[m[" + to_string(id1) + "]])";
+    std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
+    std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]])";
+    std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
+    std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
+
+    //get a random index from maps and use it to get the parameter value, bind function and parameter
+    //values to f, and create string for gpu in sfunc
+    switch (r) {
+    case 0:
+      f = std::bind(se, t, &p[m[id1]]);
+      sf = "se(t," + p1;
+      break;
+    case 1:
+      f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ge(t," + p2;
+      break;
+    case 2:
+      f = std::bind(sg, t, &p[m[id1]]);
+      sf = "sg(t, " + p1;
+      break;
+    case 3:
+      f = std::bind(stg, t, &p[m[id1]]);
+      sf = "stg(t, " + p1;
+      break;
+    case 4:
+      f = std::bind(sekt, t, &p[m[id1]]);
+      sf = "sekt(t, " + p1;
+      break;
+    case 5:
+      f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "lgkt(t, " + p2;
+      break;
+    case 6:
+      f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "skt(t, " + p2;
+      break;
+    case 7:
+      f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "spg(t, " + p3;
+      break;
+    case 8:
+      f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "rahf(t, " + p2;
+      break;
+    case 9:
+      f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "tf(t, " + p2;
+      break;
+    case 10:
+      f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ifld(t, " + p5;
+      break;
+    case 11:
+      f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
+      sf = "b(t, " + p2;
+      break;
+    case 12: 
+      f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ib(t, " + p5;
+      break;
+    case 13:
+      f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ab(t, " + p2;
+      break;
+    case 14:
+      f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "snkzf(t, " + p2;
+      break;
+    case 15:
+      f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
+      sf = "snktf(t, " + p4;
+      break;
+    case 16: 
+      f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "dnkzf(t, " + p3;
+      break;
+    case 17: 
+      f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "dnktf(t, " + p5;
+      break;
+    }
+    //the first term is always tagged 0 (added); later terms get a random tag, 1 means subtracted
+
+    int sign = rand() % 2;
+    if (n == 0) sign = 0;
+    func.push_back( std::make_pair(sign, f) );
+    if (n == 0)
+      sfunc = sf;
+    else {
+      switch(sign) {
+      case 0: sfunc += " + " + sf; break;
+      case 1: sfunc += " - " + sf; break;
+      default: sfunc += " + " + sf; break;
+      }
+
+    }
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //Samples random chi^2 kernels (-eval / -test) or loads stored samples, then fits a
+  //UQTk regression from the kernel parameters to the tuned launch parameters.
+  srand(time(NULL));
+
+  bool autotune = false;
+  bool eval = false;
+  bool test = false;
+
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  int nord = 15; //the order of the initial, overcomplete basis
+  int loop = 100;
+
+  for (int i = 1; i < argc; ++i) {
+
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    }
+    
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    }
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-mic");
+    }
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+
+    if (argv[i] == string("-autotune")) {
+      autotune = true;
+    }
+
+    if (argv[i] == string("-eval"))
+      eval = true;
+    
+    if (argv[i] == string("-test"))
+      test = true;
+
+    if (argv[i] == string("-nord") && i + 1 < argc) //the flag is ignored when its value is missing
+      nord = atoi(argv[i+1]);
+
+    if (argv[i] == string("-loop") && i + 1 < argc) //the flag is ignored when its value is missing
+      loop = atoi(argv[i+1]);
+
+  }
+
+  //init dks and set chi^2 constants
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name);
+  dksbase.setDevice(device_name);
+  dksbase.initDevice();
+
+  if (autotune)
+    dksbase.setAutoTuningOn();
+
+  int nydim = 2; //the dimensionality of input
+  int nxdim = 5;
+  //UQTk arrays
+  Array2D<double> xdata(loop, nxdim, 0.0);
+  Array2D<double> ydata(loop, nydim, 0.0);
+  
+  Array2D<double> xdata_pce(loop, nxdim, 0.0);
+  Array2D<double> ydata_pce(loop, nydim, 0.0);
+
+  int size = 10000;
+  Array2D<double> xtmp(size, nxdim, 0.0);
+  Array2D<double> ytmp(size, nydim, 0.0);
+
+  if (eval || test) {
+    for (int l = 0; l < loop; l++) {
+
+      int ierr;
+
+      //create a random number of parameters
+      int n = rand() % 9 + 1;
+      int Ndata = n * 100000; //number of data points, 100k to 900k in steps of 100k
+      int np = ( rand() % (1000 - 50) ) + 50; //from 50 to 999 for different shared memory needs
+      int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 49 of the parameters, for different memory access
+      int nf = ( rand() % (50 - 5) ) + 5; //not used in the test case, but changes the shared memory
+      int nfunc = (rand() % (10 - 1) ) + 1; //1 to 9 user defined functions
+    
+      //allocate storage for parameters, maps and functions
+      int *m = new int[nm];
+      double *p = new double[np];
+      double *f = new double[nf];
+
+      //fill with random numbers
+      randomParams(p, np);
+      randomMaps(m, nm, np);
+      randomParams(f, nf);
+
+      //create a random user function that can be passed to GPU kernel and evaluated on the host
+      double dt = 1e-10;
+      double t = 1e-10;
+      std::vector< std::pair<int, doubleF> > func;
+      std::string sfunc;
+      generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc);
+      
+      //create a data array and fill with random values
+      double *data = new double[Ndata];
+      randomParams(data, Ndata);
+
+      
+      //allocate device memory for the data and transfer to the GPU
+      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+      dksbase.writeData<double>(data_ptr, data, Ndata);
+
+      //init chi^2 
+      dksbase.initChiSquare(Ndata, np, nf, nm);
+      dksbase.callSetConsts(N0, TAU, BKG);
+
+      //write params to the device
+      dksbase.writeParams(p, np);
+      dksbase.writeFunctions(f, nf);
+      dksbase.writeMaps(m, nm);
+
+      //compile the kernel with the new function
+      dksbase.callCompileProgram(sfunc);
+    
+      //run the kernel on the GPU and evaluate the function on the host
+      double result_dks, result_cpu, tmp_result;
+      
+      ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, 
+					 t, dt, result_dks);
+      
+      if (ierr == DKS_SUCCESS) {
+	result_cpu = cpuChiSq(data, func, Ndata, &t, dt);
+
+	std::vector<int> config;
+	dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, 
+					t, dt, tmp_result, config);
+      
+	cout << "DKS: " << result_dks << endl;
+	cout << "CPU: " << result_cpu << endl;
+	cout << "Launch parameters: " << config[0] << ", " << config[1] << endl;
+	cout << sfunc << endl;
+	cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
+
+	xdata(l,0) = np;
+	xdata(l,1) = nm;
+	xdata(l,2) = nf;
+	xdata(l,3) = nfunc;
+	xdata(l,4) = Ndata;
+	
+	ydata(l,0) = config[0];
+	ydata(l,1) = config[1];
+
+	std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl;
+      } else {
+	cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
+	cout << sfunc << endl;
+      }
+      
+      
+      //free temporary resources
+      delete[] m;
+      delete[] p;
+      delete[] f;
+      delete[] data;
+      dksbase.freeChiSquare();
+      dksbase.freeMemory<double>(data_ptr, Ndata);
+    }
+  } else {
+    //read_datafileVS(xdata, "xdata.dat");
+    //read_datafileVS(ydata, "ydata.dat");
+    xtmp.SetValue(0.0);
+    ytmp.SetValue(0.0);
+    read_datafileVS(xtmp, "xdata_pce.dat");
+    read_datafileVS(ytmp, "ydata_pce.dat");
+    for (int i = 0; i < loop; i++) {
+      for (int j = 0; j < nxdim; j++)
+	xdata(i,j) = xtmp(i,j);
+      for (int j = 0; j < nydim; j++)
+	ydata(i,j) = ytmp(i,j);
+    }
+  }
+  
+  
+  if (eval) {
+    //copy inputs (nxdim columns) and outputs (nydim columns) in separate loops: ydata and
+    //ydata_pce only have nydim columns and must not be indexed with the nxdim column loop
+    for (int i = 0; i < nxdim; i++) {
+      for (int j = 0; j < loop; j++) {
+	xdata_pce(j,i) = xdata(j,i);
+      }
+    }
+
+    for (int i = 0; i < nydim; i++) {
+      for (int j = 0; j < loop; j++) {
+	ydata_pce(j,i) = ydata(j,i);
+      }
+    }
+  } else {
+    //read_datafileVS(xdata_pce, "xdata_pce.dat");
+    //read_datafileVS(ydata_pce, "ydata_pce.dat");
+    xtmp.SetValue(0.0);
+    ytmp.SetValue(0.0);
+    read_datafileVS(xtmp, "xdata_pce.dat");
+    read_datafileVS(ytmp, "ydata_pce.dat");
+    for (int i = 0; i < loop; i++) {
+      for (int j = 0; j < nxdim; j++)
+	xdata_pce(i,j) = xtmp(i,j);
+      for (int j = 0; j < nydim; j++)
+	ydata_pce(i,j) = ytmp(i,j);
+    }
+    std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl;
+  }
+
+  //default input settings
+  string which_chaos="LU"; //PC type
+  string msc="m";
+  
+  Lreg* reg;
+  reg = new PCreg(which_chaos,nord,nxdim);
+  int nbas = reg->GetNbas();  
+
+  Array2D<double> ypc_data(xdata.XSize(), nydim, 0.0);
+  for (int i = 0; i < nydim; i++) {
+    
+    std::cout << "start dim " << i+1 << std::endl;
+
+    Array1D<double> ydata_1d(xdata_pce.XSize(), 0.0);
+    for (unsigned int j = 0; j < xdata_pce.XSize(); j++)
+      ydata_1d(j) = ydata_pce(j,i);
+
+    std::cout << "setup data" << std::endl;
+    reg->SetupData(xdata_pce,ydata_1d);
+    
+    std::cout << "Comput best lambda" << std::endl;
+    double lambda=reg->LSQ_computeBestLambda();
+    Array1D<double> lam(nbas,lambda);
+
+
+    reg->SetWeights(lam);
+
+    std::cout << "LSQ build regr" << std::endl;
+
+    reg->LSQ_BuildRegr();
+    std::cout << std::endl << "Lambda : " << lambda << std::endl;
+
+    Array1D<double> ypc;
+    Array1D<double> ycheck;
+    Array2D<double> ycheck_cov;
+
+    reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov);
+    std::cout << std::endl << "Eval" << std::endl;
+    
+    for (unsigned int j = 0; j < xdata.XSize(); j++)
+      ypc_data(j,i) = ypc(j);
+
+  }
+
+  if (eval) {
+    write_datafile(xdata_pce, "xdata_pce.dat");
+    write_datafile(ydata_pce, "ydata_pce.dat");
+  }
+
+  write_datafile(xdata, "xdata.dat");
+  write_datafile(ydata, "ydata.dat");
+  write_datafile(ypc_data, "ypc_data.dat");
+
+  return 0;
+}
diff --git a/auto-tuning/testSearch.cpp b/auto-tuning/testSearch.cpp
new file mode 100644
index 0000000..e3b8efe
--- /dev/null
+++ b/auto-tuning/testSearch.cpp
@@ -0,0 +1,22 @@
+#include <iostream>
+
+#include "DKSBaseMuSR.h"
+
+/** No accelerator device is used, this test is used to confirm, that search functions
+ * used for auto-tuning work properly
+ */
+
+int main() {
+  //default-constructed object: no API or device is selected before the search test
+  DKSBaseMuSR base;
+
+  //std::endl is spelled out as newline + flush
+  std::cout << "Start test" << '\n' << std::flush;
+
+  base.testAutoTuning();
+
+  std::cout << "Test finished" << '\n' << std::flush;
+
+  const int exitCode = 0;
+  return exitCode;
+}
diff --git a/cmake/DKSConfig.cmake.in b/cmake/DKSConfig.cmake.in
new file mode 100644
index 0000000..d764963
--- /dev/null
+++ b/cmake/DKSConfig.cmake.in
@@ -0,0 +1,4 @@
+set(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}")
+set(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
+set(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+set(${PROJECT_NAME}_LIBRARY "dks")
\ No newline at end of file
diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
new file mode 100644
index 0000000..c0b848e
--- /dev/null
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -0,0 +1,139 @@
+#.rst:
+# FindOpenCL
+# ----------
+#
+# Try to find OpenCL
+#
+# Once done this will define::
+#
+#   OpenCL_FOUND          - True if OpenCL was found
+#   OpenCL_INCLUDE_DIRS   - include directories for OpenCL
+#   OpenCL_LIBRARIES      - link against this library to use OpenCL
+#   OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2)
+#   OpenCL_VERSION_MAJOR  - The major version of the OpenCL implementation
+#   OpenCL_VERSION_MINOR  - The minor version of the OpenCL implementation
+#
+# The module will also define two cache variables::
+#
+#   OpenCL_INCLUDE_DIR    - the OpenCL include directory
+#   OpenCL_LIBRARY        - the path to the OpenCL library
+#
+
+#=============================================================================
+# Copyright 2014 Matthaeus G. Chajdas
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+function(_FIND_OPENCL_VERSION)
+  # Probes the OpenCL header for CL_VERSION_<x>_<y>, newest first, and publishes the
+  # first hit to the caller as OpenCL_VERSION_STRING, OpenCL_VERSION_MAJOR and
+  # OpenCL_VERSION_MINOR. Nothing is set when no version symbol is found.
+  include(CheckSymbolExists)
+  include(CMakePushCheckState)
+  set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY})
+
+  cmake_push_check_state()
+  foreach(VERSION "2_0" "1_2" "1_1" "1_0")
+    set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}")
+
+    # pick the header to inspect; on Apple the one inside the Framework is preferred
+    if(NOT APPLE)
+      set(_cl_header "${OpenCL_INCLUDE_DIR}/CL/cl.h")
+    elseif(EXISTS "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
+      set(_cl_header "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
+    else()
+      set(_cl_header "${OpenCL_INCLUDE_DIR}/Headers/cl.h")
+    endif()
+
+    check_symbol_exists(
+      CL_VERSION_${VERSION}
+      "${_cl_header}"
+      OPENCL_VERSION_${VERSION})
+
+    if(OPENCL_VERSION_${VERSION})
+      # turn e.g. 1_2 into 1.2 and split it into its numeric components
+      string(REPLACE "_" "." VERSION "${VERSION}")
+      set(OpenCL_VERSION_STRING ${VERSION} PARENT_SCOPE)
+      string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}")
+      list(GET version_components 0 major_version)
+      list(GET version_components 1 minor_version)
+      set(OpenCL_VERSION_MAJOR ${major_version} PARENT_SCOPE)
+      set(OpenCL_VERSION_MINOR ${minor_version} PARENT_SCOPE)
+      break()
+    endif()
+  endforeach()
+  cmake_pop_check_state()
+endfunction()
+
+find_path(OpenCL_INCLUDE_DIR # directory that contains CL/cl.h or OpenCL/cl.h
+  NAMES
+    CL/cl.h OpenCL/cl.h
+  PATHS
+    ENV "PROGRAMFILES(X86)"
+    ENV AMDAPPSDKROOT
+    ENV INTELOCLSDKROOT
+    ENV NVSDKCOMPUTE_ROOT
+    ENV CUDA_PATH
+    ENV ATISTREAMSDKROOT
+  PATH_SUFFIXES
+    include
+    OpenCL/common/inc
+    "AMD APP/include")
+# derive the OpenCL_VERSION_* variables from the header located above
+_FIND_OPENCL_VERSION()
+# NOTE(review): find_path() yields the directory holding libOpenCL.so, not the library file, so OpenCL_LIBRARY is a directory - confirm consumers expect that
+if(CMAKE_SIZEOF_VOID_P EQUAL 4) # 4-byte pointers: search the 32-bit library directories
+  find_path(OpenCL_LIBRARY
+    NAMES libOpenCL.so
+    PATHS
+      ENV "PROGRAMFILES(X86)"
+      ENV AMDAPPSDKROOT
+      ENV INTELOCLSDKROOT
+      ENV CUDA_PATH
+      ENV NVSDKCOMPUTE_ROOT
+      ENV ATISTREAMSDKROOT
+    PATH_SUFFIXES
+      "AMD APP/lib/x86"
+      lib/x86
+      lib/Win32
+      OpenCL/common/lib/Win32)
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) # 8-byte pointers: search the 64-bit library directories
+  find_path(OpenCL_LIBRARY
+    NAMES libOpenCL.so
+    PATHS
+      ENV "PROGRAMFILES(X86)"
+      ENV AMDAPPSDKROOT
+      ENV INTELOCLSDKROOT
+      ENV CUDA_PATH
+      ENV NVSDKCOMPUTE_ROOT
+      ENV ATISTREAMSDKROOT
+    PATH_SUFFIXES
+      "AMD APP/lib/x86_64"
+      lib/x86_64
+      lib/x64
+      OpenCL/common/lib/x64)
+endif()
+
+set(OpenCL_LIBRARIES ${OpenCL_LIBRARY})
+set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# The upstream remark that old CMake versions (Ubuntu 12.04 / Travis CI) lack
+# "FOUND_VAR OpenCL_FOUND" no longer describes this copy: FOUND_VAR is passed
+# below, and the top-level CMakeLists.txt of this project requires CMake 3.2.
+find_package_handle_standard_args(
+  OpenCL FOUND_VAR OpenCL_FOUND
+  REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
+  VERSION_VAR OpenCL_VERSION_STRING)
+
+mark_as_advanced(
+  OpenCL_INCLUDE_DIR
+  OpenCL_LIBRARY)
diff --git a/doc/refman.pdf b/doc/refman.pdf
new file mode 100644
index 0000000..41d9b03
Binary files /dev/null and b/doc/refman.pdf differ
diff --git a/run_tuning_tests.sh b/run_tuning_tests.sh
new file mode 100755
index 0000000..05a4e12
--- /dev/null
+++ b/run_tuning_tests.sh
@@ -0,0 +1,97 @@
+#!/bin/bash 
+export  MIC_ENV_PREFIX=MIC
+echo $MIC_ENV_PREFIX
+export  MIC_OMP_NUM_THREADS=236 # 236 = 59 * 4, the same numbers as in 59c4t0o below
+echo $MIC_OMP_NUM_THREADS
+export MIC_KMP_PLACE_THREADS=59c4t0o
+echo $MIC_KMP_PLACE_THREADS
+export MIC_USE_2MB_BUFFERS=64K
+echo $MIC_USE_2MB_BUFFERS
+export MIC_KMP_AFFINITY=scatter
+echo $MIC_KMP_AFFINITY
+# every value exported above is echoed right after it is set
+#./testFFT3DRC 256 256 256
+# NOTE: all ./testFFT3DRC calls up to the final loop are commented out; only the headings and exported values are printed
+echo 'real strides divisible by 4 but not by 8'
+#./testFFT3DRC 257 244 268
+#./testFFT3DRC 244 268 257
+#./testFFT3DRC 268 257 244
+#./testFFT3DRC 257 268 244
+#./testFFT3DRC 244 257 268
+#./testFFT3DRC 268 244 257
+
+echo 'real strides divisible by 8 but not by 16'
+#./testFFT3DRC 257 248 263
+#./testFFT3DRC 248 263 257
+#./testFFT3DRC 263 257 248
+#./testFFT3DRC 257 263 248
+#./testFFT3DRC 248 257 263
+#./testFFT3DRC 263 248 257
+
+echo 'complex strides divisible by 4 but not by 8'
+#./testFFT3DRC 257 246 268
+#./testFFT3DRC 246 268 257
+#./testFFT3DRC 268 257 246
+#./testFFT3DRC 257 268 246
+#./testFFT3DRC 246 257 268
+#./testFFT3DRC 268 246 257
+
+echo 'complex strides divisible by 8 but not by 16'
+#./testFFT3DRC 257 206 317
+#./testFFT3DRC 206 317 257
+#./testFFT3DRC 317 257 206
+#./testFFT3DRC 257 317 206
+#./testFFT3DRC 206 257 317
+#./testFFT3DRC 317 206 257
+
+echo 'perform scaling tests'
+export  MIC_OMP_NUM_THREADS=1
+echo $MIC_OMP_NUM_THREADS
+export MIC_KMP_PLACE_THREADS=1c1t0o
+echo $MIC_KMP_PLACE_THREADS
+#./testFFT3DRC 256 256 256
+# the next three settings step through 2, 3 and 4 threads with a 1c<N>t0o placement string
+export  MIC_OMP_NUM_THREADS=2
+echo $MIC_OMP_NUM_THREADS
+export MIC_KMP_PLACE_THREADS=1c2t0o
+echo $MIC_KMP_PLACE_THREADS
+#./testFFT3DRC 256 256 256
+
+
+
+export  MIC_OMP_NUM_THREADS=3
+echo $MIC_OMP_NUM_THREADS
+export MIC_KMP_PLACE_THREADS=1c3t0o
+echo $MIC_KMP_PLACE_THREADS
+#./testFFT3DRC 256 256 256
+
+
+export  MIC_OMP_NUM_THREADS=4
+echo $MIC_OMP_NUM_THREADS
+export MIC_KMP_PLACE_THREADS=1c4t0o
+echo $MIC_KMP_PLACE_THREADS
+#./testFFT3DRC 256 256 256
+
+# values of p to run; the thread count of each run is 4 * p
+NUM_PROC="2 4 8 16 32 59"
+for p in $NUM_PROC; do
+	# thread count for this run
+	t=$((p * 4))
+	echo $t
+	export MIC_OMP_NUM_THREADS=$t
+	echo $MIC_OMP_NUM_THREADS
+	# placement string of the form <p>c4t0o
+	mystring="${p}c4t0o"
+	export MIC_KMP_PLACE_THREADS=$mystring
+	echo $MIC_KMP_PLACE_THREADS
+	./testFFT3DRC 256 256 256
+done
+
+
+
+
+
+
+
+
+
diff --git a/src/Algorithms/CMakeLists.txt b/src/Algorithms/CMakeLists.txt
new file mode 100644
index 0000000..0a189b6
--- /dev/null
+++ b/src/Algorithms/CMakeLists.txt
@@ -0,0 +1,14 @@
+set (_SRCS
+  )
+# headers listed here are registered with the build and installed below
+set (_HDRS
+  ChiSquareRuntime.h
+  ImageReconstruction.h
+  CollimatorPhysics.h
+  FFT.h
+  )
+# add_sources / add_headers are not defined in this file
+add_sources (${_SRCS})
+add_headers (${_HDRS})
+
+install (FILES ${_HDRS} DESTINATION include/Algorithms)
\ No newline at end of file
diff --git a/src/Algorithms/ChiSquareRuntime.h b/src/Algorithms/ChiSquareRuntime.h
new file mode 100644
index 0000000..85e8d9f
--- /dev/null
+++ b/src/Algorithms/ChiSquareRuntime.h
@@ -0,0 +1,158 @@
+#ifndef H_CHISQUARE_RUNTIME
+#define H_CHISQUARE_RUNTIME
+
+#include <iostream>
+#include <string>
+#include <sstream>
+#include "../DKSDefinitions.h"
+
+#define BLOCK_SIZE 128
+
+#define FITTYPE_UNDEFINED    0
+#define FITTYPE_SINGLE_HISTO 1
+#define FITTYPE_ASYMMETRY    2
+#define FITTYPE_MU_MINUS     3
+
+class DKSBaseMuSR;
+
+class ChiSquareRuntime {
+  friend class DKSBaseMuSR;
+
+protected:
+  // single histo fit parameter
+  double N0_m;
+  double tau_m;
+  double bkg_m;
+  // asymmetry fit parameter
+  double alpha_m;
+  double beta_m;
+
+  bool initDone_m;
+  void *mem_chisq_m;
+  void *mem_param_m;
+  void *mem_func_m;
+  void *mem_map_m;
+
+  int numBlocks_m;
+  int blockSize_m;
+  // text of the compiled program scanned by getOperations(); null until one is stored
+  char *ptx_m;
+
+  void setN0(double value) {
+    N0_m = value;
+  }
+
+  void setTau(double value) {
+    tau_m = value;
+  }
+
+  void setBKG(double value) {
+    bkg_m = value;
+  }
+
+  void setAlpha(double value) {
+    alpha_m = value;
+  }
+
+  void setBeta(double value) {
+    beta_m = value;
+  }
+
+public:
+
+  /** Default constructor: no memory and no compiled program are attached yet. */
+  ChiSquareRuntime() : initDone_m(false), mem_chisq_m(0), mem_param_m(0), mem_func_m(0), mem_map_m(0), ptx_m(0) { }
+
+  /** Default destructor */
+  virtual ~ChiSquareRuntime() { }
+
+  virtual int compileProgram(std::string function, bool mlh = false) = 0;
+  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
+			      int numpar, int numfunc, int nummap,
+			      double timeStart, double timeStep,
+			      double &result) = 0;
+
+  virtual int writeParams(const double *params, int numparams) = 0;
+  virtual int writeFunc(const double *func, int numfunc) = 0;
+  virtual int writeMap(const int *map, int nummap) = 0;
+  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
+  virtual int freeChiSquare() = 0;
+  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
+
+  /** Set N0, tau and bkg values to use for the kernel.
+   * If values changes between data sets this needs to be called before
+   * every kernel call. Returns DKS_SUCCESS.
+   */
+  int setConsts(double N0, double tau, double bkg) {
+    setN0(N0);
+    setTau(tau);
+    setBKG(bkg);
+    
+    return DKS_SUCCESS;
+  }
+
+  /** Set alpha and beta values to use for the kernel.
+   * If values changes between data sets this needs to be called before
+   * every kernel call. Returns DKS_SUCCESS.
+   */
+  int setConsts(double alpha, double beta) {
+    setAlpha(alpha);
+    setBeta(beta);
+    return DKS_SUCCESS;
+  }
+
+  /** Set number of blocks and threads.
+   *  Used to set parameters obtained from auto-tuning. Only positive values are stored;
+   *  returns DKS_SUCCESS if at least one value was stored and DKS_ERROR otherwise. */
+  int setKernelParams(int numBlocks, int blockSize) {
+    int ierr = DKS_ERROR;
+    if (numBlocks > 0) {
+      numBlocks_m = numBlocks;
+      ierr = DKS_SUCCESS;
+    }
+    if (blockSize > 0) {
+      blockSize_m = blockSize;
+      ierr = DKS_SUCCESS;
+    }
+
+    return ierr;
+  }
+
+  /** Get the number of operations in compiled kernel.
+   *  Counts the lines containing ';' between the line that holds both "fTheory" and ".visible"
+   *  and the next ".visible" line; oper = 0 and DKS_ERROR when no program text is stored. */
+  int getOperations(int &oper) {
+    if (!ptx_m) { oper = 0; return DKS_ERROR; } //std::string must not be built from a null pointer
+    std::string ptx_str(ptx_m);
+    std::istringstream is(ptx_str);
+
+    std::string line;
+    bool start = false;
+    int count = 0;
+    while(std::getline(is, line)) {
+      
+      //when fTheory start enable counting of operations
+      size_t f1 = line.find("fTheory");
+      size_t f2 = line.find(".visible");
+      size_t f3 = line.find(";");
+      if (f1 != std::string::npos && f2 != std::string::npos) {
+	start = true;
+	continue;
+      }
+
+      //exit when the new functions begins
+      if (start && f2 != std::string::npos)
+	break;
+
+      //count operations
+      if (start && f3 != std::string::npos)
+	count++;
+    }
+
+    oper = count;
+    return DKS_SUCCESS;
+  }
+
+};
+
+#endif
diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h
new file mode 100644
index 0000000..b7e8190
--- /dev/null
+++ b/src/Algorithms/CollimatorPhysics.h
@@ -0,0 +1,47 @@
+#ifndef H_COLLIMATOR_PHYSICS
+#define H_COLLIMATOR_PHYSICS
+
+#include <iostream>
+#include <string>
+#include "../DKSDefinitions.h"
+
+class DKSBaseMuSR;
+
+class DKSCollimatorPhysics {
+  friend class DKSBaseMuSR;
+  // Abstract interface: every operation below is pure virtual and returns an int.
+protected:
+  // launch configuration values; nothing in this class assigns them
+  int numBlocks_m;
+  int blockSize_m;
+
+public:
+  
+  virtual ~DKSCollimatorPhysics() { }
+  // NOTE: the last parameter is spelled numpartices (sic)
+  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;
+  // same operation name with one pointer per attribute (label, localID, rx..rz, px..pz)
+  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+				   void *px_ptr, void *py_ptr, void *pz_ptr,
+				   void *par_ptr, int numparticles) = 0;
+  // numaddback is an output reference; its meaning is not visible here - TODO confirm
+  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
+
+  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+				       void *px_ptr, void *py_ptr, void *pz_ptr,
+				       void *par_ptr, int numparticles, int &numaddback) = 0;
+  // usedt defaults to false and streamId to -1 in both push operations
+  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
+				   double dt, double c, bool usedt = false, int streamId = -1) = 0;
+
+  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
+					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
+					    double dt, double c, bool usedt = false, 
+					    int streamId = -1) = 0;
+
+
+};
+
+#endif
diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h
new file mode 100644
index 0000000..b16e5f6
--- /dev/null
+++ b/src/Algorithms/FFT.h
@@ -0,0 +1,43 @@
+#ifndef H_DKS_FFT
+#define H_DKS_FFT
+
+#include <iostream>
+#include <math.h>
+
+#include "../DKSDefinitions.h"
+
+/** Interface of the FFT backends; also remembers the size of a default plan. */
+class DKSFFT {
+protected:
+  int defaultN[3];   // per-dimension sizes that useDefaultPlan() compares against
+  int defaultNdim;   // number of dimensions that useDefaultPlan() compares against
+  /** True only if ndim and each of the first ndim entries of N equal the stored defaults. */
+  bool useDefaultPlan(int ndim, int N[3]) {
+    if (ndim != defaultNdim)
+      return false;
+    for (int d = 0; d < ndim && d < 3; d++)  // one differing dimension is enough to reject
+      if (N[d] != defaultN[d])
+        return false;
+    return true;
+  }
+
+public:
+  virtual ~DKSFFT() { }
+
+  // Operations provided by the backends; all are pure virtual and return an int.
+  virtual int setupFFT(int ndim, int N[3]) = 0;
+  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
+  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
+  virtual int destroyFFT() = 0;
+  virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
+                         int streamId = -1, bool forward = true) = 0;
+  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
+                           int streamId = -1) = 0;
+  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
+                           int streamId = -1) = 0;
+  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
+};
+
+#endif
diff --git a/src/Algorithms/ImageReconstruction.h b/src/Algorithms/ImageReconstruction.h
new file mode 100644
index 0000000..3a6266e
--- /dev/null
+++ b/src/Algorithms/ImageReconstruction.h
@@ -0,0 +1,117 @@
+#ifndef H_IMAGERECONSTRUCTION
+#define H_IMAGERECONSTRUCTION
+
+#include "../DKSDefinitions.h"
+
+#define BLOCK_SIZE 128
+
+struct VoxelPosition {  // three float coordinates
+  float x;
+  float y;
+  float z;
+};
+
+struct ListEvent {  // two 16-bit unsigned bit-fields
+  unsigned detA : 16;
+  unsigned detB : 16;
+};
+
+class ImageReconstruction {
+  // Abstract interface: all operations below are pure virtual and return an int.
+protected:
+  void *m_event_branch;
+
+public:
+
+  virtual ~ImageReconstruction() { }
+  
+  /** Calculate source.
+   *  Places a sphere at each voxel position and calculates the avg value and std value of pixels 
+   *  that are inside this sphere. All the spheres used have the same diameter.
+   */
+  virtual int calculateSource(void *image_space, void *image_position, void *source_position, 
+			      void *avg, void *std, float diameter, int total_voxels, 
+			      int total_sources, int start = 0) = 0;
+
+  /** Calculate background.
+   * Places two spheres at each voxel position, calculates the avg value and std value of pixels
+   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
+   * smaller sphere is given by parameter diameter, diameter of the larger sphere is 2*diameter.
+   */
+  virtual int calculateBackground(void *image_space, void *image_position, void *source_position, 
+				  void *avg, void *std, float diameter, int total_voxels, 
+				  int total_sources, int start = 0) = 0;
+
+  /** Calculate source using different sources.
+   * Places two spheres at each voxel position, calculates the avg value and std value of pixels
+   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of
+   * each sphere is given by the *diameter array.
+   */
+  virtual int calculateSources(void *image_space, void *image_position, void *source_position, 
+			       void *avg, void *std, void *diameter, int total_voxels, 
+			       int total_sources, int start = 0) = 0;
+
+  /** Calculate backgrounds using the diameters given in the *diameter array.
+   * Places two spheres at each voxel position, calculates the avg value and std value of pixels
+   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
+   * smaller sphere is given by the *diameter array, diameter of the larger sphere is 2*diameter
+   * of the smaller sphere.
+   */
+  virtual int calculateBackgrounds(void *image_space, void *image_position, void *source_position, 
+				   void *avg, void *std, void *diameter, int total_voxels, 
+				   int total_sources, int start = 0) = 0;
+
+  /** Generate normalization.
+   * Goes through detector pairs and, if a detector pair crosses the image, launches a separate
+   * kernel that updates voxel values in the image on the slope between these two detectors.
+   */
+  virtual int generateNormalization(void *recon, void *image_position, 
+				 void *det_position, int total_det) = 0; 
+
+
+  /** Calculate forward projection.
+   * For image reconstruction calculates forward projections.
+   * see recon.cpp for details
+   */
+  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
+				void *image_position, int num_events) = 0;
+
+  /** Calculate backward projection.
+   * For image reconstruction calculates backward projections.
+   * see recon.cpp for details
+   */
+  virtual int backwardProjection(void *correction, void *recon_corrector, void *list_data, 
+				 void *det_position, void *image_position, 
+				 int num_events, int num_voxels) = 0;
+
+  /** Set the voxel dimensions on device.
+   * 
+   */
+  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
+
+  /** Set the image edge variables on the device.
+   * 
+   */
+  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
+
+  /** Set the image edge1 on the device.
+   * 
+   */
+  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
+
+  /** Set the minimum crystal distance in one ring values on the device.
+   * 
+   */
+  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
+				  float min_CrystalDist_InOneRing1) = 0;
+
+  /** Set all other required parameters for reconstruction.
+   * 
+   */
+  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
+			float atten_per_mm, float ring_diameter) = 0;
+  
+  
+};
+
+#endif
diff --git a/src/AutoTuning/CMakeLists.txt b/src/AutoTuning/CMakeLists.txt
new file mode 100644
index 0000000..62cd2b6
--- /dev/null
+++ b/src/AutoTuning/CMakeLists.txt
@@ -0,0 +1,21 @@
+set (_SRCS
+  DKSAutoTuning.cpp
+  DKSSearchStates.cpp
+  DKSConfig.cpp
+  )
+
+set (_HDRS
+  DKSAutoTuning.h
+  DKSSearchStates.h
+  DKSAutoTuningTester.h
+  DKSConfig.h
+  )
+
+# Register the files with the ADD_SOURCES / ADD_HEADERS commands (not defined in
+# this file; they must be provided by the CMakeLists.txt that adds this directory)
+# and install the headers under include/AutoTuning.
+
+ADD_SOURCES (${_SRCS})
+ADD_HEADERS (${_HDRS})
+
+install (FILES ${_HDRS} DESTINATION include/AutoTuning)
\ No newline at end of file
diff --git a/src/AutoTuning/DKSAutoTuning.cpp b/src/AutoTuning/DKSAutoTuning.cpp
new file mode 100644
index 0000000..050d1a8
--- /dev/null
+++ b/src/AutoTuning/DKSAutoTuning.cpp
@@ -0,0 +1,302 @@
+#include "DKSAutoTuning.h"
+
+/** Store the DKS base object, the API/device names and the number of timing loops. */
+DKSAutoTuning::DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops)
+  : evaluate_time_m(true),   //timing mode until setFunction() says otherwise
+    api_name_m(api),
+    device_name_m(device),
+    base_m(base),
+    loops_m(loops)
+{
+}
+
+DKSAutoTuning::~DKSAutoTuning() {
+  params_m.clear();  //drops the registered Parameter entries only
+}
+
+/** Copy the values of a search state into the registered parameters, index by index. */
+int DKSAutoTuning::setParameterValues(States state) {
+  const std::size_t nparams = params_m.size();
+
+  //a state must carry exactly one entry per registered parameter
+  if (state.size() != nparams) {
+    DEBUG_MSG("Parameters and states don't match!");
+    return DKS_ERROR;
+  }
+
+  //write each state value through the matching parameter
+  for (std::size_t idx = 0; idx != nparams; ++idx) params_m[idx].setValue(state[idx].value);
+  return DKS_SUCCESS;
+}
+
+/** TODO: might need a better timing for GPU code */
+int DKSAutoTuning::evaluateFunction(double &value) {
+  //value receives either t.gettime() / loops_m (timing mode) or the result of fd_m()
+  int ierr = DKS_ERROR;
+  DKSTimer t;
+  //NOTE(review): f_m / fd_m are called below without checking that they were set - confirm
+  t.init(function_name_m);
+
+  if (evaluate_time_m) {
+    //run for "loop" times and return the average time. 
+    //syncDevice() is used to make sure that nothing is running on the device before the timer starts
+    // and to make sure the function has completed on the device before the time stops 
+    for (int j = 0; j < loops_m; j++) {
+      base_m->syncDevice();
+      t.start();
+      ierr = f_m();
+      base_m->syncDevice();
+      t.stop();
+      if (ierr != DKS_SUCCESS) //exit loop if kernel execution fails
+	break;
+    }
+    //the timer reading is divided by loops_m even when the loop above stopped early;
+    //with loops_m == 0 the loop never runs and ierr stays DKS_ERROR
+    value = t.gettime() / loops_m;
+  } else {
+    value = fd_m();
+    ierr = DKS_SUCCESS;
+  }
+
+  return ierr;
+}
+
+void DKSAutoTuning::clearParameters() {
+  params_m.clear();  //forget every parameter added with addParameter()
+}
+
+/** Exhaustive search over the first two registered parameters.
+ *  Evaluates every (v1, v2) pair on the grid [min, max] with the given step and leaves
+ *  both parameters at the pair with the smallest value returned by evaluateFunction().
+ *  Does nothing with fewer than two parameters or a non-positive step; if no evaluation
+ *  succeeds, the parameters are set back to the values they had on entry.
+ */
+void DKSAutoTuning::exaustiveSearch() {
+
+  DKSTimer t;
+  t.init("exaustive search");
+  t.start();
+
+  if (params_m.size() < 2)
+    return;
+
+  //Parameter only wraps a pointer to the tuned variable, so these copies write through
+  Parameter p1 = params_m[0];
+  Parameter p2 = params_m[1];
+
+  //a non-positive step would never reach max and the loops below would not terminate
+  if (p1.step <= 0.0 || p2.step <= 0.0) {
+    DEBUG_MSG("Parameter step must be positive!");
+    return;
+  }
+
+  double time;
+  double mint = 1000000.0;
+
+  //keep the best values as double: an int would truncate DKS_DOUBLE parameters.
+  //Start from the current values so that a search without success does not reset them.
+  double minv1 = p1.getValue();
+  double minv2 = p2.getValue();
+
+  for (double v1 = p1.min; v1 <= p1.max; v1 += p1.step) {
+    for (double v2 = p2.min; v2 <= p2.max; v2 += p2.step) {
+      p1.setValue(v1);
+      p2.setValue(v2);
+      int ierr = evaluateFunction(time);
+      //only successful evaluations may become the new optimum
+      if (ierr == DKS_SUCCESS && time < mint) {
+        mint = time;
+        minv1 = v1;
+        minv2 = v2;
+      }
+    }
+  }
+
+  //apply the best configuration found
+  p1.setValue(minv1);
+  p2.setValue(minv2);
+  t.stop();
+}
+
+/** Line search: tune the parameters one at a time.
+ *  Each parameter is swept over [min, max] with its step while the others stay fixed,
+ *  and is then set to the value that gave the smallest result of evaluateFunction().
+ *  The best result is carried over from one parameter to the next.
+ */
+void DKSAutoTuning::lineSearch() {
+  DKSTimer t;
+  t.init("line search");
+  t.start();
+
+  double time;
+  int ierr = DKS_ERROR;
+
+  if (params_m.size() < 1) {
+    DEBUG_MSG("Need some parameters to autotune!");
+    return;
+  }
+
+  double mint = 1000000.0;
+  //loop through parameters one at a time; the copies write through their pointers
+  for (auto param : params_m) {
+    //double, not int: an int would truncate the best value of DKS_DOUBLE parameters
+    double minv = param.getValue();
+    //a non-positive step would never reach max, so such a parameter is left alone
+    if (param.step <= 0.0)
+      continue;
+    for (double i = param.min; i <= param.max; i += param.step) {
+      param.setValue(i);
+      ierr = evaluateFunction(time);
+      //keep the value if the run succeeded and beat the best time so far
+      if (ierr == DKS_SUCCESS && time < mint) {
+        mint = time;
+        minv = i;
+      }
+    }
+
+    param.setValue(minv);
+  }
+
+  //DEBUG: print out the found best parameters
+  for (auto param : params_m)
+    std::cout << "Parameter " << param.name << " set to " << param.getValue() << std::endl;
+
+  std::cout << "Best time: " << mint << std::endl;
+
+  t.stop();
+  std::cout << "Line search time: " << t.gettime() << std::endl;
+}
+
+void DKSAutoTuning::hillClimbing(int restart_loops) {
+  //hill climbing with restart_loops random starts, each climbed to the first local optimum
+  DKSTimer t;
+  t.init("hill climbing");
+  t.start();
+
+  std::cout << "hill climbing" << std::endl;
+
+  int ierr;
+  double time_current;
+  double time_next;
+  DKSSearchStates search(params_m);
+  //NOTE(review): the best state is only printed at the end, it is not written back here
+  std::cout << "start " << restart_loops << std::endl;
+
+  for (int i = 0; i < restart_loops; i++) {
+
+
+    //init random current state
+    search.initCurrentState();
+
+    //evaluate current state
+    setParameterValues(search.getCurrentState());
+    ierr = evaluateFunction(time_current);
+
+    //std::cout << "Start iteration " << i+1 << std::endl;
+    //search.printCurrentState(time_current);
+
+    if (ierr == DKS_ERROR)
+      continue;
+   
+    //start the climb from the current state
+    bool topReached = false;
+    while(!topReached) {
+
+      search.getNeighbours();
+
+      //get all the neighbors of the current state
+      bool neighbourFound = false;
+      while (!neighbourFound && search.nextNeighbourExists()) {
+  
+	//evaluate all the neighbors of the current state
+	setParameterValues(search.getNextNeighbour());
+	ierr = evaluateFunction(time_next);
+
+	//search.printNeighbour(time_next);
+
+	if (ierr == DKS_ERROR)
+	  std::cout << "Error evaluating function" << std::endl;
+
+	//move to the first option that improves the solution
+	if (ierr == DKS_SUCCESS && time_next < time_current) {
+	  time_current = time_next;
+	  search.moveToNeighbour();
+	  neighbourFound = true;
+	}
+
+      }
+      
+      //if no better option is found save the state and move to step 1
+      if (!neighbourFound) {
+	search.saveCurrentState(time_current);
+	topReached = true;
+      }
+
+    }
+  }  
+
+  std::cout << std::endl;
+  search.printBest();
+
+  t.stop();
+  std::cout << "hill climbing: " << t.gettime() << std::endl;
+}
+
+/** Simulated annealing over the registered parameters.
+ *  Starts from a random state and, while the temperature goes from Tstart down to 0 in
+ *  steps of Tstep, evaluates a random neighbour: better neighbours are always accepted,
+ *  worse ones with probability exp(-dE/T). The parameters are left at the accepted state.
+ */
+void DKSAutoTuning::simulatedAnnealing(double Tstart, double Tstep) {
+  //a non-positive step would never cool down and the loop below would not terminate
+  if (Tstep <= 0.0) {
+    DEBUG_MSG("Temperature step must be positive!");
+    return;
+  }
+  DKSTimer t;
+  t.init("simulated annealing");
+  t.start();
+  int ierr;
+  double time_current;
+  double time_next;
+  DKSSearchStates search(params_m);
+  //make a random guess and evaluate it
+  search.initCurrentState();
+  setParameterValues(search.getCurrentState());
+  ierr = evaluateFunction(time_current);
+  if (ierr == DKS_ERROR)
+    return;
+  for (double Temp = Tstart; Temp > 0; Temp -= Tstep) {
+    search.printCurrentState(time_current);
+    //calculate all the neighbours of current state
+    search.getNeighbours(10);
+    //make a move to random neighbour and evaluate the runtime
+    setParameterValues(search.getRandomNeighbour());
+    ierr = evaluateFunction(time_next);
+
+    //on failure fall back to the last accepted state instead of the failing neighbour
+    if (ierr == DKS_ERROR) {
+      setParameterValues(search.getCurrentState());
+      return;
+    }
+
+    //accept improvements always, otherwise with probability exp(-dE/T)
+    bool accept = time_next < time_current;
+    if (!accept) {
+      double p = (double)rand() / RAND_MAX;
+      double dE = time_next - time_current;
+      accept = exp(-dE/Temp) > p;
+    }
+    if (accept) {
+      time_current = time_next;
+      search.moveToNeighbour();
+    }
+  }
+
+  //the last evaluated neighbour may have been rejected: restore the accepted state
+  setParameterValues(search.getCurrentState());
+  search.printCurrentState(time_current);
+
+  t.stop();
+  std::cout << "Simulated annealing: " << t.gettime() << std::endl;
+}
+
diff --git a/src/AutoTuning/DKSAutoTuning.h b/src/AutoTuning/DKSAutoTuning.h
new file mode 100644
index 0000000..ca8f3a3
--- /dev/null
+++ b/src/AutoTuning/DKSAutoTuning.h
@@ -0,0 +1,103 @@
+#ifndef DKS_AUTOTUNIG
+#define DKS_AUTOTUNIG
+
+#include <iostream>
+#include <functional>
+#include <vector>
+#include <string>
+#include <fstream>
+#include <cstdlib>
+#include <chrono>
+#include <ctime>
+
+
+#include "../DKSBase.h"
+#include "../Utility/DKSTimer.h"
+#include "DKSSearchStates.h"
+
+typedef std::vector<Parameter> Parameters;
+typedef std::vector<State> States;
+
+class DKSAutoTuning {
+  // Searches values for registered parameters by repeatedly evaluating a bound function.
+private:
+
+  bool evaluate_time_m;
+
+  std::string api_name_m;
+  std::string device_name_m;
+  std::string function_name_m;
+
+  std::function<int()> f_m;
+  std::function<double()> fd_m;
+  Parameters params_m;
+
+  DKSBase *base_m;
+
+  int loops_m;
+
+  /** Update parameters from a state */
+  int setParameterValues(States states);
+
+  /** Evaluate the function and set execution time 
+   *  Returns DKS_ERROR if errors occurred during function execution. 
+   *  Returns DKS_SUCCESS if function executed as planned. 
+   */
+  int evaluateFunction(double &value);
+
+public:
+
+  /** Constructor */
+  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
+
+  /** Destructor */
+  ~DKSAutoTuning();
+
+  /** Set function to auto tune.
+   *  Caller of setFunction is responsible to bind the correct parameters 
+   *  to the function with std::bind.
+   */
+  void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
+    f_m = f;
+    function_name_m = name;
+    evaluate_time_m = evaluate_time;
+  }
+
+  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
+    fd_m = f;
+    function_name_m = name;
+    evaluate_time_m = evaluate_time;
+  }
+
+  /** Set parameter for auto tuning.
+   *  Provide a pointer to a parameter that will be changed during auto-tuning
+   *  and a min-max value for this element
+   */
+  template <typename T1>
+  void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
+    Parameter p(value, min, max, step, name);
+    params_m.push_back(p);
+  }
+
+  /** Delete all added parameters */
+  void clearParameters();
+
+  /** Perform exhaustive search evaluating all the parameter configurations */
+  void exaustiveSearch();
+
+  /** Perform auto-tuning.
+   *  Perform line-search auto-tuning by varying parameters one at a time and keeping other 
+   *  parameters constant.
+   */
+  void lineSearch();  
+
+  /** Perform hill climbing
+   */
+  void hillClimbing(int restart_loops = 1);
+
+  /** Perform simulated annealing to find the parameters */
+  void simulatedAnnealing(double Tstart, double Tstep);
+
+};
+
+#endif
diff --git a/src/AutoTuning/DKSAutoTuningTester.h b/src/AutoTuning/DKSAutoTuningTester.h
new file mode 100644
index 0000000..9c44309
--- /dev/null
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@@ -0,0 +1,33 @@
+#ifndef DKS_TESTAUTOTUNING
+#define DKS_TESTAUTOTUNING
+
+#include <iostream>
+#include <cmath>
+
+/** Test target for the auto-tuning searches: evaluates a two-variable surface at (x, y). */
+class DKSAutoTuningTester {
+
+  friend class DKSBaseMuSR;
+
+private:
+  //evaluation point; reachable only from this class and the friend DKSBaseMuSR
+  double x;
+  double y;
+
+public:
+
+  DKSAutoTuningTester() : x(0.0), y(0.0) { }
+
+  //defined inline: this header has no accompanying source file that could define it
+  ~DKSAutoTuningTester() { }
+
+  /** Value of the surface at the stored point (x, y). */
+  double peaksZ() {
+    double z = 3 * pow(1-x,2) * exp(-pow(x,2) - pow(y+1,2))
+      - 10 * (x/5 - pow(x,3) - pow(y,5)) * exp(-pow(x,2) - pow(y,2))
+      - (1.0/3.0) * exp( - pow(x+1,2) - pow(y,2));
+    return z;
+  }
+};
+
+#endif
diff --git a/src/AutoTuning/DKSConfig.cpp b/src/AutoTuning/DKSConfig.cpp
new file mode 100644
index 0000000..645c6ab
--- /dev/null
+++ b/src/AutoTuning/DKSConfig.cpp
@@ -0,0 +1,163 @@
+#include "DKSConfig.h"
+
+/** Look up $HOME, mark the tree as not loaded and call loadConfigFile(). */
+DKSConfig::DKSConfig() {
+
+  //get home directory; homeset_m records whether the variable was set
+  homedir_m = getenv("HOME");
+  homeset_m = (homedir_m != NULL);
+
+  treeloaded_m = false; //no tree has been loaded at this point
+  loadConfigFile();
+}
+
+DKSConfig::~DKSConfig() {
+  //delete tree_m;
+  //the return value of saveConfigFile() is ignored here
+  saveConfigFile();
+}
+
+
+int DKSConfig::loadConfigFile() {
+  //NOTE: the implementation below is commented out, so this always returns DKS_ERROR
+  int ierr = DKS_ERROR;
+  /*
+  if (homeset_m) {
+    //check if $HOME/.config/DKS exists
+    std::string filename = homedir_m + config_dir + config_file;
+    std::cout << "Check for: " << filename << std::endl;
+    if (fs::exists(filename)) {
+      try {
+	pt::read_xml(filename, tree_m);
+	treeloaded_m = true;
+	ierr = DKS_SUCCESS;
+      } catch (std::exception &e) {
+	DEBUG_MSG("Error loading autotuning file!");
+	treeloaded_m = false;
+	ierr = DKS_ERROR;
+      }
+    }
+  }
+  */
+  return ierr;
+}
+
+
+int DKSConfig::saveConfigFile() {
+  //NOTE: the implementation below is commented out, so this always returns DKS_ERROR
+  int ierr = DKS_ERROR;
+  /*
+  std::string savedir = homedir_m + config_dir;
+  std::string savefile = homedir_m + config_dir + config_file;
+
+  std::cout << savedir << std::endl;
+  std::cout << savefile << std::endl;
+
+  if (homeset_m) {
+    //check if $HOME/.config/DKS directory exists, if not create
+    bool homecreated = false;
+    fs::path p (savedir);
+    if (!fs::is_directory(p))
+      homecreated = fs::create_directory(p);
+
+    try {
+      if (homecreated) {
+	pt::write_xml(savefile, tree_m);
+	ierr =  DKS_SUCCESS;
+      }
+    } catch(std::exception &e) {
+      ierr = DKS_ERROR;
+    }
+
+  }
+  */
+  return ierr;
+}
+
+
+/** Add or update a tuning parameter in the tree.
+ *  The node path is DKS.autotune.api.device.name.func, with whitespace removed from name.
+ *  If an entry under that path has a "size" attribute equal to size, its attribute
+ *  param is set to value; otherwise a new "parameter" entry carrying both attributes
+ *  is appended. Always returns DKS_SUCCESS.
+ */
+int DKSConfig::addConfigParameter(const std::string api, const std::string device,
+                                  const std::string name, const std::string func,
+                                  int size, std::string param, int value) {
+  //keys to access data in the tree
+  std::string device_name = name;
+  device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
+  std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
+  std::string parameter = key + ".parameter";
+  std::string attr_size = "<xmlattr>.size";
+  std::string attr_param = "<xmlattr>." + param;
+
+  //entry that already stores this size, NULL if there is none
+  pt::ptree *existing = NULL;
+
+  //loop through all the items in the node and see if new param needs to be created
+  //or old one updated
+  boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
+  if (child) {
+    BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
+      int oldsize = v.second.get<int>(attr_size,-1);
+      //the last entry with a matching size is the one that gets updated
+      if (size == oldsize)
+        existing = &v.second;
+    }
+  }
+
+  if (existing != NULL) {
+    //if parameter exists update the parameter value
+    existing->put(attr_param, value);
+  } else {
+    //build the new entry as a local object: add_child() stores a copy, nothing is leaked
+    pt::ptree node;
+    node.add(attr_size, size);
+    node.add(attr_param, value);
+    tree_m.add_child(parameter, node);
+  }
+
+  return DKS_SUCCESS;
+}
+
+/** Look up a tuning parameter in the tree.
+ *  Among the entries stored under the key with a positive "size" attribute, the one whose
+ *  size is closest to size is used and its attribute param is written to value. If the
+ *  key or a usable entry is missing, value is set to -1 and DKS_ERROR is returned.
+ */
+int DKSConfig::getConfigParameter(const std::string api, const std::string device,
+                                  const std::string name, const std::string func,
+                                  int size, std::string param, int &value) {
+  //define key and attribute values to find parameters in the tree
+  std::string device_name = name;
+  device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
+  std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
+  std::string attr_size = "<xmlattr>.size";
+  std::string attr_param = "<xmlattr>." + param;
+
+  //default result until a usable entry is found
+  value = -1;
+  bool found = false;
+  float maxDist = std::numeric_limits<float>::max();
+
+  boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
+  if (child) {
+    //loop through parameters and get the one that is closest to the size specified
+    BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
+      int param_size = v.second.get<int>(attr_size,-1); //get parameter size
+      if (param_size > 0) { // if param_size is -1 param is not defined correctly and not usable
+        float dist = abs(param_size - size);
+        if (dist < maxDist) {
+          value = v.second.get<int>(attr_param,-1);
+          maxDist = dist;
+          found = true;
+        }
+      }
+    }
+  }
+  return found ? DKS_SUCCESS : DKS_ERROR;
+}
+
+
+
diff --git a/src/AutoTuning/DKSConfig.h b/src/AutoTuning/DKSConfig.h
new file mode 100644
index 0000000..bf7255a
--- /dev/null
+++ b/src/AutoTuning/DKSConfig.h
@@ -0,0 +1,69 @@
+/** Class to save and load DKS autotuning configs.
+ * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
+ * Uses boost xml_parser to read and write the xml file and boost property tree to store
+ * the xml content.
+ */
+
+#ifndef DKS_CONFIG
+#define DKS_CONFIG
+
+#include <boost/property_tree/ptree.hpp>
+#include <boost/optional/optional.hpp>
+#include <boost/property_tree/xml_parser.hpp>
+#include <boost/foreach.hpp>
+#include <boost/filesystem.hpp>
+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include <exception>
+#include <limits>
+#include <cmath>
+#include <algorithm>
+#include <cctype>
+
+#include "../DKSDefinitions.h"
+
+namespace pt = boost::property_tree;
+namespace fs = boost::filesystem;
+
+const std::string config_dir = "/.config/DKS";
+const std::string config_file = "/autotuning.xml";
+
+class DKSConfig {
+  // Keeps autotuning settings in a boost property tree.
+private:
+
+  pt::ptree tree_m;
+  const char *homedir_m;
+  bool homeset_m;
+  bool treeloaded_m;
+
+public:
+
+  /** Constructor, set home variable.
+   * If home directory is not set config file can not be read or saved
+   */
+  DKSConfig();
+
+  ~DKSConfig();
+
+  /** Load autotuning.xml into tree variable if this file exists */
+  int loadConfigFile();
+  
+  /** Save autotuning.xml file */
+  int saveConfigFile();
+
+  /** Add config parameter to tree */
+  int addConfigParameter(const std::string api, const std::string device,
+			 const std::string name, const std::string func, 
+			 int size, std::string param, int value);
+
+  /** Get config parameter from the tree */
+  int getConfigParameter(const std::string api, const std::string device,
+			 const std::string name, const std::string func,
+			 int size, std::string param, int &value);
+
+
+};
+
+#endif
diff --git a/src/AutoTuning/DKSSearchStates.cpp b/src/AutoTuning/DKSSearchStates.cpp
new file mode 100644
index 0000000..4bfcaba
--- /dev/null
+++ b/src/AutoTuning/DKSSearchStates.cpp
@@ -0,0 +1,233 @@
+#include "DKSSearchStates.h"
+
+/** Build the initial state: one State per parameter with its current value and bounds.
+ *  Also reseeds rand() from the current time. */
+DKSSearchStates::DKSSearchStates(Parameters params) {
+
+  for (Parameters::size_type idx = 0; idx < params.size(); ++idx) {
+    Parameter &src = params[idx];
+    State entry = { src.getValue(), src.min, src.max, src.step };
+    current_state_m.push_back(entry);
+  }
+
+  //the neighbour and best states get the same length as the current state
+  const States::size_type nstates = current_state_m.size();
+  neighbour_state_m.resize(nstates);
+  best_state_m.resize(nstates);
+
+  //no neighbour list exists yet, and any measured time beats the initial best time
+  next_neighbour_m = -1;
+  best_time_m = std::numeric_limits<double>::max();
+
+  srand(time(NULL));
+
+}
+
+DKSSearchStates::~DKSSearchStates() {  //empties the four containers of the search
+  current_state_m.clear();
+  neighbour_state_m.clear();
+  best_state_m.clear();
+  neighbours_m.clear();
+}
+
+/** Get all the possible neighbours of the current state.
+ *  For every parameter the candidate values are value +/- d*step for d = 1..dist that stay
+ *  inside [min, max], plus the value itself. The neighbours are all combinations of these
+ *  candidates except the current state itself; the neighbour iterator is reset to 0.
+ */
+void DKSSearchStates::getNeighbours(int dist) {
+  std::vector< std::vector<double> > values;
+  for (auto& state : current_state_m) {
+    std::vector<double> s;
+    //offset by d*step (not by a single step) so that dist > 1 yields distinct candidates
+    for (int d = dist; d > 0; d--) {
+      if (state.value - d*state.step >= state.min)
+        s.push_back(state.value - d*state.step);
+    }
+
+    s.push_back(state.value);
+
+    for (int d = 1; d < dist + 1; d++) {
+      if (state.value + d*state.step <= state.max)
+        s.push_back(state.value + d*state.step);
+    }
+
+    values.push_back(s);
+  }
+
+  //all combinations of the per-parameter candidates
+  std::vector< std::vector<double> > s {{}};
+  for (auto& u : values) {
+    std::vector< std::vector<double> > r;
+    for(auto& x : s) {
+      for( auto y : u) {
+        r.push_back(x);
+        r.back().push_back(y);
+      }
+    }
+    s.swap(r);
+  }
+
+  //drop the current state itself; the two-iterator erase is also valid if nothing matches
+  std::vector<double> current;
+  for (auto& state : current_state_m)
+    current.push_back(state.value);
+  s.erase(std::remove(s.begin(), s.end(), current), s.end());
+  neighbours_m = s;
+  next_neighbour_m = 0;
+}
+
+/** Replace the current state with the values and bounds of the given parameters. */
+void DKSSearchStates::setCurrentState(std::vector<Parameter> current_state) {
+
+  current_state_m.clear();
+
+  //one State per parameter, in the same order
+  for (std::vector<Parameter>::size_type idx = 0; idx < current_state.size(); ++idx) {
+    Parameter &src = current_state[idx];
+    State entry = { src.getValue(), src.min, src.max, src.step };
+    current_state_m.push_back(entry);
+  }
+}
+
+/** Replace the current state with a copy of the given state vector. */
+void DKSSearchStates::setCurrentState(std::vector<State> current_state) {
+
+  current_state_m.clear();
+
+  //append a field-by-field copy of every entry, in the same order
+  for (std::vector<State>::size_type idx = 0; idx < current_state.size(); ++idx) {
+    const State &src = current_state[idx];
+    State entry = { src.value, src.min, src.max, src.step };
+    current_state_m.push_back(entry);
+  }
+}
+
+/** Set every entry of the current state to a random grid value min + r*step inside
+ *  [min, max] and rebuild the neighbour list. */
+void DKSSearchStates::initCurrentState() {
+  for (auto& s : current_state_m) {
+    //number of grid points; a non-positive step or max < min leaves only min itself,
+    //which also keeps the division and the modulo below away from a zero divisor
+    int values = 1;
+    if (s.step > 0 && s.max >= s.min)
+      values = (s.max - s.min) / s.step + 1;
+    s.value = s.min + (rand() % values) * s.step;
+  }
+
+  getNeighbours();
+}
+
+States DKSSearchStates::getCurrentState() {  //returns a copy of the current state
+  return current_state_m;
+}
+
+/** Advance to the next neighbour and return it.
+ *  If the iterator is outside the neighbour list the neighbour state is returned
+ *  unchanged; the iterator is advanced in either case. */
+States DKSSearchStates::getNextNeighbour() {
+
+  //next_neighbour_m is -1 until a neighbour list has been built, so check both ends
+  if (next_neighbour_m >= 0 && next_neighbour_m < (int)neighbours_m.size()) {
+
+    //get the vector of values for each parameter in the neighbour cell
+    const std::vector<double> &neighbour_values = neighbours_m[next_neighbour_m];
+
+    //set the values to neighbour_state_m, never reading past either vector
+    for (unsigned int n = 0; n < neighbour_state_m.size() && n < neighbour_values.size(); n++)
+      neighbour_state_m[n].value = neighbour_values[n];
+  }
+  next_neighbour_m++;
+  return neighbour_state_m;
+}
+
+/** Pick a random neighbour, make it the current neighbour and return it. With an empty
+ *  list (rand() % 0 is undefined) the values of the current state are returned instead. */
+States DKSSearchStates::getRandomNeighbour() {
+  if (neighbours_m.empty()) {
+    for (unsigned int n = 0; n < neighbour_state_m.size() && n < current_state_m.size(); n++)
+      neighbour_state_m[n].value = current_state_m[n].value;
+    return neighbour_state_m;
+  }
+  int rand_neighbour = rand() % (int)neighbours_m.size();
+  const std::vector<double> &neighbour_values = neighbours_m[rand_neighbour];
+  for (unsigned int n = 0; n < neighbour_state_m.size() && n < neighbour_values.size(); n++)
+    neighbour_state_m[n].value = neighbour_values[n];
+  next_neighbour_m = rand_neighbour + 1;
+  return neighbour_state_m;
+}
+
+/** True while the neighbour iterator is still below the size of the neighbour list. */
+bool DKSSearchStates::nextNeighbourExists() {
+  const int total = (int)neighbours_m.size();
+
+  //same test as "next_neighbour_m < total", written from the other side
+  return total > next_neighbour_m;
+}
+
+/** Make the current neighbour the current state by copying its values. */
+void DKSSearchStates::moveToNeighbour() {
+  //only the values are copied; the neighbour list is not rebuilt here
+  const unsigned int total = current_state_m.size();
+  for (unsigned int idx = 0; idx != total; ++idx) {
+    current_state_m[idx].value = neighbour_state_m[idx].value;
+  }
+}
+
+/** Remember the current state as the best one if current_time beats the best time. */
+void DKSSearchStates::saveCurrentState(double current_time) {
+
+  //nothing to do unless the new time is strictly better
+  if (!(current_time < best_time_m))
+    return;
+
+  //State holds exactly value, min, max and step, so a whole-struct copy is equivalent
+  for (unsigned int idx = 0; idx < current_state_m.size(); ++idx)
+    best_state_m[idx] = current_state_m[idx];
+
+  best_time_m = current_time;
+
+}
+
+
+/** Print the values of the current state followed by the given time on one line. */
+void DKSSearchStates::printCurrentState(double time) {
+  std::cout << "Current state: ";
+  for (unsigned int idx = 0; idx < current_state_m.size(); ++idx)
+    std::cout << current_state_m[idx].value << "\t";
+  std::cout << time << std::endl;
+}
+
+/** Print the current state and, on a second line, the current neighbour. */
+void DKSSearchStates::printInfo() {
+  std::cout << "Current state: ";
+  for (unsigned int idx = 0; idx < current_state_m.size(); ++idx)
+    std::cout << current_state_m[idx].value << "\t";
+  std::cout << std::endl;
+
+  //the neighbour values are only listed while next_neighbour_m is positive
+  const bool visited = next_neighbour_m > 0;
+
+  std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
+  for (unsigned int idx = 0; visited && idx < neighbour_state_m.size(); ++idx)
+    std::cout << neighbour_state_m[idx].value << "\t";
+  std::cout << std::endl;
+}
+
+/** Print the neighbour counter, the current neighbour values and the given time. */
+void DKSSearchStates::printNeighbour(double time) {
+  std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
+  //the values are only listed while next_neighbour_m is positive
+  for (unsigned int idx = 0; next_neighbour_m > 0 && idx < neighbour_state_m.size(); ++idx)
+    std::cout << neighbour_state_m[idx].value << "\t";
+  std::cout << time << std::endl;
+}
+
+/** Print the best time and the values of the best saved state on one line. */
+void DKSSearchStates::printBest() {
+  std::cout << "Best state (" << best_time_m << "): ";
+  //the values are only listed while the best time is positive
+  for (unsigned int idx = 0; best_time_m > 0 && idx < best_state_m.size(); ++idx)
+    std::cout << best_state_m[idx].value << "\t";
+  std::cout << std::endl;
+}
diff --git a/src/AutoTuning/DKSSearchStates.h b/src/AutoTuning/DKSSearchStates.h
new file mode 100644
index 0000000..cdd8fb0
--- /dev/null
+++ b/src/AutoTuning/DKSSearchStates.h
@@ -0,0 +1,162 @@
+#ifndef DKS_SEARCHSTATES
+#define DKS_SEARCHSTATES
+
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
+
+/** Handle to a tunable variable of type int or double.
+ *  Stores a pointer to the variable (which is not owned) together with the search
+ *  range [min, max], the step and a name. Values are exchanged as double; the
+ *  constructors produce no output.
+ */
+class Parameter {
+
+private:
+  int *ivalue;       //target when type == DKS_INT, NULL otherwise
+  double *dvalue;    //target when type == DKS_DOUBLE, NULL otherwise
+  VALUE_TYPE type;
+
+public:
+  //search range and step, kept as double for both value types
+  double min;
+  double max;
+  double step;
+  std::string name;
+
+  /** Wrap an int variable; the bounds and the step are stored as double. */
+  Parameter(int *_value, int _min, int _max, int _step, std::string _name)
+    : ivalue(_value), dvalue(NULL), type(DKS_INT),
+      min((double)_min), max((double)_max), step((double)_step),
+      name(_name) { }
+
+  /** Wrap a double variable. */
+  Parameter(double *_value, double _min, double _max, double _step, std::string _name)
+    : ivalue(NULL), dvalue(_value), type(DKS_DOUBLE),
+      min(_min), max(_max), step(_step),
+      name(_name) { }
+
+  /** Write v to the wrapped variable.
+   *  The value is converted with a cast, so an int target truncates towards zero.
+   */
+  template <typename T>
+  void setValue(T v) {
+    if (type == DKS_INT)
+      *ivalue = (int)v;
+    if (type == DKS_DOUBLE)
+      *dvalue = (double)v;
+  }
+
+  /** Value of the wrapped variable as double, -1.0 for an unknown type. */
+  double getValue() {
+    switch (type) {
+    case DKS_INT:
+      return (double)*ivalue;
+    case DKS_DOUBLE:
+      return *dvalue;
+    };
+    return -1.0;
+  }
+
+};
+
+struct State {  //plain snapshot of one parameter: a value with its range and step
+  double value;
+  double min;
+  double max;
+  double step;
+};
+
+typedef std::vector<Parameter> Parameters;
+typedef std::vector<State> States;
+
+class DKSSearchStates {
+  // Bookkeeping for the searches: current state, its neighbours and the best saved state.
+private:
+
+  States current_state_m;
+  States neighbour_state_m;
+
+  States best_state_m;
+  double best_time_m;
+
+  std::vector< std::vector<double> > neighbours_m;
+  int next_neighbour_m;
+
+public:
+
+  /** Constructor always takes params array as variable.
+   *  Params array is needed to know how many params will be searched and what are the bounds
+   *  of each parameter.
+   */
+  DKSSearchStates(Parameters params);
+
+  ~DKSSearchStates();
+
+  /** Set current state using parameter vector */
+  void setCurrentState(Parameters current_state);
+
+  /** set current state using the state vector */
+  void setCurrentState(States current_state);
+
+  /** init random current state */
+  void initCurrentState();
+
+  /** get current state */
+  States getCurrentState();
+
+  /** get next neighbour state.
+   *  if there is no next neighbour stay at the current neighbour
+   */
+  States getNextNeighbour();
+
+  /** get random neighbour state */
+  States getRandomNeighbour();
+
+  /** calculate all the neighbour states */
+  void getNeighbours(int dist = 1);
+
+  /** Check if there are more neighbours to evaluate
+   *  Return true if more neighbors exist, false if we are at the last neighbour
+   */
+  bool nextNeighbourExists();
+
+  /** move to next neighbour.
+   *  set the current state as the next neighbour, 
+   *  calculate the neighbours of the new current state.
+   */
+  void moveToNeighbour();
+  
+  /** Save the current state and the evaluation time of the current state.
+   *  If evaluation time of the current state is better than the evaluation time of the 
+   *  best state, save the current state as best.
+   */
+  void saveCurrentState(double current_time);
+
+
+  //Print functions - mostly useful for debugging purposes, or for benchmark runs to print the
+  //status of the search
+
+  /** Print current state.
+   *  cout the current state. Mostly used for debugging purposes
+   */
+  void printCurrentState(double time = 0.0);
+
+  /** Print current neighbour info */
+  void printNeighbour(double time = 0.0);
+
+  /** Print info.
+   *  Print the whole info about the search: current state, current neighbour, total neighbors
+   */
+  void printInfo();
+
+  /** Print the best saved state */
+  void printBest();
+
+};
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..df12a31
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,130 @@
+CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
+
+SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+# ADD_SOURCES (<file>...)
+# Append the given files to DKS_SRCS, each prefixed with the path of the calling
+# directory relative to DKS_SRC_DIR. When called from a subdirectory the updated
+# list is also written to the parent scope.
+# This has to stay a macro: it modifies DKS_SRCS in the scope of the caller.
+macro (ADD_SOURCES )
+  file (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+  if (_relPath)
+    foreach (_src ${ARGN})
+      list (APPEND DKS_SRCS "${_relPath}/${_src}")
+    endforeach ()
+    # called from a subdirectory: propagate SRCS to parent directory
+    set (DKS_SRCS ${DKS_SRCS} PARENT_SCOPE)
+  else ()
+    # called from DKS_SRC_DIR itself: file names are used unchanged
+    foreach (_src ${ARGN})
+      list (APPEND DKS_SRCS "${_src}")
+    endforeach ()
+  endif ()
+endmacro ()
+
+# ADD_HEADERS (<file>...)
+# Append the given files to DKS_HDRS, each prefixed with the path of the calling
+# directory relative to DKS_SRC_DIR. When called from a subdirectory the updated
+# list is also written to the parent scope.
+# This has to stay a macro: it modifies DKS_HDRS in the scope of the caller.
+macro (ADD_HEADERS )
+  file (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+  if (_relPath)
+    foreach (_hdr ${ARGN})
+      list (APPEND DKS_HDRS "${_relPath}/${_hdr}")
+    endforeach ()
+    # called from a subdirectory: propagate HDRS to parent directory
+    set (DKS_HDRS ${DKS_HDRS} PARENT_SCOPE)
+  else ()
+    # called from DKS_SRC_DIR itself: file names are used unchanged
+    foreach (_hdr ${ARGN})
+      list (APPEND DKS_HDRS "${_hdr}")
+    endforeach ()
+  endif ()
+endmacro ()
+
+
+# Headers and sources located directly in this directory.
+set (DKS_BASEDIR_HDRS DKSBase.h DKSDefinitions.h)
+set (DKS_BASEDIR_SRCS DKSBase.cpp)
+
+# DKSBaseMuSR is added when CUDA or OpenCL is enabled.
+if (USE_CUDA OR USE_OPENCL)
+  list (APPEND DKS_BASEDIR_HDRS DKSBaseMuSR.h)
+  list (APPEND DKS_BASEDIR_SRCS DKSBaseMuSR.cpp)
+endif ()
+
+# DKSImageReconstruction is added only when CUDA is enabled.
+if (USE_CUDA)
+  list (APPEND DKS_BASEDIR_HDRS DKSImageReconstruction.h)
+  list (APPEND DKS_BASEDIR_SRCS DKSImageReconstruction.cpp)
+endif ()
+
+# register the files in the project wide DKS_HDRS / DKS_SRCS lists
+add_headers (${DKS_BASEDIR_HDRS})
+add_sources (${DKS_BASEDIR_SRCS})
+
+message (STATUS "HEADERS: ${DKS_BASEDIR_HDRS}")
+message (STATUS "SOURCES: ${DKS_BASEDIR_SRCS}")
+
+# Backend directories are added only when the matching USE_<backend> switch is set.
+if (USE_OPENCL)
+  message (STATUS "Add OpenCL source files")
+  add_subdirectory (OpenCL)
+endif ()
+
+if (USE_CUDA)
+  message (STATUS "Add CUDA source files")
+  add_subdirectory (CUDA)
+endif ()
+
+if (USE_MIC)
+  message (STATUS "Add MIC source files")
+  add_subdirectory (MIC)
+endif ()
+
+# These directories are always part of the build.
+add_subdirectory (Utility)
+add_subdirectory (AutoTuning)
+add_subdirectory (Algorithms)
+
+# Libraries both dks targets link against: cudadevrt for CUDA builds, followed
+# by the UQTk libraries when USE_UQTK is set.
+# NOTE(review): the keyword-less target_link_libraries signature is kept on
+# purpose; FindCUDA's cuda_add_library is believed to use the same plain
+# signature and the two forms must not be mixed on one target - confirm before
+# adding PUBLIC/PRIVATE.
+set (DKS_LINK_LIBS)
+if (USE_CUDA)
+  list (APPEND DKS_LINK_LIBS cudadevrt)
+endif ()
+if (USE_UQTK)
+  list (APPEND DKS_LINK_LIBS lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
+endif ()
+
+# Build a default-type and a shared library from the same source list.
+if (USE_CUDA)
+  cuda_add_library (dks ${DKS_SRCS})
+  cuda_add_library (dksshared SHARED ${DKS_SRCS})
+else ()
+  message (STATUS "DKS srcs: ${DKS_SRCS}")
+  add_library (dks ${DKS_SRCS})
+  add_library (dksshared SHARED ${DKS_SRCS})
+endif ()
+
+target_link_libraries (dks ${DKS_LINK_LIBS})
+target_link_libraries (dksshared ${DKS_LINK_LIBS})
+
+install (TARGETS dks DESTINATION lib)
+install (TARGETS dksshared DESTINATION lib)
+install (FILES ${DKS_BASEDIR_HDRS} DESTINATION include)
+
+#IF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc"))
+#  INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/libdksMIC.a DESTINATION build/lib)
+#ENDIF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc"))
+
+
+
diff --git a/src/CUDA/CMakeLists.txt b/src/CUDA/CMakeLists.txt
new file mode 100644
index 0000000..977d570
--- /dev/null
+++ b/src/CUDA/CMakeLists.txt
@@ -0,0 +1,35 @@
+# CUDA headers; registered in DKS_HDRS and installed to include/CUDA.
+set (_HDRS
+  CudaBase.cuh
+  CudaFFT.cuh
+  CudaGreensFunction.cuh
+  CudaChiSquare.cuh
+  CudaCollimatorPhysics.cuh
+  CudaImageReconstruction.cuh
+  CudaChiSquareRuntime.cuh
+  )
+
+# CUDA sources; registered in DKS_SRCS.
+set (_SRCS
+  CudaBase.cu
+  CudaFFT.cu
+  CudaGreensFunction.cu
+  CudaChiSquare.cu
+  CudaCollimatorPhysics.cu
+  CudaImageReconstruction.cu
+  CudaChiSquareRuntime.cu
+  )
+
+# Kernel source installed as a plain file; it is not part of _SRCS.
+set (_KERNELS
+  NVRTCKernels/CudaChiSquareKernel.cu
+  )
+
+#INCLUDE_DIRECTORIES (
+#  ${CMAKE_CURRENT_SOURCE_DIR}
+#)
+
+add_sources (${_SRCS})
+add_headers (${_HDRS})
+
+install (FILES ${_HDRS} DESTINATION include/CUDA)
+install (FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
+
+
diff --git a/src/CUDA/CMakeListsLibcuda.txt b/src/CUDA/CMakeListsLibcuda.txt
new file mode 100644
index 0000000..a94b877
--- /dev/null
+++ b/src/CUDA/CMakeListsLibcuda.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required (VERSION 2.8)
+
+find_package (CUDA REQUIRED)
+
+# flags handed to nvcc: build device code for sm_30
+set (CUDA_NVCC_FLAGS "-arch=sm_30")
+
+# library type passed to cuda_add_library below
+set (LIB_TYPE STATIC)
+
+set (DKS_CUDA_HDRS
+  CudaBase.cuh
+  CudaFFT.cuh
+  CudaGreensFunction.cuh
+  )
+
+set (DKS_CUDA_SRCS
+  CudaBase.cu
+  CudaFFT.cu
+  CudaGreensFunction.cu
+  )
+
+include_directories (
+  ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# LIB_TYPE used to be set but never used, so the library type silently followed
+# BUILD_SHARED_LIBS; pass it explicitly so the library is always built STATIC.
+cuda_add_library (cudadks ${LIB_TYPE} ${DKS_CUDA_SRCS})
\ No newline at end of file
diff --git a/src/CUDA/CudaBase.cu b/src/CUDA/CudaBase.cu
new file mode 100644
index 0000000..f352cf2
--- /dev/null
+++ b/src/CUDA/CudaBase.cu
@@ -0,0 +1,386 @@
+#include "CudaBase.cuh"
+
+//=====================================//
+//============Cuda kernels=============//
+//=====================================//
+
+/** Initialise one curand state per thread.
+ *  The thread with global index gid calls curand_init with seed + gid and 0 for
+ *  both remaining integer arguments and stores the result in state[gid].
+ *  Threads with gid >= size do nothing.
+ */
+__global__ void initcuRandState(curandState *state, int size, int seed = 0) {
+
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //threads past the end of the array have nothing to initialise
+  if (gid >= size)
+    return;
+
+  curand_init(seed + gid, 0, 0, &state[gid]);
+
+}
+
+
+//=====================================//
+//==========Private functions==========//
+//=====================================//	
+
+
+//====================================//
+//==========Public functions==========//
+//====================================//	
+
+/** Create an object with no registered streams and no curand states.
+ *  The default stream (-1) is selected and space for 10 streams is reserved.
+ */
+CudaBase::CudaBase() {
+
+  currentStream = -1;
+  cudaStreams.reserve(10);
+
+  //start from defined values so that cuda_getCurandStates() and
+  //cuda_getCublas() never hand out an uninitialised pointer / handle
+  defaultCublas = NULL;
+  defaultRndState = NULL;
+  defaultRndSet = -1;
+
+}
+		
+/** Destroy every stream in the stream list, then free the curand states
+ *  if they were created.
+ */
+CudaBase::~CudaBase() {
+  cuda_deleteStreams();
+  cuda_deleteCurandStates();
+}
+
+/*
+  Create curandStates.
+  Allocates "size" curandState entries on the device and initialises entry i
+  with the seed time(NULL) + i. States created by an earlier call are freed
+  first.
+  Return: DKS_SUCCESS, or DKS_ERROR if size is negative, the allocation fails
+  or the init kernel can not be launched.
+*/
+int CudaBase::cuda_createCurandStates(int size) {
+
+  //a negative size would wrap around to a huge allocation request
+  if (size < 0) {
+    DEBUG_MSG("Invalid number of curand states: " << size);
+    return DKS_ERROR;
+  }
+
+  if (defaultRndSet == 1)
+    cuda_deleteCurandStates();
+
+  int threads = 128;
+  int blocks = size / threads + 1;
+  int seed = time(NULL);
+
+  cudaError_t cerror = cudaMalloc(&defaultRndState, sizeof(curandState)*size);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to allocate curand states, cuda error: " << cerror);
+    return DKS_ERROR;
+  }
+
+  initcuRandState<<<blocks, threads>>>(defaultRndState, size, seed);
+
+  //a failed launch is only visible through cudaGetLastError
+  cerror = cudaGetLastError();
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to init curand states, cuda error: " << cerror);
+    cudaFree(defaultRndState);
+    return DKS_ERROR;
+  }
+
+  //mark the states as valid only once everything succeeded
+  defaultRndSet = 1;
+
+  return DKS_SUCCESS;
+}
+
+/** Free the curand state array if it was created (defaultRndSet == 1) and
+ *  mark it as not set. Always returns DKS_SUCCESS.
+ */
+int CudaBase::cuda_deleteCurandStates() {
+
+  //nothing to release unless the states were created before
+  if (defaultRndSet != 1)
+    return DKS_SUCCESS;
+
+  cudaFree(defaultRndState);
+  defaultRndSet = -1;
+
+  return DKS_SUCCESS;
+}
+
+/** Return the pointer stored in defaultRndState, i.e. the array allocated by
+ *  cuda_createCurandStates(). No check is made that the states were created.
+ */
+curandState* CudaBase::cuda_getCurandStates() {
+  return defaultRndState;
+}
+
+/*
+  Create a new cuda stream and append it to the stream list.
+  On success streamId is set to the index of the new stream in the list;
+  on failure streamId is left untouched and DKS_ERROR is returned.
+*/
+int CudaBase::cuda_createStream(int &streamId) {
+
+  cudaStream_t newStream;
+  const cudaError_t status = cudaStreamCreate(&newStream);
+
+  if (status == cudaSuccess) {
+    cudaStreams.push_back(newStream);
+    //the id handed back is the position of the stream in the list
+    streamId = cudaStreams.size() - 1;
+    return DKS_SUCCESS;
+  }
+
+  DEBUG_MSG("Failed to create new CUDA stream, cuda error: " << status);
+  return DKS_ERROR;
+}
+
+/*
+  Append an already existing stream to the stream list and set streamId to
+  its index in the list. Always returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_addStream(cudaStream_t tmpStream, int &streamId) {
+
+  cudaStreams.push_back(tmpStream);
+
+  //the new stream is the last entry of the list
+  const int lastIndex = cudaStreams.size() - 1;
+  streamId = lastIndex;
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  Delete a single stream: not implemented.
+  The id argument is ignored, no stream is destroyed and DKS_ERROR is always
+  returned.
+*/
+int CudaBase::cuda_deleteStream(int id) {
+  //TODO: lets see if this is necessary, currently do nothing
+  return DKS_ERROR;
+}
+
+/*
+  Destroy every stream in the stream list (created here or added with
+  cuda_addStream), empty the list and switch back to the default stream.
+  Always returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_deleteStreams() {
+
+  std::vector<cudaStream_t>::iterator it;
+  for (it = cudaStreams.begin(); it != cudaStreams.end(); ++it)
+    cudaStreamDestroy(*it);
+
+  cudaStreams.clear();
+
+  //no user stream is left, so select the default stream
+  currentStream = -1;
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  Set stream id: store id as the current stream.
+  The id is not checked against the stream list. Always returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_setStream(int id) {
+  currentStream = id;
+  return DKS_SUCCESS;
+}
+
+/*
+  Return the id of the current stream; -1 stands for the default stream
+  (the value set by the constructor and by cuda_defaultStream).
+*/
+int CudaBase::cuda_getStreamId() {
+  return currentStream;
+}
+
+/*
+  Set the default stream as the stream to use by resetting the current
+  stream id to -1. Always returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_defaultStream() {
+  currentStream  = -1;
+  return DKS_SUCCESS;
+}
+
+/** Return the number of streams in the stream list (not an error code). */
+int CudaBase::cuda_numberOfStreams() {
+  return cudaStreams.size();
+}
+
+/** Return the stream registered under id.
+ *  Ids outside the stream list - including -1, the id used for the default
+ *  stream - yield stream 0 (the CUDA default stream) instead of reading
+ *  outside the vector.
+ */
+cudaStream_t CudaBase::cuda_getStream(int id) {
+  if (id < 0 || id >= (int)cudaStreams.size())
+    return 0;
+
+  return cudaStreams[id];
+}
+
+/** Return the cuBLAS handle stored in defaultCublas.
+ *  NOTE(review): the handle is not created anywhere in this file; presumably
+ *  it is set up by code with access to the protected member - confirm.
+ */
+cublasHandle_t CudaBase::cuda_getCublas() {
+  return defaultCublas;
+}
+	
+/*
+  Print all available cuda devices to stdout: the device count followed by
+  name, PCI bus id, PCI device id and PCI domain id of every device.
+  Always returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_getDevices() {
+
+  std::cout << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::cout << "=============CUDA=============" << std::endl;
+  std::cout << "==============================" << std::endl;
+
+  //if the query fails report "no devices" instead of using an
+  //uninitialised count
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+
+  std::cout << ndev << std::endl;
+
+  for (int i = 0; i < ndev; i++) {
+
+    //skip devices whose properties can not be read
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
+      continue;
+
+    std::cout << "Device " << i+1 << ":" << std::endl;
+    std::cout << "Name: " << prop.name << std::endl;
+    std::cout << "PCI bus id: " << prop.pciBusID << std::endl;
+    std::cout << "PCI device id: " << prop.pciDeviceID << std::endl;
+    std::cout << "PCI domain id: " << prop.pciDomainID << std::endl;
+    std::cout << "==============================" << std::endl;
+  }
+
+  return DKS_SUCCESS;
+
+}
+	
+
+/** Set ndev to the number of CUDA devices.
+ *  If the query fails ndev is set to 0, so the caller never sees an undefined
+ *  count. Always returns DKS_SUCCESS.
+ */
+int CudaBase::cuda_getDeviceCount(int &ndev) {
+  cudaError_t cerror = cudaGetDeviceCount(&ndev);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to get device count, cuda error: " << cerror);
+    ndev = 0;
+  }
+  return DKS_SUCCESS;
+}
+
+/** Set device_name to the name of the currently selected CUDA device.
+ *  Returns DKS_ERROR and leaves device_name untouched when the device count is
+ *  not positive, DKS_SUCCESS otherwise.
+ */
+int CudaBase::cuda_getDeviceName(std::string &device_name) {
+
+  int count = 0;
+  cudaGetDeviceCount(&count);
+
+  //without a device there is no name to report
+  if (count <= 0)
+    return DKS_ERROR;
+
+  int current = 0;
+  cudaGetDevice(&current);
+
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, current);
+  device_name = prop.name;
+
+  return DKS_SUCCESS;
+}
+
+/** Select the CUDA device to use.
+ *  An index outside [0, number of devices) falls back to device 0.
+ *  Returns DKS_ERROR if no device is available or the device can not be
+ *  selected, DKS_SUCCESS otherwise.
+ */
+int CudaBase::cuda_setDevice(int device) {
+  int ierr = DKS_SUCCESS;
+  int ndev = 0;
+  cudaGetDeviceCount(&ndev);
+
+  std::cout << "Init: " << device << "\t" << ndev << std::endl;
+
+  if (ndev <= 0)
+    return DKS_ERROR;
+
+  if (device >= 0 && device < ndev) {
+    //report the device that is selected (this used to print the device count)
+    std::cout << "set device to: " << device << std::endl;
+  } else {
+    //negative or too large index: use the first device
+    device = 0;
+  }
+
+  cudaError_t cerror = cudaSetDevice(device);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to set device, cuda error: " << cerror);
+    ierr = DKS_ERROR;
+  }
+
+  return ierr;
+}
+
+/** Append to devices the index of the first device of every distinct device
+ *  name. The vector is not cleared first. Always returns DKS_SUCCESS.
+ */
+int CudaBase::cuda_getUniqueDevices(std::vector<int> &devices) {
+
+  std::vector< std::string > names;
+
+  //if the query fails treat it as "no devices" instead of looping over an
+  //uninitialised count
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+
+  for (int i = 0; i < ndev; i++) {
+
+    //skip devices whose properties can not be read
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
+      continue;
+
+    //keep the device only if its name was not seen before; the first device
+    //needs no special case since the name list starts out empty
+    const std::string target = prop.name;
+    if (std::find(names.begin(), names.end(), target) == names.end()) {
+      devices.push_back(i);
+      names.push_back(target);
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  Set up cuda device: placeholder.
+  Only prints "set up" to stdout and returns DKS_SUCCESS.
+*/
+int CudaBase::cuda_setUp() {
+
+  std::cout << "set up" << std::endl;
+  return DKS_SUCCESS;
+}
+	
+/*
+  Allocate size bytes of memory on the cuda device.
+  ierr is set to DKS_SUCCESS or DKS_ERROR. The pointer returned by cudaMalloc
+  is handed back; it is still NULL (its initial value) unless cudaMalloc
+  changed it.
+*/
+void * CudaBase::cuda_allocateMemory(size_t size, int &ierr) {
+
+  void *devPtr = NULL;
+  const cudaError status = cudaMalloc((void **) &devPtr, size);
+
+  if (status == cudaSuccess) {
+    ierr = DKS_SUCCESS;
+    return devPtr;
+  }
+
+  DEBUG_MSG("Failed to allocate memory, cuda error: " << status);
+  std::cout << "Error: " << cudaGetErrorString(status) << std::endl;
+  ierr = DKS_ERROR;
+
+  return devPtr;
+}
+		
+/*
+  Info: free memory on device (cudaFree)
+  Return: DKS_SUCCESS, or DKS_ERROR if cudaFree reports an error
+*/
+int CudaBase::cuda_freeMemory(void * mem_ptr) {
+
+  const cudaError status = cudaFree(mem_ptr);
+  if (status == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error freeing memory, cuda error: " << status);
+  return DKS_ERROR;
+}
+
+/** Free host memory with cudaFreeHost.
+ *  Returns DKS_SUCCESS, or DKS_ERROR if cudaFreeHost reports an error.
+ */
+int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
+
+  const cudaError status = cudaFreeHost(mem_ptr);
+  if (status == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error freeing host memory, cuda error: " << status);
+  return DKS_ERROR;
+}
+		
+/*
+  Info: allcate memory and write data (push)
+  Return: pointer to memory object
+*/
+/*
+  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
+
+  void * mem_ptr;
+  mem_ptr = cuda_allocateMemory(size, ierr);
+	
+  if (ierr == DKS_SUCCESS)
+  ierr = cuda_writeData(mem_ptr, in_data, size);
+		
+  return mem_ptr;
+  }
+*/
+		
+/*
+  Info: read data and free memory (pull)
+  Return: success or error code
+*/
+/*
+  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
+
+  ierr = cuda_readData(mem_ptr, out_data, size);
+  if (ierr == DKS_SUCCESS)
+  ierr = cuda_freeMemory(mem_ptr);	
+  else
+  return DKS_ERROR;
+	
+	
+  if (ierr == DKS_SUCCESS)	
+  return DKS_SUCCESS;
+  else
+  return DKS_ERROR;
+  }
+*/
+
+/*
+  Info: execute function - placeholder, only prints "Execute function" to stdout
+  Return: always DKS_SUCCESS
+*/
+int CudaBase::cuda_executeFunction() {
+
+  std::cout << "Execute function" << std::endl;
+  return DKS_SUCCESS;
+}
+		
+/*
+  Info: clean up - placeholder, only prints "clean up" to stdout;
+        no resources are released here
+  Return: always DKS_SUCCESS
+*/
+int CudaBase::cuda_cleanUp() {
+
+  std::cout << "clean up" << std::endl;
+  return DKS_SUCCESS;
+	
+}
diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh
new file mode 100644
index 0000000..325016d
--- /dev/null
+++ b/src/CUDA/CudaBase.cuh
@@ -0,0 +1,390 @@
+#ifndef H_CUDA_BASE
+#define H_CUDA_BASE
+
+#include "../DKSDefinitions.h"
+
+#include <iostream>
+#include <stdio.h>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include <cublas_v2.h>
+#include <curand_kernel.h>
+#include <nvToolsExt.h>
+#include <time.h>
+
+class CudaBase {
+
+private:
+	
+  int currentStream;
+  std::vector<cudaStream_t> cudaStreams;
+
+protected: 
+
+  cublasHandle_t defaultCublas;
+
+  curandState *defaultRndState;
+  int defaultRndSet;
+
+public:
+	
+  CudaBase();
+		
+  ~CudaBase();
+
+  /**
+   * Init cuda random number (cuRand) states.
+   * Create an array of type curandState  with "size" elements on the GPU
+   * and create a curandState with different seed for each array entry.
+   * Return success or error code
+   */
+  int cuda_createCurandStates(int size);
+
+  /**
+   * Delete curandState.
+   * Delete curandState array on the GPU and free memory.
+   *  Return success or error code
+   */
+  int cuda_deleteCurandStates();
+
+  /** Get a pointer to curand states
+   *
+   */
+  curandState* cuda_getCurandStates();
+	
+  /**
+   * Create a cuda stream and set streamId to the index referring to this stream.
+   * Return success or error code
+   */
+  int cuda_createStream(int &streamId);
+
+  /**
+   * add existing cuda stream to the list.
+   * Return: success or error code.
+   */
+  int cuda_addStream(cudaStream_t tmpStream, int &streamId);
+
+  /**
+   * delete cuda stream
+   * success or error code
+   */
+  int cuda_deleteStream(int id);
+
+  /**
+   * delete all streams
+   * success or error code
+   */
+  int cuda_deleteStreams();
+
+  /**
+   * set stream to use
+   * success or error code
+   */
+  int cuda_setStream(int id);
+
+  /**
+   * Info: get stream that is used
+   * Return: id of the current stream
+   */
+  int cuda_getStreamId();
+
+  /**
+   * Info: reset to default stream
+   * Return: success or error code
+   */
+  int cuda_defaultStream();
+
+  /**
+   * Info: get number of streams
+   * Return: number of streams in the stream list
+   */
+  int cuda_numberOfStreams();
+
+  /**
+   * Info: get stream
+   * Return: stream
+   */
+  cudaStream_t cuda_getStream(int id);
+
+  /**
+   * Get default cuBLAS handle
+   */
+  cublasHandle_t cuda_getCublas();
+
+  /**
+   * Info: get information on cuda devices
+   * Return: success or error code
+   */
+  int cuda_getDevices();
+
+  /** Get CUDA device count.
+   *  Sets the number of devices on the platform that can use CUDA.
+   *  Returns DKS_SUCCESS
+   */
+  int cuda_getDeviceCount(int &ndev);
+
+  /** Get the name of the device.
+   *  Query the device properties of the used device and set the string device_name
+   */
+  int cuda_getDeviceName(std::string &device_name);
+
+  /** Set CUDA device to use.
+   *  If the device passed in is not smaller than the number of devices, fall back to
+   *  device 0. Return DKS_ERROR if no CUDA device is available.
+   */
+  int cuda_setDevice(int device);
+
+  /** Get unique devices
+   *  Get array of indices of the unique CUDA devices available on the platform
+   */
+  int cuda_getUniqueDevices(std::vector<int> &devices);
+	
+  /**
+   * Info: init device
+   * Return: success or error code
+   */
+  int cuda_setUp();
+	
+  /**
+   * Info: allocate memory on cuda device
+   * Return: pointer to memory object
+   */
+  void * cuda_allocateMemory(size_t size, int &ierr);
+
+  /**
+   * Info: allocate host memory in pinned memory
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_allocateHostMemory(T *&ptr, size_t size) {
+    //size counts elements of type T, the byte count is sizeof(T)*size
+    const cudaError status = cudaMallocHost((void**)&ptr, sizeof(T)*size);
+
+    if (status == cudaSuccess)
+      return DKS_SUCCESS;
+
+    return DKS_ERROR;
+  }
+
+  /** 
+   * Info: write data to memory
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_writeData(T * mem_ptr, const void * in_data, size_t size, int offset = 0) {
+
+    //offset advances the destination in elements of T, size is given in bytes
+    T *dst = mem_ptr + offset;
+    const cudaError status = cudaMemcpy(dst, in_data, size, cudaMemcpyHostToDevice);
+
+    if (status == cudaSuccess)
+      return DKS_SUCCESS;
+
+    DEBUG_MSG("Error copying data to device, cuda error: " << status);
+    return DKS_ERROR;
+  }
+
+  /**
+   * Info: write data asynchronously
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_writeDataAsync(T *mem_ptr, const void *in_data, size_t size, int streamId = -1, int offset = 0) {
+
+    //if default stream or no stream specified, use default write method and
+    //report its result (the result used to be dropped)
+    if (streamId == -1)
+      return cuda_writeData(mem_ptr, in_data, size, offset);
+
+    //reject every id that does not refer to a registered stream; negative ids
+    //used to slip through and index the stream list out of bounds
+    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
+      DEBUG_MSG("Error invalid stream id: " << streamId);
+      return DKS_ERROR;
+    }
+
+    //call async write; offset is in elements of T, size in bytes
+    cudaError cerror = cudaMemcpyAsync(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice,
+				       cuda_getStream(streamId));
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error async data copy, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+		
+  /**
+   * Info: read data from memory
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_readData(const T * mem_ptr, void * out_data, size_t size, int offset = 0) {
+
+    //offset advances the source in elements of T, size is given in bytes
+    const T *src = mem_ptr + offset;
+    const cudaError status = cudaMemcpy(out_data, src, size, cudaMemcpyDeviceToHost);
+
+    if (status == cudaSuccess)
+      return DKS_SUCCESS;
+
+    DEBUG_MSG("Error reading data from device");
+    return DKS_ERROR;
+  }
+
+  /**
+   * Info: read data async from device memory
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_readDataAsync(const T *mem_ptr, void *out_data, size_t size, int streamId = -1, int offset = 0) {
+    cudaError cerror;
+
+    //-1 selects the default stream (0)
+    if (streamId == -1) {
+      cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, 0);
+      if (cerror != cudaSuccess) {
+	DEBUG_MSG("Error async read from devie default stream");
+	return DKS_ERROR;
+      }
+      return DKS_SUCCESS;
+    }
+
+    //reject every id that does not refer to a registered stream; negative ids
+    //used to slip through and index the stream list out of bounds
+    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
+      DEBUG_MSG("Error invalid stream id: " << streamId);
+      return DKS_ERROR;
+    }
+
+    //offset is in elements of T, size in bytes
+    cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost,
+			     cuda_getStream(streamId));
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error async read from device, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+		
+  /**
+   * Info: free memory on device
+   * Return: success or error code
+   */
+  int cuda_freeMemory(void * mem_ptr);
+	
+  /**
+   * Info: free page locked memory on host
+   * Return: success or error code
+   */
+  int cuda_freeHostMemory(void * mem_ptr);
+	
+  /**
+   * Info: allocate memory and write data (push)
+   * Return: pointer to memory object
+   */
+  template<typename T>
+  void * cuda_pushData(const void * in_data, size_t size, int &ierr) {
+
+    //size is in bytes, it is used for both the allocation and the copy
+    void *devPtr = cuda_allocateMemory(size, ierr);
+
+    //the pointer is returned even if allocation or copy failed, ierr tells
+    if (ierr != DKS_SUCCESS)
+      return devPtr;
+
+    ierr = cuda_writeData((T*)devPtr, in_data, size);
+    return devPtr;
+  }
+		
+  /**
+   * Info: read data and free memory (pull)
+   * Return: success or error code
+   */
+  template<typename T>
+  int cuda_pullData(T * mem_ptr, void * out_data, size_t size, int &ierr) {
+
+    //if the read fails the device memory is not freed
+    ierr = cuda_readData(mem_ptr, out_data, size);
+    if (ierr != DKS_SUCCESS)
+      return DKS_ERROR;
+
+    ierr = cuda_freeMemory(mem_ptr);
+    return (ierr == DKS_SUCCESS) ? DKS_SUCCESS : DKS_ERROR;
+  }
+
+  /**
+   * Info: execute function
+   * Return: success or error code
+   */
+  int cuda_executeFunction();
+		
+  /**
+   * Info: clean up
+   * Return: success or error code
+   */
+  int cuda_cleanUp();
+  
+  /**
+   * Info: sync cuda device
+   * Return: success or error code
+   */
+  int cuda_syncDevice() {
+    //report a failed synchronisation instead of always returning success
+    cudaError cerror = cudaDeviceSynchronize();
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error synchronizing device, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Page-lock host memory
+   */
+  template<typename T>
+  int cuda_hostRegister(T *ptr, int size) {
+    //size is the number of elements of type T
+    const int cerr = cudaHostRegister(ptr, size*sizeof(T), cudaHostRegisterPortable);
+
+    if (cerr != cudaSuccess) {
+      DEBUG_MSG("Host memroy was not page locked");
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Release page locked memory
+   */
+  template<typename T>
+  int cuda_hostUnregister(T *ptr) {
+    //undo cudaHostRegister for the memory at ptr
+    const int cerr = cudaHostUnregister(ptr);
+    return (cerr == cudaSuccess) ? DKS_SUCCESS : DKS_ERROR;
+  }
+
+  /**
+   * Info: print device memory info (total, used, avail)
+   * Return: success or error code
+   */
+  int cuda_memInfo() {
+    //bytes are printed in units of 10^6 bytes
+    const double mb = 1000000.0;
+    size_t freeBytes;
+    size_t totalBytes;
+
+    const int ierr = cudaMemGetInfo(&freeBytes, &totalBytes);
+    if (ierr != cudaSuccess) {
+      DEBUG_MSG("Device mem info could not be obtained!");
+      return DKS_ERROR;
+    }
+
+    std::cout << "Device memory info, total: " << totalBytes / mb << "MB,\t"
+	      << "used: " << (totalBytes - freeBytes) / mb << "MB,\t"
+	      << "avail: " << freeBytes / mb << "MB" << std::endl;
+
+    return DKS_SUCCESS;
+  }
+
+};
+
+#endif
diff --git a/src/CUDA/CudaChiSquare.cu b/src/CUDA/CudaChiSquare.cu
new file mode 100644
index 0000000..db7f4f7
--- /dev/null
+++ b/src/CUDA/CudaChiSquare.cu
@@ -0,0 +1,287 @@
+#include "CudaChiSquare.cuh"
+
+//simple kernel version: the element to compute is selected by the block index
+//only, blockIdx.x is the bin and blockIdx.y the sensor; thread indices are not
+//used. Writes the chi square contribution of element sensor * n + bin.
+__global__ void kernelPHistoTFFcn(double *data, double *par, double *chisq, 
+				  double fTimeResolution, double fRebin, int n) {
+
+  const int bin = blockIdx.x;
+  const int sensor = blockIdx.y;
+  const int pos = sensor * n + bin;
+
+  const double tau = 2.197019;
+  const double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+  const double t = dt0 + fTimeResolution * fRebin * bin;
+
+  const double w = par[0]*0.08516155035269027;
+
+  const double measured = data[pos];
+
+  //four parameters per sensor starting at par[2]; 1.74532925199432955e-2 = pi/180
+  const double theo =  par[2 + sensor*4] * exp(-t/tau) * (1.0 + par[3 + sensor*4] * exp(-0.5 * pow(par[1]*t,2.0) ) * cos(w * t+par[4+sensor*4] * 1.74532925199432955e-2) ) + par[5+sensor*4];
+
+  //a zero entry can not be used as divisor
+  chisq[pos] = (measured != 0.0)
+    ? (theo - measured) * (theo - measured) / measured
+    : theo * theo;
+
+}
+
+//second version: blockIdx.x selects the bin, the loop runs over all s sensors
+//so that the time dependent factors are evaluated once per bin
+__global__ void kernelPHistoTFFcn_2(double *data, double *par, double *chisq, 
+				    double fTimeResolution, double fRebin, int n, int s) {
+
+  const int bin = blockIdx.x;
+
+  const double tau = 2.197019;
+  const double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+  const double t = dt0 + fTimeResolution * fRebin * bin;
+  const double w = par[0]*0.08516155035269027;
+
+  //factors that do not depend on the sensor
+  const double tt = exp(-t/tau);
+  const double pp = exp(-0.5 * par[1] * t * par[1] * t);
+  const double wt = w * t;
+
+  for (int sensor = 0; sensor < s; sensor++) {
+    const int pos = sensor * n + bin;
+    const double measured = data[pos];
+
+    //four parameters per sensor starting at par[2]; 1.74532925199432955e-2 = pi/180
+    const double theo = par[2 + sensor*4] * tt * (1.0 + par[3 + sensor*4] * pp * cos(wt + par[4+sensor*4] * 1.74532925199432955e-2) ) + par[5+sensor*4];
+
+    //a zero entry can not be used as divisor
+    chisq[pos] = (measured != 0.0)
+      ? (theo - measured) * (theo - measured) / measured
+      : theo * theo;
+  }
+
+}
+
+#define TAU 2.197019
+
+//third version: one thread per bin, the parameters are staged in dynamic
+//shared memory (numpar doubles, sized by the launch configuration)
+__global__ void kernelPHistoTFFcn_3(double *data, double *par, double *chisq, 
+				    double fTimeResolution, double fRebin, 
+				    int length, int sensors, int numpar) {
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory; step by the block size so
+  //that all numpar values are copied even when numpar > blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+
+    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+    double time = dt0 + fTimeResolution * fRebin * j;  
+    double w = p[0]*0.08516155035269027;
+    double tt = exp(-time/TAU);
+    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
+    double wt = w * time;
+
+    int idx;
+    double ldata, theo;
+    for (int i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      ldata = data[idx];
+
+      //four parameters per sensor starting at p[2]; 1.74532925199432955e-2 = pi/180
+      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; 
+
+      //a zero entry can not be used as divisor
+      if (ldata != 0.0)
+	chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
+      else
+	chisq[idx] = theo * theo;
+    }
+  }
+
+}
+
+//one thread per bin, loop over the sensors; result[idx] receives the term
+//(theo - data) + data*log(data/theo) for bins at or after
+//t0[i] + fGoodBinOffset/fRebin and 0 for earlier bins
+__global__ void kernelSingleGaussTF(double *data, unsigned int *t0, double *par, double *result,
+				    double fTimeResolution, double fRebin, double fGoodBinOffset,
+				    int length, int sensors, int numpar)
+{
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory; step by the block size so
+  //that all numpar values are copied even when numpar > blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    //read from the shared copy like all other parameters (was par[0])
+    double w1 = p[0]*0.08516155035269027;
+
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int  i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+	ldata = data[idx];
+	time = dt0 + fTimeResolution * fRebin* (j - lft0);
+	theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
+					*cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; 
+	// 1.74532925199432955e-2 = pi/180
+
+	//the log term is only evaluated when both values are clearly non zero
+	if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
+	  result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+	else
+	  result[idx] = theo - ldata;
+      } else {
+	result[idx] = 0;
+      }
+    }
+  }
+
+}
+
+//one thread per bin, loop over the sensors; same layout as kernelSingleGaussTF
+//but with two exponentially damped cosine terms and five parameters per sensor
+//starting at p[4]
+__global__ void kernelDoubleLorentzTF(double *data, unsigned int *t0, double *par, double *result,
+				      double fTimeResolution, double fRebin, double fGoodBinOffset,
+				      int length, int sensors, int numpar)
+{
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory; step by the block size so
+  //that all numpar values are copied even when numpar > blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    double w1 = p[0]*0.08516155035269027;
+    double w2 = p[2]*0.08516155035269027;
+
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int  i = 0; i < sensors; i++) {
+
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+	ldata = data[idx];
+	time = dt0+fTimeResolution*fRebin*(j-lft0);
+
+	theo = p[4+i*5]*exp(-time/TAU)*
+	  (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
+	   cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
+	   (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
+	   cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5]; 
+	// 1.74532925199432955e-2 = pi/180
+
+	//the log term is only evaluated when both values are clearly non zero
+	if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+	  result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+	else
+	  result[idx] = theo - ldata;
+      } else {
+	result[idx] = 0;
+      }
+    }
+  }
+}
+
+
+
+/** Launch kernelPHistoTFFcn_3 for sensors x length elements and store the
+ *  cublasDasum (sum of absolute values) of mem_chisq in result.
+ *  mem_data, mem_ptr (the parameters) and mem_chisq are used as arrays of
+ *  double. Returns DKS_ERROR if the cuBLAS call fails, DKS_SUCCESS otherwise.
+ */
+int CudaChiSquare::cuda_PHistoTFFcn(void *mem_data, void *mem_ptr, void *mem_chisq,
+				    double fTimeResolution, double fRebin,
+				    int sensors, int length, int numpar,
+				    double &result)
+{
+
+  //one thread per bin; the parameters go to dynamic shared memory
+  const int blockSize = 128;
+  const int gridSize = length / blockSize + 1;
+
+  double *d_data = (double*)mem_data;
+  double *d_par = (double*)mem_ptr;
+  double *d_chisq = (double*)mem_chisq;
+
+  kernelPHistoTFFcn_3<<<gridSize, blockSize, numpar*sizeof(double) >>>(d_data, d_par, d_chisq,
+								       fTimeResolution,
+								       fRebin, length,
+								       sensors, numpar);
+
+  //add up the per element contributions on the device
+  const cublasStatus_t status = cublasDasum(m_base->cuda_getCublas(), sensors*length,
+					    d_chisq, 1, &result);
+  if (status == CUBLAS_STATUS_SUCCESS)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("cublas asum failed");
+  return DKS_ERROR;
+}
+
+
+/** Launch kernelSingleGaussTF for sensors x length elements and store two
+ *  times the cublasDasum (sum of absolute values) of mem_result in result.
+ *  mem_data, mem_par and mem_result are used as arrays of double, mem_t0 as
+ *  array of unsigned int. Returns DKS_ERROR if the cuBLAS call fails (result is
+ *  then not scaled), DKS_SUCCESS otherwise.
+ */
+int CudaChiSquare::cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+				      double fTimeResolution, double fRebin, double fGoodBinOffset,
+				      int sensors, int length, int numpar,
+				      double &result)
+{
+
+  int threads = 128;
+  int blocks = length / threads + 1;
+
+  kernelSingleGaussTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
+								    (unsigned int*)mem_t0,
+								    (double*)mem_par,
+								    (double*)mem_result,
+								    fTimeResolution,
+								    fRebin,
+								    fGoodBinOffset,
+								    length, sensors, numpar);
+
+  //check the reduction like cuda_PHistoTFFcn does instead of using a result
+  //that was never written
+  cublasStatus_t status;
+  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas asum failed");
+    return DKS_ERROR;
+  }
+
+  result = 2.0 * result;
+
+  return DKS_SUCCESS;
+
+}
+
+
+/** Launch kernelDoubleLorentzTF for sensors x length elements and store two
+ *  times the cublasDasum (sum of absolute values) of mem_result in result.
+ *  mem_data, mem_par and mem_result are used as arrays of double, mem_t0 as
+ *  array of unsigned int. Returns DKS_ERROR if the cuBLAS call fails (result is
+ *  then not scaled), DKS_SUCCESS otherwise.
+ */
+int CudaChiSquare::cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+					double fTimeResolution, double fRebin, double fGoodBinOffset,
+					int sensors, int length, int numpar,
+					double &result)
+{
+
+  int threads = 128;
+  int blocks = length / threads + 1;
+
+  kernelDoubleLorentzTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
+								      (unsigned int*)mem_t0,
+								      (double*)mem_par,
+								      (double*)mem_result,
+								      fTimeResolution,
+								      fRebin,
+								      fGoodBinOffset,
+								      length, sensors, numpar);
+
+  //check the reduction like cuda_PHistoTFFcn does instead of using a result
+  //that was never written
+  cublasStatus_t status;
+  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas asum failed");
+    return DKS_ERROR;
+  }
+
+  result = 2.0 * result;
+
+  return DKS_SUCCESS;
+
+}
diff --git a/src/CUDA/CudaChiSquare.cuh b/src/CUDA/CudaChiSquare.cuh
new file mode 100644
index 0000000..588dec5
--- /dev/null
+++ b/src/CUDA/CudaChiSquare.cuh
@@ -0,0 +1,59 @@
+#ifndef H_CUDA_CHISQUARE
+#define H_CUDA_CHISQUARE
+
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "CudaBase.cuh"
+
+/** Host side entry points for the chi square / likelihood kernels.
+ *  The object works on a CudaBase that is either handed in by the caller or
+ *  created (and later deleted) by the object itself.
+ */
+class CudaChiSquare {
+
+private:
+
+  //true if m_base was allocated by this object and has to be deleted by it
+  bool base_create;
+  CudaBase *m_base;
+
+public:
+
+  /**
+   * Use an existing CudaBase; it is not deleted by the destructor
+   */
+  CudaChiSquare(CudaBase *base) : base_create(false), m_base(base) { }
+
+  /* default constructor: create an own CudaBase */
+  CudaChiSquare() : base_create(true), m_base(new CudaBase()) { }
+
+  /* destructor: delete the CudaBase only if it was created here */
+  ~CudaChiSquare() {
+    if (!base_create)
+      return;
+
+    delete m_base;
+  }
+
+  /* PHistoTFFcn calculation; the sum over all elements is stored in result */
+  int cuda_PHistoTFFcn(void * mem_data, void * mem_par, void * mem_chisq, 
+		       double fTimeResolution, double fRebin,
+		       int sensors, int length, int numpar,
+		       double &result);
+
+  /* single gauss TF calculation; the scaled sum is stored in result */
+  int cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			 double fTimeResolution, double fRebin, double fGoodBinOffset,
+			 int sensors, int length, int numpar,
+			 double &result);
+
+  /* double lorentz TF calculation; the scaled sum is stored in result */
+  int cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			   double fTimeResolution, double fRebin, double fGoodBinOffset,
+			   int sensors, int length, int numpar,
+			   double &result);
+
+
+};
+
+#endif
diff --git a/src/CUDA/CudaChiSquareRuntime.cu b/src/CUDA/CudaChiSquareRuntime.cu
new file mode 100644
index 0000000..ebbbcd0
--- /dev/null
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@@ -0,0 +1,313 @@
+#include "CudaChiSquareRuntime.cuh"
+
+CudaChiSquareRuntime::CudaChiSquareRuntime(CudaBase *base) {
+  // borrow the caller's CudaBase; the destructor must not delete it
+  base_create = false;
+  m_base = base;
+
+  // no PTX has been produced yet
+  ptx_m = NULL;
+
+  // launch configuration: fixed block size, block count derived per launch (-1)
+  numBlocks_m = -1;
+  blockSize_m = BLOCK_SIZE;
+
+  setUpContext();
+}
+
+//default constructor: owns a freshly created CudaBase and sets up the driver context
+CudaChiSquareRuntime::CudaChiSquareRuntime() {
+  // this object creates its own CudaBase and is responsible for deleting it
+  base_create = true;
+  m_base = new CudaBase();
+
+  // no PTX has been produced yet
+  ptx_m = NULL;
+
+  // launch configuration: fixed block size, block count derived per launch (-1)
+  numBlocks_m = -1;
+  blockSize_m = BLOCK_SIZE;
+
+  setUpContext();
+}
+
+//free resources
+CudaChiSquareRuntime::~CudaChiSquareRuntime() {
+  // Release the cuBLAS handle and the temporary device buffers FIRST, while the
+  // driver context created in setUpContext() is still alive. Destroying the
+  // context beforehand (as was done previously) leaves freeChiSquare() operating
+  // on resources whose context no longer exists.
+  // NOTE(review): assumes the buffers/handle were created while context_m was
+  // current - confirm against CudaBase.
+  freeChiSquare();
+
+  // host copy of the compiled PTX (delete[] on NULL is a no-op)
+  delete[] ptx_m;
+
+  // now the context itself can go
+  cuCtxDestroy(context_m);
+
+  // only delete the CudaBase when this object created it
+  if (base_create)
+    delete m_base;
+}
+
+void CudaChiSquareRuntime::setUpContext() {
+  // temporary chi-square buffers are not allocated until initChiSquare() runs
+  initDone_m = false;
+
+  // default values for the scalar kernel arguments
+  bkg_m = 1.0;
+  tau_m = 1.0;
+  N0_m = 1.0;
+
+  // initialise the driver API and create a context on device 0
+  // (return codes are not checked here, same as before)
+  cuInit(0);
+  cuDeviceGet(&cuDevice_m, 0);
+  cuCtxCreate(&context_m, 0, cuDevice_m);
+}
+
+//build program string
+// Reads the chi-square kernel source file and appends the user supplied theory
+// function wrapped as "<cudaFunctHeader>return <function>;<cudaFunctFooter>".
+// Returns an empty string when the kernel file cannot be opened, so callers can
+// detect the failure instead of crashing on a NULL FILE*.
+std::string CudaChiSquareRuntime::buildProgram(std::string function) {
+
+  // build the kernel file path with std::string: no fixed-size buffer to
+  // overflow and nothing to leak
+  std::string kernel_file(OPENCL_KERNELS);
+  kernel_file += "CUDA/NVRTCKernels/CudaChiSquareKernel.cu";
+
+  //read kernels from file
+  FILE *fp = fopen(kernel_file.c_str(), "rb");
+  if (!fp) {
+    DEBUG_MSG("Can't open kernel file" << kernel_file);
+    return std::string();
+  }
+
+  //get file size and read the whole file into the string
+  std::string kernel_string;
+  if (fseek(fp, 0, SEEK_END) == 0) {
+    long fsize = ftell(fp);
+    if (fsize > 0) {
+      kernel_string.resize(fsize);
+      rewind(fp);
+      // keep only what was actually read in case of a short read
+      size_t nread = fread(&kernel_string[0], 1, sizeof(char)*fsize, fp);
+      kernel_string.resize(nread);
+    }
+  }
+  fclose(fp);
+
+  return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter;
+}
+
+// Compile the chi-square kernels together with the user supplied function using
+// NVRTC, keep the resulting PTX in ptx_m and load it as module_m.
+//   function - expression used as the body of fTheory (see buildProgram)
+//   mlh      - when true the source is compiled with -DMLH
+// Returns DKS_SUCCESS on success, DKS_ERROR on any failure. The NVRTC program
+// object is destroyed on every path.
+int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
+
+  //build program string
+  std::string cudaProg = buildProgram(function);
+  if (cudaProg.empty()) {
+    DEBUG_MSG("Empty program source!");
+    return DKS_ERROR;
+  }
+
+  //create program
+  nvrtcProgram prog;
+  //std::cout << cudaProg.c_str() << std::endl;
+  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(),
+						"chiSquareRuntime.cu", 0, NULL, NULL);
+  if (createResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Create program failed!");
+    return DKS_ERROR;
+  }
+
+  //compile program
+  const char *opts[] = {"-fmad=false", ""};
+  int numopts = 1;
+  if (mlh) {
+    opts[1] = "-DMLH";
+    numopts = 2;
+  }
+
+  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
+ 
+  if (compileResults != NVRTC_SUCCESS) {
+    //obtain compilation log
+    size_t logSize = 0; 
+    nvrtcGetProgramLogSize(prog, &logSize); 
+    // +1 and explicit terminator so the log is printable even if the query failed
+    char *log = new char[logSize + 1]; 
+    log[0] = '\0';
+    nvrtcGetProgramLog(prog, log); 
+    log[logSize] = '\0';
+    DEBUG_MSG("Compilation failed!");
+    DEBUG_MSG(log);
+    delete[] log;    
+ 
+    nvrtcDestroyProgram(&prog);
+    return DKS_ERROR;
+  } else {
+    DEBUG_MSG("Compilation successfull!");
+  }
+
+  //obtain PTX from program (replacing the PTX of any earlier compilation)
+  delete[] ptx_m;
+  ptx_m = NULL;
+  size_t ptxSize = 0; 
+  nvrtcGetPTXSize(prog, &ptxSize); 
+  ptx_m = new char[ptxSize]; 
+  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);  
+
+  // the PTX has been copied out (or the copy failed); the program object is
+  // not needed any more in either case
+  nvrtcDestroyProgram(&prog);
+  
+  if (nvrtcPTXResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Get PTX failed!");
+    return DKS_ERROR;
+  }
+
+  //load module from ptx
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); 
+  if (loadResult != CUDA_SUCCESS) {
+    DEBUG_MSG("Load module from ptx failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+// Launch the chi-square kernel selected by fitType from the loaded module and
+// reduce the per-element values in mem_chisq_m with cublasDasum.
+//   fitType              - FITTYPE_SINGLE_HISTO or FITTYPE_ASYMMETRY
+//                          (FITTYPE_MU_MINUS is reported as not implemented)
+//   mem_data, mem_err    - pointers passed through to the kernel
+//   length               - number of elements; also the length of the reduction
+//   numpar/numfunc/nummap- element counts, also used to size dynamic shared memory
+//   timeStart, timeStep  - passed through to the kernel
+//   result               - receives the cublasDasum result
+// Returns DKS_SUCCESS, or DKS_ERROR if init was not done, the kernel cannot be
+// found/launched, or the reduction fails.
+int CudaChiSquareRuntime::launchChiSquare(int fitType,
+					  void *mem_data, void *mem_err, int length,
+					  int numpar, int numfunc, int nummap,
+					  double timeStart, double timeStep, double &result)
+{
+
+  if (!initDone_m) {
+    DEBUG_MSG("ChiSquare init needs to be called at some point!");
+    return DKS_ERROR;
+  }
+
+  int blocks;
+  int threads = blockSize_m;
+  if (numBlocks_m < 0)
+    blocks = length / threads + 1;
+  else
+    blocks = numBlocks_m;
+
+  // Kernel argument table. It lives on the stack, so none of the error returns
+  // below can leak it (the previous malloc'ed table leaked on launch/cublas errors).
+  // The first 12 entries are shared by both kernels, the tail is fit-type specific.
+  void *args[15] = {
+    &mem_data, &mem_err, &mem_param_m, &mem_chisq_m, &mem_map_m, &mem_func_m,
+    &length, &numpar, &numfunc, &nummap, &timeStart, &timeStep,
+    NULL, NULL, NULL
+  };
+
+  const char *kernelName = NULL;
+  if (fitType == FITTYPE_SINGLE_HISTO) {
+    kernelName = "kernelChiSquareSingleHisto";
+    args[12] = &tau_m;
+    args[13] = &N0_m;
+    args[14] = &bkg_m;
+  } else if (fitType == FITTYPE_ASYMMETRY) {
+    kernelName = "kernelChiSquareAsymmetry";
+    args[12] = &alpha_m;
+    args[13] = &beta_m;
+  } else if (fitType == FITTYPE_MU_MINUS) {
+    DEBUG_MSG("Not Yet Implemented!");
+    return DKS_ERROR;
+  } else {
+    DEBUG_MSG("Undefined Fit Type!");
+    return DKS_ERROR;
+  }
+
+  CUresult cuStatus = cuModuleGetFunction(&kernel_m, module_m, kernelName);
+  if (cuStatus != CUDA_SUCCESS) {
+    DEBUG_MSG("Failed to get function from module!");
+    return DKS_ERROR;
+  }
+ 
+  // dynamic shared memory holds the params, functions and maps
+  cuStatus = cuLaunchKernel(kernel_m,
+			    blocks, 1, 1,
+			    threads, 1, 1,
+			    (numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL,
+			    args, 0);
+
+  if (cuStatus != CUDA_SUCCESS) {
+    std::string msg;
+    msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")";
+    DEBUG_MSG(msg);
+    const char *desc = NULL;
+    cuGetErrorString(cuStatus, &desc);
+    // desc stays NULL when the error code is not recognised
+    if (desc)
+      std::cout << desc << std::endl;
+    return DKS_ERROR;
+  }
+
+  cublasStatus_t status;
+  status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas sum failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+// Copy numparams doubles from params into mem_param_m via CudaBase.
+int CudaChiSquareRuntime::writeParams(const double *params, int numparams) {
+  return m_base->cuda_writeData( (double*)mem_param_m, params, sizeof(double)*numparams);
+}
+
+// Copy numfunc doubles from func into mem_func_m via CudaBase.
+int CudaChiSquareRuntime::writeFunc(const double *func, int numfunc) {
+  return m_base->cuda_writeData( (double*)mem_func_m, func, sizeof(double)*numfunc);
+}
+
+// Copy nummap ints from map into mem_map_m via CudaBase.
+int CudaChiSquareRuntime::writeMap(const int *map, int nummap) {
+  return m_base->cuda_writeData( (int*)mem_map_m, map, sizeof(int)*nummap);
+}
+
+// Create the cuBLAS handle and allocate the temporary buffers used by
+// launchChiSquare (chi-square values, parameters, functions, maps).
+// Sizes are element counts. If a previous initialisation exists it is freed first.
+// Returns DKS_SUCCESS only if the handle and ALL four allocations succeeded;
+// previously only the status of the last allocation was reported.
+int CudaChiSquareRuntime::initChiSquare(int size_data, int size_param, int size_func,
+					int size_map) {
+
+  if (initDone_m) {
+    DEBUG_MSG("Reinitializing ChiSquare");
+    freeChiSquare();
+  }
+
+  //init cublas
+  cublasStatus_t status = cublasCreate(&defaultCublasRT);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("CUBLAS create default handle failed!");
+    // nothing has been allocated yet, so initDone_m stays false
+    return DKS_ERROR;
+  }
+
+  //allocate temporary memory, keeping one status per buffer so that an early
+  //failure is not overwritten by a later success
+  int err_chisq = DKS_ERROR;
+  int err_param = DKS_ERROR;
+  int err_func = DKS_ERROR;
+  int err_map = DKS_ERROR;
+  mem_chisq_m = m_base->cuda_allocateMemory(size_data*sizeof(double), err_chisq);
+  mem_param_m = m_base->cuda_allocateMemory(size_param*sizeof(double), err_param);
+  mem_func_m = m_base->cuda_allocateMemory(size_func*sizeof(double), err_func);
+  mem_map_m = m_base->cuda_allocateMemory(size_map*sizeof(int), err_map);
+
+  // as before, mark as initialised so freeChiSquare() releases whatever exists
+  initDone_m = true;
+
+  if (err_chisq != DKS_SUCCESS || err_param != DKS_SUCCESS ||
+      err_func != DKS_SUCCESS || err_map != DKS_SUCCESS)
+    return DKS_ERROR;
+
+  return DKS_SUCCESS;
+}
+
+// Destroy the cuBLAS handle and free the temporary buffers created by
+// initChiSquare(). Returns DKS_ERROR when nothing was initialised (unchanged
+// behaviour) or when any release step failed, DKS_SUCCESS otherwise.
+// All release steps are always attempted: a failing cublasDestroy no longer
+// skips the memory frees, and a failing free is no longer hidden by a later one.
+int CudaChiSquareRuntime::freeChiSquare() {
+  if (!initDone_m)
+    return DKS_ERROR;
+
+  int ierr = DKS_SUCCESS;
+
+  //delete cublas
+  cublasStatus_t status = cublasDestroy(defaultCublasRT);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("CUBLAS delete default handle failed!");
+    ierr = DKS_ERROR;
+  }
+
+  //free memory
+  if (m_base->cuda_freeMemory(mem_chisq_m) != DKS_SUCCESS)
+    ierr = DKS_ERROR;
+  if (m_base->cuda_freeMemory(mem_param_m) != DKS_SUCCESS)
+    ierr = DKS_ERROR;
+  if (m_base->cuda_freeMemory(mem_func_m) != DKS_SUCCESS)
+    ierr = DKS_ERROR;
+  if (m_base->cuda_freeMemory(mem_map_m) != DKS_SUCCESS)
+    ierr = DKS_ERROR;
+
+  initDone_m = false;
+
+  return ierr;
+}
diff --git a/src/CUDA/CudaChiSquareRuntime.cuh b/src/CUDA/CudaChiSquareRuntime.cuh
new file mode 100644
index 0000000..79a9af5
--- /dev/null
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@@ -0,0 +1,114 @@
+#ifndef H_CUDA_CHISQUARE_RUNTIME
+#define H_CUDA_CHISQUARE_RUNTIME
+
+#include <iostream>
+#include <string>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include "../Algorithms/ChiSquareRuntime.h"
+#include "CudaBase.cuh"
+
+// Opening of the device function that wraps the user supplied expression:
+// t is the first argument, p, f and m are the array arguments the expression may index.
+const std::string cudaFunctHeader = "__device__ double fTheory(double t, double *p, double *f, int *m) {";
+
+// Closing brace (and newline) appended after the generated return statement.
+const std::string cudaFunctFooter = "}\n";
+
+/** CUDA implementation of ChiSquareRuntime.
+ *  Compiles the chi-square kernels at run time with NVRTC, loads them through
+ *  the CUDA driver API and reduces the result with cuBLAS.
+ */
+class CudaChiSquareRuntime : public ChiSquareRuntime{
+
+private:
+
+  // true when m_base was created by the default constructor and must be deleted here
+  bool base_create;
+  // CudaBase used for memory allocation and data transfer
+  CudaBase *m_base;
+
+  // driver API handles: device, context, loaded module and the selected kernel
+  CUdevice cuDevice_m;
+  CUcontext context_m;
+  CUmodule module_m;
+  CUfunction kernel_m;
+
+  // cuBLAS handle created in initChiSquare and destroyed in freeChiSquare
+  cublasHandle_t defaultCublasRT;
+
+  /** Setup to init device
+   *  Create context and init device for RT compilation
+   */
+  void setUpContext();
+
+  /** Private function to add function to kernel string.
+   *  Returns the kernel source followed by the user function wrapped in
+   *  cudaFunctHeader / cudaFunctFooter.
+   */
+  std::string buildProgram(std::string function);
+
+public:
+
+  /** Constructor with CudaBase argument.
+   *  The given CudaBase is used but not owned.
+   */
+  CudaChiSquareRuntime(CudaBase *base);
+
+  /** Default constructor, inits the cuda device.
+   *  Creates and owns its own CudaBase.
+   */
+  CudaChiSquareRuntime();
+  
+  /** Destructor.
+   *  Releases the PTX buffer, the context, the chi-square temporaries and,
+   *  if owned, the CudaBase.
+   */
+  ~CudaChiSquareRuntime();
+
+  /** Compile program and save ptx.
+   * Add function string to the calcFunction kernel and compile the program.
+   * Function must be a valid C math expression. Parameters can be addressed in
+   * the form par[map[idx]]. When mlh is true the source is compiled with -DMLH.
+   */
+  int compileProgram(std::string function, bool mlh = false);
+
+  /** Launch selected kernel.
+   * Launches the selected kernel from the compiled code.
+   * Result is put in the &result variable.
+   */
+  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
+		      int numpar, int numfunc, int nummap,
+		      double timeStart, double timeStep,
+		      double &result);
+
+  /** Write params to device.
+   * Write params from double array to mem_param_m memory on the device.
+   */
+  int writeParams(const double *params, int numparams); 
+
+  /** Write functions to device.
+   * Write function values from double array to mem_func_m memory on the device.
+   */
+  int writeFunc(const double *func, int numfunc);
+
+  /** Write maps to device.
+   * Write map values from int array to mem_map_m memory on the device.
+   */
+  int writeMap(const int *map, int nummap);
+
+  /** Allocate temporary memory needed for chi square.
+   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
+   * be the maximum number of elements in any dataset that will be used for calculations. Size_param,
+   * size_func and size_map are the maximum number of parameters, functions and maps used in 
+   * calculations.
+   */
+  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
+
+
+  /** Free temporary memory allocated for chi square.
+   * Frees the chisq temporary memory and memory for params, functions and maps
+   */
+  int freeChiSquare();
+
+  /** Check if CUDA device is able to run the chi square kernel.
+   *  Redundant - all new CUDA devices that support RT compilation will also support 
+   *  double precision, there are no other requirements to run chi square on GPU.
+   *  Always returns DKS_SUCCESS; both arguments are ignored.
+   */
+  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
+    return DKS_SUCCESS;
+  }
+
+};
+
+#endif
diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu
new file mode 100644
index 0000000..495ad59
--- /dev/null
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@@ -0,0 +1,728 @@
+#include "CudaCollimatorPhysics.cuh"
+
+// Physical constants used by the collimator physics kernels.
+// NOTE(review): units are not stated in this file - confirm against OPAL.
+//#define M_P 0.93827231e+00
+#define M_P 0.93827204e+00
+#define C 299792458.0
+#define PI 3.14159265358979323846
+#define AVO 6.022e23
+#define R_E 2.81794092e-15
+//#define eM_E 0.51099906e-03
+#define eM_E 0.51099892e-03
+#define Z_P 1
+// Parenthesised so the macro expands as a single value in any expression
+// (e.g. x / K); existing uses of the form K * ... and -K * ... are unaffected.
+#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)
+
+// Indices into the material parameter array passed to the kernels
+#define POSITION 0
+#define ZSIZE 1
+#define RHO_M 2
+#define Z_M 3
+#define A_M 4
+#define A2_C 5
+#define A3_C 6
+#define A4_C 7
+#define A5_C 8
+#define X0_M 9
+#define I_M 10
+#define DT_M 11
+
+// Threads per block for all launches in this file, and number of entries in
+// the parameter array
+#define BLOCK_SIZE 128
+#define NUMPAR 12
+
+// Dot product of two double3 vectors (neither argument is modified).
+__device__ inline double dot(double3 &d1, double3 &d2) {
+
+  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
+
+}
+
+// True when z lies in the half-open interval (par[POSITION], par[POSITION] + par[ZSIZE]],
+// i.e. the particle is inside the degrader material.
+__device__ inline bool checkHit(double &z, double *par) {
+
+  if (z <= par[POSITION])
+    return false;
+
+  return z <= par[POSITION] + par[ZSIZE];
+
+}
+
+
+// Apply one step of energy loss to a particle inside the material.
+//   Eng   - particle energy, updated in place
+//   pdead - set to true when the particle should be dropped
+//           (Eng below 1E-4 or a positive dEdx)
+//   state - cuRAND state used for the random straggling term
+//   par   - material parameter array indexed by the *_M / *_C macros
+// NOTE(review): energy units look like GeV (M_P ~ 0.938) - confirm.
+__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
+{
+
+  volatile double dEdx = 0.0;
+
+  // relativistic factors derived from the current energy
+  volatile double gamma = (Eng + M_P) / M_P;
+  volatile double gamma2 = gamma * gamma;
+
+  double beta = sqrt(1.0 - 1.0 / gamma2);
+  volatile double beta2 = beta * beta;
+
+  // distance travelled during one time step par[DT_M]
+  double deltas = par[DT_M] * beta * C;
+  volatile double deltasrho = deltas * 100 * par[RHO_M];
+  // width of the random energy fluctuation added below
+  volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+
+  // low energy range: stopping power from the A2..A5 coefficients
+  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
+    double Ts = (Eng * 1E6) / 1.0073; 
+    double epsilon_low = par[A2_C] * pow(Ts, 0.45);
+    double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
+    double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
+
+    dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
+
+    double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
+    Eng = Eng + delta_E / 1E3;
+  }
+  
+  // high energy range; note this is evaluated on the possibly already updated Eng
+  // (presumably the Bethe-Bloch formula - TODO confirm)
+  if (Eng >= 0.0006) {
+    double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
+      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
+
+    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
+      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * 
+		       Tmax / par[I_M] / par[I_M]) - beta2);
+
+    double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
+    
+    Eng = Eng + delta_E / 1E3;
+  }
+
+  // particle is dropped when its energy is too low or the loss turned into a gain
+  pdead = ((Eng<1E-4) || (dEdx>0));
+
+}
+
+// Rotate the momentum components (px, pz) by the angle thetacou within their
+// plane and, depending on coord, shift the position (x, z).
+//   coord == 1 : x is advanced and shifted by xplane, z only receives the xplane shift
+//   coord == 2 : as coord == 1, and z is additionally advanced by deltas * pz / normP
+//   any other  : position is left unchanged, only the momentum is rotated
+// par is not used by this function.
+__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
+			   double &normP, double &thetacou, double &deltas, int coord,
+			   double *par) 
+{
+  double Psixz;
+  double pxz;
+
+  // angle of the momentum in the plane, offset according to the signs of px and pz
+  if (px>=0 && pz>=0)
+    Psixz = atan(px/pz);
+  else if (px>0 && pz<0)
+      Psixz = atan(px/pz) + PI;
+  else if (px<0 && pz>0)
+    Psixz = atan(px/pz) + 2*PI;
+  else
+    Psixz = atan(px/pz) + PI;
+
+  // magnitude of the momentum in the plane (preserved by the rotation)
+  pxz = sqrt(px*px + pz*pz);
+
+  if(coord==1) {
+    x = x + deltas * px/normP + xplane*cos(Psixz);
+    z = z - xplane * sin(Psixz);
+  }
+
+  if(coord==2) {
+    x = x + deltas * px/normP + xplane*cos(Psixz);
+    z = z - xplane * sin(Psixz) + deltas * pz / normP;
+  }
+
+  // new direction: angle Psixz + thetacou, written out with the addition theorems
+  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
+  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+}
+
+// Apply scattering to one particle: first in the x-z plane, then in the y-z plane.
+// For each plane a normally distributed angle (limited to 3.5 * theta0) is drawn
+// and applied through Rot(); with probability 0.0047 an additional larger
+// rotation (thetaru) is applied.
+//   R, P  - position and momentum, updated in place
+//   state - cuRAND state; the order of the random draws below is significant
+//   par   - material parameter array (DT_M and X0_M are read here)
+__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
+
+  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
+  double gamma = (Eng + M_P) / M_P;
+  double normP = sqrt(dot(P, P));
+  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  // distance travelled during one time step
+  double deltas = par[DT_M] * beta * C;
+
+  // width of the scattering angle distribution for this step
+  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * 
+    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
+
+  // x-direction: See Physical Review, "Multiple Scattering"
+  double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+  double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+  double thetacou = z2 * theta0;
+
+  // redraw both numbers until the angle is within 3.5 * theta0
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+    z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+    thetacou = z2 * theta0;
+  }
+
+  //__syncthreads();  
+
+  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
+
+  // rare additional rotation; coord 0 means Rot only changes the momentum
+  double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+  if(P2 < 0.0047) {
+    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+    // random sign
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
+  }
+
+  // y-direction: See Physical Review, "Multiple Scattering"
+  z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+  z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+  thetacou = z2 * theta0;
+
+  // redraw both numbers until the angle is within 3.5 * theta0
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+    z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
+    thetacou = z2 * theta0;
+  }
+
+  //__syncthreads();
+
+  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  // coord 2: this call also advances z by the step length
+  Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
+
+  // rare additional rotation in the y-z plane, momentum only
+  P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+  if(P2 < 0.0047) {
+    double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
+  }
+
+}
+
+
+// Collimator physics kernel for particles stored as an array of structures.
+// One thread handles one particle. T must provide the members Rincol, Pincol
+// and label. Dynamic shared memory must hold NUMPAR doubles followed by one
+// double3 per thread of the block.
+//   data         - particle array (read and written)
+//   par          - NUMPAR material parameters
+//   state        - one cuRAND state per particle
+//   numparticles - number of valid entries in data / state
+// Particles inside the material lose energy and are scattered (label set to -1
+// if they die); particles outside are drifted by one time step and get label -2.
+template <typename T>
+__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
+					int numparticles)
+{
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  //transfer params to shared memory
+  extern __shared__ double smem[];
+  double *p = (double*)smem;
+  // per-thread positions are placed right after the NUMPAR parameters
+  double3 *R = (double3*)&smem[NUMPAR];
+
+  curandState s; 
+  double3 P;
+
+  // strided copy so that any block size fills all NUMPAR entries
+  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
+    p[tt] = par[tt];
+
+  __syncthreads();
+
+  if (idx < numparticles) {
+    s = state[idx];
+    R[tid] = data[idx].Rincol;
+    P = data[idx].Pincol;
+
+    bool pdead = false;  
+    volatile double sq = sqrt(1.0 + dot(P, P));
+
+    double Eng;
+    
+    if (checkHit(R[tid].z, p)) {      
+
+      Eng = (sq - 1) * M_P;
+      energyLoss(Eng, pdead, s, p);
+
+      if (!pdead) {
+	// rescale the momentum to the magnitude matching the reduced energy
+	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
+	sq = sqrt(dot(P, P));
+
+	P.x = P.x * ptot / sq;
+	P.y = P.y * ptot / sq;
+	P.z = P.z * ptot / sq;
+	coulombScat(R[tid], P, s, p); 
+
+	data[idx].Pincol = P;
+      } else {
+	data[idx].label = -1;
+      }
+    
+      // the random state is only consumed (and therefore stored) in this branch
+      state[idx] = s;
+    } else {
+    
+      // outside the material: drift for one time step, momentum unchanged
+      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
+      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
+      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
+      data[idx].label = -2;
+      
+    }
+ 
+    data[idx].Rincol = R[tid];
+  }
+
+}
+
+// Collimator physics kernel for particles stored as a structure of arrays
+// (CUDA_PART2_SMALL). Same steps as kernelCollimatorPhysics, but the shared
+// memory is statically sized, so the block must not have more than BLOCK_SIZE
+// threads and needs at least NUMPAR threads to load all parameters.
+//   data         - structure holding the per-particle arrays
+//   par          - NUMPAR material parameters
+//   state        - one cuRAND state per particle
+//   numparticles - number of valid particles
+__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
+					 curandState *state, int numparticles)
+{
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  //transfer params to shared memory
+  __shared__ double p[NUMPAR];
+  __shared__ double3 R[BLOCK_SIZE];
+
+  // the first NUMPAR threads of the block each copy one parameter
+  if (tid < NUMPAR)
+    p[tid] = par[tid];
+
+  __syncthreads();
+
+  curandState s;
+  double3 P;
+  if (idx < numparticles) {
+    R[tid] = data.Rincol[idx];
+    P = data.Pincol[idx];
+    s = state[idx];
+  
+    double sq = sqrt(1.0 + dot(P, P));
+    bool pdead = false;  
+
+    if (checkHit(R[tid].z, p)) {
+      
+      double Eng = (sq - 1) * M_P;
+      energyLoss(Eng, pdead, s, p);
+      
+      if (!pdead) {
+
+    	// rescale the momentum to the magnitude matching the reduced energy
+    	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
+    	sq = sqrt(dot(P, P));
+    	P.x = P.x * ptot / sq;
+    	P.y = P.y * ptot / sq;
+    	P.z = P.z * ptot / sq;
+    	coulombScat(R[tid], P, s, p); 
+      
+    	data.Pincol[idx] = P;
+    } else {
+    	// particle died in the material
+    	data.label[idx] = -1;
+    }    
+    
+    } else {
+      // outside the material: drift for one time step, momentum unchanged
+      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
+      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
+      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
+
+      data.label[idx] = -2;
+    }
+    
+    // position and random state are written back in both branches
+    data.Rincol[idx] = R[tid];
+    state[idx] = s;
+  }
+
+}
+
+
+// Leave unitless representation: multiply every component of a by c.
+inline __device__ void unitlessOff(double3 &a, const double &c) {
+  a.x = a.x * c;
+  a.y = a.y * c;
+  a.z = a.z * c;
+}
+
+// Enter unitless representation: divide every component of a by c.
+inline __device__ void unitlessOn(double3 &a, const double &c) {
+  a.x = a.x / c;
+  a.y = a.y / c;
+  a.z = a.z / c;
+}
+
+//switch to unitless positions: divide gR and gX by the common factor dtc
+__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
+
+  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gR[idx];
+  unitlessOn(pos, dtc);
+  gR[idx] = pos;
+
+  double3 ref = gX[idx];
+  unitlessOn(ref, dtc);
+  gX[idx] = ref;
+
+}
+
+//switch to unitless positions using a per-particle time step:
+//gR and gX are divided by gdt[idx] * c.
+//Fix: this overload used unitlessOff (multiplication), making it identical to
+//kernelSwitchOffUnitlessPositions; switching TO unitless positions divides,
+//as the dtc overload and kernelPush do.
+__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
+
+  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < npart) {
+    double3 R = gR[idx];
+    double3 X = gX[idx];
+    double dt = gdt[idx];
+
+    unitlessOn(R, dt*c);
+    unitlessOn(X, dt*c);
+
+    gR[idx] = R;
+    gX[idx] = X;
+  }
+}
+
+//switch off unitless positions: multiply gR and gX by the common factor dtc
+__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
+
+  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gR[idx];
+  unitlessOff(pos, dtc);
+  gR[idx] = pos;
+
+  double3 ref = gX[idx];
+  unitlessOff(ref, dtc);
+  gX[idx] = ref;
+
+}
+
+//switch off unitless positions using a per-particle time step:
+//gR and gX are multiplied by gdt[idx] * c
+__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
+
+  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  const double scale = gdt[idx] * c;
+
+  double3 pos = gR[idx];
+  unitlessOff(pos, scale);
+  gR[idx] = pos;
+
+  double3 ref = gX[idx];
+  unitlessOff(ref, scale);
+  gX[idx] = ref;
+}
+
+
+// Half-step position push with a common factor dtc for all particles:
+// positions are made unitless, advanced by 0.5 * P / sqrt(1 + P.P) and
+// converted back. Only gR is written.
+__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gR[idx];
+  double3 mom = gP[idx];
+
+  //switch to unitless positions
+  unitlessOn(pos, dtc);
+
+  //push
+  const double denom = sqrt(1.0 + dot(mom, mom));
+  pos.x += 0.5 * mom.x / denom;
+  pos.y += 0.5 * mom.y / denom;
+  pos.z += 0.5 * mom.z / denom;
+
+  //switch off unitless positions
+  unitlessOff(pos, dtc);
+
+  gR[idx] = pos;
+}
+
+
+// Half-step position push with a per-particle time step: same as the dtc
+// overload, with the scale factor gdt[idx] * c. Only gR is written.
+__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gR[idx];
+  double3 mom = gP[idx];
+  const double scale = gdt[idx] * c;
+
+  //switch to unitless positions with dt*c
+  unitlessOn(pos, scale);
+
+  //push; the denominator is the same for all three components
+  const double denom = sqrt(1.0 + dot(mom, mom));
+  pos.x += 0.5 * mom.x / denom;
+  pos.y += 0.5 * mom.y / denom;
+  pos.z += 0.5 * mom.z / denom;
+
+  //switch off unitless positions with dt*c
+  unitlessOff(pos, scale);
+
+  gR[idx] = pos;
+}
+
+//TODO: kernel for push with switch off unitless positions with dt[i]*c
+
+// Return vec rotated by the three angles stored in ori (ori.x, ori.y, ori.z).
+// Neither argument is modified.
+__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
+
+  // sines / cosines of the three angles
+  const double sa = sin(ori.x), ca = cos(ori.x);
+  const double sb = sin(ori.y), cb = cos(ori.y);
+  const double sg = sin(ori.z), cg = cos(ori.z);
+
+  double3 out;
+
+  // one row of the rotation matrix per component
+  out.x = (ca * cg) * vec.x + (ca * sg) * vec.y - sa * vec.z;
+  out.y = (-cb * sg - sa * sb * cg) * vec.x + 
+    (cb * cg - sa * sb * sg) * vec.y - ca * sb * vec.z;
+  out.z = (-sb * sg + sa * cb * cg) * vec.x + 
+    (sb * cg + sa * cb * sg) * vec.y + ca * cb * vec.z;
+
+  return out;
+
+}
+
+// Half-step push of gX using the momentum transformed into the orientation of
+// the particle's last section; common scale factor dtc. A last-section index
+// outside [0, nsect) selects the zero orientation. Only gX is written.
+__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
+				    int npart, int nsect, double dtc)
+{
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gX[idx];
+  double3 mom = gP[idx];
+  const long section = gLastSection[idx];
+
+  // default orientation is zero; overwritten when the section index is valid
+  double3 ori;
+  ori.x = 0.0;
+  ori.y = 0.0;
+  ori.z = 0.0;
+  if (section > -1 && section < nsect)
+    ori = gOrient[section];
+
+  double3 rotated = deviceTransformTo(mom, ori);
+
+  unitlessOn(pos, dtc);
+
+  // the denominator is the same for all three components
+  const double denom = sqrt(1.0 + dot(rotated, rotated));
+  pos.x += 0.5 * rotated.x / denom;
+  pos.y += 0.5 * rotated.y / denom;
+  pos.z += 0.5 * rotated.z / denom;
+
+  unitlessOff(pos, dtc);
+
+  gX[idx] = pos;
+
+}
+
+// Half-step push of gX using the momentum transformed into the orientation of
+// the particle's last section; per-particle scale factor gdt[idx] * c. A
+// last-section index outside [0, nsect) selects the zero orientation.
+// Only gX is written.
+__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
+				    int npart, int nsect, double *gdt, double c)
+{
+
+  //get global id and thread id
+  volatile int tid = threadIdx.x;
+  volatile int idx = blockIdx.x * blockDim.x + tid;
+
+  // threads beyond the last particle have nothing to do
+  if (idx >= npart)
+    return;
+
+  double3 pos = gX[idx];
+  double3 mom = gP[idx];
+  const long section = gLastSection[idx];
+  const double scale = gdt[idx] * c;
+
+  // default orientation is zero; overwritten when the section index is valid
+  double3 ori;
+  ori.x = 0.0;
+  ori.y = 0.0;
+  ori.z = 0.0;
+  if (section > -1 && section < nsect)
+    ori = gOrient[section];
+
+  double3 rotated = deviceTransformTo(mom, ori);
+
+  unitlessOn(pos, scale);
+
+  // the denominator is the same for all three components
+  const double denom = sqrt(1.0 + dot(rotated, rotated));
+  pos.x += 0.5 * rotated.x / denom;
+  pos.y += 0.5 * rotated.y / denom;
+  pos.z += 0.5 * rotated.z / denom;
+
+  unitlessOff(pos, scale);
+
+  gX[idx] = pos;
+
+}
+
+// Functor over CUDA_PART labels: the binary form orders particles by
+// descending label, the unary form selects particles with label < threshold.
+struct compare_particle
+{
+  // labels strictly below this value satisfy the unary predicate
+  int threshold;
+
+  compare_particle() : threshold(0) {}
+
+  void set_threshold(int t) { threshold = t; }
+
+  // ordering: larger labels first
+  __host__ __device__
+  bool operator()(CUDA_PART p1, CUDA_PART p2) {
+    return p2.label < p1.label;
+  }
+
+  // predicate: label below threshold
+  __host__  __device__
+  bool operator()(CUDA_PART p1) {
+    return threshold > p1.label;
+  }
+};
+
+
+// Functor over CUDA_PART_SMALL labels: the binary form orders particles by
+// descending label, the unary form selects particles with label < threshold.
+struct compare_particle_small
+{
+  // labels strictly below this value satisfy the unary predicate
+  int threshold;
+
+  compare_particle_small() : threshold(0) {}
+
+  void set_threshold(int t) { threshold = t; }
+
+  // ordering: larger labels first
+  __host__ __device__
+  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
+    return p2.label < p1.label;
+  }
+
+  // predicate: label below threshold
+  __host__  __device__
+  bool operator()(CUDA_PART_SMALL p1) {
+    return threshold > p1.label;
+  }
+};
+
+
+// Unary predicate: true for negative integers.
+struct less_then
+{
+  __host__ __device__
+  bool operator()(int x)
+  {
+    return 0 > x;
+  }
+};
+
+// Run the collimator physics kernel on numparticles CUDA_PART_SMALL entries.
+//   mem_ptr      - particle array, cast to CUDA_PART_SMALL*
+//   par_ptr      - parameter array, cast to double* (NUMPAR entries are read)
+//   numparticles - number of particles to process
+// Returns DKS_SUCCESS, or DKS_ERROR when the launch reports an error
+// (previously the error was printed but DKS_SUCCESS was still returned).
+int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
+{
+
+  int threads = BLOCK_SIZE;
+  int blocks = numparticles / threads + 1;
+
+  //calc shared memory size: parameters followed by one position per thread
+  int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE;
+
+  //call kernel
+  kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr, 
+							  (double*)par_ptr,
+							  m_base->cuda_getCurandStates(),
+							  numparticles);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cout << "Err2: " << cudaGetErrorString(err) << std::endl;
+    return DKS_ERROR;
+  }
+  
+  return DKS_SUCCESS;
+
+}
+
+// Count the particles with a negative label and, if there are any, sort the
+// array by descending label.
+//   mem_ptr      - particle array, treated as CUDA_PART_SMALL
+//   numparticles - number of particles in the array
+//   numaddback   - receives the number of particles with label < 0
+// Always returns DKS_SUCCESS.
+int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles, 
+						 int &numaddback)
+{
+
+  //wrap mem_ptr with thrust device ptr
+  thrust::device_ptr<CUDA_PART_SMALL> first( (CUDA_PART_SMALL*)mem_ptr);
+  thrust::device_ptr<CUDA_PART_SMALL> last = first + numparticles;
+
+  //count -2 and -1 particles (label below the threshold 0)
+  compare_particle_small comp;
+  comp.set_threshold(0);
+  numaddback = thrust::count_if(first, last, comp);
+
+  //nothing to sort when no particle is flagged
+  if (numaddback <= 0)
+    return DKS_SUCCESS;
+
+  //sort particles
+  thrust::sort(first, last, comp);
+
+  return DKS_SUCCESS;
+}
+
+// Launch the kernelPush overload matching usedt.
+//   r_ptr, p_ptr - position and momentum arrays (double3)
+//   npart        - number of particles
+//   dt_ptr       - per-particle time steps, used when usedt is true
+//   dt, c        - common time step and scale; dt*c is used when usedt is false
+//   streamId     - -1 for the default stream, otherwise an id known to CudaBase
+// Always returns DKS_SUCCESS.
+int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
+						void *dt_ptr, double dt, double c, bool usedt,
+						int streamId) 
+{
+
+  const int threads = BLOCK_SIZE;
+  const int blocks = npart / threads + 1;
+
+  double3 *pos = (double3*)r_ptr;
+  double3 *mom = (double3*)p_ptr;
+
+  // stream handle 0 is the default stream, i.e. what a launch without an
+  // explicit stream argument uses
+  cudaStream_t cs = 0;
+  if (streamId != -1)
+    cs = m_base->cuda_getStream(streamId);
+
+  //call kernel
+  if (usedt)
+    kernelPush<<<blocks, threads, 0, cs >>>(pos, mom, npart, (double*)dt_ptr, c);
+  else
+    kernelPush<<<blocks, threads, 0, cs >>>(pos, mom, npart, dt*c);
+
+  return DKS_SUCCESS;
+}
+
+// Launch the kernelPushTransform overload matching usedt.
+//   x_ptr, p_ptr - position and momentum arrays (double3)
+//   lastSec_ptr  - per-particle last-section index (long)
+//   orient_ptr   - per-section orientation (double3), nsec entries
+//   npart, nsec  - number of particles / sections
+//   dt_ptr       - per-particle time steps, used when usedt is true
+//   dt, c        - common time step and scale; dt*c is used when usedt is false
+//   streamId     - -1 for the default stream, otherwise an id known to CudaBase
+// Always returns DKS_SUCCESS.
+int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
+							 void *lastSec_ptr, void *orient_ptr, 
+							 int npart, int nsec, 
+							 void *dt_ptr, double dt, 
+							 double c, bool usedt,
+							 int streamId) 
+{
+
+  const int threads = BLOCK_SIZE;
+  const int blocks = npart / threads + 1;
+  // NOTE(review): the kernels visible in this file do not declare dynamic
+  // shared memory, so this reservation looks unnecessary - confirm before removing
+  const int smem = sizeof(double3) * nsec;
+
+  double3 *pos = (double3*)x_ptr;
+  double3 *mom = (double3*)p_ptr;
+  long *lastSec = (long*)lastSec_ptr;
+  double3 *orient = (double3*)orient_ptr;
+
+  // stream handle 0 is the default stream, i.e. what a launch without an
+  // explicit stream argument uses
+  cudaStream_t cs = 0;
+  if (streamId != -1)
+    cs = m_base->cuda_getStream(streamId);
+
+  //call kernel
+  if (usedt)
+    kernelPushTransform<<<blocks, threads, smem, cs>>>(pos, mom, lastSec, orient,
+						       npart, nsec, (double*)dt_ptr, c);
+  else
+    kernelPushTransform<<<blocks, threads, smem, cs>>>(pos, mom, lastSec, orient,
+						       npart, nsec, dt*c);
+
+  return DKS_SUCCESS;
+}
+
+
+
diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh
new file mode 100644
index 0000000..9808f33
--- /dev/null
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@@ -0,0 +1,155 @@
+#ifndef H_CUDA_COLLIMATORPHYSICS
+#define H_CUDA_COLLIMATORPHYSICS
+
+#include <iostream>
+#include <stdio.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <vector_types.h>
+#include <curand_kernel.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/count.h>
+
+#include <cublas_v2.h>
+
+#include "../Algorithms/CollimatorPhysics.h"
+#include "CudaBase.cuh"
+
+/**
+ * Full particle record stored on the GPU: all fields of one particle are kept
+ * together in one struct. The struct is declared with 16 byte alignment.
+ * NOTE(review): the meaning of the individual fields follows the OPAL
+ * collimator physics code and is not visible here - confirm against OPAL.
+ */
+typedef struct __align__(16) {
+  int label;
+  unsigned localID;
+  double3 Rincol;
+  double3 Pincol;
+  long IDincol;
+  int Binincol;
+  double DTincol;
+  double Qincol;
+  long LastSecincol;
+  double3 Bfincol;
+  double3 Efincol;
+} CUDA_PART;
+
+/**
+ * Reduced particle record stored on the GPU: only the first four fields of
+ * CUDA_PART (label, localID, Rincol, Pincol). No explicit alignment is requested.
+ */
+typedef struct {
+  int label;
+  unsigned localID;
+  double3 Rincol;
+  double3 Pincol;
+} CUDA_PART_SMALL;
+
+/**
+ * Pointer variant of CUDA_PART: holds one pointer for every field of CUDA_PART.
+ * Presumably each pointer refers to a per-particle array on the GPU
+ * (structure of arrays layout) - verify against the code that fills it.
+ */
+typedef struct {
+  int *label;
+  unsigned *localID;
+  double3 *Rincol;
+  double3 *Pincol;
+  long *IDincol;
+  int *Binincol;
+  double *DTincol;
+  double *Qincol;
+  long *LastSecincol;
+  double3 *Bfincol;
+  double3 *Efincol;
+} CUDA_PART2;
+
+/**
+ * Pointer variant of CUDA_PART_SMALL: one pointer for each of label, localID,
+ * Rincol and Pincol.
+ */
+typedef struct {
+  int *label;
+  unsigned *localID;
+  double3 *Rincol;
+  double3 *Pincol;
+} CUDA_PART2_SMALL;
+
+/** CudaCollimatorPhysics class.
+ * Contains kernels that execute CollimatorPhysics functions from OPAL.
+ * For detailed documentation on CollimatorPhysics functions see the OPAL documentation.
+ * The object either borrows a CudaBase from the caller or creates its own one,
+ * base_create records which of the two applies.
+ */
+class CudaCollimatorPhysics : public DKSCollimatorPhysics{
+
+private:
+
+  // true if m_base was allocated by this object and has to be deleted in the destructor
+  bool base_create;
+  // CudaBase used to look up cuda streams
+  CudaBase *m_base;
+
+public:
+
+  /** Constructor with CudaBase argument.
+   * The passed base is only borrowed, it is not deleted in the destructor.
+   */
+  CudaCollimatorPhysics(CudaBase *base) {
+    m_base = base;
+    base_create = false;
+  }
+
+  /** Default constructor - creates its own CudaBase, which the destructor deletes. */
+  CudaCollimatorPhysics() { 
+    m_base = new CudaBase();
+    base_create = true;
+  }
+
+  /** Destructor - deletes m_base only if it was created by the default constructor. */
+  ~CudaCollimatorPhysics() { 
+    if (base_create)
+      delete m_base;
+  };
+
+  /** Execute collimator physics kernel.
+   * Implemented in the corresponding .cu file.
+   */
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
+			int numpartices);
+
+  /** Structure of arrays variant of CollimatorPhysics.
+   * Not implemented for CUDA: always returns DKS_ERROR.
+   */
+  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			   void *px_ptr, void *py_ptr, void *pz_ptr,
+			   void *par_ptr, int numparticles)
+    {
+      return DKS_ERROR;
+    }
+
+  /** Sort particle array on GPU.
+   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
+   * array so these particles are at the end of array
+   */
+  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
+  
+  /** Structure of arrays variant of CollimatorPhysicsSort.
+   * Not implemented for CUDA: always returns DKS_ERROR and leaves numaddback untouched.
+   */
+  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			       void *px_ptr, void *py_ptr, void *pz_ptr,
+			       void *par_ptr, int numparticles, int &numaddback) 
+    {
+      return DKS_ERROR;
+    }
+
+  /** BorisPusher push function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   * If usedt is false the scalar dt is used, otherwise the values in dt_ptr.
+   * streamId -1 selects the default stream.
+   */
+  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
+			   double dt, double c, bool usedt = false, int streamId = -1);
+
+  /** BorisPusher push function with transformTo function from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see the ParallelTTracker documentation in OPAL.
+   * If usedt is false the scalar dt is used, otherwise the values in dt_ptr.
+   * streamId -1 selects the default stream.
+   */
+  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
+				    void *orient_ptr, int npart, int nsec, 
+				    void *dt_ptr, double dt, double c, 
+				    bool usedt = false, int streamId = -1);
+
+};
+
+#endif
diff --git a/src/CUDA/CudaFFT.cu b/src/CUDA/CudaFFT.cu
new file mode 100644
index 0000000..88e45ca
--- /dev/null
+++ b/src/CUDA/CudaFFT.cu
@@ -0,0 +1,376 @@
+#include "CudaFFT.cuh"
+
+/*
+  Info: divide real and imaginary part of one element of in by N
+  The element is selected by the block index only, the thread index is not used.
+  NOTE(review): launching with more than one thread per block would make every
+  thread of a block scale the same element - confirm the launch configuration.
+*/
+__global__ void normalize(cufftDoubleComplex *in, int N) {
+
+  const int elem = blockIdx.x;
+
+  //blocks past the end of the array have nothing to do
+  if (elem >= N)
+    return;
+
+  in[elem].x = in[elem].x / N;
+  in[elem].y = in[elem].y / N;
+
+}
+
+/* constructor with CudaBase argument, the base is borrowed and not deleted by this object */
+CudaFFT::CudaFFT(CudaBase *base)
+  : base_create(false),
+    m_base(base)
+{
+}
+
+/* default constructor, creates a CudaBase that is owned by this object */
+CudaFFT::CudaFFT()
+  : base_create(true),
+    m_base(new CudaBase())
+{
+}
+
+/* destructor, deletes the CudaBase only if this object created it */
+CudaFFT::~CudaFFT() {
+  if (!base_create)
+    return;
+
+  delete m_base;
+}
+		
+/*
+  Info: execute in place complex to complex double precision fft using cufft library
+  mem_ptr  - device memory holding the cufftDoubleComplex data, transformed in place
+  ndim, N  - dimension and sizes of the transform; if they match the default plan
+             (useDefaultPlan) the cached Z2Z plan is used, otherwise a temporary
+             plan is created for this call and destroyed before returning
+  streamId - stream to run on, -1 or an id past the number of streams uses stream 0
+  forward  - true for CUFFT_FORWARD, false for CUFFT_INVERSE
+  Return: success or error code
+*/
+int CudaFFT::executeFFT(void * mem_ptr, int ndim, int N[3], int streamId, bool forward) {
+
+  cufftResult cresult;
+  cufftHandle plan;
+
+  //a temporary plan is owned by this call and has to be released on every exit path,
+  //the cached default plan must never be destroyed here
+  const bool ownPlan = !useDefaultPlan(ndim, N);
+
+  if (!ownPlan) {
+    plan = defaultPlanZ2Z;
+  } else {
+    switch (ndim) {
+    case 1:
+      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2Z, 1);
+      break;
+    case 2:
+      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2Z);
+      break;
+    case 3:
+      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2Z);
+      break;
+    default:
+      //no plan can be created, executing with an uninitialized handle is not an option
+      DEBUG_MSG("Unsupported FFT dimension: " << ndim);
+      return DKS_ERROR;
+    }
+    if (cresult != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
+      if (cresult == CUFFT_SETUP_FAILED)
+        DEBUG_MSG("Setup failed");
+
+      if (cresult == CUFFT_INVALID_SIZE)
+        DEBUG_MSG("Invalid size");
+
+      if (cresult == CUFFT_INVALID_TYPE)
+        DEBUG_MSG("Invalid type");
+
+      if (cresult == CUFFT_ALLOC_FAILED)
+        DEBUG_MSG("Alloc failed");
+
+      return DKS_ERROR;
+    }
+  }
+
+  //always set the stream, the cached plan may still carry the stream of an earlier call
+  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
+    cufftSetStream(plan, m_base->cuda_getStream(streamId));
+  else
+    cufftSetStream(plan, 0);
+
+  //execute in place FFT on the selected plan
+  cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr,
+                         (cufftDoubleComplex*)mem_ptr,
+                         forward ? CUFFT_FORWARD : CUFFT_INVERSE);
+
+  //clean up resources, only the temporary plan
+  if (ownPlan)
+    cufftDestroy(plan);
+
+  if (cresult != CUFFT_SUCCESS) {
+    if (forward) {
+      DEBUG_MSG("Error executing fft, cuda error: " << cresult);
+    } else {
+      DEBUG_MSG("Error executing ifft, cuda error: " << cresult);
+    }
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+		
+/*
+  Info: execute ifft, forwards to executeFFT with forward = false
+  Return: success or error code
+*/
+int CudaFFT::executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId) {
+  return executeFFT(mem_ptr, ndim, N, streamId, false);
+}
+		
+/*
+  Info: normalize complex data after an inverse FFT, every element of mem_ptr is
+  scaled by 1 / (N[0]*N[1]*N[2]) with cublasZscal on the default cublas handle
+  created in setupFFT
+  The size is always the product of all three entries of N, ndim is not used.
+  NOTE(review): this assumes unused dimensions are set to 1 by the caller - confirm
+  streamId - stream to run on, -1 or an id past the number of streams uses stream 0
+  Return: success or error code
+*/
+int CudaFFT::normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId) {
+
+  cublasStatus_t status;
+  unsigned int size = N[0]*N[1]*N[2];
+  cuDoubleComplex alpha = make_cuDoubleComplex(1.0/size, 0);
+
+  //the handle is shared between calls and keeps its stream, so reset it to the
+  //default stream when no valid stream is requested (same as for the cufft plans)
+  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
+    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
+  else
+    cublasSetStream(defaultCublasFFT, 0);
+
+  status = cublasZscal(defaultCublasFFT, size, &alpha, (cuDoubleComplex*)mem_ptr, 1);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("CUBLAS exec Zscal failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: execute real to complex double precision FFT
+  real_ptr - device memory with the cufftDoubleReal input
+  comp_ptr - device memory receiving the cufftDoubleComplex output
+  ndim, N  - dimension and sizes of the transform; if they match the default plan
+             (useDefaultPlan) the cached D2Z plan is used, otherwise a temporary
+             plan is created for this call and destroyed before returning
+  streamId - stream to run on, -1 or an id past the number of streams uses stream 0
+  Return: success or error code
+*/
+int CudaFFT::executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {
+
+  cufftResult cresult;
+  cufftHandle plan;
+
+  //a temporary plan is owned by this call and has to be released on every exit path
+  const bool ownPlan = !useDefaultPlan(ndim, N);
+
+  if (!ownPlan) {
+    plan = defaultPlanD2Z;
+  } else {
+    switch (ndim) {
+    case 1:
+      cresult = cufftPlan1d(&plan, N[0], CUFFT_D2Z, 1);
+      break;
+    case 2:
+      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_D2Z);
+      break;
+    case 3:
+      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_D2Z);
+      break;
+    default:
+      //no plan can be created, executing with an uninitialized handle is not an option
+      DEBUG_MSG("Unsupported FFT dimension: " << ndim);
+      return DKS_ERROR;
+    }
+    if (cresult != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
+      return DKS_ERROR;
+    }
+  }
+
+  //always set the stream, the cached plan may still carry the stream of an earlier call
+  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
+    cufftSetStream(plan, m_base->cuda_getStream(streamId));
+  else
+    cufftSetStream(plan, 0);
+
+  //execute FFT from real_ptr to comp_ptr on the selected plan
+  cresult = cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr, (cufftDoubleComplex*)comp_ptr);
+
+  if (cresult != CUFFT_SUCCESS) {
+    DEBUG_MSG("Error executing fft, cuda error: " << cresult);
+    if (cresult == CUFFT_INVALID_PLAN)
+      DEBUG_MSG("invalid plan");
+    if (cresult == CUFFT_INVALID_VALUE)
+      DEBUG_MSG("invalid value");
+    if (cresult == CUFFT_INTERNAL_ERROR)
+      DEBUG_MSG("internal error");
+    if (cresult == CUFFT_EXEC_FAILED)
+      DEBUG_MSG("exec failed");
+    if (cresult == CUFFT_SETUP_FAILED)
+      DEBUG_MSG("setup failed");
+
+    //do not leak the temporary plan on the error path
+    if (ownPlan)
+      cufftDestroy(plan);
+
+    return DKS_ERROR;
+  }
+
+  //clean up resources, only the temporary plan
+  if (ownPlan) {
+    cresult = cufftDestroy(plan);
+    if (cresult != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
+      return DKS_ERROR;
+    }
+  }
+  return DKS_SUCCESS;
+}
+	
+/*
+  Info: execute complex to real double precision FFT
+  real_ptr - device memory receiving the cufftDoubleReal output
+  comp_ptr - device memory with the cufftDoubleComplex input
+  ndim, N  - dimension and sizes of the transform; if they match the default plan
+             (useDefaultPlan) the cached Z2D plan is used, otherwise a temporary
+             plan is created for this call and destroyed before returning
+  streamId - stream to run on, -1 or an id past the number of streams uses stream 0
+  Return: success or error code
+*/
+int CudaFFT::executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {
+
+  cufftResult cresult;
+  cufftHandle plan;
+
+  //a temporary plan is owned by this call and has to be released on every exit path,
+  //the cached default plan must never be destroyed here
+  const bool ownPlan = !useDefaultPlan(ndim, N);
+
+  if (!ownPlan) {
+    plan = defaultPlanZ2D;
+  } else {
+    switch (ndim) {
+    case 1:
+      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2D, 1);
+      break;
+    case 2:
+      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2D);
+      break;
+    case 3:
+      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2D);
+      break;
+    default:
+      //no plan can be created, executing with an uninitialized handle is not an option
+      DEBUG_MSG("Unsupported FFT dimension: " << ndim);
+      return DKS_ERROR;
+    }
+    if (cresult != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
+      return DKS_ERROR;
+    }
+  }
+
+  //always set the stream, the cached plan may still carry the stream of an earlier call
+  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
+    cufftSetStream(plan, m_base->cuda_getStream(streamId));
+  else
+    cufftSetStream(plan, 0);
+
+  //execute FFT from comp_ptr to real_ptr on the selected plan
+  cresult = cufftExecZ2D(plan, (cufftDoubleComplex*)comp_ptr, (cufftDoubleReal*)real_ptr);
+
+  if (cresult != CUFFT_SUCCESS) {
+    DEBUG_MSG("Error executing fft, cuda error: " << cresult);
+    //release only a temporary plan, the default plan stays valid for later calls
+    if (ownPlan)
+      cufftDestroy(plan);
+    return DKS_ERROR;
+  }
+
+  //clean up resources, only the temporary plan
+  if (ownPlan) {
+    cresult = cufftDestroy(plan);
+    if (cresult != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
+      return DKS_ERROR;
+    }
+  }
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: execute normalize for complex to real iFFT, every element of real_ptr is
+  scaled by 1 / (N[0]*N[1]*N[2]) with cublasDscal on the default cublas handle
+  created in setupFFT
+  The size is always the product of all three entries of N, ndim is not used.
+  NOTE(review): this assumes unused dimensions are set to 1 by the caller - confirm
+  streamId - stream to run on, -1 or an id past the number of streams uses stream 0
+  Return: success or error code
+*/
+int CudaFFT::normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId) {
+  cublasStatus_t status;
+  unsigned int size = N[0]*N[1]*N[2];
+  double alpha = 1.0/size;
+
+  //the handle is shared between calls and keeps its stream, so reset it to the
+  //default stream when no valid stream is requested (same as for the cufft plans)
+  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
+    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
+  else
+    cublasSetStream(defaultCublasFFT, 0);
+
+  status = cublasDscal(defaultCublasFFT, size, &alpha, (double*)real_ptr, 1);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("CUBLAS exec Dscal failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: init cufftPlans which can be reused for all FFTs of the same size and type
+  For ndim 1, 2 or 3 the default Z2Z, D2Z and Z2D plans are created, for every
+  accepted ndim the default cublas handle is created. On success ndim is stored in
+  defaultNdim and, for ndim > 0, N is stored in defaultN.
+  If any step fails everything created by this call is released again.
+  NOTE(review): calling setupFFT twice without destroyFFT in between overwrites
+  the handles of the first call - confirm callers pair the two functions.
+  NOTE(review): a negative ndim still creates the cublas handle, which
+  destroyFFT (defaultNdim > -1) will not release - confirm no caller does this.
+  Return: success or error code
+*/
+int CudaFFT::setupFFT(int ndim, int N[3]) {
+
+  //more than 3 dimensions would be recorded as a valid default without any plan
+  if (ndim > 3) {
+    DEBUG_MSG("Unsupported FFT dimension: " << ndim);
+    return DKS_ERROR;
+  }
+
+  cufftResult cr1 = CUFFT_SUCCESS;
+  cufftResult cr2 = CUFFT_SUCCESS;
+  cufftResult cr3 = CUFFT_SUCCESS;
+
+  //create default fft plans
+  if (ndim == 1) {
+    cr1 = cufftPlan1d(&defaultPlanZ2Z, N[0], CUFFT_Z2Z, 1);
+    cr2 = cufftPlan1d(&defaultPlanD2Z, N[0], CUFFT_D2Z, 1);
+    cr3 = cufftPlan1d(&defaultPlanZ2D, N[0], CUFFT_Z2D, 1);
+  }
+
+  if (ndim == 2) {
+    cr1 = cufftPlan2d(&defaultPlanZ2Z, N[1], N[0], CUFFT_Z2Z);
+    cr2 = cufftPlan2d(&defaultPlanD2Z, N[1], N[0], CUFFT_D2Z);
+    cr3 = cufftPlan2d(&defaultPlanZ2D, N[1], N[0], CUFFT_Z2D);
+  }
+
+  if (ndim == 3) {
+    cr1 = cufftPlan3d(&defaultPlanZ2Z, N[2], N[1], N[0], CUFFT_Z2Z);
+    cr2 = cufftPlan3d(&defaultPlanD2Z, N[2], N[1], N[0], CUFFT_D2Z);
+    cr3 = cufftPlan3d(&defaultPlanZ2D, N[2], N[1], N[0], CUFFT_Z2D);
+  }
+
+  if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) {
+    //a failure implies ndim is 1, 2 or 3, release the plans that were created
+    if (cr1 == CUFFT_SUCCESS)
+      cufftDestroy(defaultPlanZ2Z);
+    if (cr2 == CUFFT_SUCCESS)
+      cufftDestroy(defaultPlanD2Z);
+    if (cr3 == CUFFT_SUCCESS)
+      cufftDestroy(defaultPlanZ2D);
+
+    DEBUG_MSG("Error creating default plan");
+    return DKS_ERROR;
+  }
+
+  //create cublas handle
+  cublasStatus_t status;
+  status = cublasCreate(&defaultCublasFFT);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    //plans only exist for ndim 1, 2 or 3
+    if (ndim > 0) {
+      cufftDestroy(defaultPlanZ2Z);
+      cufftDestroy(defaultPlanD2Z);
+      cufftDestroy(defaultPlanZ2D);
+    }
+
+    DEBUG_MSG("CUBLAS create default handle failed!");
+    return DKS_ERROR;
+  }
+
+  //remember what the default plans were created for
+  defaultNdim = ndim;
+  if (ndim > 0) {
+    defaultN[0] = N[0];
+    defaultN[1] = N[1];
+    defaultN[2] = N[2];
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+/*
+  Info: destroy default FFT plans (if defaultNdim > 0) and the default cublas
+  handle (if defaultNdim > -1), then mark the defaults as unset
+  (defaultNdim and defaultN set to -1).
+  All resources are released and the state is reset even if one of the destroy
+  calls fails, so a failure neither leaks the remaining handles nor leaves stale
+  defaults that a later call would destroy a second time.
+  Return: success, or error code if any destroy call failed
+*/
+int CudaFFT::destroyFFT() {
+
+  bool failed = false;
+
+  if (defaultNdim > 0) {
+    //clean up resources
+    cufftResult cr1 = cufftDestroy(defaultPlanZ2Z);
+    cufftResult cr2 = cufftDestroy(defaultPlanD2Z);
+    cufftResult cr3 = cufftDestroy(defaultPlanZ2D);
+
+    if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) {
+      DEBUG_MSG("Error destroying default cufft plans");
+      failed = true;
+    }
+  }
+
+  if (defaultNdim > -1) {
+    cublasStatus_t status = cublasDestroy(defaultCublasFFT);
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      DEBUG_MSG("CUBLAS delete default handle failed!");
+      failed = true;
+    }
+  }
+
+  //mark defaults as unset
+  defaultN[0] = -1;
+  defaultN[1] = -1;
+  defaultN[2] = -1;
+  defaultNdim = -1;
+
+  if (failed)
+    return DKS_ERROR;
+
+  return DKS_SUCCESS;
+
+}
+
+
+
diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh
new file mode 100644
index 0000000..0c22f2c
--- /dev/null
+++ b/src/CUDA/CudaFFT.cuh
@@ -0,0 +1,88 @@
+#ifndef H_CUDA_FFT
+#define H_CUDA_FFT
+
+#include <iostream>
+#include <math.h>
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include "cublas_v2.h"
+
+#include "../Algorithms/FFT.h"
+#include "CudaBase.cuh"
+
+/**
+ * CUDA implementation of the DKSFFT interface using cufft for the transforms and
+ * cublas for the normalization. Keeps a set of default plans that are reused
+ * for all transforms matching the size given to setupFFT.
+ */
+class CudaFFT : public DKSFFT{
+
+private:
+
+  // true if m_base was allocated by this object and has to be deleted in the destructor
+  bool base_create;
+  // CudaBase used to look up cuda streams
+  CudaBase *m_base;
+	
+  // default plans and cublas handle, created in setupFFT and released in destroyFFT
+  cufftHandle defaultPlanZ2Z;
+  cufftHandle defaultPlanD2Z;
+  cufftHandle defaultPlanZ2D;
+  cublasHandle_t defaultCublasFFT;
+
+public:
+	
+  /** Constructor with CudaBase as argument, the base is not deleted by this object */
+  CudaFFT(CudaBase *base);
+
+  /** Default constructor, creates its own CudaBase */
+  CudaFFT();
+		
+  /** Destructor, deletes the CudaBase only if it was created by this object */
+  ~CudaFFT();
+		
+  /**
+   * Info: init cufftPlans which can be reused for all FFTs of the same size and type
+   * Return: success or error code
+   */
+  int setupFFT(int ndim, int N[3]);
+  /** No separate setup is done for real to complex FFT, always returns DKS_SUCCESS */
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+  /** No separate setup is done for complex to real FFT, always returns DKS_SUCCESS */
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+
+  /**
+   * Info: destroy default FFT plans and the default cublas handle
+   * Return: success or error code
+   */
+  int destroyFFT();
+
+  /*
+    Info: execute in place complex to complex double precision fft using cufft library
+    forward selects between forward (true) and inverse (false) transform
+    Return: success or error code
+  */
+  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
+		
+  /*
+    Info: execute ifft, same as executeFFT with forward = false
+    Return: success or error code
+  */
+  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
+		
+  /*
+    Info: execute normalize for complex to complex iFFT, data is scaled by
+    1 / (N[0]*N[1]*N[2]) using cublas Zscal
+    Return: success or error code
+  */
+  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
+		
+  /*
+    Info: execute real to complex double precision FFT
+    Return: success or error code
+  */
+  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
+		
+  /*
+    Info: execute complex to real double precision FFT
+    Return: success or error code
+  */
+  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
+
+  /*
+    Info: execute normalize for complex to real iFFT, data is scaled by
+    1 / (N[0]*N[1]*N[2]) using cublas Dscal
+    Return: success or error code
+  */
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
+
+};
+
+#endif
diff --git a/src/CUDA/CudaGreensFunction.cu b/src/CUDA/CudaGreensFunction.cu
new file mode 100644
index 0000000..140954b
--- /dev/null
+++ b/src/CUDA/CudaGreensFunction.cu
@@ -0,0 +1,469 @@
+#include "CudaGreensFunction.cuh"
+
+/*
+  Info: calculate one cell of the tmpgreen field per block
+  The cell (i, j, k) is taken from the 3D block index, threads are not used and
+  there is no bounds check, so the grid has to match the size of tmpgreen.
+  hr_m0, hr_m1, hr_m2 are the cell sizes, NI and NJ the strides of tmpgreen.
+*/
+__global__ void kernelTmpgreen(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ) {
+
+  
+  int i = blockIdx.x;
+  int j = blockIdx.y;
+  int k = blockIdx.z;
+  
+  double cellVolume = hr_m0 * hr_m1 * hr_m2;
+  
+  //cell position shifted by half a cell in every direction
+  double vv0 = i * hr_m0 - hr_m0 / 2;
+  double vv1 = j * hr_m1 - hr_m1 / 2;
+  double vv2 = k * hr_m2 - hr_m2 / 2;
+  
+  double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+  //atan terms, halved below
+  double tmpgrn  = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+  tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+  tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+  
+  tmpgrn = tmpgrn / 2;
+
+  //log terms
+  tmpgrn += vv1 * vv2 * log(vv0 + r);
+  tmpgrn += vv0 * vv2 * log(vv1 + r);
+  tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+  //i is the fastest running index
+  tmpgreen[i + j * NI + k * NI * NJ] = tmpgrn / cellVolume;
+  
+}
+
+/*
+  Info: calculate one cell of the tmpgreen field per thread
+  Same calculation as kernelTmpgreen, but the cell is taken from the linear
+  thread id and threads with an id past NI * NJ * NK do nothing.
+  hr_m0, hr_m1, hr_m2 are the cell sizes, NI, NJ, NK the size of tmpgreen.
+*/
+__global__ void kernelTmpgreen_2(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ, int NK) {
+
+  int tid = threadIdx.x;
+  int id = blockIdx.x * blockDim.x + tid;
+
+  if (id < NI * NJ * NK) {
+    //split the linear id into (i, j, k), i is the fastest running index
+    int i = id % NI;
+    int k = id / (NI * NJ);
+    int j = (id - k * NI * NJ) / NI;
+  
+    
+    double cellVolume = hr_m0 * hr_m1 * hr_m2;
+    
+    //cell position shifted by half a cell in every direction
+    double vv0 = i * hr_m0 - hr_m0 / 2;
+    double vv1 = j * hr_m1 - hr_m1 / 2;
+    double vv2 = k * hr_m2 - hr_m2 / 2;
+  
+    double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+    
+    //atan terms, halved below
+    double tmpgrn  = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+    tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+    tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+  
+    tmpgrn = tmpgrn / 2;
+
+    //log terms
+    tmpgrn += vv1 * vv2 * log(vv0 + r);
+    tmpgrn += vv0 * vv2 * log(vv1 + r);
+    tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+    tmpgreen[id] = tmpgrn / cellVolume;
+    
+  }
+  
+}
+
+/*
+  Info: calculate the tmpgreen field on the cpu (host version of kernelTmpgreen_2)
+  tmpgreen - host array of NI * NJ * NK doubles that receives the result,
+             cell (i, j, k) is stored at i + j * NI + k * NI * NJ as in the kernels
+  hr_m0, hr_m1, hr_m2 - cell sizes
+  NI, NJ, NK - size of the field
+  The function only fills the host array, a transfer to the gpu is up to the caller.
+*/
+void kernelTmpgreenCPU(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
+                       int NI, int NJ, int NK)
+{
+
+  double cellVolume = hr_m0 * hr_m1 * hr_m2;
+
+  for (int k = 0; k < NK; k++) {
+    for (int j = 0; j < NJ; j++) {
+      for (int i = 0; i < NI; i++) {
+
+        //cell position shifted by half a cell in every direction
+        double vv0 = i * hr_m0 - hr_m0 / 2;
+        double vv1 = j * hr_m1 - hr_m1 / 2;
+        double vv2 = k * hr_m2 - hr_m2 / 2;
+
+        double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+        //atan terms, halved below
+        double tmpgrn = 0;
+        tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+        tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+        tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+
+        tmpgrn = tmpgrn / 2;
+
+        //log terms
+        tmpgrn += vv1 * vv2 * log(vv0 + r);
+        tmpgrn += vv0 * vv2 * log(vv1 + r);
+        tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+        tmpgrn = tmpgrn / cellVolume;
+
+        //a row holds NI cells, so the row stride is NI (not NJ)
+        tmpgreen[k*NJ*NI + j*NI + i] = tmpgrn;
+      }
+    }
+  }
+
+}
+
+
+/*
+  Info: combine the eight tmpgreen values at the corners (i..i+1, j..j+1, k..k+1)
+  of one cell into one value of rho2_m, one cell per block
+  The cell (i, j, k) is taken from the 3D block index. Corners outside of the
+  tmpgreen grid (NI_tmp x NJ_tmp x NK_tmp) count as zero.
+  NI and NJ are the strides of rho2_m.
+*/
+__global__ void kernelIngration(double *rho2_m, double *tmpgreen, int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  const int i = blockIdx.x;
+  const int j = blockIdx.y;
+  const int k = blockIdx.z;
+
+  //strides of tmpgreen and offset of the cell itself
+  const int rowStride = NI_tmp;
+  const int sliceStride = NI_tmp * NJ_tmp;
+  const int cell = i + j * rowStride + k * sliceStride;
+
+  //is the next cell in each direction still inside of tmpgreen
+  const bool hasI = (i + 1 < NI_tmp);
+  const bool hasJ = (j + 1 < NJ_tmp);
+  const bool hasK = (k + 1 < NK_tmp);
+
+  //gXYZ is the value at offset (X, Y, Z) from the cell, zero if outside
+  const double g111 = (hasI && hasJ && hasK) ? tmpgreen[cell + 1 + rowStride + sliceStride] : 0;
+  const double g100 = hasI ? tmpgreen[cell + 1] : 0;
+  const double g010 = hasJ ? tmpgreen[cell + rowStride] : 0;
+  const double g001 = hasK ? tmpgreen[cell + sliceStride] : 0;
+  const double g110 = (hasI && hasJ) ? tmpgreen[cell + 1 + rowStride] : 0;
+  const double g101 = (hasI && hasK) ? tmpgreen[cell + 1 + sliceStride] : 0;
+  const double g011 = (hasJ && hasK) ? tmpgreen[cell + rowStride + sliceStride] : 0;
+  const double g000 = tmpgreen[cell];
+
+  rho2_m[i + j*NI + k*NI*NJ] = g111 + g100 + g010 + g001 - g110 - g101 - g011 - g000;
+
+}
+
+/*
+  Info: combine the eight tmpgreen values at the corners (i..i+1, j..j+1, k..k+1)
+  of one cell into one value of rho2_m, one cell per thread
+  The cell is taken from the linear thread id, threads with an id past
+  NI_tmp * NJ_tmp * NK_tmp do nothing. Corners outside of the tmpgreen grid
+  count as zero. NI and NJ are the strides of rho2_m.
+*/
+__global__ void kernelIngration_2(double *rho2_m, double *tmpgreen, 
+				  int NI, int NJ,
+				  int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //surplus threads of the last block
+  if (id >= NI_tmp * NJ_tmp * NK_tmp)
+    return;
+
+  //strides of tmpgreen
+  const int rowStride = NI_tmp;
+  const int sliceStride = NI_tmp * NJ_tmp;
+
+  //split the linear id into (i, j, k), i is the fastest running index
+  const int i = id % NI_tmp;
+  const int k = id / sliceStride;
+  const int j = (id - k * sliceStride) / NI_tmp;
+  const int cell = i + j * rowStride + k * sliceStride;
+
+  //is the next cell in each direction still inside of tmpgreen
+  const bool hasI = (i + 1 < NI_tmp);
+  const bool hasJ = (j + 1 < NJ_tmp);
+  const bool hasK = (k + 1 < NK_tmp);
+
+  //gXYZ is the value at offset (X, Y, Z) from the cell, zero if outside
+  const double g111 = (hasI && hasJ && hasK) ? tmpgreen[cell + 1 + rowStride + sliceStride] : 0;
+  const double g100 = hasI ? tmpgreen[cell + 1] : 0;
+  const double g010 = hasJ ? tmpgreen[cell + rowStride] : 0;
+  const double g001 = hasK ? tmpgreen[cell + sliceStride] : 0;
+  const double g110 = (hasI && hasJ) ? tmpgreen[cell + 1 + rowStride] : 0;
+  const double g101 = (hasI && hasK) ? tmpgreen[cell + 1 + sliceStride] : 0;
+  const double g011 = (hasJ && hasK) ? tmpgreen[cell + rowStride + sliceStride] : 0;
+  const double g000 = tmpgreen[cell];
+
+  rho2_m[i + j*NI + k*NI*NJ] = g111 + g100 + g010 + g001 - g110 - g101 - g011 - g000;
+
+}
+
+
+//copies the element at index NI*NJ (cell (0, 0, 1)) to element 0
+//does not use block or thread index, so it is meant to be launched with a single thread
+__global__ void mirroredRhoField0(double *rho2_m, int NI, int NJ) {
+  rho2_m[0] = rho2_m[NI*NJ];
+}
+
+/*
+  Info: mirror rho2_m along the first index, cell (i, j, k) is copied to (NI-i, j, k)
+  The cell is taken from the 3D block index, i == 0 has no mirror cell and is skipped.
+*/
+__global__ void mirroredRhoFieldI(double *rho2_m, int NI, int NJ) {
+
+  const int i = blockIdx.x;
+  const int j = blockIdx.y;
+  const int k = blockIdx.z;
+
+  //mirrored index has to stay inside of the row
+  const int mirror = NI - i;
+  if (mirror < NI)
+    rho2_m[mirror + j*NI + k*NI*NJ] = rho2_m[i + j*NI + k*NI*NJ];
+
+}
+
+/*
+  Info: mirror rho2_m along the second index, cell (i, j, k) is copied to (i, NJ-j, k)
+  The cell is taken from the 3D block index, j == 0 has no mirror cell and is skipped.
+*/
+__global__ void mirroredRhoFieldJ(double *rho2_m, int NI, int NJ) {
+
+  const int i = blockIdx.x;
+  const int j = blockIdx.y;
+  const int k = blockIdx.z;
+
+  //mirrored index has to stay inside of the slice
+  const int mirror = NJ - j;
+  if (mirror < NJ)
+    rho2_m[i + mirror*NI + k*NI*NJ] = rho2_m[i + j*NI + k*NI*NJ];
+
+}
+
+/*
+  Info: mirror rho2_m along the third index, cell (i, j, k) is copied to (i, j, NK-k)
+  The cell is taken from the 3D block index, k == 0 has no mirror cell and is skipped.
+*/
+__global__ void mirroredRhoFieldK(double *rho2_m, int NI, int NJ, int NK) {
+
+  const int i = blockIdx.x;
+  const int j = blockIdx.y;
+  const int k = blockIdx.z;
+
+  //mirrored index has to stay inside of the field
+  const int mirror = NK - k;
+  if (mirror < NK)
+    rho2_m[i + j*NI + mirror*NI*NJ] = rho2_m[i + j*NI + k*NI*NJ];
+
+}
+
+/*
+  Info: mirror rho2_m in all three directions with one thread per source cell
+  rho2_m - field of size NI x NJ x NK, i is the fastest running index
+  NI_tmp, NJ_tmp, NK_tmp - size of the source region, threads with an id past
+  NI_tmp * NJ_tmp * NK_tmp do nothing
+  The value of source cell (i, j, k) is written to every cell that is obtained
+  by replacing i with NI-i, j with NJ-j and/or k with NK-k. An index of 0 has no
+  mirror (NI-0 would be outside of the field), so that direction is skipped.
+*/
+__global__ void mirroredRhoField(double *rho2_m, 
+				 int NI, int NJ, int NK, 
+				 int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  int tid = threadIdx.x;
+  int id = blockIdx.x * blockDim.x + tid;
+
+  int id1, id2, id3, id4, id5, id6, id7, id8;
+
+  if (id < NI_tmp * NJ_tmp * NK_tmp) {
+    //split the linear id into the source cell (i, j, k)
+    int i = id % NI_tmp;
+    int k = id / (NI_tmp * NJ_tmp);
+    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
+
+    //mirrored indices
+    int ri = NI - i;
+    int rj = NJ - j;
+    int rk = NK - k;
+
+    //source cell and its seven mirror cells
+    id1 = k * NI * NJ + j * NI + i;
+    id2 = k * NI * NJ + j * NI + ri;
+    id3 = k * NI * NJ + rj * NI + i;
+    id4 = k * NI * NJ + rj * NI + ri;
+
+    id5 = rk * NI * NJ + j * NI + i;
+    id6 = rk * NI * NJ + j * NI + ri;
+    id7 = rk * NI * NJ + rj * NI + i;
+    id8 = rk * NI * NJ + rj * NI + ri;
+
+    double data = rho2_m[id1];
+    if (i != 0)
+      rho2_m[id2] = data;
+
+    if (j != 0)
+      rho2_m[id3] = data;
+
+    if (i != 0 && j != 0)
+      rho2_m[id4] = data;
+
+    if (k != 0)
+      rho2_m[id5] = data;
+
+    if (k != 0 && i != 0)
+      rho2_m[id6] = data;
+
+    if (k != 0 && j != 0)
+      rho2_m[id7] = data;
+
+    //logical and, the original used a bitwise & between the last two tests
+    if (k != 0 && j != 0 && i != 0)
+      rho2_m[id8] = data;
+
+  }
+
+}
+
+/*
+  Info: product of two complex numbers, x is the real and y the imaginary part
+  Return: a * b
+*/
+__device__ inline cuDoubleComplex ComplexMul(cuDoubleComplex a, cuDoubleComplex b) {
+
+  cuDoubleComplex prod;
+
+  //(a.x + i a.y) * (b.x + i b.y)
+  prod.x = a.x * b.x - a.y * b.y;
+  prod.y = a.x * b.y + a.y * b.x;
+
+  return prod;
+
+}
+
+/*
+  Info: multiply two complex fields element by element, result is stored in ptr1
+  One element per block, selected by the block index only. There is no bounds
+  check, so the number of blocks has to match the number of elements.
+*/
+__global__ void multiplyComplexFields(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2) {
+
+  const int elem = blockIdx.x;
+
+  const cuDoubleComplex lhs = ptr1[elem];
+  const cuDoubleComplex rhs = ptr2[elem];
+
+  ptr1[elem] = ComplexMul(lhs, rhs);
+}
+
+
+/*
+  Info: multiply two complex fields element by element, result is stored in ptr1
+  One element per thread, threads with an index past size do nothing.
+  Both factors are first copied to dynamic shared memory (two entries per thread,
+  so the launch has to provide 2 * blockDim.x * sizeof(cuDoubleComplex) bytes)
+  and multiplied after a __syncthreads().
+  Original author's note: the staging needs only a few global memory accesses, so
+  it may bring no improvement; block and thread sizes should be tested.
+*/
+__global__ void multiplyComplexFields_2(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2, 
+					int size) 
+{
+
+  extern __shared__ cuDoubleComplex data[];
+
+  const int tid = threadIdx.x;
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const bool active = (idx < size);
+
+  //the two shared memory entries that belong to this thread
+  cuDoubleComplex *slot = &data[2*tid];
+
+  if (active) {
+    slot[0] = ptr1[idx];
+    slot[1] = ptr2[idx];
+  }
+
+  //reached by all threads of the block, also the inactive ones
+  __syncthreads();
+
+  if (active)
+    ptr1[idx] = ComplexMul(slot[0], slot[1]);
+
+}
+
+
+/* constructor with CudaBase argument, the base is borrowed and not deleted by this object */
+CudaGreensFunction::CudaGreensFunction(CudaBase *base)
+  : base_create(false),
+    m_base(base)
+{
+}
+
+/* default constructor, creates a CudaBase that is owned by this object */
+CudaGreensFunction::CudaGreensFunction()
+  : base_create(true),
+    m_base(new CudaBase())
+{
+}
+
+/* destructor, deletes the CudaBase only if this object created it */
+CudaGreensFunction::~CudaGreensFunction() {
+  if (!base_create)
+    return;
+
+  delete m_base;
+}
+
+/*
+  Info: launch kernelTmpgreen_2 to fill tmpptr (I x J x K doubles) on the device
+  hr_m0, hr_m1, hr_m2 are the cell sizes. NI and NJ are not used.
+  streamId -1 launches on the default stream, an id that is not below the number
+  of streams of the CudaBase is rejected.
+  Return: success, or error code for a rejected stream id (launch errors are not checked)
+*/
+int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
+					    double hr_m0, double hr_m1, double hr_m2,
+					    int streamId)
+{
+
+  //one thread per cell
+  const int nThreads = 128;
+  const int nBlocks = (I * J * K / nThreads) + 1;
+
+  //stream 0 is what a launch without an explicit stream argument uses
+  cudaStream_t stream = 0;
+  if (streamId != -1) {
+    if (!(streamId < m_base->cuda_numberOfStreams()))
+      return DKS_ERROR;
+
+    stream = m_base->cuda_getStream(streamId);
+  }
+
+  kernelTmpgreen_2<<< nBlocks, nThreads, 0, stream >>>((double*)tmpptr,
+                                                       hr_m0, hr_m1, hr_m2, I, J, K);
+
+  return DKS_SUCCESS;
+
+}
+
+/*
+  Info: launch kernelIngration_2 to compute rho2_m from tmpgreen on the device
+  tmpgreen has the size I x J x K, the strides passed for rho2_m are
+  2*(I - 1) and 2*(J - 1).
+  streamId -1 launches on the default stream, an id that is not below the number
+  of streams of the CudaBase is rejected.
+  Return: success, or error code for a rejected stream id (launch errors are not checked)
+*/
+int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, 
+						       int I, int J, int K,
+						       int streamId) 
+{
+
+  //one thread per tmpgreen cell
+  const int nThreads = 128;
+  const int nBlocks = (I * J * K / nThreads) + 1;
+
+  //stream 0 is what a launch without an explicit stream argument uses
+  cudaStream_t stream = 0;
+  if (streamId != -1) {
+    if (!(streamId < m_base->cuda_numberOfStreams()))
+      return DKS_ERROR;
+
+    stream = m_base->cuda_getStream(streamId);
+  }
+
+  kernelIngration_2<<< nBlocks, nThreads, 0, stream >>>( (double*)rho2_m, (double*)tmpgreen,
+                                                         2*(I - 1), 2*(J - 1), I, J, K);
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: mirror the rho field in mem_ptr on the device
+  First mirroredRhoField0 is launched with a single thread, then mirroredRhoField
+  with one thread per cell of the (I+1) x (J+1) x (K+1) source region. The field
+  size passed to both kernels is 2*I x 2*J (x 2*K).
+  streamId -1 launches on the default stream, an id that is not below the number
+  of streams of the CudaBase is rejected.
+  Return: success, or error code for a rejected stream id (launch errors are not checked)
+*/
+int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
+
+  const int nThreads = 128;
+  const int nBlocks = ( (I + 1) * (J + 1) * (K + 1) / nThreads) + 1;
+
+  //stream 0 is what a launch without an explicit stream argument uses
+  cudaStream_t stream = 0;
+  if (streamId != -1) {
+    if (!(streamId < m_base->cuda_numberOfStreams()))
+      return DKS_ERROR;
+
+    stream = m_base->cuda_getStream(streamId);
+  }
+
+  double *rho = (double *)mem_ptr;
+
+  //both kernels go to the same stream, so they run one after the other
+  mirroredRhoField0<<< 1, 1, 0, stream >>>(rho, 2*I, 2*J);
+  mirroredRhoField<<< nBlocks, nThreads, 0, stream >>>(rho, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: launch multiplyComplexFields_2 to multiply size complex elements of ptr1
+  and ptr2 on the device, the result is stored in ptr1
+  The launch provides shared memory for two cuDoubleComplex values per thread.
+  streamId -1 launches on the default stream, an id that is not below the number
+  of streams of the CudaBase is rejected.
+  Return: success, or error code for a rejected stream id (launch errors are not checked)
+*/
+int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, 
+						   int size, int streamId) {
+
+  //one thread per element
+  const int nThreads = 128;
+  const int nBlocks = size / nThreads + 1;
+  const int sharedBytes = 2 * nThreads * sizeof(cuDoubleComplex);
+
+  //stream 0 is what a launch without an explicit stream argument uses
+  cudaStream_t stream = 0;
+  if (streamId != -1) {
+    if (!(streamId < m_base->cuda_numberOfStreams()))
+      return DKS_ERROR;
+
+    stream = m_base->cuda_getStream(streamId);
+  }
+
+  multiplyComplexFields_2<<<nBlocks, nThreads, sharedBytes, stream>>>( (cuDoubleComplex*)ptr1,
+                                                                       (cuDoubleComplex*)ptr2,
+                                                                       size);
+
+  return DKS_SUCCESS;
+
+}
+		
+
+
diff --git a/src/CUDA/CudaGreensFunction.cuh b/src/CUDA/CudaGreensFunction.cuh
new file mode 100644
index 0000000..5095e7a
--- /dev/null
+++ b/src/CUDA/CudaGreensFunction.cuh
@@ -0,0 +1,63 @@
+#ifndef H_CUDA_GREENSFUNCTION
+#define H_CUDA_GREENSFUNCTION
+
+#include <iostream>
+#include <math.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuComplex.h>
+#include "cublas_v2.h"
+
+
+#include "CudaBase.cuh"
+
+/**
+ * Host side launchers for the Greens function kernels (taken from the OPAL
+ * source code). The object either borrows a CudaBase from the caller or creates
+ * its own one, base_create records which of the two applies.
+ */
+class CudaGreensFunction {
+
+private:
+	
+  // true if m_base was allocated by this object and has to be deleted in the destructor
+  bool base_create;
+  // CudaBase used to look up cuda streams
+  CudaBase *m_base;
+
+public:
+	
+  /** Constructor with CudaBase argument, the base is not deleted by this object */
+  CudaGreensFunction(CudaBase *base);
+
+  /* default constructor, creates its own CudaBase */
+  CudaGreensFunction();
+		
+  /* destructor, deletes the CudaBase only if it was created by this object */
+  ~CudaGreensFunction();
+		
+  /*
+    Info: calc integral on device memory (taken from OPAL src code)
+    I, J, K is the size of tmpptr, NI and NJ are not used by the implementation
+    Return: success or error code
+  */
+  int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, 
+			  double hr_m0, double hr_m1, double hr_m2, 
+			  int streamId = -1);
+		
+  /*
+    Info: integration of rho2_m field (taken from OPAL src code)
+    I, J, K is the size of tmpgreen
+    Return: success or error code
+  */
+  int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
+				     int streamId = -1);
+		
+  /*
+    Info: mirror rho field (taken from OPAL src code)
+    Return: success or error code
+  */
+  int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
+
+  /*
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
+    Return: success or error code
+  */
+  int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
+
+
+};
+
+#endif
diff --git a/src/CUDA/CudaImageReconstruction.cu b/src/CUDA/CudaImageReconstruction.cu
new file mode 100644
index 0000000..14ab4ee
--- /dev/null
+++ b/src/CUDA/CudaImageReconstruction.cu
@@ -0,0 +1,1221 @@
+#include "CudaImageReconstruction.cuh"
+
+//Original author's note: x_edge, y_edge, z_edge and matrix_distance_factor need to be set as const for the run
+//Original author's note: voxel_x, voxel_y and voxel_z also need to be set as const for the run
+__device__ float d_x_edge = 30.8; // x half-axis of the elliptical acceptance test; also the x offset in coordinate -> voxel index conversion
+__device__ float d_y_edge = 30.8; // y half-axis of the elliptical acceptance test; also the y offset in coordinate -> voxel index conversion
+__device__ float d_z_edge = 16.8; // |z| limit of the acceptance test in the normalization ray tracers; also the z index offset
+
+__device__ float d_matrix_distance_factor = 1.2; // voxel weight = this factor minus the voxel's distance to the line
+
+__device__ int d_voxel_x = 90; // number of voxels along x (fastest-varying index)
+__device__ int d_voxel_y = 90; // number of voxels along y
+__device__ int d_voxel_z = 50; // number of voxels along z (slowest-varying index)
+
+__device__ float d_voxel_size = 0.7; // voxel pitch, in the same unit as the positions
+
+
+//Original author's note: phantom_diameter needs to be defined, atten_per_mm as well
+__device__ float d_phantom_diameter = 51; // used by atten_factor_calcu
+__device__ float d_atten_per_mm = 0.0095; // multiplies the path length in atten_factor_calcu (per mm, going by the name)
+__device__ float d_ring_diameter = 138; // used by atten_factor_calcu
+__device__ float d_minimum_CrystalDistance_InOneRing = 123.489; // kernelNormalization only traces detector pairs whose x-y separation exceeds this
+
+
+__device__ float d_x_edge1 = 29.26; // x half-axis of the acceptance test in the forward/backward ray tracers
+__device__ float d_y_edge1 = 29.26; // y half-axis of the acceptance test in the forward/backward ray tracers
+__device__ float d_z_edge1 = 15.96; // |z| limit of the acceptance test in the forward/backward ray tracers
+__device__ float d_z_edge2 = 14.28; // axial limit tested in kernelCheckEvents
+__device__ float d_minimum_CrystalDistance_InOneRing1 = 127.681; // kernelCheckEvents only accepts events whose x-y detector separation exceeds this
+
+
+/**
+ * Euclidean distance between the positions a and b.
+ *
+ * Each squared component difference is formed with pow(.., 2) and the three
+ * terms are summed in x, y, z order before taking the square root.
+ */
+__device__ inline float distance(VoxelPosition &a, VoxelPosition &b) {
+  const float sq_dx = pow(a.x - b.x, 2);
+  const float sq_dy = pow(a.y - b.y, 2);
+  const float sq_dz = pow(a.z - b.z, 2);
+  const float sq_norm = sq_dx + sq_dy + sq_dz;
+  return sqrt(sq_norm);
+}
+
+/**
+ * Mean and spread of the image inside a sphere around each source position.
+ *
+ * One thread per source: thread idx reads source_position[idx + start], walks the
+ * index box [lo, hi) that spans +/- diameter around the source (clipped to the
+ * grid) and accumulates every voxel whose centre is closer than diameter / 2.
+ *
+ * Output: average[idx] = mean of the selected voxels,
+ *         stdev[idx]   = sqrt( sum((v - mean)^2) / n / (n - 1) ).
+ * With n == 0 or n == 1 these expressions divide by zero; the result is stored
+ * without any guard.
+ *
+ * image_position is accepted but never read. Threads with idx + start >=
+ * total_voxels or idx >= total_sources do nothing.
+ */
+__global__ void kernelCalculateSource(float *image_space, VoxelPosition *image_position,
+                                      VoxelPosition *source_position, float *average,
+                                      float *stdev, float diameter, int total_voxels,
+                                      int total_sources, int start)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int voxel_id = idx + start;
+
+  if (voxel_id >= total_voxels || idx >= total_sources)
+    return;
+
+  //source handled by this thread
+  VoxelPosition src = source_position[voxel_id];
+
+  //fractional index of the grid centre along each axis
+  const double mid_x = (d_voxel_x - 1.0) / 2.0;
+  const double mid_y = (d_voxel_y - 1.0) / 2.0;
+  const double mid_z = (d_voxel_z - 1.0) / 2.0;
+
+  //lower corner of the search box, clipped to [0, d_voxel_*]
+  int x_lo = floor( ((src.x - diameter) / d_voxel_size) + mid_x );
+  if (x_lo < 0) x_lo = 0;
+  if (x_lo > d_voxel_x - 1) x_lo = d_voxel_x;
+
+  int y_lo = floor( ((src.y - diameter) / d_voxel_size) + mid_y );
+  if (y_lo < 0) y_lo = 0;
+  if (y_lo > d_voxel_y - 1) y_lo = d_voxel_y;
+
+  int z_lo = floor( ((src.z - diameter) / d_voxel_size) + mid_z );
+  if (z_lo < 0) z_lo = 0;
+  if (z_lo > d_voxel_z - 1) z_lo = d_voxel_z;
+
+  //upper corner (exclusive), clipped the same way
+  int x_hi = floor( ((src.x + diameter) / d_voxel_size) + mid_x );
+  if (x_hi < 0) x_hi = 0;
+  if (x_hi > d_voxel_x - 1) x_hi = d_voxel_x;
+
+  int y_hi = floor( ((src.y + diameter) / d_voxel_size) + mid_y );
+  if (y_hi < 0) y_hi = 0;
+  if (y_hi > d_voxel_y - 1) y_hi = d_voxel_y;
+
+  int z_hi = floor( ((src.z + diameter) / d_voxel_size) + mid_z );
+  if (z_hi < 0) z_hi = 0;
+  if (z_hi > d_voxel_z - 1) z_hi = d_voxel_z;
+
+  int n = 0;
+  float total = 0;
+  float total_sq = 0;
+
+  VoxelPosition probe;
+  for (int z = z_lo; z < z_hi; z++) {
+    probe.z = (z - mid_z) * d_voxel_size;
+    for (int y = y_lo; y < y_hi; y++) {
+      probe.y = (y - mid_y) * d_voxel_size;
+      for (int x = x_lo; x < x_hi; x++) {
+        probe.x = (x - mid_x) * d_voxel_size;
+
+        const float dist = distance(probe, src);
+        if (dist < diameter * 0.5) {
+          //voxel lies inside the sphere: add its value to the running sums
+          const float v = image_space[z * d_voxel_y * d_voxel_x + y * d_voxel_x + x];
+          total += v;
+          total_sq += v*v;
+          n++;
+        }
+      }
+    }
+  }
+
+  const float mean = total / n;
+  average[idx] = mean;
+  stdev[idx] = sqrt( (total_sq + n * mean * mean - 2 * mean * total) / n / (n - 1) );
+}
+
+/**
+ * Mean and spread of the image inside a spherical shell around each source.
+ *
+ * One thread per source: thread idx reads source_position[idx + start], walks the
+ * index box [lo, hi) that spans +/- (diameter + 1.0) around the source (clipped to
+ * the grid) and accumulates every voxel whose centre distance d satisfies
+ * diameter / 2 < d < diameter. (An earlier variant used diameter / 2 + 1 as the
+ * outer radius.)
+ *
+ * Output: average[idx] = mean of the selected voxels,
+ *         stdev[idx]   = sqrt( sum((v - mean)^2) / n / (n - 1) ).
+ * With n == 0 or n == 1 these expressions divide by zero; the result is stored
+ * without any guard.
+ *
+ * image_position is accepted but never read. Threads with idx + start >=
+ * total_voxels or idx >= total_sources do nothing.
+ */
+__global__ void kernelCalculateBackground(float *image_space, VoxelPosition *image_position,
+                                          VoxelPosition *source_position, float *average,
+                                          float *stdev, float diameter, int total_voxels,
+                                          int total_sources, int start)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int voxel_id = idx + start;
+
+  if (voxel_id >= total_voxels || idx >= total_sources)
+    return;
+
+  //source handled by this thread
+  VoxelPosition src = source_position[voxel_id];
+
+  //fractional index of the grid centre along each axis
+  const double mid_x = (d_voxel_x - 1.0) / 2.0;
+  const double mid_y = (d_voxel_y - 1.0) / 2.0;
+  const double mid_z = (d_voxel_z - 1.0) / 2.0;
+
+  //half width of the search box
+  const double reach = diameter + 1.0;
+
+  //lower corner of the search box, clipped to [0, d_voxel_*]
+  int x_lo = floor( ((src.x - reach) / d_voxel_size) + mid_x );
+  if (x_lo < 0) x_lo = 0;
+  if (x_lo > d_voxel_x - 1) x_lo = d_voxel_x;
+
+  int y_lo = floor( ((src.y - reach) / d_voxel_size) + mid_y );
+  if (y_lo < 0) y_lo = 0;
+  if (y_lo > d_voxel_y - 1) y_lo = d_voxel_y;
+
+  int z_lo = floor( ((src.z - reach) / d_voxel_size) + mid_z );
+  if (z_lo < 0) z_lo = 0;
+  if (z_lo > d_voxel_z - 1) z_lo = d_voxel_z;
+
+  //upper corner (exclusive), clipped the same way
+  int x_hi = floor( ((src.x + reach) / d_voxel_size) + mid_x );
+  if (x_hi < 0) x_hi = 0;
+  if (x_hi > d_voxel_x - 1) x_hi = d_voxel_x;
+
+  int y_hi = floor( ((src.y + reach) / d_voxel_size) + mid_y );
+  if (y_hi < 0) y_hi = 0;
+  if (y_hi > d_voxel_y - 1) y_hi = d_voxel_y;
+
+  int z_hi = floor( ((src.z + reach) / d_voxel_size) + mid_z );
+  if (z_hi < 0) z_hi = 0;
+  if (z_hi > d_voxel_z - 1) z_hi = d_voxel_z;
+
+  int n = 0;
+  float total = 0;
+  float total_sq = 0;
+
+  VoxelPosition probe;
+  for (int z = z_lo; z < z_hi; z++) {
+    probe.z = (z - mid_z) * d_voxel_size;
+    for (int y = y_lo; y < y_hi; y++) {
+      probe.y = (y - mid_y) * d_voxel_size;
+      for (int x = x_lo; x < x_hi; x++) {
+        probe.x = (x - mid_x) * d_voxel_size;
+
+        const float dist = distance(probe, src);
+        if ( dist > diameter * 0.5 && dist < (diameter) ) {
+          //voxel lies inside the shell: add its value to the running sums
+          const float v = image_space[z * d_voxel_y * d_voxel_x + y * d_voxel_x + x];
+          total += v;
+          total_sq += v*v;
+          n++;
+        }
+      }
+    }
+  }
+
+  const float mean = total / n;
+  average[idx] = mean;
+  stdev[idx] = sqrt( (total_sq + n * mean * mean - 2 * mean * total) / n / (n - 1) );
+}
+
+
+/**
+ * Same statistic as kernelCalculateSource, but with one diameter per source.
+ *
+ * Thread idx reads source_position[idx + start] and diameter[idx + start], walks
+ * the index box [lo, hi) that spans +/- that diameter around the source (clipped
+ * to the grid) and accumulates every voxel whose centre is closer than half the
+ * diameter.
+ *
+ * Output: average[idx] = mean of the selected voxels,
+ *         stdev[idx]   = sqrt( sum((v - mean)^2) / n / (n - 1) ).
+ * With n == 0 or n == 1 these expressions divide by zero; the result is stored
+ * without any guard.
+ *
+ * image_position is accepted but never read. Threads with idx + start >=
+ * total_voxels or idx >= total_sources do nothing.
+ */
+__global__ void kernelCalculateSources(float *image_space, VoxelPosition *image_position,
+                                       VoxelPosition *source_position, float *average,
+                                       float *stdev, float *diameter, int total_voxels,
+                                       int total_sources, int start)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int voxel_id = idx + start;
+
+  if (voxel_id >= total_voxels || idx >= total_sources)
+    return;
+
+  //source and its diameter handled by this thread
+  VoxelPosition src = source_position[voxel_id];
+  const float diam = diameter[voxel_id];
+
+  //fractional index of the grid centre along each axis
+  const double mid_x = (d_voxel_x - 1.0) / 2.0;
+  const double mid_y = (d_voxel_y - 1.0) / 2.0;
+  const double mid_z = (d_voxel_z - 1.0) / 2.0;
+
+  //lower corner of the search box, clipped to [0, d_voxel_*]
+  int x_lo = floor( ((src.x - diam) / d_voxel_size) + mid_x );
+  if (x_lo < 0) x_lo = 0;
+  if (x_lo > d_voxel_x - 1) x_lo = d_voxel_x;
+
+  int y_lo = floor( ((src.y - diam) / d_voxel_size) + mid_y );
+  if (y_lo < 0) y_lo = 0;
+  if (y_lo > d_voxel_y - 1) y_lo = d_voxel_y;
+
+  int z_lo = floor( ((src.z - diam) / d_voxel_size) + mid_z );
+  if (z_lo < 0) z_lo = 0;
+  if (z_lo > d_voxel_z - 1) z_lo = d_voxel_z;
+
+  //upper corner (exclusive), clipped the same way
+  int x_hi = floor( ((src.x + diam) / d_voxel_size) + mid_x );
+  if (x_hi < 0) x_hi = 0;
+  if (x_hi > d_voxel_x - 1) x_hi = d_voxel_x;
+
+  int y_hi = floor( ((src.y + diam) / d_voxel_size) + mid_y );
+  if (y_hi < 0) y_hi = 0;
+  if (y_hi > d_voxel_y - 1) y_hi = d_voxel_y;
+
+  int z_hi = floor( ((src.z + diam) / d_voxel_size) + mid_z );
+  if (z_hi < 0) z_hi = 0;
+  if (z_hi > d_voxel_z - 1) z_hi = d_voxel_z;
+
+  int n = 0;
+  float total = 0;
+  float total_sq = 0;
+
+  VoxelPosition probe;
+  for (int z = z_lo; z < z_hi; z++) {
+    probe.z = (z - mid_z) * d_voxel_size;
+    for (int y = y_lo; y < y_hi; y++) {
+      probe.y = (y - mid_y) * d_voxel_size;
+      for (int x = x_lo; x < x_hi; x++) {
+        probe.x = (x - mid_x) * d_voxel_size;
+
+        const float dist = distance(probe, src);
+        if (dist < diam * 0.5) {
+          //voxel lies inside the sphere: add its value to the running sums
+          const float v = image_space[z * d_voxel_y * d_voxel_x + y * d_voxel_x + x];
+          total += v;
+          total_sq += v*v;
+          n++;
+        }
+      }
+    }
+  }
+
+  const float mean = total / n;
+  average[idx] = mean;
+  stdev[idx] = sqrt( (total_sq + n * mean * mean - 2 * mean * total) / n / (n - 1) );
+}
+
+/**
+ * Same statistic as kernelCalculateBackground, but with one diameter per source.
+ *
+ * Thread idx reads source_position[idx + start] and diameter[idx + start], walks
+ * the index box [lo, hi) that spans +/- (diameter + 1.0) around the source
+ * (clipped to the grid) and accumulates every voxel whose centre distance d
+ * satisfies diameter / 2 < d < diameter. (An earlier variant used
+ * diameter / 2 + 1 as the outer radius.)
+ *
+ * Output: average[idx] = mean of the selected voxels,
+ *         stdev[idx]   = sqrt( sum((v - mean)^2) / n / (n - 1) ).
+ * With n == 0 or n == 1 these expressions divide by zero; the result is stored
+ * without any guard.
+ *
+ * image_position is accepted but never read. Threads with idx + start >=
+ * total_voxels or idx >= total_sources do nothing.
+ */
+__global__ void kernelCalculateBackgrounds(float *image_space, VoxelPosition *image_position,
+                                           VoxelPosition *source_position, float *average,
+                                           float *stdev, float *diameter, int total_voxels,
+                                           int total_sources, int start)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int voxel_id = idx + start;
+
+  if (voxel_id >= total_voxels || idx >= total_sources)
+    return;
+
+  //source and its diameter handled by this thread
+  VoxelPosition src = source_position[voxel_id];
+  const float diam = diameter[voxel_id];
+
+  //fractional index of the grid centre along each axis
+  const double mid_x = (d_voxel_x - 1.0) / 2.0;
+  const double mid_y = (d_voxel_y - 1.0) / 2.0;
+  const double mid_z = (d_voxel_z - 1.0) / 2.0;
+
+  //half width of the search box
+  const double reach = diam + 1.0;
+
+  //lower corner of the search box, clipped to [0, d_voxel_*]
+  int x_lo = floor( ((src.x - reach) / d_voxel_size) + mid_x );
+  if (x_lo < 0) x_lo = 0;
+  if (x_lo > d_voxel_x - 1) x_lo = d_voxel_x;
+
+  int y_lo = floor( ((src.y - reach) / d_voxel_size) + mid_y );
+  if (y_lo < 0) y_lo = 0;
+  if (y_lo > d_voxel_y - 1) y_lo = d_voxel_y;
+
+  int z_lo = floor( ((src.z - reach) / d_voxel_size) + mid_z );
+  if (z_lo < 0) z_lo = 0;
+  if (z_lo > d_voxel_z - 1) z_lo = d_voxel_z;
+
+  //upper corner (exclusive), clipped the same way
+  int x_hi = floor( ((src.x + reach) / d_voxel_size) + mid_x );
+  if (x_hi < 0) x_hi = 0;
+  if (x_hi > d_voxel_x - 1) x_hi = d_voxel_x;
+
+  int y_hi = floor( ((src.y + reach) / d_voxel_size) + mid_y );
+  if (y_hi < 0) y_hi = 0;
+  if (y_hi > d_voxel_y - 1) y_hi = d_voxel_y;
+
+  int z_hi = floor( ((src.z + reach) / d_voxel_size) + mid_z );
+  if (z_hi < 0) z_hi = 0;
+  if (z_hi > d_voxel_z - 1) z_hi = d_voxel_z;
+
+  int n = 0;
+  float total = 0;
+  float total_sq = 0;
+
+  VoxelPosition probe;
+  for (int z = z_lo; z < z_hi; z++) {
+    probe.z = (z - mid_z) * d_voxel_size;
+    for (int y = y_lo; y < y_hi; y++) {
+      probe.y = (y - mid_y) * d_voxel_size;
+      for (int x = x_lo; x < x_hi; x++) {
+        probe.x = (x - mid_x) * d_voxel_size;
+
+        const float dist = distance(probe, src);
+        if ( dist > diam * 0.5 && dist < diam ) {
+          //voxel lies inside the shell: add its value to the running sums
+          const float v = image_space[z * d_voxel_y * d_voxel_x + y * d_voxel_x + x];
+          total += v;
+          total_sq += v*v;
+          n++;
+        }
+      }
+    }
+  }
+
+  const float mean = total / n;
+  average[idx] = mean;
+  stdev[idx] = sqrt( (total_sq + n * mean * mean - 2 * mean * total) / n / (n - 1) );
+}
+
+/**
+ * Adds one line's contribution to recon, stepping along x.
+ *
+ * For every x index the line point (lor_x, lor_y, lor_z) is evaluated from the
+ * slopes and the anchor (a_x, a_y, a_z); lor_x is taken from image_position[x].x.
+ * Points outside the ellipse (d_x_edge, d_y_edge) or with |lor_z| >= d_z_edge are
+ * skipped. Otherwise the four voxels (y, z), (y+1, z), (y, z+1), (y+1, z+1) of the
+ * current x plane each receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance in the y-z plane to the voxel) * atten_factor.
+ */
+__device__ void localRaytracingX(float *recon, VoxelPosition *image_position,
+                                 float &atten_factor, float &slope_y, float &slope_z,
+                                 float &a_x, float &a_y, float &a_z)
+{
+  for (int x = 0; x < d_voxel_x; x++) {
+    const float lor_x = image_position[x].x;
+    const float lor_y = slope_y * ( lor_x - a_x ) + a_y;
+    const float lor_z = slope_z * ( lor_x - a_x ) + a_z;
+
+    const bool inside = pow(lor_x / d_x_edge,2) + pow( lor_y/d_y_edge, 2) < 1.0
+                        && abs(lor_z) < d_z_edge;
+    if (!inside)
+      continue;
+
+    const int y = floor( (lor_y+d_y_edge) / d_voxel_size);
+    const int z = floor( (lor_z+d_z_edge) / d_voxel_size);
+
+    //2x2 neighbourhood, y varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dy = 0; dy < 2; dy++) {
+        const int voxel_id = (z+dz) * d_voxel_y * d_voxel_x + (y+dy) * d_voxel_x + x;
+        atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor -
+                                      sqrt( pow(lor_y-image_position[voxel_id].y,2) +
+                                            pow(lor_z-image_position[voxel_id].z,2) )
+                                      ) * atten_factor);
+      }
+    }
+  }
+}
+				  
+
+/**
+ * Adds one line's contribution to recon, stepping along y.
+ *
+ * For every y index the line point is evaluated from the slopes and the anchor
+ * (a_x, a_y, a_z); lor_y is taken from image_position[y * d_voxel_x].y. Points
+ * outside the ellipse (d_x_edge, d_y_edge) or with |lor_z| >= d_z_edge are
+ * skipped. Otherwise the four voxels (x, z), (x+1, z), (x, z+1), (x+1, z+1) of the
+ * current y row each receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance in the x-z plane to the voxel) * atten_factor.
+ */
+__device__ void localRaytracingY(float *recon, VoxelPosition *image_position,
+                                 float &atten_factor, float &slope_x, float &slope_z,
+                                 float &a_x, float &a_y, float &a_z)
+{
+  for (int y = 0; y < d_voxel_y; y++) {
+    const int row_start = y * d_voxel_x;
+    const float lor_y = image_position[row_start].y;
+    const float lor_x = slope_x * ( lor_y - a_y ) + a_x;
+    const float lor_z = slope_z * ( lor_y - a_y ) + a_z;
+
+    const bool inside = pow(lor_x/d_x_edge,2)+pow(lor_y/d_y_edge,2)<1.0
+                        && abs(lor_z)<d_z_edge;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int z = floor((lor_z+d_z_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = (z+dz)*d_voxel_y*d_voxel_x + y*d_voxel_x + x+dx;
+        atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor -
+                                      sqrt( pow(lor_x-image_position[voxel_id].x,2) +
+                                            pow(lor_z-image_position[voxel_id].z,2) )
+                                      ) * atten_factor);
+      }
+    }
+  }
+}
+
+/**
+ * Adds one line's contribution to recon, stepping along z.
+ *
+ * For every z index the line point is evaluated from the slopes and the anchor
+ * (a_x, a_y, a_z); lor_z is taken from image_position[z * d_voxel_y * d_voxel_x].z.
+ * Points outside the ellipse (d_x_edge, d_y_edge) or with |lor_z| >= d_z_edge are
+ * skipped. Otherwise the four voxels (x, y), (x+1, y), (x, y+1), (x+1, y+1) of the
+ * current z slice each receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance in the x-y plane to the voxel) * atten_factor.
+ *
+ * Note that a_y is taken by value here, unlike the X and Y variants.
+ */
+__device__ void localRaytracingZ(float *recon, VoxelPosition *image_position,
+                                 float &atten_factor, float &slope_x, float &slope_y,
+                                 float &a_x, float a_y, float &a_z)
+{
+  for (int z = 0; z < d_voxel_z; z++) {
+    const int slice_start = z * d_voxel_y * d_voxel_x;
+    const float lor_z = image_position[slice_start].z;
+    const float lor_x = slope_x * ( lor_z - a_z ) + a_x;
+    const float lor_y = slope_y * ( lor_z - a_z ) + a_y;
+
+    const bool inside = pow(lor_x/d_x_edge,2)+pow(lor_y/d_y_edge,2)<1.0
+                        && abs(lor_z)<d_z_edge;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int y = floor((lor_y+d_y_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dy = 0; dy < 2; dy++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = z*d_voxel_y*d_voxel_x + (y+dy)*d_voxel_x + x+dx;
+        atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor -
+                                      sqrt( pow(lor_x-image_position[voxel_id].x,2) +
+                                            pow(lor_y-image_position[voxel_id].y,2) )
+                                      ) * atten_factor);
+      }
+    }
+  }
+}
+
+/**
+ * Attenuation weight exp(-L * d_atten_per_mm) for the line from A to B.
+ *
+ * The in-plane path length is 2 * sqrt(s) with
+ *   s = (d_phantom_diameter/2)^2 + ((a_x-b_x)/2)^2 + ((a_y-b_y)/2)^2 - (d_ring_diameter/2)^2,
+ * or 0 when s <= 0.001. For end points lying on a circle of d_ring_diameter this
+ * is the chord the line cuts through a centred circle of d_phantom_diameter.
+ * The axial part scales |a_z - b_z| by (in-plane path / in-plane A-B distance),
+ * and L is the length of the combined vector.
+ *
+ * The in-plane A-B distance is used as a divisor without a zero check.
+ */
+__device__ float atten_factor_calcu(float a_x, float a_y, float a_z,
+                                    float b_x, float b_y, float b_z)
+{
+  const float lor_len_xy = sqrt(pow( (a_x - b_x),2 ) + pow( (a_y - b_y),2 ));
+  const float chord_term = pow( d_phantom_diameter/2,2 ) + pow( (a_x-b_x)/2,2 )
+    + pow( (a_y - b_y)/2,2 ) - pow( d_ring_diameter/2,2 );
+
+  const float path_xy = (chord_term > 0.001) ? 2.0 * sqrt( chord_term ) : 0.0;
+  const float path_z = abs( a_z - b_z ) * path_xy / lor_len_xy;
+  const float path_len = sqrt( pow(path_xy,2) + pow(path_z,2) );
+
+  return exp(-path_len*d_atten_per_mm);
+}
+
+/**
+ * Traces the line of every detector pair into recon.
+ *
+ * 2D launch: the x dimension selects detector A, the y dimension detector B.
+ * Pairs with A == B, with an index >= total_det, or whose x-y separation is not
+ * larger than d_minimum_CrystalDistance_InOneRing are ignored. For the others the
+ * attenuation factor is computed and the line is traced along its dominant axis
+ * (x if |dx| is the largest, else y if |dy| > |dz|, else z). Both (A, B) and
+ * (B, A) are handled when the grid covers them.
+ */
+__global__ void kernelNormalization(float *recon, VoxelPosition *image_position,
+                                    VoxelPosition *det_position, int total_det)
+{
+  const int detA = blockIdx.x * blockDim.x + threadIdx.x;
+  const int detB = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (detA == detB || detA >= total_det || detB >= total_det)
+    return;
+
+  VoxelPosition pA = det_position[detA];
+  VoxelPosition pB = det_position[detB];
+
+  const float distance_x = abs( pA.x - pB.x);
+  const float distance_y = abs( pA.y - pB.y);
+  const float distance_z = abs( pA.z - pB.z);
+
+  //detectors too close together in the x-y plane are not traced
+  if ( !( sqrt(pow(distance_x,2) + pow(distance_y,2)) > d_minimum_CrystalDistance_InOneRing ) )
+    return;
+
+  float atten_factor = atten_factor_calcu(pA.x,pA.y,pA.z,pB.x,pB.y,pB.z);
+
+  if (distance_x > distance_y && distance_x > distance_z) {
+    //x is the dominant direction
+    float slope_y = ( pB.y - pA.y ) / ( pB.x - pA.x );
+    float slope_z = ( pB.z - pA.z ) / ( pB.x - pA.x );
+    localRaytracingX(recon, image_position, atten_factor, slope_y, slope_z, pA.x, pA.y, pA.z);
+    return;
+  }
+
+  if (distance_y > distance_z) {
+    //y is the dominant direction
+    float slope_x = ( pB.x - pA.x ) / ( pB.y - pA.y );
+    float slope_z = ( pB.z - pA.z ) / ( pB.y - pA.y );
+    localRaytracingY(recon, image_position, atten_factor, slope_x, slope_z, pA.x, pA.y, pA.z);
+    return;
+  }
+
+  //z is the dominant direction
+  float slope_x = ( pB.x - pA.x ) / ( pB.z - pA.z );
+  float slope_y = ( pB.y - pA.y ) / ( pB.z - pA.z );
+  localRaytracingZ(recon, image_position, atten_factor, slope_x, slope_y, pA.x, pA.y, pA.z);
+}
+
+/**
+ * Forward projection of recon along the line A -> B, stepping along x.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every x plane whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (y, z), (y+1, z), (y, z+1), (y+1, z+1)
+ * contribute (d_matrix_distance_factor - distance to the voxel) * recon[voxel].
+ * The voxel indices are derived with the d_y_edge / d_z_edge offsets.
+ *
+ * Returns the accumulated sum, which starts at 0.000001.
+ */
+__device__ float localRaytracingForwardX(float*recon, VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  float result = 0.000001;
+  const float slope_y = ( b_y - a_y ) / ( b_x - a_x);
+  const float slope_z = ( b_z - a_z ) / ( b_x - a_x);
+
+  for (int x = 0; x < d_voxel_x; x++) {
+    const float lor_x = pos.x + x * d_voxel_size;
+    const float lor_y = slope_y * ( lor_x - a_x ) + a_y;
+    const float lor_z = slope_z * ( lor_x - a_x ) + a_z;
+
+    const bool inside = pow(lor_x/d_x_edge1, 2) + pow(lor_y/d_y_edge1, 2) < 1.0
+                        && abs(lor_z) < d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int y = floor((lor_y+d_y_edge)/d_voxel_size);
+    const int z = floor((lor_z+d_z_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, y varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dy = 0; dy < 2; dy++) {
+        const int voxel_id = (z+dz)*d_voxel_y*d_voxel_x + (y+dy)*d_voxel_x + x;
+        result += ( d_matrix_distance_factor -
+                    sqrt( pow(lor_y-pos.y-(y+dy)*d_voxel_size,2) +
+                          pow(lor_z-pos.z-(z+dz)*d_voxel_size,2) )
+                    ) * recon[voxel_id];
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Forward projection of recon along the line A -> B, stepping along y.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every y row whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (x, z), (x+1, z), (x, z+1), (x+1, z+1)
+ * contribute (d_matrix_distance_factor - distance to the voxel) * recon[voxel].
+ * The voxel indices are derived with the d_x_edge / d_z_edge offsets.
+ *
+ * Returns the accumulated sum, which starts at 0.000001.
+ */
+__device__ float localRaytracingForwardY(float*recon, VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  float result = 0.000001;
+  const float slope_x = ( b_x - a_x ) / ( b_y - a_y);
+  const float slope_z = ( b_z - a_z ) / ( b_y - a_y);
+
+  for (int y = 0; y < d_voxel_y; y++) {
+    const float lor_y = pos.y + y * d_voxel_size;
+    const float lor_x = slope_x * ( lor_y - a_y ) + a_x;
+    const float lor_z = slope_z * ( lor_y - a_y ) + a_z;
+
+    const bool inside = pow(lor_x/d_x_edge1,2)+pow(lor_y/d_y_edge1,2)<1.0
+                        && abs(lor_z)<d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int z = floor((lor_z+d_z_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = (z+dz)*d_voxel_y*d_voxel_x + y*d_voxel_x + x+dx;
+        result += ( d_matrix_distance_factor -
+                    sqrt( pow(lor_x-pos.x-(x+dx)*d_voxel_size,2) +
+                          pow(lor_z-pos.z-(z+dz)*d_voxel_size,2) )
+                    ) * recon[voxel_id];
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Forward projection of recon along the line A -> B, stepping along z.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every z slice whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (x, y), (x+1, y), (x, y+1), (x+1, y+1)
+ * contribute (d_matrix_distance_factor - distance to the voxel) * recon[voxel].
+ * The voxel indices are derived with the d_x_edge / d_y_edge offsets.
+ *
+ * Returns the accumulated sum, which starts at 0.000001.
+ */
+__device__ float localRaytracingForwardZ(float*recon, VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  float result = 0.000001;
+  const float slope_x = ( b_x - a_x ) / ( b_z - a_z);
+  const float slope_y = ( b_y - a_y ) / ( b_z - a_z);
+
+  for (int z = 0; z < d_voxel_z; z++) {
+    const float lor_z = pos.z + z * d_voxel_size;
+    const float lor_x = slope_x * ( lor_z - a_z ) + a_x;
+    const float lor_y = slope_y * ( lor_z - a_z ) + a_y;
+
+    const bool inside = pow(lor_x/d_x_edge1,2)+pow(lor_y/d_y_edge1,2)<1.0
+                        && abs(lor_z)<d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int y = floor((lor_y+d_y_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dy = 0; dy < 2; dy++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = z*d_voxel_y*d_voxel_x + (y+dy)*d_voxel_x + x+dx;
+        result += ( d_matrix_distance_factor -
+                    sqrt( pow(lor_x-pos.x-(x+dx)*d_voxel_size,2) +
+                          pow(lor_y-pos.y-(y+dy)*d_voxel_size,2) )
+                    ) * recon[voxel_id];
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Resets the first `size` entries of correction to 1 / 0.000001, the value
+ * kernelForwardProjection stores for an event that collected no contribution.
+ */
+__global__ void kernelZeroForward(float *correction, int size) {
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= size)
+    return;
+
+  correction[idx] = 1 / 0.000001;
+
+}
+
+/**
+ * Forward projection, one thread per list-mode event.
+ *
+ * Thread idx looks up the two detectors of list_data[idx] and traces the line
+ * between them with the ray tracer selected by event_branch[idx]
+ * (1 = X, 2 = Y, 3 = Z). correction[idx] receives 1 / result; for any other
+ * branch value result keeps its initial 0.000001. image_position[0] is passed to
+ * the ray tracers as the position of voxel 0.
+ */
+__global__ void kernelForwardProjection(float *correction, float *recon, ListEvent *list_data,
+                                        VoxelPosition *det_position,
+                                        VoxelPosition *image_position,
+                                        int *event_branch,
+                                        int num_events)
+{
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_events)
+    return;
+
+  float result = 0.000001;
+
+  const int branch = event_branch[idx];
+  ListEvent id = list_data[idx];
+  VoxelPosition pA = det_position[id.detA];
+  VoxelPosition pB = det_position[id.detB];
+
+  VoxelPosition pos = image_position[0];
+
+  switch (branch) {
+  case 1:
+    result = localRaytracingForwardX(recon, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  case 2:
+    result = localRaytracingForwardY(recon, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  case 3:
+    result = localRaytracingForwardZ(recon, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  default:
+    //rejected event: result stays at its initial value
+    break;
+  }
+
+  correction[idx] = 1 / result;
+
+}
+
+/**
+ * Classifies every list-mode event and stores the result in event_branch.
+ *
+ * An event is accepted when the x-y separation of its two detectors exceeds
+ * d_minimum_CrystalDistance_InOneRing1 and at least one of the two weighted z
+ * values (0.6/0.4 and 0.4/0.6 mixes of the detector z coordinates) is within
+ * d_z_edge2 in magnitude, or the two values have opposite signs. Accepted events
+ * get branch 1, 2 or 3; everything else gets 0.
+ *
+ * NOTE(review): distance_z is fixed at 0 here, whereas kernelNormalization uses
+ * |pA.z - pB.z|. With a positive minimum distance, branch 3 can therefore not be
+ * selected -- confirm that this is intended.
+ */
+__global__ void kernelCheckEvents(ListEvent *list_data, VoxelPosition *det_position,
+                                  int *event_branch, int num_events)
+{
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_events)
+    return;
+
+  ListEvent id = list_data[idx];
+  VoxelPosition pA = det_position[id.detA];
+  VoxelPosition pB = det_position[id.detB];
+
+  //separation of the two detectors
+  const float distance_x = abs( pA.x - pB.x );
+  const float distance_y = abs( pA.y - pB.y );
+  const float distance_z = 0;
+
+  //z values at the two weighted points between the detectors
+  const float z_mix_a = pA.z*0.6 + pB.z*0.4;
+  const float z_mix_b = pA.z*0.4 + pB.z*0.6;
+
+  const bool far_apart =
+    sqrt(pow(distance_x,2) + pow(distance_y,2)) > d_minimum_CrystalDistance_InOneRing1;
+  const bool axial_ok =
+    abs(z_mix_a)<d_z_edge2 || abs(z_mix_b)<d_z_edge2 || z_mix_a*z_mix_b<0;
+
+  int branch = 0;
+  if (far_apart && axial_ok) {
+    if (distance_x > distance_y && distance_x > distance_z)
+      branch = 1;
+    else if (distance_y > distance_z)
+      branch = 2;
+    else
+      branch = 3;
+  }
+
+  event_branch[idx] = branch;
+
+}
+
+/**
+ * Clears the first `size` entries of recon_corrector to 0.
+ */
+__global__ void kernelZeroBackward(float *recon_corrector, int size) {
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= size)
+    return;
+
+  recon_corrector[idx] = 0;
+
+}
+
+/**
+ * Back projection of one event's correction along the line A -> B, stepping
+ * along x.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every x plane whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (y, z), (y+1, z), (y, z+1), (y+1, z+1)
+ * receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance to the voxel) * correction.
+ * The voxel indices are derived with the d_y_edge / d_z_edge offsets.
+ */
+__device__ void localRaytracingBackwardX(float &correction, float *recon_corrector,
+                                         VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  const float slope_y = ( b_y - a_y ) / ( b_x - a_x);
+  const float slope_z = ( b_z - a_z ) / ( b_x - a_x);
+
+  for (int x = 0; x < d_voxel_x; x++) {
+    const float lor_x = pos.x + x * d_voxel_size;
+    const float lor_y = slope_y * ( lor_x - a_x ) + a_y;
+    const float lor_z = slope_z * ( lor_x - a_x ) + a_z;
+
+    const bool inside = pow(lor_x/d_x_edge1,2)+pow(lor_y/d_y_edge1,2)<1.0
+                        && abs(lor_z)<d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int y = floor((lor_y+d_y_edge)/d_voxel_size);
+    const int z = floor((lor_z+d_z_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, y varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dy = 0; dy < 2; dy++) {
+        const int voxel_id = (z+dz)*d_voxel_y*d_voxel_x + (y+dy)*d_voxel_x + x;
+        atomicAdd(&recon_corrector[voxel_id], ( d_matrix_distance_factor -
+                                                sqrt( pow(lor_y-pos.y-(y+dy)*d_voxel_size,2) +
+                                                      pow(lor_z-pos.z-(z+dz)*d_voxel_size,2) )
+                                                ) * correction);
+      }
+    }
+  }
+}
+
+/**
+ * Back projection of one event's correction along the line A -> B, stepping
+ * along y.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every y row whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (x, z), (x+1, z), (x, z+1), (x+1, z+1)
+ * receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance to the voxel) * correction.
+ * The voxel indices are derived with the d_x_edge / d_z_edge offsets.
+ */
+__device__ void localRaytracingBackwardY(float &correction, float*recon_corrector,
+                                         VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  const float slope_x = ( b_x - a_x ) / ( b_y - a_y);
+  const float slope_z = ( b_z - a_z ) / ( b_y - a_y);
+
+  for (int y = 0; y < d_voxel_y; y++) {
+    const float lor_y = pos.y + y * d_voxel_size;
+    const float lor_x = slope_x * ( lor_y - a_y ) + a_x;
+    const float lor_z = slope_z * ( lor_y - a_y ) + a_z;
+
+    const bool inside = pow(lor_x/d_x_edge1,2)+pow(lor_y/d_y_edge1,2)<1.0
+                        && abs(lor_z)<d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int z = floor((lor_z+d_z_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dz = 0; dz < 2; dz++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = (z+dz)*d_voxel_y*d_voxel_x + y*d_voxel_x + x+dx;
+        atomicAdd(&recon_corrector[voxel_id], ( d_matrix_distance_factor -
+                                                sqrt( pow(lor_x-pos.x-(x+dx)*d_voxel_size,2) +
+                                                      pow(lor_z-pos.z-(z+dz)*d_voxel_size,2) )
+                                                ) * correction);
+      }
+    }
+  }
+}
+
+/**
+ * Back projection of one event's correction along the line A -> B, stepping
+ * along z.
+ *
+ * Voxel coordinates are pos + index * d_voxel_size. For every z slice whose line
+ * point passes the acceptance test (ellipse d_x_edge1 / d_y_edge1 and
+ * |lor_z| < d_z_edge1), the four voxels (x, y), (x+1, y), (x, y+1), (x+1, y+1)
+ * receive, through atomicAdd,
+ *   (d_matrix_distance_factor - distance to the voxel) * correction.
+ * The voxel indices are derived with the d_x_edge / d_y_edge offsets.
+ */
+__device__ void localRaytracingBackwardZ(float &correction, float*recon_corrector,
+                                         VoxelPosition &pos,
+                                         float &a_x, float &a_y, float &a_z,
+                                         float &b_x, float &b_y, float &b_z)
+{
+  const float slope_x = ( b_x - a_x ) / ( b_z - a_z);
+  const float slope_y = ( b_y - a_y ) / ( b_z - a_z);
+
+  for (int z = 0; z < d_voxel_z; z++) {
+    const float lor_z = pos.z + z * d_voxel_size;
+    const float lor_x = slope_x * ( lor_z - a_z ) + a_x;
+    const float lor_y = slope_y * ( lor_z - a_z ) + a_y;
+
+    const bool inside = pow(lor_x/d_x_edge1,2)+pow(lor_y/d_y_edge1,2)<1.0
+                        && abs(lor_z)<d_z_edge1;
+    if (!inside)
+      continue;
+
+    const int x = floor((lor_x+d_x_edge)/d_voxel_size);
+    const int y = floor((lor_y+d_y_edge)/d_voxel_size);
+
+    //2x2 neighbourhood, x varies fastest
+    for (int dy = 0; dy < 2; dy++) {
+      for (int dx = 0; dx < 2; dx++) {
+        const int voxel_id = z*d_voxel_y*d_voxel_x + (y+dy)*d_voxel_x + x+dx;
+        atomicAdd(&recon_corrector[voxel_id], ( d_matrix_distance_factor -
+                                                sqrt( pow(lor_x-pos.x-(x+dx)*d_voxel_size,2) +
+                                                      pow(lor_y-pos.y-(y+dy)*d_voxel_size,2) )
+                                                ) * correction);
+      }
+    }
+  }
+}
+
+
+/**
+ * Back projection, one thread per list-mode event.
+ *
+ * Thread idx looks up the two detectors of list_data[idx] and, for events with a
+ * positive event_branch[idx], reads correction[idx] and image_position[0] (the
+ * position of voxel 0). The correction is then spread into recon_corrector by
+ * the ray tracer matching the branch (1 = X, 2 = Y, 3 = Z); any other branch
+ * value leaves recon_corrector untouched.
+ */
+__global__ void kernelBackwardProjection(float *correction, float *recon_corrector,
+                                         ListEvent *list_data, VoxelPosition *det_position,
+                                         VoxelPosition *image_position, int *event_branch,
+                                         int num_events)
+{
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_events)
+    return;
+
+  ListEvent id = list_data[idx];
+  VoxelPosition pA = det_position[id.detA];
+  VoxelPosition pB = det_position[id.detB];
+  const int branch = event_branch[idx];
+
+  //only fetched for accepted events; unused otherwise
+  float corr;
+  VoxelPosition pos;
+  if (branch > 0) {
+    corr = correction[idx];
+    pos = image_position[0];
+  }
+
+  switch (branch) {
+  case 1:
+    localRaytracingBackwardX(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  case 2:
+    localRaytracingBackwardY(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  case 3:
+    localRaytracingBackwardZ(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z);
+    break;
+  default:
+    break;
+  }
+
+}
+
+
+/**
+ * Launch kernelCalculateSource for total_sources sources.
+ * The untyped pointers are cast to the element types the kernel takes (float / VoxelPosition).
+ * Grid size is total_sources / BLOCK_SIZE + 1 blocks of BLOCK_SIZE threads.
+ * The launch is not checked for errors: DKS_SUCCESS is always returned.
+ */
+int CudaImageReconstruction::calculateSource(void *image_space, void *image_position,
+                                             void *source_position, void *avg, void *std,
+                                             float diameter, int total_voxels,
+                                             int total_sources, int start)
+{
+
+  //typed views of the opaque buffers
+  float *image = (float*) image_space;
+  VoxelPosition *img_pos = (VoxelPosition*) image_position;
+  VoxelPosition *src_pos = (VoxelPosition*) source_position;
+  float *avg_out = (float*) avg;
+  float *std_out = (float*) std;
+
+  const int block_dim = BLOCK_SIZE;
+  const int grid_dim = total_sources / block_dim + 1;
+
+  kernelCalculateSource<<<grid_dim, block_dim>>>(image, img_pos, src_pos,
+                                                 avg_out, std_out, diameter,
+                                                 total_voxels, total_sources, start);
+
+  return DKS_SUCCESS;
+}
+
+/**
+ * Launch kernelCalculateBackground for total_sources sources.
+ * The untyped pointers are cast to the element types the kernel takes (float / VoxelPosition).
+ * Grid size is total_sources / BLOCK_SIZE + 1 blocks of BLOCK_SIZE threads.
+ * The launch is not checked for errors: DKS_SUCCESS is always returned.
+ */
+int CudaImageReconstruction::calculateBackground(void *image_space, void *image_position,
+                                                 void *source_position, void *avg, void *std,
+                                                 float diameter, int total_voxels,
+                                                 int total_sources, int start)
+{
+
+  //typed views of the opaque buffers
+  float *image = (float*) image_space;
+  VoxelPosition *img_pos = (VoxelPosition*) image_position;
+  VoxelPosition *src_pos = (VoxelPosition*) source_position;
+  float *avg_out = (float*) avg;
+  float *std_out = (float*) std;
+
+  const int block_dim = BLOCK_SIZE;
+  const int grid_dim = total_sources / block_dim + 1;
+
+  kernelCalculateBackground<<<grid_dim, block_dim>>>(image, img_pos, src_pos,
+                                                     avg_out, std_out, diameter,
+                                                     total_voxels, total_sources, start);
+
+  return DKS_SUCCESS;
+}
+
+/**
+ * Launch kernelCalculateSources for total_sources sources.
+ * Same launch shape as calculateSource, but diameter is a buffer (cast to float*)
+ * instead of a single value.
+ * The launch is not checked for errors: DKS_SUCCESS is always returned.
+ */
+int CudaImageReconstruction::calculateSources(void *image_space, void *image_position,
+                                              void *source_position, void *avg, void *std,
+                                              void *diameter, int total_voxels,
+                                              int total_sources, int start)
+{
+
+  //typed views of the opaque buffers
+  float *image = (float*) image_space;
+  VoxelPosition *img_pos = (VoxelPosition*) image_position;
+  VoxelPosition *src_pos = (VoxelPosition*) source_position;
+  float *avg_out = (float*) avg;
+  float *std_out = (float*) std;
+  float *diameters = (float*) diameter;
+
+  const int block_dim = BLOCK_SIZE;
+  const int grid_dim = total_sources / block_dim + 1;
+
+  kernelCalculateSources<<<grid_dim, block_dim>>>(image, img_pos, src_pos,
+                                                  avg_out, std_out, diameters,
+                                                  total_voxels, total_sources, start);
+
+  return DKS_SUCCESS;
+}
+
+/**
+ * Launch kernelCalculateBackgrounds for total_sources sources.
+ * Same launch shape as calculateBackground, but diameter is a buffer (cast to float*)
+ * instead of a single value.
+ * The launch is not checked for errors: DKS_SUCCESS is always returned.
+ */
+int CudaImageReconstruction::calculateBackgrounds(void *image_space, void *image_position,
+                                                  void *source_position, void *avg, void *std,
+                                                  void *diameter, int total_voxels,
+                                                  int total_sources, int start)
+{
+
+  //typed views of the opaque buffers
+  float *image = (float*) image_space;
+  VoxelPosition *img_pos = (VoxelPosition*) image_position;
+  VoxelPosition *src_pos = (VoxelPosition*) source_position;
+  float *avg_out = (float*) avg;
+  float *std_out = (float*) std;
+  float *diameters = (float*) diameter;
+
+  const int block_dim = BLOCK_SIZE;
+  const int grid_dim = total_sources / block_dim + 1;
+
+  kernelCalculateBackgrounds<<<grid_dim, block_dim>>>(image, img_pos, src_pos,
+                                                      avg_out, std_out, diameters,
+                                                      total_voxels, total_sources, start);
+
+  return DKS_SUCCESS;
+}
+
+/**
+ * Launch kernelNormalization on a 2D grid covering total_det x total_det threads.
+ * Blocks are 32 x 32 threads; each grid dimension has total_det / 32 + 1 blocks.
+ * Returns DKS_ERROR (after printing the CUDA error string) when cudaGetLastError
+ * reports a failure after the launch, DKS_SUCCESS otherwise.
+ */
+int CudaImageReconstruction::generateNormalization(void *recon, void *image_position,
+                                                   void *det_position, int total_det)
+{
+
+  const int tile = 32;
+  const int tiles_per_dim = total_det / tile + 1;
+
+  dim3 threads(tile, tile, 1);
+  dim3 blocks(tiles_per_dim, tiles_per_dim);
+
+  kernelNormalization<<<blocks, threads>>>( (float*) recon,
+                                            (VoxelPosition*) image_position,
+                                            (VoxelPosition*) det_position,
+                                            total_det);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error launching normalization kernel!");
+  std::cout << cudaGetErrorString(launch_status);
+  return DKS_ERROR;
+
+}
+
+/**
+ * Forward projection over num_events list-mode events.
+ *
+ * Steps:
+ *  1. allocate m_event_branch (one int per event) through the CudaBase;
+ *  2. kernelCheckEvents fills m_event_branch for every event;
+ *  3. thrust::sort_by_key sorts the branches and reorders list_data in place with them;
+ *  4. kernelForwardProjection writes into correction using the sorted events.
+ *
+ * m_event_branch is NOT released here; backwardProjection frees it.
+ *
+ * Returns DKS_ERROR if the branch buffer can not be allocated or if a CUDA error is
+ * pending after the launches (the error string is printed), DKS_SUCCESS otherwise.
+ */
+int CudaImageReconstruction::forwardProjection(void *correction, void *recon,
+                                               void *list_data, void *det_position,
+                                               void *image_position, int num_events)
+{
+
+  int threads = BLOCK_SIZE;
+  int blocks = num_events / threads + 1;
+
+  //allocate the per-event branch buffer and stop if that fails, the kernels below
+  //would otherwise write through an invalid pointer
+  int ierr = DKS_SUCCESS;
+  m_event_branch = m_base->cuda_allocateMemory(sizeof(int)*num_events, ierr);
+  if (ierr != DKS_SUCCESS) {
+    DEBUG_MSG("Error allocating memory for event branches!");
+    return DKS_ERROR;
+  }
+
+  kernelCheckEvents<<<blocks, threads >>>((ListEvent*)list_data,
+                                          (VoxelPosition*)det_position,
+                                          (int*)m_event_branch,
+                                          num_events);
+
+  //wrap mem pointers with thrust device ptr
+  thrust::device_ptr<int> t_event_branch( (int*)m_event_branch );
+  thrust::device_ptr<ListEvent> t_list_data( (ListEvent*)list_data );
+
+  //sort events by branch so that threads of a block mostly take the same ray tracer
+  thrust::sort_by_key( t_event_branch, t_event_branch + num_events, t_list_data );
+
+  kernelForwardProjection<<<blocks, threads>>>( (float*)correction,
+                                                (float*)recon,
+                                                (ListEvent*)list_data,
+                                                (VoxelPosition*)det_position,
+                                                (VoxelPosition*)image_position,
+                                                (int*)m_event_branch,
+                                                num_events);
+
+  //report launch failures to the caller instead of only printing them
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cout << "Error launching kernel!" << std::endl;
+    std::cout << cudaGetErrorString(err) << std::endl;
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+
+/**
+ * Backward projection over num_events list-mode events.
+ *
+ * kernelZeroBackward is launched over num_voxels entries of recon_corrector, then
+ * kernelBackwardProjection accumulates the per-event corrections into it, using the
+ * branch buffer m_event_branch that forwardProjection allocated.
+ *
+ * The branch buffer is always released here and the member is reset to NULL so that a
+ * second call without a new forwardProjection does not free the same pointer twice.
+ *
+ * Returns DKS_ERROR if a CUDA error is pending after the launches (the error string is
+ * printed), DKS_SUCCESS otherwise.
+ */
+int CudaImageReconstruction::backwardProjection(void *correction, void *recon_corrector,
+                                                void *list_data,
+                                                void *det_position, void *image_position,
+                                                int num_events, int num_voxels)
+{
+
+  int threads = BLOCK_SIZE;
+  int blocks1 = num_voxels / threads + 1;
+  int blocks2 = num_events / threads + 1;
+
+  kernelZeroBackward<<<blocks1, threads>>>((float*)recon_corrector, num_voxels);
+
+
+  kernelBackwardProjection<<<blocks2, threads>>>( (float*)correction,
+                                                  (float*)recon_corrector,
+                                                  (ListEvent*)list_data,
+                                                  (VoxelPosition*)det_position,
+                                                  (VoxelPosition*)image_position,
+                                                  (int*)m_event_branch,
+                                                  num_events);
+
+  //pick up launch errors before the buffer is released
+  cudaError_t err = cudaGetLastError();
+
+  //release the branch buffer in every case and forget the stale pointer
+  m_base->cuda_freeMemory( m_event_branch );
+  m_event_branch = NULL;
+
+  if (err != cudaSuccess) {
+    DEBUG_MSG("Error launching backward projection kernels!");
+    std::cout << cudaGetErrorString(err) << std::endl;
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+/**
+ * Copy the voxel grid dimensions and the voxel size into the device symbols
+ * d_voxel_x, d_voxel_y, d_voxel_z and d_voxel_size.
+ * Returns DKS_ERROR if cudaGetLastError reports an error afterwards, else DKS_SUCCESS.
+ */
+int CudaImageReconstruction::setDimensions(int voxel_x, int voxel_y, int voxel_z,
+                                           float voxel_size)
+{
+
+  //host values -> __device__ variables
+  cudaMemcpyToSymbol(d_voxel_x, &voxel_x, sizeof(voxel_x));
+  cudaMemcpyToSymbol(d_voxel_y, &voxel_y, sizeof(voxel_y));
+  cudaMemcpyToSymbol(d_voxel_z, &voxel_z, sizeof(voxel_z));
+  cudaMemcpyToSymbol(d_voxel_size, &voxel_size, sizeof(voxel_size));
+
+  if (cudaGetLastError() == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error copying to device memory!");
+  return DKS_ERROR;
+
+}
+
+/**
+ * Copy the three edge values into the device symbols d_x_edge, d_y_edge and d_z_edge.
+ * Returns DKS_ERROR if cudaGetLastError reports an error afterwards, else DKS_SUCCESS.
+ */
+int CudaImageReconstruction::setEdge(float x_edge, float y_edge, float z_edge)
+{
+
+  //host values -> __device__ variables
+  cudaMemcpyToSymbol(d_x_edge, &x_edge, sizeof(x_edge));
+  cudaMemcpyToSymbol(d_y_edge, &y_edge, sizeof(y_edge));
+  cudaMemcpyToSymbol(d_z_edge, &z_edge, sizeof(z_edge));
+
+  if (cudaGetLastError() == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error copying to device memory!");
+  return DKS_ERROR;
+
+}
+
+/**
+ * Copy the four values into the device symbols d_x_edge1, d_y_edge1, d_z_edge1
+ * and d_z_edge2.
+ * Returns DKS_ERROR if cudaGetLastError reports an error afterwards, else DKS_SUCCESS.
+ */
+int CudaImageReconstruction::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2)
+{
+
+  //host values -> __device__ variables
+  cudaMemcpyToSymbol(d_x_edge1, &x_edge1, sizeof(x_edge1));
+  cudaMemcpyToSymbol(d_y_edge1, &y_edge1, sizeof(y_edge1));
+  cudaMemcpyToSymbol(d_z_edge1, &z_edge1, sizeof(z_edge1));
+  cudaMemcpyToSymbol(d_z_edge2, &z_edge2, sizeof(z_edge2));
+
+  if (cudaGetLastError() == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error copying to device memory!");
+  return DKS_ERROR;
+
+}
+
+/**
+ * Copy the two values into the device symbols d_minimum_CrystalDistance_InOneRing
+ * and d_minimum_CrystalDistance_InOneRing1.
+ * Returns DKS_ERROR if cudaGetLastError reports an error afterwards, else DKS_SUCCESS.
+ */
+int CudaImageReconstruction::setMinCrystalInRing(float min_CrystalDist_InOneRing,
+                                                 float min_CrystalDist_InOneRing1)
+{
+
+  //host values -> __device__ variables
+  cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing,
+                     &min_CrystalDist_InOneRing, sizeof(min_CrystalDist_InOneRing));
+  cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing1,
+                     &min_CrystalDist_InOneRing1, sizeof(min_CrystalDist_InOneRing1));
+
+  if (cudaGetLastError() == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error copying to device memory!");
+  return DKS_ERROR;
+
+}
+
+/**
+ * Copy the four reconstruction parameters into the device symbols
+ * d_matrix_distance_factor, d_phantom_diameter, d_atten_per_mm and d_ring_diameter.
+ * Returns DKS_ERROR if cudaGetLastError reports an error afterwards, else DKS_SUCCESS.
+ */
+int CudaImageReconstruction::setParams(float matrix_distance_factor, float phantom_diameter,
+                                       float atten_per_mm, float ring_diameter)
+{
+
+  //host values -> __device__ variables
+  cudaMemcpyToSymbol(d_matrix_distance_factor, &matrix_distance_factor,
+                     sizeof(matrix_distance_factor));
+  cudaMemcpyToSymbol(d_phantom_diameter, &phantom_diameter, sizeof(phantom_diameter));
+  cudaMemcpyToSymbol(d_atten_per_mm, &atten_per_mm, sizeof(atten_per_mm));
+  cudaMemcpyToSymbol(d_ring_diameter, &ring_diameter, sizeof(ring_diameter));
+
+  if (cudaGetLastError() == cudaSuccess)
+    return DKS_SUCCESS;
+
+  DEBUG_MSG("Error copying to device memory!");
+  return DKS_ERROR;
+
+}
diff --git a/src/CUDA/CudaImageReconstruction.cuh b/src/CUDA/CudaImageReconstruction.cuh
new file mode 100644
index 0000000..4cf532c
--- /dev/null
+++ b/src/CUDA/CudaImageReconstruction.cuh
@@ -0,0 +1,118 @@
+#ifndef H_CUDA_IMAGERECONSTRUCTION
+#define H_CUDA_IMAGERECONSTRUCTION
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/count.h>
+
+#include "../Algorithms/ImageReconstruction.h"
+#include "CudaBase.cuh"
+
+/**
+ * CUDA implementation of the ImageReconstruction interface.
+ * All buffers are passed as untyped pointers and cast to their element types in the
+ * implementation file. Memory management goes through a CudaBase that is either
+ * created by this object or supplied by the caller.
+ */
+class CudaImageReconstruction : public ImageReconstruction {
+
+private:
+
+  /// true when m_base was created by this object and has to be deleted by it
+  bool base_create;
+  /// CudaBase used for device memory management
+  CudaBase *m_base;
+
+public:
+
+  /** Default constructor: creates a CudaBase owned by this object. */
+  CudaImageReconstruction() : base_create(true), m_base(new CudaBase()) { }
+
+  /** Constructor using an existing CudaBase; the base is not deleted by this object. */
+  CudaImageReconstruction(CudaBase *base) : base_create(false), m_base(base) { }
+
+  /** Destructor: deletes the CudaBase only when this object created it. */
+  ~CudaImageReconstruction() {
+    if (base_create)
+      delete m_base;
+  }
+
+  /** CUDA implementation of calculate source (single diameter for all sources). */
+  int calculateSource(void *image_space, void *image_position, void *source_position,
+                      void *avg, void *std, float diameter, int total_voxels,
+                      int total_sources, int start = 0);
+
+  /** CUDA implementation of calculate background (single diameter for all sources). */
+  int calculateBackground(void *image_space, void *image_position, void *source_position,
+                          void *avg, void *std, float diameter, int total_voxels,
+                          int total_sources, int start = 0);
+
+  /** Calculate source for different sources; diameter is a buffer of floats. */
+  int calculateSources(void *image_space, void *image_position, void *source_position,
+                       void *avg, void *std, void *diameter, int total_voxels,
+                       int total_sources, int start = 0);
+
+  /** Calculate background for different sources; diameter is a buffer of floats. */
+  int calculateBackgrounds(void *image_space, void *image_position, void *source_position,
+                           void *avg, void *std, void *diameter, int total_voxels,
+                           int total_sources, int start = 0);
+
+  /** Generate normalization.
+   * Launches the normalization kernel over all pairs of the total_det detectors.
+   */
+  int generateNormalization(void *recon, void *image_position,
+                            void *det_position, int total_det);
+
+
+  /** Calculate forward projection.
+   * For image reconstruction calculates forward projections for num_events events.
+   * Allocates the per-event branch buffer that backwardProjection releases.
+   */
+  int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
+                        void *image_position, int num_events);
+
+  /** Calculate backward projection.
+   * For image reconstruction calculates backward projections into recon_corrector.
+   * Releases the per-event branch buffer allocated by forwardProjection.
+   */
+  int backwardProjection(void *correction, void *recon_corrector, void *list_data,
+                         void *det_position, void *image_position,
+                         int num_events, int num_voxels);
+
+  /** Set the voxel dimensions and voxel size on the device. */
+  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
+
+  /** Set the image edge values on the device. */
+  int setEdge(float x_edge, float y_edge, float z_edge);
+
+  /** Set the image edge1 values (and z_edge2) on the device. */
+  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
+
+  /** Set the two minimum crystal distance in one ring values on the device. */
+  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
+
+  /** Set all other required parameters for reconstruction on the device. */
+  int setParams(float matrix_distance_factor, float phantom_diameter,
+                float atten_per_mm, float ring_diameter);
+
+
+};
+
+#endif
diff --git a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
new file mode 100644
index 0000000..b22cab7
--- /dev/null
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@@ -0,0 +1,316 @@
+#define PI     3.141592653589793115998
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+/** Theory function declaration.
+ * The definition of the theory function is built at runtime, before compilation.
+ */
+__device__ double fTheory(double t, double *p, double *f, int *m);
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * The first parameter of every function is always the time t; the remaining parameters
+ * depend on the function.
+ */
+/** Returns exp(-lamda * t). */
+__device__ double se(double t, double lamda) {
+  return exp( -lamda*t );
+}
+
+/** Returns exp(-(lamda * t)^beta). */
+__device__ double ge(double t, double lamda, double beta) {
+  return exp( -pow(lamda*t, beta) );
+}
+
+/** Returns exp(-0.5 * (sigma * t)^2). */
+__device__ double sg(double t, double sigma) {
+  return exp( -0.5*pow(sigma*t, 2.0) );
+}
+
+/** Returns 1/3 + 2/3 * (1 - s) * exp(-s/2) with s = (sigma * t)^2. */
+__device__ double stg(double t, double sigma) {
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5*sigmatsq);
+}
+
+/** Returns 1/3 + 2/3 * (1 - x) * exp(-x) with x = lambda * t. */
+__device__ double sekt(double t, double lambda) {
+  double lambdat = lambda*t;
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+/** Returns 1/3 + 2/3 * (1 - x - s) * exp(-x - s/2) with x = lambda * t, s = (sigma * t)^2. */
+__device__ double lgkt(double t, double lambda, double sigma) {
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+/**
+ * Returns 1/3 + 2/3 * (1 - s) * exp(-s / beta) with s = (sigma * t)^beta.
+ * Returns 0 when beta < 1e-3, which avoids the division by a (near) zero beta.
+ */
+__device__ double skt(double t, double sigma, double beta) {
+  if (beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+/**
+ * With rate2 = 4 * lambda^2 * (1 - q) * t / gamma and lamt2q = t^2 * lambda^2 * q:
+ *   rateL = sqrt(|rate2|), rateT = sqrt(|rate2| + lamt2q)
+ * returns 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q / rateT) * exp(-rateT).
+ * NOTE(review): gamma == 0 and rateT == 0 (e.g. t == 0) are not guarded; both divide by zero.
+ */
+__device__ double spg(double t, double lambda, double gamma, double q) {
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q;
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+/**
+ * With x = nu * t and l = lambda * t returns
+ *   1/6 * (1 - x/2) * exp(-x/2) + 1/3 * (1 - x/4) * exp(-0.25 * (x + 2.44949 * l)).
+ * NOTE(review): 2.44949 is presumably sqrt(6) - confirm against the MusrFit definition.
+ */
+__device__ double rahf(double t, double nu, double lambda) {
+  double nut  = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+/** Returns cos(TWO_PI * nu * t + DEG_TO_RAD * phi); phi is scaled by DEG_TO_RAD before use. */
+__device__ double tf(double t, double phi, double nu) {
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD*phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+/**
+ * With wt = TWO_PI * nu * t and ph = DEG_TO_RAD * phi returns
+ *   alpha * cos(wt + ph) * exp(-lambdaT * t) + (1 - alpha) * exp(-lambdaL * t).
+ */
+__device__ double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+/** Returns j0(TWO_PI * nu * t + DEG_TO_RAD * phi), j0 being the Bessel function of order 0. */
+__device__ double b(double t, double phi, double nu) {
+  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+/**
+ * With wt = TWO_PI * nu * t and ph = DEG_TO_RAD * phi returns
+ *   alpha * j0(wt + ph) * exp(-lambdaT * t) + (1 - alpha) * exp(-lambdaL * t).
+ */
+__device__ double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi;
+
+  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+/**
+ * Returns exp(-(sigma / gamma)^2 * (exp(-g) - 1 + g)) with g = gamma * t.
+ * NOTE(review): gamma == 0 is not guarded and divides by zero.
+ */
+__device__ double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+/**
+ * With D = (Delta0 * t)^2 and a = 1 / (1 + Rb^2 * D) returns
+ *   1/3 + 2/3 * a^1.5 * (1 - D * a) * exp(-0.5 * D * a).
+ */
+__device__ double snkzf(double t, double Delta0, double Rb) {
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+/**
+ * With D = (Delta0 * t)^2, a = 1 / (1 + Rb^2 * D), wt = TWO_PI * nu * t and
+ * ph = DEG_TO_RAD * phi returns sqrt(a) * exp(-0.5 * D * a) * cos(wt + ph).
+ */
+__device__ double snktf(double t, double phi, double nu, double Delta0, double Rb) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+/**
+ * With x = nuc * t, theta = (exp(-x) - 1 + x) / nuc^2 and
+ * a = 1 / (1 + 4 * (Rb * Delta0)^2 * theta) returns sqrt(a) * exp(-2 * Delta0^2 * theta * a).
+ *
+ * theta uses "+ x" (the same form as in ab()): exp(-x) - 1 + x is non-negative and
+ * behaves like x^2 / 2 for small x. With "- x" theta becomes negative, a can turn
+ * negative and sqrt(a) yields NaN.
+ * NOTE(review): nuc == 0 is not guarded and evaluates 0/0.
+ */
+__device__ double dnkzf(double t, double Delta0, double Rb, double nuc) {
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+/**
+ * With x = nuc * t, theta = (exp(-x) - 1 + x) / nuc^2,
+ * a = 1 / (1 + 2 * (Rb * Delta0)^2 * theta), wt = TWO_PI * nu * t and
+ * ph = DEG_TO_RAD * phi returns sqrt(a) * exp(-Delta0^2 * theta * a) * cos(wt + ph).
+ *
+ * theta uses "+ x" (the same form as in ab()): exp(-x) - 1 + x is non-negative and
+ * behaves like x^2 / 2 for small x. With "- x" theta becomes negative, a can turn
+ * negative and sqrt(a) yields NaN.
+ * NOTE(review): nuc == 0 is not guarded and evaluates 0/0.
+ */
+__device__ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+/** Theory and chi-square functions.
+ * Depending on the compiler flags, the theory is calculated either in single-histogram
+ * mode or in asymmetry mode, and either chi-square or MLE is calculated.
+ */
+
+/** Returns N0 * exp(-t / tau) * (1 + f) + bkg. Arguments are taken by reference but only read. */
+__device__ inline double singleHist(double &N0, double &tau, double &bkg, double &f, double &t) {
+  return N0 * exp (-t/tau ) * (1.0 + f) + bkg;
+}
+
+/**
+ * Returns (f * (a * b + 1) - (a - 1)) / ((a + 1) - f * (a * b - 1)).
+ * This is the same expression kernelChiSquareAsymmetry evaluates with a = alpha,
+ * b = beta and f = theory value; the numerator needs the "+ 1" so that a = b = 1
+ * gives back f. Arguments are taken by reference but only read.
+ */
+__device__ inline double asymetry(double &a, double &b, double &f) {
+  return (f * (a * b + 1.0) - (a - 1.0)) / ((a + 1.0) - f * (a * b - 1.0));
+}
+
+/**
+ * Compile-time selection of the theory value.
+ * Without ASYMETRY defined: singleHist(c1, c2, c3, f, t).
+ * With ASYMETRY defined: asymetry(c1, c2, f); c3 and t are unused in that case.
+ */
+__device__ inline double getTheory(double &c1, double &c2, double &c3, double &f, double &t) {
+#ifndef ASYMETRY
+  return singleHist(c1, c2, c3, f, t);
+#else
+  //plain #else: an #elif without a condition does not compile when ASYMETRY is defined
+  return asymetry(c1, c2, f);
+#endif
+}
+
+/**
+ * Returns (theo - data)^2, divided by err when err is not zero.
+ * NOTE(review): the chi-square kernels in this file divide by err * err and use
+ * theo * theo when err is zero; confirm whether err is meant to be a variance here.
+ */
+__device__ inline double chiSq(double &data, double &theo, double &err) {
+  double res = (theo - data) * (theo - data);
+  if (err != 0.0)
+    res /= err;
+ 
+  return res;
+}
+
+/**
+ * Returns (theo - data), plus data * log(data / theo) when data > 1e-9 and |theo| > 1e-9.
+ * err is not used.
+ * NOTE(review): the MLH branch of kernelChiSquareSingleHisto multiplies the same
+ * expression by 2.0; this helper does not.
+ */
+__device__ inline double mle(double &data, double &theo, double &err) {
+  double res = (theo - data);
+  if ( data > 1.0e-9 && fabs(theo) > 1.0e-9 )
+    res += data * log(data / theo);
+
+  return res;
+}
+
+/**
+ * Compile-time selection of the per-bin statistic.
+ * Without MLE defined: chiSq(data, theo, err). With MLE defined: mle(data, theo, err).
+ * NOTE(review): the kernels in this file test the macro MLH, not MLE - confirm which
+ * name the build actually defines.
+ */
+__device__ inline double getChiSq(double &data, double &theo, double &err) {
+#ifndef MLE
+  return chiSq(data, theo, err);
+#else
+  //plain #else: an #elif without a condition does not compile when MLE is defined
+  return mle(data, theo, err);
+#endif
+}
+
+//-----------------------------------------------------------------------------------------------
+/**
+ * Kernel to calculate theory function and chisquare/mle values for single histogram fits.
+ *
+ * Dynamic shared memory layout (in doubles): numpar parameters, then numfunc function
+ * values, followed by nummap ints for the maps. Every block first copies par, funcv
+ * and map from global memory into that area, then all threads walk the length data
+ * points in a grid-stride loop and write one value per point to chisq.
+ *
+ * With MLH defined the value is 2 * ((theo - data) + data * log(data / theo)), without
+ * the log term when data <= 1e-9 or |theo| <= 1e-9. Otherwise it is
+ * (theo - data)^2 / err^2, or theo^2 when err is zero.
+ */
+extern "C" __global__ void kernelChiSquareSingleHisto(double *data, double *err, double *par,
+                                                       double *chisq, int *map, double *funcv,
+                                                       int length, int numpar, int numfunc,
+                                                       int nummap, double timeStart,
+                                                       double timeStep, double tau, double N0,
+                                                       double bkg) {
+  //carve the dynamic shared memory into parameters, functions and maps
+  extern __shared__ double smem[];
+  double *p = (double*)smem;
+  double *f = (double*)&smem[numpar];
+  int *m = (int*)&smem[numpar + numfunc];
+
+  //cooperative load of the parameters from global to shared memory
+  for (int i = threadIdx.x; i < numpar; i += blockDim.x)
+    p[i] = par[i];
+
+  //cooperative load of the function values
+  for (int i = threadIdx.x; i < numfunc; i += blockDim.x)
+    f[i] = funcv[i];
+
+  //cooperative load of the maps
+  for (int i = threadIdx.x; i < nummap; i += blockDim.x)
+    m[i] = map[i];
+
+  //all shared data must be in place before any thread evaluates the theory
+  __syncthreads();
+
+  const unsigned int stride = gridDim.x * blockDim.x;
+  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += stride) {
+
+    double t = timeStart + j*timeStep;
+    double ldata = data[j];
+    double lerr = err[j];
+
+    double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;
+
+    #ifdef MLH
+    if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
+    else
+      chisq[j] = 2.0 * (theo - ldata);
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+  }
+}
+
+//-----------------------------------------------------------------------------------------------
+/**
+ * Kernel to calculate theory function and chisquare/mle values for asymmetry fits.
+ *
+ * Dynamic shared memory layout (in doubles): numpar parameters, then numfunc function
+ * values, followed by nummap ints for the maps. Every block first copies par, funcv
+ * and map from global memory into that area, then all threads walk the length data
+ * points in a grid-stride loop and write one value per point to chisq.
+ *
+ * The theory value f is turned into an asymmetry with
+ *   ((alpha*beta + 1) * f - (alpha - 1)) / ((alpha + 1) - (alpha*beta - 1) * f).
+ * With MLH defined chisq is set to 0; otherwise it is (theo - data)^2 / err^2, or
+ * theo^2 when err is zero.
+ */
+extern "C" __global__ void kernelChiSquareAsymmetry(double *data, double *err, double *par,
+                                                     double *chisq, int *map, double *funcv,
+                                                     int length, int numpar, int numfunc,
+                                                     int nummap, double timeStart,
+                                                     double timeStep, double alpha,
+                                                     double beta) {
+  //carve the dynamic shared memory into parameters, functions and maps
+  extern __shared__ double smem[];
+  double *p = (double*)smem;
+  double *f = (double*)&smem[numpar];
+  int *m = (int*)&smem[numpar + numfunc];
+
+  //cooperative load of the parameters from global to shared memory
+  for (int i = threadIdx.x; i < numpar; i += blockDim.x)
+    p[i] = par[i];
+
+  //cooperative load of the function values
+  for (int i = threadIdx.x; i < numfunc; i += blockDim.x)
+    f[i] = funcv[i];
+
+  //cooperative load of the maps
+  for (int i = threadIdx.x; i < nummap; i += blockDim.x)
+    m[i] = map[i];
+
+  //all shared data must be in place before any thread evaluates the theory
+  __syncthreads();
+
+  const unsigned int stride = gridDim.x * blockDim.x;
+  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += stride) {
+
+    double t = timeStart + j*timeStep;
+    double ldata = data[j];
+    double lerr = err[j];
+
+    double theoVal = fTheory(t, p, f, m);
+    double ab = alpha*beta;
+
+    double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0) - (ab-1.0)*theoVal);
+
+    #ifdef MLH
+    chisq[j] = 0.0; // log max likelihood not defined here
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+  }
+}
+
diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp
new file mode 100644
index 0000000..96e9b19
--- /dev/null
+++ b/src/DKSBase.cpp
@@ -0,0 +1,861 @@
+#include "DKSBase.h"
+
+#define API_OPENCL "OpenCL"
+#define API_CUDA "Cuda"
+#define API_OPENMP "OpenMP"
+
+#define DEVICE_GPU "-gpu"
+#define DEVICE_CPU "-cpu"
+#define DEVICE_MIC "-mic"
+
+//=====================================//
+//==========Private functions==========//
+//=====================================//
+
+/* true when an API has been set and its name equals API_OPENCL */
+bool DKSBase::apiOpenCL() {
+  return m_api_set && strcmp(m_api_name, API_OPENCL) == 0;
+}
+
+/* true when an API has been set and its name equals API_CUDA */
+bool DKSBase::apiCuda() {
+  return m_api_set && strcmp(m_api_name, API_CUDA) == 0;
+}
+
+/* true when an API has been set and its name equals API_OPENMP */
+bool DKSBase::apiOpenMP() {
+  return m_api_set && strcmp(m_api_name, API_OPENMP) == 0;
+}
+
+/* true when a device has been set and its name equals DEVICE_GPU */
+bool DKSBase::deviceGPU() {
+  return m_device_set && strcmp(m_device_name, DEVICE_GPU) == 0;
+}
+
+/* true when a device has been set and its name equals DEVICE_CPU */
+bool DKSBase::deviceCPU() {
+  return m_device_set && strcmp(m_device_name, DEVICE_CPU) == 0;
+}
+
+/* true when a device has been set and its name equals DEVICE_MIC */
+bool DKSBase::deviceMIC() {
+  return m_device_set && strcmp(m_device_name, DEVICE_MIC) == 0;
+}
+
+
+/*
+  Name: loadOpenCLKernel
+  Info: builds the kernel file path as OPENCL_KERNELS followed by kernel_name and hands
+        it to the OpenCL base for loading. The path buffer is sized from the two
+        strings, so long install prefixes or kernel names can not overflow it.
+  Return: DKS_ERROR when kernel_name is NULL, otherwise the result of the load call
+*/
+int DKSBase::loadOpenCLKernel(const char *kernel_name) {
+    if (kernel_name == NULL)
+        return DKS_ERROR;
+
+    //room for both parts plus the terminating '\0'
+    const size_t path_len = strlen(OPENCL_KERNELS) + strlen(kernel_name) + 1;
+    char * kernel_file = new char[path_len];
+    strcpy(kernel_file, OPENCL_KERNELS);
+    strcat(kernel_file, kernel_name);
+
+    //load kernel
+    int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) );
+    delete[] kernel_file;
+
+    return ierr;
+}
+
+//=====================================//
+//==========Public functions===========//
+//=====================================//
+
+/*
+  Default constructor: no api, device or function name is set, auto tuning and
+  config use are off. For every backend compiled in (DKS_CUDA, DKS_OPENCL, DKS_MIC)
+  the base object is created first and then passed to the algorithm objects.
+*/
+DKSBase::DKSBase() {
+
+  m_device_name = NULL;
+  m_api_name = NULL;
+  m_function_name = NULL;
+
+  m_device_set = false;
+  m_api_set = false;
+  m_function_set = false;
+  
+  m_auto_tuning = false;
+  m_use_config = false;
+
+#ifdef DKS_CUDA
+  // base first: the algorithm objects below receive it in their constructors
+  cbase = new CudaBase();
+  cfft = new CudaFFT(cbase);
+  cgreens = new CudaGreensFunction(cbase);
+  cchi = new CudaChiSquare(cbase);
+  ccol = new CudaCollimatorPhysics(cbase);
+#endif
+
+#ifdef DKS_OPENCL
+  oclbase = new OpenCLBase();
+  oclfft = new OpenCLFFT(oclbase);
+  oclchi = new OpenCLChiSquare(oclbase);
+  oclcol = new OpenCLCollimatorPhysics(oclbase);
+#endif
+
+#ifdef DKS_MIC
+  micbase = new MICBase();
+  micfft = new MICFFT(micbase);
+  miccol = new MICCollimatorPhysics(micbase);
+  micgreens = new MICGreensFunction(micbase);
+  micchi = new MICChiSquare(micbase);
+#endif
+
+}
+
+/*
+  Constructor with api and device name: same set up as the default constructor, but
+  the api and device names are stored right away.
+  The name pointers and their "set" flags are initialised BEFORE setAPI/setDevice are
+  called, because both functions test the flag and delete[] the old name when it is true.
+*/
+DKSBase::DKSBase(const char* api_name, const char* device_name) {
+
+  //start from a defined empty state, setAPI/setDevice read these members
+  m_device_name = NULL;
+  m_api_name = NULL;
+  m_function_name = NULL;
+
+  m_device_set = false;
+  m_api_set = false;
+  m_function_set = false;
+
+  m_auto_tuning = false;
+  m_use_config = false;
+
+  setAPI(api_name, strlen(api_name));
+  setDevice(device_name, strlen(device_name));
+
+#ifdef DKS_CUDA
+  // base first: the algorithm objects below receive it in their constructors
+  cbase = new CudaBase();
+  cfft = new CudaFFT(cbase);
+  cgreens = new CudaGreensFunction(cbase);
+  cchi = new CudaChiSquare(cbase);
+  ccol = new CudaCollimatorPhysics(cbase);
+#endif
+
+#ifdef DKS_OPENCL
+  oclbase = new OpenCLBase();
+  oclfft = new OpenCLFFT(oclbase);
+  oclchi = new OpenCLChiSquare(oclbase);
+  oclcol = new OpenCLCollimatorPhysics(oclbase);
+#endif
+
+#ifdef DKS_MIC
+  micbase = new MICBase();
+  micfft = new MICFFT(micbase);
+  miccol = new MICCollimatorPhysics(micbase);
+  micgreens = new MICGreensFunction(micbase);
+  micchi = new MICChiSquare(micbase);
+#endif
+
+}
+
+
+/*
+  Destructor: frees the stored names and, for every backend compiled in, the algorithm
+  objects followed by the base object they were constructed with.
+*/
+DKSBase::~DKSBase() {
+
+  //delete[] on a null pointer does nothing, so unset names need no test
+  delete[] m_device_name;
+  delete[] m_api_name;
+  delete[] m_function_name;
+
+#ifdef DKS_CUDA
+  delete cfft;
+  delete cgreens;
+  delete cchi;
+  delete ccol;
+  //base last, the objects above were created on top of it
+  delete cbase;
+#endif
+
+#ifdef DKS_OPENCL
+  delete oclfft;
+  delete oclchi;
+  delete oclcol;
+  delete oclbase;
+#endif
+
+#ifdef DKS_MIC
+  delete micfft;
+  delete miccol;
+  delete micgreens;
+  delete micchi;
+  delete micbase;
+#endif
+
+}
+
+/*
+  Name: setDevice
+  Info: stores a private copy of device_name as the device to use. A previously stored
+        name is released first. length is not used (deprecated), the length is taken
+        from the string itself.
+  Return: DKS_SUCCESS
+*/
+int DKSBase::setDevice(const char* device_name, int length) {
+
+  //release the old name before replacing it
+  if (m_device_set)
+    delete[] m_device_name;
+
+  const int name_len = strlen(device_name);
+  m_device_name = new char[name_len + 1];
+
+  //copies name_len characters and the terminating '\0'
+  strcpy(m_device_name, device_name);
+
+  m_device_set = true;
+
+  return DKS_SUCCESS;
+
+}
+
+/*
+  Name: setAPI
+  Info: stores a private copy of api_name (OpenCL, CUDA, OpenACC, OpenMP) as the api to
+        use. A previously stored name is released first. length is not used, the
+        length is taken from the string itself.
+  Return: DKS_SUCCESS
+*/
+int DKSBase::setAPI(const char* api_name, int length) {
+
+  //release the old name before replacing it
+  if (m_api_set)
+    delete[] m_api_name;
+
+  const int name_len = strlen(api_name);
+  m_api_name = new char[name_len + 1];
+
+  //copies name_len characters and the terminating '\0'
+  strcpy(m_api_name, api_name);
+
+  m_api_set = true;
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Name: getDevices
+  Info: queries the available devices through the OpenCL, CUDA and MIC bases
+  Return: DKS_SUCCESS when the three return codes add up to DKS_SUCCESS, else DKS_ERROR
+*/
+int DKSBase::getDevices() {
+
+  const int ocl_status = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() );
+  const int cuda_status = CUDA_SAFECALL( cbase->cuda_getDevices() );
+  const int mic_status = MIC_SAFECALL( micbase->mic_getDevices() );
+
+  const bool all_ok = (ocl_status + cuda_status + mic_status == DKS_SUCCESS);
+
+  return all_ok ? DKS_SUCCESS : DKS_ERROR;
+}
+
+/*
+  Name: getDeviceCount
+  Info: sets ndev to 0 and then asks the OpenCL or CUDA base for the number of devices
+  Return: result of the backend call, DKS_ERROR for OpenMP or when no api matches
+*/
+int DKSBase::getDeviceCount(int &ndev) {
+  ndev = 0;
+
+  if (apiOpenCL())
+    return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) );
+
+  if (apiCuda())
+    return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) );
+
+  //OpenMP and every other api: not available
+  return DKS_ERROR;
+}
+
+/*
+  Name: getDeviceName
+  Info: asks the OpenCL or CUDA base to write the device name into device_name
+  Return: result of the backend call, DKS_ERROR for OpenMP or when no api matches
+*/
+int DKSBase::getDeviceName(std::string &device_name) {
+
+  if (apiOpenCL())
+    return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) );
+
+  if (apiCuda())
+    return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) );
+
+  //OpenMP and every other api: not available
+  return DKS_ERROR;
+}
+
+/*
+  Name: setDefaultDevice
+  Info: prints the requested device number and selects it through the OpenCL or CUDA base
+  Return: result of the backend call, DKS_ERROR for OpenMP or when no api matches
+*/
+int DKSBase::setDefaultDevice(int device) {
+  std::cout << "Set device " << device << std::endl;
+
+  if (apiOpenCL())
+    return OPENCL_SAFECALL( oclbase->ocl_setDevice(device) );
+
+  if (apiCuda())
+    return CUDA_SAFECALL( cbase->cuda_setDevice(device) );
+
+  //OpenMP and every other api: not available
+  return DKS_ERROR;
+}
+
+/*
+  Name: getDeviceList
+  Info: asks the OpenCL or CUDA base to fill devices with its unique devices
+  Return: result of the backend call, DKS_ERROR for OpenMP or when no api matches
+*/
+int DKSBase::getDeviceList(std::vector<int> &devices) {
+
+  if (apiOpenCL())
+    return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) );
+
+  if (apiCuda())
+    return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices)  );
+
+  //OpenMP and every other api: not available
+  return DKS_ERROR;
+}
+
+/*
+  Name: initDevice
+  Info: initialises the selected api/device pair.
+        - no api set, or OpenCL without a device: OpenCL on "-gpu" is used
+        - OpenCL with a device: OpenCL is set up on the stored device name
+        - Cuda: device is forced to "-gpu"
+        - OpenMP: device is forced to "-mic"
+  Return: result of the backend call, DKS_ERROR when the api name is not known
+*/
+int DKSBase::initDevice() {
+
+  //if api is not set default is OpenCL, the same default is used for OpenCL without device
+  const bool opencl_default = !m_api_set || (apiOpenCL() && !m_device_set);
+  if (opencl_default) {
+    setDevice("-gpu", 4);
+    setAPI(API_OPENCL, 6);
+    return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
+  }
+
+  if (apiOpenCL()) {
+    setAPI(API_OPENCL, 6);
+    return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
+  }
+
+  if (apiCuda()) {
+    setDevice("-gpu", 4);
+    setAPI(API_CUDA, 4);
+    return CUDA_SAFECALL(DKS_SUCCESS);
+  }
+
+  if (apiOpenMP()) {
+    setDevice("-mic", 4);
+    setAPI(API_OPENMP, 6);
+    return MIC_SAFECALL(DKS_SUCCESS);
+  }
+
+  return DKS_ERROR;
+}
+
+/*
+   Create a stream for async data transfer and kernel execution and return its id in
+   streamId. The name stream is taken from cuda (opencl context ~ cuda stream).
+   Only Cuda and OpenMP (MIC) are handled; every other api logs a message and
+   returns DKS_ERROR.
+   TODO: implementations for OpenCL still needed
+*/
+int DKSBase::createStream(int &streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cbase->cuda_createStream(streamId) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micbase->mic_createStream(streamId) );
+  }
+
+  DEBUG_MSG("Streams not enbled for this platforms jet");
+  return DKS_ERROR;
+}
+
+/* send device pointer to other processes
+   Only compiled with DKS_MPI. For Cuda (with DKS_CUDA) an IPC memory handle is created
+   for mem_ptr and sent as raw bytes with tag 100 to rank dest on comm.
+   Returns DKS_SUCCESS when the handle was created, DKS_ERROR in every other case
+   (MIC and other apis are not implemented).
+*/
+#ifdef DKS_MPI
+int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) {
+
+  if ( apiCuda() ) {
+#ifdef DKS_CUDA
+    cudaError cerror;
+    cudaIpcMemHandle_t shandle;
+    cerror = cudaIpcGetMemHandle(&shandle, mem_ptr);
+    // the handle is sent before cerror is checked, so the message goes out even when
+    // creating the handle failed
+    // NOTE(review): presumably so the matching receive does not block - confirm
+    MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm);
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error geting mem handle");
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+#endif
+  }
+  else if (apiOpenMP()) {
+#ifdef DKS_MIC 
+    //BENI:
+    DEBUG_MSG("No  SendPointer for MIC is implemented");
+    return DKS_ERROR;
+#endif
+  }
+  else {
+    DEBUG_MSG("Send device pointer not implemented on selected platform");
+    return DKS_ERROR;
+  }
+  // reached when the selected api was compiled out (DKS_CUDA / DKS_MIC not defined)
+  return DKS_ERROR;
+}
+#endif
+
+/* receive device pointer
+   Only compiled with DKS_MPI. For Cuda (with DKS_CUDA) an IPC memory handle is received
+   as raw bytes with tag 100 from rank hostproc on comm and opened with lazy peer access.
+   ierr is always set: DKS_SUCCESS when a usable pointer is returned, DKS_ERROR
+   otherwise. On every error path the returned pointer is NULL, never an uninitialised
+   value. MIC and other apis are not implemented and report DKS_ERROR.
+*/
+#ifdef DKS_MPI
+void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) {
+
+  void *mem_ptr = NULL;
+  ierr = DKS_SUCCESS;
+
+  if (apiCuda()) {
+#ifdef DKS_CUDA
+    cudaError cerror;
+    cudaIpcMemHandle_t rhandle;
+    //MPI_STATUS_IGNORE is the portable way to skip the status, NULL is not
+    MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm,
+	     MPI_STATUS_IGNORE);
+    cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess);
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error opening received handle");
+      ierr = DKS_ERROR;
+      mem_ptr = NULL;
+    }
+#else
+    //Cuda selected but not compiled in
+    ierr = DKS_ERROR;
+#endif
+    return mem_ptr;
+  }
+  else if (apiOpenMP()) {
+#ifdef DKS_MIC
+    //BENI:
+    DEBUG_MSG("No  ReceivePointer for MIC is implemented");
+#endif
+    //nothing was received, report it through ierr instead of returning a status code
+    //disguised as a pointer
+    ierr = DKS_ERROR;
+    return mem_ptr;
+  }
+  else {
+    ierr = DKS_ERROR;
+    DEBUG_MSG("Receive device pointer not implemented for selected platform");
+    return mem_ptr;
+  }
+}
+#endif
+
+/* Close a device memory handle.
+ * Only the CUDA path is implemented (cudaIpcCloseMemHandle); every other
+ * configuration logs a message and returns DKS_ERROR.
+ */
+int DKSBase::closeHandle(void *mem_ptr) {
+
+  if (apiCuda()) {
+#ifdef DKS_CUDA
+    const cudaError status = cudaIpcCloseMemHandle(mem_ptr);
+    if (status == cudaSuccess)
+      return DKS_SUCCESS;
+
+    DEBUG_MSG("Error closing memory handle");
+    return DKS_ERROR;
+#endif
+  }
+
+  // Also reached when CUDA is selected but DKS_CUDA was not compiled in.
+  DEBUG_MSG("Memory handles not implemented for selected platform");
+  return DKS_ERROR;
+
+}
+
+/* Synchronize with the device.
+ * Forwards to the CUDA or MIC backend; any other API yields DKS_ERROR.
+ */
+int DKSBase::syncDevice() {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cbase->cuda_syncDevice() );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micbase->mic_syncDevice() );
+  }
+
+  return DKS_ERROR;
+}
+
+/* Set up FFT plans so they can be reused when several FFTs of the same size are needed.
+ *
+ * ndim - number of dimensions, N - grid size per dimension.
+ * CUDA: forwards to cfft->setupFFT.
+ * OpenMP (MIC): sets up a real-to-complex plan with scale 1 and a complex-to-real
+ * plan with scale 1/(N[0]*N[1]*N[2]).
+ * Returns DKS_SUCCESS, the first failing MIC status, or DKS_ERROR for other APIs.
+ */
+int DKSBase::setupFFT(int ndim, int N[3]) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
+  } else if (apiOpenMP()) {
+    // The element count is formed in double precision: the plain int product
+    // N[0]*N[1]*N[2] overflows for large grids before the division takes place.
+    // NOTE(review): all three entries of N are used regardless of ndim - confirm
+    // that callers set unused dimensions to 1.
+    int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
+    int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(static_cast<double>(N[0])*N[1]*N[2])) );
+    if (ierr1 != DKS_SUCCESS)
+      return ierr1;
+    if (ierr2 != DKS_SUCCESS)
+      return ierr2;
+    return DKS_SUCCESS;
+  }
+
+  return DKS_ERROR;
+
+}
+/* Set up the RC FFT plan.
+ * On CUDA the generic cfft->setupFFT is called and scale is not used; on OpenMP
+ * (MIC) scale is forwarded to micfft->setupFFTRC. Other APIs return DKS_ERROR.
+ */
+int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
+  }
+
+  return DKS_ERROR;
+
+}
+
+/* Set up the CR FFT plan.
+ * On CUDA the generic cfft->setupFFT is called and scale is not used; on OpenMP
+ * (MIC) scale is forwarded to micfft->setupFFTCR. Other APIs return DKS_ERROR.
+ */
+int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
+  }
+
+  return DKS_ERROR;
+
+}
+
+/* Execute an FFT on data_ptr with the backend of the selected API
+ * (OpenCL, CUDA or OpenMP). streamId is only forwarded on the CUDA path.
+ */
+int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiOpenCL()) {
+    // The kernel file has to load successfully before the FFT is executed.
+    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
+      return DKS_ERROR;
+    return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
+  }
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* Execute an inverse FFT on data_ptr with the backend of the selected API
+ * (OpenCL, CUDA or OpenMP). streamId is only forwarded on the CUDA path.
+ */
+int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiOpenCL()) {
+    // The kernel file has to load successfully before the IFFT is executed.
+    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
+      return DKS_ERROR;
+    return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
+  }
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* Call the FFT normalization routine of the selected API's backend
+ * (OpenCL, CUDA or OpenMP). streamId is only forwarded on the CUDA path.
+ */
+int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiOpenCL()) {
+    // The kernel file has to load successfully before normalization runs.
+    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
+      return DKS_ERROR;
+    return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
+  }
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* Real-to-complex FFT from real_ptr into comp_ptr.
+ * Available for CUDA (streamId is forwarded) and OpenMP; other APIs return DKS_ERROR.
+ */
+int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* Complex-to-real FFT.
+ * Available for CUDA (streamId is forwarded) and OpenMP; other APIs return DKS_ERROR.
+ * Note the argument order: the CUDA backend is called with (real_ptr, comp_ptr),
+ * the MIC backend with (comp_ptr, real_ptr).
+ */
+int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+}
+
+/* Normalize the result of a complex-to-real inverse FFT.
+ * Only CUDA has an implementation (cfft->normalizeCRFFT).
+ * NOTE(review): for every other API a "No implementation" message is logged but
+ * DKS_SUCCESS is returned, unlike the sibling FFT calls which return DKS_ERROR -
+ * confirm whether this is intentional.
+ */
+int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_SUCCESS;
+}
+
+/* Execute the transpose kernel on mem_ptr.
+ * Implemented for OpenCL only: the transpose kernel file is loaded and
+ * oclfft->ocl_executeTranspose is called. Other APIs return DKS_ERROR.
+ */
+int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
+
+  if (apiOpenCL()) {
+    // Fail early when the kernel file cannot be loaded.
+    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") != DKS_SUCCESS)
+      return DKS_ERROR;
+    return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
+  }
+
+  DEBUG_MSG("No implementation for selected platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the Greens integral computation to the CUDA or MIC backend.
+ * NI, NJ and streamId are only passed to the CUDA implementation; the MIC
+ * call receives I, J, K and the three hz_m values.
+ */
+int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
+                                double hz_m0, double hz_m1, double hz_m2, int streamId) {
+
+  if (apiCuda())
+    return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
+                                                      hz_m0, hz_m1, hz_m2, streamId) );
+
+  if (apiOpenMP())
+    return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+/* Forward the Greens function integration to the CUDA or MIC backend.
+ * streamId is only used on the CUDA path; other APIs return DKS_ERROR.
+ */
+int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
+                                   int I, int J, int K, int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+/* Forward the rho-field mirroring to the CUDA or MIC backend.
+ * streamId is only used on the CUDA path; other APIs return DKS_ERROR.
+ */
+int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+/* Forward the multiplication of two complex fields to the CUDA or MIC backend.
+ * streamId is only used on the CUDA path; other APIs return DKS_ERROR.
+ */
+int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+
+/* Forward the PHistoTFFcn chi-square evaluation to the CUDA or OpenCL backend.
+ * The OpenCL path first loads the chi-square kernel file. The backend receives
+ * result by reference. Other APIs return DKS_ERROR.
+ */
+int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
+                             double fTimeResolution, double fRebin,
+                             int sensors, int length, int numpar, double &result)
+{
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
+                                                fTimeResolution, fRebin,
+                                                sensors, length, numpar, result));
+  }
+
+  if (apiOpenCL()) {
+    // Fail early when the kernel file cannot be loaded.
+    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
+      return DKS_ERROR;
+    return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
+                                                   fTimeResolution, fRebin,
+                                                   sensors, length, numpar, result));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the singleGaussTF chi-square evaluation to the CUDA or OpenCL backend.
+ * The OpenCL path first loads the chi-square kernel file. The backend receives
+ * result by reference. Other APIs return DKS_ERROR.
+ */
+int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                               double fTimeResolution, double fRebin, double fGoodBinOffset,
+                               int sensors, int length, int numpar,
+                               double &result)
+{
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
+                                                  fTimeResolution, fRebin, fGoodBinOffset,
+                                                  sensors, length, numpar, result));
+  }
+
+  if (apiOpenCL()) {
+    // Fail early when the kernel file cannot be loaded.
+    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
+      return DKS_ERROR;
+    return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
+                                                     fTimeResolution, fRebin, fGoodBinOffset,
+                                                     sensors, length, numpar, result));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the doubleLorentzTF chi-square evaluation to the CUDA or OpenCL backend.
+ * The OpenCL path first loads the chi-square kernel file. The backend receives
+ * result by reference. Other APIs return DKS_ERROR.
+ */
+int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                                 double fTimeResolution, double fRebin, double fGoodBinOffset,
+                                 int sensors, int length, int numpar,
+                                 double &result)
+{
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
+                                                    fTimeResolution, fRebin, fGoodBinOffset,
+                                                    sensors, length, numpar, result));
+  }
+
+  if (apiOpenCL()) {
+    // Fail early when the kernel file cannot be loaded.
+    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
+      return DKS_ERROR;
+    return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
+                                                       fTimeResolution, fRebin, fGoodBinOffset,
+                                                       sensors, length, numpar, result));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the collimator physics step to the CUDA, OpenCL or MIC backend.
+ * Only mem_ptr, par_ptr and numparticles are passed on; numparams, numaddback
+ * and numdead are not used by this function.
+ */
+int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
+                                   int numparticles, int numparams,
+                                   int &numaddback, int &numdead)
+{
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
+  }
+
+  if (apiOpenCL()) {
+    // Fail early when the kernel file cannot be loaded.
+    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") != DKS_SUCCESS)
+      return DKS_ERROR;
+    return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+
+/* Forward the collimator physics step to the CUDA or MIC backend.
+ * Other APIs return DKS_ERROR.
+ */
+int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
+{
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+/* Forward the collimator physics step on separate per-component arrays
+ * (label, localID, rx/ry/rz, px/py/pz) to the MIC backend.
+ * Only OpenMP is supported; other APIs return DKS_ERROR.
+ */
+int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
+                                      void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                                      void *px_ptr, void *py_ptr, void *pz_ptr,
+                                      void *par_ptr, int numparticles)
+{
+
+  if (!apiOpenMP()) {
+    DEBUG_MSG("No implementation for selceted platform");
+    return DKS_ERROR;
+  }
+
+  return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
+                                                    rx_ptr, ry_ptr, rz_ptr,
+                                                    px_ptr, py_ptr, pz_ptr,
+                                                    par_ptr,  numparticles) );
+}
+
+
+/* Forward the collimator physics sort to the CUDA or MIC backend.
+ * numaddback is passed to the backend by reference. Other APIs return DKS_ERROR.
+ */
+int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
+{
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+}
+
+/* Forward the collimator physics sort on separate per-component arrays to the
+ * MIC backend. numaddback is passed to the backend by reference.
+ * Only OpenMP is supported; other APIs return DKS_ERROR.
+ */
+int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
+                                          void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                                          void *px_ptr, void *py_ptr, void *pz_ptr,
+                                          void *par_ptr, int numparticles, int &numaddback)
+{
+
+  if (!apiOpenMP()) {
+    DEBUG_MSG("No implementation for selceted platform");
+    return DKS_ERROR;
+  }
+
+  return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
+                                                       rx_ptr, ry_ptr, rz_ptr,
+                                                       px_ptr, py_ptr, pz_ptr,
+                                                       par_ptr,  numparticles, numaddback));
+
+}
+
+
+/* Create `size` random number states/streams with the backend of the selected API
+ * (CUDA, OpenCL or OpenMP). Other configurations return DKS_ERROR.
+ */
+int DKSBase::callInitRandoms(int size) {
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(cbase->cuda_createCurandStates(size));
+  }
+
+  if (apiOpenCL()) {
+    return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(micbase->mic_createRandStreams(size));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the ParallelTTracker push to the CUDA or MIC backend.
+ * All arguments, including streamId, are passed on unchanged.
+ * Other APIs return DKS_ERROR.
+ */
+int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
+                                      void *dt_ptr, double dt, double c,
+                                      bool usedt, int streamId)
+{
+
+  if (apiCuda()) {
+    return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
+                                                    usedt, streamId));
+  }
+
+  if (apiOpenMP()) {
+    return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
+                                                     c, usedt, streamId));
+  }
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
+
+/* Forward the ParallelTTracker push-with-transform to the CUDA or MIC backend.
+ * All arguments, including streamId, are passed on unchanged.
+ * Other APIs return DKS_ERROR.
+ */
+int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
+                                               void *lastSec_ptr, void *orient_ptr,
+                                               int npart, int nsec, void *dt_ptr, double dt,
+                                               double c, bool usedt, int streamId)
+{
+
+  if (apiCuda())
+    return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
+                                                             lastSec_ptr, orient_ptr,
+                                                             npart, nsec, dt_ptr, dt,
+                                                             c, usedt, streamId));
+
+  if (apiOpenMP())
+    return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
+                                                              lastSec_ptr, orient_ptr,
+                                                              npart, nsec, dt_ptr, dt,
+                                                              c, usedt, streamId));
+
+  DEBUG_MSG("No implementation for selceted platform");
+  return DKS_ERROR;
+
+}
diff --git a/src/DKSBase.h b/src/DKSBase.h
new file mode 100644
index 0000000..ea8bc39
--- /dev/null
+++ b/src/DKSBase.h
@@ -0,0 +1,1133 @@
+/** DKSBase class.
+ * DKSBase.h
+ * Author: Uldis Locans
+ * Date: 15.09.2014
+ * Base class of Dynamic Kernel Scheduler that handles the function calls
+ * from host application to DKS
+ */
+
+#ifndef H_DKS_BASE
+#define H_DKS_BASE
+
+#include <iostream>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "DKSDefinitions.h"
+
+#ifdef DKS_MPI
+#include <mpi.h>
+#endif
+
+#ifdef DKS_OPENCL
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "OpenCL/OpenCLBase.h"
+#include "OpenCL/OpenCLFFT.h"
+#include "OpenCL/OpenCLChiSquare.h"
+#include "OpenCL/OpenCLCollimatorPhysics.h"
+#endif
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaBase.cuh"
+#include "CUDA/CudaFFT.cuh"
+#include "CUDA/CudaGreensFunction.cuh"
+#include "CUDA/CudaChiSquare.cuh"
+#include "CUDA/CudaCollimatorPhysics.cuh"
+#include "nvToolsExt.h"
+#endif
+
+#ifdef DKS_MIC
+#include "MIC/MICBase.h"
+#include "MIC/MICChiSquare.h"
+#include "MIC/MICFFT.h"
+#include "MIC/MICCollimatorPhysics.h"
+#include "MIC/MICGreensFunction.hpp"
+#endif
+
+#include "Algorithms/CollimatorPhysics.h"
+#include "Algorithms/FFT.h"
+
+#include "AutoTuning/DKSConfig.h"
+
+/** DKSBase class for handling function calls to DKS library */
+class DKSBase {
+
+private:
+  char *m_device_name;
+  char *m_api_name;
+  char *m_function_name;
+
+  bool m_device_set;
+  bool m_api_set;
+  bool m_function_set;
+
+  bool m_auto_tuning;
+  bool m_use_config;
+
+#ifdef DKS_OPENCL	
+  OpenCLBase *oclbase;
+  OpenCLFFT *oclfft;
+  OpenCLChiSquare *oclchi;
+  OpenCLCollimatorPhysics *oclcol;
+#endif
+
+#ifdef DKS_CUDA
+  CudaBase *cbase;
+  CudaFFT *cfft;
+  CudaGreensFunction *cgreens;
+  CudaChiSquare *cchi;
+  CudaCollimatorPhysics *ccol;
+#endif
+
+#ifdef DKS_MIC
+  MICBase *micbase;
+  MICFFT *micfft;
+  MICCollimatorPhysics *miccol;
+  MICGreensFunction *micgreens;
+  MICChiSquare *micchi;
+#endif
+
+protected:
+
+  //gives access to dks autotuning config file
+  DKSConfig dksconfig;
+
+  /** 
+   * Check if current API is set to OpenCL
+   * Return true/false whether current API is OpenCL
+   */
+  bool apiOpenCL();
+
+  /** 
+   * Check if current API is set to CUDA.
+   * Return true/false whether current API is CUDA
+   */
+  bool apiCuda();
+
+  /** 
+   * Check if current API is set to OpenMP.
+   * Return true/false whether current api is OpenMP
+   */
+  bool apiOpenMP();
+
+  /** Check if device is GPU */
+  bool deviceGPU();
+  /** Check if device is CPU */
+  bool deviceCPU();
+  /** Check if device is MIC */
+  bool deviceMIC();
+
+  /**
+   * Get the cbase pointer (CUDA base object).
+   * Only available when DKS is built with DKS_CUDA. The stored pointer is
+   * returned as is.
+   */
+#ifdef DKS_CUDA
+  CudaBase *getCudaBase() {
+    return cbase;
+  }
+#endif
+
+  /**
+   * Get the oclbase pointer (OpenCL base object).
+   * Only available when DKS is built with DKS_OPENCL. The stored pointer is
+   * returned as is.
+   */
+#ifdef DKS_OPENCL
+  OpenCLBase *getOpenCLBase() {
+    return oclbase;
+  }
+#endif
+
+  /** Call OpenCL base to load specified kernel file.
+   *
+   */
+  int loadOpenCLKernel(const char *kernel_name);
+
+  /** Return the stored API name, or an empty string when no name is stored. */
+  std::string getAPI() {
+    // Constructing a std::string from a null pointer is undefined behaviour.
+    if (m_api_name == NULL)
+      return std::string();
+    return std::string(m_api_name);
+  }
+
+  /**
+   * Return the stored device name without its first character (device names
+   * are passed as e.g. "-gpu", see setDevice). Returns an empty string when no
+   * name is stored or the stored name is empty.
+   */
+  std::string getDevice() {
+    // Guard the [1] access: a null pointer or an empty string has no second character.
+    if (m_device_name == NULL || m_device_name[0] == '\0')
+      return std::string();
+    return std::string(&m_device_name[1]);
+  }
+
+public:
+
+  /** 
+   * Default constructor.
+   */
+  DKSBase();
+
+  /** 
+   * Constructor that sets api and device to use with DKS.
+   */
+  DKSBase(const char* api_name, const char* device_name);
+
+
+  /** 
+   * Destructor.
+   * Free DKS resources.
+   */
+  ~DKSBase();
+
+  /** Turn on auto tuning */
+  void setAutoTuningOn() { m_auto_tuning = true; }
+
+  /** Turn off auto tuning */
+  void setAutoTuningOff() { m_auto_tuning = false; }
+
+  /** Get status of auto tuning (read-only, callable on const objects) */
+  bool isAutoTuningOn() const { return m_auto_tuning; }
+
+  /** Turn on use of config file */
+  void setUseConfigOn() { m_use_config = true; }
+ 
+  /** Turn off use of config file */
+  void setUseConfigOff() { m_use_config = false; }
+
+  /** Check if using config file (read-only, callable on const objects) */
+  bool isUseConfigOn() const { return m_use_config; }
+
+  /** 
+   * Set device to use with DKS.
+   * Sets specific device to use with DKS. Supported devices are -gpu and -mic.
+   * Length specifies the number of characters in device_name array (length - deprecated).
+   * Return success or error code.
+   */
+  int setDevice(const char* device_name, int length = -1);
+
+  /** 
+   * Set framework to use with DKS.
+   * Sets framework and API that DKS uses to execute code on device. Supported API's 
+   * are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies
+   * the number of characters in api_name array (length - deprecated).
+   */
+  int setAPI(const char* api_name, int length = -1);
+
+  /** 
+   * Prints information about all available devices.
+   * Calls CUDA, OpenCL and MIC functions to query for available devices
+   * for each framework and prints information about each device. Length specifies 
+   * the number of characters in api_name array
+   * Returns success or error code
+   */
+  int getDevices();
+
+  /** 
+   * Returns device count.
+   * Saves the number of the devices available on the platform to ndev.
+   */
+  int getDeviceCount(int &ndev);
+
+  /** Get the name of the device in use.
+   *  Query the device that is used and get the name of the device. The name is saved in the
+   *  device_name string. Returns DKS_SUCCESS
+   */
+  int getDeviceName(std::string &device_name);
+
+  /** Set the device to use.
+   *  Pass the index of the device to use by dks.
+   */
+  int setDefaultDevice(int device);
+
+  /** Get unique devices.
+   *  Get a list of all the unique devices available on the platform.
+   *  When API and device type for DKS is set, getDeviceList can get all the unique devices
+   *  available for this API and device type. Used for autotuning if multiple different GPUs are
+   *  installed on the system.
+   */
+  int getDeviceList(std::vector<int> &devices);
+
+  /** 
+   * Initialize DKS.
+   * Set framework and device to use. If OpenCL is used create context with device.
+   * Return success or error code.
+   */
+  int initDevice();
+
+  /** 
+   * Create stream for async execution.
+   * Function to create different streams with device to allow async kernel execution and data
+   * transfer. Currently implemented for CUDA with cuda streams. streamId can be used later
+   * to refer to the created stream. Returns success or error code.
+   * TODO: for opencl use different 
+   * contexts similar as cuda streams to achieve async execution. TODO: for intel mic look at
+   * library (libxstream) from Hans Pabst.  
+   */
+  int createStream(int &streamId);
+
+  /** 
+   * Send pointer to device memory from one MPI process to another.
+   * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
+   * CUDA IPC. Gets the IPC handle of the device memory pointed to by mem_ptr and does MPI_Send to
+   * dest process where matching receivePointer should be called. Returns success or error code.
+   * TODO: opencl and mic cases still need implementations
+   */
+#ifdef DKS_MPI
+  int sendPointer(void *mem_ptr, int dest, MPI_Comm comm);
+#endif
+
+  /** 
+   * Receive pointer to device memory from another MPI process.
+   * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
+   * CUDA IPC. Uses MPI_Recv to get the IPC handle from another MPI process and opens a reference
+   * to this memory. Together with sendPointer function allows multiple MPI processes to share
+   * one memory region of the device. Returns success or error code. 
+   * TODO: opencl and mic cases still need implementations
+   */
+#ifdef DKS_MPI
+  void * receivePointer(int hostproc, MPI_Comm comm, int &ierr);
+#endif
+
+  /** 
+   * Close handle to device memory.
+   * If receivePointer is used to open memory handle allocated by another MPI process closeHandle
+   * should be called to free resources instead of freeMemory. Returns success or error code.
+   * TODO: opencl and mic cases still need implementations.
+   */
+  int closeHandle(void *mem_ptr);
+
+  /** 
+   * Wait till all tasks running on device are completed.
+   * Forces a device synchronization - waits till all tasks on the device are complete.
+   * Implemented for cuda. Forces sync only in context in which it is called - only waits
+   * for tasks launched by process calling syncDevice. If multiple processes launch different
+   * tasks each process is responsible for its own synchronization. Returns success or error code.
+   * TODO: opencl and mic implementations still necessary
+   */
+  int syncDevice();
+
+  /** 
+   * Allocate memory and transfer data to device.
+   * Returns a void pointer which can be used in later kernels to reference 
+   * allocated device memory. data_in pointer to data to be transfered to device,
+   * elements is the number of data elements to transfer, T - type of data to transfer.
+   * If memory allocation or data transfer fails ierr will be set to error code.
+   */
+  template <typename T>
+  void * pushData(const void *data_in, int elements, int &ierr) {
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version
+      size_t size = sizeof(T)*elements;
+      // Start from success so a failure reported by the allocation is visible afterwards.
+      ierr = DKS_SUCCESS;
+      cl_mem mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
+      if (ierr != DKS_SUCCESS)
+        return NULL;
+
+      // Report a failed transfer instead of overwriting it with DKS_SUCCESS,
+      // and release the buffer that cannot be handed to the caller.
+      ierr = oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE);
+      if (ierr != DKS_SUCCESS) {
+        oclbase->ocl_freeMemory(mem_ptr);
+        return NULL;
+      }
+      return mem_ptr;
+#endif
+    } else if (apiCuda()){
+#ifdef DKS_CUDA
+      //cuda version
+      size_t size = sizeof(T)*elements;
+      ierr = DKS_SUCCESS;
+      void * mem_ptr = cbase->cuda_allocateMemory(size, ierr);
+      if (ierr != DKS_SUCCESS)
+        return NULL;
+
+      ierr = cbase->cuda_writeData((T*)mem_ptr, data_in, size);
+      if (ierr != DKS_SUCCESS) {
+        cbase->cuda_freeMemory(mem_ptr);
+        return NULL;
+      }
+      return mem_ptr;
+#endif
+    } else if (apiOpenMP()) {
+#ifdef DKS_MIC
+      // micbase is a pointer, so members are reached with '->'.
+      // NOTE(review): mic_pushData reports no status here; ierr is set to
+      // DKS_SUCCESS to match the other branches - confirm how it signals failure.
+      void * mem_ptr = micbase->mic_pushData<T>(data_in, elements);
+      ierr = DKS_SUCCESS;
+      return mem_ptr;
+#endif
+    }
+
+    // No API selected, or the selected API was not compiled in.
+    ierr = DKS_ERROR;
+    return NULL;
+  }	
+
+  /** 
+   * Read data from device and free device memory.
+   * Reads data from device pointed by mem_ptr into data_out pointer. Elements
+   * specifies the number of data elements to read, T specifies the datatype of
+   * elements to copy. Returns error code if read data or free memory fails.
+   */
+  template<typename T>
+  int pullData(void *mem_ptr, void* data_out, int elements) {
+
+    // NOTE(review): the results of the read/free calls below are not checked and
+    // DKS_SUCCESS is returned even when no backend matches - confirm whether
+    // callers rely on this before propagating errors.
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version: read the data back, then release the device buffer
+      size_t size = sizeof(T)*elements;
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      oclbase->ocl_readData(clmem_ptr, data_out, size);
+      oclbase->ocl_freeMemory(clmem_ptr);
+#endif
+    } else if (apiCuda()) {
+#ifdef DKS_CUDA
+      //cuda version: read the data back, then release the device buffer
+      size_t size = sizeof(T)*elements;
+      cbase->cuda_readData((T*)mem_ptr, data_out, size);
+      cbase->cuda_freeMemory(mem_ptr);
+#endif
+    } else if (apiOpenMP()) {
+#ifdef DKS_MIC
+      // micbase is a pointer, so members are reached with '->'.
+      micbase->mic_pullData<T>(mem_ptr, data_out, elements);
+#endif
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /** 
+   * Allocate memory on device and return pointer to device memory.
+   * Allocates memory of type T, elements specifies the number of
+   * elements for which memory should be allocated. If memory allocation
+   * fails ierr is set to error code. Returns void pointer to device memory.
+   */
+  template<typename T>
+  void * allocateMemory(int elements, int &ierr) {
+    // Start from success; the backend allocators receive ierr by reference.
+    ierr = DKS_SUCCESS;
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version
+      cl_mem mem_ptr;
+      size_t size = sizeof(T)*elements;
+      mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
+      return mem_ptr;
+#endif
+    } else if (apiCuda()) {
+#ifdef DKS_CUDA
+      //cuda version
+      void * mem_ptr = NULL;
+      size_t size = sizeof(T)*elements;
+      mem_ptr = cbase->cuda_allocateMemory(size, ierr);
+      return mem_ptr;
+#endif
+    } else if (apiOpenMP()) {
+#ifdef DKS_MIC
+      // micbase is a pointer, so members are reached with '->'.
+      void * mem_ptr = NULL;
+      mem_ptr = micbase->mic_allocateMemory<T>(elements);
+      return mem_ptr;
+#endif
+    }
+
+    // No API selected, or the selected API was not compiled in.
+    ierr = DKS_ERROR;
+    return NULL;
+  }
+
+  /** 
+   * Allocates host memory as page-locked.
+   * Used for memory allocation on the host side for pointer ptr for size elements.
+   * Page locked memory improves
+   * data transfer rates between host and device and allows async data transfer
+   * and kernel execution. Returns success or error code.
+   * TODO: opencl and mic implementations needed.
+   */
+  template<typename T>
+  int allocateHostMemory(T *&ptr, int size) 
+  {
+    // Only the CUDA backend provides this; everything else is reported as an error.
+    if (!apiCuda()) {
+      DEBUG_MSG("Pinned memory allocation not implemented for this platform");
+      return DKS_ERROR;
+    }
+
+    return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size));
+  }
+
+  /** 
+   * Free host page-locked memory.
+   * Used to free page-locked memory on the host that was allocated using 
+   * allocateHostMemory. ptr is the host pointer where page-locked memory was allocated,
+   * size - number of elements held by the memory.
+   */
+  template<typename T>
+  int freeHostMemory(T* &ptr, int size) 
+  {
+    // Only the CUDA backend provides this; size is not passed on to it.
+    if (!apiCuda())
+      return DKS_ERROR;
+
+    return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr));
+  }
+
+  /**
+   * Page lock allocated host memory.
+   * Page locked memory improves data transfer between host and device (true for cuda and
+   * opencl, maybe also mic). ptr - pointer to memory that needs to be page locked,
+   * size - number of elements in array.
+   * TODO: mic and opencl implementations needed
+   */
+  template <typename T>
+  int registerHostMemory(T *ptr, int size) {
+    // Only the CUDA backend provides host memory registration.
+    if (!apiCuda())
+      return DKS_ERROR;
+
+    return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size));
+  }
+
+  /**
+   * Unregister page locked memory.
+   * TODO: opencl and mic implementations needed
+   */
+  template <typename T>
+  int unregisterHostMemory(T *ptr) {
+    // Only the CUDA backend provides host memory unregistration.
+    if (!apiCuda())
+      return DKS_ERROR;
+
+    return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr));
+  }
+
+  /** 
+   * Write data from host to device.
+   * Write data from data to device memory referenced by mem_ptr. Elements specify the
+   * number of elements to write, offset specifies the offset from the first element.
+   * Returns success or error code. Performs a blocking write - control to the host
+   * is returned only when data transfer is complete.
+   */
+  template<typename T>
+  int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) {
+
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version: size and offset are converted from elements to bytes
+      size_t size = sizeof(T)*elements;
+      size_t offset_bytes = sizeof(T)*offset;
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      // NOTE(review): CL_FALSE is passed as the last argument; if that is the
+      // blocking flag, this write is not blocking - confirm against ocl_writeData.
+      return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE);
+#endif
+
+    } else if (apiCuda()){
+      //cuda version: size is in bytes, offset is forwarded unchanged
+      size_t size = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
+
+    } else if (apiOpenMP()) {
+      // micbase is a pointer, so members are reached with '->'.
+      return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
+
+    } 
+      
+    // No API selected, or OpenCL selected without DKS_OPENCL compiled in.
+    return DKS_ERROR;
+    
+  }
+
+  /** 
+   * Write data to device using async write.
+   * Queue an async data write and return control to host immediately.
+   * mem_ptr - device memory pointer, data - host memory pointer, 
+   * elements - number of data elements to write
+   * streamId - stream id to use, offset - offset on device from first element
+   * For truly async execution on cuda a stream other than default needs to be created
+   * and device memory must be page-locked. Otherwise the function is only asynchronous with
+   * respect to host.
+   * TODO: mic and opencl implementations needed (goes to blocking writes)
+   */
+  template<typename T>
+  int writeDataAsync(void *mem_ptr, const void *data, int elements, 
+		     int streamId = -1, int offset = 0) {
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version: streamId is not used; size and offset are converted to bytes
+      size_t size = sizeof(T)*elements;
+      size_t offset_bytes = sizeof(T)*offset;
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      // Return the status of the write. Previously the result was dropped and the
+      // function fell through to DKS_ERROR even after a successful write, and the
+      // requested offset was replaced by 0.
+      return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE);
+#endif
+    } else if (apiCuda()){
+      //cuda version: size is in bytes, streamId and offset are forwarded unchanged
+      size_t size = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
+    } else if (apiOpenMP()) {
+      // micbase is a pointer, so members are reached with '->'.
+      return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
+    } 
+    
+    // No API selected, or OpenCL selected without DKS_OPENCL compiled in.
+    return DKS_ERROR;
+
+  }
+
+  /** 
+   * Gather 3D data from multiple mpi processes to one memory region.
+   * When multiple processes share the same device memory using sendPointer and receivePointer
+   * gather3DDataAsync allows each process to write data to its memory region. Uses async writes. 
+   * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
+   * data dimensions, id - starting indexes in global domain for each process
+   * streamId - stream to use for data transfers.
+   * Returns success or error code.
+   */
+#ifdef DKS_MPI
+  template<typename T>
+  int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3], 
+			int id[3], int streamId = -1 ) {
+
+    // The former MPI_Comm_rank query was removed: its result was never used.
+    int hoffset, doffset, ierr;
+
+    //number of continuous memory elements: a full row, a full plane when the
+    //1st dimension is not split, the whole block when the 2nd is not split either
+    int elements = Nl[0];
+    if (Nl[0] == Ng[0]) {
+      elements *= Nl[1];
+      if (Nl[1] == Ng[1])
+	elements *= Nl[2];
+    }
+
+    //starting index of the local block inside the global array
+    int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
+
+    //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
+    if (Nl[0] != Ng[0]) {
+      for (int i = 0; i < Nl[2]; i++) {
+	for (int j = 0; j < Nl[1]; j++) {
+	  doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
+	  hoffset = (i * Nl[1] + j) * elements;
+	  ierr = writeDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
+	  //stop at the first write that does not report success
+	  if (ierr != DKS_SUCCESS) return DKS_ERROR;
+	}
+      }
+      return DKS_SUCCESS;
+    }
+
+    //copy piece by piece 3rd dim if 2nd dim is split
+    if (Nl[1] != Ng[1]) {
+      for (int i = 0; i < Nl[2]; i++) {
+	doffset = i* Ng[1] * Ng[0] + sid;
+	ierr = writeDataAsync<T>(mem_ptr, data + i*elements, elements, streamId, doffset);
+	if (ierr != DKS_SUCCESS) return DKS_ERROR;
+      }
+      return DKS_SUCCESS;
+    }
+
+    //if only 3rd dim is split all elements are continuous so write one chunk
+    doffset = sid;
+    return writeDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
+
+  }
+#endif
+
+  /** 
+   * Scatter 3D data to multiple MPI processes from one device memory region.
+   * When several processes share the same device memory (using sendPointer and
+   * receivePointer) each process calls scatter3DDataAsync to read its local sub-domain
+   * out of the shared global array. Uses async reads.
+   * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
+   * data dimensions, id - starting indexes in global domain for this process,
+   * streamId - stream to use for data transfers.
+   * Index 0 of Ng/Nl/id is the fastest varying (contiguous) dimension.
+   * Returns DKS_SUCCESS if every read succeeded, otherwise the code returned by the
+   * first read that failed (e.g. DKS_ERROR or DKS_API_NOT_ENABLED).
+   */
+#ifdef DKS_MPI
+  template<typename T>
+  int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3], 
+                         int id[3], int streamId = -1) {
+
+    int hoffset, doffset, ierr;
+
+    //number of continuous memory elements: a full row, a full plane or the whole
+    //local block, depending on which dimensions are not split
+    int elements = Nl[0];
+    if (Nl[0] == Ng[0]) {
+      elements *= Nl[1];
+      if (Nl[1] == Ng[1])
+        elements *= Nl[2];
+    }
+
+    //starting index of the local block inside the global array
+    int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
+
+    //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
+    if (Nl[0] != Ng[0]) {
+      for (int i = 0; i < Nl[2]; i++) {
+        for (int j = 0; j < Nl[1]; j++) {
+          doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
+          hoffset = (i * Nl[1] + j) * elements;
+          ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
+          //any non-success code is a failure, not only DKS_ERROR
+          if (ierr != DKS_SUCCESS) return ierr;
+        }
+      }
+      return DKS_SUCCESS;
+    }
+
+    //copy piece by piece 3rd dim if 2nd dim is split
+    if (Nl[1] != Ng[1]) {
+      for (int i = 0; i < Nl[2]; i++) {
+        doffset = i * Ng[1] * Ng[0] + sid;
+        hoffset = i * elements;
+        ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
+        if (ierr != DKS_SUCCESS) return ierr;
+      }
+      return DKS_SUCCESS;
+    }
+
+    //if only 3rd dim is split all elements are continuous so read one chunk
+    doffset = sid;
+    return readDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
+
+  }
+#endif
+
+  /** 
+   * Create MPI subarray for 3D data gather and scatter using cuda aware MPI.
+   * If multiple MPI processes share a device and cuda aware MPI is used for data transfer,
+   * creates an MPI subarray type so each MPI process can write and read its own memory region.
+   * N_global - global domain dimensions, N_local - local domain dimensions (index 0 is the
+   * fastest varying dimension), datatype - MPI datatype of one element.
+   * The extent of the returned type is resized to sizeof(T) so that displacements can be
+   * given in elements. The returned type is committed; the caller owns it and should
+   * release it with MPI_Type_free when it is no longer needed.
+   */
+#ifdef DKS_MPI
+  template<typename T>
+  MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) {
+    //MPI_ORDER_C expects the slowest varying dimension first, so reverse the order
+    int sizes[3] = {N_global[2], N_global[1], N_global[0]};
+    int subsizes[3] = {N_local[2], N_local[1], N_local[0]};
+    int starts[3] = {0, 0, 0};
+
+    MPI_Datatype stype, rtype;
+    MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype);
+    MPI_Type_create_resized(stype, 0, sizeof(T), &rtype);
+    //the intermediate type is only needed to build rtype; freeing it does not
+    //affect types derived from it
+    MPI_Type_free(&stype);
+    MPI_Type_commit(&rtype);
+
+    return rtype;
+  }
+#endif
+
+  /** 
+   * Gather 3D data from multiple MPI processes to device using cuda aware MPI.
+   * Using cuda aware mpi allows to gather data to one device memory region allocated
+   * by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer,
+   * size - number of elements to transfer, stype - data type of elements, N_global - 
+   * global dimensions of the domain, N_local - local domain dimensions, 
+   * idx,idy,idz - starting indexes in global domain for each process (arrays of
+   * numNodes entries), numNodes - number of processes, myNode - current node (unused),
+   * rootNode - node that allocated device memory, comm - MPI communicator.
+   * Returns DKS_SUCCESS on success, DKS_ERROR if the gather fails or the selected
+   * API is OpenCL or OpenMP (not implemented).
+   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
+   */
+#ifdef DKS_MPI
+  template<typename T>
+  int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
+                   int N_local[3], int * idx, int * idy, int * idz, 
+                   int numNodes, int myNode, int rootNode, MPI_Comm comm) 
+  {
+
+    if (apiOpenCL()) {
+      //TODO: gather all the data in root node, transfer to device from root node
+      return DKS_ERROR;
+    } else if (apiCuda()) {
+      //temporary resources are created only on the path that uses them and are
+      //released once the blocking gather has returned
+      MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
+
+      //calculate displacements from global domain size and local domain starting index
+      int *counts = new int[numNodes];
+      int *displs = new int[numNodes];
+      for (int i = 0; i < numNodes; i++) {
+        counts[i] = 1;
+        displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
+      }
+
+      int mpierr = MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, 
+                                rootNode, comm );
+
+      delete[] counts;
+      delete[] displs;
+      MPI_Type_free(&rtype);
+
+      if (mpierr != MPI_SUCCESS)
+        return DKS_ERROR;
+    } else if (apiOpenMP()) {
+      //TODO: gather all the data in root node, transfer to device from root node
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+
+  }
+#endif
+
+  /** 
+   * Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather.
+   * For detailed parameter description see gather3DData docs. request receives the handle
+   * of the pending MPI_Igatherv and must be completed by the caller (MPI_Wait / MPI_Test).
+   * Returns DKS_SUCCESS if the gather was started, DKS_ERROR if starting it failed or the
+   * selected API is OpenCL or OpenMP (not implemented; request is left untouched).
+   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
+   */
+#ifdef DKS_MPI
+  template<typename T>
+  int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
+                        int N_local[3], int * idx, int * idy, int * idz, 
+                        int numNodes, int myNode, int rootNode, 
+                        MPI_Comm comm, MPI_Request &request)
+  {
+
+    if (apiOpenCL()) {
+      //TODO: gather all the data in root node, transfer to device from root node
+      return DKS_ERROR;
+    } else if (apiCuda()) {
+      MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
+
+      //calculate displacements from global domain size and local domain starting index
+      int *counts = new int[numNodes];
+      int *displs = new int[numNodes];
+      for (int i = 0; i < numNodes; i++) {
+        counts[i] = 1;
+        displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
+      }
+
+      int mpierr = MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, 
+                                 rootNode, comm, &request );
+
+      //a datatype may be freed while communication using it is pending,
+      //MPI only marks it for deallocation
+      MPI_Type_free(&rtype);
+
+      if (mpierr != MPI_SUCCESS) {
+        //nothing is pending, so the arrays are no longer referenced
+        delete[] counts;
+        delete[] displs;
+        return DKS_ERROR;
+      }
+
+      //NOTE(review): counts and displs must stay valid until the request completes,
+      //so they cannot be released here and are still leaked; fixing this needs the
+      //caller to own them (interface change).
+    } else if (apiOpenMP()) {
+      //TODO: gather all the data in root node, transfer to device from root node
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+
+  }
+#endif
+
+  /** 
+   * Scatter 3D data from device to multiple MPI processes using cuda aware MPI.
+   * If multiple MPI processes share one device allows to scatter 3D data regions 
+   * from device memory allocated by one of the processes to all other MPI processes.
+   * For detailed parameter description see gather3DData docs (here rtype is the data
+   * type of the elements received on the host).
+   * Returns DKS_SUCCESS on success, DKS_ERROR if the scatter fails or the selected
+   * API is OpenCL or OpenMP (not implemented).
+   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
+   */
+#ifdef DKS_MPI	
+  template<typename T>
+  int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3],
+                    int N_local[3], int * idx, int * idy, int * idz, 
+                    int numNodes, int myNode, int rootNode, MPI_Comm comm) 
+  {
+
+    if (apiOpenCL()) {
+      //TODO: read data on root node from device, scatter from root node
+      //nothing is transferred, so report an error like the gather functions do
+      return DKS_ERROR;
+    } else if (apiCuda()) {
+      //temporary resources are created only on the path that uses them and are
+      //released once the blocking scatter has returned
+      MPI_Datatype stype = create3DMPISubarray<T>(N_global, N_local, rtype);
+
+      //calculate displacements from global domain size and local domain starting index
+      int *counts = new int[numNodes];
+      int *displs = new int[numNodes];
+      for (int i = 0; i < numNodes; i++) {
+        counts[i] = 1;
+        displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
+      }
+
+      //use cuda aware mpi
+      int mpierr = MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, 
+                                 rootNode, comm );
+
+      delete[] counts;
+      delete[] displs;
+      MPI_Type_free(&stype);
+
+      //a completed scatter is a success; only a failed MPI call is an error
+      if (mpierr != MPI_SUCCESS)
+        return DKS_ERROR;
+    } else if (apiOpenMP()) {
+      //TODO: read data on root node from device, scatter from root node
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+
+  }
+#endif	
+
+  /** 
+   * Read data from device memory (blocking).
+   * Copies elements items of type T from the device memory referenced by mem_ptr into
+   * out_data, starting offset items from the beginning of the device memory.
+   * Returns the code reported by the selected backend, or DKS_ERROR if no API is
+   * selected or OpenCL is selected but DKS was built without OpenCL support.
+   */
+  template<typename T>
+  int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) {
+
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL works with byte counts for both the length and the offset
+      const size_t nbytes = sizeof(T)*elements;
+      const size_t skip_bytes = sizeof(T)*offset;
+      return oclbase->ocl_readData((cl_mem)mem_ptr, out_data, nbytes, skip_bytes);
+#else
+      return DKS_ERROR;
+#endif
+    }
+
+    if (apiCuda()) {
+      const size_t nbytes = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, nbytes, offset));
+    }
+
+    if (apiOpenMP()) {
+      return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
+    }
+
+    return DKS_ERROR;
+  }	
+
+  /** 
+   * Performs an async data read from device.
+   * Queues data read from device and returns control to host. streamId specifies the stream
+   * to use for the read. Device async read can be performed if host memory is page-locked and
+   * a stream other than the default -1 is used. For a detailed description of the other
+   * parameters see the readData function.
+   * TODO: opencl and mic implementations (currently reverts to blocking reads).
+   */
+  template<typename T>
+  int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) {
+
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version (blocking read, streamId is not used)
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      size_t size = sizeof(T)*elements;
+      //honor the requested offset, converted to bytes the same way readData does
+      size_t offset_bytes = sizeof(T)*offset;
+      return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
+#endif
+    } else if (apiCuda()){
+      //cuda version
+      size_t size = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
+    } else if (apiOpenMP()) {
+      return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements, 
+                                                       streamId, offset));
+    }
+
+    return DKS_ERROR;
+  }	
+
+
+  /** 
+   * Release device memory.
+   * mem_ptr - device memory to free, elements - number of elements of type T stored in it
+   * (only forwarded to the OpenMP/MIC backend).
+   * Returns the code of the selected backend, DKS_ERROR if no API is selected.
+   */
+  template<typename T>
+  int freeMemory(void *mem_ptr, int elements) {
+    int status = DKS_ERROR;
+
+    if (apiOpenCL()) {
+      status = OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr));
+    } else if (apiCuda()) {
+      status = CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
+    } else if (apiOpenMP()) {
+      status = MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
+    }
+
+    return status;
+  }
+
+
+  ///////////////////////////////////////////////
+  ///////Function library part of dksbase////////
+  ///////////////////////////////////////////////
+
+  /** 
+   * Setup FFT function.
+   * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
+   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case 
+   * each fft will do its own setup according to fft size and dimensions.
+   * TODO: opencl and mic implementations
+   */
+  int setupFFT(int ndim, int N[3]);
+  //BENI:
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  //BENI:
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+
+  /** 
+   * Call complex-to-complex fft.
+   * Executes in place complex to complex fft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation
+   */
+  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex-to-complex ifft.
+   * Executes in place complex to complex ifft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation.
+   */
+  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize complex to complex ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size
+   * TODO: mic implementation.
+   */
+  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call real to complex FFT.
+   * Executes out of place real to complex fft, real_ptr - points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize - size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations
+   */
+  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Call complex to real iFFT.
+   * Executes out of place complex to real ifft, real_ptr - points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize - size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations.
+   */
+  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /** 
+   * Normalize complex to real ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by 
+   * fft size.
+   * TODO: opencl and mic implementations.
+   */
+  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Transpose 2D and 3D arrays, OpenCL implementation
+   * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose 
+   */
+  int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
+
+  /** 
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, 
+			 double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
+
+  /** 
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegration(void *mem_ptr, void *tmp_ptr, 
+			    int I, int J, int K, int streamId = -1);
+
+  /** 
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
+
+  /** 
+   * Element by element multiplication.
+   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
+   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
+   * TODO: opencl and mic implementations.
+   */
+  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
+
+  /** 
+   * Chi square for parameter fitting on device.
+   * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for 
+   * intermediate results. Chi square results are put in &results
+   */
+  int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, 
+		      double fTimeResolution, double fRebin,
+		      int sensors, int length, int numpar, double &result);
+
+  /** 
+   * max-log-likelihood for parameter fitting on device.
+   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
+   * mem_par - pointer to parameter set, mem_results - pointer for 
+   * intermediate results. Chi square results are put in &results.
+   * TODO: opencl and mic implementations.
+   */
+  int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			double fTimeResolution, double fRebin, double fGoodBinOffser,
+			int sensors, int length, int numpar,
+			double &result);
+
+  /** 
+   * max-log-likelihood for parameter fitting on device.
+   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, 
+   * mem_par - pointer to parameter set, mem_results - pointer for 
+   * intermediate results. Chi square results are put in &results.
+   * TODO: opencl and mic implementations.
+   */
+  int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			  double fTimeResolution, double fRebin, double fGoodBinOffser,
+			  int sensors, int length, int numpar,
+			  double &result);
+
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics(void *mem_ptr, void *par_ptr, 
+			    int numparticles, int numparams, 
+			    int &numaddback, int &numdead);
+
+
+  
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
+
+  /** 
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
+   */
+  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			       void *px_ptr, void *py_ptr, void *pz_ptr,
+			       void *par_ptr, int numparticles);
+
+  /**
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
+
+  /**
+   * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+				   void *px_ptr, void *py_ptr, void *pz_ptr,
+				   void *par_ptr, int numparticles, int &numaddback);
+
+  /** 
+   * Init random number states and save for reuse on device.
+   * TODO: opencl and mic implementations.
+   */
+  int callInitRandoms(int size);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, 
+			       void *dt_ptr, double dt, double c, 
+			       bool usedt = false, int streamId = -1);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
+					void *lastSec_ptr, void *orient_ptr, 
+					int npart, int nsec, void *dt_ptr,
+					double dt, double c, bool usedt = false, 
+					int streamId = -1);
+
+  /**
+   * Print memory information of the device (total, used, available).
+   * Only available with the CUDA API; every other API yields DKS_ERROR.
+   * TODO: opencl and mic implementation
+   */
+  int callMemInfo() {
+    if (!apiCuda())
+      return DKS_ERROR;
+
+    return CUDA_SAFECALL(cbase->cuda_memInfo());
+  }
+
+  /** 
+   * Test function to profile opencl kernel calls.
+   * Used for debuging and timing purposes only.
+   */
+  void oclEventInfo() {
+    if (apiOpenCL())
+      return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
+
+  }
+
+  /** 
+   * Test function to profile opencl kernel calls: clears the OpenCL events through
+   * ocl_clearEvents. Used for debugging and timing purposes only.
+   * No-op unless the OpenCL API is selected and DKS was built with OpenCL support.
+   */	
+  void oclClearEvents() {
+    if (!apiOpenCL())
+      return;
+
+#ifdef DKS_OPENCL
+    oclbase->ocl_clearEvents();
+#endif
+  }
+
+
+};
+
+#endif
diff --git a/src/DKSBaseMuSR.cpp b/src/DKSBaseMuSR.cpp
new file mode 100644
index 0000000..3df59e9
--- /dev/null
+++ b/src/DKSBaseMuSR.cpp
@@ -0,0 +1,196 @@
+#include "DKSBaseMuSR.h"
+
+// Start without a chi-square runtime and with no problem size cached.
+DKSBaseMuSR::DKSBaseMuSR()
+  : chiSq(nullptr),
+    chiSquareSize_m(-1)
+{
+}
+
+// Release the chi-square runtime, if one was created by initChiSquare.
+DKSBaseMuSR::~DKSBaseMuSR() { 
+  freeChiSquare();
+}
+
+// Forward the theory function string to the chi-square runtime for compilation.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::callCompileProgram(std::string function, bool mlh) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->compileProgram(function, mlh);
+}
+
+// Launch the chi-square kernel for one data set and, when auto-tuning is enabled,
+// run the auto-tuner afterwards. Returns the status of the kernel launch, or
+// DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::callLaunchChiSquare(int fitType,
+                                     void *mem_data, void *mem_err, int length,
+                                     int numpar, int numfunc, int nummap,
+                                     double timeStart, double timeStep, 
+                                     double &result) 
+{
+
+  if (chiSq == NULL) {
+    DEBUG_MSG("Chi square runtime not initialized, call initChiSquare first");
+    return DKS_ERROR;
+  }
+
+  //if we are not auto tuning and the size of the problem has changed find the new parameters
+  //from autotuning config file
+  if (!isAutoTuningOn() && length != chiSquareSize_m) {
+    //NOTE(review): numBlocks and blockSize are passed on uninitialized if
+    //getConfigParameter does not assign them - confirm it always does
+    int numBlocks, blockSize;
+    std::string device_name;
+    getDeviceName(device_name);
+    dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", 
+                                 length, "NumBlocks", numBlocks);
+    dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", 
+                                 length, "BlockSize", blockSize);
+    chiSq->setKernelParams(numBlocks, blockSize);
+
+    //remember the size the kernel parameters were looked up for
+    chiSquareSize_m = length;
+  } 
+
+  int ierr = chiSq->launchChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, 
+                                    nummap, timeStart, timeStep, result);
+ 
+  if ( isAutoTuningOn() ) {
+    std::vector<int> config;
+    callAutoTuningChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, nummap, timeStart, 
+                            timeStep, result, config);
+  }
+ 
+  return ierr;
+}
+
+// Auto-tune block size and number of blocks of the chi-square kernel for the selected
+// device using a line search, store the result in the DKS config and append
+// {blockSize, numBlocks} to config. Returns DKS_SUCCESS, or DKS_ERROR if initChiSquare
+// has not created a runtime yet.
+int DKSBaseMuSR::callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
+                                         int numpar, int numfunc, int nummap,
+                                         double timeStart, double timeStep,
+                                         double &result, std::vector<int> &config)
+{
+
+  if (chiSq == NULL) {
+    DEBUG_MSG("Chi square runtime not initialized, call initChiSquare first");
+    return DKS_ERROR;
+  }
+
+  int loops = 100;
+  DKSAutoTuning *autoTuning;
+  if (apiCuda())
+    autoTuning = new DKSAutoTuning(this, API_CUDA, DEVICE_GPU_NEW, loops);
+  else if (apiOpenCL() && deviceGPU())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_GPU_NEW, loops);
+  else if (apiOpenCL() && deviceCPU())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_CPU_NEW, loops);
+  else if (apiOpenCL() && deviceMIC())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_MIC_NEW, loops);
+  else
+    autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW, loops);
+
+
+  //upper bound for the block size search, adjusted by the kernel check
+  int maxThreadsPerBlock = 1024;
+  checkMuSRKernels(fitType, maxThreadsPerBlock);
+  std::cout << "Max threads for autotune " << maxThreadsPerBlock << std::endl;
+
+  //create the function to be timed
+  //std::bind stores a copy of result, so the timed launches do not overwrite
+  //the caller's value
+  std::function<int()> f = std::bind(&ChiSquareRuntime::launchChiSquare, chiSq, 
+                                     fitType, mem_data, mem_err, length, numpar, numfunc, nummap, 
+                                     timeStart, timeStep, result);
+  autoTuning->setFunction(f, "launchChiSquare");
+  
+  //create the parameters for auto-tuning
+  autoTuning->addParameter(&chiSq->blockSize_m, 32, maxThreadsPerBlock, 32, "BlockSize");
+  autoTuning->addParameter(&chiSq->numBlocks_m, 100, 5000, 100, "NumBlocks");
+
+  autoTuning->lineSearch();
+
+  //autoTuning->hillClimbing(100);
+
+  //autoTuning->simulatedAnnealing(1e-3, 1e-6);
+  
+  //autoTuning->exaustiveSearch();
+
+  //save the tuned values for this device and problem size
+  std::string device_name;
+  getDeviceName(device_name);
+  dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length, 
+                               "NumBlocks", chiSq->numBlocks_m);
+  dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length, 
+                               "BlockSize", chiSq->blockSize_m);
+  
+
+  config.push_back(chiSq->blockSize_m);
+  config.push_back(chiSq->numBlocks_m);
+
+  delete autoTuning;
+
+  return DKS_SUCCESS;
+
+}
+
+// Debug helper: runs the exhaustive, hill climbing and simulated annealing searches of
+// the auto-tuner on the DKSAutoTuningTester::peaksZ test function. Returns DKS_SUCCESS.
+int DKSBaseMuSR::testAutoTuning() {
+
+  DKSAutoTuning *autoTuning;
+  DKSAutoTuningTester *tester;
+
+  autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW);
+  tester = new DKSAutoTuningTester();
+
+  std::function<double()> f = std::bind(&DKSAutoTuningTester::peaksZ, tester);
+  autoTuning->setFunction(f, "testAutoTuner", false);
+
+  autoTuning->addParameter(&tester->x, -3.0, 3.0, 0.5, "x");
+  autoTuning->addParameter(&tester->y, -3.0, 3.0, 0.5, "y");
+
+  autoTuning->exaustiveSearch();
+
+  autoTuning->hillClimbing(10);
+
+  autoTuning->simulatedAnnealing(10, 0.0005);
+
+  //the tuner holds pointers into tester, so release the tuner first
+  delete autoTuning;
+  delete tester;
+
+  return DKS_SUCCESS;
+}
+
+// Forward N0, tau and bkg to the chi-square runtime.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::callSetConsts(double N0, double tau, double bkg) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->setConsts(N0, tau, bkg);
+}
+
+// Forward alpha and beta to the chi-square runtime.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::callSetConsts(double alpha, double beta) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->setConsts(alpha, beta);
+}
+
+// Create the chi-square runtime for the selected API (CUDA if selected, OpenCL
+// otherwise) and initialize it with the given sizes. Returns the runtime's init status,
+// or DKS_ERROR if DKS was built without support for the selected API.
+int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int size_map) {
+  int ierr;
+
+  //release a runtime left over from an earlier call so it is not leaked
+  freeChiSquare();
+
+  if (apiCuda()) {
+    ierr = CUDA_SAFECALL( DKS_SUCCESS );
+    chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
+  } else {
+    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
+    //SAFEINIT (not SAFECALL) yields NULL when OpenCL support is disabled;
+    //SAFECALL would try to assign an integer error code to the pointer
+    chiSq = OPENCL_SAFEINIT(new OpenCLChiSquareRuntime(getOpenCLBase()));
+  }
+
+  if (ierr == DKS_SUCCESS) {
+    return chiSq->initChiSquare(size_data, size_param, size_func, size_map);
+  } else {
+    DEBUG_MSG("DKS API not set, or DKS compiled without sellected API support");
+    return DKS_ERROR;
+  }
+}
+
+// Free the device storage of the chi-square runtime and destroy the runtime.
+// Returns the status reported by the runtime, DKS_SUCCESS if there was nothing to free.
+int DKSBaseMuSR::freeChiSquare()  {
+  if (chiSq == NULL)
+    return DKS_SUCCESS;
+
+  const int status = chiSq->freeChiSquare();
+  delete chiSq;
+  chiSq = NULL;
+
+  return status;
+}
+
+// Forward the parameter array to the chi-square runtime.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::writeParams(const double *params, int numparams) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->writeParams(params, numparams);
+}
+
+// Forward the precalculated function values to the chi-square runtime.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::writeFunctions(const double *func, int numfunc) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->writeFunc(func, numfunc);
+}
+
+// Forward the map indexes to the chi-square runtime.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::writeMaps(const int *map, int numfunc) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->writeMap(map, numfunc);
+}
+
+// Run the runtime's kernel check for fitType, discarding the thread limit it reports.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::checkMuSRKernels(int fitType) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  int threadsPerBlock = 1;
+  return chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
+}
+
+// Run the runtime's kernel check for fitType; threadsPerBlock is passed through by
+// reference so the runtime can report the device limit.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::checkMuSRKernels(int fitType, int &threadsPerBlock) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
+}
+
+// Query the chi-square runtime for the number of operations in the compiled kernel.
+// Returns DKS_ERROR if initChiSquare has not created a runtime yet.
+int DKSBaseMuSR::getOperations(int &oper) {
+  if (chiSq == NULL)
+    return DKS_ERROR;
+
+  return chiSq->getOperations(oper);
+}
diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h
new file mode 100644
index 0000000..30f2d89
--- /dev/null
+++ b/src/DKSBaseMuSR.h
@@ -0,0 +1,137 @@
+#ifndef H_DKS_BASEMUSR
+#define H_DKS_BASEMUSR
+
+#include <iostream>
+#include <string>
+
+#include "AutoTuning/DKSAutoTuning.h"
+#include "AutoTuning/DKSAutoTuningTester.h"
+
+#include "DKSBase.h"
+
+#include "Algorithms/ChiSquareRuntime.h"
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaChiSquareRuntime.cuh"
+#endif
+
+#ifdef DKS_OPENCL
+#include "OpenCL/OpenCLChiSquareRuntime.h"
+#endif
+
+/** DKS front end for the muSR chi-square / max-log-likelihood fitting kernels.
+ * Extends DKSBase and forwards the fitting calls to a ChiSquareRuntime that is created
+ * by initChiSquare for the selected API.
+ */
+class DKSBaseMuSR : public DKSBase {
+
+private:
+
+  // runtime doing the actual work; NULL until initChiSquare succeeds and after freeChiSquare
+  ChiSquareRuntime *chiSq;
+
+  // data length for which kernel parameters were last loaded from the config (-1: none yet)
+  int chiSquareSize_m;
+
+public:
+
+  DKSBaseMuSR();
+
+  ~DKSBaseMuSR();
+
+  /** Compile the program with kernels to be run.
+   * String function contains the string that will be added to the code to compile in the
+   * function: __device__ double fTheory(double t, double *p, double *f, int *m);
+   * Function string must be a valid C math expression. It can contain operators, math functions
+   * and predefined functions listed in:
+   * http://lmu.web.psi.ch/musrfit/user/MUSR/MusrFit.html#A_4.3_The_THEORY_Block
+   * Predefined functions can be accessed by the abbreviation given in the table.
+   * Parameters can be accessed in the form p[idx] or p[m[idx]] - where p represents the parameter
+   * array, m represents the map array and idx is the index to use from the maps. Precalculated
+   * function values can be accessed the same way - f[idx] or f[m[idx]]. Returns DKS_SUCCESS if
+   * everything runs successfully, otherwise returns DKS_ERROR. If DKS is compiled with debug
+   * flag enabled prints DKS error message in case something fails
+   */
+  int callCompileProgram(std::string function, bool mlh = false);
+
+  /** Launch chi square calculation on data set written in mem_data memory on device.
+   * mem_data and mem_err are device pointers for this data set, length is the number of
+   * elements in it; numpar, numfunc and nummap give the number of parameters, functions
+   * and maps (these are written separately with writeParams, writeFunctions and writeMaps).
+   * Resulting chi square value for this dataset will be put in the result variable.
+   * Returns DKS_SUCCESS if everything runs successfully, otherwise returns
+   * DKS_ERROR. If DKS is compiled with debug flag enabled prints DKS error message in case
+   * something fails
+   */
+  int callLaunchChiSquare(int fitType,
+			  void *mem_data, void *mem_err, int length,
+			  int numpar, int numfunc, int nummap,
+			  double timeStart, double timeStep,
+			  double &result);
+
+  /** Launch auto-tuning of chisquare function for the selected device.
+   *  Binds the chi square launch with the given arguments into a function object that is
+   *  timed by the AutoTuning class. CUDA and OpenCL version - gives the AutoTuning class
+   *  access to the block size and number of blocks, which are varied to find the optimal
+   *  values. The current implementation uses a line search. The selected block size and
+   *  number of blocks are appended to config.
+   */
+  int callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
+			      int numpar, int numfunc, int nummap,
+			      double timeStart, double timeStep,
+			      double &result, std::vector<int> &config);
+
+  /** Set N0, tau and BKG values for the run.
+   * Needs to be called before kernel launch if these values are changing
+   */
+  int callSetConsts(double N0, double tau, double bkg);
+
+  /** Set alpha and beta values for the run.
+   * Needs to be called before kernel launch if these values are changing
+   */
+  int callSetConsts(double alpha, double beta);
+
+  /** Init chisquare calculations.
+   * Size is the maximum number of elements in any of the data sets used.
+   */
+  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
+
+  /** Free temporary device storage allocated for chi^2 kernel.
+   * Return error code if freeing the device fails.
+   */
+  int freeChiSquare();
+
+  /** Write params to device.
+   * Write params from double array to device, params device memory is managed by DKS.
+   */
+  int writeParams(const double *params, int numparams);
+
+  /** Write function values to device.
+   * Write precalculated function values to device, memory for functions on device is handled
+   * by DKS.
+   */
+  int writeFunctions(const double *func, int numfunc);
+
+  /** Write map indexes to device.
+   * Write map indexes to use in defined theory function to device. Memory for map indexes is
+   * handled by DKS.
+   */
+  int writeMaps(const int *map, int numfunc);
+
+  /** Check if device can run necessary kernels.
+   * Check selected device properties to see if device
+   * supports double precision and if device can run the
+   * necessary number of work_items / work_groups to successfully
+   * execute CUDA/OpenCL kernels.
+   */
+  int checkMuSRKernels(int fitType);
+
+  /** Perform the same check as checkMuSRKernels(int fitType) and return max threads per block.
+   * Used for autotuning to check what is the device limit for threads per block to correctly
+   * set the upper bound when searching the parameter space.
+   */
+  int checkMuSRKernels(int fitType, int &threadsPerBlock);
+
+  /** Debug function to test auto-tuning search functions
+   */
+  int testAutoTuning();
+
+  /** Get the number of operations in compiled kernel.
+   */
+  int getOperations(int &oper);
+
+};
+
+#endif
diff --git a/src/DKSDefinitions.h b/src/DKSDefinitions.h
new file mode 100644
index 0000000..63fba34
--- /dev/null
+++ b/src/DKSDefinitions.h
@@ -0,0 +1,71 @@
+#ifndef H_DKS_DEFINITIONS
+#define H_DKS_DEFINITIONS
+
+#define API_OPENCL "OpenCL"
+#define API_CUDA "Cuda"
+#define API_OPENMP "OpenMP"
+#define API_UNKNOWN "Unknown"
+
+#define DEVICE_GPU_NEW "GPU"
+#define DEVICE_CPU_NEW "CPU"
+#define DEVICE_MIC_NEW "MIC"
+#define DEVICE_UNKNOWN_NEW "Unknown"
+
+#define DEVICE_GPU "-gpu"
+#define DEVICE_CPU "-cpu"
+#define DEVICE_MIC "-mic"
+
+//define macro for printing debug messages if debug flag is set
+#ifdef DEBUG
+#define DEBUG_MSG(x) (std::cout << x << std::endl)
+#else
+#define DEBUG_MSG(x)
+#endif
+
+//define DKS error codes
+#define DKS_SUCCESS 0
+#define	DKS_ERROR 1
+#define DKS_API_NOT_ENABLED 100
+
+#define OCL_SUCCESS 0
+#define OCL_ERROR 1
+
+//define macros to enable or disable calls to specific frameworks
+//if the framework specific flag is set execute the statement, if not give DKS_API_NOT_ENABLED error
+#ifdef DKS_CUDA
+#define CUDA_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define CUDA_SAFECALL(...) ( DKS_API_NOT_ENABLED )
+#endif
+
+#ifdef DKS_OPENCL
+#define OPENCL_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define OPENCL_SAFECALL(...) ( DKS_API_NOT_ENABLED )
+#endif
+
+#ifdef DKS_MIC
+#define MIC_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define MIC_SAFECALL(...) ( DKS_API_NOT_ENABLED )
+#endif
+
+#ifdef DKS_CUDA
+#define CUDA_SAFEINIT(x) ( x )
+#else
+#define CUDA_SAFEINIT(x) ( NULL )
+#endif
+
+#ifdef DKS_OPENCL
+#define OPENCL_SAFEINIT(x) ( x )
+#else
+#define OPENCL_SAFEINIT(x) ( NULL )
+#endif
+
+#ifdef DKS_MIC
+#define MIC_SAFEINIT(x) ( x )
+#else
+#define MIC_SAFEINIT(x) ( NULL )
+#endif
+
+#endif
diff --git a/src/DKSDevice.cpp b/src/DKSDevice.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/src/DKSDevice.h b/src/DKSDevice.h
new file mode 100644
index 0000000..79a69fe
--- /dev/null
+++ b/src/DKSDevice.h
@@ -0,0 +1,37 @@
+/*
+
+Author: Uldis Locans
+
+Info: class that holds information about the compute device
+
+Date: 25.09.2014
+
+*/
+
+#define DKS_DEVICE_TYPE_GPU 1
+#define DKS_DEVICE_TYPE_MIC 2
+#define DKS_DEVICE_TYPE_CPU 3
+
+class Device {
+
+	private:
+		int m_device_id;
+		int m_device_type; // presumably one of the DKS_DEVICE_TYPE_* values defined above - TODO confirm
+		char *m_device_name;
+		char *m_device_vendor;
+		// flags for the programming models supported by the device (inferred from the names)
+		bool m_sup_opencl;
+		bool m_sup_cuda;
+		bool m_sup_openmp;
+		bool m_sup_openacc;
+		
+		int m_pci_bus_id;
+	
+	public:
+	
+		Device();
+		~Device();
+		// NOTE(review): DKSDevice.cpp is added empty in this commit, so the constructor and
+		// destructor are declared but not defined there; this header also has no include guard
+
+};
\ No newline at end of file
diff --git a/src/DKSImageReconstruction.cpp b/src/DKSImageReconstruction.cpp
new file mode 100644
index 0000000..5f2222a
--- /dev/null
+++ b/src/DKSImageReconstruction.cpp
@@ -0,0 +1,130 @@
+#include "DKSImageReconstruction.h"
+
+DKSImageRecon::DKSImageRecon() {
+  //image reconstruction is always run with cuda, so the base is configured
+  //for the Cuda API on a gpu device before the device is initialized
+  setAPI(API_CUDA);
+  setDevice(DEVICE_GPU);
+  initDevice();
+  //CUDA_SAFEINIT evaluates to NULL when DKS is built without DKS_CUDA
+  imageRecon = CUDA_SAFEINIT( new CudaImageReconstruction(getCudaBase()) );
+}
+
+DKSImageRecon::~DKSImageRecon() {
+  delete imageRecon; //created with scalar new in the constructor; deleting NULL (no CUDA) is a no-op
+}
+
+int DKSImageRecon::callCalculateSource(void *image_space, void *image_position,
+				       void *source_position, void *avg, void *std,
+				       float diameter, int total_voxels,
+				       int total_sources, int start)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+  return imageRecon->calculateSource(image_space, image_position, source_position,
+				     avg, std, diameter, total_voxels,
+				     total_sources, start);
+}
+
+int DKSImageRecon::callCalculateBackground(void *image_space, void *image_position,
+					   void *source_position, void *avg, void *std,
+					   float diameter, int total_voxels,
+					   int total_sources, int start)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+
+  return imageRecon->calculateBackground(image_space, image_position,
+					 source_position, avg, std, diameter,
+					 total_voxels, total_sources, start);
+}
+
+int DKSImageRecon::callCalculateSources(void *image_space, void *image_position,
+					void *source_position, void *avg, void *std,
+					void *diameter, int total_voxels,
+					int total_sources, int start)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+  return imageRecon->calculateSources(image_space, image_position,
+				      source_position, avg, std, diameter,
+				      total_voxels, total_sources, start);
+}
+
+int DKSImageRecon::callCalculateBackgrounds(void *image_space, void *image_position,
+					    void *source_position, void *avg, void *std,
+					    void *diameter, int total_voxels,
+					    int total_sources, int start)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT),
+  //report that instead of dereferencing the null pointer
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+
+  return imageRecon->calculateBackgrounds(image_space, image_position,
+					  source_position, avg, std, diameter,
+					  total_voxels, total_sources, start);
+}
+
+
+int DKSImageRecon::callGenerateNormalization(void *recon, void *image_position,
+					     void *det_position, int total_det)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+  return imageRecon->generateNormalization(recon, image_position,
+					   det_position, total_det);
+}
+
+
+int DKSImageRecon::callForwardProjection(void *correction, void *recon, void *list_data,
+					 void *det_position, void *image_position, int num_events)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+
+  return imageRecon->forwardProjection(correction, recon, list_data, det_position,
+				       image_position, num_events);
+}
+
+int DKSImageRecon::callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
+					  void *det_position, void *image_position,
+					  int num_events, int num_voxels)
+{
+  //imageRecon is NULL when DKS is built without CUDA (see CUDA_SAFEINIT)
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED;
+
+  return imageRecon->backwardProjection(correction, recon_corrector, list_data,
+					det_position, image_position, num_events,
+					num_voxels);
+}
+
+int DKSImageRecon::setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) {
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED; //no CUDA backend was created
+  return imageRecon->setDimensions(voxel_x, voxel_y, voxel_z, voxel_size);
+}
+
+int DKSImageRecon::setEdge(float x_edge, float y_edge, float z_edge) {
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED; //no CUDA backend was created
+  return imageRecon->setEdge(x_edge, y_edge, z_edge);
+}
+
+int DKSImageRecon::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) {
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED; //no CUDA backend was created
+  return imageRecon->setEdge1(x_edge1, y_edge1, z_edge1, z_edge2);
+}
+
+int DKSImageRecon::setMinCrystalInRing(float min_CrystalDist_InOneRing,
+				       float min_CrystalDist_InOneRing1)
+{
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED; //no CUDA backend was created
+  return imageRecon->setMinCrystalInRing(min_CrystalDist_InOneRing,
+					 min_CrystalDist_InOneRing1);
+}
+
+int DKSImageRecon::setParams(float matrix_distance_factor, float phantom_diameter,
+			     float atten_per_mm, float ring_diameter)
+{
+  if (imageRecon == NULL) return DKS_API_NOT_ENABLED; //no CUDA backend was created
+  return imageRecon->setParams(matrix_distance_factor, phantom_diameter,
+			       atten_per_mm, ring_diameter);
+}
diff --git a/src/DKSImageReconstruction.h b/src/DKSImageReconstruction.h
new file mode 100644
index 0000000..32f67ef
--- /dev/null
+++ b/src/DKSImageReconstruction.h
@@ -0,0 +1,120 @@
+#ifndef H_DKS_IMAGERECONSTRUCTION
+#define H_DKS_IMAGERECONSTRUCTION
+
+#include <iostream>
+#include "DKSBase.h"
+
+#include "Algorithms/ImageReconstruction.h"
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaImageReconstruction.cuh"
+#endif
+
+class DKSImageRecon : public DKSBase {
+
+private:
+  
+  ImageReconstruction *imageRecon;
+
+public:
+
+  DKSImageRecon();
+
+  ~DKSImageRecon();
+
+  /** Image reconstruction analysis - calculate source.
+   * Forwards all arguments to ImageReconstruction::calculateSource and returns its error code.
+   * A single float diameter is passed; start defaults to 0.
+   */
+  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
+			  void *avg, void *std, float diameter, int total_voxels, 
+			  int total_sources, int start = 0);
+
+  /** Image reconstruction analysis - calculate background.
+   * Forwards all arguments to ImageReconstruction::calculateBackground and returns its error code.
+   * A single float diameter is passed; start defaults to 0.
+   */
+  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
+			      void *avg, void *std, float diameter, int total_voxels, 
+			      int total_sources, int start = 0);
+
+
+  /** Image reconstruction analysis - calculate sources.
+   * Forwards all arguments to ImageReconstruction::calculateSources and returns its error code.
+   * diameter is a pointer here (presumably one value per source - TODO confirm).
+   */
+  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
+			   void *avg, void *std, void *diameter, int total_voxels, 
+			   int total_sources, int start = 0);
+
+  /** Image reconstruction analysis - calculate backgrounds.
+   * Forwards all arguments to ImageReconstruction::calculateBackgrounds and returns its error code.
+   * diameter is a pointer here (presumably one value per source - TODO confirm).
+   */
+  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
+			       void *avg, void *std, void *diameter, int total_voxels, 
+			       int total_sources, int start = 0);
+
+  /** Image reconstruction - generate normalization.
+   * Forwards to ImageReconstruction::generateNormalization and returns its error code.
+   */
+  int callGenerateNormalization(void *recon, void *image_position, 
+				void *det_position, int total_det);
+
+  /** Image reconstruction - forward projection.
+   * Forwards to ImageReconstruction::forwardProjection and returns its error code.
+   */
+  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
+			    void *image_position, int num_events);
+
+  /** Image reconstruction - backward projection.
+   * Forwards to ImageReconstruction::backwardProjection and returns its error code.
+   */
+  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
+			     void *det_position, void *image_position, 
+			     int num_events, int num_voxels);
+
+  /** Set the voxel dimensions on device.
+   * Values are stored in GPU memory and used in forward and backward projection calculations.
+   * Call set function once to transfer the values from host side to GPU.
+   * If a value changes on the host side the set function needs to be called again to update GPU values.
+   */
+  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
+
+  /** Set the image edge.
+   * Values are stored in GPU memory and used in forward and backward projection calculations.
+   * Call set function once to transfer the values from host side to GPU.
+   * If a value changes on the host side the set function needs to be called again to update GPU values.
+   */
+  int setEdge(float x_edge, float y_edge, float z_edge);
+
+  /** Set the image edge1.
+   * Values are stored in GPU memory and used in forward and backward projection calculations.
+   * Call set function once to transfer the values from host side to GPU.
+   * If a value changes on the host side the set function needs to be called again to update GPU values.
+   */
+  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
+
+  /** Set the minimum crystal in one ring values.
+   * Values are stored in GPU memory and used in forward and backward projection calculations.
+   * Call set function once to transfer the values from host side to GPU.
+   * If a value changes on the host side the set function needs to be called again to update GPU values.
+   */
+  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
+
+  /** Set all other required parameters for reconstruction.
+   * Values are stored in GPU memory and used in forward and backward projection calculations.
+   * Call set function once to transfer the values from host side to GPU.
+   * If a value changes on the host side the set function needs to be called again to update GPU values.
+   */
+  int setParams(float matrix_distance_factor, float phantom_diameter,
+		float atten_per_mm, float ring_diameter);
+
+
+
+
+
+};
+
+
+#endif
diff --git a/src/DKSStream.h b/src/DKSStream.h
new file mode 100644
index 0000000..17e1089
--- /dev/null
+++ b/src/DKSStream.h
@@ -0,0 +1,24 @@
+/*
+  Author: Uldis Locans
+  
+  Date: 12.12.2014
+
+  Comment: based on the device used, create different cuda streams, opencl contexts (mic - don't know yet)
+  that allow handling of asynchronous data transfer and kernel execution on the device
+     
+*/
+
+#ifndef H_DKSSTREAM
+#define H_DKSSTREAM
+
+#define DKS_SUCCESS 0
+#define DKS_ERROR 1
+
+#include <iostream>
+#include <cuda_runtime.h>
+
+class DKSStream { //placeholder, no members are declared yet
+
+}; //a class definition must be terminated with a semicolon
+
+#endif //H_DKSSTREAM, closes the include guard opened at the top of the header
diff --git a/src/MIC/CMakeLists.txt b/src/MIC/CMakeLists.txt
new file mode 100644
index 0000000..d9b8dcd
--- /dev/null
+++ b/src/MIC/CMakeLists.txt
@@ -0,0 +1,25 @@
+#source files of the Intel MIC backend
+set (_SRCS
+  MICBase.cpp
+  MICChiSquare.cpp
+  MICFFT.cpp
+  MICGreensFunction.cpp
+  MICCollimatorPhysics.cpp
+  )
+
+#header files of the Intel MIC backend, these are also installed below
+set (_HDRS
+  MICBase.h
+  MICChiSquare.h
+  MICFFT.h
+  MICCollimatorPhysics.h
+  MICGreensFunction.hpp
+  MICMergeSort.h
+  )
+
+#add_sources / add_headers are not CMake builtins, presumably macros of the parent project
+add_sources (${_SRCS})
+add_headers (${_HDRS})
+
+#install the headers into the include/MIC directory below the install prefix
+install (FILES ${_HDRS} DESTINATION include/MIC)
diff --git a/src/MIC/MICBase.cpp b/src/MIC/MICBase.cpp
new file mode 100644
index 0000000..43c15c4
--- /dev/null
+++ b/src/MIC/MICBase.cpp
@@ -0,0 +1,124 @@
+#include "MICBase.h"
+
+//constructor, sets default device id equal to 0 and marks the default rand streams as not created
+MICBase::MICBase() {
+  m_device_id = 0;
+  defaultRndSet = -1;
+  defaultRndStream = NULL; //do not leave the pointer indeterminate before mic_createRandStreams
+}
+
+//destructor, calls mic_deleteRandStreams() to delete the default rand streams if they are set (return code is ignored)
+MICBase::~MICBase() {
+  mic_deleteRandStreams();
+}
+
+
+//create default rand streams: one MKL VSL stream per OpenMP thread on the device
+//NOTE: the size argument is not used, the number of streams is the device thread count
+int MICBase::mic_createRandStreams(int size) {
+  int seed = time(NULL);
+
+#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
+  {
+
+    //get the number of threads, only a single thread writes the shared variable
+    int numThreads = 1;
+#pragma omp parallel
+#pragma omp single
+    numThreads = omp_get_num_threads();
+
+    //if default rnd stream already allocated delete the array
+    if (defaultRndSet == 1)
+      delete[] defaultRndStream;
+
+    //allocate defaultRndStream array
+    defaultRndStream = new VSLStreamStatePtr[numThreads];
+
+    //create stream states for each thread; the bound has to be the precomputed numThreads,
+    //omp_get_num_threads() returns 1 if the bound is evaluated outside of the parallel region
+#pragma omp parallel for
+    for (int i = 0; i < numThreads; i++)
+      vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
+
+    defaultRndSet = 1;
+  }
+
+  return DKS_SUCCESS;
+}
+
+//delete default rand streams if they were created, does nothing otherwise
+int MICBase::mic_deleteRandStreams() {
+  //NOTE(review): only the array is released, the streams themselves are not passed to vslDeleteStream
+#pragma offload target(mic:m_device_id) inout(defaultRndSet)
+  {
+    if (defaultRndSet == 1) {
+      delete[] defaultRndStream;
+      defaultRndSet = -1;
+    }
+  }
+  //report success, the unconditional DKS_ERROR made every call look like a failure
+  return DKS_SUCCESS;
+}
+
+//register a new signal (stream) for the mic and hand its id back through streamId
+int MICBase::mic_createStream(int & streamId) {
+  //a signal is an int stored in micStreams, its value is equal to its index in the vector
+  const int newId = micStreams.size();
+  micStreams.push_back(newId);
+  streamId = newId;
+
+  //the empty offload that would create the signal on the mic is disabled
+  /*
+#pragma offload target(mic:m_device_id) signal(mic_getStream(streamId))
+  {
+  }
+  */
+
+  return DKS_SUCCESS;
+}
+
+//get a reference to the signal stored at index id in micStreams, id is not range checked
+int& MICBase::mic_getStream(int id) {
+  return micStreams[id];
+}
+
+//delete streams: clears the host side micStreams vector, always returns DKS_SUCCESS
+int MICBase::mic_deleteStreams() {
+  micStreams.clear();
+
+  return DKS_SUCCESS;
+}
+
+
+//sets the device id used as target in the offload pragmas, the id is not validated
+int MICBase::mic_setDeviceId(int id) {
+  m_device_id = id;
+
+  return DKS_SUCCESS;
+}
+
+//print information about the available mic devices: the device count reported by
+//_Offload_number_of_devices() and the max OpenMP threads on the selected device
+
+int MICBase::mic_getDevices() {
+
+  int devices = _Offload_number_of_devices();
+  int thread_count = 0;
+
+  std::cout << "==============================" << std::endl;
+  std::cout << "==========Intel MICs==========" << std::endl;
+  std::cout << "==============================" << std::endl;
+
+  std::cout << "Total mic devices: " << devices << std::endl;
+  //std::cout << "Total mic devices: currently cant be found, but it's 1 on kraftwerk" << std::endl;
+
+#pragma offload target(mic:m_device_id) inout(thread_count)
+  {
+    thread_count = omp_get_max_threads();
+  }
+
+  std::cout << "Max threads: " << thread_count << std::endl;
+
+
+  return DKS_SUCCESS;
+}
diff --git a/src/MIC/MICBase.h b/src/MIC/MICBase.h
new file mode 100644
index 0000000..92b4fe9
--- /dev/null
+++ b/src/MIC/MICBase.h
@@ -0,0 +1,244 @@
+/*
+
+  Name: MIC Base
+  Author: Uldis Locans
+  Info: class to handle set up and data transfer from host to Intel MIC devices
+  Date: 29.09.2014
+
+*/
+#ifndef H_MIC_BASE
+#define H_MIC_BASE
+
+#include <iostream>
+#include <omp.h>
+#include <offload.h>
+#include <mkl_dfti.h>
+#include <mkl_vsl.h>
+#include <vector>
+#include <time.h>
+
+#include "../DKSDefinitions.h"
+
+#define DKS_ALLOC alloc_if(1)
+#define DKS_FREE free_if(1)
+#define DKS_RETAIN free_if(0)
+#define DKS_REUSE alloc_if(0)
+
+#define MIC_WIDTH 128
+
+class MICBase {
+
+private:
+  std::vector<int> micStreams;
+
+protected:
+
+
+  int defaultRndSet;
+
+public:
+  VSLStreamStatePtr *defaultRndStream;
+  int m_device_id;
+
+  /* constructor */
+  MICBase();
+
+  /* destructor */
+  ~MICBase();
+
+  /*
+    Info: create MKL rand streams for each thread
+    Return: success or error code
+  */
+  int mic_createRandStreams(int size);
+
+  /*
+    Info: delete MKL rand streams
+    Return: succes or error code
+  */
+  int mic_deleteRandStreams();
+
+  /*
+    Info: create a new signal for the mic
+    Return: success or error code
+  */
+  int mic_createStream(int & streamId);
+
+  /*
+    Info: get the signal from the vector
+    Return: mic signal
+  */
+  int& mic_getStream(int id);
+
+  /*
+    Info: delete streams
+    Return: success or error code
+  */
+  int mic_deleteStreams();
+
+  /*
+    Info: set device id
+    Return: success or error code
+  */
+  int mic_setDeviceId(int id);
+
+  /*
+    Info: get mic devices
+    Return: success or error code
+  */
+  int mic_getDevices();
+
+  /*
+    Info: allocate a 64 byte aligned host buffer and a matching buffer on the MIC device
+    Return: the host pointer, it is used as the handle of the device buffer in later calls
+  */
+  template<typename T>
+  void * mic_allocateMemory(int size) {
+    //NOTE(review): size % MIC_WIDTH does not round size up to a multiple of MIC_WIDTH - confirm intent
+    int padding = size % MIC_WIDTH;
+    int totalsize = size + padding;
+
+    T *tmp = (T*)_mm_malloc(sizeof(T)*totalsize, 64); // = new T[size];
+#pragma offload_transfer target(mic:m_device_id) nocopy(tmp:length(totalsize) DKS_ALLOC DKS_RETAIN)
+
+    return tmp;
+  }
+
+  /*
+    Info: copy size elements of type T from host array data into the device buffer of data_ptr,
+    starting at element offset. Return: always DKS_SUCCESS, the transfer result is not checked
+  */
+  template<typename T>
+  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
+    T* tmp_ptr = (T*)data_ptr;
+    T* tmp_data = (T*)data;
+
+#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
+
+    return DKS_SUCCESS;
+  }
+
+  /*
+    Info: write data to device; streamId is currently not used and the transfer pragma is the
+    same as in mic_writeData (no signal clause). Return: always DKS_SUCCESS
+  */
+  template<typename T>
+  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
+  {
+    T* tmp_ptr = (T*)data_ptr;
+    T* tmp_data = (T*)data;
+
+#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
+
+    return DKS_SUCCESS;
+  }
+  
+
+  /*
+    Info: copy size elements of type T from the device buffer of data_ptr, starting at element
+    offset, into host array result. Return: always DKS_SUCCESS, the transfer result is not checked
+  */
+  template<typename T>
+  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
+    T* tmp_ptr = (T*)data_ptr;
+    T* tmp_result = (T*)result;
+	
+	//std::cout << "try to read data with size = " << size << " adn offset = " << offset << std::endl;
+#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) )
+
+    return DKS_SUCCESS;
+  }
+
+  /*
+    Info: read data from device; streamId is currently not used and the transfer pragma is the
+    same as in mic_readData (no wait/signal clause). Return: always DKS_SUCCESS
+  */
+  template<typename T>
+  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
+			int streamId = -1, int offset = 0) {
+    T* tmp_ptr = (T*)data_ptr;
+    T* tmp_result = (T*)result;
+
+#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) ) 
+      {
+      }
+
+    return DKS_SUCCESS;
+
+  }
+
+  /* 
+     Info: wait till all the signals are complete (the waiting code is commented out, so this is a no-op)
+     Return: always DKS_SUCCESS
+  */
+  int mic_syncDevice() {
+    
+    //empty offload to wait for all the signals to finish and launch a new empty signal
+    /*
+    for (int i = 0; i < micStreams.size(); i++) {
+#pragma offload target(mic:m_device_id) wait(mic_getStream(i)) signal(mic_getStream(i))
+      {
+      }
+    }
+    */
+
+	//std::cout << "done read data" << std::endl;
+
+    return DKS_SUCCESS;
+
+  }
+
+  /*
+    Info: free the device buffer that belongs to data_ptr, size is padded as in mic_allocateMemory
+    Return: always DKS_SUCCESS
+  */
+  template<typename T>
+  int mic_freeMemory(void * data_ptr, int size) {
+    //NOTE(review): the host buffer itself is not released here (no _mm_free / delete[]) - confirm ownership
+    int padding = size % MIC_WIDTH;
+    int totalsize = size + padding;
+
+    T* tmp_ptr = (T*)data_ptr;
+#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
+    {
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /*
+    Info: allocate a host buffer with new[], allocate device memory for it and copy size
+    elements of data to the device. Return: the host pointer that identifies the device buffer
+  */
+  template<typename T>
+  void * mic_pushData(const void * data, int size) {
+    T* tmp_ptr = new T[size];
+    T* tmp_data = (T*)data;
+
+    //the directive has to stay on one logical line, an unescaped line break ends the pragma
+#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_ALLOC DKS_RETAIN into(tmp_ptr[0:size]) )
+    {
+    }
+
+    return tmp_ptr;
+  }
+
+/*
+  Info: copy size elements from the device buffer of data_ptr into result and free the device buffer
+  Return: always DKS_SUCCESS, the host buffer behind data_ptr is not released here
+*/
+  template<typename T>
+  int mic_pullData(void * data_ptr, void * result, int size) {
+    T* tmp_ptr = (T*)data_ptr;
+    T* tmp_data = (T*)result;
+
+#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[0:size] : DKS_REUSE DKS_FREE into(tmp_data[0:size]) )
+    {
+    }
+
+    return DKS_SUCCESS;
+  }
+
+};
+
+#endif
diff --git a/src/MIC/MICChiSquare.cpp b/src/MIC/MICChiSquare.cpp
new file mode 100644
index 0000000..35b6d77
--- /dev/null
+++ b/src/MIC/MICChiSquare.cpp
@@ -0,0 +1,93 @@
+#include "MICChiSquare.h"
+
+/*
+  Element wise chi^2 terms on the intel mic: result[i] = (O[i] - E[i])^2 / E[i].
+  O, E and result are used as already loaded on the device (length(0), no allocation or free).
+*/
+int MICChiSquare::mic_chi2(double *O, double *E, double *result, int size) {
+#pragma offload target(mic:m_micbase->m_device_id) \
+  in(size) \
+  in(O:length(0) DKS_REUSE DKS_RETAIN) \
+  in(E:length(0) DKS_REUSE DKS_RETAIN) \
+  in(result:length(0) DKS_REUSE DKS_RETAIN)
+  {
+#pragma omp parallel for
+    for (int idx = 0; idx < size; ++idx) {
+      const double diff = O[idx] - E[idx];
+      result[idx] = pow(diff, 2) / E[idx];
+    }
+  }
+  return DKS_SUCCESS;
+}
+
+
+/*
+  Evaluate the function N(t) for jsize data sets with nsize time bins each, nt and p are used
+  as already loaded on the device. Data set j reads p[j*psize .. j*psize+5] and writes
+  nt[j*nsize .. j*nsize+nsize-1]. NOTE(review): exp2 is 2^x, confirm that exp was not intended.
+*/
+int MICChiSquare::mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT) {
+
+#pragma offload target(mic:m_micbase->m_device_id) \
+  in(psize) in(nsize) in(jsize) in(deltaT) \
+  in(nt:length(0) DKS_REUSE DKS_RETAIN) \
+  in(p:length(0) DKS_REUSE DKS_RETAIN)
+  {
+    //hard coded constants, marked with ??? in the first version of this function
+    const double gamma = 0.01;
+    const double tau = 0.01;
+
+    for (int set = 0; set < jsize; ++set) {
+      //the parameters of one data set are stored consecutively, psize values per set
+      const double *par = p + set*psize;
+      const double N0 = par[0];
+      const double Nbkg = par[1];
+      const double A0 = par[2];
+      const double phi = par[3];
+      const double sigma = par[4];
+      const double B = par[5];
+
+      //factors that do not depend on the time bin
+      const double expFactor = -0.5*sigma*sigma;
+      const double phaseFactor = gamma*B;
+
+      //first output element of this data set
+      const int outBase = set*nsize;
+
+#pragma omp parallel for
+      for (int bin = 0; bin < nsize; ++bin) {
+        const double t = bin*deltaT;
+
+        //asymmetry term A(t)
+        const double envelope = exp2(expFactor*t*t);
+        const double At = A0 * envelope * cos(phaseFactor*t + phi);
+
+        //decay term and the final value for this bin
+        const double decay = exp2(-t/tau);
+        nt[outBase + bin] = N0 * decay * (1 + At) + Nbkg;
+      }
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Sum the first size elements of data on the mic and store the total in result[0].
+  data and result are used as already loaded on the device (length(0), no allocation or free).
+*/
+int MICChiSquare::mic_sum(double *data, double *result, int size) {
+  double total = 0;
+#pragma offload target(mic:m_micbase->m_device_id) \
+  in(size) in(total) \
+  in(data:length(0) DKS_REUSE DKS_RETAIN) \
+  in(result:length(0) DKS_REUSE DKS_RETAIN)
+  {
+#pragma omp parallel for reduction(+:total)
+    for (int idx = 0; idx < size; ++idx)
+      total += data[idx];
+    result[0] = total;
+  }
+  return DKS_SUCCESS;
+}
+
diff --git a/src/MIC/MICChiSquare.h b/src/MIC/MICChiSquare.h
new file mode 100644
index 0000000..c62de0b
--- /dev/null
+++ b/src/MIC/MICChiSquare.h
@@ -0,0 +1,51 @@
+/*
+
+  Name: MICChiSquare
+  Info: calculate chi^2 using intel mic coporcessor
+  Author: Uldis Locans
+  Date: 29.09.2014
+
+*/
+#ifndef H_MIC_CHI_SQUARE
+#define H_MIC_CHI_SQUARE
+
+#include <math.h>
+#include <omp.h>
+#include <offload.h>
+#include "MICBase.h"
+
+class MICChiSquare {
+
+  //base object that provides the device id for the offloads, it is not deleted by this class
+  MICBase *m_micbase;
+
+public:
+
+  /* constructor, stores the pointer to the base object */
+  MICChiSquare(MICBase *base) : m_micbase(base) { }
+
+  /* destructor, there is nothing to release */
+  ~MICChiSquare() { }
+
+  /*
+    Info: calculate the chi square terms result[i] = (O[i] - E[i])^2 / E[i]
+    for size elements, the arrays are device buffers
+    Return: success or error code
+  */
+  int mic_chi2(double *O, double *E, double *result, int size);
+
+  /*
+    Info: calculate the Nt function for jsize data sets of nsize values, psize parameters per set
+    Return: success or error code
+  */
+  int mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT = 1);
+
+  /*
+    Info: calculate the sum of size elements of data, the sum is written to result[0]
+    Return: success or error code
+  */
+  int mic_sum(double *data, double *result, int size);
+
+};
+
+#endif
diff --git a/src/MIC/MICCollimatorPhysics.cpp b/src/MIC/MICCollimatorPhysics.cpp
new file mode 100644
index 0000000..6a1b937
--- /dev/null
+++ b/src/MIC/MICCollimatorPhysics.cpp
@@ -0,0 +1,876 @@
+#include "MICCollimatorPhysics.h"
+
+//physical constants used by the collimator physics kernels
+//NOTE(review): the units given below are inferred from the values - confirm
+#define M_P 0.93827231e+00     //proton rest mass (presumably GeV)
+#define C 299792458.0          //speed of light (presumably m/s)
+#define PI 3.14159265358979323846
+#define AVO 6.022e23           //Avogadro constant
+#define R_E 2.81794092e-15     //classical electron radius (presumably m)
+#define eM_E 0.51099906e-03    //electron rest mass (presumably GeV)
+#define Z_P 1                  //charge number of the tracked particle
+//the product is parenthesized so that K can be used safely in any expression
+//(e.g. x / K); the existing uses (-K * ..., K * ...) evaluate exactly as before
+#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)
+
+//indices into the parameter array `par` handed to the kernels
+#define POSITION 0             //lower z bound tested by checkHit
+#define ZSIZE 1                //z extent added to par[POSITION] in checkHit
+#define RHO_M 2                //material density
+#define Z_M 3                  //material atomic number
+#define A_M 4                  //material atomic weight
+#define A2_C 5                 //coefficients of the low energy stopping power fit
+#define A3_C 6
+#define A4_C 7
+#define A5_C 8
+#define X0_M 9                 //scaling length used in the scattering angle theta0
+#define I_M 10                 //excitation energy used in the high energy dEdx formula
+#define DT_M 11                //time step
+
+/* scalar product of two 3-vectors (device callable) */
+__declspec(target(mic))
+double dot(mic_double3 d1, mic_double3 d2) {
+  const double xx = d1.x * d2.x;
+  const double yy = d1.y * d2.y;
+  const double zz = d1.z * d2.z;
+  //summed in the same left-to-right order as x + y + z
+  return xx + yy + zz;
+}
+
+/* squared length of the vector (dx, dy, dz) (device callable) */
+__declspec(target(mic))
+double dot(double dx, double dy, double dz) {
+  const double xx = dx * dx;
+  const double yy = dy * dy;
+  const double zz = dz * dz;
+  //summed in the same left-to-right order as x + y + z
+  return xx + yy + zz;
+}
+
+/* true when z lies in the half open interval
+   (par[POSITION], par[POSITION] + par[ZSIZE]] */
+__declspec(target(mic))
+bool checkHit(double &z, double *par) {
+  const double lower = par[POSITION];
+  if ( !(z > lower) )
+    return false;
+
+  const double upper = par[POSITION] + par[ZSIZE];
+  return z <= upper;
+}
+
+
+/*
+  Info: rotate the momentum components (px, pz) by the angle thetacou inside
+  their common plane and, depending on coord, shift the position (x, z).
+    coord == 1: x and z are displaced using xplane, x additionally by deltas * px / normP
+    coord == 2: as coord == 1, and z additionally by deltas * pz / normP
+    any other value: the position is left untouched, xplane, normP and deltas are not used
+  px, pz, x and z are updated in place.
+*/
+__declspec(target(mic))
+void Rot(double &px, double &pz, double &x, double &z, double xplane, 
+	 double normP, double thetacou, double deltas, int coord)
+{
+  //both start values are overwritten below before they are read
+  double Psixz = 1;
+  double pxz = 1;
+  
+  //angle of the momentum in the plane; the offset added to atan(px/pz) depends
+  //on the signs of px and pz, the branch order matters for the zero cases
+  if ( px >= 0 && pz >= 0 )
+    Psixz = atan(px/pz);
+  else if ( px > 0 && pz < 0 )
+    Psixz = atan(px/pz) + PI;
+  else if (px < 0 && pz > 0)
+    Psixz = atan(px/pz) + 2*PI;
+  else
+    Psixz = atan(px/pz) + PI;
+
+  //length of the momentum projected on the plane
+  pxz = sqrt(px*px + pz*pz);
+
+  if(coord == 1) {
+    x = x + deltas * px / normP + xplane*cos(Psixz);
+    z = z - xplane * sin(Psixz);
+  }
+
+  if(coord == 2) {
+    x = x + deltas * px / normP + xplane * cos(Psixz);
+    z = z - xplane * sin(Psixz) + deltas * pz / normP;
+  }
+  
+  //equivalent to px = pxz * sin(Psixz + thetacou), pz = pxz * cos(Psixz + thetacou)
+  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
+  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+}
+
+/*
+  Info: Coulomb scattering of a single particle. The position R and momentum P
+  are updated in place, first in the x-z plane and then in the y-z plane. For
+  each plane a Gaussian angle (resampled until it lies within 3.5 * theta0) is
+  applied through Rot, and with probability 0.0047 an additional larger angle
+  rotation is applied (Rot with coord 0, i.e. momentum only).
+  All random numbers are drawn from the given VSL stream.
+*/
+__declspec(target(mic))
+void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
+  //kinetic energy, gamma and beta derived from the momentum P
+  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
+  double gamma = (Eng + M_P) / M_P;
+  double normP = sqrt(dot(P, P));
+  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  //path length travelled during one time step par[DT_M]
+  double deltas = par[DT_M] * beta * C;
+
+  //width of the scattering angle distribution
+  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * 
+    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
+
+  // x-direction: See Physical Review, "Multiple Scattering"
+  double z1, z2;
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
+  double thetacou = z2 * theta0;
+
+  //reject angles beyond 3.5 * theta0; z1 is redrawn together with z2
+  while(fabs(thetacou) > 3.5 * theta0) {
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
+    thetacou = z2 * theta0;
+  }
+
+  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1);
+
+  //rare large angle scattering, taken when the uniform number P2 < 0.0047
+  double P2;//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
+  if(P2 < 0.0047) {
+    double P3, P4;
+    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
+    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+
+    //P4 picks the sign of the angle
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    //coord 0: Rot only rotates the momentum, the position is not moved
+    Rot(P.x ,P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0);
+  }
+
+  // y-direction: See Physical Review, "Multiple Scattering"
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
+  thetacou = z2 * theta0;
+
+  while(fabs(thetacou) > 3.5 * theta0) {
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
+    thetacou = z2 * theta0;
+  }
+
+  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  //coord 2: z is additionally advanced by deltas * pz / normP
+  Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2);
+
+  //rare large angle scattering in the y-z plane
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
+  if(P2 < 0.0047) {
+    double P3, P4;
+    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
+    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0);
+  }
+
+}
+
+/*
+  Info: Coulomb scattering for one chunk of particles stored as separate
+  arrays (SoA). The chunk starts at index ii; only entries with label == 0 are
+  modified. The steps are the same as in the single particle version, but the
+  random numbers are generated MIC_WIDTH at a time and the arithmetic is done
+  in vectorized loops.
+  NOTE(review): some loops run over `size` elements and others over MIC_WIDTH
+  elements, while all scratch arrays hold MIC_WIDTH entries. The visible caller
+  passes size == MIC_WIDTH; any other value looks unsupported - confirm.
+  NOTE(review): normP, deltas and theta0 are only filled for lanes with
+  label == 0; the thetacou loops read theta0 for every lane, the results for
+  the other lanes are never used afterwards.
+*/
+__declspec(target(mic))
+void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
+{
+ 
+  //per lane scratch values, indexed by idx = i - ii
+  double normP[MIC_WIDTH] __attribute__((aligned(64)));
+  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
+  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
+  double P1[MIC_WIDTH] __attribute__((aligned(64)));
+  double P2[MIC_WIDTH] __attribute__((aligned(64)));
+  double P3[MIC_WIDTH] __attribute__((aligned(64)));
+
+  double z1[MIC_WIDTH] __attribute__((aligned(64)));
+  double z2[MIC_WIDTH] __attribute__((aligned(64)));
+  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
+
+  //momentum norm, step length and scattering width for every active lane
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+      int idx = i - ii;
+      if (label[i] == 0) {
+	double dotp = dot(px[i], py[i], pz[i]);
+	double Eng = sqrt(dotp + 1.0) * M_P - M_P;
+	double gamma = (Eng + M_P) / M_P;
+	double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+	
+	normP[idx] = sqrt(dotp);
+	deltas[idx] = par[DT_M] * beta * C;
+	theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) * 
+	  Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M]));
+    }
+  } 
+  
+  // x-direction: See Physical Review, "Multiple Scattering"
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0); 
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0); 
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + size; i++) {
+    int idx = i - ii;
+    thetacou[idx] = z2[idx] * theta0[idx];
+  }
+
+  //unknown number of iterations, cannot vectorize
+  //(angles beyond 3.5 * theta0 are redrawn one lane at a time)
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
+	vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
+	vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
+	thetacou[idx] = z2[idx] * theta0[idx];
+      }    
+    }
+  }
+  
+  //apply the x-z plane displacement and rotation (Rot with coord 1)
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + size; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) + 
+	z2[idx] * deltas[idx] * theta0[idx] / 2.0;
+      Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1);
+    }
+  }
+  
+  
+  //generate array of random numbers
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);
+
+  //P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH]
+  //(P1, P2, P3 here play the roles of P2, P3, P4 in the single particle version)
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      if(P1[idx] < 0.0047) {
+	double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
+
+	if(P3[idx] > 0.5)
+	  thetaru = -thetaru;
+
+	//coord 0: only the momentum is rotated, the zero arguments are not used
+	Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0);
+      }
+    }
+  }
+  
+  // y-direction: See Physical Review, "Multiple Scattering"
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0); 
+  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0); 
+
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    thetacou[idx] = z2[idx] * theta0[idx];
+  }
+
+  //unknown number of iterations, cannot vectorize
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
+	vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
+	vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
+	thetacou[idx] = z2[idx] * theta0[idx];
+      }
+    }
+  }
+
+  //apply the y-z plane displacement and rotation (Rot with coord 2)
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) 
+	+ z2[idx] * deltas[idx] * theta0[idx] / 2.0;
+      Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2);
+    }
+  }
+
+  //generate array of random numbers
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
+  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);
+
+  //P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH]
+  #pragma vector aligned
+  #pragma simd
+  for (int i = ii; i < ii + MIC_WIDTH; i++) {
+    int idx = i - ii;
+    if (label[i] == 0) {
+      if(P1[idx] < 0.0047) {
+	double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
+	if(P3[idx] > 0.5)
+	  thetaru = -thetaru;
+	Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0);
+      }
+    }
+  }
+  
+}
+
+/*
+  Info: apply the energy loss of one time step to a single particle.
+  Eng is updated in place. Two energy ranges are handled:
+    0.00001 < Eng < 0.0006 : stopping power from the fit coefficients
+                             par[A2_C] .. par[A5_C]
+    Eng >= 0.0006          : stopping power from the formula using par[Z_M],
+                             par[A_M] and par[I_M]
+  In both ranges a Gaussian fluctuation with standard deviation sigma_E, drawn
+  from the given VSL stream, is added to the mean loss. The second range is
+  tested after the first one has updated Eng, so both can apply in one call.
+  pdead is set to 1 when the resulting energy is below 1E-4 or dEdx is
+  positive; otherwise it is left unchanged.
+*/
+__declspec(target(mic))
+void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
+
+  double dEdx = 0.0;
+  const double gamma = (Eng + M_P) / M_P;
+  const double gamma2 = gamma * gamma;
+  const double beta = sqrt(1.0 - 1.0 / gamma2);
+  const double beta2 = beta * beta;
+
+  //path length of one time step and the same length weighted with the density
+  const double deltas = par[DT_M] * beta * C;
+  const double deltasrho = deltas * 100 * par[RHO_M];
+  //width of the energy fluctuation; the atomic number has to be read from the
+  //parameter array (par[Z_M]), Z_M on its own is only the array index 3
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+
+  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
+    const double Ts = (Eng * 1E6) / 1.0073; 
+    const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
+    const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
+    const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
+
+    dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
+
+    double tmprnd;
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
+    const double delta_E = deltasrho * dEdx + tmprnd;
+    Eng = Eng + delta_E / 1E3;
+  }
+  
+  if (Eng >= 0.0006) {
+    const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
+      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
+
+    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
+      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * 
+		       Tmax / par[I_M] / par[I_M]) - beta2);
+
+    double tmprnd;
+    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
+    const double delta_E = deltasrho * dEdx + tmprnd;
+    
+    Eng = Eng + delta_E / 1E3;
+  }
+
+
+  //particle is considered stopped
+  if ((Eng<1E-4) || (dEdx>0))
+    pdead = 1;
+}
+
+/*
+  Info: energy loss of one time step for one lane of a vectorized loop.
+  Same physics as the stream based overload, but the Gaussian random numbers
+  are taken from randv instead of being drawn here: randv[ri] is used for the
+  low energy range (0.00001 < Eng < 0.0006) and randv[ri + MIC_WIDTH] for the
+  high energy range (Eng >= 0.0006), each scaled by sigma_E.
+  Eng is updated in place and dEdx returns the stopping power that was used;
+  dEdx is left untouched when neither range applies. The caller decides from
+  Eng and dEdx whether the particle is dead.
+*/
+__declspec(target(mic))
+void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
+
+  const double gamma = (Eng + M_P) / M_P;
+  const double gamma2 = gamma * gamma;
+  const double beta = sqrt(1.0 - 1.0 / gamma2);
+  const double beta2 = beta * beta;
+
+  //path length of one time step and the same length weighted with the density
+  const double deltas = par[DT_M] * beta * C;
+  const double deltasrho = deltas * 100 * par[RHO_M];
+  //width of the energy fluctuation; the atomic number has to be read from the
+  //parameter array (par[Z_M]), Z_M on its own is only the array index 3
+  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); 
+
+  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
+    const double Ts = (Eng * 1E6) / 1.0073; 
+    const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
+    const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
+    const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
+
+    dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
+
+    const double delta_E = deltasrho * dEdx + sigma_E * randv[ri];
+
+    Eng = Eng + delta_E / 1E3;
+  }
+  
+  if (Eng >= 0.0006) {
+    const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
+      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
+
+    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
+      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * 
+		       Tmax / par[I_M] / par[I_M]) - beta2);
+
+    const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH];
+
+    Eng = Eng + delta_E / 1E3;
+  }
+
+}
+
+/*
+  Info: one collimator physics step for particles stored as an array of
+  MIC_PART_SMALL structures. Runs in an offload region on the coprocessor
+  m_micbase->m_device_id, in three passes over all particles:
+    1. particles outside the z range tested by checkHit are moved by one time
+       step and get label -2
+    2. particles with label 0 lose energy; dead ones get label -1, for all
+       others the momentum is rescaled to the new energy
+    3. particles that still have label 0 undergo Coulomb scattering
+  mem_ptr is the particle array, par_ptr the parameter array (doubles).
+  NOTE(review): both pointers are passed with length(0) DKS_RETAIN DKS_REUSE,
+  so presumably they refer to buffers already allocated on the device - confirm.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
+
+  //cast device memory pointers to appropriate types
+  MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
+  double *par = (double*) par_ptr;
+
+#pragma offload target(mic:m_micbase->m_device_id)		\
+  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(numparticles)
+  {
+	
+#pragma omp parallel 
+    {
+      //every thread uses the random stream stored under its own thread number
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+      
+      //for loop through particles if not checkhit set label to -2 and update R
+
+#pragma omp for simd
+      for (int i = 0; i < numparticles; i++) {
+	if ( !checkHit(data[i].Rincol.z, par) ) {
+	  double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
+	  data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq;
+	  data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq;
+	  data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq;
+	  data[i].label = -2;
+	}
+      }
+
+      //for loop through particles if label == 0 energy loss and if pdead update label to -1
+#pragma omp for simd
+      for (int i = 0; i < numparticles; i++) {
+
+	//-1 means alive; energyLoss sets it to 1 when the particle is stopped
+	int pdead = -1;
+	double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
+	double Eng = (sq - 1) * M_P;
+
+	if (data[i].label == 0) { 
+	  energyLoss(Eng, pdead, par, stream);
+	}
+
+	//rescale the momentum to the (possibly reduced) energy; this branch is
+	//also taken for particles whose label is not 0, with Eng unchanged
+	if (pdead == -1) {
+	  double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
+	  sq = sqrt(dot(data[i].Pincol, data[i].Pincol));
+	  data[i].Pincol.x = data[i].Pincol.x * ptot / sq;
+	  data[i].Pincol.y = data[i].Pincol.y * ptot / sq;
+	  data[i].Pincol.z = data[i].Pincol.z * ptot / sq;
+	}
+
+	if (pdead == 1)
+	  data[i].label = -1;
+      }
+
+      //for loop through particles if label == 0 coulomb scat
+#pragma omp for
+      for (int i = 0; i < numparticles; i++) {
+	if (data[i].label == 0) {
+	  coulombScat(data[i].Rincol, data[i].Pincol, par, stream); 
+	}
+      }
+      
+    } //end omp parallel
+
+  } //end offload
+  return DKS_SUCCESS;
+
+}
+
+
+
+/*
+  Info: one collimator physics step for particles stored as separate arrays
+  (structure of arrays). Runs in an offload region on the coprocessor
+  m_micbase->m_device_id and processes the particles in chunks of MIC_WIDTH:
+    1. particles outside the z range tested by checkHit are moved by one time
+       step and get label -2
+    2. particles with label 0 lose energy; dead ones get label -1, the others
+       get their momentum rescaled to the new energy
+    3. particles that still have label 0 undergo Coulomb scattering
+  localID_ptr is only passed through to the offload region, it is not modified.
+  NOTE(review): the chunked loops touch indices up to the next multiple of
+  MIC_WIDTH above numparticles, so all arrays are assumed to be allocated with
+  that padding - confirm against the allocation code.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+					       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+					       void *px_ptr, void *py_ptr, void *pz_ptr,
+					       void *par_ptr, int numparticles)
+{
+
+  int *label = (int*)label_ptr;
+  unsigned *localID = (unsigned*)localID_ptr;
+  double *rx = (double*)rx_ptr;
+  double *ry = (double*)ry_ptr;
+  double *rz = (double*)rz_ptr;
+  double *px = (double*)px_ptr;
+  double *py = (double*)py_ptr;
+  double *pz = (double*)pz_ptr;
+  double *par = (double*)par_ptr;
+
+  //round the particle count up to a whole number of MIC_WIDTH chunks;
+  //numparticles + numparticles % MIC_WIDTH is not a round-up and could make
+  //the chunk loops run past the padded end of the arrays
+  int totalpart = ((numparticles + MIC_WIDTH - 1) / MIC_WIDTH) * MIC_WIDTH;
+
+  //use the device selected in MICBase, like the AoS version of this kernel
+#pragma offload target(mic:m_micbase->m_device_id) \
+  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(rx:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(ry:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(rz:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(px:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(py:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(pz:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(totalpart)
+  {
+
+#pragma omp parallel
+    {
+      //every thread gets its own rnd stream state
+      VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
+
+      //NOTE(review): the three chunk loops use nowait and the same iteration
+      //space; this relies on every chunk being handled by the same thread in
+      //all three loops - confirm the schedule used by the compiler
+      #pragma omp for nowait
+      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
+	//vectorize main loop
+	#pragma vector aligned
+	#pragma simd
+	for (int i = ii; i < ii + MIC_WIDTH; i++) {
+	  if ( !checkHit(rz[i], par) ) {
+	    double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
+	    rx[i] = rx[i] + par[DT_M] * C * px[i] / sq;
+	    ry[i] = ry[i] + par[DT_M] * C * py[i] / sq;
+	    rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq;
+	    label[i] = -2;
+	  }
+	}
+      }
+
+      //array of size 2*WIDTH for storing random values for the energyloss function
+      double randv[2*MIC_WIDTH] __attribute__((aligned(64)));
+
+      //for loop through particles if label == 0 energy loss and if dead update label to -1
+      #pragma omp for nowait
+      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
+	//create array of rand values (2 per particle of the chunk)
+	vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0);
+
+	#pragma vector aligned
+	#pragma simd
+	for (int i = ii; i < ii + MIC_WIDTH; i++) {
+
+	  double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
+	  double Eng = (sq - 1) * M_P;
+	  double dEdx = 0;
+	  const bool active = (label[i] == 0);
+
+	  if (active) {
+	    energyLoss(Eng, dEdx, par, randv, i - ii);
+	  }
+
+	  //particle survives: rescale the momentum to the reduced energy
+	  if (Eng > 1e-4 && dEdx < 0) {
+	    double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
+	    sq = sqrt(dot(px[i], py[i], pz[i]));
+	    px[i] = px[i] * ptot / sq;
+	    py[i] = py[i] * ptot / sq;
+	    pz[i] = pz[i] * ptot / sq;
+	  }
+
+	  //only particles that went through the energy loss can be marked dead;
+	  //without this guard slow particles that already carry label -2 would
+	  //be relabelled -1, which the AoS version never does
+	  if (active && (Eng < 1e-4 || dEdx > 0))
+	    label[i] = -1;
+
+	} //end inner energy loss loop
+
+      } //end outer energy loss loop
+
+      //vectorize coulomb scattering as much as possible
+#pragma omp for nowait
+      for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
+	coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH);
+      } //end coulomb scattering
+
+    } //end omp parallel
+
+  } //end offload
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: count the particles with a negative label and move them to the end of
+  the particle array (array of MIC_PART_SMALL). The work is done in an offload
+  region on the coprocessor m_micbase->m_device_id; the count is copied back
+  to the host and returned through numaddback.
+  The order of the particles is not preserved.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
+						int &numaddback)
+{
+
+  //cast device memory pointers to appropriate types
+  MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
+
+  //result of the device side count; this is the variable named in the out()
+  //clause, so it must not be shadowed by a second declaration inside the
+  //offload region
+  int privateback = 0;
+
+#pragma offload target(mic:m_micbase->m_device_id)		\
+  in(data:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(numparticles)				\
+  out(privateback)
+  {
+    //count dead and addback particles (an out() variable carries no value
+    //into the offload region, so it is reset here)
+    privateback = 0;
+#pragma omp parallel for reduction(+:privateback)
+    for (int i = 0; i < numparticles; i++) {
+      if (data[i].label < 0)
+	privateback++;
+    }
+
+    //move particles with label < 0 to the end of the array (serial. can we do this parallel?)
+    if (privateback > 0) {
+
+      int moved = 0;
+      //walk from the back down to and including index 0; everything behind
+      //idx already holds particles with a negative label
+      for (int i = numparticles - 1; i >= 0; i--) {
+	if (data[i].label < 0) {
+	  int idx = numparticles - 1 - moved;
+	  if (i != idx) {
+	    MIC_PART_SMALL tmp = data[i];
+	    data[i] = data[idx];
+	    data[idx] = tmp;
+	  }
+	  moved++;
+	}
+      }
+    }
+  }
+
+  //hand the count to the caller on the host, after it has been copied back
+  numaddback = privateback;
+  return DKS_SUCCESS;
+}
+
+/* exchange the values of two doubles (device callable) */
+__declspec(target(mic))
+void micmove(double &a, double &b) {
+  const double old_a = a;
+  const double old_b = b;
+  a = old_b;
+  b = old_a;
+}
+
+/* exchange the values of two ints (device callable) */
+__declspec(target(mic))
+void micmove(int &a, int &b) {
+  const int old_a = a;
+  const int old_b = b;
+  a = old_b;
+  b = old_a;
+}
+
+/* exchange the values of two unsigned ints (device callable) */
+__declspec(target(mic))
+void micmove(unsigned &a, unsigned &b) {
+  const unsigned old_a = a;
+  const unsigned old_b = b;
+  a = old_b;
+  b = old_a;
+}
+
+
+/*
+  Info: structure of arrays version of CollimatorPhysicsSort. Counts the
+  particles with a negative label and moves them to the end of all particle
+  arrays (label, localID, position and momentum components are swapped
+  together). The work is done in an offload region on the coprocessor
+  m_micbase->m_device_id; the count is copied back to the host and returned
+  through numaddback. par_ptr is passed to the offload region but not used.
+  The order of the particles is not preserved.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+						   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+						   void *px_ptr, void *py_ptr, void *pz_ptr,
+						   void *par_ptr, int numparticles, 
+						   int &numaddback)
+{
+
+  int *label = (int*)label_ptr;
+  unsigned *localID = (unsigned*)localID_ptr;
+  double *rx = (double*)rx_ptr;
+  double *ry = (double*)ry_ptr;
+  double *rz = (double*)rz_ptr;
+  double *px = (double*)px_ptr;
+  double *py = (double*)py_ptr;
+  double *pz = (double*)pz_ptr;
+  double *par = (double*)par_ptr;
+
+  //result of the device side count; this is the variable named in the out()
+  //clause, so it must not be shadowed by a second declaration inside the
+  //offload region
+  int privateback = 0;
+
+  //use the device selected in MICBase, like the AoS version of this kernel
+#pragma offload target(mic:m_micbase->m_device_id) \
+  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(rx:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(ry:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(rz:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(px:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(py:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(pz:length(0) DKS_REUSE DKS_RETAIN)	\
+  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(numparticles) \
+  out(privateback)
+  {
+
+    //count dead and addback particles (an out() variable carries no value
+    //into the offload region, so it is reset here)
+    privateback = 0;
+#pragma omp parallel for reduction(+:privateback)
+    for (int i = 0; i < numparticles; i++) {
+      if (label[i] < 0)
+	privateback++;
+    }
+
+    //move particles with label < 0 to the end of the array (serial. can we do this parallel?)
+    if (privateback > 0) {
+      int moved = 0;
+      //walk from the back; everything behind idx already holds particles
+      //with a negative label
+      for (int i = numparticles - 1; i >= 0; i--) {
+	if (label[i] < 0) {
+	  int idx = numparticles - 1 - moved;
+	  if (i != idx) {
+	    micmove(rx[i], rx[idx]);
+	    micmove(ry[i], ry[idx]);
+	    micmove(rz[i], rz[idx]);
+	    micmove(px[i], px[idx]);
+	    micmove(py[i], py[idx]);
+	    micmove(pz[i], pz[idx]);
+	    micmove(label[i], label[idx]);
+	    micmove(localID[i], localID[idx]);
+	  }
+	  moved++;
+	}
+      }
+    }
+  }
+
+  //hand the count to the caller on the host, after it has been copied back
+  numaddback = privateback;
+
+  return DKS_SUCCESS;
+}
+
+/* multiply every component of a by c, in place */
+__declspec(target(mic))
+inline void unitlessOff(mic_double3 &a, const double c) {
+  a.x = a.x * c;
+  a.y = a.y * c;
+  a.z = a.z * c;
+}
+
+/* divide every component of a by c, in place */
+__declspec(target(mic))
+inline void unitlessOn(mic_double3 &a, const double c) {
+  a.x = a.x / c;
+  a.y = a.y / c;
+  a.z = a.z / c;
+}
+
+/*
+  Info: rotate vec by the three angles stored in ori (ori.x, ori.y, ori.z)
+  Return: the rotated vector, vec itself is not modified
+*/
+__declspec(target(mic))
+mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) {
+  //sine and cosine of the three orientation angles
+  const double sina = sin(ori.x);
+  const double cosa = cos(ori.x);
+  const double sinb = sin(ori.y);
+  const double cosb = cos(ori.y);
+  const double sinc = sin(ori.z);
+  const double cosc = cos(ori.z);
+
+  //every component is one row of the rotation matrix applied to vec
+  mic_double3 rotated;
+
+  rotated.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;
+
+  rotated.y = (-cosb * sinc - sina * sinb * cosc) * vec.x + 
+    (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;
+
+  rotated.z = (-sinb * sinc + sina * cosb * cosc) * vec.x + 
+    (sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;
+
+  return rotated;
+}
+
+/*
+  Info: advance the position R by half a step along the momentum P.
+  Each component is divided by dtc, increased by 0.5 * P / dotp and multiplied
+  by dtc again. R is updated in place, P is only read.
+*/
+__declspec(target(mic))
+inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) {
+  //the three operations per component are kept in their original order so
+  //that the rounding is unchanged
+  R.x = (R.x / dtc + 0.5 * P.x / dotp) * dtc;
+  R.y = (R.y / dtc + 0.5 * P.y / dotp) * dtc;
+  R.z = (R.z / dtc + 0.5 * P.z / dotp) * dtc;
+}
+
+/*
+  Info: push npart particles by half a step using the same dtc for all of
+  them. Only the positions r are written, the momenta p are read.
+*/
+__declspec(target(mic))
+inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) {
+#pragma omp parallel for simd
+  for (int i = 0; i < npart; i++) {
+    //work on local copies, only the position is stored back
+    mic_double3 R = r[i];
+    mic_double3 P = p[i];
+    double dotp = sqrt(1.0 + dot(P, P));
+    updateR(R, P, dotp, dtc);
+    r[i] = R;  
+  }
+}
+
+/*
+  Info: push npart particles by half a step using a per particle time step:
+  dtc is computed as gdt[i] * c for every particle. Only the positions r are
+  written, the momenta p are read.
+*/
+__declspec(target(mic))
+inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) {
+#pragma omp parallel for simd
+  for (int i = 0; i < npart; i++) {
+    //work on local copies, only the position is stored back
+    mic_double3 R = r[i];
+    mic_double3 P = p[i];
+    double dtc = gdt[i] * c;
+    double dotp = sqrt(1.0 + dot(P, P));
+    updateR(R, P, dotp, dtc);
+    r[i] = R;
+  }
+}
+
+
+/*
+  Info: push npart particles on the coprocessor m_micbase->m_device_id.
+  With usedt == false all particles use the common step dt * c; with
+  usedt == true the per particle time steps in dt_ptr are used and dt is
+  ignored. r_ptr and p_ptr are arrays of mic_double3 (positions and momenta).
+  streamId is not used by this implementation.
+  NOTE(review): all pointers are passed with length(0) DKS_RETAIN DKS_REUSE, so
+  presumably they refer to buffers already allocated on the device - confirm.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
+					       double dt, double c, bool usedt, int streamId) 
+{
+
+  mic_double3 *r = (mic_double3*)r_ptr;
+  mic_double3 *p = (mic_double3*)p_ptr;
+  double *gdt = (double*)dt_ptr;
+  //only needed when the common time step is used
+  double dtc = dt * c;    
+
+  if (!usedt) {
+#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
+  in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc)
+    {
+      push(r, p, dtc, npart);
+    }
+
+  } else {
+
+#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
+  in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c)
+    {
+      push(r, p, gdt, c, npart);
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: push npart particles by half a step after rotating each momentum with
+  the orientation of the section the particle was last in. The orientation is
+  gOrient[gLastSect[i]] when gLastSect[i] is in the range [0, nsec), otherwise
+  all angles are zero. The same dtc is used for all particles. Only the
+  positions x are written.
+*/
+__declspec(target(mic))
+inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
+			   double dtc, int npart, int nsec) 
+{
+
+#pragma omp parallel for simd
+  for (int i = 0; i < npart; i++) {   
+    mic_double3 ori;
+    if (gLastSect[i] > -1 && gLastSect[i] < nsec) {
+      ori = gOrient[gLastSect[i]];
+    } else {
+      //no valid section
+      ori.x = 0.0;
+      ori.y = 0.0;
+      ori.z = 0.0;
+    }
+
+    //the rotated momentum is only used for the push, p itself is not changed
+    mic_double3 tmp = deviceTransformTo(p[i], ori);
+    mic_double3 X = x[i];       
+    double dotp = sqrt(1.0 + dot(tmp, tmp));
+    updateR(X, tmp, dotp, dtc);
+    x[i] = X;
+  }
+
+}
+
+/*
+  Info: same as the pushTransform overload with a common dtc, but every
+  particle uses its own time step: dtc is computed as gdt[i] * c.
+  Only the positions x are written.
+*/
+__declspec(target(mic))
+inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
+			   double *gdt, double c, int npart, int nsec) 
+{
+
+#pragma omp parallel for simd
+  for (int i = 0; i < npart; i++) {   
+    mic_double3 ori;
+    if (gLastSect[i] > -1 && gLastSect[i] < nsec) {
+      ori = gOrient[gLastSect[i]];
+    } else {
+      //no valid section
+      ori.x = 0.0;
+      ori.y = 0.0;
+      ori.z = 0.0;
+    }
+
+    //the rotated momentum is only used for the push, p itself is not changed
+    mic_double3 tmp = deviceTransformTo(p[i], ori);
+    mic_double3 X = x[i];       
+    double dotp = sqrt(1.0 + dot(tmp, tmp));
+    double dtc = gdt[i] * c;
+
+    updateR(X, tmp, dotp, dtc);
+    x[i] = X;
+  }
+
+}
+
+/*
+  Info: push npart particles on the coprocessor m_micbase->m_device_id, using
+  momenta rotated by the orientation of each particle's last section.
+  With usedt == false all particles use the common step dt * c; with
+  usedt == true the per particle time steps in dt_ptr are used and dt is
+  ignored. lastSec_ptr is read as an array of long, orient_ptr as an array of
+  mic_double3 with nsec entries. streamId is not used by this implementation.
+  NOTE(review): all pointers are passed with length(0) DKS_RETAIN DKS_REUSE, so
+  presumably they refer to buffers already allocated on the device - confirm.
+  Return: always DKS_SUCCESS
+*/
+int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, 
+							void *lastSec_ptr, 
+							void *orient_ptr, int npart, 
+							int nsec, void *dt_ptr, double dt, 
+							double c, bool usedt, int streamId)
+{
+
+  mic_double3 *x = (mic_double3*)x_ptr;
+  mic_double3 *p = (mic_double3*)p_ptr;
+  mic_double3 *gOrient = (mic_double3*)orient_ptr;
+  double *gdt = (double*)dt_ptr;
+  long *gLastSect = (long*)lastSec_ptr;
+  //only needed when the common time step is used
+  double dtc = dt * c;
+
+  if (!usedt) {
+    
+#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE)		\
+  in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE)	\
+  in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc)
+    {
+      pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec);
+    }
+
+  } else {
+
+#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE)  \
+  in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \
+  in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \
+  in(npart, nsec, c)
+    {
+      pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec);
+    }
+  }
+  
+  return DKS_SUCCESS;
+
+}
+
+
diff --git a/src/MIC/MICCollimatorPhysics.h b/src/MIC/MICCollimatorPhysics.h
new file mode 100644
index 0000000..0795779
--- /dev/null
+++ b/src/MIC/MICCollimatorPhysics.h
@@ -0,0 +1,68 @@
+#ifndef H_MIC_COLLIMATORPHYSICS
+#define H_MIC_COLLIMATORPHYSICS
+
+#include <iostream>
+#include <cstdio>
+#include <cmath>
+#include <omp.h>
+#include <vector>
+
+#include "../Algorithms/CollimatorPhysics.h"
+#include "MICBase.h"
+
+/* three component vector of doubles, declared for use in code built for the MIC */
+__declspec(target(mic))
+typedef struct {
+  double x;
+  double y;
+  double z;
+} mic_double3;
+
+/*
+  one particle as used by the array of structures kernels
+  label:   state of the particle; the kernels treat 0 as active, set -2 for
+           particles outside the collimator z range and -1 for dead particles
+  localID: identifier carried along with the particle (not modified by the
+           kernels in MICCollimatorPhysics.cpp)
+  Rincol:  position
+  Pincol:  momentum
+*/
+__declspec(target(mic))
+typedef struct {
+  int label;
+  unsigned localID;
+  mic_double3 Rincol;
+  mic_double3 Pincol;
+} MIC_PART_SMALL;
+
+
+/*
+  Collimator physics and particle push kernels for the Intel MIC.
+  All methods take untyped pointers and cast them to the expected types.
+  NOTE(review): the base class is inherited privately (no access specifier is
+  given) - confirm that this is intended.
+*/
+class MICCollimatorPhysics : DKSAlogorithms{
+
+private:
+
+  //MIC base object, provides the device id and the random number streams;
+  //it is only stored here and is not released by this class
+  MICBase *m_micbase;
+
+public:
+
+  /* constructor: remember the MIC base object passed in */
+  MICCollimatorPhysics(MICBase *base) {
+    m_micbase = base;
+  };
+
+  /* destructor: nothing to clean up */
+  ~MICCollimatorPhysics() { };
+
+  /*
+    Info: collimator physics step for particles stored as array of
+    MIC_PART_SMALL (mem_ptr); par_ptr holds the parameters as doubles
+    Return: success or error code
+  */
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+
+  /*
+    Info: collimator physics step for particles stored as separate arrays
+    (labels as int, local ids as unsigned, positions and momenta as double)
+    Return: success or error code
+  */
+  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			   void *px_ptr, void *py_ptr, void *pz_ptr,
+			   void *par_ptr, int numparticles);
+
+  /*
+    Info: move particles with a negative label to the end of the array of
+    MIC_PART_SMALL and report their number in numaddback
+    Return: success or error code
+  */
+  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
+
+  /*
+    Info: same as CollimatorPhysicsSort for particles stored as separate arrays
+    Return: success or error code
+  */
+  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			       void *px_ptr, void *py_ptr, void *pz_ptr,
+			       void *par_ptr, int numparticles, int &numaddback);
+
+  /*
+    Info: push npart particles; uses the per particle time steps in dt_ptr
+    when usedt is true, otherwise the common time step dt
+    Return: success or error code
+  */
+  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
+			   double dt, double c, bool usedt = false, int streamId = -1);
+
+  /*
+    Info: push npart particles using momenta rotated by the orientation of the
+    section each particle was last in; uses the per particle time steps in
+    dt_ptr when usedt is true, otherwise the common time step dt
+    Return: success or error code
+  */
+  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
+				    void *orient_ptr, int npart, int nsec, 
+				    void *dt_ptr, double dt, double c,
+				    bool usedt = false, int streamId = -1);
+
+};
+
+
+#endif
diff --git a/src/MIC/MICFFT.cpp b/src/MIC/MICFFT.cpp
new file mode 100644
index 0000000..ab82c83
--- /dev/null
+++ b/src/MIC/MICFFT.cpp
@@ -0,0 +1,210 @@
+#include "MICFFT.h"
+#include<stdio.h>
+#include<complex>
+#include <time.h>
+#include <sys/time.h>
+
+/*
+  Constructor: remember the MIC base object and null every DFTI descriptor handle.
+  The destructor frees descriptors unconditionally, so a handle that no setup
+  routine ever created must hold a well-defined value rather than garbage.
+*/
+MICFFT::MICFFT(MICBase *base) {
+  m_micbase = base;
+
+  FFTHandle_m = NULL;
+  handle = NULL;
+  rc_handle = NULL;
+  cr_handle = NULL;
+}
+
+/*
+  Destructor: frees the complex-to-complex descriptor and the spare descriptor
+  inside an offload region on mic:0.
+  NOTE(review): no function in this file creates `handle`, and rc_handle / cr_handle
+  (created by setupFFTRC / setupFFTCR) are not freed here - confirm whether this
+  is intended.
+*/
+MICFFT::~MICFFT() {
+#pragma offload target(mic:0)
+  {
+    DftiFreeDescriptor(&FFTHandle_m);
+    DftiFreeDescriptor(&handle);
+  }
+}
+
+/*
+  Info: create and commit the DFTI descriptor used by executeFFT
+  (double precision, complex domain, three dimensions) inside an offload
+  region on mic:0. ndim is not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::setupFFT(int ndim, int N[3]) {
+
+#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
+  {
+    MKL_LONG dims = 3;
+
+    //transform lengths, same order as N
+    MKL_LONG sizes[3];
+    for (int d = 0; d < 3; d++)
+      sizes[d] = N[d];
+
+    //first entry is the offset, the remaining three are per-dimension strides
+    MKL_LONG strides[4];
+    strides[0] = 0;
+    strides[1] = sizes[0] * sizes[1];
+    strides[2] = sizes[0];
+    strides[3] = 1;
+
+    DftiCreateDescriptor(&(this->getHandle()), DFTI_DOUBLE, DFTI_COMPLEX, dims, sizes);
+    DftiSetValue(this->getHandle(), DFTI_INPUT_STRIDES, strides);
+    DftiSetValue(this->getHandle(), DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
+    DftiCommitDescriptor(this->getHandle());
+  }
+
+  return DKS_SUCCESS;
+}
+//BENI:
+/*
+  Info: create and commit the DFTI descriptor used by executeRCFFT
+  (REAL->COMPLEX, out of place, CCE packed format) inside an offload region
+  on mic:0. scale is installed as the forward scale factor. ndim is not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::setupFFTRC(int ndim, int N[3], double scale) {
+
+#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
+  {
+    MKL_LONG dims = 3;
+
+    //transform lengths are taken from N in reverse order
+    MKL_LONG sizes[3];
+    sizes[0] = N[2];
+    sizes[1] = N[1];
+    sizes[2] = N[0];
+
+    //number of complex values kept along the last dimension
+    MKL_LONG half = sizes[2] / 2 + 1;
+
+    //first entry is the offset, the remaining three are per-dimension strides
+    MKL_LONG real_strides[4];
+    real_strides[0] = 0;
+    real_strides[1] = sizes[2] * sizes[1];
+    real_strides[2] = sizes[2];
+    real_strides[3] = 1;
+
+    MKL_LONG complex_strides[4];
+    complex_strides[0] = 0;
+    complex_strides[1] = sizes[1] * half;
+    complex_strides[2] = half;
+    complex_strides[3] = 1;
+
+    DftiCreateDescriptor(&(this->getHandleRC()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
+    DftiSetValue(this->getHandleRC(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+    DftiSetValue(this->getHandleRC(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
+    DftiSetValue(this->getHandleRC(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
+    //real data goes in, complex data comes out
+    DftiSetValue(this->getHandleRC(), DFTI_INPUT_STRIDES, real_strides);
+    DftiSetValue(this->getHandleRC(), DFTI_OUTPUT_STRIDES, complex_strides);
+    DftiSetValue(this->getHandleRC(), DFTI_FORWARD_SCALE, scale);
+    DftiCommitDescriptor(this->getHandleRC());
+  }
+
+  return DKS_SUCCESS;
+}
+
+//BENI:
+/*
+  Info: create and commit the DFTI descriptor used by executeCRFFT
+  (COMPLEX->REAL, out of place, CCE packed format) inside an offload region
+  on mic:0. scale is installed as the backward scale factor. ndim is not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::setupFFTCR(int ndim, int N[3], double scale) {
+
+#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
+  {
+    MKL_LONG dims = 3;
+
+    //transform lengths are taken from N in reverse order
+    MKL_LONG sizes[3];
+    sizes[0] = N[2];
+    sizes[1] = N[1];
+    sizes[2] = N[0];
+
+    //number of complex values kept along the last dimension
+    MKL_LONG half = sizes[2] / 2 + 1;
+
+    //first entry is the offset, the remaining three are per-dimension strides
+    MKL_LONG real_strides[4];
+    real_strides[0] = 0;
+    real_strides[1] = sizes[2] * sizes[1];
+    real_strides[2] = sizes[2];
+    real_strides[3] = 1;
+
+    MKL_LONG complex_strides[4];
+    complex_strides[0] = 0;
+    complex_strides[1] = sizes[1] * half;
+    complex_strides[2] = half;
+    complex_strides[3] = 1;
+
+    DftiCreateDescriptor(&(this->getHandleCR()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
+    DftiSetValue(this->getHandleCR(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+    DftiSetValue(this->getHandleCR(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
+    DftiSetValue(this->getHandleCR(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
+    //complex data goes in, real data comes out
+    DftiSetValue(this->getHandleCR(), DFTI_INPUT_STRIDES, complex_strides);
+    DftiSetValue(this->getHandleCR(), DFTI_OUTPUT_STRIDES, real_strides);
+    DftiSetValue(this->getHandleCR(), DFTI_BACKWARD_SCALE, scale);
+    DftiCommitDescriptor(this->getHandleCR());
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: run the in-place COMPLEX->COMPLEX transform on mem_ptr using the
+  descriptor prepared by setupFFT. forward selects the direction.
+  ndim, N and streamId are not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool forward) {
+
+  _Complex double *data = (_Complex double*) mem_ptr;
+
+#pragma offload target(mic:0) in(data:length(0) DKS_RETAIN DKS_REUSE) in(forward)
+  {
+    if (!forward) {
+      DftiComputeBackward(this->getHandle(), data);
+    } else {
+      DftiComputeForward(this->getHandle(), data);
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: run the inverse COMPLEX->COMPLEX transform in place on mem_ptr.
+  Thin wrapper around executeFFT with forward = false. The parameter list
+  matches the four-argument declaration in MICFFT.h (streamId defaults to -1 there).
+  Return: whatever executeFFT returns
+*/
+int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
+  return executeFFT(mem_ptr, ndim, N, streamId, false);
+}
+
+/*
+  Info: run the out-of-place REAL->COMPLEX transform using the descriptor
+  prepared by setupFFTRC. in_ptr is read as real data, out_ptr receives the
+  complex result. ndim, N and streamId are not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {
+
+  double *real_ptr = (double*) in_ptr;
+  _Complex double *compl_ptr = (_Complex double *) out_ptr;
+
+  //length(0) with retain/reuse: both buffers are referenced by address only, nothing is copied
+#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) 
+  {
+    DftiComputeForward(this->getHandleRC(), real_ptr, compl_ptr);
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: run the out-of-place COMPLEX->REAL transform using the descriptor
+  prepared by setupFFTCR. in_ptr is read as complex data, out_ptr receives the
+  real result. ndim, N and streamId are not used.
+  Return: DKS_SUCCESS (DFTI status codes are not inspected)
+*/
+int MICFFT::executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {
+
+  double *real_ptr = (double*) out_ptr;
+  _Complex double *compl_ptr = (_Complex double *) in_ptr;
+
+  //length(0) with retain/reuse: both buffers are referenced by address only, nothing is copied
+#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) 
+  {
+    DftiComputeBackward(this->getHandleCR(), compl_ptr, real_ptr);
+  }
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  Info: divide the real and imaginary part of every element of mem_ptr by
+  N[0]*N[1]*N[2], in parallel, inside an offload region on mic:0.
+  ndim and streamId are not used.
+  Return: DKS_SUCCESS
+*/
+int MICFFT::normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
+
+  _Complex double *data = (_Complex double*) mem_ptr;
+  int count = N[0] * N[1] * N[2];
+
+#pragma offload target(mic:0) in(data:length(0) DKS_RETAIN DKS_REUSE) in(count)
+  {
+#pragma omp parallel for
+    for (int idx = 0; idx < count; idx++) {
+      __real__ data[idx] = __real__ data[idx] / count;
+      __imag__ data[idx] = __imag__ data[idx] / count;
+    }
+  }
+
+  return DKS_SUCCESS;
+
+}
+
diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h
new file mode 100644
index 0000000..626fc19
--- /dev/null
+++ b/src/MIC/MICFFT.h
@@ -0,0 +1,79 @@
+#ifndef H_MIC_FFT
+#define H_MIC_FFT
+
+#include <iostream>
+#include <complex>
+
+#include <offload.h>
+#include <mkl_dfti.h>
+
+#include "../Algorithm/DKSFFT.h"
+#include "MICBase.h"
+
+/*
+  FFT implementation for Intel MIC built on MKL DFTI descriptors.
+  One descriptor is kept per transform kind (complex->complex, real->complex,
+  complex->real); the setup* routines create them and the execute* routines use them.
+*/
+class MICFFT : public DKSFFT {
+
+private:
+
+  //MIC base object handed in through the constructor
+  MICBase *m_micbase;
+
+  /// Internal FFT object for performing serial FFTs.
+#pragma offload_attribute(push,target(mic))
+  DFTI_DESCRIPTOR_HANDLE FFTHandle_m; //declspec only works for global variables
+  DFTI_DESCRIPTOR_HANDLE handle;
+  DFTI_DESCRIPTOR_HANDLE rc_handle; //handle for REAL->COMPLEX
+  DFTI_DESCRIPTOR_HANDLE cr_handle; //handle for COMPLEX->REAL
+
+#pragma offload_attribute(pop)
+
+  /* accessor for the COMPLEX->COMPLEX descriptor */
+  __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle(void) { 
+    return FFTHandle_m; 
+  }
+
+  /* accessor for the spare descriptor `handle` */
+  __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle1(void) { 
+    return handle; 
+  }
+
+  /* accessor for the REAL->COMPLEX descriptor */
+  __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleRC(void) { 
+    return rc_handle; 
+  }
+
+  /* accessor for the COMPLEX->REAL descriptor */
+  __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleCR(void) { 
+    return cr_handle; 
+  }
+
+public:
+
+  /* constructor */
+  MICFFT(MICBase *base);
+
+  /* destructor */
+  ~MICFFT();
+
+  /* 
+     Info: setup mkl fft
+     Return: success or error code
+  */ 
+  int setupFFT(int ndim, int N[3]);
+  //BENI: setup mkl fft for REAL->COMPLEX, scale defaults to 1.0
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  //BENI: setup mkl fft for COMPLEX->REAL, scale defaults to 1.0
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+
+  /* execute FFT on MIC */
+  int executeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
+
+  /* execute IFFT on MIC */
+  int executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
+
+  /* execute REAL->COMPLEX FFT on MIC */
+  int executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
+
+  /* execute COMPLEX->REAL FFT on MIC */
+  int executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
+
+  /* normalize IFFT on MIC */
+  int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
+
+};
+
+#endif
diff --git a/src/MIC/MICGreensFunction.cpp b/src/MIC/MICGreensFunction.cpp
new file mode 100644
index 0000000..6725a1e
--- /dev/null
+++ b/src/MIC/MICGreensFunction.cpp
@@ -0,0 +1,307 @@
+#include "MICGreensFunction.hpp"
+#include<stdio.h>
+#include<complex>
+#include <cstring>
+
+/* constructor: remember the MIC base object (stored, not owned) */
+MICGreensFunction::MICGreensFunction(MICBase *base)
+  : m_micbase(base)
+{
+}
+
+/* destructor: the object owns no resources, so there is nothing to release */
+MICGreensFunction::~MICGreensFunction()
+{
+}
+
+
+/* compute greens integral analytically */
+// Version with extended domain
+/*
+  int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,double hr_m1, double hr_m2) {
+  double *tmp_ptr = (double*) tmp_ptr_;
+  #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
+  {
+  std::memset(tmp_ptr,0,(I+1)*(J+1)*(K+1));
+  double cellVolume = hr_m0 * hr_m1 * hr_m2;
+  #pragma omp parallel for collapse(3) schedule(dynamic) 
+  for (int k = 0; k < K; k++) {
+  for (int j = 0; j < J; j++) {
+  for (int i = 0; i < I; i++) {
+
+  double vv0 = i * hr_m0 - hr_m0 / 2;
+  double vv1 = j * hr_m1 - hr_m1 / 2;
+  double vv2 = k * hr_m2 - hr_m2 / 2;
+
+  double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+  double tmpgrn = 0;
+  tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+  tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+  tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+
+  tmpgrn = tmpgrn / 2;
+
+  tmpgrn += vv1 * vv2 * log(vv0 + r);	
+  tmpgrn += vv0 * vv2 * log(vv1 + r);
+  tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+  tmpgrn = tmpgrn / cellVolume;
+
+  tmp_ptr[k*(J+1)*(I+1) + j*(I+1) + i] = tmpgrn;
+  }
+  }
+  }
+  }
+  return 0;
+  }
+*/
+
+/*
+  Compute the analytic Green's integral on an I x J x K grid and store it in tmp_ptr_
+  (index = k*J*I + j*I + i). The evaluation point of cell (i,j,k) is shifted by half
+  a cell: (i*hr_m0 - hr_m0/2, j*hr_m1 - hr_m1/2, k*hr_m2 - hr_m2/2).
+  tmp_ptr_ : buffer of at least I*J*K doubles; every element is written
+  hr_m0..2 : cell sizes; the result is divided by their product
+  Return: 0
+*/
+int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
+					  double hr_m1, double hr_m2) 
+{
+
+  double *tmp_ptr = (double*) tmp_ptr_;
+#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
+  {
+    //memset takes a byte count: clear all I*J*K doubles, not just the first I*J*K bytes
+    std::memset(tmp_ptr, 0, (size_t)I * J * K * sizeof(double));
+    double cellVolume = hr_m0 * hr_m1 * hr_m2;
+#pragma omp parallel for collapse(3) schedule(dynamic) 
+    for (int k = 0; k < K; k++) {
+      for (int j = 0; j < J; j++) {
+	for (int i = 0; i < I; i++) {
+
+	  //half-cell shifted coordinates of this grid point
+	  double vv0 = i * hr_m0 - hr_m0 / 2;
+	  double vv1 = j * hr_m1 - hr_m1 / 2;
+	  double vv2 = k * hr_m2 - hr_m2 / 2;
+
+	  double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+	  //arctangent terms (halved below)
+	  double tmpgrn = 0;
+	  tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+	  tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+	  tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+
+	  tmpgrn = tmpgrn / 2;
+
+	  //logarithm terms
+	  tmpgrn += vv1 * vv2 * log(vv0 + r);	
+	  tmpgrn += vv0 * vv2 * log(vv1 + r);
+	  tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+	  tmpgrn = tmpgrn / cellVolume;
+
+	  tmp_ptr[k*(J)*(I) + j*(I) + i] = tmpgrn;
+	}
+      }
+    }
+  }
+  return 0;
+}
+
+
+
+/* perform the actual integration */
+// version with extended domain
+/*
+  int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
+  double *tmp_ptr = (double*) tmp_ptr_;
+  double *mem_ptr = (double*) mem_ptr_;
+
+// the actual integration
+#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+{
+int Ii = I;
+int Jj = J;
+int Kk = K;
+int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
+std::memset(mem_ptr,0,II*JJ*KK);
+I=I+1; J=J+1; K=K+1;
+
+#pragma omp parallel for collapse(3)
+for (int i=0; i<Ii; i++) {
+for (int j=0; j<Jj; j++) {
+for (int k=0; k<Kk; k++) {
+//mem_ptr[k*JJ*II + j*II + i] = 0.0;
+mem_ptr[k*JJ*II + j*II + i]  = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
+}
+}
+}
+}
+return 0;
+}
+*/
+
+/*
+  int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
+  double *tmp_ptr = (double*) tmp_ptr_;
+  double *mem_ptr = (double*) mem_ptr_;
+
+// the actual integration
+#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+{
+int Ii = I;
+int Jj = J;
+int Kk = K;
+int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
+std::memset(mem_ptr,0,II*JJ*KK);
+//I=I+1; J=J+1; K=K+1;
+
+#pragma omp parallel for collapse(3)
+for (int i=0; i<Ii; i++) {
+for (int j=0; j<Jj; j++) {
+for (int k=0; k<Kk; k++) {
+//mem_ptr[k*JJ*II + j*II + i] = 0.0;
+mem_ptr[k*JJ*II + j*II + i]  = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
+mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
+mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
+}
+}
+}
+}
+return 0;
+}
+*/
+
+//CUDA similar version:
+/*
+  Combine the eight neighbouring values of the Green's integral (tmp_ptr_, an
+  I x J x K grid, index = i + j*I + k*I*J) into mem_ptr_, which is addressed with
+  row length 2*(I-1) and plane size 2*(I-1)*2*(J-1). Neighbours that fall outside
+  the I x J x K grid contribute 0.
+  Return: 0
+  NOTE(review): the memset length is an element count passed as a byte count, so
+  only part of the buffer is cleared; left unchanged because the allocated size
+  of mem_ptr_ is not visible here - confirm and multiply by sizeof(double).
+*/
+int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
+  double *tmpgreen = (double*) tmp_ptr_;
+  double *mem_ptr = (double*) mem_ptr_;
+
+  // the actual integration
+#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+  {
+    int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); 
+    std::memset(mem_ptr,0,II*JJ*KK);
+
+    int NI_tmp=I;
+    int NJ_tmp=J;
+    int NK_tmp=K;
+
+#pragma omp parallel for collapse(3)
+    for (int i=0; i<I; i++) {
+      for (int j=0; j<J; j++) {
+	for (int k=0; k<K; k++) {
+	  //declared inside the loop body so each thread has private copies;
+	  //declaring them before the parallel region made them shared (data race)
+	  double tmp0 = 0, tmp1 = 0, tmp2 = 0, tmp3 = 0;
+	  double tmp4 = 0, tmp5 = 0, tmp6 = 0, tmp7 = 0;
+
+	  if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
+	    tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+	  if (i+1 < NI_tmp)
+	    tmp1 = tmpgreen[(i+1) +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+
+	  if (j+1 < NJ_tmp)
+	    tmp2 = tmpgreen[ i    + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];
+
+	  if (k+1 < NK_tmp)
+	    tmp3 = tmpgreen[ i    +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+
+	  if (i+1 < NI_tmp && j+1 < NJ_tmp)
+	    tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp +  k * NI_tmp * NJ_tmp];  
+
+	  if (i+1 < NI_tmp && k+1 < NK_tmp)
+	    tmp5 = tmpgreen[(i+1) +  j    * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+
+	  if (j+1 < NJ_tmp && k+1 < NK_tmp)
+	    tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
+
+	  tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
+
+	  //alternating-sign sum over the eight corner values
+	  double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
+
+	  mem_ptr[i + j*II +  k*II*JJ] = tmp_rho;
+
+	}
+      }
+    }
+  }
+  return 0;
+}
+
+
+
+
+/*
+  Mirror the field stored in mem_ptr_ (row length 2*I, plane size 2*I*2*J) about
+  index I, then J, then K, so that the upper half of each dimension becomes a
+  reflection of the lower half. The three passes run one after another; each pass
+  reads only indices that the same pass does not write.
+  Return: 0
+*/
+int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
+  double *mem_ptr = (double*) mem_ptr_;	
+
+#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
+  {
+    int II = 2*I; int JJ = 2*J;
+    mem_ptr[0] = mem_ptr[II*JJ];
+
+    //id / id_mirr are declared inside each loop body so every thread has its own
+    //copy; declaring them before the parallel loops made them shared (data race)
+
+    //pass 1: mirror along the first dimension
+#pragma omp parallel for collapse(3) schedule(dynamic) 
+    for (int ie = I+1; ie<2*I; ++ie) {
+      for(int j = 0; j<= J; ++j) {
+	for (int k=0; k<= K; ++k) {
+	  int id = k * II * JJ + j * II + ie;
+	  int id_mirr = k * II * JJ + j * II + (2*I-ie);
+	  mem_ptr[id] = mem_ptr[id_mirr];
+	}
+      }
+    }
+
+    //pass 2: mirror along the second dimension, over the full first dimension
+#pragma omp parallel for collapse(3) schedule(dynamic) 
+    for (int ai = 0; ai<2*I; ++ai) {
+      for(int je = J+1; je< 2*J; ++je) {
+	for (int k=0; k<= K; ++k) {
+	  int id = k * II * JJ + je * II + ai;
+	  int id_mirr = k * II * JJ + (2*J-je) * II + ai;
+	  mem_ptr[id] = mem_ptr[id_mirr];
+	}
+      }
+    }
+
+    //pass 3: mirror along the third dimension, over the full first two dimensions
+#pragma omp parallel for collapse(3) schedule(dynamic) 
+    for (int ai = 0; ai<2*I; ++ai) {
+      for(int aj = 0; aj< 2*J; ++aj) {
+	for (int ke=K+1; ke< 2*K; ++ke) {
+	  int id = ke * II * JJ + aj * II + ai;
+	  int id_mirr = (2*K-ke) * II * JJ + aj * II + ai;
+	  mem_ptr[id] = mem_ptr[id_mirr];
+	}
+      }
+    }
+
+  }
+  return 0;
+}
+
+/*
+  Element-wise complex multiplication: field1[i] *= field2[i] for i in [0, size),
+  executed in parallel inside an offload region on mic:0.
+  Return: 0
+*/
+int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
+  _Complex double *field1 = (_Complex double *) mem_ptr1_;
+  _Complex double *field2 = (_Complex double *) mem_ptr2_;
+
+#pragma offload target(mic:0) in(field1:length(0) DKS_RETAIN DKS_REUSE) in (field2:length(0) DKS_RETAIN DKS_REUSE) in(size)
+  {
+#pragma omp parallel for
+    for (int idx = 0; idx < size; ++idx) {
+      field1[idx] *= field2[idx];
+    }
+  }
+
+  return 0;
+}
+
+
+
+
+
+
+
+
diff --git a/src/MIC/MICGreensFunction.hpp b/src/MIC/MICGreensFunction.hpp
new file mode 100644
index 0000000..0b83d56
--- /dev/null
+++ b/src/MIC/MICGreensFunction.hpp
@@ -0,0 +1,44 @@
+//AUTHOR: Benjamin Ulmer
+
+#ifndef H_MIC_GREENS
+#define H_MIC_GREENS
+
+#include <iostream>
+#include <complex>
+
+#include <offload.h>
+#include <mkl_dfti.h>
+
+#include "MICBase.h"
+
+#define DKS_SUCCESS 0
+#define DKS_ERROR 1
+
+/*
+  Green's function helper routines for Intel MIC. All routines take opaque
+  device pointers (void*) and return an int status.
+*/
+class MICGreensFunction {
+
+private:
+  //MIC base object handed in through the constructor
+  MICBase *m_micbase;
+
+public:
+
+  /* constructor */
+  MICGreensFunction(MICBase *base);
+
+  /* destructor */
+  ~MICGreensFunction();
+
+  /* compute greens integral analytically on an I x J x K grid with cell sizes hr_m0..2 */
+  int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
+
+  /* perform the actual integration: combine tmp_ptr_ values into mem_ptr_ */
+  int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
+
+  /* Mirror rho-Field in place */
+  int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
+
+  /* multiply complex fields element-wise, result stored in mem_ptr1_ */
+  int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
+
+};
+
+#endif
diff --git a/src/MIC/MICMergeSort.h b/src/MIC/MICMergeSort.h
new file mode 100644
index 0000000..408037b
--- /dev/null
+++ b/src/MIC/MICMergeSort.h
@@ -0,0 +1,116 @@
+#include <iostream>
+#include <cstdlib>
+#include <omp.h>
+
+/* default comparison function: true when x compares greater than y */
+template<typename T>
+inline bool greaterThan(T x, T y) {
+  const bool isGreater = (x > y);
+  return isGreater;
+}
+
+/* exchange the values of a and b through a temporary copy */
+template<typename T>
+void mergeswap(T &a, T &b) {
+  T saved = b;
+  b = a;
+  a = saved;
+}
+
+/*
+  Recursive step of the merge sort on a[ibegin, iend). Ranges shorter than 500
+  elements are sorted directly with quick_sort; longer ranges are split in the
+  middle, the first half is sorted in an OpenMP task while the second half is
+  sorted by the current thread, and the halves are merged through scratch array b.
+*/
+template <typename T>
+void split_merge(T *a, int ibegin, int iend, T *b, bool (*comp)(T, T) ) {
+
+  int count = iend - ibegin;
+
+  if (count >= 500) {
+    int imiddle = (iend + ibegin) / 2;
+
+#pragma omp task
+    split_merge(a, ibegin, imiddle, b, comp);
+    split_merge(a, imiddle, iend, b, comp);
+#pragma omp taskwait
+
+    //both halves are sorted once taskwait returns
+    merge(a, ibegin, imiddle, iend, b, comp);
+  } else {
+    quick_sort(a + ibegin, 0, count - 1, comp);
+  }
+
+}
+
+/*
+  Merge the sorted runs a[ibegin, imiddle) and a[imiddle, iend) into
+  b[ibegin, iend) and copy the result back into a. An element of the first run
+  is taken only while comp(second, first) holds (or the second run is used up).
+*/
+template <typename T>
+void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T)) {
+
+  int left = ibegin;
+  int right = imiddle;
+
+  //fill b one slot at a time from whichever run supplies the next element
+  for (int out = ibegin; out < iend; ++out) {
+    bool takeLeft = false;
+    if (left < imiddle) {
+      if (right >= iend)
+        takeLeft = true;
+      else
+        takeLeft = comp(a[right], a[left]);
+    }
+
+    if (takeLeft) {
+      b[out] = a[left];
+      ++left;
+    } else {
+      b[out] = a[right];
+      ++right;
+    }
+  }
+
+  //copy b back to a
+  for (int k = ibegin; k < iend; ++k)
+    a[k] = b[k];
+
+}
+
+/*
+  Partition a[start..end] (inclusive) around the pivot a[start]: every element e
+  with comp(pivot, e) is moved in front of the pivot's final position.
+  Returns that final position.
+*/
+template <typename T>
+int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
+  T pivot = a[start];
+  int boundary = start;
+
+  int idx = start + 1;
+  while (idx <= end) {
+    if ( comp(pivot, a[idx]) ) {
+      ++boundary;
+      mergeswap(a[idx], a[boundary]);
+    }
+    ++idx;
+  }
+
+  //put the pivot between the two groups
+  mergeswap(a[boundary], a[start]);
+  return boundary;
+}
+
+/*
+  Sort list[0, n) with a task-parallel merge sort. comp(x, y) == true means x
+  must come after y; the default greaterThan gives ascending order.
+  A scratch array of n elements is allocated for the merge steps and released
+  before returning. Arrays with fewer than two elements are left untouched.
+*/
+template <typename T>
+void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
+
+  //nothing to sort; also avoids allocating a zero- or negative-sized scratch array
+  if (n < 2)
+    return;
+
+#pragma omp parallel
+  {
+#pragma omp single
+    {
+      T *b = new T[n];
+      split_merge(list, 0, n, b, comp);
+      //split_merge waits for its child tasks (taskwait) before returning,
+      //so the scratch array is no longer in use here
+      delete[] b;
+    }
+  }
+}
+
+/*
+  Quicksort on list[start..end] (inclusive bounds). Ranges with end - start < 9
+  are handed to insertion_sort; larger ranges are partitioned and both sides
+  are sorted recursively.
+*/
+template <typename T>
+void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
+
+  //empty or single-element range
+  if (start >= end)
+    return;
+
+  //for small elements move to insertion sort
+  if ( (end - start) < 9 ) {
+    insertion_sort(list, start, end + 1, comp);
+    return;
+  }
+
+  int pivotPos = partition(list, start, end, comp);
+  quick_sort(list, start, pivotPos - 1, comp);
+  quick_sort(list, pivotPos + 1, end, comp);
+
+}
+
+/*
+  Insertion sort on list[start, end) (end is exclusive). comp(x, y) == true
+  means x must come after y. Elements outside [start, end) are never read or moved.
+*/
+template <typename T>
+void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
+
+  for (int i = start + 1; i < end; i++) {
+    T key = list[i];
+    int j = i - 1;
+    //stop at `start`, not 0: otherwise elements in front of the requested
+    //range would be compared against and shifted into it
+    while ( j >= start && comp(list[j], key) ) {
+      list[j + 1] = list[j];
+      j--;
+    }
+    list[j + 1] = key;
+  }
+
+}
diff --git a/src/OpenCL/CMakeLists.txt b/src/OpenCL/CMakeLists.txt
new file mode 100644
index 0000000..19cedbe
--- /dev/null
+++ b/src/OpenCL/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Host-side sources of the OpenCL backend.
+SET (_SRCS
+  OpenCLBase.cpp
+  OpenCLFFT.cpp
+  OpenCLChiSquare.cpp
+  OpenCLCollimatorPhysics.cpp
+  OpenCLChiSquareRuntime.cpp
+  )
+
+# Headers of the OpenCL backend; installed below.
+SET (_HDRS
+  OpenCLBase.h
+  OpenCLFFT.h
+  OpenCLChiSquare.h
+  OpenCLCollimatorPhysics.h
+  OpenCLChiSquareRuntime.h
+  )
+
+# OpenCL kernel files. They are only installed here, not compiled.
+SET (_KERNELS
+  OpenCLKernels/OpenCLChiSquare.cl
+  OpenCLKernels/OpenCLFFT.cl
+  OpenCLKernels/OpenCLFFTStockham.cl
+  OpenCLKernels/OpenCLTranspose.cl
+  OpenCLKernels/OpenCLCollimatorPhysics.cl
+  OpenCLKernels/OpenCLChiSquareRuntime.cl
+  )
+
+# ADD_SOURCES / ADD_HEADERS are defined outside this file.
+ADD_SOURCES (${_SRCS})
+ADD_HEADERS (${_HDRS})
+
+# Install headers and kernel files under include/OpenCL.
+INSTALL (FILES ${_HDRS} DESTINATION include/OpenCL)
+INSTALL (FILES ${_KERNELS} DESTINATION include/OpenCL/OpenCLKernels)
diff --git a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp
new file mode 100644
index 0000000..b40fd64
--- /dev/null
+++ b/src/OpenCL/OpenCLBase.cpp
@@ -0,0 +1,1132 @@
+#include "OpenCLBase.h"
+
+//Definitions of the static OpenCLBase members. Being static, the context, command
+//queue, platform id, device id and last event are shared by all OpenCLBase objects.
+cl_context OpenCLBase::m_context = NULL;
+cl_command_queue OpenCLBase::m_command_queue = NULL;
+cl_platform_id OpenCLBase::m_platform_id = NULL;
+cl_device_id OpenCLBase::m_device_id = NULL;
+cl_event OpenCLBase::m_last_event = NULL;
+
+/*
+  Constructor: reset the program, kernel and kernel-file members and mark the
+  default random states as not created. The static context, command queue,
+  platform id and device id are left as they are; the static m_last_event is
+  reset to NULL.
+*/
+OpenCLBase::OpenCLBase() {
+
+  //no random states allocated yet
+  defaultRndSet = 0;
+
+  //nothing loaded or built yet
+  m_kernel_file = NULL;
+  m_program = NULL;
+  m_kernel = NULL;
+
+  //static member, reset on every construction
+  m_last_event = NULL;
+
+}
+
+/*
+  Destructor: run the general clean-up, reset the static last-event handle and
+  release the default random states if ocl_createRndStates created them.
+  NOTE(review): the random-state buffer is freed after ocl_cleanUp() has run -
+  confirm that ocl_freeMemory is still valid at that point.
+*/
+OpenCLBase::~OpenCLBase() {
+  ocl_cleanUp();
+  m_last_event = NULL;
+
+  //defaultRndSet is set to 1 only by ocl_createRndStates
+  if (defaultRndSet == 1)
+    ocl_deleteRndStates();
+}
+
+/*
+  Create `size` random number generator states on the device.
+  Loads OpenCLCollimatorPhysics.cl from the OPENCL_KERNELS directory, allocates
+  sizeof(RNDState)*size bytes in defaultRndState and runs the "initRand" kernel
+  with seed 0 and one work item per state (work-group size 1).
+  Return: OCL_SUCCESS
+  NOTE(review): the status of the allocation (ierr) and of the kernel calls is
+  not inspected - confirm whether failures should be reported to the caller.
+*/
+int OpenCLBase::ocl_createRndStates(int size) {
+  //load kernel; the path buffer is sized to the actual path length so a long
+  //install prefix cannot overflow it
+  const char *kernel_dir = OPENCL_KERNELS;
+  const char *kernel_name = "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl";
+  char * kernel_file = new char[strlen(kernel_dir) + strlen(kernel_name) + 1];
+  strcpy(kernel_file, kernel_dir);
+  strcat(kernel_file, kernel_name);
+  ocl_loadKernel(kernel_file);
+  delete[] kernel_file;
+
+  //allocate memory for rand states
+  int ierr;
+  defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
+
+  //exec kernel; seed is unsigned to match the argument size passed below
+  unsigned int seed = 0;
+  ocl_createKernel("initRand");
+  ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
+  ocl_setKernelArg(1, sizeof(unsigned int), &seed);
+  ocl_setKernelArg(2, sizeof(int), &size);
+  
+  size_t work_items = size;
+  size_t work_group_size = 1;
+
+  ocl_executeKernel(1, &work_items, &work_group_size);
+
+  //remembered so the destructor knows the states must be released
+  defaultRndSet = 1;
+
+  return OCL_SUCCESS;
+
+}
+
+/*
+  Release the device memory holding the default random states and clear the
+  flag that marks them as created.
+  Return: OCL_SUCCESS
+*/
+int OpenCLBase::ocl_deleteRndStates() {
+  ocl_freeMemory(defaultRndState);
+
+  //states no longer exist
+  defaultRndSet = 0;
+  return OCL_SUCCESS;
+}
+
+
+/*
+  get platform id and device id of device specified by device_name (device name can be -mic, -cpu, -gpu, -all)
+  finds the first device of the specified type and saves device id and platform id
+  Return: OCL_SUCCESS when a device was found, OCL_ERROR when no platform offers
+  such a device, otherwise the error code of the failing call.
+  The temporary platform list and name buffers are released on every path.
+*/
+int OpenCLBase::ocl_getDevice(const char* device_name) {
+
+  int ierr = 0;
+
+  cl_platform_id *tmp_platform_ids;
+  cl_uint num_of_platforms, num_of_devices;
+
+  //get device type from name, return with error on failure
+  ierr = ocl_getDeviceType(device_name, m_device_type);
+  if (ierr != OCL_SUCCESS) {
+    DEBUG_MSG("Can't find device, OpenCL error: " << ierr << ", " << device_name);
+    return ierr;
+  }
+
+  //find all available platforms
+  ierr = clGetPlatformIDs(0, NULL, &num_of_platforms);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); 
+    return ierr;
+  }
+		
+  tmp_platform_ids = new cl_platform_id[num_of_platforms];
+  ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr);
+    delete[] tmp_platform_ids;
+    return ierr;
+  }
+
+  //search each platform for specified device
+  for (unsigned int i = 0; i < num_of_platforms; i++) {
+	
+    //reset first: the count may be left untouched when the platform has no matching device
+    num_of_devices = 0;
+
+    //get number of devices and first avaialble device id
+    ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 1, &m_device_id, &num_of_devices);
+		
+    if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) {
+      DEBUG_MSG("Can't find device id's, OpenCL error: " << ierr);
+      delete[] tmp_platform_ids;
+      return ierr;
+    }
+		
+    //if device exists in current platform
+    if (num_of_devices > 0) {
+      //save platform id
+      m_platform_id = tmp_platform_ids[i];
+
+      //get the name of device that will be used and print its name
+      size_t size;
+      clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size);
+			
+      char* info = new char[size];
+      clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, info, NULL);
+
+      DEBUG_MSG("Accelerator device: " << info);
+      delete[] info;
+
+      //get the name of the platform
+      clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, 0, NULL, &size);
+      info = new char[size];
+      clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, size, info, NULL);
+			
+      DEBUG_MSG("Accelerator platform: " << info);
+      delete[] info;
+
+      //the platform id was copied into m_platform_id, the list is no longer needed
+      delete[] tmp_platform_ids;
+      return OCL_SUCCESS;
+    }
+  }
+
+  delete[] tmp_platform_ids;
+  return OCL_ERROR;
+}
+
+/*
+  Count the devices of type m_device_type over all available platforms.
+  ndev : receives the total count on success
+  Return: DKS_SUCCESS on success; DKS_ERROR, OCL_ERROR or the OpenCL error code
+  when a query fails (return codes kept as they were).
+  The temporary platform list is released on every path.
+*/
+int OpenCLBase::ocl_getDeviceCount(int &ndev) {
+  int ierr = DKS_SUCCESS;
+
+  cl_platform_id *tmp_platform_ids;
+  cl_uint num_of_platforms, num_of_devices, total_devices;
+	
+  //find platform count
+  ierr = clGetPlatformIDs(0, NULL, &num_of_platforms);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr);
+    return DKS_ERROR;
+  }
+	
+  //find all platform IDs
+  tmp_platform_ids = new cl_platform_id[num_of_platforms];
+  ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr);
+    delete[] tmp_platform_ids;
+    return ierr;
+  }
+	
+  //for each platform find number of devices
+  total_devices = 0;
+  for (unsigned int i = 0; i < num_of_platforms; i++) {
+    //reset first: the count may be left untouched when the platform has no matching device
+    num_of_devices = 0;
+
+    //get device count for platform
+    ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices);
+    if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) {
+      DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr);
+      delete[] tmp_platform_ids;
+      return OCL_ERROR;
+    }
+    total_devices += num_of_devices;
+  }
+
+  delete[] tmp_platform_ids;
+
+  ndev = total_devices;
+  return DKS_SUCCESS;
+    
+}
+
+/*
+  Query the name of the currently selected device (m_device_id).
+  device_name : receives the name on success, left unchanged on failure
+  Return: DKS_SUCCESS on success, DKS_ERROR when the OpenCL query fails
+*/
+int OpenCLBase::ocl_getDeviceName(std::string &device_name) {
+
+  //first call: ask only for the length of the name (including the terminator)
+  size_t size = 0;
+  cl_int err = clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size);
+  if (err != CL_SUCCESS || size == 0) {
+    DEBUG_MSG("Can't get device name, OpenCL error: " << err);
+    return DKS_ERROR;
+  }
+
+  //second call: fetch the name itself
+  char* name = new char[size];
+  err = clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, name, NULL);
+  if (err != CL_SUCCESS) {
+    DEBUG_MSG("Can't get device name, OpenCL error: " << err);
+    delete[] name;
+    return DKS_ERROR;
+  }
+
+  device_name = name;
+  delete[] name;
+  return DKS_SUCCESS;
+}
+
+/*
+  Select the device with global index `device` among all devices of type
+  m_device_type. Devices are numbered platform by platform in the order the
+  platforms are reported. On success m_device_id and m_platform_id are updated
+  and a context is created.
+  Return: result of ocl_createContext() when the device exists, DKS_ERROR when
+  the index is out of range (including negative) or a platform query fails.
+*/
+int OpenCLBase::ocl_setDevice(int device) {
+  
+  int ierr;
+
+  cl_device_id *tmp_device_ids;
+  cl_platform_id *tmp_platform_ids;
+  cl_int *tmp_device_counts;
+  cl_uint num_of_platforms, num_of_devices;
+
+  //find all available platforms
+  ierr = clGetPlatformIDs(0, NULL, &num_of_platforms);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); 
+    return DKS_ERROR;
+  }
+
+  tmp_platform_ids = new cl_platform_id[num_of_platforms];
+  tmp_device_counts = new cl_int[num_of_platforms];
+  ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr);
+    delete[] tmp_platform_ids;
+    delete[] tmp_device_counts;
+    return DKS_ERROR;
+  }
+
+  //count the devices of the requested type in every platform
+  for (unsigned int i = 0; i < num_of_platforms; i++) {
+
+    //get the number of devices in the platform
+    num_of_devices = 0;
+    clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices);
+    tmp_device_counts[i] = num_of_devices;
+  }
+
+  //check in which platform the selected device is located
+  int tmp_count = 0;
+  int checked_count = 0;
+  int id = -1;
+  int platform = -1; 
+  for (unsigned int i = 0; i < num_of_platforms; i++) {
+    tmp_count += tmp_device_counts[i];
+    if (device < tmp_count) {
+      //index of the device inside platform i
+      id = device - checked_count;
+      platform = i;
+      break;
+    }
+    checked_count += tmp_device_counts[i];
+  }
+
+  ierr = DKS_ERROR;
+  //id == 0 is the first device of the platform and therefore valid;
+  //a negative id means the requested index was negative
+  if (id >= 0) {
+    num_of_devices = tmp_device_counts[platform];
+    tmp_device_ids = new cl_device_id[num_of_devices];
+    clGetDeviceIDs(tmp_platform_ids[platform], m_device_type, num_of_devices, tmp_device_ids, NULL);
+
+    m_device_id = tmp_device_ids[id];
+    m_platform_id = tmp_platform_ids[platform];
+    ierr = ocl_createContext();
+
+    delete[] tmp_device_ids;
+  }
+
+  delete[] tmp_platform_ids;
+  delete[] tmp_device_counts;
+
+  return ierr;
+}
+
+/*
+  Collect the global indices of devices (type m_device_type) with distinct names.
+  Devices are numbered platform by platform; for every device name only the index
+  of its first occurrence is appended to `devices`.
+  Return: DKS_SUCCESS on success, DKS_ERROR when the platform query fails
+*/
+int OpenCLBase::ocl_getUniqueDevices(std::vector<int> &devices) {
+
+  int ierr;
+
+  size_t size;
+  cl_device_id *tmp_device_ids;
+  cl_platform_id *tmp_platform_ids;
+  cl_uint num_of_platforms, num_of_devices;
+
+  //find all available platforms
+  ierr = clGetPlatformIDs(0, NULL, &num_of_platforms);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); 
+    return DKS_ERROR;
+  }
+
+  tmp_platform_ids = new cl_platform_id[num_of_platforms];
+  ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr);
+    delete[] tmp_platform_ids;
+    return DKS_ERROR;
+  }
+
+  //names seen so far; empty until the first device has been recorded
+  std::vector< std::string > names;
+  int checked_count = 0;
+  int id = 0;
+  for (unsigned int i = 0; i < num_of_platforms; i++) {
+
+    //get the number of devices in the platform
+    num_of_devices = 0;
+    clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices);
+
+    //platform without matching devices: nothing to query, numbering unchanged
+    if (num_of_devices == 0)
+      continue;
+
+    tmp_device_ids = new cl_device_id[num_of_devices];
+    clGetDeviceIDs(tmp_platform_ids[i], m_device_type, num_of_devices, tmp_device_ids, NULL);
+    
+    for (unsigned int j = 0; j < num_of_devices; j++) {
+      id = checked_count + j;
+      clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, 0, NULL, &size);
+      char* name = new char[size];
+      clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, size, name, NULL);
+      std::string target = name;
+      delete[] name;
+
+      //record the device only the first time its name shows up
+      bool isPresent = (std::find(names.begin(), names.end(), target) != names.end());
+      if (!isPresent) {
+	devices.push_back(id);
+	names.push_back(target);
+      }
+    }
+
+    checked_count += num_of_devices;
+    delete[] tmp_device_ids;
+  }
+
+  delete[] tmp_platform_ids;
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Translate a device name flag into the OpenCL device type to search for.
+  Recognized flags are "-mic" (accelerator), "-cpu", "-gpu" and "-all",
+  any other name selects CL_DEVICE_TYPE_DEFAULT.
+  Return: always OCL_SUCCESS
+*/
+int OpenCLBase::ocl_getDeviceType(const char* device_name, cl_device_type &device_type) {
+
+  if (strcmp(device_name, "-mic") == 0) {
+    device_type = CL_DEVICE_TYPE_ACCELERATOR;
+  } else if (strcmp(device_name, "-cpu") == 0) {
+    device_type = CL_DEVICE_TYPE_CPU;
+  } else if (strcmp(device_name, "-gpu") == 0) {
+    device_type = CL_DEVICE_TYPE_GPU;
+  } else if (strcmp(device_name, "-all") == 0) {
+    device_type = CL_DEVICE_TYPE_ALL;
+  } else {
+    //unknown flag, fall back to the default device type
+    device_type = CL_DEVICE_TYPE_DEFAULT;
+  }
+
+  return OCL_SUCCESS;
+}
+
+/*
+  Create the OpenCL context for m_device_id on m_platform_id and a command queue
+  on that context. Results are stored in m_context and m_command_queue.
+  Return: OCL_SUCCESS or the OpenCL error code of the failing call
+*/
+int OpenCLBase::ocl_createContext() {
+  cl_int status = CL_SUCCESS;
+
+  //zero terminated property list that binds the context to the selected platform
+  m_context_properties[0] = CL_CONTEXT_PLATFORM;
+  m_context_properties[1] = (cl_context_properties) m_platform_id;
+  m_context_properties[2] = 0;
+
+  //context holding only the selected device
+  m_context = clCreateContext(m_context_properties, 1, &m_device_id, NULL, NULL, &status);
+  if (status != CL_SUCCESS) {
+    DEBUG_MSG("Can't create context, OpenCL error: " << status);
+    return status;
+  }
+
+  //queue is created with no properties, so CL_QUEUE_PROFILING_ENABLE is not set
+  m_command_queue = clCreateCommandQueue(m_context, m_device_id, 0, &status);
+  if (status == CL_SUCCESS)
+    return OCL_SUCCESS;
+
+  DEBUG_MSG("Can't create command queue, OpenCL error: " << status);
+  return status;
+}
+
+/*
+  Read the file specified by kernel_file and compile the kernel code it contains.
+  The built program is stored in m_program (by ocl_compileProgram), from which
+  individual kernels can be extracted. The file name is remembered in m_kernel_file.
+  Return: OCL_ERROR if the file can't be opened or read, otherwise the result of
+  ocl_compileProgram
+*/
+int OpenCLBase::ocl_buildProgram(const char *kernel_file) {
+
+  //open file
+  FILE *fp = fopen(kernel_file, "rb");
+  if (!fp) {
+    DEBUG_MSG("Can't open kernel file: " << kernel_file);
+    return OCL_ERROR;
+  }
+
+  //get file size, ftell returns -1 on failure
+  long fsize = -1;
+  if (fseek(fp, 0, SEEK_END) == 0)
+    fsize = ftell(fp);
+  if (fsize < 0) {
+    DEBUG_MSG("Can't open kernel file: " << kernel_file);
+    fclose(fp);
+    return OCL_ERROR;
+  }
+
+  //read the whole file, the extra zeroed byte terminates the source string
+  std::vector<char> kernel_source((size_t)fsize + 1, '\0');
+  rewind(fp);
+  size_t nread = fread(&kernel_source[0], 1, (size_t)fsize, fp);
+  fclose(fp);
+  if (nread != (size_t)fsize) {
+    DEBUG_MSG("Can't open kernel file: " << kernel_file);
+    return OCL_ERROR;
+  }
+
+  //the source buffer is released automatically when this function returns
+  int ierr = ocl_compileProgram(&kernel_source[0]);
+
+  //save currently loaded kernel file; copy first in case kernel_file aliases m_kernel_file
+  char *file_copy = new char[strlen(kernel_file) + 1];
+  strcpy(file_copy, kernel_file);
+  //NOTE(review): assumes m_kernel_file is NULL or was allocated with new[] (as it is
+  //here) - confirm against the constructor and destructor
+  delete[] m_kernel_file;
+  m_kernel_file = file_copy;
+
+  return ierr;
+
+}
+
+/*
+  Create an OpenCL program from the source string kernel_source and build it with
+  the build options opts. The program is stored in m_program.
+  In debug mode the build status and build log are printed if the build fails.
+  Return: DKS_SUCCESS or DKS_ERROR
+*/
+int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts) {
+
+  cl_int ierr;
+
+  //create program from kernel
+  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
+    return DKS_ERROR;
+  }
+
+  //compile the program, no device list is passed
+  ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
+  if (ierr == CL_SUCCESS)
+    return DKS_SUCCESS;
+
+#ifdef DEBUG
+  //get build status
+  cl_build_status status = CL_BUILD_NONE;
+  clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
+
+  //get log size
+  size_t log_size = 0;
+  clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+  //get log message, the buffer really holds the size that is passed to OpenCL and
+  //the extra zeroed byte guarantees the log is terminated
+  std::vector<char> log(log_size + 1, '\0');
+  clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, log.size(), &log[0], NULL);
+
+  //print log message
+  DEBUG_MSG("Build failed! Status:" << status);
+  DEBUG_MSG("LOG: " << &log[0]);
+#endif
+
+  return DKS_ERROR;
+
+}
+
+
+
+//=========================================//
+//===============public functions==========//
+//=========================================//
+
+/*
+  Get all devices (CL_DEVICE_TYPE_ALL) from all platforms and print name, vendor
+  and type of each device. The banner lines are written to stdout, the device
+  details are written with DEBUG_MSG.
+  Return: OCL_SUCCESS, OCL_ERROR if the platform count or the devices can't be
+  queried, or the OpenCL error code if the platform ids can't be read
+*/
+int OpenCLBase::ocl_getAllDevices() {
+
+  cl_int ierr;
+  cl_uint num_of_platforms = 0;
+
+  //find platform count
+  ierr = clGetPlatformIDs(0, NULL, &num_of_platforms);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr);
+    return OCL_ERROR;
+  }
+
+  //find all platform IDs, vectors release their storage on every return path
+  std::vector<cl_platform_id> platform_ids(num_of_platforms);
+  ierr = clGetPlatformIDs(num_of_platforms,
+			  num_of_platforms > 0 ? &platform_ids[0] : NULL, NULL);
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr);
+    return ierr;
+  }
+
+  //collect the device ids of every platform
+  std::vector<cl_device_id> device_ids;
+  for (cl_uint i = 0; i < num_of_platforms; i++) {
+
+    //get device count for platform
+    cl_uint num_of_devices = 0;
+    ierr = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_of_devices);
+    if (ierr != CL_SUCCESS) {
+      DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr);
+      return OCL_ERROR;
+    }
+    if (num_of_devices == 0)
+      continue;
+
+    //get device ids
+    std::vector<cl_device_id> platform_devices(num_of_devices);
+    ierr = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, num_of_devices, &platform_devices[0], NULL);
+    if (ierr != CL_SUCCESS) {
+      DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr);
+      return OCL_ERROR;
+    }
+
+    device_ids.insert(device_ids.end(), platform_devices.begin(), platform_devices.end());
+  }
+
+  std::cout << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::cout << "============OpenCL============" << std::endl;
+  std::cout << "==============================" << std::endl;
+
+  for (size_t i = 0; i < device_ids.size(); i++) {
+
+    DEBUG_MSG("Device " << i+1 << ":");
+
+    //strings are read in two steps: query the length, then the content,
+    //the extra zeroed byte keeps the buffer terminated even if a query fails
+    size_t size = 0;
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 0, NULL, &size);
+    std::vector<char> device_name(size + 1, '\0');
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, size, &device_name[0], NULL);
+    DEBUG_MSG("Name: \"" << &device_name[0] << "\"");
+
+    size = 0;
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, 0, NULL, &size);
+    std::vector<char> device_vendor(size + 1, '\0');
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, size, &device_vendor[0], NULL);
+    DEBUG_MSG("Vendor: \"" << &device_vendor[0] << "\"");
+
+    cl_device_type device_type = 0;
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+
+    if (device_type == CL_DEVICE_TYPE_GPU) {
+      DEBUG_MSG("Device type: GPU");
+    } else if (device_type == CL_DEVICE_TYPE_CPU) {
+      DEBUG_MSG("Device type: CPU");
+    } else if (device_type == CL_DEVICE_TYPE_ACCELERATOR) {
+      DEBUG_MSG("Device type: Accelerator");
+    }
+
+    std::cout << "==============================" << std::endl;
+
+  }
+
+  return OCL_SUCCESS;
+}
+
+
+/*
+  Select a device by name (ocl_getDevice), print a short device summary
+  (ocl_deviceInfo with verbose off) and create the context and command queue
+  (ocl_createContext).
+  Return: DKS_SUCCESS or the error code of the step that failed
+*/
+int OpenCLBase::ocl_setUp(const char *device_name) {
+  cl_int status = ocl_getDevice(device_name);
+
+  //the remaining steps only run when a device was found
+  if (status == CL_SUCCESS) {
+    ocl_deviceInfo(false);
+    status = ocl_createContext();
+  }
+
+  return (status == CL_SUCCESS) ? DKS_SUCCESS : status;
+}
+
+/*
+  Load and compile kernel_file unless it is the file already recorded in m_kernel_file.
+  Return: OCL_SUCCESS or OCL_ERROR if building the program failed
+*/
+int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
+
+  //a build is needed when no file was loaded yet or a different file is requested
+  const bool needs_build = (m_kernel_file == NULL) || (strcmp(m_kernel_file, kernel_file) != 0);
+  if (!needs_build)
+    return OCL_SUCCESS;
+
+  if (ocl_buildProgram(kernel_file) != OCL_SUCCESS) {
+    DEBUG_MSG("Failed to build kernel file " << kernel_file);
+    return OCL_ERROR;
+  }
+
+  return OCL_SUCCESS;
+}
+
+/*
+  Compile the kernel source code passed in kernel_source with build options opts.
+  Return: the result of ocl_compileProgram
+*/
+int OpenCLBase::ocl_loadKernelFromSource(const char *kernel_source, const char *opts) {
+  return ocl_compileProgram(kernel_source, opts);
+}
+
+/*
+  Create a device buffer of size bytes in m_context, type is passed to
+  clCreateBuffer as the memory flags (read only, write only, read/write).
+  The OpenCL status is returned in ierr.
+  Return: the memory object created by clCreateBuffer
+*/
+cl_mem OpenCLBase::ocl_allocateMemory(size_t size, int type, cl_int &ierr) {
+  cl_mem buffer = clCreateBuffer(m_context, type, size, NULL, &ierr);
+
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error allocating memory, OpenCL error: " << ierr);
+  }
+
+  return buffer;
+}
+
+/*
+  Create a read/write device buffer of size bytes in m_context.
+  The OpenCL status is returned in ierr.
+  Return: the memory object created by clCreateBuffer
+*/
+cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
+  //same as the typed overload with the flags fixed to CL_MEM_READ_WRITE
+  return ocl_allocateMemory(size, CL_MEM_READ_WRITE, ierr);
+}
+
+/*
+  Write size bytes from in_data to the device buffer mem_ptr, starting at byte
+  offset offset of the buffer. blocking is passed to OpenCL as the blocking flag.
+  On success the event of the write becomes m_last_event and is appended to m_events;
+  on failure neither is touched, so no stale event is recorded.
+  Return: OCL_SUCCESS or the OpenCL error code
+*/
+int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) {
+
+  //enqueue into a local event, OpenCL leaves it untouched if the call fails
+  cl_event write_event = NULL;
+  cl_int ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &write_event);
+
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
+    return ierr;
+  }
+
+  m_last_event = write_event;
+  m_events.push_back(m_last_event);
+
+  return OCL_SUCCESS;
+}
+
+/*
+  Copy the first size bytes of the device buffer src_ptr to the start of the
+  device buffer dst_ptr. No event is recorded for the copy.
+  Return: OCL_SUCCESS or OCL_ERROR
+*/
+int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
+
+  //both offsets are zero, the copy has no wait list and returns no event
+  const cl_int status = clEnqueueCopyBuffer(m_command_queue, src_ptr, dst_ptr, 0, 0, size, 0, NULL, NULL);
+  if (status == CL_SUCCESS)
+    return OCL_SUCCESS;
+
+  DEBUG_MSG("Error copying buffers, OpenCL error: " << status);
+  return OCL_ERROR;
+}
+
+
+/*
+  Create the kernel specified by kernel_name from the compiled program m_program
+  and store it in m_kernel. The kernel previously held in m_kernel is released,
+  so repeated calls don't leak kernel objects.
+  Return: OCL_SUCCESS or the OpenCL error code
+*/
+int OpenCLBase::ocl_createKernel(const char* kernel_name) {
+  cl_int ierr;
+  cl_kernel kernel = clCreateKernel(m_program, kernel_name, &ierr);
+
+  //drop our reference to the old kernel, OpenCL keeps the object alive until
+  //commands already enqueued with it have finished
+  //NOTE(review): assumes m_kernel is NULL until a kernel was created, as
+  //ocl_cleanUp also assumes - confirm against the constructor
+  if (m_kernel != NULL)
+    clReleaseKernel(m_kernel);
+
+  //on failure clCreateKernel returns NULL, so m_kernel is reset like before
+  m_kernel = kernel;
+
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
+    return ierr;
+  }
+  return OCL_SUCCESS;
+}
+
+/*
+  Set one argument of m_kernel: idx is the index of the argument, size the size
+  of the argument data in bytes and arg_value a pointer to the argument data.
+  Return: the OpenCL status of clSetKernelArg
+*/
+int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
+  const cl_int status = clSetKernelArg(m_kernel, idx, size, arg_value);
+
+  if (status != CL_SUCCESS) {
+    DEBUG_MSG("Error setting kernel arg, OpenCL error: " << status);
+  }
+
+  return status;
+}
+
+/*
+  Execute m_kernel. ndim is the number of dimensions (1, 2 or 3) and work_items
+  an array of size ndim with the total number of work items per dimension.
+  Optional: work_group_size specifies how work items are divided in work groups,
+  if left NULL the OpenCL implementation handles this part.
+  If m_last_event is set the kernel waits for that event. On success the event of
+  the launch becomes m_last_event and is appended to m_events; on failure both
+  are left untouched.
+  Return: the OpenCL status of clEnqueueNDRangeKernel
+*/
+int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) {
+
+  //wait on the last recorded event if there is one
+  const cl_uint num_wait = (m_last_event == NULL) ? 0 : 1;
+  const cl_event *wait_list = (m_last_event == NULL) ? NULL : &m_last_event;
+
+  //OpenCL only writes the event on success, so it must not be used after a failure
+  cl_event tmp_event = NULL;
+  cl_int ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size,
+				       num_wait, wait_list, &tmp_event);
+
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
+    return ierr;
+  }
+
+  m_last_event = tmp_event;
+  m_events.push_back(m_last_event);
+
+  return ierr;
+}
+
+/*
+  Read size bytes from the device buffer mem_ptr, starting at byte offset offset,
+  into the host memory out_data. blocking specifies whether the read operation is
+  blocking (default CL_TRUE) or non blocking (CL_FALSE).
+  On success the event of the read becomes m_last_event and is appended to m_events;
+  on failure neither is touched, so no stale event is recorded.
+  Return: the OpenCL status of clEnqueueReadBuffer
+*/
+int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) {
+
+  //enqueue into a local event, OpenCL leaves it untouched if the call fails
+  cl_event read_event = NULL;
+  cl_int ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &read_event);
+
+  if (ierr != CL_SUCCESS) {
+    DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
+    return ierr;
+  }
+
+  m_last_event = read_event;
+  m_events.push_back(m_last_event);
+
+  return ierr;
+}
+
+/*
+  Release the device memory object mem_ptr.
+  Return: the OpenCL status of clReleaseMemObject
+*/
+int OpenCLBase::ocl_freeMemory(cl_mem mem_ptr) {
+  const cl_int status = clReleaseMemObject(mem_ptr);
+
+  if (status != CL_SUCCESS) {
+    DEBUG_MSG("Error freeing memory on device, OpenCL error: " << status);
+  }
+
+  return status;
+}
+
+/*
+  Release the OpenCL resources held by this object: kernel, program, command
+  queue and context, in that order. Every released handle is reset to NULL,
+  handles that are already NULL are skipped.
+  Return: always OCL_SUCCESS
+*/
+int OpenCLBase::ocl_cleanUp() {
+
+  if (m_kernel) {
+    clReleaseKernel(m_kernel);
+    m_kernel = NULL;
+  }
+
+  if (m_program) {
+    clReleaseProgram(m_program);
+    m_program = NULL;
+  }
+
+  if (m_command_queue) {
+    clReleaseCommandQueue(m_command_queue);
+    m_command_queue = NULL;
+  }
+
+  if (m_context) {
+    clReleaseContext(m_context);
+    m_context = NULL;
+  }
+
+  return OCL_SUCCESS;
+}
+
+/*
+  Print information about the selected device (m_device_id) to stdout.
+  With verbose off only the first three entries (name, device type, device
+  available) are printed, otherwise all twelve entries.
+  Return: OCL_SUCCESS, or OCL_ERROR if no device is set
+*/
+int OpenCLBase::ocl_deviceInfo(bool verbose) {
+
+  if (m_device_id == NULL) {
+    std::cout << "Device not set" << std::endl;
+    return OCL_ERROR;
+  }
+
+  //parallel tables: value type, printed label and OpenCL query of every entry
+  const int count = 12;
+  const char *info_type[count] = {"char", "cl_device_type", "cl_bool",
+				  "cl_bool", "cl_ulong", "cl_uint",
+				  "cl_uint", "cl_ulong", "size_t",
+				  "size_t[]", "cl_ulong", "char"};
+  const char* info_name[count] = {"Name", "Device type","Device available",
+				  "Compiler available", "Global mem size (gb)", "Max clock freq (MHz)",
+				  "Max compute units", "Max buffer size (bytes)", "Max work group size",
+				  "Max work item sizes", "Local mem size (bytes)", "Extensions"};
+  const cl_device_info info_value[count] = {CL_DEVICE_NAME, CL_DEVICE_TYPE, CL_DEVICE_AVAILABLE,
+					    CL_DEVICE_COMPILER_AVAILABLE, CL_DEVICE_GLOBAL_MEM_SIZE, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+					    CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+					    CL_DEVICE_MAX_WORK_ITEM_SIZES, CL_DEVICE_LOCAL_MEM_SIZE, CL_DEVICE_EXTENSIONS};
+
+  const int print_count = verbose ? count : 3;
+
+  std::cout << "--------------------" << std::endl;
+  std::cout << "OpenCL device information" << std::endl;
+  std::cout << "--------------------" << std::endl;
+
+  //every value is initialized, so a failing query prints zero/empty instead of garbage
+  for (int k = 0; k < print_count; k++) {
+    if (strcmp(info_type[k], "char") == 0) {
+      //strings are read in two steps: query the length, then the content
+      size_t info_size = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], 0, NULL, &info_size);
+      std::vector<char> info(info_size + 1, '\0');
+      clGetDeviceInfo(m_device_id, info_value[k], info_size, &info[0], NULL);
+      std::cout << info_name[k] << ": " << &info[0] << std::endl;
+
+    } else if (strcmp(info_type[k], "cl_bool") == 0) {
+      cl_bool b_info = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_bool), &b_info, NULL);
+      std::cout << info_name[k] << ": " << b_info << std::endl;
+
+    } else if (strcmp(info_type[k], "cl_ulong") == 0) {
+      cl_ulong ul_info = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_ulong), &ul_info, NULL);
+
+      if (info_value[k] == CL_DEVICE_GLOBAL_MEM_SIZE) {
+	//global memory is reported in units of 1e9 bytes
+	double gb = (double)ul_info*1e-9;
+	std::cout << info_name[k] << ": " << gb << std::endl;
+      } else if (info_value[k] == CL_DEVICE_LOCAL_MEM_SIZE) {
+	std::cout << info_name[k] << ": " << ul_info << std::endl;
+	//NOTE(review): the label says 512^2 but the value is sizeof(cl_double2)*512*5,
+	//kept as is because it is part of the printed output - confirm the intent
+	std::cout << "512^2 bytes: " << sizeof(cl_double2)*512*5 << std::endl;
+      } else {
+	std::cout << info_name[k] << ": " << ul_info << std::endl;
+      }
+
+    } else if (strcmp(info_type[k], "cl_uint") == 0) {
+      cl_uint ui_info = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_uint), &ui_info, NULL);
+      std::cout << info_name[k] << ": " << ui_info << std::endl;
+
+    } else if (strcmp(info_type[k], "size_t") == 0) {
+      size_t st_info = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], sizeof(size_t), &st_info, NULL);
+      std::cout << info_name[k] << ": " << st_info << std::endl;
+
+    } else if (strcmp(info_type[k], "size_t[]") == 0 ){
+      //the array has one entry per work item dimension, so size the buffer from
+      //the reported dimension count instead of assuming three
+      cl_uint dims = 0;
+      clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &dims, NULL);
+      std::vector<size_t> wi_info((size_t)dims, (size_t)0);
+      if (dims > 0)
+	clGetDeviceInfo(m_device_id, info_value[k], dims * sizeof(size_t), &wi_info[0], NULL);
+      std::cout << info_name[k] << ": ";
+      for (cl_uint m = 0; m < dims; m++)
+	std::cout << wi_info[m] << " ";
+      std::cout << std::endl;
+
+    } else if (strcmp(info_type[k], "cl_device_type") == 0) {
+      cl_device_type device_type = 0;
+      clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_device_type), &device_type, NULL);
+      switch (device_type) {
+      case CL_DEVICE_TYPE_CPU:
+	std::cout << info_name[k] << ": CPU" << std::endl;
+	break;
+      case CL_DEVICE_TYPE_GPU:
+	std::cout << info_name[k] << ": GPU" << std::endl;
+	break;
+      case CL_DEVICE_TYPE_ACCELERATOR:
+	std::cout << info_name[k] << ": Accelerator" << std::endl;
+	break;
+      case CL_DEVICE_TYPE_DEFAULT:
+	std::cout << info_name[k] << ": Default" << std::endl;
+	break;
+      default:
+	std::cout << info_name[k] << ": Unknown" << std::endl;
+	break;
+      }
+    }
+  }
+  return OCL_SUCCESS;
+}
+
+/*
+  Create the kernel kernel_name and print the work group limits of the device and
+  of the kernel, the local memory size of the device and the device extensions.
+  threadsPerBlock is set to the CL_KERNEL_WORK_GROUP_SIZE reported for the kernel.
+  work_group_size is only printed as the required size, double_precision is not used.
+  Return: DKS_SUCCESS or the error code of ocl_createKernel
+*/
+int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
+				bool double_precision, int &threadsPerBlock)
+{
+
+  //build kernel
+  int ierr = ocl_createKernel(kernel_name);
+  if (ierr != DKS_SUCCESS)
+    return ierr;
+
+  //get device properties, values stay zero if a query fails
+  size_t max_group_size = 0;
+  clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
+  cl_ulong local_mem_size = 0;
+  clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
+
+  //extension string: query the length, then the content, the vector frees itself
+  //and the extra zeroed byte keeps the string terminated
+  size_t ext_size = 0;
+  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
+  std::vector<char> ext(ext_size + 1, '\0');
+  clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, &ext[0], 0);
+
+  //get kernel properties
+  size_t kernel_group_size = 0;
+  clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
+			   sizeof(size_t), &kernel_group_size, 0);
+  threadsPerBlock = kernel_group_size;
+
+  std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
+
+  std::cout << "Work groups: device limit " << max_group_size << ", "
+	    << "kernel limit " << kernel_group_size << ", "
+	    << "required " << work_group_size << std::endl;
+
+  std::cout << "Local memory: device limit " << local_mem_size << std::endl;
+
+  std::cout << "Available extensions: " << &ext[0] << std::endl;
+
+  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Forget all recorded events by emptying m_events.
+  The event objects themselves are not released here.
+*/
+void OpenCLBase::ocl_clearEvents() {
+  m_events.clear();
+}
+
+
+
+/*
+  Print the number of recorded events and, if there are any, the accumulated
+  execution time (end - start, scaled by 1e-9) and the number of buffer writes,
+  kernel executions and buffer reads in m_events.
+  The time of an event is only added when both profiling queries succeed; OpenCL
+  provides profiling data only for queues created with CL_QUEUE_PROFILING_ENABLE.
+  Events are counted by type in either case.
+*/
+void OpenCLBase::ocl_eventInfo() {
+
+  std::cout << "Number of events launched: " << m_events.size() << std::endl;
+
+  if (m_events.empty())
+    return;
+
+  cl_ulong twrite = 0;
+  cl_ulong texec = 0;
+  cl_ulong tread = 0;
+  int cw = 0;
+  int ce = 0;
+  int cr = 0;
+
+  for (size_t i = 0; i < m_events.size(); i++) {
+
+    //start and end time stamps, the duration stays zero unless both are available
+    cl_ulong tstart = 0;
+    cl_ulong tend = 0;
+    cl_int start_status = clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_START,
+						  sizeof(cl_ulong), &tstart, NULL);
+    cl_int end_status = clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_END,
+						sizeof(cl_ulong), &tend, NULL);
+
+    cl_ulong duration = 0;
+    if (start_status == CL_SUCCESS && end_status == CL_SUCCESS)
+      duration = tend - tstart;
+
+    //type of the command the event belongs to, zero matches none of the types below
+    cl_command_type type = 0;
+    clGetEventInfo(m_events[i], CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &type, NULL);
+
+    if (type == CL_COMMAND_WRITE_BUFFER) {
+      twrite += duration;
+      cw++;
+    } else if (type == CL_COMMAND_READ_BUFFER) {
+      tread += duration;
+      cr++;
+    } else if (type == CL_COMMAND_NDRANGE_KERNEL) {
+      texec += duration;
+      ce++;
+    }
+  }
+
+  std::cout << "OpenCL write: " << (twrite * 1e-9) << " in: " << cw << std::endl;
+  std::cout << "OpenCL exec: " << (texec * 1e-9) << " in: " << ce << std::endl;
+  std::cout << "OpenCL read: " << (tread * 1e-9) << " in: " << cr << std::endl;
+
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h
new file mode 100644
index 0000000..ae0a15c
--- /dev/null
+++ b/src/OpenCL/OpenCLBase.h
@@ -0,0 +1,303 @@
+/*
+
+  Name: OpenCLBase
+
+  Author: Uldis Locans
+
+  Info: OpenCL base class to handle all the common details associated 
+  with kernel launch on OpenCL device
+
+  Date: 2014.09.18
+
+*/
+
+#ifndef H_OPENCL_BASE
+#define H_OPENCL_BASE
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <algorithm>
+#include <string.h>
+#include <stdio.h>
+
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#endif
+
+
+
+#include "../DKSDefinitions.h"
+
+/*
+  State record of a random number generator: six double state words
+  (s10, s11, s12, s20, s21, s22), one double z and one bool flag gen.
+  NOTE(review): the meaning of the individual fields is not visible in this
+  file; the names suggest two three-word generator states plus a cached value
+  and its validity flag - confirm against the code that uses the struct.
+  NOTE(review): presumably the layout has to match a device side definition,
+  so don't reorder or retype fields without checking the kernels.
+*/
+typedef struct {
+
+  double s10;
+  double s11;
+  double s12;
+  double s20;
+  double s21;
+  double s22;
+  double z;
+  bool gen;
+
+} RNDState;
+
+/*
+  Base class that wraps the common OpenCL housekeeping: device selection,
+  context and command queue creation, program and kernel handling, buffer
+  management and event bookkeeping.
+  Context, command queue, platform id, device id and the last event are static
+  members and therefore shared by all instances; program, kernel and the event
+  list belong to the individual object.
+*/
+class OpenCLBase {
+
+private:
+	
+  //context and command queue, shared by all instances
+  static cl_context m_context;
+  static cl_command_queue m_command_queue;
+
+  //selected platform and device, shared by all instances
+  static cl_platform_id m_platform_id;
+  static cl_device_id m_device_id;
+
+  //zero terminated property list used when the context is created
+  cl_context_properties m_context_properties[3];
+  //program built from the last compiled kernel source
+  cl_program m_program;
+  //kernel created by the last call to ocl_createKernel
+  cl_kernel m_kernel;
+	
+  //event of the last recorded command, shared by all instances
+  static cl_event m_last_event;
+  //NOTE(review): not used by any of the methods visible in this file - confirm
+  cl_int m_num_events;
+  //events recorded for writes, reads and kernel executions
+  std::vector<cl_event> m_events;
+	
+  //name of the kernel file recorded by ocl_buildProgram, NULL if none was loaded
+  char * m_kernel_file;
+
+  //device type used when searching for devices
+  cl_device_type m_device_type;
+	
+  /*
+    Name: getPlatforms
+    Info: get all available platforms and save the number of platforms
+    NOTE(review): the original comment refers to m_platform_ids, which is not a
+    member declared in this class - confirm against the implementation
+    Return: success or error code
+  */
+  int ocl_getPlatforms();
+	
+	
+  /*
+    Name: getDevice
+    Info: get first available device and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
+    Return: success or error code
+  */
+  int ocl_getDevice(const char* device_name);
+	
+  /*
+    Name: getDeviceType
+    Info: get device type from device name (-gpu, -cpu, -mic, -all),
+    any other name gives the default device type
+    Return: success or error code
+  */
+  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
+	
+  /*
+    Name: createContext
+    Info: create context and command queue with the specified device
+    Return: success or error code
+  */
+  int ocl_createContext();
+	
+  /*
+    Name: buildProgram
+    Info: build program from specified kernel file
+    Return: success or error code
+  */
+  int ocl_buildProgram(const char* kernel_file);
+
+  /** Compile program from kernel source string.
+   *  opts are the build options passed to the OpenCL compiler.
+   */
+  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
+
+protected:
+
+  //NOTE(review): presumably flags whether defaultRndState has been created and
+  //holds the device buffer with the random number states - confirm against
+  //ocl_createRndStates / ocl_deleteRndStates
+  int defaultRndSet;
+  cl_mem defaultRndState;
+	
+	
+public:
+    
+  /*
+    constructor
+  */
+  OpenCLBase();
+    
+  /*
+    destructor
+  */
+  ~OpenCLBase();
+    
+  /*
+    Create RND states
+    Return: success or error code
+  */
+  int ocl_createRndStates(int size);
+
+  /*
+    Destroy rnd states
+    Return: success or error code
+  */
+  int ocl_deleteRndStates();
+
+
+  /*
+    Name: getAllDevices
+    Info: get all available devices and print their name, vendor and type
+    Return: success or error code
+  */
+  int ocl_getAllDevices();
+
+  /** Get the OpenCL device count for the set type of device
+   *
+   */
+  int ocl_getDeviceCount(int &ndev);
+
+  /** Get the name of the device used
+   */
+  int ocl_getDeviceName(std::string &device_name);
+
+  /** Set the device to use for OpenCL kernels.
+   *  device id to use is passed as integer.
+   */
+  int ocl_setDevice(int device);
+
+  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
+   *  Used when GPUs of different types might be present on the system.
+   */
+  int ocl_getUniqueDevices(std::vector<int> &devices);
+    
+  /*
+    Name: setUp
+    Info: set up opencl resources
+    Return: success or error code
+  */
+  int ocl_setUp(const char* device_name);
+	
+  /*
+    Name: loadKernel
+    Info: load and compile opencl kernel file if it has changed
+    Return: success or error code
+  */
+  int ocl_loadKernel(const char* kernel_file);
+
+
+  /** Build program from kernel source.
+   * Builds a program from source code provided in kernel_source.
+   * If compilation fails will return DKS_ERROR
+   */
+  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
+	
+  /*
+    Name: allocateMemory
+    Info: allocate read/write memory on device, OpenCL status is returned in ierr
+    Return: return pointer to memory
+  */
+  cl_mem ocl_allocateMemory(size_t size, int &ierr);
+	
+  /*
+    Name: allocateMemory
+    Info: allocate memory of the given type (memory flags) on device,
+    OpenCL status is returned in ierr
+    Return: return pointer to memory
+  */
+  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
+	
+  /*
+    Name: writeData
+    Info: write data to device memory (needs ptr to mem object)
+    Return: success or error code
+  */
+  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
+	
+  /*
+    Name: copyData
+    Info: copy data from one buffer on the device to another
+    Return: success or error code
+  */
+  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
+	
+  /*
+    Name: createKernel
+    Info: create kernel from program
+    Return: success or error code
+  */
+  int ocl_createKernel(const char* kernel_name);
+	
+  /*
+    Name: setKernelArgs
+    Info: set opencl kernel arguments
+    Return: success or error code
+  */
+  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
+	
+  /*
+    Name: executeKernel
+    Info: execute selected kernel (needs kernel parameters): number of dimensions,
+    work items per dimension and optionally the work group size
+    Return: success or error code
+  */
+  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
+	
+  /*
+    Name: readData
+    Info: read data from device (needs pointer to mem object)
+    Return: success or error code
+  */
+  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
+	
+  /*
+    Name: freeMemory
+    Info: free device memory (needs ptr to mem object)
+    Return: success or error code
+  */
+  int ocl_freeMemory(cl_mem mem_ptr);
+	
+  /*
+    Name: cleanUp
+    Info: free opencl resources
+    Return: success or error code
+  */
+  int ocl_cleanUp();
+	
+  /*
+    Name: deviceInfo
+    Info: print device info (mostly for debugging purposes)
+    Return: success or error code
+  */
+  int ocl_deviceInfo(bool verbose = true);
+
+  /* Check OpenCL kernel.
+   * Query device and kernel limits for the kernel and print them,
+   * threadsPerBlock receives the work group size limit of the kernel
+   */
+  int ocl_checkKernel(const char* kernel_name, int work_group_size,
+		      bool double_precision, int &threadsPerBlock);
+	
+  /*
+    Name: clearEvents
+    Info: clear saved events (for debugging purposes)
+    Return: nothing
+  */
+  void ocl_clearEvents();
+
+  /*
+    Name: eventInfo
+    Info: print information about kernel timings (for debugging purposes)
+    Return: nothing
+  */
+  void ocl_eventInfo();
+
+  /*
+    Return current command queue
+  */
+  cl_command_queue ocl_getQueue() { return m_command_queue; }
+};
+
+#endif
+
+
+
+
+
+
+
+
diff --git a/src/OpenCL/OpenCLChiSquare.cpp b/src/OpenCL/OpenCLChiSquare.cpp
new file mode 100644
index 0000000..7de4a62
--- /dev/null
+++ b/src/OpenCL/OpenCLChiSquare.cpp
@@ -0,0 +1,157 @@
+#include "OpenCLChiSquare.h"
+
+double OpenCLChiSquare::ocl_sum(cl_mem data, int length) {
+  // Sum `length` doubles stored in the device buffer `data`. The reduction kernel
+  // is launched with 128 work items per group; the per-group partial sums are read
+  // back and accumulated on the host. An empty input sums to 0 without a launch.
+  // NOTE(review): assumes parallelReductionSum writes one value per launched group.
+  if (length <= 0)
+    return 0.0;
+
+  //round the global size up to a whole number of work groups
+  size_t work_size_sum = 128;
+  size_t work_groups = ((size_t)length + work_size_sum - 1) / work_size_sum;
+  size_t work_items = work_groups * work_size_sum;
+
+  //create tmp array for partial sums, one slot per launched work group
+  //(the old "length / 128 + 1" read one unwritten slot when length % 128 == 0)
+  int ierr;
+  cl_mem tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
+  double *partial_sums = new double[work_groups]();
+
+  //execute sum kernel
+  m_oclbase->ocl_createKernel("parallelReductionSum");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
+  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
+  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
+
+  //read partial sums and free temp memory
+  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
+  m_oclbase->ocl_freeMemory(tmp_ptr);
+
+  //sum up partial sums on the host
+  double result = 0;
+  for (size_t i = 0; i < work_groups; i++)
+    result += partial_sums[i];
+
+  delete[] partial_sums;
+
+  return result;
+}
+
+int OpenCLChiSquare::ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, 
+				     double fTimeResolution, double fRebin,
+				     int sensors, int length, int numpar,
+				     double &result)
+{
+  // Runs kernelPHistoTFFcn on the given device buffers and stores the host-side
+  // sum of the first sensors*length values of mem_result in `result`.
+  //reinterpret the opaque device handles as OpenCL memory objects
+  cl_mem d_data = (cl_mem)mem_data;
+  cl_mem d_par = (cl_mem)mem_par;
+  cl_mem d_chi = (cl_mem)mem_result;
+
+  //work group size, and global size: length rounded up to a whole number of
+  //groups when it is not a multiple of 128, otherwise length * sensors
+  size_t local_size = 128;
+  size_t global_size = (length % local_size > 0)
+    ? (length / local_size + 1) * local_size
+    : (size_t)length * sensors;
+
+  //load the kernel, bind its arguments (8 passes a size with a NULL value), launch
+  m_oclbase->ocl_createKernel("kernelPHistoTFFcn");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &d_data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &d_par);
+  m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &d_chi);
+  m_oclbase->ocl_setKernelArg(3, sizeof(double), &fTimeResolution);
+  m_oclbase->ocl_setKernelArg(4, sizeof(double), &fRebin);
+  m_oclbase->ocl_setKernelArg(5, sizeof(int), &length);
+  m_oclbase->ocl_setKernelArg(6, sizeof(int), &sensors);
+  m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
+  m_oclbase->ocl_setKernelArg(8, sizeof(double)*numpar, NULL);
+  m_oclbase->ocl_executeKernel(1, &global_size, &local_size);
+  result = ocl_sum(d_chi, sensors*length);
+  return DKS_SUCCESS;
+}
+
+int OpenCLChiSquare::ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+				       double fTimeResolution, double fRebin, double fGoodBinOffset,
+				       int sensors, int length, int numpar,
+				       double &result)
+{
+  // Runs kernelSingleGaussTF on the given device buffers and stores the host-side
+  // sum of the first `length` values of mem_result in `result`.
+  //reinterpret the opaque device handles as OpenCL memory objects
+  cl_mem d_data = (cl_mem)mem_data;
+  cl_mem d_t0 = (cl_mem)mem_t0;
+  cl_mem d_par = (cl_mem)mem_par;
+  cl_mem d_chi = (cl_mem)mem_result;
+
+  //work group size, and global size: length rounded up to a whole number of
+  //groups when it is not a multiple of 128, otherwise length * sensors
+  size_t local_size = 128;
+  size_t global_size = (length % local_size > 0)
+    ? (length / local_size + 1) * local_size
+    : (size_t)length * sensors;
+
+  //load the kernel, bind its arguments (10 passes a size with a NULL value), launch
+  m_oclbase->ocl_createKernel("kernelSingleGaussTF");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &d_data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &d_t0);
+  m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &d_par);
+  m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &d_chi);
+  m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
+  m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
+  m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
+  m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
+  m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
+  m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
+  m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
+  m_oclbase->ocl_executeKernel(1, &global_size, &local_size);
+  result = ocl_sum(d_chi, length);
+  return DKS_SUCCESS;
+}
+
+
+int OpenCLChiSquare::ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+					 double fTimeResolution, double fRebin, double fGoodBinOffset,
+					 int sensors, int length, int numpar,
+					 double &result)
+{
+  // Runs kernelDoubleLorentzTF on the given device buffers and stores the host-side
+  // sum of the first `length` values of mem_result in `result`.
+  //reinterpret the opaque device handles as OpenCL memory objects
+  cl_mem d_data = (cl_mem)mem_data;
+  cl_mem d_t0 = (cl_mem)mem_t0;
+  cl_mem d_par = (cl_mem)mem_par;
+  cl_mem d_chi = (cl_mem)mem_result;
+
+  //work group size, and global size: length rounded up to a whole number of
+  //groups when it is not a multiple of 128, otherwise length * sensors
+  size_t local_size = 128;
+  size_t global_size = (length % local_size > 0)
+    ? (length / local_size + 1) * local_size
+    : (size_t)length * sensors;
+
+  //load the kernel, bind its arguments (10 passes a size with a NULL value), launch
+  m_oclbase->ocl_createKernel("kernelDoubleLorentzTF");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &d_data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &d_t0);
+  m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &d_par);
+  m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &d_chi);
+  m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
+  m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
+  m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
+  m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
+  m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
+  m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
+  m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
+  m_oclbase->ocl_executeKernel(1, &global_size, &local_size);
+  result = ocl_sum(d_chi, length);
+  return DKS_SUCCESS;
+}
+
+
+
diff --git a/src/OpenCL/OpenCLChiSquare.h b/src/OpenCL/OpenCLChiSquare.h
new file mode 100644
index 0000000..bbc5da6
--- /dev/null
+++ b/src/OpenCL/OpenCLChiSquare.h
@@ -0,0 +1,53 @@
+#ifndef H_OPENCL_CHI_SQUARE
+#define H_OPENCL_CHI_SQUARE
+
+#include <iostream>
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "OpenCLBase.h"
+
+#define DKS_SUCCESS 0
+#define DKS_ERROR 1
+
+
+class OpenCLChiSquare {
+
+private:
+
+  // OpenCL wrapper used for every device call; never deleted by this class
+  OpenCLBase *m_oclbase;
+
+  // sum `length` doubles of a device buffer via a reduction kernel + host sum
+  double ocl_sum(cl_mem data, int length);
+
+public:
+
+  // store the OpenCL wrapper to run kernels with
+  OpenCLChiSquare(OpenCLBase *base) : m_oclbase(base) { }
+
+  ~OpenCLChiSquare() { }
+
+  // the three fit functions take opaque device buffers, launch the matching
+  // kernel and return DKS_SUCCESS with the summed value in `result`
+  int ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, 
+		      double fTimeResolution, double fRebin,
+		      int sensors, int length, int numpar,
+		      double &result);
+
+  int ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			double fTimeResolution, double fRebin, double fGoodBinOffset,
+			int sensors, int length, int numpar,
+			double &result);
+
+  int ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+			  double fTimeResolution, double fRebin, double fGoodBinOffset,
+			  int sensors, int length, int numpar,
+			  double &result);
+};
+
+#endif
diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp
new file mode 100644
index 0000000..f8e21a6
--- /dev/null
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@@ -0,0 +1,316 @@
+#include "OpenCLChiSquareRuntime.h"
+
+OpenCLChiSquareRuntime::OpenCLChiSquareRuntime(OpenCLBase *base) {
+  // OpenCL wrapper used for all device calls
+  m_oclbase = base;
+
+  // no temporary device buffers exist yet and ptx_m starts out NULL
+  initDone_m = false;
+  ptx_m = NULL;
+
+  // launch configuration; a negative block count makes launchChiSquare
+  // derive the global size from the data length
+  numBlocks_m = -1;
+  blockSize_m = BLOCK_SIZE;
+
+  // every fit constant starts at 1.0
+  N0_m = tau_m = bkg_m = 1.0;
+  alpha_m = beta_m = 1.0;
+
+}
+
+//release ptx_m and, if initChiSquare has run, the temporary device buffers
+OpenCLChiSquareRuntime::~OpenCLChiSquareRuntime() {
+  delete[] ptx_m;
+  freeChiSquare();
+}
+
+//build program string
+std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
+  // Reads OpenCLChiSquareRuntime.cl from the OPENCL_KERNELS directory and appends
+  // an fTheory() definition whose body is "return <function>;".
+  // Returns an empty string when the kernel file cannot be opened or read
+  // (the original went on to use the NULL FILE* and leaked both buffers).
+
+  //full path of the kernel source file
+  std::string kernel_file(OPENCL_KERNELS);
+  kernel_file += "OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl";
+
+  //read kernels from file
+  FILE *fp = fopen(kernel_file.c_str(), "rb");
+  if (!fp) {
+    DEBUG_MSG("Can't open kernel file" << kernel_file);
+    return std::string();
+  }
+
+  //get file size; ftell reports -1 on failure
+  fseek(fp, 0, SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  //read the whole file straight into the string, keeping only what was read
+  std::string kernel_string(fsize > 0 ? (size_t)fsize : 0, '\0');
+  size_t nread = kernel_string.empty() ? 0 : fread(&kernel_string[0], 1, kernel_string.size(), fp);
+  fclose(fp);
+  kernel_string.resize(nread);
+  if (kernel_string.empty())
+    return kernel_string;
+  return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
+}
+
+int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {
+  // Compiles the chi-square kernels together with the user supplied theory
+  // function and returns the status reported by ocl_loadKernelFromSource.
+  // `mlh` selects the -DMLH build of the kernels.
+
+  //kernel file contents plus the generated fTheory() wrapper
+  const std::string source = buildProgram(function);
+
+  //-DMLH is passed to the OpenCL compiler only when mlh is requested
+  const std::string flags = mlh ? "-DMLH" : "";
+
+  //compile opencl program from source string
+  return m_oclbase->ocl_loadKernelFromSource(source.c_str(), flags.c_str());
+
+}
+
+double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
+  // Sums `length` doubles of the device buffer `data`: the
+  // parallelReductionTwoPhase kernel is launched on a fixed grid and the
+  // per-group partial results are added up on the host.
+
+  // fixed launch geometry: 80 work groups of 128 work items each
+  const int num_groups = 80;
+  size_t group_size = 128;
+  size_t total_items = 80 * group_size;
+
+  /*
+  previous geometry, sized from the data length:
+  size_t work_items = (size_t)length;
+  if (length % work_size_sum > 0)
+    work_items = (length / work_size_sum + 1) * work_size_sum;
+  int work_groups = length / work_size_sum + 1;
+  */
+
+  //host array and device buffer for one partial sum per work group
+  int ierr;
+  double *host_partials = new double[num_groups];
+  cl_mem dev_partials = m_oclbase->ocl_allocateMemory(num_groups * sizeof(double), ierr);
+
+  //execute sum kernel (argument 2 passes a size with a NULL value)
+  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &dev_partials);
+  m_oclbase->ocl_setKernelArg(2, group_size*sizeof(double), NULL);
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
+  m_oclbase->ocl_executeKernel(1, &total_items, &group_size);
+
+  //read partial sums and free temp memory
+  m_oclbase->ocl_readData(dev_partials, host_partials, sizeof(double)*num_groups);
+  m_oclbase->ocl_freeMemory(dev_partials);
+
+  //add the partial sums in index order
+  double total = 0;
+  int idx = 0;
+  while (idx < num_groups) {
+    total += host_partials[idx];
+    idx++;
+  }
+
+  delete[] host_partials;
+  return total;
+}
+
+int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
+					    void *mem_data, void *mem_err, int length,
+					    int numpar, int numfunc, int nummap,
+					    double timeStart, double timeStep, double &result)
+{
+  // Launches the chi-square kernel selected by fitType on mem_data/mem_err and
+  // stores the sum of the first `length` values of mem_chisq_m in `result`.
+  // Returns DKS_ERROR for unknown or unimplemented fit types, otherwise the
+  // status of kernel creation / execution. The return values of the
+  // ocl_setKernelArg calls are not checked (the original only re-tested the
+  // unchanged ierr after them, which could never fire).
+  // NOTE(review): numfunc/nummap of 0 give zero-sized NULL arguments - confirm.
+  int ierr;
+
+  //convert memory to cl_mem
+  cl_mem cl_mem_data = (cl_mem)mem_data;
+  cl_mem cl_mem_err = (cl_mem)mem_err;
+
+  cl_mem cl_param = (cl_mem)mem_param_m;
+  cl_mem cl_chisq = (cl_mem)mem_chisq_m;
+  cl_mem cl_map = (cl_mem)mem_map_m;
+  cl_mem cl_func = (cl_mem)mem_func_m;
+
+  //set work item size
+  size_t work_items;
+  size_t work_size = (size_t)blockSize_m;
+  if (numBlocks_m < 0)
+    work_items = (size_t)length;
+  else
+    work_items = (size_t)numBlocks_m * (size_t)blockSize_m;
+
+  if (work_items % work_size > 0)
+    work_items = (work_items / work_size + 1) * work_size;
+
+  if (fitType == FITTYPE_SINGLE_HISTO) {
+    //create kernel
+    ierr = m_oclbase->ocl_createKernel("kernelChiSquareSingleHisto");
+
+    if (ierr != DKS_SUCCESS)
+      return ierr;
+
+    //set kernel args
+    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
+    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
+    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
+    m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
+    m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
+    m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
+    m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
+    m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
+    m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
+    m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
+    m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
+    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
+    m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
+    m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
+    m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
+    m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
+    m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
+  } else if (fitType == FITTYPE_ASYMMETRY) {
+    //create kernel
+    ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
+
+    if (ierr != DKS_SUCCESS)
+      return ierr;
+
+    //set kernel args
+    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
+    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
+    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
+    m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
+    m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
+    m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
+    m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
+    m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
+    m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
+    m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
+    m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
+    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
+    m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
+    m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
+    m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
+    m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
+  } else {
+    // FITTYPE_MU_MINUS is not yet implemented; it is rejected like any unknown
+    // fit type instead of launching whatever kernel was created last
+    return DKS_ERROR;
+  }
+
+  //execute kernel
+  ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
+
+  if (ierr != DKS_SUCCESS)
+    return ierr;
+
+  //execute sum kernel
+  result = calculateSum(cl_chisq, length);
+
+  return ierr;
+
+}
+
+int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
+  //copy numparams doubles from the host array into the device buffer mem_param_m
+  return m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
+}
+
+
+int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
+  //copy numfunc doubles into mem_func_m; an empty set is a successful no-op
+  if (numfunc != 0)
+    return m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
+
+  //nothing to transfer
+  return DKS_SUCCESS;
+}
+
+int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
+  //copy nummap ints into mem_map_m; an empty set is a successful no-op
+  if (nummap != 0)
+    return m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
+
+  //nothing to transfer
+  return DKS_SUCCESS;
+}
+
+int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, 
+					  int size_func, int size_map)
+{
+  // Allocates the chisq/param/func/map device buffers (freeing a previous set
+  // first) and returns the first allocation error, or the last status if all
+  // succeeded. The original returned only the status of the map allocation.
+  if (initDone_m) {
+    DEBUG_MSG("Reinitializing ChiSquare");
+    freeChiSquare();
+  }
+  //func and map buffers always hold at least one element
+  size_func = (size_func == 0) ? 1 : size_func;
+  size_map = (size_map == 0) ? 1 : size_map;
+  int err_chisq = DKS_ERROR, err_param = DKS_ERROR, err_func = DKS_ERROR, err_map = DKS_ERROR;
+  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), err_chisq);
+  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), err_param);
+  mem_func_m = m_oclbase->ocl_allocateMemory(size_func*sizeof(double), err_func);
+  mem_map_m = m_oclbase->ocl_allocateMemory(size_map*sizeof(int), err_map);
+  initDone_m = true;
+  if (err_chisq != DKS_SUCCESS) return err_chisq;
+  if (err_param != DKS_SUCCESS) return err_param;
+  if (err_func != DKS_SUCCESS) return err_func;
+  return err_map;
+}
+
+int OpenCLChiSquareRuntime::freeChiSquare() {
+  // Releases the four buffers allocated by initChiSquare. Returns DKS_ERROR when
+  // nothing was allocated, otherwise the first release error (the original
+  // reported only the status of the last release), or the last status.
+  if (!initDone_m)
+    return DKS_ERROR;
+
+  //free memory; every buffer is released even if an earlier release fails
+  int err_chisq = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
+  int err_param = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
+  int err_func = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
+  int err_map = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
+  initDone_m = false;
+  if (err_chisq != DKS_SUCCESS) return err_chisq;
+  if (err_param != DKS_SUCCESS) return err_param;
+  return (err_func != DKS_SUCCESS) ? err_func : err_map;
+}
+
+int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBlock) {
+  // Asks the OpenCL wrapper whether the kernel belonging to fitType can run with
+  // a work group size of 128 in double precision; threadsPerBlock is passed on
+  // to ocl_checkKernel. Unknown and unimplemented fit types yield DKS_ERROR.
+
+  //kernel name for the requested fit type, NULL when there is none
+  const char *kernel_name = NULL;
+  if (fitType == FITTYPE_SINGLE_HISTO)
+    kernel_name = "kernelChiSquareSingleHisto";
+  else if (fitType == FITTYPE_ASYMMETRY)
+    kernel_name = "kernelChiSquareAsymmetry";
+
+  //FITTYPE_MU_MINUS is not yet implemented and ends up here as well
+  if (kernel_name == NULL)
+    return DKS_ERROR;
+
+  //hand the name over in a local 64 byte buffer
+  char kernel[64];
+  strncpy(kernel, kernel_name, sizeof(kernel));
+
+  return m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
+
+}
+
diff --git a/src/OpenCL/OpenCLChiSquareRuntime.h b/src/OpenCL/OpenCLChiSquareRuntime.h
new file mode 100644
index 0000000..90b5c7c
--- /dev/null
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@@ -0,0 +1,103 @@
+#ifndef H_OPENCL_CHISQUARE_RUNTIME
+#define H_OPENCL_CHISQUARE_RUNTIME
+
+#include <iostream>
+#include <string>
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "../Algorithms/ChiSquareRuntime.h"
+#include "OpenCLBase.h"
+
+const std::string openclFunctHeader = "double fTheory(double t, __local double *p, __local double *f, __local int *m) {";
+
+const std::string openclFunctFooter = "}\n";
+
+class OpenCLChiSquareRuntime : public ChiSquareRuntime {
+
+private:
+
+  OpenCLBase *m_oclbase;
+
+  /** Private function to add user defined function to kernel string
+   * Returns the kernel file contents followed by an fTheory() function returning `function`.
+   */
+  std::string buildProgram(std::string function);
+
+  double calculateSum(cl_mem data, int length);
+
+public:
+
+  /** Constructor with openclbase argument
+   *
+   */
+  OpenCLChiSquareRuntime(OpenCLBase *base);
+  
+  /** Default constructor
+   * NOTE(review): no definition appears in OpenCLChiSquareRuntime.cpp - confirm it is unused.
+   */
+  OpenCLChiSquareRuntime();
+
+  /** Default destructor
+   *
+   */
+  ~OpenCLChiSquareRuntime();
+
+    /** Compile program and save ptx.
+   * Add function string to the calcFunction kernel and compile the program
+   * Function must be valid C math expression. Parameters can be addressed in
+   * a form par[map[idx]]
+   */
+  int compileProgram(std::string function, bool mlh = false);
+
+  /** Launch selected kernel
+   * Launches the selected kernel from the compiled code.
+   * Result is put in &result variable
+   */
+  int launchChiSquare(int fitType,
+		      void *mem_data, void *mem_err, int length,
+		      int numpar, int numfunc, int nummap,
+		      double timeStart, double timeStep,
+		      double &result);
+
+  /** Write params to device.
+   * Write params from double array to mem_param_m memory on the device.
+   */
+  int writeParams(const double *params, int numparams); 
+
+  /** Write functions to device.
+   * Write function values from double array to mem_func_m memory on the device.
+   */
+  int writeFunc(const double *func, int numfunc);
+
+  /** Write maps to device.
+   * Write map values from int array to mem_map_m memory on the device.
+   */
+  int writeMap(const int *map, int nummap);
+
+  /** Allocate temporary memory needed for chi square.
+   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
+   * be the maximum number of elements in any dataset that will be used for calculations. Size_param,
+   * size_func and size_map are the maximum number of parameters, functions and maps used in 
+   * calculations.
+   */
+  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
+
+  /** Free temporary memory allocated for chi square.
+   * Frees the chisq temporary memory and memory for params, functions and maps
+   */
+  int freeChiSquare();
+
+  /** Check MuSR kernels for necessary resources.
+   * Query device properties to check whether sufficient resources are
+   * available to run the kernels
+   */
+  int checkChiSquareKernels(int fitType, int &threadsPerBlock);
+
+};
+
+#endif
diff --git a/src/OpenCL/OpenCLCollimatorPhysics.cpp b/src/OpenCL/OpenCLCollimatorPhysics.cpp
new file mode 100644
index 0000000..46d8b24
--- /dev/null
+++ b/src/OpenCL/OpenCLCollimatorPhysics.cpp
@@ -0,0 +1,107 @@
+#include "OpenCLCollimatorPhysics.h"
+
+#define M_P 0.93827231e+00
+#define C 299792458.0
+#define PI 3.14159265358979323846
+#define AVO 6.022e23
+#define R_E 2.81794092e-15
+#define eM_E 0.51099906e-03
+#define Z_P 1
+#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7) /* parenthesized so K expands as a single operand */
+
+#define POSITION 0 
+#define ZSIZE 1
+#define RHO_M 2
+#define Z_M 3
+#define A_M 4
+#define A2_C 5
+#define A3_C 6
+#define A4_C 7
+#define A5_C 8
+#define X0_M 9
+#define I_M 10
+#define DT_M 11
+
+#define BLOCK_SIZE 128
+#define NUMPAR 12
+
+/*
+TODO: 
+1. test OpenCL kernel 
+  - is it launched for all particles 
+  - does the random number generator function properly 
+  - is particle structure updated correctly in memory
+2. boost.compute sort for user defined structure crashes
+*/
+int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, 
+					       int numparticles) 
+{
+  /* Disabled implementation, kept for reference; only the two statements after the closing marker execute.
+  //set number of total threads, and number threads per block
+  size_t threads = 1;
+  size_t blocks = numparticles;
+
+  //cast void ptrs to cl_mem ptrs
+  cl_mem data = (cl_mem)mem_ptr;
+  cl_mem params = (cl_mem)par_ptr;
+
+  int numparams = 19;
+
+  //set kernel to execute and kernel arguments
+  ocl_createKernel("kernelCollimatorPhysics");
+  ocl_setKernelArg(0, sizeof(cl_mem), &data);
+  ocl_setKernelArg(1, sizeof(cl_mem), &params);
+  ocl_setKernelArg(2, sizeof(cl_mem), &defaultRndState);
+  ocl_setKernelArg(3, sizeof(int), &numparticles);
+  ocl_setKernelArg(4, sizeof(double)*numparams, NULL);
+
+  std::cout << "blocks: " << blocks << ", threads: " << threads << std::endl;
+
+  //execute kernel on device
+  ocl_executeKernel(1, &blocks, &threads);
+
+  //create functions for comparing two particles and counting particles with labels < 0
+  
+  BOOST_COMPUTE_FUNCTION(bool, sort_by_label, (PART_OPENCL a, PART_OPENCL b),
+			 {
+			   return a.label < b.label;
+			 });
+  
+  
+  
+  BOOST_COMPUTE_FUNCTION(bool, count_by_label, (PART_OPENCL a),
+			 {
+			   return a.label < 0;
+			 });
+  
+  
+  //wrap cl_mem memory object in Boost.Compute buffer
+  std::cout << "wrap buffer" << std::endl;
+  boost::compute::buffer buf(data);
+
+  //count particles with labels < 0
+  std::cout << "wrap command queue" << std::endl;
+  boost::compute::command_queue queue(ocl_getQueue());
+  
+  std::cout << "count if" << std::endl;
+
+
+  numaddback = boost::compute::count_if(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0), 
+					boost::compute::make_buffer_iterator<PART_OPENCL>(buf,numparticles), 
+					count_by_label, queue);
+
+  //sort particles with dead and leaving particles at the end using boost::compute
+  numaddback = 0;
+  if (numaddback > 0) {
+    std::cout << "sort" << std::endl;
+    boost::compute::sort(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0),
+			 boost::compute::make_buffer_iterator<PART_OPENCL>(buf, numparticles),
+			 sort_by_label, queue);
+  }
+  
+
+  return DKS_SUCCESS;
+*/
+  std::cout << "OpenCL implementation disabled" << std::endl;
+  return DKS_ERROR;
+}
diff --git a/src/OpenCL/OpenCLCollimatorPhysics.h b/src/OpenCL/OpenCLCollimatorPhysics.h
new file mode 100644
index 0000000..7b532ff
--- /dev/null
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@@ -0,0 +1,85 @@
+#ifndef H_OPENCL_DEGRADER
+#define H_OPENCL_DEGRADER
+
+#include <iostream>
+#include <math.h>
+
+#include "../Algorithms/CollimatorPhysics.h"
+#include "OpenCLBase.h"
+
+/*
+#include "boost/compute/types/struct.hpp"
+#include "boost/compute/type_traits/type_name.hpp"
+#include "boost/compute/algorithm/count_if.hpp"
+#include "boost/compute/algorithm/sort.hpp"
+#include "boost/compute/container/vector.hpp"
+#include "boost/compute/iterator/buffer_iterator.hpp"
+#include "boost/compute/core.hpp"
+*/
+
+typedef struct {
+  double x;
+  double y;
+  double z;
+} Double3;
+
+typedef struct {
+  int label;
+  unsigned localID;
+
+  Double3 Rincol;
+  Double3 Pincol;
+} PART_OPENCL;
+
+//adapt struct PART for use in Boost.Compute
+//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
+//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
+
+class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
+
+private:
+  OpenCLBase *m_oclbase;
+
+public:
+
+  /* constructor: stores the OpenCL wrapper pointer */
+  OpenCLCollimatorPhysics(OpenCLBase *base) { 
+    m_oclbase = base;
+  }
+
+  /* destructor: nothing is released here */
+  ~OpenCLCollimatorPhysics() { 
+  }
+
+  /* execute degrader code on device (defined in OpenCLCollimatorPhysics.cpp) */
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+  /* every entry point below is an inline stub that returns DKS_ERROR */
+  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
+			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			   void *px_ptr, void *py_ptr, void *pz_ptr,
+			   void *par_ptr, int numparticles) { return DKS_ERROR; }
+  
+  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) { return DKS_ERROR; }
+
+  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
+			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
+			       void *px_ptr, void *py_ptr, void *pz_ptr,
+			       void *par_ptr, int numparticles, int &numaddback) { return DKS_ERROR; }
+
+  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
+			   double dt, double c, bool usedt = false, int streamId = -1) 
+    { 
+      return DKS_ERROR; 
+    }
+
+  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
+				    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
+				    double dt, double c, bool usedt = false, 
+				    int streamId = -1)
+    { 
+      return DKS_ERROR; 
+    }
+
+};
+
+#endif
diff --git a/src/OpenCL/OpenCLFFT.cpp b/src/OpenCL/OpenCLFFT.cpp
new file mode 100644
index 0000000..5cbe9e9
--- /dev/null
+++ b/src/OpenCL/OpenCLFFT.cpp
@@ -0,0 +1,303 @@
+#include "OpenCLFFT.h"
+
+//=====================================//
+//==========Private functions==========//
+//=====================================//
+
+/*
+  call fft kernels to execute FFT of the given domain, data - device memory ptr, cdim - current dim to transform, 
+  ndim - total number of dimensions, N - size of dimension
+*/
+int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) {
+  // Runs the FFT3D kernel along cdim once per step value 1, 2, 4, ... below N.
+  //set the number of work items in each dimension
+  size_t work_items[3];
+  work_items[0] = N;
+  work_items[1] = (ndim > 1) ? N : 1;
+  //third dimension only exists for 3D data (was "ndim > 1"; now matches the sibling kernels)
+  work_items[2] = (ndim > 2) ? N : 1;
+  work_items[cdim] = N / 2;
+  int f = (forward) ? 1 : 0;
+
+  //create kernel and set kernel arguments
+  if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+
+  //execute kernel, updating only the step argument between launches
+  for (int step = 1; step < N; step <<= 1) {
+    if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
+      return OCL_ERROR;
+
+    if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) 
+      return OCL_ERROR;
+  }
+
+  return OCL_SUCCESS;
+}
+
+/*
+  call ifft kernel to execute the bit reverse sort, data - device memory ptr, cdim - current dim to transform, 
+  ndim - total number of dimensions, N - size of dimension
+*/
+int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {
+  // Launches BitReverseSort3D over an N-per-dimension grid for dimension cdim.
+  // Any failing OpenCL wrapper call aborts with OCL_ERROR.
+  //global size: N work items in each used dimension, 1 in the unused ones
+  size_t global_size[3] = { (size_t)N, 1, 1 };
+  if (ndim > 1)
+    global_size[1] = N;
+  if (ndim > 2)
+    global_size[2] = N;
+
+  //create kernel
+  if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  //number of index bits, truncated from log2(N)
+  int bits = log2(N);
+
+  //bind data, bit count and dimension; stop at the first failure
+  bool args_ok = m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) == OCL_SUCCESS
+    && m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) == OCL_SUCCESS
+    && m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) == OCL_SUCCESS;
+  if (!args_ok)
+    return OCL_ERROR;
+
+  //execute kernel
+  if (m_oclbase->ocl_executeKernel(ndim, global_size) == OCL_SUCCESS)
+    return OCL_SUCCESS;
+  DEBUG_MSG("Error executing kernel");
+  return OCL_ERROR;
+}
+
+
+//=====================================//
+//==========Public functions==========//
+//=====================================//
+
+/*
+  call fft execution on device for every dimension
+*/
+int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
+  // For every dimension: bit-reverse sort, then the FFT kernel passes. Only N[0]
+  // is used as the size of each dimension; streamId is not used here.
+  cl_mem device_data = (cl_mem)data;
+  const int size = N[0];
+  int dim = 0;
+  while (dim < ndim) {
+    //reorder the input of this dimension
+    if (ocl_callBitReverseKernel(device_data, dim, ndim, size) != OCL_SUCCESS) {
+      DEBUG_MSG("Error executing bit reverse");
+      return OCL_ERROR;
+    }
+
+    //transform along this dimension
+    if (ocl_callFFTKernel(device_data, dim, ndim, size, forward) != OCL_SUCCESS) {
+      DEBUG_MSG("Error executing fft reverse");
+      return OCL_ERROR;
+    }
+    ++dim;
+  }
+  return OCL_SUCCESS;
+}
+	
+/*
+  execute ifft
+*/
+int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
+  //inverse = executeFFT with forward == false; propagate its status, which used to be dropped
+  return executeFFT(data, ndim, N, streamId, false);
+}
+	
+/*
+  call kernel to normalize fft
+*/
+int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
+  // Launches the normalizeFFT kernel over the whole data set; the element count
+  // passed to it is N[0] raised to the power ndim. streamId is not used here.
+  cl_mem device_data = (cl_mem)data;
+  const int size = N[0];
+
+  //global size: `size` work items in each used dimension, 1 in the unused ones
+  size_t global_size[3] = { (size_t)size, 1, 1 };
+  if (ndim > 1)
+    global_size[1] = size;
+  if (ndim > 2)
+    global_size[2] = size;
+
+  //create kernel
+  if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  //set kernel args: the buffer and the total number of elements
+  unsigned int elements = pow(size, ndim);
+  bool args_ok = m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &device_data) == OCL_SUCCESS
+    && m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) == OCL_SUCCESS;
+  if (!args_ok)
+    return OCL_ERROR;
+
+  //execute kernel
+  if (m_oclbase->ocl_executeKernel(ndim, global_size) == OCL_SUCCESS)
+    return OCL_SUCCESS;
+
+  DEBUG_MSG("Error executing kernel");
+  return OCL_ERROR;
+}
+
+int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
+  // Radix-2 FFT that alternates between the caller's buffer and a scratch buffer
+  // using the fft3d_radix2 kernel; the result always ends up in the caller's buffer.
+  // The scratch buffer is released on every exit path (it leaked whenever a
+  // kernel call failed) and a failed allocation is reported instead of being used.
+  int ierr;
+  int size = sizeof(cl_double2)*pow(N,ndim);
+
+  cl_mem mem_tmp;
+  cl_mem mem_src = (cl_mem)src;
+  cl_mem mem_scratch = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
+  if (ierr != OCL_SUCCESS)
+    return OCL_ERROR;
+  cl_mem mem_dst = mem_scratch;
+
+  //set the number of work items in each dimension
+  size_t work_items[3];
+  int p = 1;
+  int threads = N / 2;
+  int f = (forward) ? -1 : 1;
+
+  //number of kernel passes per dimension
+  int n = (int)log2(N);
+  for (int i = 0; i < ndim; i++) {
+
+    int dim = i+1;
+    p = 1;
+    work_items[0] = (dim == 1) ? N/2 : N;
+    work_items[1] = (dim == 2) ? N/2 : N;
+    work_items[2] = (dim == 3) ? N/2 : N;
+
+    //create kernel and set kernel arguments
+    if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS) {
+      m_oclbase->ocl_freeMemory(mem_scratch);
+      return OCL_ERROR;
+    }
+
+    for (int t = 1; t <= n; t++) {
+
+      m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
+      m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
+      m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
+      m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
+      m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
+      m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
+
+      if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
+	m_oclbase->ocl_freeMemory(mem_scratch);
+	return OCL_ERROR;
+      }
+
+      //swap the roles of the two buffers for the next pass
+      mem_tmp = mem_src;
+      mem_src = mem_dst;
+      mem_dst = mem_tmp;
+
+      p = 2*p;
+    }
+  }
+
+  //after an odd number of passes the result sits in the scratch buffer
+  //(mem_src at this point): copy it back into the caller's buffer
+  if (ndim*n % 2 == 1)
+    m_oclbase->ocl_copyData(mem_src, mem_dst, size);
+
+  m_oclbase->ocl_freeMemory(mem_scratch);
+
+  return OCL_SUCCESS;
+}
+
+int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
+  // Runs the fft_batch3D kernel once per dimension (dim = 1..ndim) on `src`, always
+  // as a 3D launch; wrapper return values are not checked, OCL_SUCCESS is returned.
+  // NOTE(review): `forward` is not used here - confirm the kernel's direction.
+  cl_mem device_data = (cl_mem)src;
+  const size_t half = (size_t)N/2;
+  size_t global_size[3] = { half, (size_t)N, (size_t)N };
+  size_t group_size[3] = { half, 1, 1 };
+
+  //bind the buffer, three arguments that pass only a size (NULL value), and N
+  m_oclbase->ocl_createKernel("fft_batch3D");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &device_data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
+  m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
+  m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
+  m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
+
+  for (int dim = 1; dim <= ndim; ++dim) {
+    m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
+    m_oclbase->ocl_executeKernel(3, global_size, group_size);
+  }
+  return OCL_SUCCESS;
+}
+
+int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
+  // Launches the transpose kernel with `src` as both its first and second
+  // argument, using one work group that spans the whole N[0] x N[1] global range.
+  // One-dimensional data is left untouched. `dim` is not used.
+  // Wrapper return values are not checked; OCL_SUCCESS is always returned.
+  if (ndim == 1)
+    return OCL_SUCCESS;
+
+  cl_mem device_data = (cl_mem)src;
+
+  //global and work group size are identical: N[0] x N[1] x 1
+  size_t global_size[3] = { (size_t)N[0], (size_t)N[1], 1 };
+  size_t group_size[3] = { global_size[0], global_size[1], 1 };
+
+  //argument 4 passes only a size (NULL value): one cl_double2 per work item
+  //of the group
+  size_t local_elems = group_size[0] * group_size[1] * group_size[2];
+
+  //bind arguments and launch
+  m_oclbase->ocl_createKernel("transpose");
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &device_data);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &device_data);
+  m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
+  m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_elems, NULL);
+  m_oclbase->ocl_executeKernel(ndim, global_size, group_size);
+
+  return OCL_SUCCESS;
+}
+
+/*
+void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
+    
+  for (int j = 0; j < N; j++) {
+    for (int i = 0; i < N; i++) {
+      for (int k = 0; k < N; k++) {
+	double d = data[i*N*N + j*N + k].x;
+	if (d > 10e-5 || d < -10e-5)
+	  std::cout << d << "\t";
+	else 
+	  std::cout << 0 << "\t";
+      }
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+    
+}
+*/
+
+
+
+
diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h
new file mode 100644
index 0000000..31816f9
--- /dev/null
+++ b/src/OpenCL/OpenCLFFT.h
@@ -0,0 +1,113 @@
+/*
+
+  Name: OpenCLFFT
+
+  Author: Uldis Locans
+
+  Info: Implement the DKSFFT interface (fft and ifft functions) using OpenCL through an OpenCLBase object
+
+  Date: 19.09.2014
+
+*/
+#ifndef H_OPENCL_FFT
+#define H_OPENCL_FFT
+
+
+#include <iostream>
+#include <math.h>
+#include <complex>
+
+#include "../Algorithms/FFT.h"
+#include "OpenCLBase.h"
+
+class OpenCLFFT : public DKSFFT {
+  /* FFT implementation of the DKSFFT interface that launches OpenCL kernels through an OpenCLBase object */
+private:
+  /* OpenCL wrapper handed to the constructor; the (empty) destructor does not release it */
+  OpenCLBase *m_oclbase;
+
+  /*
+    Info: call fft kernels to execute FFT of the given domain,
+    data - device memory ptr, cdim - current dim to transform, 
+    ndim - total number of dimensions, N - size of dimension
+    Return: success or error code
+  */
+  int ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward = true);
+	
+  /*
+    Info: call the kernel that executes the bit reverse sort
+    data - device memory ptr, cdim - current dim to transform, 
+    ndim - total number of dimensions, N - size of dimension
+    Return: success or error code
+  */
+  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
+
+public:
+
+  /* constructor - stores the OpenCLBase pointer that is used to run the kernels */
+  OpenCLFFT(OpenCLBase *base) {
+    m_oclbase = base;
+  }
+	
+  /* destructor - currently does nothing */
+  ~OpenCLFFT() { }
+	
+  /*
+    Info: execute fft function with data set on device (forward selects the direction)
+    Return: success or error code
+  */
+  //int ocl_executeFFT(cl_mem &data, int ndim, int N, bool forward = true);
+  int executeFFT(void *data, int ndim, int N[3], int streamId = -1, bool forward = true);
+	
+  /*
+    Info: execute inverse fft with data set on device
+    Return: success or error code
+  */
+  //int ocl_executeIFFT(cl_mem &data, int ndim, int N);
+  int executeIFFT(void *data, int ndim, int N[3], int streamId = -1);
+	
+  /*
+    Info: execute normalize kernel
+    Return: success or error code
+  */
+  //int ocl_normalizeFFT(cl_mem &data, int ndim, int N);
+  int normalizeFFT(void *data, int ndim, int N[3], int streamId = -1);
+	
+  /*
+    Info: set FFT size - the setup/destroy functions below do no work and always report DKS_SUCCESS
+    Return: success or error code
+  */
+  int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
+
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+
+  int destroyFFT() { return DKS_SUCCESS; }
+  /* real <-> complex transforms are not provided by this class: the three stubs below always return DKS_ERROR */
+  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
+				int streamId = -1)
+    {
+      return DKS_ERROR;
+    }
+  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
+				int streamId = -1)
+    {
+      return DKS_ERROR;
+    }
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
+    {
+      return DKS_ERROR;
+    }
+  /* additional FFT and transpose entry points; only declared here, defined outside this header */
+  int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
+
+  int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
+
+  int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
+	
+  //void printData3DN4(cl_double2* &data, int N);
+
+};
+
+#endif
diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl
new file mode 100644
index 0000000..f08f268
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl
@@ -0,0 +1,175 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#define TAU 2.197019
+
+
+__kernel void parallelReductionSum(__global double *data_in, __global double *data_out, 
+				   __local double *data_local, int size) 
+{
+  //each work group sums its slice of data_in and writes one partial sum to data_out[group id]
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int group_size = get_local_size(0);
+
+  //copy from global memory to local, if global id out of bounds fill with 0s
+  if (global_id < size)
+    data_local[local_id] = data_in[global_id];
+  else
+    data_local[local_id] = 0;
+
+  //loop through reduction steps; halving the stride covers every element only for power-of-two group sizes
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+ 
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
+
+__kernel void kernelPHistoTFFcn(__global double *data, __global double *par, __global double *chisq,
+				double fTimeResolution, double fRebin,
+				int length, int sensors, int numpar,
+				__local double *p)
+{
+  //per-bin chi-square terms for every sensor histogram: chisq[i*length + j] for sensor i, bin j
+  //get work item id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+
+  //load parameters from global to local memory; strided so that numpar may exceed the work group size
+  for (int k = tid; k < numpar; k += (int)get_local_size(0))
+    p[k] = par[k];
+
+  //sync work items inside work group
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+  if (j < length) {
+    //quantities that are the same for all sensors in this time bin
+    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+    double time = dt0 + fTimeResolution * fRebin * j;  
+    double w = p[0]*0.08516155035269027;
+    double tt = exp(-time/TAU);
+    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
+    double wt = w * time;
+    //NOTE(review): the loop reads p[2+i*4] .. p[5+i*4], so numpar >= 2 + 4*sensors is assumed
+
+    int idx;
+    double ldata, theo;
+    for (int i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      ldata = data[idx];
+      //theory value; 1.74532925199432955e-2 = pi/180 turns p[4+i*4] from degrees into radians
+      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; 
+      //a bin with no counts cannot be weighted by 1/ldata, theo^2 is stored instead
+      if (ldata != 0.0)
+	chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
+      else
+	chisq[idx] = theo * theo;
+    }
+  }
+}
+
+__kernel void kernelSingleGaussTF(__global double *data, __global unsigned int *t0,
+				  __global double *par, __global double *result,
+				  double fTimeResolution, double fRebin, double fGoodBinOffset,
+				  int length, int sensors, int numpar, __local double *p)
+{
+  //per-bin terms result[i*length + j] (sensor i, bin j) for the single Gaussian TF theory
+  //get work item id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  
+  //load parameters from global to local memory; strided so that numpar may exceed the work group size
+  for (int k = tid; k < numpar; k += (int)get_local_size(0))
+    p[k] = par[k];
+
+  //sync work items inside work group
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    double w1 = p[0]*0.08516155035269027; //use the local copy, like every other parameter read
+    //NOTE(review): the loop reads p[2+i*4] .. p[5+i*4], so numpar >= 2 + 4*sensors is assumed
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int  i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+	ldata = data[idx];
+	time = dt0 + fTimeResolution * fRebin* (j - lft0);
+	theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
+					*cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; 
+	// 1.74532925199432955e-2 = pi/180
+	//the log term is only evaluated where data and theory are both clearly non-zero
+	if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
+	  result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+	else
+	  result[idx] = theo - ldata;
+      } else {
+	result[idx] = 0; //bins before t0 + fGoodBinOffset/fRebin contribute nothing
+      }
+    }
+  }
+
+}
+
+__kernel void kernelDoubleLorentzTF(__global double *data, __global unsigned int *t0,
+				    __global double *par, __global double *result,
+				    double fTimeResolution, double fRebin, double fGoodBinOffset,
+				    int length, int sensors, int numpar, __local double *p)
+{
+  //per-bin terms result[i*length + j] (sensor i, bin j) for the double Lorentzian TF theory
+  //get work item id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  
+  //load parameters from global to local memory; strided so that numpar may exceed the work group size
+  for (int k = tid; k < numpar; k += (int)get_local_size(0))
+    p[k] = par[k];
+
+  //sync work items inside work group
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    double w1 = p[0]*0.08516155035269027;
+    double w2 = p[2]*0.08516155035269027;
+    //NOTE(review): the loop reads p[4+i*5] .. p[8+i*5], so numpar >= 4 + 5*sensors is assumed
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int  i = 0; i < sensors; i++) {
+      //bin j of sensor i
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+        ldata = data[idx];
+        time = dt0+fTimeResolution*fRebin*(j-lft0);
+        //two damped cosines (rates p[1], p[3]) sharing amplitude p[5+i*5], mixed by p[8+i*5]
+	theo = p[4+i*5]*exp(-time/TAU)*
+	  (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
+	   cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
+	   (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
+	   cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5]; 
+	// 1.74532925199432955e-2 = pi/180
+	if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+	  result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+	else
+	  result[idx] = theo - ldata;
+      } else {
+	result[idx] = 0; //bins before t0 + fGoodBinOffset/fRebin contribute nothing
+      }
+    }
+  }
+
+}
+
diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
new file mode 100644
index 0000000..bdc9374
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@@ -0,0 +1,344 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#define PI     3.141592653589793115998
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+/** From 'Numerical Recipes in C' by Press et.al, 1992. */
+//Returns the Bessel function J0(x) for any real x.
+double bessj0(double x) {
+  double ax,z;
+  double xx,y,ans,ans1,ans2; //Accumulate polynomials in double precision.
+  //two branches: a ratio of polynomials in x*x for |x| < 8, a cos/sin form in 8/|x| otherwise
+  if ((ax=fabs(x)) < 8.0) { //Direct rational function fit.
+    y=x*x;
+    ans1=57568490574.0+y*(-13362590354.0+y*(651619640.7+y*(-11214424.18+y*(77392.33017+y*(-184.9052456)))));
+    ans2=57568490411.0+y*(1029532985.0+y*(9494680.718+y*(59272.64853+y*(267.8532712+y*1.0))));
+    ans=ans1/ans2;
+  } else { //Fitting function (6.5.9).
+    z=8.0/ax;
+    y=z*z;
+    xx=ax-0.785398164; //0.785398164 is pi/4 to the digits given
+    ans1=1.0+y*(-0.1098628627e-2+y*(0.2734510407e-4+y*(-0.2073370639e-5+y*0.2093887211e-6)));
+    ans2 = -0.1562499995e-1+y*(0.1430488765e-3+y*(-0.6911147651e-5+y*(0.7621095161e-6-y*0.934945152e-7)));
+    ans=sqrt(0.636619772/ax)*(cos(xx)*ans1-z*sin(xx)*ans2); //0.636619772 is 2/pi to the digits given
+  }
+  return ans;
+}
+
+/** Theory function declaration.
+ * Definition of the theory function will be build during runtime before compilation.
+ */
+double fTheory(double t, __local double *p, __local double *f, __local int *m);
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is alwats time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double t, double lamda) { // exponential decay exp(-lamda*t)
+  return exp( -lamda*t );
+}
+
+double ge(double t, double lamda, double beta) { // stretched exponential exp(-(lamda*t)^beta)
+  return exp( -pow(lamda*t, beta) );
+}
+
+double sg(double t, double sigma) { // Gaussian decay exp(-(sigma*t)^2 / 2)
+  return exp( -0.5 * pow(sigma*t, 2) );
+}
+
+double stg(double t, double sigma) { // 1/3 + 2/3*(1 - (sigma*t)^2)*exp(-(sigma*t)^2/2); presumably the static Gaussian Kubo-Toyabe form
+  double sigmatsq = pow(sigma*t,2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+double sekt(double t, double lambda) {
+  double lambdat = lambda*t;
+  // 1/3 + 2/3*(1 - lambda*t)*exp(-lambda*t)
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+double lgkt(double t, double lambda, double sigma) {
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+  // 1/3 + 2/3*(1 - lambda*t - (sigma*t)^2)*exp(-lambda*t - (sigma*t)^2/2)
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+double skt(double t, double sigma, double beta) {
+  if (beta < 1.0e-3) // guards the division by beta below; such inputs yield 0
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+  // 1/3 + 2/3*(1 - (sigma*t)^beta)*exp(-(sigma*t)^beta / beta)
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+double spg(double t, double lambda, double gamma, double q) {
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q;
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma; // NOTE(review): gamma == 0 is not guarded
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+  // 1/3*exp(-rateL) + 2/3*(1 - lamt2q/rateT)*exp(-rateT); NOTE(review): rateT is 0 when t == 0
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+double rahf(double t, double nu, double lambda) {
+  double nut  = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+  // sum of two damped terms; 2.44949 is sqrt(6) to the digits given
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+double tf(double t, double phi, double nu) {
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD * phi;
+  // cosine at frequency nu with phase phi given in degrees
+  return cos(tmp_nu + tmp_phi);
+}
+
+double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  // fraction alpha: cosine damped with rate lambdaT; fraction (1 - alpha): pure decay with rate lambdaL
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double b(double t, double phi, double nu) { // J0 Bessel function of 2*pi*nu*t + phi (phi in degrees)
+  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi;
+  // same structure as ifld(), with the cosine replaced by the J0 Bessel function
+  return alpha*bessj0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+  // exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)); NOTE(review): gamma == 0 is not guarded
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+double snkzf(double t, double Delta0, double Rb) {
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+  // 1/3 + 2/3 * aa^1.5 * (1 - D0t2*aa) * exp(-D0t2*aa/2)
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double t, double phi, double nu, double Delta0, double Rb) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+  // envelope sqrt(aa)*exp(-D0t2*aa/2) times a cosine at frequency nu, phase phi in degrees
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+double dnkzf(double t, double Delta0, double Rb, double nuc) {
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); // NOTE(review): nuc == 0 is not guarded
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
+  // sqrt(aa) * exp(-2 * Delta0^2 * theta * aa)
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); // NOTE(review): nuc == 0 is not guarded
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
+  // envelope sqrt(aa)*exp(-Delta0^2*theta*aa) times a cosine at frequency nu, phase phi in degrees
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+__kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
+            __global double *par, __global double *chisq, __global  int *map, __global double *funcv,
+			      int length, int numpar, int numfunc, int nummap,
+			      double timeStart, double timeStep,         
+			      double tau, double N0, double bkg, 
+			      __local double *p, __local double *f, __local int *m)
+{
+  //writes one chi-square (or, with MLH defined, log-likelihood) term per bin into chisq[j]
+  //get thread id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //load parameters from global to shared memory; strided by the group size so numpar may exceed it
+  while (tid < numpar) {
+    p[tid] = par[tid];
+    tid += lsize;
+  }
+
+  //load functions from global to shared memory
+  tid = get_local_id(0);
+  while (tid < numfunc) {
+    f[tid] = funcv[tid];
+    tid += lsize;
+  }
+
+  //load maps from global memory
+  tid = get_local_id(0);
+  while (tid < nummap) {
+    m[tid] = map[tid];
+    tid += lsize;
+  }
+
+  //sync threads
+  barrier(CLK_LOCAL_MEM_FENCE);
+  //each work item handles bins j, j + global size, j + 2*global size, ...
+  while (j < length) {
+    
+    double t = timeStart + j*timeStep;                                  
+    double ldata = data[j];
+    double lerr = err[j];
+    //fTheory is only declared in this file; its definition is generated at runtime
+    double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;
+
+    #ifdef MLH
+    if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
+    else
+      chisq[j] = 2.0 * (theo - ldata);
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+    j += get_global_size(0); 
+  }
+
+}
+
+__kernel void kernelChiSquareAsymmetry(__global double *data, __global double *err,
+            __global double *par, __global double *chisq, __global  int *map, __global double *funcv,
+            int length, int numpar, int numfunc, int nummap,
+            double timeStart, double timeStep,
+            double alpha, double beta,
+            __local double *p, __local double *f, __local int *m)
+{
+  //writes one chi-square term per bin into chisq[j] for an asymmetry fit (0 when MLH is defined)
+  //get thread id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //load parameters from global to shared memory; strided by the group size so numpar may exceed it
+  while (tid < numpar) {
+    p[tid] = par[tid];
+    tid += lsize;
+  }
+
+  //load functions from global to shared memory
+  tid = get_local_id(0);
+  while (tid < numfunc) {
+    f[tid] = funcv[tid];
+    tid += lsize;
+  }
+
+  //load maps from global memory; a loop (not a single if) so maps beyond the group size are copied too
+  tid = get_local_id(0);
+  while (tid < nummap) {
+    m[tid] = map[tid];
+    tid += lsize;
+  }
+
+  //sync threads
+  barrier(CLK_LOCAL_MEM_FENCE);
+  //each work item handles bins j, j + global size, j + 2*global size, ...
+  while (j < length) {
+
+    double t = timeStart + j*timeStep;
+    double ldata = data[j];
+    double lerr = err[j];
+    //asymmetry built from the theory value, alpha and beta
+    double ab = alpha*beta;
+    double theoVal = fTheory(t, p, f, m);
+    double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0)-(ab-1.0)*theoVal);
+
+    #ifdef MLH
+    chisq[j] = 0.0; // max log likelihood not defined for asymmetry fit
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+    j += get_global_size(0); 
+  }
+
+}
+
+__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
+				   __local double *data_local, int size) 
+{
+  //each work group sums its slice of data_in and writes one partial sum to data_out[group id]
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int group_size = get_local_size(0);
+
+  //copy from global memory to local, if global id out of bounds fill with 0s
+  if (global_id < size)
+    data_local[local_id] = data_in[global_id];
+  else
+    data_local[local_id] = 0;
+
+  //loop through reduction steps; halving the stride covers every element only for power-of-two group sizes
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+ 
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
+
+__kernel void parallelReductionTwoPhase(__global double *data_in, __global double *data_out, 
+					__local double *data_local, int size)
+{
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int global_size = get_global_size(0);
+  int group_size = get_local_size(0);
+  //phase one: every work item accumulates the elements global_id, global_id + global_size, ...
+  double acc = 0;
+  while (global_id < size) {
+    acc += data_in[global_id];
+    global_id += global_size;
+  }
+
+  //phase two: parallel reduction on local work group
+  data_local[local_id] = acc;
+  barrier(CLK_LOCAL_MEM_FENCE); //NOTE(review): immediately followed by the barrier at the top of the loop
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step (covers every element only for power-of-two group sizes)
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
diff --git a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
new file mode 100644
index 0000000..34b08bd
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@@ -0,0 +1,362 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#pragma OPENCL EXTENSION 
+
+
+/******Random numbers********/
+
+/* struct for random number state */
+typedef struct {
+  /* s10..s12 and s20..s22: the two three-value recurrences advanced by rand_uniform */
+  double s10;
+  double s11;
+  double s12;
+  double s20;
+  double s21;
+  double s22;
+  double z;  /* second normal deviate stored by rand_normal for its next call */
+  bool gen;  /* true: rand_normal must draw a new pair, false: z holds an unused deviate */
+
+} RNDState;
+
+#define NORM 2.328306549295728e-10
+#define M1    4294967087.0
+#define M2    4294944443.0
+#define A12      1403580.0
+#define A13N      810728.0
+#define A21       527612.0
+#define A23N     1370589.0
+
+/* MRG32k3a uniform random number generator */
+double rand_uniform(RNDState *s) {
+  long k;
+  double p1, p2;
+  /* advances both components of *s by one step and returns their combination scaled by NORM */
+  /* Component 1 */
+  p1 = A12 * (*s).s11 - A13N * (*s).s10;
+  k = p1 / M1;
+  p1 -= k * M1;
+  if (p1 < 0.0)
+    p1 += M1;
+  (*s).s10 = (*s).s11;
+  (*s).s11 = (*s).s12;
+  (*s).s12 = p1;
+
+  /* Component 2 */
+  p2 = A21 * (*s).s22 - A23N * (*s).s20;
+  k = p2 / M2;
+  p2 -= k * M2;
+  if (p2 < 0.0)
+    p2 += M2;
+  (*s).s20 = (*s).s21;
+  (*s).s21 = (*s).s22;
+  (*s).s22 = p2;
+
+  /* Combination: M1 is added when p1 <= p2 so the returned value is always positive */
+  if (p1 <= p2)
+    return ((p1 - p2 + M1) * NORM);
+  else
+    return ((p1 - p2) * NORM);
+}
+
+/* get random variable with gaussian distribution */
+double rand_normal(RNDState *s, double mu, double sigma) {
+  /* draws come in pairs: one is returned right away, the other is kept in s->z for the next call */
+  const double two_pi = 2.0 * 3.141592653589793223846;
+  double z0;
+  /* gen == false means s->z still holds the unused half of the previous pair */
+  if (!(*s).gen) {
+    (*s).gen = true;
+    return (*s).z * sigma + mu;
+  }
+  /* otherwise build a new pair from two uniform deviates */
+  double u1, u2;
+  u1 = rand_uniform(s);
+  u2 = rand_uniform(s);
+  /* cosine part is returned now, sine part is stored */
+  z0 = sqrt(-2.0 * log(u1)) * cos(two_pi * u2);
+  (*s).z = sqrt(-2.0 * log(u1)) * sin(two_pi * u2);
+  (*s).gen = false;
+
+  return z0 * sigma + mu;
+
+
+}
+
+/* initialize random states */
+__kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
+  /* fills s[0..N-1] with start states; NOTE(review): seed is never read, states depend on id only */
+  int id = get_global_id(0);
+
+  if (id < N) {
+    RNDState tmp;
+    int tmp_seed = id;// * 0x100000000ULL;
+    tmp.s10 = 12345 + tmp_seed;
+    tmp.s11 = 12345 + tmp_seed;
+    tmp.s12 = 123 + tmp_seed;
+    tmp.s20 = 12345 + tmp_seed;
+    tmp.s21 = 12345 + tmp_seed;
+    tmp.s22 = 123 + tmp_seed;
+    /* no stored normal deviate yet: gen == true makes rand_normal draw a fresh pair */
+    tmp.z = 0;
+    tmp.gen = true;
+
+    s[id] = tmp;
+  }
+
+}
+
+
+/**********Degrader**********/
+enum PARAMS { POSITION, // indices into the parameter array (par / p); POSITION and ZSIZE are passed to checkHit
+	      ZSIZE, 
+	      M_P, 
+	      C, 
+	      RHO_M, 
+	      PI, 
+	      AVO, 
+	      R_E,
+	      eM_E,
+	      Z_M, 
+	      A_M, 
+	      A2_C, 
+	      A3_C, 
+	      A4_C, 
+	      A5_C, 
+	      Z_P, 
+	      X0_M,
+	      I_M,
+	      DT_M}; // DT_M: time step handed to energyLoss and coulombScat as deltat
+
+
+typedef struct {
+  int label;        /* written by kernelCollimatorPhysics: 0, -1 or -2 */
+  unsigned localID; /* not accessed by the code in this file */
+  double3 Rincol;   /* position, read and written by kernelCollimatorPhysics */
+  double3 Pincol;   /* momentum, read and written by kernelCollimatorPhysics */
+} PART;
+
+double Dot(double3 d1, double3 d2) { /* scalar product of the x, y, z components */
+  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
+}
+
+/* check if particle is in degrader material */
+bool checkHit(double z, double position, double zsize) { /* true for z in (position, position + zsize] */
+  return ( ( z > position) && ( z <= position + zsize) );
+}
+
+/* calculate particles energy loss */
+void energyLoss(double *Eng, bool *pdead, double deltat, RNDState *s, __local double *par) {
+  /* updates *Eng for one step deltat and sets *pdead; draws from s via rand_normal */
+  double dEdx = 0.0;
+  double gamma = ( (*Eng) + par[M_P]) / par[M_P];
+
+  double gamma2 = gamma * gamma;
+
+  double beta = sqrt(1.0 - 1.0 / gamma2);
+  double beta2 = beta * beta;
+  double deltas = deltat * beta * par[C];
+  double deltasrho = deltas * 100 * par[RHO_M];
+  double K = 4.0 * par[PI] * par[AVO] * par[R_E] * par[R_E] * par[eM_E] * 1E7;
+  double sigma_E = sqrt(K * par[eM_E] * par[RHO_M] * (par[Z_M]/par[A_M])* deltas * 1E5);
+  /* low-energy window: stopping power from the A2_C..A5_C coefficients */
+  if (((*Eng) > 0.00001) && ((*Eng) < 0.0006)) {
+    double Ts = ((*Eng)*1E6)/1.0073; 
+    double epsilon_low = par[A2_C]*pow(Ts,0.45);
+    double epsilon_high = (par[A3_C]/Ts)*log(1+(par[A4_C]/Ts)+(par[A5_C]*Ts));
+    double epsilon = (epsilon_low*epsilon_high)/(epsilon_low + epsilon_high);
+    dEdx = - epsilon /(1E21*(par[A_M]/par[AVO])); 
+    double delta_Eave = deltasrho * dEdx;
+    double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
+
+    (*Eng) = (*Eng) + delta_E / 1E3;
+  }
+  /* this test sees the energy as already updated above; gamma, beta and sigma_E are not recomputed */
+  if ((*Eng) >= 0.0006) {
+    double Tmax = 2.0 * par[eM_E] * 1e9 * beta2 * gamma2 /
+      (1.0 + 2.0 * gamma * par[eM_E] / par[M_P] + 
+       (par[eM_E] / par[M_P]) * (par[eM_E] / par[M_P]));
+    dEdx = -K * par[Z_P] * par[Z_P] * par[Z_M] / (par[A_M] * beta2) *
+      (1.0 / 2.0 * log(2 * par[eM_E] * 1e9 * beta2 * gamma2 * 
+		       Tmax / par[I_M] / par[I_M]) - beta2);
+
+    double delta_Eave = deltasrho * dEdx;
+    double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
+
+    (*Eng) = (*Eng)+delta_E / 1E3;
+  }
+  /* dead when the remaining energy is below 1E-4 or the computed dEdx came out positive */
+  (*pdead) = (((*Eng)<1E-4) || (dEdx>0));
+
+}
+
+/* rotate partocle */
+void Rot(double3 *P,  double3 *R, double xplane, 
+	 double normP, double thetacou, double deltas, int coord,
+	 __local double *par) 
+{
+  double Psixz;
+  double pxz;
+  /* only the .x and .z components of *P and *R are read or written in this function */
+  double px = (*P).x;
+  double pz = (*P).z;
+  double x = (*R).x;
+  double z = (*R).z;
+  /* angle of the momentum in the x-z plane, with an offset chosen from the signs of px and pz */
+  if (px>=0 && pz>=0) Psixz = atan(px/pz);
+  else if (px>0 && pz<0)
+    Psixz = atan(px/pz) + par[PI];
+  else if (px<0 && pz>0)
+    Psixz = atan(px/pz) + 2*par[PI];
+  else
+    Psixz = atan(px/pz) + par[PI];
+  /* coord 1 and 2 shift the position (2 also advances z by deltas*pz/normP); other values leave *R alone */
+  pxz = sqrt(px*px + pz*pz);
+  if(coord==1) {
+    (*R).x = x + deltas * px/normP + xplane*cos(Psixz);
+    (*R).z = z - xplane * sin(Psixz);
+  }
+  if(coord==2) {
+    (*R).x = x + deltas * px/normP + xplane*cos(Psixz);
+    (*R).z = z - xplane * sin(Psixz) + deltas * pz / normP;
+  }
+  (*P).x = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
+  (*P).z = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+}
+
+
+void coulombScat(double3 *R, double3 *P, double deltat,
+		 RNDState *s, __local double* par) {
+  /* scatters *P and moves *R for one step deltat: first in the x-z plane, then in the y-z plane */
+  double dotP = Dot((*P), (*P));
+
+  double Eng = sqrt(dotP + 1.0) * par[M_P] - par[M_P];
+  double gamma = (Eng + par[M_P]) / par[M_P];
+  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  double normP = sqrt(dotP);
+  double deltas = deltat * beta * par[C];
+  double theta0 = 13.6e6 / (beta * sqrt(dotP) * par[M_P] * 1e9) * 
+    par[Z_P] * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
+
+  // x-direction: See Physical Review, "Multiple Scattering"
+  double z1 = rand_normal(s, 0, 1);
+  double z2 = rand_normal(s, 0, 1);
+  double thetacou = z2 * theta0;
+  // redraw until the angle lies within 3.5 * theta0
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = rand_normal(s, 0, 1);
+    z2 = rand_normal(s, 0, 1);
+    thetacou = z2 * theta0;
+  }
+
+  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  int coord = 1; 
+  Rot(P, R, xplane, normP, thetacou, deltas, coord, par);
+  // with probability 0.0047 apply an additional large-angle rotation
+  double P2 = rand_uniform(s);
+  if(P2 < 0.0047) {
+    double P3 = rand_uniform(s);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = rand_uniform(s);
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    coord = 0; // no change in coordinates but one in momenta-direction
+    Rot(P, R, xplane, normP, thetaru, deltas, coord, par);
+  }
+  // y-direction: Rot() only works on .x/.z, so y is placed in the x slot of temporaries
+  double3 Ry = (double3)((*R).y, (*R).x, (*R).z);
+  double3 Py = (double3)((*P).y, (*P).x, (*P).z);
+  z1 = rand_normal(s, 0, 1);
+  z2 = rand_normal(s, 0, 1);
+  thetacou = z2 * theta0;
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = rand_normal(s, 0, 1);
+    z2 = rand_normal(s, 0, 1);
+    thetacou = z2 * theta0;
+  }
+  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  coord = 2; 
+  Rot(&Py, &Ry, yplane, normP, thetacou, deltas, coord, par);
+  P2 = rand_uniform(s);
+  if(P2 < 0.0047) {
+    double P3 = rand_uniform(s);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = rand_uniform(s);
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    coord = 0; // no change in coordinates but one in momenta-direction
+    Rot(&Py, &Ry, yplane, normP, thetaru, deltas, coord, par);
+  }
+  // copy the rotated y and z components back; x was only carried along in the temporaries
+  (*R).y = Ry.x;  (*R).z = Ry.z;
+  (*P).y = Py.x;  (*P).z = Py.z;
+}
+
+#define NUMPARAMS 19
+__kernel void kernelCollimatorPhysics(__global PART *data, __global double *par, 
+				      __global RNDState *state, int numparticles,
+				      __local double *p)
+{
+  //advances one particle per work item by one time step p[DT_M] and sets its label
+  //get local and global id
+  int tid = get_local_id(0);
+  int idx = get_global_id(0);
+
+  //(debug printf removed: it printed the constant string "idx:" once per work item)
+
+  //transfer params to local memory; strided so groups smaller than NUMPARAMS still load all of them
+  for (int k = tid; k < NUMPARAMS; k += (int)get_local_size(0))
+    p[k] = par[k];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  //surplus work items have no particle; previously they ran the physics below on uninitialised
+  //R, P and s. No barrier follows this point, so leaving early is safe.
+  if (idx >= numparticles)
+    return;
+  int l = 0;
+  double3 R = data[idx].Rincol;
+  double3 P = data[idx].Pincol;
+  RNDState s = state[idx];
+
+  double sq = sqrt(1.0 + Dot(P, P));
+  bool pdead = false;  
+  bool hit = checkHit(R.z, p[POSITION], p[ZSIZE]);
+  double Eng;
+  //inside the material: lose energy; outside: move the particle by one time step
+  if (hit) {      
+    Eng = (sq - 1) * p[M_P];
+    energyLoss(&Eng, &pdead, p[DT_M], &s, p);
+  } else {
+    R.x = R.x + p[DT_M] * p[C] * P.x / sq;
+    R.y = R.y + p[DT_M] * p[C] * P.y / sq;
+    R.z = R.z + p[DT_M] * p[C] * P.z / sq;
+    l = -2;
+  }
+  //surviving particle: rescale the momentum to the new energy, then scatter
+  if (hit && !pdead) {
+    double ptot = sqrt((p[M_P] + Eng) * (p[M_P] + Eng) - (p[M_P] * p[M_P])) / p[M_P];
+    sq = sqrt(Dot(P, P));
+    P.x = P.x * ptot / sq;
+    P.y = P.y * ptot / sq;
+    P.z = P.z * ptot / sq;
+    coulombScat(&R, &P, p[DT_M], &s, p); 
+  } 
+  
+  if (hit && pdead)
+    l = -1;
+    
+  //store the updated particle and generator state; l is 0 for a surviving hit,
+  //-1 for a particle that died in the material and -2 for one outside of it
+  data[idx].Rincol = R;
+  data[idx].Pincol = P;
+  data[idx].label = l;
+  state[idx] = s;
+  
+}
+
+
+/* count dead particles and particles leaving material - boost compute? */
+
+/* sort particles so dead and leaving particles are at the end of PART array - boost compute */
+
+
diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFT.cl b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl
new file mode 100644
index 0000000..1d4763b
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl
@@ -0,0 +1,181 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/* 3D normalize FFT kernel */
+__kernel void normalizeFFT(__global double2 *input, int N) {
+  int i1 = get_global_id(0);
+  int i2 = get_global_id(1);
+  int i3 = get_global_id(2);
+  int n1 = get_global_size(0);
+  int n2 = get_global_size(1);
+  int n3 = get_global_size(2);
+  //linear index; NOTE(review): rows are strided by n2 and planes by n2*n2 (n1 is unused) - confirm non-cubic sizes are never used
+  int id = i1;
+  if (n2 > 1)
+    id += i2*n2;
+  if (n3 > 1)
+    id += i3*n2*n2;
+  //both components are divided by N exactly as passed in by the caller
+  input[id].x = input[id].x / N;
+  input[id].y = input[id].y / N;
+}
+
+/* 3D radix 2 FFT kernel */
+__kernel void FFT3D(__global double2 *input, int step, int dim, int forward) {
+  //one in-place butterfly per work item: combines input[id] with input[match] along dimension dim
+  int n1 = get_global_size(0);
+  int n2 = get_global_size(1);
+  int n3 = get_global_size(2);
+  int i1 = get_global_id(0);
+  int i2 = get_global_id(1);
+  int i3 = get_global_id(2);
+    
+  int jump = step << 1;
+  //NOTE(review): for dim outside 0..2 the indices below stay uninitialised
+  int d, idGroup, idLoc, idTwidle, id, match;
+  if (dim == 0) {
+	
+    d = n1 / step;	// n1 >> log2(step)
+    idLoc = i1 / d;
+    idGroup = i1 & (d-1); //modulo
+
+    idTwidle = idGroup * jump + idLoc;
+    id = i3*n3*n3 + i2*n2 + idTwidle;
+    match = id + step;
+  } else if (dim == 1) {
+	
+    d = n2 / step;
+    idLoc = i2 / d;
+    idGroup = i2 & (d-1);
+	
+    idTwidle = idGroup * jump + idLoc;
+    id = i3*n3*n3 + idTwidle*n1 + i1;
+    match = id + step*n1;
+  } else if (dim == 2) {
+	
+    d = n3 / step;
+    idLoc = i3 / d;
+    idGroup = i3 & (d-1);
+	
+    idTwidle = idGroup * jump + idLoc;
+    id = idTwidle*n1*n1 + i2*n2 + i1;
+    match = id + step*n1*n1;
+  }
+  //twiddle angle: negative when forward == 1, positive otherwise
+  double alpha;
+  if (forward == 1)
+    alpha = -( 2 * M_PI / jump ) * idTwidle;
+  else
+    alpha = ( 2 * M_PI / jump ) * idTwidle;
+	
+  double wr, wi;
+  wi = sincos(alpha, &wr);
+    
+  double2 cTemp;
+  double2 cTempId = input[id];
+  double2 cTempMatch = input[match];
+  //complex product (wr + i*wi) * input[match]
+  cTemp.x = wr*cTempMatch.x - wi*cTempMatch.y;
+  cTemp.y = wr*cTempMatch.y + wi*cTempMatch.x;
+    
+  input[match] = cTempId - cTemp;
+  input[id] = cTempId + cTemp;
+    
+}
+
+/* 3D bit reversal sort */
+__kernel void BitReverseSort3D(__global double2 *input, int bits, int dim) {
+  //swaps each element with the one at the bit-reversed index along dimension dim
+  int n = get_global_size(0); //used as the extent of every dimension in the index math below
+  int i1 = get_global_id(0);
+  int i2 = get_global_id(1);
+  int i3 = get_global_id(2);
+
+  int irev, itmp, istart;
+  if (dim == 0) {
+    istart = i1;
+    irev = i1;
+    itmp = i1;
+  } else if (dim == 1) {
+    irev = i2;
+    itmp = i2;
+    istart = i2;
+  } else if (dim == 2) {
+    irev = i3;
+    itmp = i3;
+    istart = i3;
+  }
+  //shift the bits of itmp one by one into irev, then mask the result with n - 1
+  for (int j = 1; j < bits; j++) {
+    itmp >>= 1;
+    irev <<= 1;
+    irev |= itmp & 1;
+  }
+  irev &= n - 1;
+  //only the work item with the smaller index of a pair performs the swap
+  int id1, id2;
+  if (istart < irev) {
+    double2 tmp;
+    id1 = i3*n*n + i2*n + i1;
+    if (dim == 0) { //i1, irev - w, i2 - h, i3 - d
+      id2 = i3*n*n + i2*n + irev;
+      tmp = input[id1];
+      input[id1] = input[id2];
+      input[id2] = tmp;
+    } else if (dim == 1) { // i1 - w, i2, irev - h, i3 - d
+      id2 = i3*n*n + irev*n + i1;
+      tmp = input[id1];
+      input[id1] = input[id2];
+      input[id2] = tmp;
+    } else if (dim == 2) { // i1 - w, i2 - h, i3, irev - d
+      id2 = irev*n*n + i2*n + i1;
+      tmp = input[id1];
+      input[id1] = input[id2];
+      input[id2] = tmp;
+    }
+  }
+}
+
+
+/* 3D FFT kernel based on Stockham's out-of-place algorithm */
+__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim, const int forward) {
+  //one out-of-place butterfly per work item: reads two elements of src, writes two elements of dst
+  const int gid1 = get_global_id(0);
+  const int gid2 = get_global_id(1);
+  const int gid3 = get_global_id(2);
+  //t2 = 2*t is used as the row stride and t2*t2 as the plane stride; ndim picks the transformed index
+  int t2 = 2*t;
+  int k, m, in1, in2, out1, out2;
+  in1 = gid3*t2*t2 + gid2*t2 + gid1;
+  if (ndim == 1) {
+    k = gid1 & (p - 1);
+    m = (gid1 << 1) - k;
+    in2 = in1 + t;
+    out1 = gid3*t2*t2 + gid2*t2 + m;
+    out2 = out1 + p;
+  } else if (ndim == 2) {
+    k = gid2 & (p - 1);
+    m = (gid2 << 1) - k;
+    in2 = in1 + t2*t;
+    out1 = gid3*t2*t2 + m*t2 + gid1;
+    out2 = out1 + t2*p;
+  } else if (ndim == 3) {
+    k = gid3 & (p - 1);
+    m = (gid3 << 1) - k;
+    in2 = in1 + t2*t2*t;
+    out1 = m*t2*t2 + gid2*t2 + gid1;
+    out2 = out1 + p*t2*t2;
+  }
+  //NOTE(review): for ndim outside 1..3 the indices k, in2, out1 and out2 stay uninitialised
+  const double2 d1 = src[in1];
+  const double2 d2 = src[in2];
+  //forward multiplies the twiddle angle, so its sign selects the rotation direction
+  const double theta = (forward*2*M_PI*k) / (p << 1);
+		
+  double cs;
+
+  double sn = sincos(theta, &cs);
+  const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn);
+	
+  dst[out1] = d1 + temp;
+  dst[out2] = d1 - temp;	
+}
diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl
new file mode 100644
index 0000000..b5d9e51
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl
@@ -0,0 +1,214 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#define TWOPI 6.28318530718
+
+/* One radix-2 pass of a 1D Stockham FFT: each work-item reads src[gid] and
+ * src[gid+t], rotates the second value by the twiddle factor and writes the
+ * butterfly sum and difference to dst[m] and dst[m+p]. */
+__kernel void fft_radix2(__global double2* src, __global double2* dst, const int p, const int t) {
+
+  const int gid = get_global_id(0);
+
+  /* position inside the current butterfly group (p is used as a bit mask) */
+  const int k = gid & (p - 1);
+  /* output slot: 2*gid - k */
+  const int m = (gid << 1) - k;
+
+  const double2 a = src[gid];
+  const double2 b = src[gid+t];
+
+  /* twiddle angle -2*pi*k/(2p); sincos returns the sine and stores the cosine in cs */
+  double cs;
+  const double theta = (-2*M_PI*k) / (p << 1);
+  const double sn = sincos(theta, &cs);
+
+  /* complex product b * (cs + i*sn) */
+  const double re = b.x * cs - b.y * sn;
+  const double im = b.y * cs + b.x * sn;
+  const double2 temp = (double2) (re, im);
+
+  dst[m] = a + temp;
+  dst[m+p] = a - temp;
+}
+
+/* Radix-2 Stockham pass along the fastest-varying index of a cube with edge 2*t.
+ * ndim is accepted but not used by this kernel. */
+__kernel void fft3d_radix2_transpose(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {
+
+  /* work-item coordinates */
+  const int gid1 = get_global_id(0);
+  const int gid2 = get_global_id(1);
+  const int gid3 = get_global_id(2);
+
+  /* start of the row this work-item belongs to */
+  const int edge = 2*t;
+  const int row = gid3*edge*edge + gid2*edge;
+
+  /* butterfly position k within its group and output column m = 2*gid1 - k */
+  const int k = gid1 & (p - 1);
+  const int m = (gid1 << 1) - k;
+
+  const int src_lo = row + gid1;
+  const int src_hi = src_lo + t;
+  const int dst_lo = row + m;
+  const int dst_hi = dst_lo + p;
+
+  /* load both inputs and rotate the upper one by the twiddle factor */
+  const double2 d1 = src[src_lo];
+  const double2 d2 = src[src_hi];
+
+  double cs;
+  const double theta = (-2*M_PI*k) / (p << 1);
+  const double sn = sincos(theta, &cs);
+  const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn);
+
+  dst[dst_lo] = d1 + temp;
+  dst[dst_hi] = d1 - temp;
+}
+
+/* One radix-2 Stockham pass of a 3D FFT on a cube of edge 2*t along axis ndim
+ * (1 = fastest-varying index, 3 = slowest); p is the butterfly half-span. */
+__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {
+  const int gid1 = get_global_id(0);
+  const int gid2 = get_global_id(1);
+  const int gid3 = get_global_id(2);
+  const int t2 = 2*t;
+  const int in1 = gid3*t2*t2 + gid2*t2 + gid1;
+
+  /* work-item index along the transformed axis and that axis' memory stride */
+  int g, stride;
+  if (ndim == 1) {
+    g = gid1;
+    stride = 1;
+  } else if (ndim == 2) {
+    g = gid2;
+    stride = t2;
+  } else if (ndim == 3) {
+    g = gid3;
+    stride = t2*t2;
+  } else {
+    return; /* unknown axis: the indices would be uninitialized, so touch nothing */
+  }
+
+  const int k = g & (p - 1);
+  const int m = (g << 1) - k;
+  const int in2 = in1 + stride*t;
+  const int out1 = in1 + (m - g)*stride;
+  const int out2 = out1 + p*stride;
+
+  const double2 d1 = src[in1];
+  const double2 d2 = src[in2];
+
+  /* rotate the second input by the twiddle factor (cs, sn) */
+  const double theta = (-2*M_PI*k) / (p << 1);
+  double cs;
+  double sn = sincos(theta, &cs);
+  const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn);
+  dst[out1] = d1 + temp;
+  dst[out2] = d1 - temp;
+}
+
+
+/* In-place swap of element pairs in a 3D array (one work-item per element).
+ * dim == 2 exchanges the two fastest indices, any other value exchanges the
+ * fastest and slowest ones; ndim is not used. Strides come from the global size. */
+__kernel void transpose(__global double2 *data, int ndim, int dim) {
+  const int k = get_global_id(0);
+  const int j = get_global_id(1);
+  const int i = get_global_id(2);
+  const int nj = get_global_size(1);
+  const int ni = get_global_size(2);
+
+  /* linear index of this element and of its mirror partner */
+  const int self = i*ni*ni + j*nj + k;
+  const int other = (dim == 2) ? (i*ni*ni + k*nj + j)
+                               : (k*ni*ni + j*nj + i);
+
+  /* only the lower-indexed work-item of each pair performs the swap */
+  if (self < other) {
+    const double2 held = data[other];
+    data[other] = data[self];
+    data[self] = held;
+  }
+}
+
+#define PI2 6.283185307179586 /* 2*pi with full double precision */
+
+/* Batched in-place radix-2 Stockham FFT along one axis of an N*N*N cube.
+ * Work-items with the same (id2, id3) cooperate on one line of N values: each
+ * copies two values into local memory, all passes run between the local buffers
+ * d and r (tmp is only used as scratch for swapping them), and the result is
+ * written back over the input. dim selects the axis (1, 2 or 3). */
+__kernel void fft_batch3D(__global double2 *data_in, __local double2 *d, __local double2 *r, __local double2 *tmp, int N, int dim) {
+  int id1 = get_global_id(0);
+  int id2 = get_global_id(1);
+  int id3 = get_global_id(2);
+
+  //first element of the line handled by this work-item and the element stride
+  int sid, offset;
+  if (dim == 1) {
+    sid = id3*N*N + id2*N;
+    offset = 1;
+  } else if (dim == 2) {
+    sid = id3*N*N + id2;
+    offset = N;
+  } else if (dim == 3) {
+    sid = id3*N + id2;
+    offset = N*N;
+  } else {
+    return; //unknown axis; dim is the same for all work-items, so no barrier is skipped by only some
+  }
+
+  //copy data from global memory to local
+  int i1 = id1;
+  int i2 = id1+N/2;
+  d[i1] = data_in[sid + i1*offset];
+  d[i2] = data_in[sid + i2*offset];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  //exec fft
+  int j, k, out1, step, jump, t;
+  double theta, cs, sn;
+  t = 1;
+  step = 1;
+  while (step < N) {
+    jump = step << 1;
+    j = i1 >> (t - 1); // same as i1 / step, because t-1 = log2(step)
+    k = i2 & (step - 1); // same as i2 % step
+    out1 = j * jump + k;
+
+    theta = -PI2 * k / jump;
+    sn = sincos(theta, &cs);
+    double2 temp = (double2) (d[i2].x*cs - d[i2].y*sn, d[i2].y*cs + d[i2].x * sn);
+
+    r[out1] = d[i1] + temp;
+    r[out1+step] = d[i1] - temp;
+
+    t++;
+    step = jump;
+
+    //swap local arrays so that d holds the output of this pass
+    tmp = r;
+    r = d;
+    d = tmp;
+
+    //wait for all threads to finish this iteration
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  //the loop's last swap left the result in d; swap back so it is read from r
+  tmp = r;
+  r = d;
+  d = tmp;
+
+  //copy data from local memory to global
+  data_in[sid + i1*offset] = r[i1];
+  data_in[sid + i2*offset] = r[i2];
+}
+
+
+
+
+
+
+
diff --git a/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl
new file mode 100644
index 0000000..ffbd0ba
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl
@@ -0,0 +1,41 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/* Transpose a width x height row-major matrix through a local-memory tile.
+ * input is read as input[y*width + x], output is written as output[y*height + x];
+ * block is the local tile, addressed with a row pitch of block_dim+1. */
+__kernel void transpose(__global double2 *input, __global double2 *output, 
+			int width, int height, __local double2 *block)
+{
+  //each work-item stages one input element in the local tile
+  unsigned int xIdx = get_global_id(0);
+  unsigned int yIdx = get_global_id(1);
+  int block_dim = get_local_size(0);
+  //NOTE(review): block_dim comes from dimension 0 only - assumes a square work-group, confirm
+  if ( (xIdx < width) && (yIdx < height) ) {
+    unsigned int idx_in = yIdx * width + xIdx;
+    block[get_local_id(1)*(block_dim+1)+get_local_id(0)] = input[idx_in];
+  }
+  //all loads of the work-group must be finished before the tile is read back
+  barrier(CLK_LOCAL_MEM_FENCE);
+  //swap the group coordinates: the tile is written at its transposed position
+  xIdx = get_group_id(1) * block_dim + get_local_id(0);
+  yIdx = get_group_id(0) * block_dim + get_local_id(1);
+  //read the tile with swapped local ids, i.e. transposed
+  if ( (xIdx < height) && (yIdx < width) ) {
+    unsigned int idx_out = yIdx * height + xIdx;
+    output[idx_out] = block[get_local_id(0)*(block_dim+1)+get_local_id(1)];
+  }
+}
+
+/* naive transpose: one work-item copies input[y*width + x] to output[x*height + y] */
+__kernel void transpose_naive(__global double2 *input, __global double2 *output, int width, int height)
+{
+  const unsigned int col = get_global_id(0);
+  const unsigned int row = get_global_id(1);
+
+  //work-items outside the width x height matrix have nothing to copy
+  if (col >= width || row >= height)
+    return;
+
+  output[row + height * col] = input[col + width * row];
+}
diff --git a/src/Utility/CMakeLists.txt b/src/Utility/CMakeLists.txt
new file mode 100644
index 0000000..8a95785
--- /dev/null
+++ b/src/Utility/CMakeLists.txt
@@ -0,0 +1,18 @@
+# source files of the Utility component
+set(_SRCS
+  TimeStamp.cpp
+  DKSTimer.cpp
+)
+
+# public headers of the Utility component
+set(_HDRS
+  TimeStamp.h
+  DKSTimer.h
+)
+
+# hand both lists to ADD_SOURCES / ADD_HEADERS (commands defined elsewhere in the project)
+add_sources(${_SRCS})
+add_headers(${_HDRS})
+
+# install the headers under <prefix>/include/Utility
+install(FILES ${_HDRS} DESTINATION include/Utility)
diff --git a/src/Utility/DKSTimer.cpp b/src/Utility/DKSTimer.cpp
new file mode 100644
index 0000000..5f495d3
--- /dev/null
+++ b/src/Utility/DKSTimer.cpp
@@ -0,0 +1,53 @@
+#include "DKSTimer.h"
+
+//default timer: not running, no accumulated time, empty name
+DKSTimer::DKSTimer()
+  : running(false),
+    timervalue(0.0),
+    name("")
+{}
+
+//nothing to clean up: the destructor body is empty
+DKSTimer::~DKSTimer()
+{
+}
+
+//(re)initialize: store the new name, clear the accumulated time, mark the timer as stopped
+void DKSTimer::init(std::string n) {
+  name = n;
+  timervalue = 0.0;
+  running = false;
+}
+
+//store the current time in timeStart and mark the timer as running; no-op when already running
+void DKSTimer::start() {
+  if (running)
+    return;
+  gettimeofday(&timeStart, NULL);
+  running = true;
+}
+
+//if the timer is running: read the current time into timeEnd, add the seconds elapsed
+//since timeStart to timervalue and mark the timer as not running
+void DKSTimer::stop() {
+  if (!running) return;
+  gettimeofday(&timeEnd, NULL);
+  //sum in double: (seconds * 1000000) could overflow a 32-bit integer type
+  timervalue += static_cast<double>(timeEnd.tv_sec - timeStart.tv_sec) +
+                static_cast<double>(timeEnd.tv_usec - timeStart.tv_usec) * 1e-6;
+  running = false;
+}
+
+void DKSTimer::reset() {
+  timervalue = 0.0; //drop the accumulated time
+  running = false;  //and mark the timer as stopped
+}
+
+//accumulated time in seconds; timervalue only grows inside stop()
+double DKSTimer::gettime() {
+  return this->timervalue;
+}
+
+void DKSTimer::print() { //writes one line with the timer name and the accumulated seconds to stdout
+  std::cout << "DKSTimer " << name << " elapsed time\t" << timervalue << "s" << std::endl;
+}
diff --git a/src/Utility/DKSTimer.h b/src/Utility/DKSTimer.h
new file mode 100644
index 0000000..80025c0
--- /dev/null
+++ b/src/Utility/DKSTimer.h
@@ -0,0 +1,59 @@
+#ifndef H_DKSTIMER
+#define H_DKSTIMER
+
+#include <iostream>
+#include <string>
+#include <sys/time.h>
+
+class DKSTimer {
+  // Named stopwatch that accumulates wall-clock time over start()/stop() pairs.
+private:
+
+  bool running;              // true between start() and stop()
+  double timervalue;         // accumulated time in seconds
+  struct timeval timeStart;  // time of the last start()
+  struct timeval timeEnd;    // time of the last stop()
+  std::string name;          // label printed by print()
+
+public:
+
+  /** Create a stopped timer with zero accumulated time and an empty name. */
+  DKSTimer();
+
+  ~DKSTimer();
+
+  /** Init the timer.
+   *  Set the name of the timer, clear the accumulated time and mark it as stopped.
+   */
+  void init(std::string n);
+
+  /** Start the timer.
+   *  Get the current time with gettimeofday and save it in timeStart.
+   */
+  void start();
+
+  /** Stop the timer.
+   *  Get the current time with gettimeofday and save it in timeEnd.
+   *  Calculate the elapsed time as timeEnd - timeStart and add it to timervalue.
+   */
+  void stop();
+
+  /** Reset timervalue to zero.
+   *  Also marks the timer as not running; timeStart and timeEnd are left untouched.
+   */
+  void reset();
+
+  /** Return elapsed time in seconds.
+   *  Return the value of timervalue.
+   */
+  double gettime();
+
+  /** Print timer.
+   *  Print the name and the accumulated time of the timer to stdout.
+   */
+  void print();
+
+
+};
+
+#endif
diff --git a/src/Utility/TimeStamp.cpp b/src/Utility/TimeStamp.cpp
new file mode 100644
index 0000000..1e239fc
--- /dev/null
+++ b/src/Utility/TimeStamp.cpp
@@ -0,0 +1,11 @@
+#include "TimeStamp.h"
+
+timestamp_t get_timestamp() { //current gettimeofday time expressed in microseconds
+    struct timeval tv;
+    gettimeofday (&tv, NULL);
+    return (timestamp_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+double get_secs(timestamp_t t_start, timestamp_t t_end) { //microsecond difference as seconds; unsigned, so t_end < t_start wraps
+    return (t_end - t_start) / 1000000.0L;
+}
\ No newline at end of file
diff --git a/src/Utility/TimeStamp.h b/src/Utility/TimeStamp.h
new file mode 100644
index 0000000..d53104d
--- /dev/null
+++ b/src/Utility/TimeStamp.h
@@ -0,0 +1,14 @@
+#ifndef H_TIMESTAMPE
+#define H_TIMESTAMPE
+
+#include <iostream>
+#include <time.h>
+#include <sys/time.h>
+
+typedef unsigned long long timestamp_t; // time stamp in microseconds
+
+// current time of day in microseconds
+timestamp_t get_timestamp();
+// (t_end - t_start) converted from microseconds to seconds
+double get_secs(timestamp_t t_start, timestamp_t t_end);
+#endif
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..01f33fb
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,84 @@
+# tests look for the DKS headers and libraries in <source root>/src
+include_directories(${CMAKE_SOURCE_DIR}/src)
+link_directories(${CMAKE_SOURCE_DIR}/src)
+
+#ADD_EXECUTABLE(testDKS testDKS.cpp)
+#ADD_EXECUTABLE(testChi testChi.cpp)
+#ADD_EXECUTABLE(testFFT testFFT.cpp)
+#ADD_EXECUTABLE(testMIC testMIC.cpp)
+#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
+#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
+#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
+#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
+#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
+#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
+#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp)
+#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp)
+#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp)
+#ADD_EXECUTABLE(testOffset testOffset.cpp)
+#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp)
+#ADD_EXECUTABLE(testMPI testMPI.cpp)
+#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp)
+#ADD_EXECUTABLE(testGather testGather.cpp)
+#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
+#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
+add_executable(testCollimatorPhysics testCollimatorPhysics.cpp)
+#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
+#ADD_EXECUTABLE(testPush testPush.cpp)
+#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
+#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
+
+#shared library
+#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp)
+
+
+#TARGET_LINK_LIBRARIES(testDKS dks)
+#TARGET_LINK_LIBRARIES(testChi dks)
+#TARGET_LINK_LIBRARIES(testFFT dks)
+#TARGET_LINK_LIBRARIES(testMIC dks)
+#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
+#TARGET_LINK_LIBRARIES(testFFT3D dks)
+#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
+#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
+#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
+#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
+#TARGET_LINK_LIBRARIES(testStockFFT3D dks)
+#TARGET_LINK_LIBRARIES(testMemObjects dks)
+#TARGET_LINK_LIBRARIES(testRCFFT dks)
+#TARGET_LINK_LIBRARIES(testOffset dks)
+#TARGET_LINK_LIBRARIES(testOffsetMPI dks)
+#TARGET_LINK_LIBRARIES(testMPI dks)
+#TARGET_LINK_LIBRARIES(testMPIFFT dks)
+#TARGET_LINK_LIBRARIES(testGather dks)
+#TARGET_LINK_LIBRARIES(testGatherAsync dks)
+#TARGET_LINK_LIBRARIES(testTranspose dks)
+target_link_libraries(testCollimatorPhysics PRIVATE dks)  # explicit scope instead of the legacy keyword-less form
+#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
+#TARGET_LINK_LIBRARIES(testPush dks)
+#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
+#TARGET_LINK_LIBRARIES(testIntegration dks)
+#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
+
+
+#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared)
+
+
+#IF (${COMPILER_NAME} STREQUAL "mpicxx")
+   #ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp)
+   #ADD_EXECUTABLE(testGreens testGreens.cpp)
+   #ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp)
+   #ADD_EXECUTABLE(testCollimatorPhysicsMPI testCollimatorPhysicsMPI.cpp)
+   #TARGET_LINK_LIBRARIES(testGatherAsync2 dks)
+   #TARGET_LINK_LIBRARIES(testGreens dks)
+   #TARGET_LINK_LIBRARIES(testFFTSolver dks)
+   #TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks)	
+#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
+
+#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp)
+#TARGET_LINK_LIBRARIES(testChiSquare dks)
+
+#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
+  #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+  #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
+#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
\ No newline at end of file
diff --git a/test/testChi.cpp b/test/testChi.cpp
new file mode 100644
index 0000000..0181144
--- /dev/null
+++ b/test/testChi.cpp
@@ -0,0 +1,141 @@
+#include <iostream>
+#include <complex>
+#include <cstdlib>
+
+#include "DKSBase.h"
+#include "Utility/TimeStamp.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+  //Smoke test: compute N(t), chi^2 and their sum on the device five times and print the sum.
+  //defaults live in writable arrays because they are handed on as char*
+  char default_api[] = "OpenCL";
+  char default_device[] = "-gpu";
+  char *api_name = default_api;
+  char *device_name = default_device;
+
+  //command line: [api [device]]; any other argument count selects the defaults.
+  //The names are referenced in place, so no fixed-size buffer has to hold them
+  //and arguments of any length are safe.
+  if (argc == 2 || argc == 3)
+    api_name = argv[1];
+  if (argc == 3)
+    device_name = argv[2];
+
+
+  cout << "Use api: " << api_name << endl;
+
+  cout << "Begin DKS Base tests" << endl;
+
+  /* init data */
+  int ierr;
+  int nsize = 4000000;
+  int jsize = 16;
+  int psize = 6;
+  double *data = new double[nsize*jsize];
+  double *p = new double[psize*jsize];
+  double data_out = 0;
+
+  srand(time(NULL));
+  for (int i = 0; i < nsize*jsize; i++) {
+    //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+    //data[i] = sign*(double)rand()/RAND_MAX;
+    data[i] = (double)i / (nsize*jsize);
+    //data[i] = 1;
+  }
+  for (int i = 0; i < psize*jsize; i++) {
+    //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+    //p[i] = sign*(double)rand()/RAND_MAX;
+    p[i] = (double)i / (nsize*jsize);
+    //p[i] = 1;
+  }
+  /* end init */
+
+  timestamp_t tstart, tend;
+  //timestamp_t t0, t1;
+
+  tstart = get_timestamp();
+
+  //init dks base class, set API to opencl and init connection with OpenCL device
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  //ptrs to hold reference to device memory
+  void *dptr, *ntptr, *pptr;
+
+  //allocate memory on device
+  //t0 = get_timestamp();
+  dptr = base.allocateMemory<double>(nsize*jsize, ierr);
+  ntptr = base.allocateMemory<double>(nsize*jsize, ierr);
+  pptr = base.allocateMemory<double>(psize*jsize, ierr);
+  //t1 = get_timestamp();
+  //cout << "Allocate memory: " << get_secs(t0, t1) << endl;
+
+  //write data to device
+  //t0 = get_timestamp();
+  base.writeData<double>(dptr, data, nsize*jsize);
+  //t1 = get_timestamp();
+  //cout << "Write data set: " << get_secs(t0, t1) << endl << endl;
+
+  for (int i = 0; i < 5; i++) {
+    //write parameters to device
+    //t0 = get_timestamp();
+    base.writeData<double>(pptr, p, psize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Write parameters: " << get_secs(t0, t1) << endl;
+
+    //set function to calcNt and execute it with necessary parameters
+    //t0 = get_timestamp();
+    base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);
+    //t1 = get_timestamp();
+
+    //cout << "Calc N(t): " << get_secs(t0, t1) << endl;
+
+    //set function to chi2 and execute it with necessary parameters
+    //t0 = get_timestamp();
+    base.callChi2<double>(ntptr, dptr, ntptr, nsize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Calc chi^2: " << get_secs(t0, t1) << endl;
+
+    //set function to sum and execute it with necessary parameters
+    //t0 = get_timestamp();
+    base.callSum<double>(ntptr, ntptr, nsize*jsize);
+    //t1 = get_timestamp();
+    //cout << "Calc sum: " << get_secs(t0, t1) << endl;
+
+    //read calculated sum (one value)
+    //t0 = get_timestamp();
+    base.readData<double>(ntptr, &data_out, 1);
+    //t1 = get_timestamp();
+    //cout << "Read sum: " << get_secs(t0, t1) << endl;
+    cout << "Sum nt: " << data_out << endl;
+
+    /*
+      for (int i = 0; i < psize*jsize; i++) {
+      int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
+      p[i] = sign*(double)rand()/RAND_MAX;
+      }
+    */
+
+
+    //cout << endl;
+  }
+
+  //free device memory
+  //t0 = get_timestamp();
+  base.freeMemory<double>(dptr, nsize*jsize);
+  base.freeMemory<double>(ntptr, nsize*jsize);
+  base.freeMemory<double>(pptr, psize*jsize);
+  delete[] data; //the host copies are no longer needed either
+  delete[] p;
+
+  tend = get_timestamp();
+
+  cout << endl  << "time: " << get_secs(tstart, tend) << endl;
+
+  return 0;
+}
+
diff --git a/test/testChiSquare.cpp b/test/testChiSquare.cpp
new file mode 100644
index 0000000..550f3a4
--- /dev/null
+++ b/test/testChiSquare.cpp
@@ -0,0 +1,168 @@
+#include <iostream>
+#include <vector>
+#include "DKSBase.h"
+
+using namespace std;
+
+void initData(vector< vector<double> > &v, int length) {
+  //append the ramp 0, 1, ..., length-1 to every row of v
+  for (unsigned int row = 0; row < v.size(); ++row) {
+    vector<double> &values = v[row];
+    for (int j = 0; j < length; ++j)
+      values.push_back(j);
+  }
+
+}
+
+
+void printData(vector< vector<double> > &v) { //one tab-separated output line per row
+  for (unsigned int row = 0; row < v.size(); ++row) {
+    const vector<double> &values = v[row];
+    for (unsigned int col = 0; col < values.size(); ++col)
+      cout << values[col] << "\t";
+    cout << endl;
+  }
+}
+
+void initData(double *data, int sensors, int length) {
+  //treat data as sensors rows of length values and store the ramp
+  //0, 1, ..., length-1 in every row
+  for (int i = 0; i < sensors; ++i) {
+    const int base = i*length;
+    for (int j = 0; j < length; ++j)
+      data[base + j] = j;
+  }
+}
+
+
+void printData(double *data, int sensors, int length) { //one tab-separated output line per sensor
+  for (int i = 0; i < sensors; ++i) {
+    const int base = i*length;
+    for (int j = 0; j < length; ++j)
+      cout << data[base + j] << "\t";
+    cout << endl;
+  }
+}
+
+void initPar(double *par, int npar) {
+  //fill par with the evenly spaced values idx / npar, idx = 0 .. npar-1
+  for (int idx = 0; idx < npar; ++idx) {
+    par[idx] = static_cast<double>(idx) / npar;
+  }
+}
+
+void printDiv(int size) { //divider line: size times "=" followed by a newline
+  for (int left = size; left > 0; --left)
+    cout << "=";
+  cout << endl;
+}
+
+//Host reference: print every per-bin chi-square term (one line per data set) and
+//their sum. Data set i reads par[2+4i] .. par[5+4i]; par[0] and par[1] are shared.
+void calcChisq(const vector< vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
+{
+  double chisq = 0.0;
+  double theo, data;
+  const double tau=2.197019;
+  const double dt0 = fTimeResolution*0.5*(fRebin-1);
+  double time;
+  double w = par[0]*0.08516155035269027;
+
+  unsigned int i, j;
+
+  for (i=0; i<fData.size(); i++) {
+    //use the length of row i itself so that ragged input is never read out of bounds
+    for (j=0; j<fData[i].size(); j++) {
+      data = fData[i][j];
+      time = dt0+fTimeResolution*fRebin*j;
+
+      theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4]; 
+      //bins whose data value is zero contribute theo^2, which avoids the division by zero
+      double term = (data != 0.0) ? (theo-data)*(theo-data)/data : theo*theo;
+      chisq += term;
+      cout << term << "\t";
+    }
+    cout << endl;
+  }
+
+  cout << "Chisq: " << chisq << endl;
+
+
+}
+
+
+int main(int argc, char *argv[]) {
+  //Compare the device chi-square (callPHistoTFFcn) with the host reference calcChisq.
+  bool useCuda = true;
+  if (argc == 2 && atoi(argv[1]) == 1)
+    useCuda = false;
+
+  int ierr;
+  int sensors = 5;
+  int length = 10;
+  int npar = 4 * sensors + 2;
+  int ndata = sensors * length;
+
+  double result = 0.0; //defined value even if the device call does not set it
+
+  double fTimeResolution = 0.05;
+  double fRebin = 5;
+
+  double *par = new double[npar];
+  initPar(par, npar);
+
+  vector< vector< double > > fData;
+  fData.resize(sensors);
+  initData(fData, length);
+  printData(fData);
+  printDiv(75);
+
+  DKSBase dksbase;
+  if (useCuda)
+    dksbase.setAPI("Cuda", 4);
+  else
+    dksbase.setAPI("OpenCL", 6);
+  dksbase.setDevice("-gpu", 4);
+  dksbase.initDevice();
+  dksbase.setupFFT(0, NULL);
+
+
+  void *mem_data, *mem_par, *mem_chisq;
+  cout << "Allocate memory" << endl;
+  mem_par = dksbase.allocateMemory<double>(npar, ierr);
+  mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
+  mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
+
+  //the data sets are written one after another, data set i at offset i*length
+  cout << "Write data" << endl;
+  dksbase.writeData<double>(mem_par, par, npar);
+  for (int i = 0; i < sensors; i++)
+    dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);
+
+
+
+  cout << "Call PHistoTFFcn" << endl;
+  dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq, 
+			  fTimeResolution, fRebin, 
+			  sensors, length, npar, result);
+  cout << "Result: " << result << endl;
+
+
+  double *out_data = new double[ndata];
+  dksbase.readData<double>(mem_chisq, out_data, ndata);
+  printDiv(75);
+  printData(out_data, sensors, length);
+  printDiv(75);
+
+  calcChisq(fData, par, fTimeResolution, fRebin);
+  printDiv(75);
+
+  cout << "Free memory" << endl;
+  dksbase.freeMemory<double>(mem_par, npar);
+  dksbase.freeMemory<double>(mem_data, ndata);
+  dksbase.freeMemory<double>(mem_chisq, ndata);
+  delete[] par;      //host buffers
+  delete[] out_data;
+  return 0;
+
+}
diff --git a/test/testChiSquareRT.cpp b/test/testChiSquareRT.cpp
new file mode 100644
index 0000000..fcd0b50
--- /dev/null
+++ b/test/testChiSquareRT.cpp
@@ -0,0 +1,193 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <omp.h>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+void initData(double *data, int N, bool ones = false) {
+  //fill data[0..N): with 1.0 when ones is set, otherwise with
+  //rand() / RAND_MAX (rand() is not called at all when ones is set)
+  for (int i = 0; i < N; ++i) {
+    const double value = ones ? 1.0 : (double)rand() / RAND_MAX;
+    data[i] = value;
+  }
+}
+
+template <typename T>
+void printData(T *data, int N) { //data[0..N) tab-separated, then a newline
+  for (int idx = 0; idx < N; ++idx)
+    std::cout << data[idx] << "\t";
+  std::cout << std::endl;
+}
+
+
+const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])";
+//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
+//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
+//const std::string funct = "p[m[1]] + p[m[0]]";
+
+double fTheory(double time, double *par, double *func, int *map) { //cos(t*par[0]) - exp(-t*par[map[0]]); func is not used
+  return cos(time*par[0]) - exp(-time*par[map[0]]);
+}
+
+double testFunctionSerial(double *data, double *par, double *func, int *map,
+			  double N0, double tau, double bkg, double timeStep,
+			  int startTimeBin, int endTimeBin) 
+{
+  //serial reference: sum of (data - theory)^2 / data over bins [startTimeBin, endTimeBin),
+  //with theory = N0 * exp(-t/tau) * (1 + fTheory(t)) + bkg and t = bin * timeStep
+  double sum = 0;
+  for (int bin = startTimeBin; bin < endTimeBin; ++bin) {
+    const double t = bin * timeStep;
+    const double model = N0 * exp(-t/tau) * (1.0 + fTheory(t, par, func, map)) + bkg;
+    const double residual = data[bin] - model;
+    sum += residual * residual / data[bin];
+  }
+
+  return sum;
+}
+
+double testFunctionParallel(double *data, double *par, double *func, int *map,
+			    double N0, double tau, double bkg, double timeStep,
+			    int startTimeBin, int endTimeBin)
+{
+  //OpenMP version of the same chi-square sum; chunks of (bins / processors), at least 10 bins
+  int i, chunk;
+  double time, diff, theo;
+  double chisq = 0;
+  chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
+  if (chunk < 10)
+    chunk = 10;
+  //time, diff and theo are per-iteration temporaries: all three must be private or threads race on them
+#pragma omp parallel for default(shared) private (i,time,diff,theo) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
+  for (i = startTimeBin; i < endTimeBin; ++i) {
+    time = i * timeStep;
+    theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
+    diff = data[i] - theo;
+    chisq += diff * diff / data[i];
+  }
+
+  return chisq;
+}
+  
+int main(int argc, char *argv[]) {
+  //usage: <program> [Ndata [api]]; api 1 selects Cuda, any other value OpenCL
+  int Loop = 100;
+
+  //init test data on the host
+  int Ndata = 8;
+  if (argc  > 1)
+    Ndata = atoi(argv[1]);
+
+  int api = 1;
+  if (argc > 2)
+    api = atoi(argv[2]);
+
+  int Npar = 66;
+  int Nfunc = 1;
+  int Nmap = 4;
+
+  double *data = new double[Ndata];
+  double *par = new double[Npar];
+  double *func = new double[Nfunc];
+  int *map = new int[Nmap];
+
+  initData(data, Ndata);
+  initData(par, Npar);
+  initData(func, Nfunc);
+  map[0] = 1;
+  map[1] = 2;
+  map[2] = 3;
+  map[3] = 4;
+
+  //create timers
+  DKSTimer serialTimer;
+  DKSTimer cudaTimer;
+  DKSTimer ompTimer;
+  DKSTimer gpuOverhead;
+  serialTimer.init("Serial timer");
+  cudaTimer.init("Cuda timer");
+  ompTimer.init("OpenMP timer");
+  gpuOverhead.init("Overhead for gpu");
+
+
+  //serial version
+  double resultSerial = 0.0;
+
+  serialTimer.start();
+  for (int i = 0; i < Loop; i++)
+    resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
+  serialTimer.stop();
+
+  //openmp version
+  double resultOMP = 0.0;
+
+  ompTimer.start();
+  //for (int i = 0; i < Loop; i++)
+  //  resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
+  ompTimer.stop();
+
+
+  //create and init dksbase
+  gpuOverhead.start();
+
+  DKSBaseMuSR dksbase;
+  if (api == 1)
+    dksbase.setAPI("Cuda");
+  else
+    dksbase.setAPI("OpenCL");
+
+  dksbase.setDevice("-gpu");
+  dksbase.initDevice();
+  dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);
+
+  //allocate memory on the device
+  int ierr;
+  void *data_ptr;
+
+  data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+
+  dksbase.writeData<double>(data_ptr, data, Ndata);
+  dksbase.writeFunctions(func, Nfunc);
+  dksbase.writeMaps(map, Nmap);
+
+  dksbase.callCompileProgram(funct);
+  gpuOverhead.stop();
+
+  double resultCuda = 0.0;
+
+  cudaTimer.start();
+  for (int i = 0; i < Loop; i++) {
+    dksbase.writeParams(par, Npar);
+    ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
+				       0.0, 0.1, 0, resultCuda);
+    //a failed launch makes every later result meaningless, so stop right away
+    if (ierr != 0)
+      exit (EXIT_FAILURE);
+
+  }
+  cudaTimer.stop();
+
+  std::cout << std::endl;
+  std::cout << "=======================Results=======================" << std::endl;
+  std::cout << "Result serial  = " << resultSerial << std::endl;
+  std::cout << "Result prallel = " << resultOMP << std::endl;
+  std::cout << "Result cuda    = " << resultCuda << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "=======================Timings=======================" << std::endl;
+  serialTimer.print();
+  ompTimer.print();
+  cudaTimer.print();
+  gpuOverhead.print();
+  std::cout << std::endl;
+
+  dksbase.freeMemory<double>(data_ptr, Ndata);
+  delete[] data; delete[] par;   //host buffers
+  delete[] func; delete[] map;
+  return 0;
+
+
+}
diff --git a/test/testCollimatorPhysics.cpp b/test/testCollimatorPhysics.cpp
new file mode 100644
index 0000000..bb5d9b5
--- /dev/null
+++ b/test/testCollimatorPhysics.cpp
@@ -0,0 +1,248 @@
+#include <iostream>
+
+#include <vector>
+#include <sys/time.h>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+
+using namespace std;
+
+typedef struct { // particle record this test uploads to and reads back from the device
+  int label;
+  unsigned localID;
+  double Rincol[3]; // NOTE(review): presumably position components - confirm
+  double Pincol[3]; // NOTE(review): presumably momentum components - confirm
+} PART_SMALL;
+
+typedef struct { // plain triple of doubles used by the Vector helper functions in this file
+  double x;
+  double y;
+  double z;
+} Vector;
+
+PART_SMALL initPartSmall(int d) {
+  //test particle with label 0 and localID d; Rincol and Pincol are
+  //zero except for their third components
+  const double rincol[3] = { 0.0, 0.0, 0.02 };
+  const double pincol[3] = { 0.0, 0.0, 3.9920183237269791e-01 };
+
+  PART_SMALL part;
+  part.label = 0;
+  part.localID = d;
+  for (int axis = 0; axis < 3; ++axis) {
+    part.Rincol[axis] = rincol[axis];
+    part.Pincol[axis] = pincol[axis];
+  }
+
+  return part;
+}
+
+Vector initVector() {
+  //every component starts at 0.5
+  Vector v;
+
+  v.x = v.y = v.z = 0.5;
+
+  return v;
+}
+
+void printPart(PART_SMALL p) {
+  //single output line: label, localID, the three Rincol and the three Pincol values
+  cout << "label: " << p.label << ", "
+       << "localid: " << p.localID << ","
+       << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
+       << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << endl;
+}
+
+void printVector(Vector v) { //x, y and z separated by tabs on one line
+  cout << v.x << "\t" << v.y << "\t" << v.z << endl;
+ }
+
+void initParts(PART_SMALL *p, int N) { //particle number id is created by initPartSmall(id)
+  for (int id = 0; id < N; ++id)
+    p[id] = initPartSmall(id);
+}
+
+void printParts(PART_SMALL *p, int N) { //one printPart line per particle, then an empty line
+  for (int id = 0; id < N; ++id)
+    printPart(p[id]);
+  cout << endl;
+}
+
+void initVectors(Vector *v, int N) { //every element becomes the initVector() value
+  for (int idx = 0; idx < N; ++idx)
+    v[idx] = initVector();
+}
+
+void printVectors(Vector *v, int N) { //one printVector line per element, then an empty line
+  for (int idx = 0; idx < N; ++idx)
+    printVector(v[idx]);
+  cout << endl;
+}
+
+
+void initParams(double *data) {
+  //store the twelve parameter values of this test in data[0..11];
+  //data must provide room for at least twelve doubles
+  const double values[12] = {
+    0.0,                    1.0,
+    2.2100000000000000e+00, 6.0000000000000000e+00,
+    1.2010700000000000e+01, 2.6010000000000000e+00,
+    1.7010000000000000e+03, 1.2790000000000000e+03,
+    1.6379999999999999e-02, 1.9321266968325795e-01,
+    7.9000000000000000e+01, 1.0000000000000002e-12
+  };
+  for (int i = 0; i < 12; ++i)
+    data[i] = values[i];
+
+}
+
+void printDouble(double *data, int N) { //data[0..N) tab-separated, then a newline
+  for (int idx = 0; idx < N; ++idx)
+    std::cout << data[idx] << "\t";
+  std::cout << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+  //Benchmark: run the collimator physics kernel and its sort `loop` times and print averages.
+  int loop = 10;
+  int numpart = 1e5;
+  char api_name[10];     //large enough for "Cuda" and "OpenMP"
+  char device_name[10];  //large enough for "-gpu" and "-mic"
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    //"-mic" selects the OpenMP api on the mic device, "-npart <n>" and "-loop <n>" set
+    //the particle and repetition counts; an option whose value is missing is ignored
+    //instead of reading past the end of argv
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (argv[i] == string("-npart") && i + 1 < argc) {
+      numpart = atoi(argv[i+1]);
+      i++;
+    } else if (argv[i] == string("-loop") && i + 1 < argc) {
+      loop = atoi(argv[i+1]);
+      i++;
+    }
+  }
+  if (numpart < 1) {  //nothing could be allocated, printed or averaged
+    cout << "Number of particles must be positive" << endl;
+    return 1;
+  }
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "Number of loops: " << loop << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init part vector to test mc
+  PART_SMALL *parts = new PART_SMALL[numpart];
+  initParts(parts, numpart);
+
+  double *params = new double[12];
+  initParams(params);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name)); //length of the device name, not of the api name
+  base.initDevice();
+
+  //init random
+  base.callInitRandoms(numpart);
+
+  //**test collimator physics and sort***//
+  void *part_ptr, *param_ptr;
+
+  //allocate memory for particles
+  part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
+  param_ptr = base.allocateMemory<double>(12, ierr);
+
+  //transfer data to device
+  base.writeData<PART_SMALL>(part_ptr, parts, numpart);
+  base.writeData<double>(param_ptr, params, 12);
+
+  int numaddback;
+  //test calls to do some first executions
+  base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
+  base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);  
+  base.syncDevice();
+  //std::cout << "particles to add back: " << numaddback << std::endl;
+
+  struct timeval timeStart, timeEnd;
+  std::cout << "Start MC" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+    base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
+    base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "addback: " << numaddback << std::endl;
+
+  std::cout << "End MC" << std::endl;
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	       (timeEnd.tv_usec - timeStart.tv_usec));
+
+  std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
+
+  //read data from device
+  base.readData<PART_SMALL>(part_ptr, parts, numpart);
+
+  //free memory
+  base.freeMemory<PART_SMALL>(part_ptr, numpart);
+  base.freeMemory<double>(param_ptr, 12);  
+
+  //print the first and the last ten particles, fewer if numpart is smaller than ten
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < 10 && i < numpart; i++) {
+    std::cout << parts[i].label << "\t" 
+	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
+	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+	      << std::endl;
+  }
+
+  std:: cout << "..." << std::endl;
+
+  for (int i = (numpart > 10 ? numpart - 10 : 0); i < numpart; i++) {
+    std::cout << parts[i].label << "\t" 
+	      << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" 
+	      << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
+	      << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
+	      << std::endl;
+  }
+
+  double arx = 0, ary = 0, arz = 0;
+  double apx = 0, apy = 0, apz = 0;
+  for (int i = 0; i < numpart; i++) {
+    //sqrt(v*v) is |v|: accumulate the mean absolute value of every component
+    arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
+    ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
+    arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
+
+    apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
+    apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
+    apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
+
+  }
+
+  std::cout << std::fixed << std::setprecision(10);
+  std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
+	    << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
+  delete[] parts;   //host buffers
+  delete[] params;
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
diff --git a/test/testCollimatorPhysicsMPI.cpp b/test/testCollimatorPhysicsMPI.cpp
new file mode 100644
index 0000000..22e8a84
--- /dev/null
+++ b/test/testCollimatorPhysicsMPI.cpp
@@ -0,0 +1,126 @@
+#include <iostream>
+
+#include <vector>
+
+#include "DKSBase.h"
+#include "cuda_runtime.h"
+
+#include <mpi.h>
+
+using namespace std;
+
+typedef struct {  // host-side particle record; arrays of it go through writeData/readData<PART> in main()
+  int label;
+  unsigned localID;
+  double Rincol[3];  // presumably position components - TODO confirm
+  double Pincol[3];  // presumably momentum components - TODO confirm
+  long IDincol;
+  int Binincol;
+  double DTincol;
+  double Qincol;
+  long LastSecincol;
+  double Bfincol[3];  // initPart sets each component to 1/(d+1)
+  double Efincol[3];  // initPart sets each component to 1/(d+1)
+} PART;  // NOTE(review): field order likely has to match the device-side struct - confirm before reordering
+
+PART initPart(int d) {
+  // Build one test particle whose integer and scalar fields all carry the seed d.
+  PART part;
+  const double fieldValue = 1.0 / (d+1);  // shared by every Bfincol/Efincol component
+  for (int axis = 0; axis < 3; ++axis) {
+    part.Rincol[axis] = 0.5;
+    part.Pincol[axis] = 0.5;
+    part.Bfincol[axis] = fieldValue;
+    part.Efincol[axis] = fieldValue;
+  }
+  // identifiers and per-particle scalars
+  part.label = d;
+  part.localID = d;
+  part.IDincol = d;
+  part.Binincol = d;
+  part.DTincol = d;
+  part.Qincol = d;
+  part.LastSecincol = d;
+  return part;
+}
+
+void printPart(PART p) {
+  // Print the label, Rincol and Pincol of one particle on a single line.
+  // The remaining PART fields (localID, IDincol, Binincol, DTincol, Qincol,
+  // LastSecincol, Bfincol, Efincol) are not printed.
+  cout << "label: " << p.label << ", "
+       << "Rincol: ";
+  for (int c = 0; c < 3; c++)
+    cout << p.Rincol[c] << ", ";
+  cout << "Pincol: ";
+  for (int c = 0; c < 3; c++)
+    cout << p.Pincol[c] << ", ";
+  cout << endl;
+
+}
+
+int main(int argc, char *argv[]) {
+  // Every MPI rank runs the same collimator-physics test on a CUDA device.
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  int numpart = 500501;
+
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.callInitRandoms(numpart);
+
+  PART tmp;
+  vector<PART> p;
+  vector<PART> p_out;
+  p_out.resize(numpart);
+
+  for (int i = 0; i < numpart; i++) {
+    tmp = initPart(i + 1);
+    p.push_back(tmp);
+  }
+
+  if (numpart <= 20) {
+    for (int i = 0; i < 10 && i < numpart; i++)  // never index past the end of p
+      printPart(p[i]);
+    cout << endl;
+  }
+
+  double params[19];
+  for (int i = 0; i < 19; i++)
+    params[i] = 0.05;
+  params[0] = 0;
+  params[1] = 1;
+
+  void *mem_ptr, *par_ptr;
+
+  par_ptr = base.allocateMemory<double>(19, ierr);
+  base.writeData<double>(par_ptr, params, 19);
+
+  mem_ptr = base.allocateMemory<PART>(numpart, ierr);
+  base.writeData<PART>(mem_ptr, &p[0], numpart);
+
+  int addback = 0, dead = 0;  // defined values even if the kernel call leaves them untouched
+  for (int i = 0; i < 100; i++)
+    base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead);
+  cout << "Add back: " << addback << ", dead: " << dead << endl;
+
+  base.readData<PART>(mem_ptr, &p_out[0], numpart);
+  base.freeMemory<PART>(mem_ptr, numpart);  // NOTE(review): 2nd argument taken to be the element count, as the other DKS tests pass it
+  base.freeMemory<double>(par_ptr, 19);
+
+  if (numpart <= 20) {
+    for (int i = 0; i < numpart; i++)
+      printPart(p_out[i]);
+  }
+
+  MPI_Finalize();
+  return 0;
+
+}
diff --git a/test/testCollimatorPhysicsSoA.cpp b/test/testCollimatorPhysicsSoA.cpp
new file mode 100644
index 0000000..bc4bf0b
--- /dev/null
+++ b/test/testCollimatorPhysicsSoA.cpp
@@ -0,0 +1,250 @@
+#include <iostream>
+#include <iomanip>
+
+#include <vector>
+#include <sys/time.h>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+#include <omp.h>
+
+using namespace std;
+
+typedef struct {  // structure-of-arrays particle set; main() allocates numpart elements per array
+  int *label;
+  unsigned *localID;  // initParts stores the particle index here
+  double *rx;
+  double *ry;
+  double *rz;
+  double *px;
+  double *py;
+  double *pz;
+} PART;
+
+
+void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
+	       double *px, double *py, double *pz, int npart) {
+  // All npart particles get r = (0, 0, 0.02), p = (0, 0, startPz), label 0 and localID = index.
+  const double startZ = 0.02, startPz = 3.9920183237269791e-01;
+  for (int n = 0; n < npart; ++n) {
+    label[n] = 0;
+    localID[n] = n;
+    rx[n] = ry[n] = 0.0;
+    rz[n] = startZ;
+    px[n] = py[n] = 0.0;
+    pz[n] = startPz;
+  }
+
+}
+
+void initParams(double *data) {
+  // Fills data[0..11] with the fixed parameter set that main() sends to the device.
+  static const double values[12] = {
+    0.0,                    1.0,
+    2.2100000000000000e+00, 6.0000000000000000e+00,
+    1.2010700000000000e+01, 2.6010000000000000e+00,
+    1.7010000000000000e+03, 1.2790000000000000e+03,
+    1.6379999999999999e-02, 1.9321266968325795e-01,
+    7.9000000000000000e+01, 1.0000000000000002e-12
+  };
+  for (int k = 0; k < 12; ++k)
+    data[k] = values[k];
+
+
+}
+
+int main(int argc, char *argv[]) {
+  // Benchmark driver: runs the SoA collimator-physics and sort kernels `loop` times and times them.
+  int loop = 10;
+  int numpart = 1e5;
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    }
+    // -npart <n>: ignored when no value follows, so argv[i+1] always exists when read
+    if (argv[i] == string("-npart") && i + 1 < argc) {
+      numpart = atoi(argv[i+1]);
+      i++;
+    }
+    // -loop <n>: same rule; i stays below argc after the value is consumed
+    if (argv[i] == string("-loop") && i + 1 < argc) {
+      loop = atoi(argv[i+1]);
+      i++;
+    }
+
+  }
+
+  int threads = 0;
+  /*
+#pragma offload target(mic:0) out(threads)
+  {
+    #pragma omp parallel
+    {
+      threads = omp_get_num_threads();
+    }
+  }
+  */
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "Number of loops: " << loop << endl;
+  cout << "Number of threads: " << threads << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init part vector to test mc
+  //int *label;
+  //unsigned *localID;
+  //double *rx, *ry, *rz, *px, *py, *pz;
+  PART p;
+  p.label   = (int*) _mm_malloc(sizeof(int)*numpart, 64);
+  p.localID = (unsigned*) _mm_malloc(sizeof(unsigned)*numpart, 64);
+  p.rx      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.ry      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.rz      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.px      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.py      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  p.pz      = (double*) _mm_malloc(sizeof(double)*numpart, 64);
+  initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
+
+  double *params = new double[12];
+  initParams(params);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));  // length of the device string, not of the API name
+  base.initDevice();
+
+  //init random
+  base.callInitRandoms(numpart);
+
+  //**test collimator physics and sort***//
+  void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
+
+  //allocate memory for particles
+  label_ptr   = base.allocateMemory<int>(numpart, ierr);
+  localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
+  rx_ptr      = base.allocateMemory<double>(numpart, ierr);
+  ry_ptr      = base.allocateMemory<double>(numpart, ierr);
+  rz_ptr      = base.allocateMemory<double>(numpart, ierr);
+  px_ptr      = base.allocateMemory<double>(numpart, ierr);
+  py_ptr      = base.allocateMemory<double>(numpart, ierr);
+  pz_ptr      = base.allocateMemory<double>(numpart, ierr);
+
+  param_ptr = base.allocateMemory<double>(12, ierr);
+
+  //transfer data to device
+  base.writeData<int>(label_ptr, p.label, numpart);
+  base.writeData<unsigned>(localID_ptr, p.localID, numpart);
+  base.writeData<double>(rx_ptr, p.rx, numpart);
+  base.writeData<double>(ry_ptr, p.ry, numpart);
+  base.writeData<double>(rz_ptr, p.rz, numpart);
+  base.writeData<double>(px_ptr, p.px, numpart);
+  base.writeData<double>(py_ptr, p.py, numpart);
+  base.writeData<double>(pz_ptr, p.pz, numpart);
+
+  //transfer params to device
+  base.writeData<double>(param_ptr, params, 12);
+
+  std::cout << "test runs" << std::endl;
+
+  int numaddback = 0;
+  //test calls to do some first executions
+  base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+                                py_ptr, pz_ptr, param_ptr, numpart);
+  base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+                                    py_ptr, pz_ptr, param_ptr, numpart, numaddback);
+  base.syncDevice();
+
+  struct timeval timeStart, timeEnd;
+  std::cout << "Start MC" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+    base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+                                  py_ptr, pz_ptr, param_ptr, numpart);
+    base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
+                                      py_ptr, pz_ptr, param_ptr, numpart, numaddback);
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "addback: " << numaddback << std::endl;
+
+  std::cout << "End MC" << std::endl;
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
+               (timeEnd.tv_usec - timeStart.tv_usec));
+
+  std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
+
+  //read data from device
+  base.readData<int>(label_ptr, p.label, numpart);
+  base.readData<unsigned>(localID_ptr, p.localID, numpart);
+  base.readData<double>(rx_ptr, p.rx, numpart);
+  base.readData<double>(ry_ptr, p.ry, numpart);
+  base.readData<double>(rz_ptr, p.rz, numpart);
+  base.readData<double>(px_ptr, p.px, numpart);
+  base.readData<double>(py_ptr, p.py, numpart);
+  base.readData<double>(pz_ptr, p.pz, numpart);
+
+  //free memory
+  base.freeMemory<int>(label_ptr, numpart);
+  base.freeMemory<unsigned>(localID_ptr, numpart);
+  base.freeMemory<double>(rx_ptr, numpart);
+  base.freeMemory<double>(ry_ptr, numpart);
+  base.freeMemory<double>(rz_ptr, numpart);
+  base.freeMemory<double>(px_ptr, numpart);
+  base.freeMemory<double>(py_ptr, numpart);
+  base.freeMemory<double>(pz_ptr, numpart);
+
+  base.freeMemory<double>(param_ptr, 12);
+
+  /*  
+  std::cout << std::fixed << std::setprecision(4);
+  for (int i = 0; i < 10; i++) {
+    std::cout <<  p.label[i] << "\t" << p.rx[i] 
+	      << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] 
+	      << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
+  }
+  std:: cout << "..." << std::endl;
+
+  for (int i = numpart - 10; i < numpart; i++) {
+    std::cout << p.label[i] << "\t" << p.rx[i] 
+	      << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] 
+	      << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
+  }
+
+  double arx = 0, ary = 0, arz = 0;
+  double apx = 0, apy = 0, apz = 0;
+  for (int i = 0; i < numpart; i++) {
+
+    arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
+    ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
+    arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
+
+    apx += sqrt(p.px[i] * p.px[i]) / numpart;
+    apy += sqrt(p.py[i] * p.py[i]) / numpart;
+    apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
+
+  }
+
+  std::cout << std::fixed << std::setprecision(10);
+  std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
+	    << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
+  */
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
diff --git a/test/testDKS.cpp b/test/testDKS.cpp
new file mode 100644
index 0000000..4b66732
--- /dev/null
+++ b/test/testDKS.cpp
@@ -0,0 +1,15 @@
+#include <iostream>
+#include <complex>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// Smoke test: construct a DKSBase and call getDevices(); the arguments are not used.
+	DKSBase base = DKSBase();
+	base.getDevices();
+		
+	return 0;
+}
+
diff --git a/test/testFFT.cpp b/test/testFFT.cpp
new file mode 100644
index 0000000..c3fec1b
--- /dev/null
+++ b/test/testFFT.cpp
@@ -0,0 +1,83 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+  // 1D FFT -> IFFT -> normalize round trip; usage: testFFT [api_name [device_name]]
+  char *api_name = new char[10]();     // zero-filled, so a 9-char strncpy stays terminated
+  char *device_name = new char[10]();
+  if (argc == 2) {
+    strncpy(api_name, argv[1], 9);     // bounded: argv may be longer than the buffer
+    strcpy(device_name, "-gpu");
+  } else if (argc == 3) {
+    strncpy(api_name, argv[1], 9);
+    strncpy(device_name, argv[2], 9);
+  } else {
+    strcpy(api_name, "OpenCL");
+    strcpy(device_name, "-gpu");
+  }
+
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+
+  cout << "Begin DKS Base tests" << endl;
+
+  int N = 2;
+  int dimsize[3] = {N, N, N};
+
+  complex<double> *cdata = new complex<double>[N];
+  complex<double> *cfft = new complex<double>[N];
+  for (int i = 0; i < N; i++) {
+    cdata[i] = complex<double>(0, 0);
+    cfft[i] = complex<double>(0, 0);
+  }
+
+  cdata[0] = complex<double>(1.73205, 1.73205);
+
+  timestamp_t t0, t1;
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));  /* length of the device string, not of the API name */
+  base.initDevice();
+
+  void *mem_ptr;
+  int ierr;
+
+  /* write data to device */
+  mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 1, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 1, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 1, dimsize);
+
+  /* read data from device */
+  base.pullData< complex<double> >(mem_ptr, cfft, N);
+
+  /* print results */
+
+  cout << "Data" << endl;
+  for (int i = 0; i < N; i++)
+    cout << cdata[i] << "\t";
+  cout << endl;
+
+  cout << "FFT" << endl;
+  for (int i = 0; i < N; i++)
+    cout << cfft[i] << "\t";
+  cout << endl;
+
+
+  return 0;
+}
+
diff --git a/test/testFFT3D.cpp b/test/testFFT3D.cpp
new file mode 100644
index 0000000..ff14242
--- /dev/null
+++ b/test/testFFT3D.cpp
@@ -0,0 +1,159 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+/* usage - ./testFFT3D [N [api_name [device_name]]] */
+int main(int argc, char *argv[]) {
+  // 3D complex FFT -> IFFT -> normalize round trip on an N*N*N grid, compared with the input.
+  int N = 16;
+  char *api_name = new char[10]();     // zero-filled, so a 9-char strncpy stays terminated
+  char *device_name = new char[10]();
+  if (argc == 2) {
+    N = atoi(argv[1]);
+    strcpy(api_name, "Cuda");
+    strcpy(device_name, "-gpu");
+  } else if (argc == 3) {
+    N = atoi(argv[1]);
+    strncpy(api_name, argv[2], 9);     // bounded: argv may be longer than the buffer
+    strcpy(device_name, "-gpu");
+  } else if (argc == 4) {
+    N = atoi(argv[1]);
+    strncpy(api_name, argv[2], 9);
+    strncpy(device_name, argv[3], 9);
+  } else {
+    N = 16;
+    strcpy(api_name, "OpenCL");
+    strcpy(device_name, "-gpu");
+  }
+  if (N < 1) { cout << "N must be a positive integer" << endl; return 1; }  // atoi yields 0 on junk input
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N, N, N};
+
+  cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+  int dim = 3;
+  complex<double> *cdata = new complex<double>[N*N*N];
+  complex<double> *cfft = new complex<double>[N*N*N];
+  complex<double> *cifft = new complex<double>[N*N*N];
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
+        cfft[i*N*N + j*N + k] = complex<double>(0, 0);
+        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+      }
+    }
+  }
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  void *mem_ptr;
+  int ierr;
+
+  /* allocate memory on device */
+  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+  /* write data to device */
+  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 3, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 3, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+  /* read data from device */
+  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+  /* free device memory */
+  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+
+  /* compare results */
+  compareData(cdata, cifft, N, dim);
+
+  return 0;
+}
+
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+  // Prints up to three nested levels of N values; levels beyond dim collapse to 1.
+  // With normalize set, only the real part divided by N is printed.
+  const int planes = (dim > 2) ? N : 1;
+  const int rows   = (dim > 1) ? N : 1;
+  for (int p = 0; p < planes; p++) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < N; c++) {
+        const complex<double> &v = data[p*planes*planes + r*rows + c];
+        if (normalize)
+          cout << v.real() / N << "\t";
+        else
+          cout << v.real() << " " << v.imag() << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+
+}
+
+void printData3DN4(complex<double>* &data, int N, int dim) {
+  // Prints an N*N*N array, one output line per j, with components of
+  // magnitude below 10e-5 shown as 0. The dim argument is not used.
+  const double eps = 10e-5;
+  for (int j = 0; j < N; j++) {
+    for (int i = 0; i < N; i++) {
+      for (int k = 0; k < N; k++) {
+        const complex<double> &v = data[i*N*N + j*N + k];
+        double re = v.real();
+        double im = v.imag();
+        re = (re < eps && re > -eps) ? 0 : re;
+        im = (im < eps && im > -eps) ? 0 : im;
+        cout << re << "; " << im << "\t";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+
+
+}
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+  // Sums |re1-re2| and |im1-im2| over the visited elements and prints the total.
+  const int outer = (dim > 2) ? N : 1, mid = (dim > 1) ? N : 1;
+  double total = 0;
+  for (int i = 0; i < outer; i++) {
+    for (int j = 0; j < mid; j++) {
+      const int first = i*outer*outer + j*mid;
+      for (int k = 0; k < N; k++) {
+        const complex<double> &a = data1[first + k];
+        const complex<double> &b = data2[first + k];
+        total += fabs(a.real() - b.real());
+        total += fabs(a.imag() - b.imag());
+      }
+    }
+  }
+  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
diff --git a/test/testFFT3DRC.cpp b/test/testFFT3DRC.cpp
new file mode 100644
index 0000000..b0a0625
--- /dev/null
+++ b/test/testFFT3DRC.cpp
@@ -0,0 +1,199 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
+void initData(double *data, int dimsize[3]);
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
+void printHelp();
+
+int main(int argc, char *argv[]) {
+  // Times `loop` rounds of write -> R2C FFT -> C2R FFT -> read and compares output with input.
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 8;
+  int dim = 3;
+  int loop = 10;
+
+  if ( readParams(argc, argv, N1, N2, N3, loop) )
+    return 0;
+  if (loop < 1 || N1 < 1 || N2 < 1 || N3 < 1) { printHelp(); return 1; }  // sizes feed new[] and the timer arrays
+  int dimsize[3] = {N3, N2, N1};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
+
+  double *rdata = new double[sizereal];
+  double *outdata = new double[sizereal];
+  complex<double> *cfft = new complex<double>[sizecomp];
+
+  for (int i=0; i<sizecomp; ++i) {
+    cfft[i] = complex<double>(7., 3.33);
+    // real()/imag() return by value since C++11, so they cannot be assigned through
+  }
+  initData(rdata, dimsize);
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+#ifdef DKS_MIC
+  DKSBase base;
+  base.setAPI("OpenMP", 6);
+  base.setDevice("-mic", 4);
+  base.initDevice();
+  base.setupFFTRC(dim, dimsize);
+  /* setup backward fft (COMPLEX->REAL) */
+  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
+#endif
+
+#ifdef DKS_CUDA
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.setupFFT(dim, dimsize);
+#endif
+
+  // allocate memory on device
+  int ierr;
+  void *real_ptr, *comp_ptr, *real_res_ptr;
+  real_ptr = base.allocateMemory<double>(sizereal, ierr);
+  real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);
+
+  // execute one run before starting the timers
+  base.writeData<double>(real_ptr, rdata, sizereal);
+  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
+  base.readData<double>(real_res_ptr, outdata, sizereal);
+
+  //timer for total loop time, FFT and IFFT calls
+  struct timeval timeStart, timeEnd;
+  struct timeval timeFFTStart[loop], timeFFTEnd[loop];
+  struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
+
+  gettimeofday(&timeStart, NULL);
+  for (int i=0; i<loop; ++i){
+
+    // write data to device
+    base.writeData<double>(real_ptr, rdata, sizereal);
+
+    // execute rcfft
+    gettimeofday(&timeFFTStart[i], NULL);
+    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+    gettimeofday(&timeFFTEnd[i], NULL);
+
+    // execute crfft
+    gettimeofday(&timeIFFTStart[i], NULL);
+    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
+    gettimeofday(&timeIFFTEnd[i], NULL);
+
+    //normalize
+#ifdef DKS_CUDA
+    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
+#endif
+
+    // read IFFT data from device
+    base.readData<double>(real_res_ptr, outdata, sizereal);
+
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  // free device memory
+  base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
+  base.freeMemory<double>(real_ptr, sizereal);
+  base.freeMemory<double>(real_res_ptr, sizereal);
+
+  // compare in and out data to see if we get back the same results
+  compareData(rdata, outdata, N1, N2, N3, dim);
+
+  //calculate seconds for total time and fft times
+  double tfft = 0;
+  double tifft = 0;
+  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
+                  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
+
+  for (int i = 0; i < loop; i++) {
+    tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
+              (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
+
+    tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
+               (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
+  }
+
+  //print timing results
+  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
+            << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
+            << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
+            << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
+            << "\n\n";
+
+  return 0;
+}
+
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
+  // Accumulates |data1 - data2| over all NI*NJ*NK elements and prints it; dim is not used.
+  const int plane = NI*NJ;
+  double total = 0;
+  for (int i = 0; i < NI; ++i) {
+    for (int j = 0; j < NJ; ++j) {
+      const int rowStart = j*NI + i;
+      for (int k = 0; k < NK; ++k)
+        total += fabs(data1[k*plane + rowStart] - data2[k*plane + rowStart]);
+    }
+  }
+  std::cout << "RC <--> CR diff: " << total << std::endl;
+}
+
+void initData(double *data, int dimsize[3]) {
+  // Each contiguous run of dimsize[0] values is filled with 0, 1, ..., dimsize[0]-1.
+  const int nx = dimsize[0], ny = dimsize[1], nz = dimsize[2];
+  for (int z = 0; z < nz; ++z)
+    for (int y = 0; y < ny; ++y)
+      for (int x = 0; x < nx; ++x)
+        data[z*ny*nx + y*nx + x] = x;
+
+}
+
+void printHelp() {
+  std::cout << std::endl;
+  // Usage text for -h/-help; adjacent literals need an explicit space where a sentence continues.
+  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real "
+            << "function on the Intel MIC.\n";
+  std::cout << "Operations performed by testRC are: "
+            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n";
+  std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
+            << "-loop $l\n";
+  std::cout << "where $x $y $z are number of elements in each dimension and "
+            << "$l is the number of times all the operations will be performed.\n";
+
+  std::cout << std::endl;
+}
+
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
+  // Parses -grid x y z, -loop l and -h/-help; returns true only when help was printed.
+  for (int i = 1; i < argc; i++) {
+    // an option is honoured only when all of its values are present in argv
+    if ( argv[i] == std::string("-grid") && i + 3 < argc ) {
+      N1 = atoi(argv[i + 1]);
+      N2 = atoi(argv[i + 2]);
+      N3 = atoi(argv[i + 3]);
+      i += 3;
+    }
+
+    if ( argv[i] == std::string("-loop") && i + 1 < argc ) {
+      loop = atoi(argv[i + 1]);
+      i += 1;
+    }
+    // i is still below argc here, so argv[i] is never the terminating null pointer
+    if ( argv[i] == std::string("-h") || argv[i] == std::string("-help") ) {
+      printHelp();
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/test/testFFT3DRC_MIC.cpp b/test/testFFT3DRC_MIC.cpp
new file mode 100644
index 0000000..9eafe04
--- /dev/null
+++ b/test/testFFT3DRC_MIC.cpp
@@ -0,0 +1,220 @@
+#include <iostream>
+#include <stdlib.h>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+void printData3DN4(double* data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+void compareData(double* data1, double* data2, int N, int dim);
+
+/* Compute (K*L)%M with the product formed in long long, returned as a double */
+static double moda(int K, int L, int M) {
+	const long long product = static_cast<long long>(K) * L;
+	return static_cast<double>(product % M);
+}
+/* Initialize array x(N) to produce unit peaks at x(H) and x(N-H) */
+static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
+{
+	/* Fills x with factor*cos(2*pi*phase)/(N1*N2*N3), where phase is the sum of
+	   moda(n,H,N)/N over the three dimensions. Rows of x are (N3/2+1)*2 doubles
+	   apart; main() in this file allocates N1*N2*(N3/2+1)*2 doubles for it. */
+	const double TWOPI = 6.2831853071795864769;
+	const int rowStride = (N3/2+1)*2;
+	const int planeStride = N2*(N3/2+1)*2;
+	const bool allZero = (N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0;
+	const double factor = allZero ? 1.0 : 2.0;
+
+	for (int n1 = 0; n1 < N1; n1++)
+	{
+		for (int n2 = 0; n2 < N2; n2++)
+		{
+			const int rowBase = n1*planeStride + n2*rowStride;
+			for (int n3 = 0; n3 < N3; n3++)
+			{
+				double phase = moda(n1,H1,N1) / N1;
+				phase += moda(n2,H2,N2) / N2;
+				phase += moda(n3,H3,N3) / N3;
+				x[rowBase + n3] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
+			}
+		}
+	}
+
+}
+
+
+int main(int argc, char *argv[]) {
+	if (argc < 2 || atoi(argv[1]) < 1) { cout << "usage: " << argv[0] << " N   (N >= 1)" << endl; return 1; }
+	int N = atoi(argv[1]);  /* grid size per dimension; validated above */
+	int dim = 3;
+	int dimsize[3] = {N, N, N};
+	int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+	int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];
+
+	//double *rdata = new double[sizereal];
+	//double *outdata = new double[sizereal];
+	//complex<double> *cfft = new complex<double>[sizecomp];
+	double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
+	double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
+	complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
+
+	init_r(rdata, N,N,N);
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+
+	DKSBase base;
+	base.setAPI("OpenMP", 6);
+	base.setDevice("-mic", 4);
+	base.initDevice();
+
+	/* setup forward fft (REAL->COMPLEX) */
+	base.setupFFTRC(dim, dimsize);
+
+	int ierr;
+	void *real_ptr, *comp_ptr;
+
+	/* allocate memory on device */
+	real_ptr = base.allocateMemory<double>(sizereal, ierr);
+	comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+	/* NOTE(review): init_r fills rdata with padded rows, but only sizereal values are sent - confirm */
+	/* write data to device */
+	base.writeData<double>(real_ptr, rdata, sizereal);
+
+	//printData3DN4(rdata,N,3);
+
+	/* execute rcfft */
+	base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+
+	/* read FFT data from device */
+	base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+	base.writeData<double>(comp_ptr, cfft, sizereal);
+
+
+	/* setup backward fft (COMPLEX->REAL) */
+	base.setupFFTCR(dim, dimsize,1./(N*N*N));
+	/* execute crfft */
+	base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);
+
+	/* normalize */
+	//base.callNormalizeC2RFFT(real_ptr, dim, dimsize);
+
+	/* read FFT data from device */
+	//base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+
+	/* read IFFT data from device */
+	base.readData<double>(real_ptr, outdata, sizereal);
+
+	/* free device memory */
+	base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+	base.freeMemory<double>(real_ptr, sizereal);
+
+	/* compare data */
+	compareData(rdata, outdata, N, dim);
+	free(rdata); free(outdata); free(cfft);  /* release the malloc'ed host buffers */
+	return 0;
+}
+
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+	// Prints the real parts of data as tab-separated rows of N values;
+	// with normalize set, each value is divided by N first.
+	const int planes = (dim > 2) ? N : 1;
+	const int rows = (dim > 1) ? N : 1;
+
+	for (int p = 0; p < planes; p++) {
+		for (int r = 0; r < rows; r++) {
+			const int first = p*planes*planes + r*rows;
+			for (int c = 0; c < N; c++) {
+				const double re = data[first + c].real();
+				cout << (normalize ? re / N : re) << "\t";
+			}
+			cout << endl;
+		}
+		cout << endl;
+	}
+
+}
+
+void printData3DN4(complex<double>* &data, int N, int dim) {
+	// Prints an N*N*N complex array, one output line per j, with components
+	// of magnitude below 10e-5 shown as 0. The dim argument is not used.
+	const double eps = 10e-5;
+	for (int j = 0; j < N; j++) {
+		for (int i = 0; i < N; i++) {
+			for (int k = 0; k < N; k++) {
+				const complex<double> &v = data[i*N*N + j*N + k];
+				double re = v.real();
+				double im = v.imag();
+				re = (re < eps && re > -eps) ? 0 : re;
+				im = (im < eps && im > -eps) ? 0 : im;
+				cout << re << "; " << im << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+
+
+}
+void printData3DN4(double* data, int N, int dim) {
+	// Real-valued variant: prints an N*N*N array, one output line per j,
+	// showing values of magnitude below 10e-5 as 0. dim is not used.
+	const double eps = 10e-5;
+
+	for (int j = 0; j < N; j++) {
+		for (int i = 0; i < N; i++) {
+			const int first = i*N*N + j*N;
+			for (int k = 0; k < N; k++) {
+				double v = data[first + k];
+				if (v < eps && v > -eps)
+					v = 0;
+				cout << v << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+
+
+}
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+	// Sums |re1-re2| and |im1-im2| over the visited elements and prints the total.
+	const int outer = (dim > 2) ? N : 1, mid = (dim > 1) ? N : 1;
+	double total = 0;
+	for (int i = 0; i < outer; i++) {
+		for (int j = 0; j < mid; j++) {
+			const int first = i*outer*outer + j*mid;
+			for (int k = 0; k < N; k++) {
+				const complex<double> &a = data1[first + k];
+				const complex<double> &b = data2[first + k];
+				total += fabs(a.real() - b.real());
+				total += fabs(a.imag() - b.imag());
+			}
+		}
+	}
+	cout << "Size " << N << " CC <--> CC diff: " << total << endl;
+}
+
+void compareData(double* data1, double* data2, int N, int dim) {
+	// Prints the sum of |data1[id] - data2[id]| over the visited indices;
+	// no scaling is applied to either array.
+	const int outer = (dim > 2) ? N : 1, mid = (dim > 1) ? N : 1;
+	double total = 0;
+	for (int i = 0; i < outer; i++) {
+		for (int j = 0; j < mid; j++) {
+			const int first = i*outer*outer + j*mid;
+			for (int k = 0; k < N; k++) {
+				const double delta = data1[first + k] - data2[first + k];
+				total += fabs(delta);
+			}
+		}
+	}
+	cout << "Size " << N << " RC <--> CR diff: " << total << endl;
+
+}
diff --git a/test/testFFT3DSO.cpp b/test/testFFT3DSO.cpp
new file mode 100644
index 0000000..ff14242
--- /dev/null
+++ b/test/testFFT3DSO.cpp
@@ -0,0 +1,159 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double>* &data, int N, int dim, bool normalize = false);
+void printData3DN4(complex<double>* &data, int N, int dim);
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+/* usage - ./testFFT3DSO [N [api_name [device_name]]] */
+int main(int argc, char *argv[]) {
+  /* Round-trip test: FFT, inverse FFT and normalization of an N^3 complex field on the device. */
+  /* Arguments: [N [api_name [device_name]]]; with no arguments N = 16, api OpenCL, device -gpu. */
+  char cuda_api[] = "Cuda";
+  char opencl_api[] = "OpenCL";
+  char gpu_device[] = "-gpu";
+
+  /* Point at argv or the defaults instead of copying into fixed 10-byte buffers, */
+  /* so long api/device names can no longer overflow and nothing has to be freed. */
+  int N = 16;
+  char *api_name = opencl_api;
+  char *device_name = gpu_device;
+  if (argc >= 2 && argc <= 4) {
+    N = atoi(argv[1]);
+    /* when only N is given the default API is Cuda, not OpenCL */
+    api_name = (argc >= 3) ? argv[2] : cuda_api;
+    if (argc == 4)
+      device_name = argv[3];
+  }
+  if (N < 1) {
+    cout << "N must be a positive integer" << endl;
+    return 1;
+  }
+
+  cout << "Use api: " << api_name << ", " << device_name << endl;
+
+  int dimsize[3] = {N, N, N};
+  cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+  int dim = 3;
+  complex<double> *cdata = new complex<double>[N*N*N];
+  complex<double> *cfft = new complex<double>[N*N*N];
+  complex<double> *cifft = new complex<double>[N*N*N];
+
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
+        cfft[i*N*N + j*N + k] = complex<double>(0, 0);
+        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+      }
+    }
+  }
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  void *mem_ptr;
+  int ierr;
+  /* allocate memory on device */
+  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+  /* write data to device */
+  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+  /* execute fft */
+  base.callFFT(mem_ptr, 3, dimsize);
+
+  /* execute ifft */
+  base.callIFFT(mem_ptr, 3, dimsize);
+
+  /* execute normalize */
+  base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+  /* read data from device */
+  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+  /* free device memory */
+  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+
+  /* compare results, then release the host buffers */
+  compareData(cdata, cifft, N, dim);
+  delete[] cdata;
+  delete[] cfft;
+  delete[] cifft;
+  return 0;
+}
+
+void printData(complex<double>* &data, int N, int dim, bool normalize) {
+  /* Print a dim-dimensional field of edge N: "re im" pairs, or only re / N when normalize is set. */
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  const int nk = N;
+
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < nk; k++) {
+        const complex<double> &value = data[i*ni*ni + j*nj + k];
+        if (normalize)
+          cout << value.real() / N << "\t";
+        else
+          cout << value.real() << " " << value.imag() << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+void printData3DN4(complex<double>* &data, int N, int dim) {
+  /* Print an N*N*N complex field, one output line per j: "re; im" pairs, |x| < 10e-5 shown as 0. dim is unused. */
+  const double eps = 10e-5;
+  for (int row = 0; row < N; row++) {
+    for (int slab = 0; slab < N; slab++) {
+      const complex<double> *line = data + slab*N*N + row*N;
+      for (int col = 0; col < N; col++) {
+        double re = line[col].real();
+        double im = line[col].imag();
+        /* flush values inside (-eps, eps) to zero */
+        if (re < eps && re > -eps)
+          re = 0;
+        if (im < eps && im > -eps)
+          im = 0;
+        cout << re << "; " << im << "\t";
+      }
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+  /* Print the summed absolute difference (real and imaginary parts) of two dim-dimensional fields of edge N. */
+  const int ni = (dim > 2) ? N : 1;
+  const int nj = (dim > 1) ? N : 1;
+  double diff = 0;
+  for (int i = 0; i < ni; i++) {
+    for (int j = 0; j < nj; j++) {
+      for (int k = 0; k < N; k++) {
+        /* strides ni*ni and nj equal N*N and N whenever i or j can be non-zero */
+        const complex<double> delta = data1[i*ni*ni + j*nj + k] - data2[i*ni*ni + j*nj + k];
+        diff += fabs(delta.real());
+        diff += fabs(delta.imag());
+      }
+    }
+  }
+  cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
+}
+
diff --git a/test/testFFT3DTiming.cpp b/test/testFFT3DTiming.cpp
new file mode 100644
index 0000000..27ef7cf
--- /dev/null
+++ b/test/testFFT3DTiming.cpp
@@ -0,0 +1,130 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+
+int main(int argc, char *argv[]) {
+	/* Times `steps` FFT -> inverse FFT -> normalize round trips of an N^3 complex field. */
+	/* Arguments: [api_name [device_name [N]]]; defaults are OpenCL, -gpu and N = 4. */
+	char default_api[] = "OpenCL";
+	char default_device[] = "-gpu";
+
+	/* Use argv directly: copying into fixed 10-byte buffers could overflow. */
+	int N = 4;
+	char *api_name = (argc > 1) ? argv[1] : default_api;
+	char *device_name = (argc > 2) ? argv[2] : default_device;
+	/* argv[3] is only present when argc > 3, so N is read only then */
+	if (argc > 3)
+		N = atoi(argv[3]);
+	if (N < 1) {
+		cout << "N must be a positive integer" << endl;
+		return 1;
+	}
+	int dimsize[3] = {N, N, N};
+
+	cout << "Use api: " << api_name << endl;
+	cout << "Begin DKS Base tests, N = " <<  N << endl;
+
+	complex<double> *cdata = new complex<double>[N*N*N];
+	complex<double> *cfft = new complex<double>[N*N*N];
+	complex<double> *cifft = new complex<double>[N*N*N];
+
+	for (int i = 0; i < N; i++) {
+		for (int j = 0; j < N; j++) {
+			for (int k = 0; k < N; k++) {
+				cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
+				cfft[i*N*N + j*N + k] = complex<double>(0, 0);
+				cifft[i*N*N + j*N + k] = complex<double>(0, 0);
+			}
+		}
+	}
+
+	timestamp_t t0, t1;
+
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	/* the length passed must be that of device_name, not of api_name */
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+
+	void *mem_ptr;
+	int ierr;
+
+	/* run one untimed round trip first to init the device */
+	mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+	ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+	base.callFFT(mem_ptr, 3, dimsize);
+	base.callIFFT(mem_ptr, 3, dimsize);
+	base.callNormalizeFFT(mem_ptr, 3, dimsize);
+	base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+	base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+	/* end test */
+
+	int steps = 10;
+	base.oclClearEvents();
+	t0 = get_timestamp();
+	for (int i = 0; i < steps; i++) {
+
+		/* allocate memory on device */
+		mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+
+		/* write data to device */
+		ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+
+		/* execute fft */
+		base.callFFT(mem_ptr, 3, dimsize);
+
+		/* execute ifft */
+		base.callIFFT(mem_ptr, 3, dimsize);
+
+		/* execute normalize */
+		base.callNormalizeFFT(mem_ptr, 3, dimsize);
+
+		/* read data from device */
+		base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
+
+		/* free device memory with the same element count that was allocated */
+		base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+	}
+	t1 = get_timestamp();
+
+	/* report the mean wall-clock time of one iteration */
+	cout << "=========================" << endl;
+	cout << "Average total: " << get_secs(t0, t1) / steps << endl;
+	cout << "=========================" << endl;
+
+	/* release the host buffers */
+	delete[] cdata;
+	delete[] cfft;
+	delete[] cifft;
+
+	return 0;
+}
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+	/* Print the summed absolute difference (real and imaginary parts) of two dim-dimensional fields of edge N. */
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	double diff = 0;
+	for (int i = 0; i < ni; i++) {
+		for (int j = 0; j < nj; j++) {
+			for (int k = 0; k < N; k++) {
+				/* strides ni*ni and nj equal N*N and N whenever i or j can be non-zero */
+				const complex<double> delta = data1[i*ni*ni + j*nj + k] - data2[i*ni*ni + j*nj + k];
+				diff += fabs(delta.real());
+				diff += fabs(delta.imag());
+			}
+		}
+	}
+	cout << "Size " << N << " CC <--> CC diff: " << diff << endl;
+}
+
diff --git a/test/testFFTAsync.cpp b/test/testFFTAsync.cpp
new file mode 100644
index 0000000..89550a9
--- /dev/null
+++ b/test/testFFTAsync.cpp
@@ -0,0 +1,117 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+
+
+using namespace std;
+
+void initData(double *data, int dimsize[3]) {
+  /* Fill a dimsize[2] x dimsize[1] x dimsize[0] array so that every element holds its fastest-axis index. */
+  const int nx = dimsize[0], ny = dimsize[1], nz = dimsize[2];
+  for (int slab = 0; slab < nz; slab++)
+    for (int row = 0; row < ny; row++) {
+      const int start = (slab*ny + row)*nx;
+      for (int col = 0; col < nx; col++) data[start + col] = col;
+    }
+}
+
+int main(int argc, char *argv[]) {
+  /* Issues real-to-complex 3D FFTs of two N^3 fields on two DKS streams (Cuda, -gpu). */
+  /* Optional argument: N, the edge length of the cube (default 8). */
+  int N = 8;
+  if (argc == 2)
+    N = atoi(argv[1]);
+
+  int N1 = N;
+  int N2 = N;
+  int N3 = N;
+  int dim = 3;
+
+  int dimsize[3] = {N3, N2, N1};
+  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
+  int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1);
+
+  double *data1 = new double[sizereal];
+  double *data2 = new double[sizereal];
+
+  /* init DKSBase */
+  cout << "Init device and set function" << endl;
+
+  DKSBase base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.setupFFT(3, dimsize);
+
+  /* pagelock data */
+  /* NOTE(review): testGather.cpp hands allocateHostMemory an unset pointer, so it presumably */
+  /* replaces data1/data2; the fields are therefore filled only after these calls - confirm. */
+  base.allocateHostMemory(data1, sizereal);
+  base.allocateHostMemory(data2, sizereal);
+  initData(data1, dimsize);
+  initData(data2, dimsize);
+
+  /* create streams */
+  int fft1, fft2;
+  base.createStream(fft1);
+  base.createStream(fft2);
+
+  int ierr;
+  void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2;
+
+  cout << "allocating memory ..." << endl;
+  /* allocate memory on device */
+  real_ptr1 = base.allocateMemory<double>(sizereal, ierr);
+  real_ptr2 = base.allocateMemory<double>(sizereal, ierr);
+  comp_ptr1 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
+  comp_ptr2 = base.allocateMemory< complex<double> >(sizecomp*2, ierr);
+
+  /* raw cuFFT plan and CUDA streams: nothing below uses them, they are only created and released */
+  cufftHandle defaultPlan;
+  cudaStream_t cfft1, cfft2;
+  cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z);
+  cudaStreamCreate(&cfft1);
+  cudaStreamCreate(&cfft2);
+
+  for (int i = 0; i < 5; i++) {
+    cout << "Iteration: " << i << endl;
+    /* write data to device */
+    base.writeDataAsync<double>(real_ptr1, data1, sizereal, fft1);
+
+    /* execute rcfft */
+    base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1);
+
+    /* write data to device */
+    base.writeDataAsync<double>(real_ptr2, data2, sizereal, fft2);
+
+    /* execute rcfft */
+    base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2);
+  }
+
+  /* wait for the queued work before releasing the buffers it uses */
+  base.syncDevice();
+
+  base.freeMemory<double>(real_ptr1, sizereal);
+  base.freeMemory<double>(real_ptr2, sizereal);
+  /* the complex buffers were allocated with sizecomp*2 elements, not sizereal */
+  base.freeMemory< complex<double> >(comp_ptr1, sizecomp*2);
+  base.freeMemory< complex<double> >(comp_ptr2, sizecomp*2);
+
+  /* release the raw cuFFT/CUDA handles created above */
+  cufftDestroy(defaultPlan);
+  cudaStreamDestroy(cfft1);
+  cudaStreamDestroy(cfft2);
+
+  /* free pagelock data */
+  base.freeHostMemory(data1, sizereal);
+  base.freeHostMemory(data2, sizereal);
+
+  return 0;
+}
diff --git a/test/testFFTSolver.cpp b/test/testFFTSolver.cpp
new file mode 100644
index 0000000..4f01bdc
--- /dev/null
+++ b/test/testFFTSolver.cpp
@@ -0,0 +1,301 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+void printData3D(double* data, int N, int NI, const char *message = "") {
+  /* Print the optional message, then NI slices of N x N values as tab-separated rows. */
+  if (message[0] != '\0')
+    cout << message;
+
+  const double *cursor = data;
+  for (int slice = 0; slice < NI; slice++) {
+    for (int row = 0; row < N; row++) {
+      for (int col = 0; col < N; col++)
+        cout << *cursor++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+void initData(double *data, int N) {
+  /* Write 1, 2, ... along k into the leading (N/4+1) x (N/2+1) x (N/2+1) corner of an N-strided cube. */
+  const int depth = N/4 + 1;
+  const int extent = N/2 + 1;
+  for (int i = 0; i < depth; i++)
+    for (int j = 0; j < extent; j++) {
+      const int start = i*N*N + j*N;
+      for (int k = 0; k < extent; k++) data[start + k] = k+1;
+    }
+}
+
+void initData2(double *data, int N) {
+  /* Fill the first N entries with the ramp 0, 1, ..., N-1. */
+  for (int pos = N - 1; pos >= 0; --pos) data[pos] = pos;
+}
+
+void initComplex( complex<double> *d, int N) {
+  /* Set each of the first N elements of d to 2 + 0i. */
+  const complex<double> two(2, 0);
+  int left = N;
+  while (left-- > 0)
+    d[left] = two;
+}
+
+void printComplex(complex<double> *d, int N) {
+  /* Print the first N values of d on a single line, each followed by a tab. */
+  int i = 0;
+  while (i < N)
+    cout << d[i++] << "\t";
+  cout << endl;
+}
+
+void initMirror(double *data, int n1, int n2, int n3) {
+  /* Number the cells of the low corner (index < n/2 + 1 on every axis) 1, 2, ... in memory order; zero the rest. */
+  int next = 1;
+  for (int i = 0; i < n3; i++) {
+    for (int j = 0; j < n2; j++) {
+      const bool outer = (i < n3/2 + 1) && (j < n2/2 + 1);
+      for (int k = 0; k < n1; k++) {
+        const bool inside = outer && (k < n1/2 + 1);
+        data[(i * n2 + j) * n1 + k] = inside ? next++ : 0;
+      }
+    }
+  }
+}
+
+void printDiv(int c) {
+  /* Print a divider line made of c dashes. */
+  for (int left = c; left > 0; --left)
+    cout << "-";
+  cout << endl;
+}
+
+void printMirror(double *data, int n1, int n2, int n3) {
+  /* Print a divider, then the n3 x n2 x n1 array as tab-separated rows with a blank line after each slice. */
+  printDiv(75);
+  for (int slice = 0; slice < n3; slice++) {
+    for (int row = 0; row < n2; row++) {
+      const int start = (slice * n2 + row) * n1;
+      for (int col = 0; col < n1; col++)
+        cout << data[start + col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+double sumData(double *data, int datasize) {
+  /* Return the sum of the first datasize elements, accumulated front to back. */
+  double total = 0;
+  int i = 0;
+  while (i < datasize)
+    total += data[i++];
+  return total;
+}
+
+int main(int argc, char *argv[]) {
+  /* FFT Poisson solver loop on 8 MPI ranks; rank 0 allocates the device buffers and runs the FFTs. */
+  /* mpi init */
+  int rank, nprocs;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  if (nprocs != 8) {
+    cout << "example was set to run with 8 processes" << endl << "exit..." << endl;
+    MPI_Finalize();  /* MPI was initialized above, so finalize it before leaving */
+    return 0;
+  }
+
+  /* set domain size */
+  int NG[3] = {64, 64, 32};
+  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+  int sizerho = NG[0] * NG[1] * NG[2];
+  int sizegreen = ng[0] * ng[1] * ng[2];
+  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;  /* NOTE(review): an R2C result usually holds n/2+1 values along the halved axis, which is more than this - confirm */
+  int id[3];
+
+  id[0] = 0;
+  id[1] = NL[1] * (rank % 4);
+  id[2] = NL[2] * (rank / 4);
+
+  /* print some messages about the example at the beginning */
+  if (rank == 0) {
+    cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+    cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
+    cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+    cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
+    int tmp[3];
+    for (int p = 1; p < nprocs; p++) {
+      MPI_Status mpistatus;
+      MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
+      cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
+    }
+  } else {
+    MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
+  }
+
+  /* dks init and create 2 streams */
+  int dkserr;
+  int streamGreens, streamFFT;
+  DKSBase base;// = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+  base.createStream(streamFFT);
+  if (rank == 0) {
+    base.createStream(streamGreens);
+    base.setupFFT(3, NG);
+  }
+
+  /* allocate memory and init rho field */
+  double *rho = new double[sizerho];
+  double *rho_out = new double[sizerho];
+  //double *green_out = new double[sizegreen];
+  initMirror(rho, NL[0], NL[1], NL[2]);
+
+  /*
+    allocate memory on device for 
+    - rho field
+    - rho FFT
+    - tmpgreen
+    - greens integral
+    - greens integral FFT
+  */
+  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+  if (rank == 0) {
+    tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+    rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+    grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+    rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+    grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+  } else {
+    grntr_ptr = NULL;
+    rho2_ptr = NULL;
+    grn_ptr = NULL;
+    rho2tr_ptr = NULL;
+    tmpgreen_ptr = NULL;
+  }
+
+  /* send and receive pointer to allocated memory on device */
+  if (rank == 0) {
+    for (int p = 1; p < nprocs; p++)
+      base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
+  } else {
+    rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  /* =================================================*/
+  /* =================================================*/
+  /* ===loop through FFT Poisson solver iterations====*/
+  /* =================================================*/
+  /* =================================================*/
+
+  double old_sum = 0;
+  double tmp_sum = 0;
+  for (int l = 0; l < 10000; l++) {
+    MPI_Barrier(MPI_COMM_WORLD);
+    /* on node 0, calculate tmpgreen on gpu */
+    int hr_m[3] = {1, 1, 1};
+    if (rank == 0)
+      base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
+			      hr_m[0], hr_m[1], hr_m[2], streamGreens);
+
+    /* calculate greens integral on gpu */
+    if (rank == 0)
+      base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens);
+
+    /* mirror the field */
+    if (rank == 0)
+      base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens);
+
+
+    /* get FFT of mirrored greens integral */
+    if (rank == 0) 
+      base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens);
+
+    /* transfer rho field to device */
+    base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* get FFT of rho field */
+    if (rank == 0) {
+      base.syncDevice();
+      base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
+    }
+
+    /* multiply both FFTs */
+    if (rank == 0)
+      base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* inverse fft and transfer data back */
+    /* 
+       multiple device syncs and mpi barriers are used to make sure data 
+       transfer is started when results are ready and program moves on 
+       only when data transfer is finished
+    */
+    if (rank == 0) {
+      base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+      //cout << "result: " << sumData(rho_out, sizerho) << endl;
+      if (l == 0) { 
+	old_sum = sumData(rho_out, sizerho);
+      } else {
+	tmp_sum = sumData(rho_out, sizerho);
+	if (old_sum != tmp_sum) {
+	  cout << "diff in iteration: " << l << endl;
+	}
+      }
+    } else {
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+
+  }
+  /* =================================================*/  
+  /* =================================================*/
+  /* =========end FFT Poisson solver test run=========*/
+  /* =================================================*/
+  /* =================================================*/
+
+
+
+  /* free memory on device */
+  if (rank == 0) {
+    base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+    base.freeMemory<double>(grn_ptr, sizerho);
+    base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+    base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+    MPI_Barrier(MPI_COMM_WORLD);
+    base.freeMemory<double>(rho2_ptr, sizerho);
+    cout << "Final sum: " << old_sum << endl;
+  } else {
+    base.closeHandle(rho2_ptr);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+
+  MPI_Finalize();
+  delete[] rho;      /* release the host fields */
+  delete[] rho_out;
+}
diff --git a/test/testFFTSolver_MIC.cpp b/test/testFFTSolver_MIC.cpp
new file mode 100644
index 0000000..29f84f0
--- /dev/null
+++ b/test/testFFTSolver_MIC.cpp
@@ -0,0 +1,319 @@
+#include <iostream>
+//#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+void printData3D(double* data, int N, int NI, const char *message = "") {
+	/* Print the optional message, then NI slices of N x N values as tab-separated rows. */
+	if (message[0] != '\0')
+		cout << message;
+
+	const double *cursor = data;
+	for (int slice = 0; slice < NI; slice++) {
+		for (int row = 0; row < N; row++) {
+			for (int col = 0; col < N; col++)
+				cout << *cursor++ << "\t";
+			cout << endl;
+		}
+		cout << endl;
+	}
+}
+
+void initData(double *data, int N) {
+	/* Write 1, 2, ... along k into the leading (N/4+1) x (N/2+1) x (N/2+1) corner of an N-strided cube. */
+	const int depth = N/4 + 1;
+	const int extent = N/2 + 1;
+	for (int i = 0; i < depth; i++)
+		for (int j = 0; j < extent; j++) {
+			const int start = i*N*N + j*N;
+			for (int k = 0; k < extent; k++) data[start + k] = k+1;
+		}
+}
+
+void initData2(double *data, int N) {
+	/* Fill the first N entries with the ramp 0, 1, ..., N-1. */
+	for (int pos = N - 1; pos >= 0; --pos) data[pos] = pos;
+}
+
+void initComplex( complex<double> *d, int N) {
+	/* Set each of the first N elements of d to 2 + 0i. */
+	const complex<double> two(2, 0);
+	int left = N;
+	while (left-- > 0)
+		d[left] = two;
+}
+
+void printComplex(complex<double> *d, int N) {
+	/* Print the first N values of d on a single line, each followed by a tab. */
+	int i = 0;
+	while (i < N)
+		cout << d[i++] << "\t";
+	cout << endl;
+}
+
+void initMirror(double *data, int n1, int n2, int n3) {
+	/* Number the cells of the low corner (index < n/2 + 1 on every axis) 1, 2, ... in memory order; zero the rest. */
+	int next = 1;
+	for (int i = 0; i < n3; i++) {
+		for (int j = 0; j < n2; j++) {
+			const bool outer = (i < n3/2 + 1) && (j < n2/2 + 1);
+			for (int k = 0; k < n1; k++) {
+				const bool inside = outer && (k < n1/2 + 1);
+				data[(i * n2 + j) * n1 + k] = inside ? next++ : 0;
+			}
+		}
+	}
+}
+
+void printDiv(int c) {
+	/* Print a divider line made of c dashes. */
+	for (int left = c; left > 0; --left)
+		cout << "-";
+	cout << endl;
+}
+
+void printMirror(double *data, int n1, int n2, int n3) {
+	/* Print a divider, then the n3 x n2 x n1 array as tab-separated rows with a blank line after each slice. */
+	printDiv(75);
+	for (int slice = 0; slice < n3; slice++) {
+		for (int row = 0; row < n2; row++) {
+			const int start = (slice * n2 + row) * n1;
+			for (int col = 0; col < n1; col++)
+				cout << data[start + col] << "\t";
+			cout << endl;
+		}
+		cout << endl;
+	}
+	cout << endl;
+}
+
+double sumData(double *data, int datasize) {
+	/* Return the sum of the first datasize elements, accumulated front to back. */
+	double total = 0;
+	int i = 0;
+	while (i < datasize)
+		total += data[i++];
+	return total;
+}
+
+int main(int argc, char *argv[]) {
+	/* Single-process variant of the FFT Poisson solver loop; the backend (OpenMP/-mic or Cuda/-gpu) is chosen at compile time. */
+	/* mpi init */
+	//int rank, nprocs;
+	//MPI_Init(&argc, &argv);
+	//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+	/*
+	   if (nprocs != 8) {
+	   cout << "example was set to run with 8 processes" << endl;
+	   cout << "exit..." << endl;
+	   return 0;
+	   }
+	   */
+
+	/* set domain size */
+	int NG[3] = {64, 64, 32};
+	int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+	int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+	int sizerho = NG[0] * NG[1] * NG[2];
+	int sizegreen = ng[0] * ng[1] * ng[2];
+	int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;  /* NOTE(review): an R2C result usually holds n/2+1 values along the halved axis, which is more than this - confirm */
+	int id[3];
+
+	//id[0] = 0;
+	//id[1] = NL[1] * (rank % 4);
+	//id[2] = NL[2] * (rank / 4);
+
+	/* print some messages about the example at the beginning */
+	cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+	//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
+	cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
+	//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
+	int tmp[3];
+	/*  for (int p = 1; p < nprocs; p++) {
+		MPI_Status mpistatus;
+		MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
+		cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
+		}*/
+	// } else {
+	//   MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
+	// }
+
+	/* dks init (the stream setup is disabled in this variant) */
+	int dkserr;
+	//int streamGreens, streamFFT;
+	/* select the backend at compile time; exactly one DKSBase object is declared either way */
+	DKSBase base;
+#if defined(DKS_MIC)
+	base.setAPI("OpenMP", 6);
+	base.setDevice("-mic", 4);
+#elif defined(DKS_CUDA)
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+#else
+#error "define DKS_MIC or DKS_CUDA to select the backend for testFFTSolver_MIC"
+#endif
+	base.initDevice();
+
+
+	//base.createStream(streamFFT);
+	//if (rank == 0) {
+	//  base.createStream(streamGreens);
+	base.setupFFT(3, NG);
+	//}
+
+	/* allocate memory and init rho field */
+	double *rho = new double[sizerho]();  /* zeroed: initMirror fills only the NL sub-volume, yet all sizerho values are uploaded */
+	double *rho_out = new double[sizerho]();
+	//double *green_out = new double[sizegreen];
+	initMirror(rho, NL[0], NL[1], NL[2]);
+
+	/*
+	   allocate memory on device for 
+	   - rho field
+	   - rho FFT
+	   - tmpgreen
+	   - greens integral
+	   - greens integral FFT
+	   */
+	void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+	// if (rank == 0) {
+	tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+	rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+	grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+	rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+	grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+	/* } else {
+	   grntr_ptr = NULL;
+	   rho2_ptr = NULL;
+	   grn_ptr = NULL;
+	   rho2tr_ptr = NULL;
+	   tmpgreen_ptr = NULL;
+	   }*/
+
+
+	/* send and receive pointer to allocated memory on device */
+	/*
+	   if (rank == 0) {
+	   for (int p = 1; p < nprocs; p++)
+	   base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
+	   } else {
+	   rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
+	   }
+	   MPI_Barrier(MPI_COMM_WORLD);
+	   */
+
+
+	/* =================================================*/
+	/* =================================================*/
+	/* ===loop through FFT Poisson solver iterations====*/
+	/* =================================================*/
+	/* =================================================*/
+
+	double old_sum = 0;
+	double tmp_sum = 0;
+	for (int l = 0; l < 100; l++) {
+		//MPI_Barrier(MPI_COMM_WORLD);
+		/* on node 0, calculate tmpgreen on gpu */
+		int hr_m[3] = {1, 1, 1};
+		//if (rank == 0)
+		base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
+				hr_m[0], hr_m[1], hr_m[2]);
+
+		/* calculate greens integral on gpu */
+		//if (rank == 0)
+		base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+
+		/* mirror the field */
+		//if (rank == 0)
+		base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+
+
+		/* get FFT of mirrored greens integral */
+		//if (rank == 0) 
+		base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+
+		/* transfer rho field to device */
+		//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
+		base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
+		//MPI_Barrier(MPI_COMM_WORLD);
+
+		/* get FFT of rho field */
+		//if (rank == 0) {
+		//base.syncDevice();
+		base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
+		//}
+
+		/* multiply both FFTs */
+		//if (rank == 0)
+		base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
+		//MPI_Barrier(MPI_COMM_WORLD);
+
+		/* inverse fft and transfer data back */
+		/* 
+		   multiple device syncs and mpi barriers are used to make sure data 
+		   transfer is started when results are ready and program moves on 
+		   only when data transfer is finished
+		   */
+		//if (rank == 0) {
+		base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+		//base.syncDevice();
+		//MPI_Barrier(MPI_COMM_WORLD);
+		//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+		base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
+		//MPI_Barrier(MPI_COMM_WORLD);
+		//base.syncDevice();
+		//MPI_Barrier(MPI_COMM_WORLD);
+		//cout << "result: " << sumData(rho_out, sizerho) << endl;
+		if (l == 0) { 
+			old_sum = sumData(rho_out, sizerho);
+		} else {
+			tmp_sum = sumData(rho_out, sizerho);
+			if (old_sum != tmp_sum) {
+				cout << "diff in iteration: " << l << endl;
+			}
+		}
+		/*} else {
+		  MPI_Barrier(MPI_COMM_WORLD);
+		  base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
+		  MPI_Barrier(MPI_COMM_WORLD);
+		  MPI_Barrier(MPI_COMM_WORLD);
+		  }
+		  */
+
+
+	}
+	/* =================================================*/
+	/* =================================================*/
+	/* =========end FFT Poisson solver test run=========*/
+	/* =================================================*/
+	/* =================================================*/
+
+
+
+	/* free memory on device */
+	//if (rank == 0) {
+	base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+	base.freeMemory<double>(grn_ptr, sizerho);
+	base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+	base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+	//MPI_Barrier(MPI_COMM_WORLD);
+	base.freeMemory<double>(rho2_ptr, sizerho);
+	cout << "Final sum: " << old_sum << endl;
+	/*} else {
+	  base.closeHandle(rho2_ptr);
+	  MPI_Barrier(MPI_COMM_WORLD);
+	  }*/
+
+	//MPI_Finalize();
+	delete[] rho;      /* release the host fields */
+	delete[] rho_out;
+}
diff --git a/test/testGather.cpp b/test/testGather.cpp
new file mode 100644
index 0000000..e0f8eaf
--- /dev/null
+++ b/test/testGather.cpp
@@ -0,0 +1,172 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+void printData3D(int* data, int N, const char *message = "") {
+  /* Print the optional message, then an N x N x N array as tab-separated rows with a blank line after each slice. */
+  if (message[0] != '\0')
+    cout << message;
+
+  for (int slice = 0; slice < N; slice++) {
+    for (int row = 0; row < N; row++) {
+      const int start = (slice*N + row)*N;
+      for (int col = 0; col < N; col++)
+        cout << data[start + col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
+  /* Print the optional message, then an nz x ny x nx array: one tab-separated line per row, a blank line per slice. */
+  if (message[0] != '\0')
+    cout << message;
+
+  for (int slice = 0; slice < nz; slice++) {
+    for (int row = 0; row < ny; row++) {
+      const int start = (slice*ny + row)*nx;
+      for (int col = 0; col < nx; col++)
+        cout << data[start + col] << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  /* Print the optional message, then nprocs lines of N tab-separated values each. */
+  if (message[0] != '\0')
+    cout << message;
+  for (int proc = 0; proc < nprocs; proc++) {
+    const int start = proc*N;
+    for (int col = 0; col < N; col++) cout << data[start + col] << "\t";
+    cout << endl;
+  }
+}
+
+void initData(int *data, int N, int rank) {
+  /* Set each of the first N entries to rank + 1. */
+  for (int pos = N - 1; pos >= 0; --pos) data[pos] = rank + 1;
+}
+
+int main(int argc, char *argv[]) {
+  /* Calls gather3DData and scatter3DData twice across the MPI ranks; rank 0 allocates the device buffer. */
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  int N_global[3] = {64, 64, 32};
+  int N_local[3] = {64, 32, 16};
+  int n = N_local[0] * N_local[1] * N_local[2];
+
+  int idx[4] = {0, 0, 0, 0};
+  int idy[4] = {0, 32, 0, 32};
+  int idz[4] = {0, 0, 16, 16};
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  /* remember which allocator produced hdata_in so that the matching release is used at the end */
+  int *hdata_in;
+  bool pinned = (base.allocateHostMemory(hdata_in, n) == DKS_SUCCESS);
+  if (!pinned) {
+    hdata_in = new int[n];
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, n, rank);
+
+  for (int i = 0; i < 2; i++) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (i == 1)
+      nvtxMarkA("start gather");
+
+    if (rank == 0) {
+
+      void *mem_ptr, *tmpgreen_ptr;
+
+      mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+
+      //call another kernel
+      int sizegreen = 33 * 33 * 17;
+      tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);	
+      nvtxMarkA("call green");
+      base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007);
+
+      nvtxMarkA("call gather");
+      base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, 
+			idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      //read and print data once for debug only
+      /*
+      if (i == 0 && nprocs*n < 257) {
+      int *hdata_out_all = new int[nprocs*n];
+      base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
+      printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]);
+      }
+      
+      else {
+	int *hout_data = new int[nprocs*n];
+	base.readData<int>(mem_ptr, hout_data, nprocs*n);
+	int sum = 0;
+	for (int s = 0; s < nprocs*n; s++)
+	  sum += hout_data[s];
+
+	cout << "Sum: " << sum << endl;
+      }
+      */
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("call scatter");
+      base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, 
+			 idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      base.freeMemory<int>(mem_ptr, n*nprocs);
+      base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+
+    } else {
+
+      nvtxMarkA("call gather");
+      base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, 
+			idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("call scatter");
+      base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, 
+			 idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD);
+    }
+
+    if (i == 1)
+      nvtxMarkA("end gather");
+  }
+  MPI_Barrier(MPI_COMM_WORLD);
+  if (pinned)
+    base.freeHostMemory(hdata_in, n);
+  else
+    delete[] hdata_in;  /* fallback buffer came from new[] */
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
diff --git a/test/testGatherAsync.cpp b/test/testGatherAsync.cpp
new file mode 100644
index 0000000..4fe35b5
--- /dev/null
+++ b/test/testGatherAsync.cpp
@@ -0,0 +1,144 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+void printData3D(int* data, int N, const char *message = "") {
+  // An empty caption is skipped; anything else is printed verbatim first.
+  if (message[0] != '\0')
+    cout << message;
+  // Walk the N*N*N cube in storage order: tab after each value, blank line per slab.
+  const int *cell = data;
+  for (int slab = 0; slab < N; ++slab) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  if (message[0] != '\0')  // caption is optional; "" means none
+    cout << message;
+  // One output line per process, holding that process' N tab-separated values.
+  for (int proc = 0; proc < nprocs; ++proc) {
+    const int *chunk = data + proc * N;
+    for (int e = 0; e < N; ++e) cout << chunk[e] << "\t";
+    cout << endl;
+  }
+}
+
+void initData(int *data, int N, int rank) {
+  const int tag = rank + 1;  // every element receives the caller's rank, 1-based
+  for (int left = N; left > 0; --left) *data++ = tag;
+}
+
+int main(int argc, char *argv[]) {
+  // Profiling run: on each of two passes rank 0 overlaps the async gather with a Greens kernel.
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  //mpi copy: every rank contributes n = 32*16*16 ints of the 32x32x32 global grid
+  int n = 32*16*16;
+  int N_global[3] = {32, 32, 32};
+  int N_local[3] = {32, 16, 16};
+  int idx[4] = {0, 0, 0, 0};    // offset tables have four entries each;
+  int idy[4] = {0, 0, 16, 16};  // NOTE(review): presumably one entry per rank, i.e. the
+  int idz[4] = {0, 16, 0, 16};  // test expects exactly 4 ranks - confirm in gather3DDataAsync
+
+  //greens kernel
+  int n1 = 33;
+  int n2 = 33;
+  int n3 = 17;
+  int sizegreen = n1*n2*n3;
+
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  int *hdata_in;
+  if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) {
+    hdata_in = new int[n];
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, n, rank);
+
+  int stream2;  // only created (and used) by rank 0
+  for (int i = 0; i < 2; i++) {
+
+    if (rank == 0) {
+      if (i == 0) {
+        cudaProfilerStart();
+        base.createStream(stream2);
+      }
+
+      nvtxMarkA("start gather");
+
+      void *mem_ptr, *green_ptr;
+      // The Greens buffer holds double values, so it is sized and freed as <double>, not <int>.
+      mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+      green_ptr = base.allocateMemory<double>(sizegreen, ierr);
+
+      nvtxMarkA("call gather");
+      MPI_Request request;
+      MPI_Status status;
+
+      base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, 
+			     idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, 
+			     request);
+
+      // The kernel is issued on stream2 before the gather request is waited on.
+      nvtxMarkA("call kernel");
+      base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1, 
+			      4.160715e-03, 4.474911e-03, 1.247311e-02, stream2);
+
+      MPI_Wait(&request, &status);
+
+
+      base.freeMemory<int>(mem_ptr, n*nprocs);
+      base.freeMemory<double>(green_ptr, sizegreen);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      nvtxMarkA("end gather");
+
+      if (i == 1) cudaProfilerStop();
+    } else {
+      // NOTE(review): this request is never waited on by the non-root ranks - confirm that is safe.
+      MPI_Request request;
+      base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local, 
+			     idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, 
+			     request);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+  }
+
+  base.freeHostMemory(hdata_in, n);
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
diff --git a/test/testGatherAsync2.cpp b/test/testGatherAsync2.cpp
new file mode 100644
index 0000000..a2ab21f
--- /dev/null
+++ b/test/testGatherAsync2.cpp
@@ -0,0 +1,205 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+
+void printData3D(int* data, int N, const char *message = "") {
+  // Caption first, unless it is the empty string.
+  if (message[0] != '\0')
+    cout << message;
+  // N slabs of N rows of N values, read consecutively from data; blank line after a slab.
+  const int *next = data;
+  for (int slab = 0; slab < N; ++slab) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *next++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") {
+  // Optional caption: nothing is printed for an empty string.
+  if (message[0] != '\0')
+    cout << message;
+  // x varies fastest, then y, then z: nx values per line, blank line after each z-plane.
+  const int *cell = data;
+  for (int z = 0; z < nz; ++z) {
+    for (int y = 0; y < ny; ++y) {
+      for (int x = 0; x < nx; ++x)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+
+void printData(int *data, int N, int nprocs, const char *message = "") {
+  // Caption (if any), then all nprocs*N values on one line, then an empty line.
+  if (message[0] != '\0')
+    cout << message;
+  const int total = nprocs * N;
+  for (int pos = 0; pos < total; ++pos)
+    cout << data[pos] << "\t";
+  cout << endl << endl;
+}
+
+void initData(int *data, int N, int rank) {
+  const int fill = rank + 1;  // ranks are stored 1-based so rank 0 is distinguishable from zeroed memory
+  for (int remaining = N; remaining > 0; --remaining) *data++ = fill;
+}
+
+int main(int argc, char *argv[]) {
+  // Gather/scatter round trip through device memory allocated by rank 0; each rank prints two sums.
+  int ierr;
+  int rank, nprocs;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  //cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+  int Ng[3] = {128, 128, 64};
+  int Nl[3] = {128, 64, 32};
+  int nglobal = Ng[0] * Ng[1] * Ng[2];
+  int nlocal = Nl[0] * Nl[1] * Nl[2];
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  int *hdata_in;
+  if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) {
+    hdata_in = new int[nlocal];
+    cout << "pinned allocation failed!" << endl;
+  }
+  initData(hdata_in, nlocal, rank);
+
+  int *hdata_out;
+  if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) {
+    hdata_out = new int[nlocal];
+    cout << "pinned allocation failed!" << endl;
+  }
+  memset(hdata_out, 0, nlocal * sizeof(int));  // defined contents even if the scatter writes nothing
+  //create streams for async execution
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+
+  if (rank == 0)
+    base.setupFFT(3, Ng);
+
+  for (int i = 0; i < 1; i++) {
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (i == 1)
+      nvtxMarkA("start gather");
+
+    if (rank == 0) {
+
+      int id[3] = {0, 0, 0};
+
+      void *mem_ptr, *tmpgreen_ptr, *comp_ptr;
+
+      //allocate memory on device
+      int sizegreen = 65 * 65 * 33;
+      int sizecomp = 65 * 128 * 64;
+      mem_ptr = base.allocateMemory<double>(nglobal, ierr);
+      tmpgreen_ptr = base.allocateMemory<double>(sizegreen, ierr);	
+      comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+
+      //send pointer to other processes
+      nvtxMarkA("call gather");
+      for (int j = 1; j < nprocs; j++)
+	base.sendPointer(mem_ptr, j, MPI_COMM_WORLD);
+
+      //call another kernel while data transfer is processing 
+      nvtxMarkA("call green");
+      base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2);
+
+      //write data to device
+      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
+
+      /* execute rcfft */
+      //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng);
+
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      //read data from device
+      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+      base.syncDevice();
+      MPI_Barrier(MPI_COMM_WORLD);
+
+
+      base.freeMemory<double>(mem_ptr, nglobal);
+      base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+      base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+
+    } else {
+
+      // Ranks 2 and 3 take the upper half in y, odd ranks the upper half in z.
+      void *mem_ptr;
+      int idy = 0;
+      int idz = 0;//Nl[2]*rank;
+      if (rank / 2 == 1) idy = Ng[1] / 2;
+      if (rank % 2 == 1) idz = Ng[2] / 2;
+      int id[3] = {0, idy, idz};
+
+      nvtxMarkA("call gather");
+      mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr);
+      base.gather3DDataAsync<int>(mem_ptr, hdata_in, Ng, Nl, id, stream1);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      base.scatter3DDataAsync<int>(mem_ptr, hdata_out, Ng, Nl, id);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      MPI_Barrier(MPI_COMM_WORLD);
+
+      base.closeHandle(mem_ptr);
+
+    }
+
+    int sum1 = 0;
+    for (int c = 0; c < nlocal; c++)
+      sum1 += hdata_in[c];
+
+    int sum2 = 0;
+    for (int c = 0; c < nlocal; c++)
+      sum2 += hdata_out[c];
+
+    cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl;
+
+
+    if (i == 1)
+      nvtxMarkA("end gather");
+
+  }
+
+  //printData(hdata_in, nlocal, 1);
+  MPI_Barrier(MPI_COMM_WORLD);
+  base.freeHostMemory(hdata_in, nlocal);
+  base.freeHostMemory(hdata_out, nlocal);
+
+  MPI_Finalize();
+  return 0;
+}
+
+
+
+
+
diff --git a/test/testGreens.cpp b/test/testGreens.cpp
new file mode 100644
index 0000000..8b554eb
--- /dev/null
+++ b/test/testGreens.cpp
@@ -0,0 +1,239 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+#include <complex>
+
+#include "DKSBase.h"
+#include "nvToolsExt.h"
+#include "cuda_profiler_api.h"
+#include "cuda_runtime.h"
+
+using namespace std;
+
+
+void printData3D(double* data, int N, int NI, const char *message = "") {
+  // An empty caption is skipped; anything else is printed verbatim first.
+  if (message[0] != '\0')
+    cout << message;
+  // NI slabs of N x N values in storage order: tab after each value, blank line per slab.
+  const double *cell = data;
+  for (int slab = 0; slab < NI; ++slab) {
+    for (int row = 0; row < N; ++row) {
+      for (int col = 0; col < N; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+}
+
+void initData(double *data, int N) {
+  // Fills only the low corner of the N*N*N cube: (N/4+1) slabs of (N/2+1) x (N/2+1)
+  // cells, each cell set to its 1-based position along the fastest axis.
+  const int slabs = N / 4 + 1, half = N / 2 + 1;
+  for (int s = 0; s < slabs; ++s)
+    for (int r = 0; r < half; ++r) {
+      double *row = data + (s * N + r) * N;
+      for (int c = 0; c < half; ++c) row[c] = c + 1;
+    }
+}
+
+void initData2(double *data, int N) {
+  // Ramp: the element at position p receives the value p (0, 1, 2, ...).
+  for (int p = 0; p < N; ++p) *data++ = p;
+}
+
+void initComplex( complex<double> *d, int N) {
+  // Every one of the N entries becomes the purely real value 2 + 0i.
+  const complex<double> two(2, 0);
+  for (int left = N; left > 0; --left) {
+    *d++ = two;
+  }
+}
+
+void printComplex(complex<double> *d, int N) {
+  // Emits the N values on one line, a tab after each, then ends the line.
+  const complex<double> *cur = d;
+  for (int left = N; left > 0; --left)
+    cout << *cur++ << "\t";
+  cout << endl;
+}
+
+void initMirror(double *data, int n1, int n2, int n3) {
+  // Cells whose three indices all lie in the lower half-plus-one of their axis get a
+  // running counter (1, 2, 3, ...) in storage order; every other cell is zeroed.
+  const int lim1 = n1 / 2 + 1, lim2 = n2 / 2 + 1, lim3 = n3 / 2 + 1;
+  int counter = 1;
+  double *cell = data;
+  for (int z = 0; z < n3; ++z)
+    for (int y = 0; y < n2; ++y)
+      for (int x = 0; x < n1; ++x, ++cell) {
+        const bool inside = z < lim3 && y < lim2 && x < lim1;
+        *cell = inside ? counter++ : 0;
+      }
+}
+
+void printDiv(int c) {
+  // Horizontal rule: c dashes (none when c <= 0) followed by a line break.
+  int left = c;
+  while (left > 0) { cout << "-"; --left; }
+  cout << endl;
+}
+
+void printMirror(double *data, int n1, int n2, int n3) {
+  // 75-dash rule, then n3 planes of n2 lines with n1 values each; blank line per plane and at the end.
+  printDiv(75);
+  const double *cell = data;
+  for (int plane = 0; plane < n3; ++plane) {
+    for (int row = 0; row < n2; ++row) {
+      for (int col = 0; col < n1; ++col)
+        cout << *cell++ << "\t";
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
+double sumData(double *data, int datasize) {
+  // Left-to-right sum of the first datasize elements; 0 when datasize <= 0.
+  double total = 0;
+  const double *cur = data;
+  for (int left = datasize; left > 0; --left)
+    total += *cur++;
+  return total;
+}
+
+
+
+int main(int argc, char *argv[]) {
+  // Runs the Greens integral / integration / mirror kernels on an 8x8x4 grid, then a complex multiply.
+  int ierr;
+
+  int N1 = 8;
+  int N2 = 8;
+  int N3 = 4;
+
+  int n1 = N1 / 2; 
+  int n2 = N2 / 2;
+  int n3 = N3 / 2;
+
+  int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1);
+  int sizerho = N1 * N2 * N3;
+
+  double *data_green; //= new double[sizegreen];
+  double *data_rho; //= new double[sizerho];
+
+  double hr_m0 = +4.0264984513873269e-04;
+  double hr_m1 = +4.3305596731911289e-04;
+  double hr_m2 = +8.3154085085560838e-04;
+
+  DKSBase base = DKSBase();
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+  cout << "ID stream1: " << stream1 << endl;
+  cout << "ID stream2: " << stream2 << endl;
+
+  void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2;
+
+  mem_green1 = base.allocateMemory<double>(sizegreen, ierr);
+  mem_green2 = base.allocateMemory<double>(sizegreen, ierr);
+  mem_rho1 = base.allocateMemory<double>(sizerho, ierr);
+  mem_rho2 = base.allocateMemory<double>(sizerho, ierr);
+
+  printDiv(50);
+
+  data_green = new double[sizegreen];
+  data_rho = new double[sizerho];
+
+  base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1, 
+			  hr_m0, hr_m1, hr_m2, stream1);
+  base.readData<double>(mem_green1, data_green, sizegreen);
+  cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
+  cout << scientific << setprecision(16);
+  for (int p = 0; p < 7; p++)
+    cout << data_green[p] << "\t";
+  cout << endl;
+  //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
+
+  base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1);
+  base.readData<double>(mem_rho1, data_rho, sizerho);
+  cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1);
+  base.readData<double>(mem_rho1, data_rho, sizerho);
+  cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  printDiv(50);
+
+  /*
+  base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1, 
+			  1, 1, 1, -2);
+  base.readData<double>(mem_green2, data_green, sizegreen);
+  cout << "Sum green: " << sumData(data_green, sizegreen) << endl;
+  //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1);
+
+  base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2);
+  base.readData<double>(mem_rho2, data_rho, sizerho);
+  cout << "Sum integral: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+
+  base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2);
+  base.readData<double>(mem_rho2, data_rho, sizerho);
+  cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl;
+  //printMirror(data_rho, N1, N2, N3);
+  */
+  printDiv(50);
+
+  base.freeMemory<double>(mem_green1, sizegreen);
+  base.freeMemory<double>(mem_green2, sizegreen);
+  base.freeMemory<double>(mem_rho1, sizerho);
+  base.freeMemory<double>(mem_rho2, sizerho);
+
+  delete [] data_green;
+  delete [] data_rho;
+
+  //test complex multiplication
+  int compsize = 300;
+  complex<double> *data1 = new complex<double>[compsize];
+  complex<double> *data2 = new complex<double>[compsize];
+  for (int i = 0; i < compsize; i++) {
+    data1[i] = complex<double>(i+1, i+2);
+    data2[i] = complex<double>(i+3, i+4);
+  }
+
+  for (int i = 0; i < 3; i++)
+    cout << data1[i] << "\t";
+  cout << endl;
+  for (int i = 0; i < 3; i++)
+    cout << data2[i] << "\t";
+  cout << endl;
+
+  void *ptr1, *ptr2;
+  ptr1 = base.allocateMemory< complex<double> >(compsize, ierr);
+  ptr2 = base.allocateMemory< complex<double> >(compsize, ierr);
+
+  base.writeData< complex<double> >(ptr1, data1, compsize);
+  base.writeData< complex<double> >(ptr2, data2, compsize);
+
+  base.callMultiplyComplexFields(ptr1, ptr2, compsize);
+  base.readData< complex<double> >(ptr1, data1, compsize);
+
+  for (int i = 0; i < 3; i++)
+    cout << data1[i] << "\t";
+  cout << endl;
+
+  base.freeMemory< complex<double> >(ptr1, compsize);
+  base.freeMemory< complex<double> >(ptr2, compsize);
+  delete [] data1;
+  delete [] data2;
+  return 0;
+}
diff --git a/test/testImageReconstruction.cpp b/test/testImageReconstruction.cpp
new file mode 100644
index 0000000..2dbb27d
--- /dev/null
+++ b/test/testImageReconstruction.cpp
@@ -0,0 +1,191 @@
+#include <iostream>
+#include <cstdlib>
+#include <sys/time.h>
+#include "DKSImageReconstruction.h"
+
+struct voxelPosition {
+  // One voxel's coordinates. main() uploads arrays of this struct to the device
+  // unchanged, so it stays a plain aggregate of three floats in x, y, z order.
+  float x, y, z;
+};
+
+void initImage(float *image, int size) {
+  // Each element gets one rand() draw divided by RAND_MAX in single precision, i.e. a value in [0, 1].
+  for (int left = size; left > 0; --left) *image++ = (float)rand() / RAND_MAX;
+}
+
+void initPosition(voxelPosition *voxel, int N) {
+  // Builds a regular N*N*N lattice by accumulation: on each axis the first voxel sits
+  // at 0 and every later one equals its predecessor on that axis plus 0.1.
+  // x follows the fastest-varying index, y the middle one, z the slowest.
+  // Each sum is formed in double (0.1 is a double literal) and then stored as float.
+
+  const int rowStride = N;       // index distance to the previous voxel along y
+  const int slabStride = N * N;  // index distance to the previous voxel along z
+
+  for (int zi = 0; zi < N; ++zi) {
+    for (int yi = 0; yi < N; ++yi) {
+      for (int xi = 0; xi < N; ++xi) {
+        const int at = zi * slabStride + yi * rowStride + xi;
+        voxelPosition &cur = voxel[at];
+
+        // Both ternary arms are double, so each float member is rounded exactly once.
+        cur.x = (xi == 0) ? 0.0 : voxel[at - 1].x + 0.1;
+        cur.y = (yi == 0) ? 0.0 : voxel[at - rowStride].y + 0.1;
+        cur.z = (zi == 0) ? 0.0 : voxel[at - slabStride].z + 0.1;
+      }
+    }
+  }
+}
+
+void printPosition(voxelPosition *voxel, int size) {
+  // Three lines of output: every x, then every y, then every z, tab after each value.
+  float voxelPosition::*const axes[3] = {
+    &voxelPosition::x, &voxelPosition::y, &voxelPosition::z
+  };
+  for (int a = 0; a < 3; ++a) {
+    for (int v = 0; v < size; ++v)
+      std::cout << voxel[v].*axes[a] << "\t";
+    std::cout << std::endl;
+  }
+}
+
+#define DIAMETER 2.0
+bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id)
+{
+  // True when voxel `id` lies strictly closer to the source than DIAMETER/2
+  // (Euclidean distance; every intermediate is stored as float).
+  const voxelPosition &candidate = image_tmp[id];
+  float sq_x = pow(candidate.x-source_temp.x,2);
+  float sq_y = pow(candidate.y-source_temp.y,2);
+  float sq_z = pow(candidate.z-source_temp.z,2);
+  float separation = sqrt(sq_x + sq_y + sq_z);
+
+  const bool within = separation < DIAMETER*0.5;
+  return within;
+}
+
+void calculate_source(float *image_space , voxelPosition *image_geometry, 
+		      voxelPosition source, int total_voxels, 
+		      float *average, float *std)
+{
+  // Host reference: gathers the image values of every voxel select_source() accepts
+  // for `source`, then stores their mean in *average and
+  // sqrt(sum((mean - v)^2) / n / (n - 1)) in *std, n being the accepted count.
+  // With n < 2 a division by zero occurs, so the outputs are not finite numbers.
+
+  // No more than total_voxels values can be accepted, so that is the scratch size;
+  // a fixed-capacity buffer would be overrun once more voxels than it holds match.
+  const int capacity = total_voxels > 0 ? total_voxels : 0;
+  float *select = new float[capacity];
+  int number_selected = 0;
+  for (int voxel_id = 0; voxel_id < total_voxels; voxel_id++) {
+    if ( select_source( image_geometry, source, voxel_id ) )
+      select[number_selected++] = image_space[voxel_id];
+  }
+
+  *average = 0.0;
+  *std = 0.0;
+
+  for (int j=0;j<number_selected;j++)
+    *average += select[j];
+  *average /= float(number_selected);
+
+  for (int j=0;j<number_selected;j++)
+    *std += pow(*average-select[j],2);
+  *std = sqrt(*std/number_selected/(number_selected-1));
+
+  delete[] select;
+}
+
+int main(int argc, char *argv[]) {
+  // Source statistics on an N^3 grid (N = argv[1], default 8): host reference first, then DKS.
+  int N = 8;
+  if (argc == 2)
+    N = atoi(argv[1]);
+
+  double ttotal;
+  struct timeval timeStart, timeEnd;
+
+  int total = N*N*N;
+  float *image = new float[total];
+  voxelPosition *geometry = new voxelPosition[total];
+
+  initImage(image, total);
+  initPosition(geometry, N);
+
+  voxelPosition source;
+  // Heap arrays: a variable-length stack array is non-standard C++ and overflows for big N.
+  float *avg = new float[total];
+  float *stdev = new float[total];
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < total; i++) {
+    source.x = geometry[i].x;
+    source.y = geometry[i].y;
+    source.z = geometry[i].z;
+    calculate_source(image , geometry, source, total, &avg[i], &stdev[i]);
+  }
+  gettimeofday(&timeEnd, NULL);
+  ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	     (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
+
+  float avgavg = 0;
+  float avgstdev = 0;
+  for (int i = 0; i < total; i++) {
+    avgavg += avg[i] / total;
+    avgstdev += stdev[i] / total;
+  }
+
+  // The last voxel of the first row, first slab and whole grid holds the largest x, y and z.
+  std::cout << "Total voxels: " << N*N*N << std::endl;
+  std::cout << "Dimensions [" << geometry[0].x << ":" << geometry[N-1].x << "]"
+	    << "[" << geometry[0].y << ":" << geometry[N*N-1].y << "]"
+	    << "[" << geometry[0].z << ":" << geometry[N*N*N-1].z << "]" << std::endl;
+  std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
+
+  void *image_space, *image_position, *source_position, *davg, *dstd;
+  int ierr;
+  DKSImageRecon base;
+  base.setAPI("Cuda", 4);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  image_space = base.allocateMemory<float>(total, ierr);
+  image_position = base.allocateMemory<voxelPosition>(total, ierr);
+  source_position = base.allocateMemory<voxelPosition>(total, ierr);
+  davg = base.allocateMemory<float>(total, ierr);
+  dstd = base.allocateMemory<float>(total, ierr);
+
+  base.writeData<float>(image_space, image, total);
+  base.writeData<voxelPosition>(image_position, geometry, total);
+  base.writeData<voxelPosition>(source_position, geometry, total);
+
+  gettimeofday(&timeStart, NULL);
+  base.callCalculateSource(image_space, image_position, source_position, 
+			   davg, dstd, DIAMETER, total, total);
+  base.readData<float>(davg, avg, total);
+  base.readData<float>(dstd, stdev, total);
+  gettimeofday(&timeEnd, NULL);
+  ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	     (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
+
+  base.freeMemory<float>(image_space, total);
+  base.freeMemory<voxelPosition>(image_position, total);
+  base.freeMemory<voxelPosition>(source_position, total);
+  base.freeMemory<float>(dstd, total);
+  base.freeMemory<float>(davg, total);
+
+  avgavg = 0;
+  avgstdev = 0;
+  for (int i = 0; i < total; i++) {
+    avgavg += avg[i] / total;
+    avgstdev += stdev[i] / total;
+  }
+  std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl;
+  delete[] image;
+  delete[] geometry;
+  delete[] avg;
+  delete[] stdev;
+  return N;  // NOTE(review): exit status is the grid size, not 0 - confirm this is intended
+}
diff --git a/test/testMIC.cpp b/test/testMIC.cpp
new file mode 100644
index 0000000..354e9e4
--- /dev/null
+++ b/test/testMIC.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include "DKSBase.h"
+
+using namespace std;
+
+int main() {
+  // Round trip on the OpenMP backend: upload two arrays, read them back, print them.
+  DKSBase base;
+  base.setAPI("OpenMP", 6);
+  base.initDevice();
+
+  //init data
+  int ierr;
+  int N = 8;
+  double *in_data = new double[N];
+  double *in_data2 = new double[N];
+  // Value-initialised so a failed read-back shows zeros rather than indeterminate memory.
+  double *out_data = new double[N]();
+  double *out_data2 = new double[N]();
+  for (int i = 0; i < N; i++) {
+    in_data[i] = i;
+    in_data2[i] = i*i;
+  }
+
+  //test memory allocation, write and read operations
+  void *d_ptr, *d2_ptr;
+  d_ptr = base.allocateMemory<double>(N, ierr);
+  d2_ptr = base.allocateMemory<double>(N, ierr);
+
+  base.writeData<double>(d_ptr, in_data, N);
+  base.writeData<double>(d2_ptr, in_data2, N);
+
+  base.readData<double>(d_ptr, out_data, N);
+  base.readData<double>(d2_ptr, out_data2, N);
+  base.freeMemory<double>(d_ptr, N);
+  base.freeMemory<double>(d2_ptr, N);
+
+  //print results
+  for (int i = 0; i < N; i++)
+    cout << out_data[i] << "\t";
+  cout << endl;	
+  for (int i = 0; i < N; i++)
+    cout << out_data2[i] << "\t";
+  cout << endl;	
+
+  delete[] in_data;
+  delete[] in_data2;
+  delete[] out_data;
+  delete[] out_data2;
+  return 0;
+}
diff --git a/test/testMICOpenCL.cpp b/test/testMICOpenCL.cpp
new file mode 100644
index 0000000..110d797
--- /dev/null
+++ b/test/testMICOpenCL.cpp
@@ -0,0 +1,94 @@
+#include <iostream>
+#include <cstdlib>
+#include "DKSBase.h"
+#include "Utility/TimeStamp.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// "prog API DEVICE" uses both arguments, "prog API" defaults the device to "-gpu",
+	// and any other argument count falls back to OpenCL on "-gpu".
+	const bool have_api = (argc == 2 || argc == 3);
+	const char *api_arg = have_api ? argv[1] : "OpenCL";
+	const char *device_arg = (argc == 3) ? argv[2] : "-gpu";
+
+	// Buffers are sized from the chosen strings plus the terminating NUL; a fixed
+	// 4-byte buffer cannot even hold the default "-gpu".
+	char *api_name = new char[strlen(api_arg) + 1];
+	char *device_name = new char[strlen(device_arg) + 1];
+	strcpy(api_name, api_arg);
+	strcpy(device_name, device_arg);
+
+
+
+	cout << "Use api: " << api_name << endl;
+	cout << "Use device: " << device_name << endl;
+
+
+	int ierr;
+	int N = 10000;
+	double *data = new double[N];
+	double *data_out = new double[N];
+	double *data_out2 = new double[N];
+	
+	for (int i = 0; i < N; i++) {
+		data[i] = i;
+	}
+	
+	//init dks base class, set API to opencl and init connection with OpenCL device
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	base.setDevice(device_name, strlen(device_name));
+	base.initDevice();
+	
+	//data ptr
+	void *data_ptr, *data_ptr2;
+	
+	//allocate memory
+	data_ptr = base.allocateMemory<double>(N, ierr);
+	data_ptr2 = base.allocateMemory<double>(N, ierr);
+	
+	//write data to memory and fill data on device
+	base.writeData<double>(data_ptr, data, N);
+	base.writeData<double>(data_ptr2, data, N);
+	//base.callNt<double>(data_ptr2, data_ptr, 6, N, 1, 0);
+	
+	//calc sum
+	base.callSum<double>(data_ptr2, data_ptr2, N);
+	
+	//base.callSum<double>(data_ptr, data_ptr, N);
+	
+	//chi^2
+	//base.callChi2<double>(data_ptr, data_ptr, data_ptr, N);
+	//base.callChi2<double>(data_ptr2, data_ptr2, data_ptr2, N);
+	
+	//read data
+	base.readData<double>(data_ptr, data_out, N);
+	base.readData<double>(data_ptr2, data_out2, N);
+	
+	//base.oclEventInfo();
+	
+	//free memory
+	base.freeMemory<double>(data_ptr, N);
+	base.freeMemory<double>(data_ptr2, N);
+
+	/*
+	for (int i = 0; i < N; i++) {
+		cout << data[i] << "\t";
+	}
+	cout << endl << endl;
+	for (int i = 0; i < N; i++) {
+		cout << data_out[i] << "\t";
+	}
+	cout << endl << endl;
+	for (int i = 0; i < N; i++) {
+		cout << data_out2[i] << "\t";
+	}
+	cout << endl;
+	*/
+	delete[] data;
+	delete[] data_out;
+	delete[] data_out2;
+	delete[] api_name;
+	delete[] device_name;
+	return 0;
+}
\ No newline at end of file
diff --git a/test/testMICPush.cpp b/test/testMICPush.cpp
new file mode 100644
index 0000000..a2f7d2a
--- /dev/null
+++ b/test/testMICPush.cpp
@@ -0,0 +1,68 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+// Three doubles named x, y and z. main() allocates its R and P arrays as Part records
+// and transfers R to and from the device in this layout.
+struct Part {
+  double x, y, z;
+};
+
+void initData(Part *data, int N) {
+  for (int i = 0; i < N; i++) {  // N records, each coordinate one draw scaled into [0, 1]
+    data[i].x = (double)rand() / RAND_MAX;  // double division: int / int would yield 0
+    data[i].y = (double)rand() / RAND_MAX;
+    data[i].z = (double)rand() / RAND_MAX;
+  }
+}
+
+int main() {
+  // Pushes N particles five times on the "-mic" OpenMP device, re-uploading R before each push.
+  int ierr;
+  int N = 100000;
+
+  //__declspec(align(64)) Part *R = new Part[N];
+  //__declspec(align(64)) Part *P = new Part[N];
+  Part *R = new Part[N];
+  Part *P = new Part[N];
+
+  initData(R, N);
+  initData(P, N);
+
+  DKSBase dksbase;
+  dksbase.setAPI("OpenMP", 6);
+  dksbase.setDevice("-mic", 4);
+  dksbase.initDevice();
+
+  void *r_ptr, *p_ptr, *dt_ptr;
+  r_ptr = dksbase.allocateMemory<Part>(N, ierr);
+  p_ptr = dksbase.allocateMemory<Part>(N, ierr);
+  dt_ptr = dksbase.allocateMemory<double>(N, ierr);
+
+  dksbase.writeData<Part>(r_ptr, R, N);
+  dksbase.writeData<Part>(p_ptr, P, N);  // the push reads p_ptr, so P has to be on the device too
+  cout << "====================START PUSH====================" << endl;
+
+  for (int i = 0; i < 5; i++) {
+    //write r to device
+    dksbase.writeData<Part>(r_ptr, R, N);
+    //calc push
+    dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr,
+				      0.001, 1, false, NULL);
+    //read R from device
+    dksbase.readDataAsync<Part> (r_ptr, R, N, NULL);
+  }
+
+  cout << "====================END PUSH====================" << endl;
+
+  // NOTE(review): R and P are left allocated; the last readDataAsync into R is not
+  // explicitly waited on in this function - confirm before freeing them here.
+  dksbase.freeMemory<Part>(r_ptr, N);
+  dksbase.freeMemory<Part>(p_ptr, N);
+  dksbase.freeMemory<double>(dt_ptr, N);
+
+  return 0;
+}
diff --git a/test/testMPI.cpp b/test/testMPI.cpp
new file mode 100644
index 0000000..aef3cd6
--- /dev/null
+++ b/test/testMPI.cpp
@@ -0,0 +1,89 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(int *data, int N, int nprocs, const char *message = "") {
+	if (message[0] != '\0')  // caption is optional; "" means none
+		cout << message;
+	// One output line per process, holding that process' N tab-separated values.
+	for (int proc = 0; proc < nprocs; ++proc) {
+		const int *chunk = data + proc * N;
+		for (int e = 0; e < N; ++e) cout << chunk[e] << "\t";
+		cout << endl;
+	}
+}
+
+void initData(int *data, int N, int rank) {
+	const int tag = rank + 1;  // all N elements carry the 1-based rank
+	for (int left = N; left > 0; --left) *data++ = tag;
+}
+
+int main(int argc, char *argv[]) {
+	// Gathers n ints per rank into a device buffer on rank 0 and scatters them back out.
+	int ierr;
+	int rank, nprocs;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+	cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+	int n = 8;
+	// The gather and scatter hand the device pointer straight to MPI, so this test
+	// presumably relies on a CUDA-aware MPI build - TODO confirm.
+
+	int *hdata_in = new int[n];
+	int *hdata_out = new int[n];
+	initData(hdata_in, n, rank);
+	cout << "In data for process " << rank+1 << ":\t";
+	printData(hdata_in, n, 1);
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+
+	if (rank == 0) {
+
+		int *hdata_out_all = new int[nprocs*n];
+		void* mem_ptr;
+		mem_ptr = base.allocateMemory<int>(nprocs*n, ierr);
+
+		MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		base.readData<int>(mem_ptr, hdata_out_all, n*nprocs);
+
+		MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		base.freeMemory<int>(mem_ptr, n*nprocs);
+		printData(hdata_out_all, n, nprocs, "Out data 1:\n");
+		delete[] hdata_out_all;
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);  // show what the scatter delivered, not the input
+	} else {
+		// The root-only buffer arguments are ignored here; pass a well-typed count and datatype.
+		MPI_Gather(hdata_in, n, MPI_INT, NULL, 0, MPI_INT, 0, MPI_COMM_WORLD);
+
+		MPI_Scatter(NULL, 0, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD);
+
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+	}
+
+	delete[] hdata_in;
+	delete[] hdata_out;
+	MPI_Finalize();
+	return 0;
+}
+
+
+
+
+
diff --git a/test/testMPIFFT.cpp b/test/testMPIFFT.cpp
new file mode 100644
index 0000000..69512ff
--- /dev/null
+++ b/test/testMPIFFT.cpp
@@ -0,0 +1,91 @@
+#include <iostream>
+#include <mpi.h>
+#include <string.h>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(complex<double> *data, int N, int nprocs, const char *message = "") {
+	if (message[0] != '\0')  // caption is optional; "" means none
+		cout << message;
+	// One output line per process, holding that process' N tab-separated values.
+	for (int proc = 0; proc < nprocs; ++proc) {
+		const complex<double> *chunk = data + proc * N;
+		for (int e = 0; e < N; ++e) cout << chunk[e] << "\t";
+		cout << endl;
+	}
+}
+
+void initData(complex<double> *data, int N, int rank) {
+	const complex<double> tag((double)rank+1.0, 0.0);  // real part: 1-based rank, imaginary part: 0
+	for (int left = N; left > 0; --left) *data++ = tag;
+}
+
+int main(int argc, char *argv[]) {
+	// Gathers n complex values per rank on rank 0's device, runs a 1D FFT, scatters the result.
+	int ierr;
+	int rank, nprocs;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+	cout << "Rank " << (rank+1) << " from " << nprocs << endl;
+
+	int n = 8;
+
+	complex<double> *hdata_in = new complex<double>[n];
+	complex<double> *hdata_out = new complex<double>[n];
+	initData(hdata_in, n, rank);
+	cout << "In data for process " << rank+1 << ":\t";
+	printData(hdata_in, n, 1);
+
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+
+	if (rank == 0) {
+
+		complex<double> *hdata_out_all = new complex<double>[nprocs*n];
+		void* mem_ptr;
+		mem_ptr = base.allocateMemory< complex<double> >(nprocs*n, ierr);
+
+		MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+
+		int dimsize[3] = {n*nprocs, 1, 1};
+		base.callFFT(mem_ptr, 1, dimsize);
+		base.readData< complex<double> >(mem_ptr, hdata_out_all, n*nprocs);
+
+		MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		base.freeMemory< complex<double> >(mem_ptr, n*nprocs);
+		printData(hdata_out_all, n, nprocs, "Out data 1:\n");
+		delete[] hdata_out_all;
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+	} else {
+		// The root-only buffer arguments are ignored here; pass a well-typed count and datatype.
+		MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		MPI_Scatter(NULL, 0, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
+
+		cout << "Scatter data for proces: " << rank + 1 << ": \t";
+		printData(hdata_out, n, 1);
+
+	}
+
+	delete[] hdata_in;
+	delete[] hdata_out;
+	MPI_Finalize();
+	return 0;
+}
+
+
+
+
+
diff --git a/test/testMemObjects.cpp b/test/testMemObjects.cpp
new file mode 100644
index 0000000..5a5eaf0
--- /dev/null
+++ b/test/testMemObjects.cpp
@@ -0,0 +1,75 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// Allocates eight device buffers of 2 << n doubles (n = argv[1], default 10), fills and frees them.
+	int ierr,n, N;
+	if (argc > 1)
+		n = atoi(argv[1]);
+	else
+		n = 10;
+	if (n < 0 || n > 29) {  // 2 << n no longer fits an int above 29; a negative shift is undefined
+		cout << "n must be in [0, 29]" << endl;
+		return 1;
+	}
+
+	N = 2 << n;
+	cout << "Elements: " << N << endl;
+
+	double *data = new double[N];
+	for (int i = 0; i < N; i++)
+		data[i] = (double)i / N;
+
+	DKSBase base = DKSBase();
+	base.setAPI("OpenCL", 6);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	void *ptr1;
+	ptr1 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr1, data, N);
+
+	void *ptr2;
+	ptr2 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr2, data, N);
+
+	void *ptr3;
+	ptr3 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr3, data, N);
+
+	void *ptr4;
+	ptr4 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr4, data, N);
+
+	void *ptr5;
+	ptr5 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr5, data, N);
+
+	void *ptr6;
+	ptr6 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr6, data, N);
+
+	void *ptr7;
+	ptr7 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr7, data, N);
+
+	void *ptr8;
+	ptr8 = base.allocateMemory<double>(N, ierr);
+	ierr = base.writeData<double>(ptr8, data, N);
+
+	base.freeMemory<double>(ptr1, N);
+	base.freeMemory<double>(ptr2, N);
+	base.freeMemory<double>(ptr3, N);
+	base.freeMemory<double>(ptr4, N);
+	base.freeMemory<double>(ptr5, N);
+	base.freeMemory<double>(ptr6, N);
+	base.freeMemory<double>(ptr7, N);
+	base.freeMemory<double>(ptr8, N);
+	delete[] data;
+	return 0;
+}
+
diff --git a/test/testOffset.cpp b/test/testOffset.cpp
new file mode 100644
index 0000000..cf7e6ec
--- /dev/null
+++ b/test/testOffset.cpp
@@ -0,0 +1,73 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// Usage: testOffset [api [device]]; defaults are "OpenCL" and "-gpu".
+	// The names point at argv or at the local defaults instead of being
+	// copied into fixed 10-byte buffers, which long arguments overflowed.
+	char default_api[] = "OpenCL";
+	char default_device[] = "-gpu";
+	char *api_name = default_api;
+	char *device_name = default_device;
+	if (argc == 2) {
+		api_name = argv[1];
+	} else if (argc == 3) {
+		api_name = argv[1];
+		device_name = argv[2];
+	}
+	
+	
+	int ierr,n, N;
+
+	N = 8;
+	n = 4;
+
+	double *data_in = new double[N];
+	double *data_out_1 = new double[N];
+	double *data_out_2 = new double[N];
+	for (int i = 0; i < N; i++) {
+		data_in[i] = (double)i / N;
+		data_out_1[i] = 0.0;
+		data_out_2[i] = 0.0;
+	}
+
+	cout << "Run example on: " << api_name << " using " << device_name << endl;
+
+	DKSBase base = DKSBase();
+	base.setAPI(api_name, strlen(api_name));
+	base.setDevice(device_name, strlen(device_name));	// was strlen(api_name)
+	base.initDevice();
+	
+	void *ptr1;
+	ptr1 = base.allocateMemory<double>(N, ierr);
+	// Write the first n host values twice; last argument presumably an element offset.
+	ierr = base.writeData<double>(ptr1, data_in, n, 0);
+	ierr = base.writeData<double>(ptr1, data_in, n, 4);
+	// Read back the whole buffer, then n values using the last argument 2.
+	ierr = base.readData<double>(ptr1, data_out_1, N);
+	ierr = base.readData<double>(ptr1, data_out_2, n, 2);
+	
+	base.freeMemory<double>(ptr1, N);
+	
+	for (int i = 0; i < N; i++)
+		cout << data_in[i] << "\t";
+	cout << endl;
+	
+	for (int i = 0; i < N; i++)
+		cout << data_out_1[i] << "\t";
+	cout << endl;
+	
+	for (int i = 0; i < N; i++)
+		cout << data_out_2[i] << "\t";
+	cout << endl;
+
+	delete[] data_in;
+	delete[] data_out_1;
+	delete[] data_out_2;
+	return 0;
+}
+
diff --git a/test/testOffsetMPI.cpp b/test/testOffsetMPI.cpp
new file mode 100644
index 0000000..066cf63
--- /dev/null
+++ b/test/testOffsetMPI.cpp
@@ -0,0 +1,81 @@
+#include <mpi.h>
+#include <iostream>
+#include <cstdlib>
+
+
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// Rank 0 allocates a device buffer and sends its address to the other
+	// ranks; every rank then writes n values at offset rank*n.
+	int rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	cout << "Rank " << rank << " from " << size << endl;
+
+	int ierr, N, n;
+
+	N = 8;
+	n = N / 2;
+
+	double *data_in = new double[n];
+
+	for (int i = 0; i < n; i++)
+		data_in[i] = (double)rank + 1.0 + (double)i / n;
+
+	DKSBase base = DKSBase();
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+
+	if (rank == 0) {
+		//allocate device memory for size*N doubles
+		void *ptr1;
+		ptr1 = base.allocateMemory<double>(size*N, ierr);
+		cout << "Sent pointer: " << ptr1 << endl;
+		//NOTE(review): assumes other processes can use this device address - confirm
+		//send ptr to every other rank; each non-zero rank posts a matching receive
+		for (int dest = 1; dest < size; dest++)
+			MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, dest, 123, MPI_COMM_WORLD);
+		//write n values at offset rank*n (0 here) and wait for the other ranks
+		ierr = base.writeData<double>(ptr1, data_in, n, rank*n);
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		//read memory of size N from device
+		double *data_out = new double[N];
+		ierr = base.readData<double>(ptr1, data_out, N);
+
+		//free device memory
+		base.freeMemory<double>(ptr1, size*N);
+
+		//print results
+		for (int i = 0; i < n; i++)
+			cout << data_in[i] << "\t";
+		cout << endl;
+
+		for (int i = 0; i < N; i++)
+			cout << data_out[i] << "\t";
+		cout << endl;
+		delete[] data_out;
+	} else {
+		//receive device memory pointer; the status is not needed
+		void *ptr2;
+		MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		cout << "Received pointer: " << ptr2 << endl;
+		//write data with an offset
+		base.writeData<double>(ptr2, data_in, n, rank*n);
+
+		MPI_Barrier(MPI_COMM_WORLD);
+	}
+
+	MPI_Finalize();
+
+	delete[] data_in;
+	return 0;
+}
+
diff --git a/test/testPush.cpp b/test/testPush.cpp
new file mode 100644
index 0000000..d2f13b0
--- /dev/null
+++ b/test/testPush.cpp
@@ -0,0 +1,57 @@
+#include <iostream>
+#include <cstdlib>
+#include <vector>
+
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+using namespace std;
+// Fill data[0..N) with pseudo-random coordinates in [0, 1]. The cast makes
+// the division floating point; rand() / RAND_MAX in ints is 0 almost always.
+void initData(double3 *data, int N) {
+  for (int i = 0; i < N; i++) {
+    data[i].x = (double)rand() / RAND_MAX;
+    data[i].y = (double)rand() / RAND_MAX;
+    data[i].z = (double)rand() / RAND_MAX;
+  }
+}
+
+
+int main() {
+  // Run the tracker push call 100 times on N double3 entries of R and P.
+  int ierr;
+  int N = 1000000;
+  double3 *R = new double3[N];
+  double3 *P = new double3[N];
+
+  initData(R, N);
+  initData(P, N);
+
+  DKSBase dksbase;
+  dksbase.setAPI("Cuda", 4);
+  dksbase.setDevice("-gpu", 4);
+  dksbase.initDevice();
+
+  void *r_ptr, *p_ptr;
+  
+  r_ptr = dksbase.allocateMemory<double3>(N, ierr);
+  p_ptr = dksbase.allocateMemory<double3>(N, ierr);
+
+  dksbase.writeData<double3>(r_ptr, R, N);
+  dksbase.writeData<double3>(p_ptr, P, N);
+
+  for (int i = 0; i < 100; i++)
+    dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false);
+
+  dksbase.readData<double3>(r_ptr, R, N);
+  dksbase.readData<double3>(p_ptr, P, N);
+
+  dksbase.freeMemory<double3>(r_ptr, N);
+  dksbase.freeMemory<double3>(p_ptr, N);
+
+  delete[] R;
+  delete[] P;
+  return 0;
+}
diff --git a/test/testRCFFT.cpp b/test/testRCFFT.cpp
new file mode 100644
index 0000000..841c04a
--- /dev/null
+++ b/test/testRCFFT.cpp
@@ -0,0 +1,168 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData(double* &data, int N1, int N2);
+void printData(complex<double>* &data, int N1, int N2);
+void printData3DN4(complex<double>* &data, int N, int dim);
+void printData3DN4(double* &data, int N, int dim);
+
+
+void compareData(double* &data1, double* &data2, int N, int dim);
+
+
+
+int main(int argc, char *argv[]) {
+	// Real-to-complex FFT of an N1 x N2 grid. Usage: testRCFFT [N1 N2]
+	int N1 = 4;
+	int N2 = 4;
+	if (argc == 3) {
+		N1 = atoi(argv[1]);
+		N2 = atoi(argv[2]);
+	}
+	// atoi yields 0 for non-numeric input; reject sizes that make no grid.
+	if (N1 < 1 || N2 < 1) {
+		cout << "Grid sizes must be positive" << endl;
+		return 1;
+	}
+	int dimsize[3] = {N1, N2, 1};
+
+	cout << "Begin RC 3D FFT tests, grid = " <<  N1 << "\t" << N2 << endl;
+	int sizereal = N1*N2;
+	int sizecomp = N1*(N2/2+1);
+
+	double *cdata = new double[sizereal];
+	complex<double> *cfft = new complex<double>[sizecomp];
+	
+	for (int i = 0; i < N2; i++) {
+		for (int j = 0; j < N1; j++) {
+			cdata[i*N1 + j] = (double)(j) / N1;
+		}
+	}
+	
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI("Cuda", 4);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+	
+	void *real_ptr, *comp_ptr;
+	int ierr;
+	/* allocate memory on device */
+	real_ptr = base.allocateMemory<double>(sizereal, ierr);
+	comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
+	
+	/* write data to device */	
+	ierr = base.writeData<double>(real_ptr, cdata, sizereal);
+
+	/* execute fft */
+	base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize);
+	
+	/* read data from device */
+	base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
+	
+	/* free device memory */
+	base.freeMemory<double>(real_ptr, sizereal);
+	base.freeMemory< complex<double> >(comp_ptr, sizecomp);
+	
+	cout << "FFT complete" << endl;
+	/* print results */
+	printData(cdata, N1, N2);
+	printData(cfft, N1, N2);
+
+	delete[] cdata;
+	delete[] cfft;
+	return 0;
+}
+
+void printData(double* &data, int N1, int N2) {
+	// Print N2 rows of N1 values each (row-major), then one blank line.
+	for (int row = 0; row < N2; ++row) {
+		const double *line = data + row * N1;
+		for (int col = 0; col < N1; ++col)
+			cout << line[col] << " ";
+		cout << endl;
+	}
+	cout << endl;
+}
+
+void printData(complex<double>* &data, int N1, int N2) {
+	// Print N2/2+1 rows of N1 complex values; real and imaginary parts with
+	// magnitude below 0.00001 are shown as exactly zero.
+	for (int row = 0; row <= N2/2; ++row) {
+		for (int col = 0; col < N1; ++col) {
+			double re = data[row*N1 + col].real();
+			double im = data[row*N1 + col].imag();
+			if (re > -0.00001 && re < 0.00001) re = 0.0;
+			if (im > -0.00001 && im < 0.00001) im = 0.0;
+			cout << complex<double>(re, im) << " ";
+		}
+		cout << endl;
+	}
+	cout << endl;
+}
+
+void printData3DN4(complex<double>* &data, int N, int dim) {
+	// Dump an N x N x N complex array, one output line per middle index j.
+	// Each line lists "real; imag" for every (i, k) pair, tab separated.
+	// Parts with magnitude below 10e-5 are printed as 0. dim is unused.
+	const double eps = 10e-5;
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			const complex<double> *row = data + (i*N*N + j*N);
+			for (int k = 0; k < N; ++k) {
+				double re = row[k].real();
+				double im = row[k].imag();
+				if (re > -eps && re < eps)
+					re = 0;
+				if (im > -eps && im < eps)
+					im = 0;
+				cout << re << "; " << im << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+}
+
+void printData3DN4(double* &data, int N, int dim) {
+	// Dump an N x N x N real array, one output line per middle index j;
+	// magnitudes of at most 10e-5 are printed as 0. dim is unused.
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			for (int k = 0; k < N; ++k) {
+				const double value = data[i*N*N + j*N + k];
+				if (!(value > 10e-5 || value < -10e-5))
+					cout << 0 << "\t";
+				else
+					cout << value << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+}
+
+void compareData(double* &data1, double* &data2, int N, int dim) {
+	// Print the summed absolute element-wise difference of two arrays laid
+	// out as N (dim 1), N x N (dim 2) or N x N x N (dim > 2) values.
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	double sum = 0;
+	for (int i = 0; i < ni; ++i) {
+		for (int j = 0; j < nj; ++j) {
+			// Start of the row; equals i*N*N + j*N whenever i or j is non-zero.
+			const int base = i*ni*ni + j*nj;
+			for (int k = 0; k < N; ++k)
+				sum += fabs(data1[base + k] - data2[base + k]);
+		}
+	}
+	cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
+}
+
diff --git a/test/testStockFFT3D.cpp b/test/testStockFFT3D.cpp
new file mode 100644
index 0000000..036a7e2
--- /dev/null
+++ b/test/testStockFFT3D.cpp
@@ -0,0 +1,181 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void printData3DN4(complex<double>* &data, int N, int dim);
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
+
+int main(int argc, char *argv[]) {
+	// Usage: testStockFFT3D [n]; runs 3D FFTs on an N x N x N grid, N = 2^n.
+	int n = 2;
+	if (argc == 2)
+		n = atoi(argv[1]);
+	// N*N*N must fit in an int, and the shift needs a non-negative count.
+	if (n < 0 || n > 10) {
+		cout << "Invalid exponent " << n << ", expected 0..10" << endl;
+		return 1;
+	}
+	int N = 1 << n;	// exact power of two, no floating-point pow() involved
+
+	cout << "Begin DKS Base tests" << endl;
+	cout << "FFT size: " << N << endl;
+
+	int dimsize[3] = {N, N, N};
+	complex<double> *cdata = new complex<double>[N*N*N];
+	complex<double> *cfft = new complex<double>[N*N*N];
+	complex<double> *cfft2 = new complex<double>[N*N*N];
+	complex<double> *cfft3 = new complex<double>[N*N*N];
+
+	for (int i = 0; i < N; i++) {
+		for (int j = 0; j < N; j++) {
+			for (int k = 0; k < N; k++) {
+				// Every array uses the same flat index i*N*N + j*N + k.
+				cdata[i*N*N + j*N + k] = complex<double>(k, 0);
+				cfft[i*N*N + j*N + k] = complex<double>(0, 0);
+				cfft2[i*N*N + j*N + k] = complex<double>(0, 0);
+				cfft3[i*N*N + j*N + k] = complex<double>(0, 0);
+			}
+		}
+	}
+	
+	if (N == 4)
+		printData3DN4(cdata, N, 3);
+	
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	int ierr;
+	
+	timestamp_t t0, t1;
+	
+	/* stockham radix-2 out-of-place fft */
+	DKSBase base2;
+	base2.setAPI("OpenCL", 6);
+	base2.setDevice("-gpu", 4);
+	base2.initDevice();
+	
+	cout << endl;
+	void *src_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		src_ptr = base2.allocateMemory< complex<double> >(N*N*N, ierr);
+		base2.writeData< complex<double> >(src_ptr, cdata, N*N*N);
+		base2.callFFTStockham(src_ptr, 3, dimsize);
+		base2.readData< complex<double> >(src_ptr, cfft2, N*N*N);
+		base2.freeMemory< complex<double> >(src_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+
+	if (N == 4)
+		printData3DN4(cfft2, N, 3);
+	
+	cout << endl;
+	
+	/* CUDA cufft */
+	DKSBase base3;
+	base3.setAPI("Cuda", 4);
+	base3.setDevice("-gpu", 4);
+	base3.initDevice();
+	
+	cout << endl;
+	void *cuda_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		cuda_ptr = base3.allocateMemory< complex<double> >(N*N*N, ierr);
+		base3.writeData< complex<double> >(cuda_ptr, cdata, N*N*N);
+		base3.callFFT(cuda_ptr, 3, dimsize);
+		base3.readData< complex<double> >(cuda_ptr, cfft3, N*N*N);
+		base3.freeMemory< complex<double> >(cuda_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "Cuda FFT time: " << get_secs(t0, t1) << endl;
+	}
+		
+	if (N == 4)
+		printData3DN4(cfft3, N, 3);
+	
+	cout << endl;
+	
+	/* radix-2 in place fft */
+	DKSBase base;
+	base.setAPI("OpenCL", 6);
+	base.setDevice("-gpu", 4);
+	base.initDevice();
+	
+	cout << endl;
+	void *mem_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
+		base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
+		base.callFFT(mem_ptr, 3, dimsize);
+		base.readData< complex<double> >(mem_ptr, cfft, N*N*N);
+		base.freeMemory< complex<double> >(mem_ptr, N*N*N);
+		t1 = get_timestamp();
+		cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+	
+	if (N == 4)	
+		printData3DN4(cfft, N, 3);
+	
+	cout << endl;
+	
+	/* compare results */	
+	cout << endl;
+	
+	cout << "Radix 2 vs Stockham: ";
+	compareData(cfft, cfft2, N, 3);
+	
+	cout << "Radix 2 vs Cufft: ";
+	compareData(cfft, cfft3, N, 3);
+	
+	cout << "Stockham vs Cufft: ";
+	compareData(cfft2, cfft3, N, 3);	
+
+	/* free host memory */
+	delete[] cdata;
+	delete[] cfft;
+	delete[] cfft2;
+	delete[] cfft3;
+	return 0;
+}
+
+void printData3DN4(complex<double>* &data, int N, int dim) {
+	// Dump the real parts of an N x N x N array, one output line per middle
+	// index j; magnitudes of at most 10e-5 are printed as 0. dim is unused.
+	for (int j = 0; j < N; ++j) {
+		for (int i = 0; i < N; ++i) {
+			for (int k = 0; k < N; ++k) {
+				const double re = data[i*N*N + j*N + k].real();
+				if (!(re > 10e-5 || re < -10e-5))
+					cout << 0 << "\t";
+				else
+					cout << re << "\t";
+			}
+		}
+		cout << endl;
+	}
+	cout << endl;
+}
+
+void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
+	// Print the summed absolute difference of the real and imaginary parts
+	// of two arrays holding N (dim 1), N x N (dim 2) or N^3 (dim > 2) values.
+	const int ni = (dim > 2) ? N : 1;
+	const int nj = (dim > 1) ? N : 1;
+	double sum = 0;
+	for (int i = 0; i < ni; ++i) {
+		for (int j = 0; j < nj; ++j) {
+			const int base = i*ni*ni + j*nj;
+			for (int k = 0; k < N; ++k) {
+				sum += fabs(data1[base + k].real() - data2[base + k].real());
+				sum += fabs(data1[base + k].imag() - data2[base + k].imag());
+			}
+		}
+	}
+	cout << "CC <--> CC diff: " << sum << endl;
+}
\ No newline at end of file
diff --git a/test/testStockhamFFT.cpp b/test/testStockhamFFT.cpp
new file mode 100644
index 0000000..fdc1656
--- /dev/null
+++ b/test/testStockhamFFT.cpp
@@ -0,0 +1,107 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	// Usage: testStockhamFFT [api [device [n]]]; the FFT length is N = 2^n.
+	// The names point at argv or at local defaults rather than being copied
+	// into fixed 10-byte buffers, which long arguments would overflow.
+	int n = 2;
+	char default_api[] = "OpenCL";
+	char default_device[] = "-gpu";
+	char *api_name = default_api;
+	char *device_name = default_device;
+	if (argc >= 2 && argc <= 4)
+		api_name = argv[1];
+	if (argc == 3 || argc == 4)
+		device_name = argv[2];
+	if (argc == 4)
+		n = atoi(argv[3]);
+	// The shift below needs a non-negative count that fits in an int.
+	if (n < 0 || n > 30) {
+		cout << "Invalid exponent " << n << ", expected 0..30" << endl;
+		return 1;
+	}
+	int N = 1 << n;
+	cout << "Use api: " << api_name << endl;
+	cout << "Begin DKS Base tests" << endl;
+	cout << "FFT size: " << N << endl;
+	
+	int dimsize[3] = {N, N, N};
+	
+	complex<double> *cdata = new complex<double>[N];
+	complex<double> *cfft = new complex<double>[N];
+	complex<double> *cfft2 = new complex<double>[N];
+	complex<double> *cfftsrc = new complex<double>[N];
+	for (int i = 0; i < N; i++) {
+		cdata[i] = complex<double>((double)i / N, 0);
+		cfft[i] = complex<double>(0, 0);
+		cfft2[i] = complex<double>(0, 0);
+		cfftsrc[i] = complex<double>(0, 0);
+	}
+	
+	/* init DKSBase */
+	cout << "Init device and set function" << endl;
+	DKSBase base;
+	base.setAPI(api_name, strlen(api_name));
+	base.setDevice(device_name, strlen(device_name));	// was strlen(api_name)
+	base.initDevice();
+	
+	timestamp_t t0, t1;
+	
+	/* radix-2 in place fft */
+	void *mem_ptr;
+	int ierr;
+	
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		mem_ptr = base.allocateMemory< complex<double> >(N, ierr);
+		base.writeData< complex<double> >(mem_ptr, cdata, N);
+		base.callFFT(mem_ptr, 1, dimsize);
+		base.readData< complex<double> >(mem_ptr, cfft, N);
+		base.freeMemory< complex<double> >(mem_ptr, N);
+		t1 = get_timestamp();
+		cout << "in-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+	
+	cout << endl;
+	
+	/* stockham radix-2 out-of-place fft */
+	void *src_ptr;
+	for (int i = 0; i < 5; i++) {
+		t0 = get_timestamp();
+		src_ptr = base.allocateMemory< complex<double> >(N, ierr);
+		base.writeData< complex<double> >(src_ptr, cdata, N);
+		base.callFFTStockham(src_ptr, 1, dimsize);
+		base.readData< complex<double> >(src_ptr, cfft2, N);
+		base.freeMemory< complex<double> >(src_ptr, N);
+		t1 = get_timestamp();
+		cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl;
+	}
+	
+	double diff = 0;
+	for (int i = 0; i < N; i++) {
+		diff += fabs(cfft[i].real() - cfft2[i].real());
+		diff += fabs(cfft[i].imag() - cfft2[i].imag());
+	}
+	
+	cout << endl << "Difference: " << diff << endl;
+	
+	if (diff > 0.00001) {
+		for (int i = 0; i < 10 && i < N; i++) {	// never read past the N entries
+			cout << cfft[i] << "\t" << cfft2[i] << endl;
+		}
+	}
+
+	delete[] cdata;
+	delete[] cfft;
+	delete[] cfft2;
+	delete[] cfftsrc;
+	return 0;
+}
+
diff --git a/test/testTimeIntegration.cpp b/test/testTimeIntegration.cpp
new file mode 100644
index 0000000..80fec6b
--- /dev/null
+++ b/test/testTimeIntegration.cpp
@@ -0,0 +1,227 @@
+#include <iostream>
+#include <vector>
+#include <time.h>
+#include <sys/time.h>
+#include "DKSBase.h"
+
+#include <vector_types.h>
+#include "cuda_runtime.h"
+
+using namespace std;
+
+typedef struct {	// three doubles; element type of the r, p, x and ori arrays below
+  double x;
+  double y;
+  double z;
+} Vector;
+
+Vector initVector() {
+  // Every component starts at the same value.
+  const double start = 0.5;
+  Vector v;
+  v.x = v.y = v.z = start;
+
+  return v;
+}
+
+void initVectors(Vector *v, int N) {
+  // Reset each of the N entries to the value produced by initVector().
+  for (int idx = 0; idx < N; ++idx) v[idx] = initVector();
+}
+
+void initDouble(double *data, int N) {
+  // Set every one of the N entries to 0.005.
+  for (int idx = 0; idx < N; ++idx) data[idx] = 0.005;
+}
+
+void initLastSect(long *data, int N) {
+  // Set every one of the N entries to -1.
+  for (int idx = 0; idx < N; ++idx) data[idx] = -1;
+}
+
+void checkSum(Vector *v, int N) {
+  // Print the sum of all x, y and z components of v[0..N).
+  double total = 0;
+  for (int idx = 0; idx < N; ++idx)
+    total += v[idx].x + v[idx].y + v[idx].z;
+  std::cout << "checksum: " << total << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+  // Options: -mic (use OpenMP on the MIC), -npart <count>, -loop <count>.
+  int loop = 10;
+  int numpart = 10;
+  char api_name[10] = "Cuda";
+  char device_name[10] = "-gpu";
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);
+    // Only read a value when one actually follows the option.
+    const bool has_value = (i + 1 < argc);
+
+    if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } else if (arg == "-npart" && has_value) {
+      numpart = atoi(argv[++i]);
+    } else if (arg == "-loop" && has_value) {
+      loop = atoi(argv[++i]);
+    }
+  }
+
+  // Array sizes and the averaged timing below need positive counts.
+  if (numpart < 1 || loop < 1) {
+    cout << "-npart and -loop must be positive" << endl;
+    return 1;
+  }
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init p,r and dt arrays to test time integration
+  Vector *r = new Vector[numpart];
+  Vector *p = new Vector[numpart];
+  Vector *x = new Vector[numpart];
+  Vector *ori = new Vector[5];
+  initVectors(r, numpart);
+  initVectors(p, numpart);
+  initVectors(x, numpart);
+  initVectors(ori, 5);
+
+  double *dt = new double[numpart];
+  initDouble(dt, numpart);
+
+  long *ls = new long[numpart];
+  initLastSect(ls, numpart);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+  
+  base.registerHostMemory(r, numpart);
+  base.registerHostMemory(p, numpart);
+  base.registerHostMemory(x, numpart);
+  base.registerHostMemory(dt, numpart);
+  base.registerHostMemory(ls, numpart);
+
+  //***test parallelttrackerpush***//
+  void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr;
+
+  //allocate memory on the device
+  r_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  p_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  x_ptr = base.allocateMemory<Vector>(numpart, ierr);
+  dt_ptr = base.allocateMemory<double>(numpart, ierr);
+  ls_ptr = base.allocateMemory<long>(numpart, ierr);
+  ori_ptr = base.allocateMemory<Vector>(5, ierr);
+
+  //transfer data to device
+  base.writeData<Vector>(r_ptr, r, numpart);
+  base.writeData<Vector>(p_ptr, p, numpart);
+  base.writeData<Vector>(x_ptr, x, numpart);
+  base.writeData<Vector>(ori_ptr, ori, 5);
+
+  
+  //do some couple of integration loops before the timer is started
+  for (int i = 0; i < 5; i++) {
+    //calc push
+    base.callParallelTTrackerPush (r_ptr, p_ptr, numpart, dt_ptr,
+				      0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
+    
+    //write LastSection to device
+    base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
+
+    //calc push
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+					   dt_ptr, 0.05, 1, false, stream2);
+    //read x from device
+    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
+    
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+  
+
+  
+  //start the timing of integration
+  struct timeval timeStart, timeEnd;
+  std::cout << "start integration" << std::endl;
+  
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+
+    //calc push
+    base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync<Vector> (r_ptr, r, numpart, stream1);
+
+    //write LastSection to device
+    base.writeDataAsync<long> (ls_ptr, ls, numpart, stream2);
+
+    //calc push transform
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+    					   dt_ptr, 0.05, 1, false, stream2);
+    
+    //read x from device
+    base.readDataAsync<Vector>(x_ptr, x, numpart, stream2);
+
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "end integration" << std::endl;
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + 
+	       (timeEnd.tv_usec - timeStart.tv_usec));
+
+  std::cout << "Time for " << numpart << " integrations: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl;
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+
+
+
+  //free memory
+  base.freeMemory<Vector>(r_ptr, numpart);
+  base.freeMemory<Vector>(p_ptr, numpart);
+  base.freeMemory<Vector>(x_ptr, numpart);
+  base.freeMemory<Vector>(ori_ptr, 5);
+  base.freeMemory<double>(dt_ptr, numpart);
+  base.freeMemory<long>(ls_ptr, numpart);
+
+  //unregister host memory
+  base.unregisterHostMemory(r);
+  base.unregisterHostMemory(p);
+  base.unregisterHostMemory(x);
+  base.unregisterHostMemory(dt);
+  base.unregisterHostMemory(ls);  
+
+  //free host memory
+  delete[] r;
+  delete[] x;
+  delete[] p;
+  delete[] dt;
+  delete[] ls;
+  delete[] ori;
+  
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
diff --git a/test/testTranspose.cpp b/test/testTranspose.cpp
new file mode 100644
index 0000000..7d7b34c
--- /dev/null
+++ b/test/testTranspose.cpp
@@ -0,0 +1,76 @@
+#include <iostream>
+#include <cstdlib>
+#include <complex>
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+
+void initData(complex<double> *d, int N, int dim) {
+  // Fill the N^dim entries (dim 2 or 3; any other dim means N entries)
+  // with their own flat index as the real part and a zero imaginary part.
+  const int count = (dim == 3) ? N*N*N : ((dim == 2) ? N*N : N);
+  int idx = 0;
+  while (idx < count) {
+    d[idx] = complex<double>(idx, 0);
+    ++idx;
+  }
+}
+
+void printData(complex<double> *d, int N, int dim) {
+  // Print the real parts of an N (dim 1), N x N (dim 2) or N x N x N
+  // (dim > 2) array: tab-separated rows, a blank line after each slab and
+  // one more blank line at the end.
+  const int slabs = (dim > 2) ? N : 1;
+  const int rows = (dim > 1) ? N : 1;
+  for (int i = 0; i < slabs; ++i) {
+    for (int j = 0; j < rows; ++j) {
+      const complex<double> *row = d + (i*N*N + j*N);
+      for (int k = 0; k < N; ++k)
+        std::cout << row[k].real() << "\t";
+      std::cout << std::endl;
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+}
+
+int main(int argc, char *argv[]) {
+  // Run callTranspose on an N x N complex array and print the data before
+  // and after. Usage: testTranspose [N]   (N defaults to 4)
+  int N = 4;
+  if (argc > 1) N = atoi(argv[1]);
+
+  int dimN[3] = {N, N, 1};
+  int dim = 2;
+  int ndim = 1;
+  int count = dimN[0] * dimN[1] * dimN[2];
+
+  // Host input and output buffers; initData fills the input with 0, 1, 2, ...
+  std::complex<double> *host_in = new std::complex<double>[count];
+  std::complex<double> *host_out = new std::complex<double>[count];
+  initData(host_in, N, dim);
+  printData(host_in, N, dim);
+
+  DKSBase base;
+  base.setAPI("OpenCL", 6);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  // Upload, transpose on the device, download, release.
+  int ierr;
+  void *dev_ptr = base.allocateMemory< std::complex<double> >(count, ierr);
+  base.writeData< std::complex<double> >(dev_ptr, host_in, count);
+  base.callTranspose(dev_ptr, dimN, dim, ndim);
+  base.readData< std::complex<double> >(dev_ptr, host_out, count);
+  base.freeMemory< std::complex<double> >(dev_ptr, count);
+
+  printData(host_out, N, 2);
+
+  delete[] host_in;
+  delete[] host_out;
+
+  return 0;
+}