diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh
index 325016d..6aa502d 100644
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@@ -167,6 +167,40 @@ public:
     return DKS_SUCCESS;
   }		
 
+  /** Zero CUDA memory.
+   *  Set all the elements of the array on the device to zero.
+   */
+  template<typename T>
+  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
+    cudaError cerror;
+    cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error zeroing cuda memory!\n");
+      return DKS_ERROR;
+    }
+    
+    return DKS_SUCCESS;
+  }
+
+  /** Zero CUDA memory.
+   *  Set all the elements of the array on the device to zero.
+   */
+  template<typename T>
+  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
+    int dkserror = DKS_SUCCESS;
+    cudaError cerror;
+    if (streamId < cuda_numberOfStreams()) {
+	cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size, 
+				 cuda_getStream(streamId));
+	  
+	if (cerror != cudaSuccess)
+	  dkserror = DKS_ERROR;
+    } else
+      dkserror = DKS_ERROR;
+    
+    return dkserror;
+  }
+
   /** 
    * Info: write data to memory
    * Retrun: success or error code
diff --git a/src/CUDA/CudaGreensFunction.cu b/src/CUDA/CudaGreensFunction.cu
index 36e7463..5816acf 100644
--- a/src/CUDA/CudaGreensFunction.cu
+++ b/src/CUDA/CudaGreensFunction.cu
@@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
       tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
   
     tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
-  
+    
     double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
-  
+
     rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
   }
-
 }
 
 
@@ -273,7 +272,6 @@ __global__ void mirroredRhoField(double *rho2_m,
     id7 = rk * NI * NJ + rj * NI + i;
     id8 = rk * NI * NJ + rj * NI + ri;
     
-    
     double data = rho2_m[id1];
     if (i != 0) rho2_m[id2] = data;
     
@@ -389,8 +387,10 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
   
   int thread = 128;
   int block = (I * J * K / thread) + 1;
+  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
 
   if (streamId == -1) {
+    m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
     kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, 
 					    2*(I - 1), 2*(J - 1), I, J, K);
     return DKS_SUCCESS;
@@ -399,6 +399,7 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
   
   if (streamId < m_base->cuda_numberOfStreams()) {
     cudaStream_t cs = m_base->cuda_getStream(streamId);
+    m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
     kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, 
 						  2*(I - 1), 2*(J - 1), I, J, K);
     return DKS_SUCCESS;
diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp
index 290cb38..4de3700 100644
--- a/src/DKSBase.cpp
+++ b/src/DKSBase.cpp
@@ -114,6 +114,7 @@ DKSBase::DKSBase() {
   oclfft = new OpenCLFFT(oclbase);
   oclchi = new OpenCLChiSquare(oclbase);
   oclcol = new OpenCLCollimatorPhysics(oclbase);
+  oclgreens = new OpenCLGreensFunction(oclbase);
 #endif
 
 #ifdef DKS_MIC
@@ -149,6 +150,7 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
   oclfft = new OpenCLFFT(oclbase);
   oclchi = new OpenCLChiSquare(oclbase);
   oclcol = new OpenCLCollimatorPhysics(oclbase);
+  oclgreens = new OpenCLGreensFunction(oclbase);
 #endif
 
 #ifdef DKS_MIC
@@ -187,6 +189,7 @@ DKSBase::~DKSBase() {
   delete oclchi;
   delete oclcol;
   delete oclbase;
+  delete oclgreens;
 #endif
 
 
@@ -613,6 +616,9 @@ int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int
   if (apiCuda()) {
     return CUDA_SAFECALL(cgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, 
 						 hz_m0, hz_m1, hz_m2, streamId) );
+  } else if (apiOpenCL()) {
+    return OPENCL_SAFECALL(oclgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ, 
+						     hz_m0, hz_m1, hz_m2) );
   } else if (apiOpenMP()) {
     //BENI:
     return MIC_SAFECALL(micgreens->greensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
@@ -627,6 +633,8 @@ int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
 
   if (apiCuda())
     return CUDA_SAFECALL(cgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
+  else if (apiOpenCL())
+    return OPENCL_SAFECALL(oclgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
   else if (apiOpenMP())
     return MIC_SAFECALL(micgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
   
@@ -638,6 +646,8 @@ int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId
 
   if (apiCuda()) 
     return CUDA_SAFECALL(cgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
+  else if (apiOpenCL())
+    return OPENCL_SAFECALL(oclgreens->mirrorRhoField(mem_ptr, I, J, K, streamId));
   else if (apiOpenMP())
     return MIC_SAFECALL(micgreens->mirrorRhoField(mem_ptr, I, J, K));
   
@@ -649,6 +659,8 @@ int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size,
 
   if (apiCuda())
     return CUDA_SAFECALL(cgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
+  else if (apiOpenCL())
+    return OPENCL_SAFECALL(oclgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
   else if (apiOpenMP())
     return MIC_SAFECALL(micgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size));
 
diff --git a/src/DKSBase.h b/src/DKSBase.h
index dccb022..5fa09e2 100644
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@@ -32,6 +32,7 @@
 #include "OpenCL/OpenCLFFT.h"
 #include "OpenCL/OpenCLChiSquare.h"
 #include "OpenCL/OpenCLCollimatorPhysics.h"
+#include "OpenCL/OpenCLGreensFunction.h"
 #endif
 
 #ifdef DKS_CUDA
@@ -76,6 +77,7 @@ private:
   OpenCLFFT *oclfft;
   OpenCLChiSquare *oclchi;
   OpenCLCollimatorPhysics *oclcol;
+  OpenCLGreensFunction *oclgreens;
 #endif
 
 #ifdef DKS_CUDA
diff --git a/src/OpenCL/CMakeLists.txt b/src/OpenCL/CMakeLists.txt
index 19cedbe..b0a4792 100644
--- a/src/OpenCL/CMakeLists.txt
+++ b/src/OpenCL/CMakeLists.txt
@@ -4,6 +4,7 @@ SET (_SRCS
 	OpenCLChiSquare.cpp
 	OpenCLCollimatorPhysics.cpp
 	OpenCLChiSquareRuntime.cpp
+	OpenCLGreensFunction.cpp
   )
 
 SET (_HDRS
@@ -12,6 +13,7 @@ SET (_HDRS
 	OpenCLChiSquare.h
 	OpenCLCollimatorPhysics.h
 	OpenCLChiSquareRuntime.h
+	OpenCLGreensFunction.h
   )
 
 #INCLUDE_DIRECTORIES (
@@ -25,6 +27,7 @@ SET (_KERNELS
   OpenCLKernels/OpenCLTranspose.cl
   OpenCLKernels/OpenCLCollimatorPhysics.cl
   OpenCLKernels/OpenCLChiSquareRuntime.cl
+  OpenCLKernels/OpenCLGreensFunction.cl
   )
 
 ADD_SOURCES (${_SRCS})
diff --git a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp
index dcb46ab..e3d3898 100644
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@@ -428,7 +428,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
   int ierr;
 
   //create program from kernel
-  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
+  m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, 
+					NULL, &ierr);
   if (ierr != CL_SUCCESS) {
     DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
     return DKS_ERROR;
@@ -438,7 +439,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
   ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
 	
   /*
-    check if compileng kernel source succeded, if failed return error code
+    check if compiling kernel source succeded, if failed return error code
     if in debug mode get compilation info and print program build log witch
     will give indication what made the compilation fail
   */
@@ -447,7 +448,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
 		
     //get build status
     cl_build_status status;
-    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
+    clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, 
+			  sizeof(cl_build_status), &status, NULL);
 
     //get log size
     size_t log_size;
diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h
index ece3ac8..af74dea 100644
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@@ -30,24 +30,11 @@
 #include <CL/cl_ext.h>
 #endif
 
-
+#include "clRNG/clRNG.h"
+#include "clRNG/mrg31k3p.h"
 
 #include "../DKSDefinitions.h"
 
-/* struct for random number state */
-typedef struct {
-
-  double s10;
-  double s11;
-  double s12;
-  double s20;
-  double s21;
-  double s22;
-  double z;
-  bool gen;
-
-} RNDState;
-
 class OpenCLBase {
 
 private:
@@ -195,7 +182,7 @@ public:
     Return: return pointer to memory
   */
   cl_mem ocl_allocateMemory(size_t size, int &ierr);
-	
+
   /*
     Name: allocateMemory
     Info: allocate memory on device
@@ -203,6 +190,20 @@ public:
   */
   cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
 	
+  /** Zero OpenCL memory buffer
+   *  Set all the elemetns in the device array to zero
+   */
+  template <typename T>
+  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
+  
+    cl_int ierr;
+    ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset, 
+			       sizeof(T)*size, 0, nullptr, nullptr);
+    if (ierr != CL_SUCCESS)
+      return DKS_ERROR;
+    return DKS_SUCCESS;
+  }
+
   /*
     Name: writeData
     Info: write data to device memory (needs ptr to mem object)
diff --git a/src/OpenCL/OpenCLFFT.cpp b/src/OpenCL/OpenCLFFT.cpp
index 9e986d1..20a2fb4 100644
--- a/src/OpenCL/OpenCLFFT.cpp
+++ b/src/OpenCL/OpenCLFFT.cpp
@@ -117,15 +117,13 @@ int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool for
 */
 int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
 
-  std::cout << "execute RCFFT" << std::endl;
-
   int dkserr = DKS_SUCCESS;
   cl_int ierr;
   cl_mem real_in = (cl_mem)real_ptr;
   cl_mem comp_out = (cl_mem)comp_ptr;
 
   ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, 
-				0, NULL, NULL, &real_in, &comp_out, NULL);
+			       0, NULL, NULL, &real_in, &comp_out, NULL);
   
   if (ierr != OCL_SUCCESS) {
     dkserr = DKS_ERROR;
@@ -144,8 +142,6 @@ int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3],
 */
 int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
 
-  std::cout << "execute CRFFT" << std::endl;
-
   int dkserr = DKS_SUCCESS;
   cl_int ierr;
   cl_mem real_in = (cl_mem)real_ptr;
@@ -214,7 +210,13 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
 
   cl_int err;
 
-  clfftDim dim = CLFFT_3D;
+  clfftDim dim;
+  if (ndim == 1)
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else 
+    dim = CLFFT_3D;
   size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
 
   /* Create 3D fft plan*/
@@ -244,9 +246,20 @@ int OpenCLFFT::setupFFT(int ndim, int N[3]) {
 int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
   cl_int err;
   
-  clfftDim dim = CLFFT_3D;
+  clfftDim dim;
+  if (ndim == 1)
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else 
+    dim = CLFFT_3D;
+
   size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
 
+  size_t half = (size_t)N[0] / 2 + 1;
+  size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
+  size_t clOutStride[3] = {1, half, half * N[1]};
+
   /* Create 3D fft plan*/
   err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
 
@@ -254,6 +267,8 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
   err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
   err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
   err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
+  err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
+  err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
 
   /* Bake the plan */
   err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
@@ -269,9 +284,20 @@ int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
 int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
   cl_int err;
   
-  clfftDim dim = CLFFT_3D;
+  clfftDim dim;
+  if (ndim == 1)
+    dim = CLFFT_1D;
+  else if (ndim == 2)
+    dim = CLFFT_2D;
+  else
+    dim = CLFFT_3D;
+  
   size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
 
+  size_t half = (size_t)N[0] / 2 + 1;
+  size_t clInStride[3] = {1, half, half * N[1]};
+  size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
+
   /* Create 3D fft plan*/
   err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
 
@@ -279,6 +305,8 @@ int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
   err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
   err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
   err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
+  err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
+  err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
 
   /* Bake the plan */
   err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
diff --git a/src/OpenCL/OpenCLGreensFunction.cpp b/src/OpenCL/OpenCLGreensFunction.cpp
index fab16b4..3485b37 100644
--- a/src/OpenCL/OpenCLGreensFunction.cpp
+++ b/src/OpenCL/OpenCLGreensFunction.cpp
@@ -1,5 +1,5 @@
 #include "OpenCLGreensFunction.h"
-#define GREENS_KERNEL "OpenCLKernels/OpenCLGreensFunction.cl"
+#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
 
 OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
   m_base = base;
@@ -29,6 +29,8 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
 					 double hr_m0, double hr_m1, double hr_m2, 
 					 int streamId)
 {
+  int ierr = DKS_SUCCESS;
+
   //compile opencl program from source
   buildProgram();
 
@@ -42,26 +44,28 @@ int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, in
     work_items = (work_items / work_size + 1) * work_size;
 
   //create kernel
-  ierr = m_oclbase->ocl_createKernel("kernelTmpgreen");
+  ierr = m_base->ocl_createKernel("kernelTmpgreen");
 
   //set kernel parameters
-  m_base->setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
-  m_base->setKernelArg(1, sizeof(double), &hr_m0);
-  m_base->setKernelArg(2, sizeof(double), &hr_m1);
-  m_base->setKernelArg(3, sizeof(double), &hr_m2);
-  m_base->setKernelArg(4, sizeof(int), &I);
-  m_base->setKernelArg(5, sizeof(int), &J);
-  m_base->setKernelArg(6, sizeof(int), &K);
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
+  m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
+  m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
+  m_base->ocl_setKernelArg(4, sizeof(int), &I);
+  m_base->ocl_setKernelArg(5, sizeof(int), &J);
+  m_base->ocl_setKernelArg(6, sizeof(int), &K);
   
   //execute kernel
-  ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
 
   return ierr;
 }
 		
-int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
-						    int streamId)
+int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
+						    int K, int streamId)
 {
+  int ierr = DKS_SUCCESS;
+
   //compile opencl program from source
   buildProgram();
 
@@ -70,8 +74,6 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
   cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
   int NI = 2*(I - 1);
   int NJ = 2*(J - 1);
-  int NK = 2*(K - 1);
-
 
   //set the work item size
   size_t work_size = 128;
@@ -80,20 +82,22 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
     work_items = (work_items / work_size + 1) * work_size;
 
   //create kernel
-  ierr = m_oclbase->ocl_createKernel("kernelIntegration");
+  ierr = m_base->ocl_createKernel("kernelIntegration");
 
   //set kernel parameters
-  m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
-  m_base->setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
-  m_base->setKernelArg(2, sizeof(int), &I);
-  m_base->setKernelArg(3, sizeof(int), &J);
-  m_base->setKernelArg(4, sizeof(int), &K);
-  m_base->setKernelArg(5, sizeof(int), &NI);
-  m_base->setKernelArg(6, sizeof(int), &NJ);
-  m_base->setKernelArg(7, sizeof(int), &NK);
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
+  m_base->ocl_setKernelArg(2, sizeof(int), &NI);
+  m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
+  m_base->ocl_setKernelArg(4, sizeof(int), &I);
+  m_base->ocl_setKernelArg(5, sizeof(int), &J);
+  m_base->ocl_setKernelArg(6, sizeof(int), &K);
   
   //execute kernel
-  ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
+  double zero = 0.0;
+  int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
+  m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
 
   return ierr;
 
@@ -102,6 +106,8 @@ int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen
  
 int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) 
 {
+  int ierr = DKS_SUCCESS;
+
   //compile opencl program from source
   buildProgram();
 
@@ -114,6 +120,8 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
   int J2 = 2*J;
   int K2 = 2*K;
 
+  int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
+
   //set the work item size
   size_t work_size = 128;
   size_t work_items = NI * NJ * NK;
@@ -121,19 +129,20 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
     work_items = (work_items / work_size + 1) * work_size;
 
   //create kernel
-  ierr = m_oclbase->ocl_createKernel("kernelMirroredRhoField");
+  ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
 
   //set kernel parameters
-  m_base->setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
-  m_base->setKernelArg(1, sizeof(int), &I2);
-  m_base->setKernelArg(2, sizeof(int), &J2);
-  m_base->setKernelArg(3, sizeof(int), &K2);
-  m_base->setKernelArg(4, sizeof(int), &NI);
-  m_base->setKernelArg(5, sizeof(int), &NJ);
-  m_base->setKernelArg(6, sizeof(int), &NK);
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
+  m_base->ocl_setKernelArg(1, sizeof(int), &I2);
+  m_base->ocl_setKernelArg(2, sizeof(int), &J2);
+  m_base->ocl_setKernelArg(3, sizeof(int), &K2);
+  m_base->ocl_setKernelArg(4, sizeof(int), &NI);
+  m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
+  m_base->ocl_setKernelArg(6, sizeof(int), &NK);
+  m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
 
   //execute kernel
-  ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
 
   return ierr;
 }
@@ -141,4 +150,32 @@ int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int
 
 int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
 {
+    int ierr = DKS_SUCCESS;
+
+  //compile opencl program from source
+  buildProgram();
+
+  //cast the input data ptr to cl_mem
+  cl_mem mem_ptr1 = (cl_mem) ptr1;
+  cl_mem mem_ptr2 = (cl_mem) ptr2;
+
+  //set the work item size
+  size_t work_size = 128;
+  size_t work_items = size;
+  if (work_items % work_size > 0) 
+    work_items = (work_items / work_size + 1) * work_size;
+
+  //create kernel
+  ierr = m_base->ocl_createKernel("multiplyComplexFields");
+
+  //set kernel parameters
+  m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
+  m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
+  m_base->ocl_setKernelArg(2, sizeof(int), &size);
+
+  //execute kernel
+  ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
+
+  return ierr;
+
 }
diff --git a/src/OpenCL/OpenCLGreensFunction.h b/src/OpenCL/OpenCLGreensFunction.h
index 33b7502..3e90fba 100644
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@@ -60,4 +60,4 @@ public:
 };
 
 
-#endif H_OPENCL_GREENSFUNCTION
+#endif
diff --git a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
index fae4526..92fb81c 100644
--- a/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLGreensFunction.cl
@@ -81,28 +81,29 @@ __kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgre
       tmp6 = tmpgreen[ i    + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];  
   
     tmp7 = tmpgreen[ i    +  j    * NI_tmp +  k * NI_tmp * NJ_tmp];
-  
+    
     double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
-  
+
     rho2_m[i + j*ni +  k*ni*nj] = tmp_rho;
   }
-
 }
 
 /** miror rho-field */
-__kernel void mirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
+__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
   rho2_m[0] = rho2_m[NI*NJ];
 }
 
-__kernel void mirroredRhoField(__global double *rho2_m, 
-			       int NI, int NJ, int NK, 
-			       int NI_tmp, int NJ_tmp, int NK_tmp) {
+__kernel void kernelMirroredRhoField(__global double *rho2_m, 
+				     int NI, int NJ, int NK, 
+				     int NI_tmp, int NJ_tmp, int NK_tmp,
+				     int size) 
+{
 
   int tid = get_local_id(0);
   int id = get_global_id(0);
 
   if (id == 0)
-    rho2_m[0] = rho2_m[NI * NJ];
+   rho2_m[0] = rho2_m[NI * NJ];
 
   barrier(CLK_GLOBAL_MEM_FENCE);
 
@@ -127,27 +128,29 @@ __kernel void mirroredRhoField(__global double *rho2_m,
     id7 = rk * NI * NJ + rj * NI + i;
     id8 = rk * NI * NJ + rj * NI + ri;
     
+    double data = 0.0;
+    if (id1 < size)
+      data = rho2_m[id1];
     
-    double data = rho2_m[id1];
-    if (i != 0) rho2_m[id2] = data;
+    if (i != 0 && id2 < size) rho2_m[id2] = data;
     
-    if (j != 0) rho2_m[id3] = data;
+    if (j != 0 && id3 < size) rho2_m[id3] = data;
 
-    if (i != 0 && j != 0) rho2_m[id4] = data;
+    if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
     
-    if (k != 0) rho2_m[id5] = data;
+    if (k != 0 && id5 < size) rho2_m[id5] = data;
     
-    if (k !=  0 && i != 0) rho2_m[id6] = data;
+    if (k !=  0 && i != 0 && id6 < size) rho2_m[id6] = data;
     
-    if (k!= 0 && j != 0) rho2_m[id7] = data;
+    if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
     
-    if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;     
+    if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;     
   }
 
 }
 
 /** multiply complex fields */
-double2 CompelxMul(double2 a, double2 b) {
+double2 ComplexMul(double2 a, double2 b) {
   double2 c;
   c.x = a.x * b.x - a.y * b.y;
   c.y = a.x * b.y + a.y * b.x;
@@ -155,12 +158,13 @@ double2 CompelxMul(double2 a, double2 b) {
   return c;
 }
 
-
-__kernel void multiplyComplexFields_2(__global double2 *ptr1, __global double2 *ptr2, 
-				      int size) 
+__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2, 
+				    int size) 
 {
+  
   int idx = get_global_id(0);
 
   if (idx < size)
     ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
+  
 }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f8f7399..db9facd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -25,7 +25,7 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
 ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
 ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
 #ADD_EXECUTABLE(testPush testPush.cpp)
-#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
+ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
 #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
 #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
 
@@ -56,7 +56,7 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testPush dks)
-#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
+TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
 #TARGET_LINK_LIBRARIES(testIntegration dks)
 #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
 
diff --git a/test/testFFT3DRC.cpp b/test/testFFT3DRC.cpp
index c5a2c04..48d256c 100644
--- a/test/testFFT3DRC.cpp
+++ b/test/testFFT3DRC.cpp
@@ -1,6 +1,8 @@
 #include <iostream>
 #include <cstdlib>
 #include <complex>
+#include <fstream>
+#include <iomanip>
 
 #include "Utility/TimeStamp.h"
 #include "DKSBase.h"
@@ -8,14 +10,20 @@
 using namespace std;
 
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
-void initData(double *data, int dimsize[3]);
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, 
-		char *api_name, char *device_name);
+void initData(double *data, int dimsize[3], int dim);
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, 
+		char *api_name, char *device_name, char *file_name);
 void printHelp();
 
 void printData3DN4(complex<double>* &data, int N, int dim);
 void printData3DN4(double* &data, int N, int dim);
 
+double precision(double a) {
+  //if (a < 1e-10)
+  //  return 0.0;
+  //else
+    return a;
+}
 
 int main(int argc, char *argv[]) {
 
@@ -26,32 +34,29 @@ int main(int argc, char *argv[]) {
   int loop = 0;
   char *api_name = new char[10];
   char *device_name = new char[10];
+  char *file_name = new char[50];
 
-  if ( readParams(argc, argv, N1, N2, N3, loop, api_name, device_name) )
+  if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
     return 0;
 
   cout << "Use api: " << api_name << ", " << device_name << endl;
 
-  int dimsize[3] = {N3, N2, N1};
+  int dimsize[3] = {N1, N2, N3};
   int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
   int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
 
   double *rdata = new double[sizereal];
   double *outdata = new double[sizereal];
   complex<double> *cfft = new complex<double>[sizecomp];
-  initData(rdata, dimsize);
+  initData(rdata, dimsize, dim);
 
   /* init DKSBase */
   cout << "Init device and set function" << endl;
-DKSBase base;
+  DKSBase base;
   base.setAPI(api_name, strlen(api_name));
   base.setDevice(device_name, strlen(device_name));
   base.initDevice();
-  base.setupFFT(3, dimsize);
-
-  base.setupFFTRC(dim, dimsize);
-  /* setup backward fft (COMPLEX->REAL) */
-  base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
+  base.setupFFT(dim, dimsize);
 
   // allocate memory on device
   int ierr;
@@ -63,68 +68,59 @@ DKSBase base;
   // execute one run before starting the timers
   base.writeData<double>(real_ptr, rdata, sizereal);
   base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
+  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
   base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
   base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
   base.readData<double>(real_res_ptr, outdata, sizereal);
 
-  //timer for total loop time, FFT and IFFT calls
-  struct timeval timeStart, timeEnd;
-  struct timeval timeFFTStart[loop], timeFFTEnd[loop];
-  struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
-
-  gettimeofday(&timeStart, NULL);
-  for (int i=0; i<loop; ++i){
-
-    // write data to device
-    base.writeData<double>(real_ptr, rdata, sizereal);
-
-    // execute rcfft
-    gettimeofday(&timeFFTStart[i], NULL);
-    base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
-    gettimeofday(&timeFFTEnd[i], NULL);
-
-    // execute crfft
-    gettimeofday(&timeIFFTStart[i], NULL);
-    base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
-    gettimeofday(&timeIFFTEnd[i], NULL);
-
-    //normalize
-    base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
-
-    // read IFFT data from device
-    base.readData<double>(real_res_ptr, outdata, sizereal);
 
+  
+  ofstream myfile;
+  myfile.open(file_name);
+  myfile<< "in\tout\treal\timag\n";
+  for (int i = 0; i < sizereal; i++) {
+    //myfile << precision(rdata[i]) << "\t";
+    //myfile << precision(outdata[i]) << "\t";
+    if (i < sizecomp) {
+      myfile << precision(cfft[i].real()) << "\t";
+      myfile << precision(cfft[i].imag());
+    }
+    myfile << "\n";
   }
-  gettimeofday(&timeEnd, NULL);
+  myfile.close();
+  
 
+/*
+  if (dim == 2) {
+    for (int i = 0; i < N2; i++) {
+      for (int j = 0; j < N1; j++) {
+	cout << rdata[i*N1 + j] << " ";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+
+
+  if (dim == 2) {
+    for (int i = 0; i < N2; i++) {
+      for (int j = 0; j < N1 / 2 + 1; j++) {
+	cout << cfft[i*(N1 / 2 + 1)  + j] << " ";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+*/
   // free device memory
   base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
   base.freeMemory<double>(real_ptr, sizereal);
   base.freeMemory<double>(real_res_ptr, sizereal);
 
   // compare in and out data to see if we get back the same results
+  cout << "comp" << endl;
   compareData(rdata, outdata, N1, N2, N3, dim);
-
-  //calculate seconds for total time and fft times
-  double tfft = 0;
-  double tifft = 0;
-  double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + 
-		  (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
-
-  for (int i = 0; i < loop; i++) {
-    tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + 
-	      (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
-
-    tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + 
-	      (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
-  }
-
-  //print timing results
-  std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
-	    << "\nTotal time\t" << ttot <<  "s\tavg time\t"  << ttot / loop  << "s"
-	    << "\nFFT total\t"  << tfft <<  "s\tFFT avg \t"  << tfft / loop  << "s"
-	    << "\nIFFT total\t" << tifft << "s\tIFFT avg\t"  << tifft / loop << "s"
-	    << "\n\n";
+  cout << "done" << endl;
 
   return 0;
 }
@@ -132,10 +128,10 @@ DKSBase base;
 void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
   int id;
   double sum = 0;
-  for (int i = 0; i < NI; i++) {
+  for (int i = 0; i < NK; i++) {
     for (int j = 0; j < NJ; j++) {
-      for (int k = 0; k < NK; k++) {
-	id = k*NI*NJ + j*NI + i;
+      for (int k = 0; k < NI; k++) {
+	id = i*NI*NJ + j*NI + k;
 	sum += fabs(data1[id] - data2[id]);
       }
     }
@@ -143,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
   std::cout << "RC <--> CR diff: " << sum << std::endl;
 }
 
-void initData(double *data, int dimsize[3]) {
-  for (int i = 0; i < dimsize[2]; i++) {
+void initData(double *data, int dimsize[3], int dim) {
+  if (dim == 3) {
+    for (int i = 0; i < dimsize[2]; i++)
+      for (int j = 0; j < dimsize[1]; j++) 
+	for (int k = 0; k < dimsize[0]; k++) 
+	  data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
+  } else if (dim == 2) {
     for (int j = 0; j < dimsize[1]; j++) {
       for (int k = 0; k < dimsize[0]; k++) {
-	data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
+	data[j*dimsize[0] + k] = sin(k);
       }
     }
+  } else {
+    for (int k = 0; k < dimsize[0]; k++) 
+      data[k] = sin(k);
   }
 }
 
@@ -168,12 +172,17 @@ void printHelp() {
   std::cout << std::endl;
 }
 
-bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
-		char *api_name, char *device_name) 
+bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
+		char *api_name, char *device_name, char *file_name) 
 {
 
   for (int i = 1; i < argc; i++) {
 
+    if ( argv[i] == std::string("-dim")) {
+      dim = atoi(argv[i + 1]);
+      i++;
+    }
+
     if ( argv[i] == std::string("-grid") ) {
       N1 = atoi(argv[i + 1]);
       N2 = atoi(argv[i + 2]);
@@ -194,21 +203,25 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop,
     if (argv[i] == string("-cuda")) {
       strcpy(api_name, "Cuda");
       strcpy(device_name, "-gpu");
+      strcpy(file_name, "cuda_fft.dat");
     } 
 
     if (argv[i] == string("-opencl")) {
       strcpy(api_name, "OpenCL");
       strcpy(device_name, "-gpu");
+      strcpy(file_name, "opencl_fft.dat");
     } 
 
     if (argv[i] == string("-mic")) {
       strcpy(api_name, "OpenMP");
       strcpy(device_name, "-mic");
+      strcpy(file_name, "openmp_fft.dat");
     } 
 
     if (argv[i] == string("-cpu")) {
       strcpy(api_name, "OpenCL");
       strcpy(device_name, "-cpu");
+      strcpy(file_name, "opencl_cpu_fft.dat");
     }
   }
 
diff --git a/test/testFFTSolver_MIC.cpp b/test/testFFTSolver_MIC.cpp
index 29f84f0..4c9063b 100644
--- a/test/testFFTSolver_MIC.cpp
+++ b/test/testFFTSolver_MIC.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-//#include <mpi.h>
 #include <string.h>
 
 #include "DKSBase.h"
@@ -11,309 +10,265 @@ using namespace std;
 
 
 void printData3D(double* data, int N, int NI, const char *message = "") {
-	if (strcmp(message, "") != 0)
-		cout << message;
+  if (strcmp(message, "") != 0)
+    cout << message;
 
-	for (int i = 0; i < NI; i++) {
-		for (int j = 0; j < N; j++) {
-			for (int k = 0; k < N; k++) {
-				cout << data[i*N*N + j*N + k] << "\t";
-			}
-			cout << endl;
-		}
-		cout << endl;
-	}
+  for (int i = 0; i < NI; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < N; k++) {
+	cout << data[i*N*N + j*N + k] << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
 
 }
 
 void initData(double *data, int N) {
 
-	for (int i = 0; i < N/4 + 1; i++) {
-		for (int j = 0; j < N/2 + 1; j++) {
-			for (int k = 0; k < N/2 + 1; k++) {
-				data[i*N*N + j*N + k] = k+1;
-			}
-		}
-	}
+  for (int i = 0; i < N/4 + 1; i++) {
+    for (int j = 0; j < N/2 + 1; j++) {
+      for (int k = 0; k < N/2 + 1; k++) {
+	data[i*N*N + j*N + k] = k+1;
+      }
+    }
+  }
 }
 
 void initData2(double *data, int N) {
-	for (int i = 0; i < N; i++)
-		data[i] = i;
+  for (int i = 0; i < N; i++)
+    data[i] = i;
 }
 
 void initComplex( complex<double> *d, int N) {
 
-	for (int i = 0; i < N; i++) {
-		d[i] = complex<double>(2, 0);
-	}
+  for (int i = 0; i < N; i++) {
+    d[i] = complex<double>(2, 0);
+  }
 
 }
 
 void printComplex(complex<double> *d, int N) {
 
-	for (int i = 0; i < N; i++)
-		cout << d[i] << "\t";
-	cout << endl;
+  for (int i = 0; i < N; i++)
+    cout << d[i] << "\t";
+  cout << endl;
+
+}
+
+void printDouble(double *d, int N) {
+
+  for (int i = 0; i < N; i++)
+    cout << d[i] << ", ";
+  cout << endl;
 
 }
 
 void initMirror(double *data, int n1, int n2, int n3) {
-	int d = 1;
-	for (int i = 0; i < n3; i++) {
-		for (int j = 0; j < n2; j++) {
-			for (int k = 0; k < n1; k++) {
-				if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
-					data[i * n2 * n1 + j * n1 + k] = d++;
-				else
-					data[i * n2 * n1 + j * n1 + k] = 0;
-			}
-		}
-	}
+  int d = 1;
+  for (int i = 0; i < n3; i++) {
+    for (int j = 0; j < n2; j++) {
+      for (int k = 0; k < n1; k++) {
+	if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
+	  data[i * n2 * n1 + j * n1 + k] = d++;
+	else
+	  data[i * n2 * n1 + j * n1 + k] = 0;
+      }
+    }
+  }
 }
 
 void printDiv(int c) {
-	for (int i = 0; i < c; i++)
-		cout << "-";
-	cout << endl;
+  for (int i = 0; i < c; i++)
+    cout << "-";
+  cout << endl;
 
 }
 
 void printMirror(double *data, int n1, int n2, int n3) {
 
-	printDiv(75);
-	for (int i = 0; i < n3; i++) {
-		for (int j = 0; j < n2; j++) {
-			for (int k = 0; k < n1; k++) {
-				cout << data[i * n2 * n1 + j * n1 + k] << "\t";
-			}
-			cout << endl;
-		}
-		cout << endl;
-	}
-	cout << endl;
+  printDiv(75);
+  for (int i = 0; i < n3; i++) {
+    for (int j = 0; j < n2; j++) {
+      for (int k = 0; k < n1; k++) {
+	cout << data[i * n2 * n1 + j * n1 + k] << "\t";
+      }
+      cout << endl;
+    }
+    cout << endl;
+  }
+  cout << endl;
 }
 
 double sumData(double *data, int datasize) {
 
-	double sum = 0;
-	for (int i = 0; i < datasize; i++)
-		sum += data[i];
+  double sum = 0;
+  for (int i = 0; i < datasize; i++)
+    sum += data[i];
 
-	return sum;
+  return sum;
 }
 
 int main(int argc, char *argv[]) {
 
-	/* mpi init */
-	//int rank, nprocs;
-	//MPI_Init(&argc, &argv);
-	//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  char *api_name = new char[10];
+  char *device_name = new char[10];
 
-	/*
-	   if (nprocs != 8) {
-	   cout << "example was set to run with 8 processes" << endl;
-	   cout << "exit..." << endl;
-	   return 0;
-	   }
-	   */
+  for (int i = 1; i < argc; i++) {
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    } 
 
-	/* set domain size */
-	int NG[3] = {64, 64, 32};
-	int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
-	int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
-	int sizerho = NG[0] * NG[1] * NG[2];
-	int sizegreen = ng[0] * ng[1] * ng[2];
-	int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
-	int id[3];
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    } 
 
-	//id[0] = 0;
-	//id[1] = NL[1] * (rank % 4);
-	//id[2] = NL[2] * (rank / 4);
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    } 
 
-	/* print some messages bout the example in the begginig */
-	cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
-	//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
-	cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
-	//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
-	int tmp[3];
-	/*  for (int p = 1; p < nprocs; p++) {
-		MPI_Status mpistatus;
-		MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
-		cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
-		}*/
-	// } else {
-	//   MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
-	// }
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+  }
 
-	/* dks init and create 2 streams */
-	int dkserr;
-	//int streamGreens, streamFFT;
-#ifdef DKS_MIC
-	DKSBase base;
-	base.setAPI("OpenMP", 6);
-	base.setDevice("-mic", 4);
-	base.initDevice();
-#endif
+  cout << "Use api: " << api_name << ", " << device_name << endl;
 
-#ifdef DKS_CUDA
-	DKSBase base;
-	base.setAPI("Cuda", 4);
-	base.setDevice("-gpu", 4);
-	base.initDevice();
-#endif
+  /* set domain size */
+  int NG[3] = {64, 64, 32};
+  int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
+  int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
+  int sizerho = NG[0] * NG[1] * NG[2];
+  int sizegreen = ng[0] * ng[1] * ng[2];
+  int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
 
-	//base.createStream(streamFFT);
-	//if (rank == 0) {
-	//  base.createStream(streamGreens);
-	base.setupFFT(3, NG);
-	//}
+  /* print some messages bout the example in the begginig */
+  cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
+  cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
 
-	/* allocate memory and init rho field */
-	double *rho = new double[sizerho];
-	double *rho_out = new double[sizerho];
-	//double *green_out = new double[sizegreen];
-	initMirror(rho, NL[0], NL[1], NL[2]);
+  /* dks init and create 2 streams */
+  int dkserr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+  base.setupFFT(3, NG);
 
-	/*
-	   allocate memory on device for 
-	   - rho field
-	   - rho FFT
-	   - tmpgreen
-	   - greens integral
-	   - greens integral FFT
-	   */
-	void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
-	// if (rank == 0) {
-	tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
-	rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
-	grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
-	rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-	grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
-	/* } else {
-	   grntr_ptr = NULL;
-	   rho2_ptr = NULL;
-	   grn_ptr = NULL;
-	   rho2tr_ptr = NULL;
-	   tmpgreen_ptr = NULL;
-	   }*/
+  /* allocate memory and init rho field */
+  double *rho = new double[sizerho];
+  double *rho_out = new double[sizerho];
+  //double *green_out = new double[sizegreen];
+  double *mirror_out = new double[sizerho];
+  //initMirror(rho, NL[0], NL[1], NL[2]);
+  initMirror(rho, NG[0], NG[1], NG[2]);
 
+  /*
+    allocate memory on device for 
+    - rho field
+    - rho FFT
+    - tmpgreen
+    - greens integral
+    - greens integral FFT
+  */
+  void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
+  tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
+  rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
+  rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
+  grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
 
-	/* send and receive pointer to allocated memory on device */
-	/*
-	   if (rank == 0) {
-	   for (int p = 1; p < nprocs; p++)
-	   base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
-	   } else {
-	   rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
-	   }
-	   MPI_Barrier(MPI_COMM_WORLD);
-	   */
+  /* =================================================*/
+  /* =================================================*/
+  /* =====loop trough fftpoison solver iterations=====*/
+  /* =================================================*/
+  /* =================================================*/
 
+  double old_sum = 0;
 
-	/* =================================================*/
-	/* =================================================*/
-	/* =====loop trough fftpoison solver iterations=====*/
-	/* =================================================*/
-	/* =================================================*/
+  int hr_m[3] = {1, 1, 1};
+  base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
 
-	double old_sum = 0;
-	double tmp_sum = 0;
-	for (int l = 0; l < 100; l++) {
-		//MPI_Barrier(MPI_COMM_WORLD);
-		/* on node 0, calculate tmpgreen on gpu */
-		int hr_m[3] = {1, 1, 1};
-		//if (rank == 0)
-		base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], 
-				hr_m[0], hr_m[1], hr_m[2]);
+  /* calculate greens integral on gpu */
+  base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
 
-		/* calculate greens integral on gpu */
-		//if (rank == 0)
-		base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
+  /* mirror the field */
+  base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+  /*
+  base.readData<double>(grn_ptr, mirror_out, sizerho);
+  for (int i = 0; i < sizerho; i++)
+    cout << mirror_out[i] << " ";
+  cout << endl << endl;
 
-		/* mirror the field */
-		//if (rank == 0)
-		base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
+  for (int i = 0; i < sizerho; i++)
+    cout << rho[i] << " ";
+  cout << endl << endl;
+  */
+  /* transfer rho field to device */
+  base.writeData<double>(rho2_ptr, rho, sizerho);
 
+  /* get FFT of rho field */
+  base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
 
-		/* get FFT of mirrored greens integral */
-		//if (rank == 0) 
-		base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
+  /* get FFT of mirrored greens integral */
+  base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
 
-		/* transfer rho field to device */
-		//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
-		base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
-		//MPI_Barrier(MPI_COMM_WORLD);
+  /* multiply both FFTs */
+  base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
 
-		/* get FFT of rho field */
-		//if (rank == 0) {
-		//base.syncDevice();
-		base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
-		//}
+  /*
+  complex<double> *crho = new complex<double>[sizecomp];
+  complex<double> *cgre = new complex<double>[sizecomp];
+  base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
+  base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
 
-		/* multiply both FFTs */
-		//if (rank == 0)
-		base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
-		//MPI_Barrier(MPI_COMM_WORLD);
+  for (int i = 0; i < sizecomp; i++)
+    cout << cgre[i].real() << " ";
+  cout << endl << endl;
 
-		/* inverse fft and transfer data back */
-		/* 
-		   multiple device syncs and mpi barriers are used to make sure data 
-		   transfer is started when results are ready and progam moves on 
-		   only when data transfer is finished
-		   */
-		//if (rank == 0) {
-		base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
-		//base.syncDevice();
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
-		base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//base.syncDevice();
-		//MPI_Barrier(MPI_COMM_WORLD);
-		//cout << "result: " << sumData(rho_out, sizerho) << endl;
-		if (l == 0) { 
-			old_sum = sumData(rho_out, sizerho);
-		} else {
-			tmp_sum = sumData(rho_out, sizerho);
-			if (old_sum != tmp_sum) {
-				cout << "diff in iteration: " << l << endl;
-			}
-		}
-		/*} else {
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  MPI_Barrier(MPI_COMM_WORLD);
-		  }
-		  */
+  for (int i = 0; i < sizecomp; i++)
+    cout << crho[i].real() << " ";
+  cout << endl << endl;
+  
+  delete[] crho;
+  delete[] cgre;
+  */
 
+  /* inverse fft and transfer data back */
+  /* 
+     multiple device syncs and mpi barriers are used to make sure data 
+     transfer is started when results are ready and progam moves on 
+       only when data transfer is finished
+  */
+  base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
+  
+  base.readData<double> (rho2_ptr, rho_out, sizerho);
+  
+  for (int i = 0; i < 10; i++)
+    cout << rho_out[i] << " ";
+  cout << endl;
+  
+  old_sum = sumData(rho_out, sizerho);
 
-	}
 /* =================================================*/  
 /* =================================================*/
 /* ==========end fftpoison solver test run==========*/
 /* =================================================*/
 /* =================================================*/
 
+  base.freeMemory<double>(tmpgreen_ptr, sizegreen);
+  base.freeMemory<double>(grn_ptr, sizerho);
+  base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
+  base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
+  base.freeMemory<double>(rho2_ptr, sizerho);
 
-
-/* free memory on device */
-//if (rank == 0) {
-base.freeMemory<double>(tmpgreen_ptr, sizegreen);
-base.freeMemory<double>(grn_ptr, sizerho);
-base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
-base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
-//MPI_Barrier(MPI_COMM_WORLD);
-base.freeMemory<double>(rho2_ptr, sizerho);
-cout << "Final sum: " << old_sum << endl;
-/*} else {
-  base.closeHandle(rho2_ptr);
-  MPI_Barrier(MPI_COMM_WORLD);
-  }*/
-
-//MPI_Finalize();
-
+  delete[] rho_out;
+  delete[] rho;
+  delete[] mirror_out;
+  cout << "Final sum: " << old_sum << endl;
 
 }