updated documentation

2017-08-10 14:57:48 +02:00
parent 7ca93a3a49
commit ccc4329bef
38 changed files with 939 additions and 673 deletions
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@@ -15,6 +15,9 @@
 class DKSBaseMuSR;
 /** 
 * Interface to implement ChiSquareRuntime class for musrfit.
 */
 class ChiSquareRuntime {
  friend class DKSBaseMuSR;
@@ -63,23 +66,54 @@ public:
  /** Default constructor */
  //ChiSquareRuntime();
-  /** Default destructor */
+  /** Default destructor. */
  virtual ~ChiSquareRuntime() { };
  /**
   * Compile GPU programm generated at runtime.
   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;
  /**
   * Launche the compiled chiSquare kernel.
   */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;
  /** 
   * Write the parameter values to the GPU.
   */
  virtual int writeParams(const double *params, int numparams) = 0;
  /**
   * Write the function values to the GPU.
   */
  virtual int writeFunc(const double *func, int numfunc) = 0;
  /**
   * Write map values to the GPU.
   */
  virtual int writeMap(const int *map, int nummap) = 0;
  /**
   * Allocate temporary memory needed for the chi square calucaltios on the device.
   */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
  /**
   * Free device memory allocated for chi square calculations.
   */
  virtual int freeChiSquare() = 0;
  /**
   * Check if available device can run the chi square GPU code.
   */
  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
-  /** Set N0, tau and bgk values to use for the kernel.
+  /** 
   * Set N0, tau and bgk values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@@ -91,7 +125,8 @@ public:
    return DKS_SUCCESS;
  }
-  /** Set alpha and beta values to use for the kernel.
+  /** 
   * Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@@ -101,8 +136,9 @@ public:
    return DKS_SUCCESS;
  }
-  /** Set number of blocks and threads.
+  /** 
-   *  Used to set parameters obtained from auto-tuning
+   * Set number of blocks and threads.
   * Used to set parameters obtained from auto-tuning
   */
  int setKernelParams(int numBlocks, int blockSize) {
    int ierr = DKS_ERROR;
@@ -118,8 +154,9 @@ public:
    return ierr;
  }
-  /** Get the number of operations in compiled kernel.
+  /** 
-   *  Count the number of operation in the ptx file for the compiled program.
+   * Get the number of operations in compiled kernel.
   * Count the number of operation in the ptx file for the compiled program.
   */
  int getOperations(int &oper) {
--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@@ -5,6 +5,9 @@
 #include <string>
 #include "../DKSDefinitions.h"
 /**
 * Interface to impelment particle matter interaction for OPAL.
 */
 class DKSCollimatorPhysics {
 protected:
@@ -15,29 +18,61 @@ protected:
 public:
  virtual ~DKSCollimatorPhysics() { }
-  
+
  /** 
   * Execute collimator physics kernel.
   *
   */  
  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
 				bool enableRutherforScattering = true) = 0;
  /** 
   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
  /** 
   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
  /** 
   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;
  /** 
   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;
  /** 
   * BorisPusher kick function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 
  /** 
   * BorisPusher push function with transformto function form OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@@ -6,12 +6,21 @@
 #include "../DKSDefinitions.h"
 /**
 * Abstract class defining methods for DKS FFT class.
 * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
 */
 class BaseFFT {
 protected:
  int defaultN[3];
  int defaultNdim;
  /**
   * Check if FFT plan is created for the needed dimension and FFT size.
   * Returns true if the plan has been created and false if no plan for specified dimension
   * and size exists.
   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
@@ -24,18 +33,57 @@ public:
  virtual ~BaseFFT() { }
  /** Setup FFT - init FFT library used by chosen device. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
  /** Setup real to complex FFT - init FFT library used by chosen device. */
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
  /** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
  /** Clean up. */
  virtual int destroyFFT() = 0;
  /** 
   * Exectute C2C FFT.
   * mem_ptr - memory ptr on the device for complex data.
   * Performs in place FFT.
   */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
  /** 
   * Exectute inverse C2C FFT.
   * mem_ptr - memory ptr on the device for complex data.
   * Performs in place FFT.
   */
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  /**
   * Normalize the FFT or IFFT.
   * mem_ptr - memory to complex data.
   */
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  /** 
   * Exectute R2C FFT.
   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
   * results for the FFT are stored as complex numbers.
   */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
  /** 
   * Exectute C2R FFT.
   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
   */
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
  /**
   * Normalize CR FFT.
   */
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
 };
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@@ -4,24 +4,27 @@
 #include <iostream>
 #include <cmath>
 /**
 * Interface to implement Greens function calculations for OPAL.
 */
 class GreensFunction {
 public:
  virtual ~GreensFunction() { }
-  /** calc greens integral, as defined in OPAL */
+  /** calc greens integral, as defined in OPAL. */
  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
-  /** integration if rho2_m, see OPAL for more details */
+  /** integration if rho2_m, see OPAL for more details. */
  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;
-  /** mirror rho2_m field */
+  /** mirror rho2_m field. */
  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
-  /** multiply two complex fields from device memory */
+  /** multiply two complex fields from device memory. */
  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
 };
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@@ -5,17 +5,22 @@
 #define BLOCK_SIZE 128
 /** Struct to hold voxel position for PET image. */
 struct VoxelPosition {
  float x;
  float y;
  float z;
 };
 /** Struct that holds pair of detectors that registered an envent. */
 struct ListEvent {
  unsigned detA : 16;
  unsigned detB : 16;
 };
 /**
 * Interface to implement PET image reconstruction.
 */
 class ImageReconstruction {
 protected:
@@ -25,7 +30,8 @@ public:
  virtual ~ImageReconstruction() { }
-  /** Caluclate source.
+  /** 
   * Caluclate source.
   *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
   *  that are inside this sphere. All the sphere used have the same diameter.
   */
@@ -33,7 +39,8 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;
-  /** Calculate background.
+  /** 
   * Calculate background.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@@ -42,7 +49,8 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;
-  /** Caluclate source using differente sources.
+  /** 
   * Caluclate source using differente sources.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * each sphere is given by *diameter array.
@@ -52,7 +60,7 @@ public:
 			       int total_sources, int start = 0) = 0;
  /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
   * smaller sphere.
@@ -61,7 +69,8 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;
-  /** Generate normalization.
+  /** 
   * Generate normalization.
   * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
   * that updates voxel values in the image on the slope between these two detectors.
   */
@@ -69,14 +78,16 @@ public:
 				 void *det_position, int total_det) = 0; 
-  /** Calculate forward projection.
+  /** 
   * Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;
-  /** Calculate backward projection.
+  /** 
   * Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
@@ -84,29 +95,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;
-  /** Set the voxel dimensins on device.
+  /** 
-   * 
+   *Set the voxel dimensins on device. 
   */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
-  /** Set the image edge variables on the device.
+  /** 
-   * 
+   * Set the image edge variables on the device.
   */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
-  /** Set the image edge1 on the device.
+  /** 
-   * 
+   * Set the image edge1 on the device.
   */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
-  /** Set the minimum crystan in one ring values on the device.
+  /** 
-   * 
+   * Set the minimum crystan in one ring values on the device.
   */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;
-  /** Set all other required parameters for reconstruction.
+  /** 
-   * 
+   * Set all other required parameters for reconstruction.
   */
  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@@ -18,6 +18,17 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 /** 
 * DKS autotuning class, allows to auto-tune the defince function.
 * Executes the defined function for auto-tuning and searches for optimal parameters to improve
 * the function execution time. The function that is auto-tuned, parameters and the ranges
 * need to be set. Includes multiple search methods, that searches the parameter space to finde 
 * the optimal solution.
 *  1) exaustive search
 *  2) line search
 *  3) hill climbimg
 *  4) simulated annealing
 */
 class DKSAutoTuning {
 private:
@@ -36,12 +47,13 @@ private:
  int loops_m;
-  /** Update parameters from a state */
+  /** Update parameters from a state. */
  int setParameterValues(States states);
-  /** Evaluate the function and set execution time 
+  /** 
-   *  Returns DKS_ERROR if errors occured during function execution. 
+   * Evaluate the function and set execution time 
-   *  Returns DKS_SUCCESS if function executed as planned. 
+   * Returns DKS_ERROR if errors occured during function execution. 
   * Returns DKS_SUCCESS if function executed as planned. 
   */
  int evaluateFunction(double &value);
@@ -50,12 +62,13 @@ public:
  /** Constructor */
  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
-  /** Destructor */
+  /** Destructor. */
  ~DKSAutoTuning();
-  /** Set function to auto tune.
+  /** 
-   *  Caller of setFunction is responsible to bind the correct parameters 
+   * Set function to auto tune.
-   *  to the function with std::bind.
+   * Caller of setFunction is responsible to bind the correct parameters 
   * to the function with std::bind.
   */
  void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
    f_m = f;
@@ -63,15 +76,21 @@ public:
    evaluate_time_m = evaluate_time;
  }
  /** 
   * Set function to auto tune.
   * Caller of setFunction is responsible to bind the correct parameters 
   * to the function with std::bind.
   */
  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
    fd_m = f;
    function_name_m = name;
    evaluate_time_m = evaluate_time;
  }
-  /** Set parameter for auto tuning.
+  /** 
-   *  Provide a pointer to a parameter that will be changed during auto-tuning
+   * Set parameter for auto tuning.
-   *  and a min-max value for this element
+   * Provide a pointer to a parameter that will be changed during auto-tuning
   * and a min-max value for this element
   */
  template <typename T1>
  void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@@ -85,9 +104,9 @@ public:
  /** Perform exaustive search evaluating all the parameter configurations */
  void exaustiveSearch();
-  /** Perform auto-tuning.
+  /**
-   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
+   * Perform line-search auto-tuning by variying parameters one at a time.
-   *  parameters constant.
+   * After one parameter is auto-tuned the next on is varied
   */
  void lineSearch();  
--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <cmath>
 /** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {
  friend class DKSBaseMuSR;
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@@ -1,9 +1,3 @@
 /** Class to save and load DKS autotunning configs.
 * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
 * Uses boost xml_parser to read and write the xml file and boost property tree to store
 * the xml content.
 */
 #ifndef DKS_CONFIG
 #define DKS_CONFIG
@@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";
 /** Class to save and load DKS autotunning configs.
 * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
 * Uses boost xml_parser to read and write the xml file and boost property tree to store
 * the xml content.
 * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
 * so the auto-tuning has no effect.
 */
 class DKSConfig {
 private:
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@@ -9,6 +9,9 @@
 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
 /** 
 * Parameter class allows to change the searchable parameters during the auto-tuning.
 */
 class Parameter {
 private:
@@ -64,6 +67,10 @@ public:
 };
 /**
 * Struct to hold a auto-tuning state.
 * Holds the current value, min, max and a step to witch a state can change.
 */ 
 struct State {
  double value;
  double min;
@@ -74,6 +81,12 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 /** 
 * Used by auto-tuning search algorithms to move between parameter configurations.
 * Allows to move from one parameter stat to another, get neighboring states, 
 * move to neighboring states and save state information. Print functions are available
 * for debugging purposes, to follow how algorithm muves between sates.
 */
 class DKSSearchStates {
 private:
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
  return DKS_SUCCESS;
 }
 /*
  Info: allcate memory and write data (push)
  Return: pointer to memory object
 */
 /*
  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
  void * mem_ptr;
  mem_ptr = cuda_allocateMemory(size, ierr);
  if (ierr == DKS_SUCCESS)
  ierr = cuda_writeData(mem_ptr, in_data, size);
  return mem_ptr;
  }
 */
 /*
  Info: read data and free memory (pull)
  Return: success or error code
 */
 /*
  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
  ierr = cuda_readData(mem_ptr, out_data, size);
  if (ierr == DKS_SUCCESS)
  ierr = cuda_freeMemory(mem_ptr);	
  else
  return DKS_ERROR;
  if (ierr == DKS_SUCCESS)	
  return DKS_SUCCESS;
  else
  return DKS_ERROR;
  }
 */
 /*
  Info: execute function
  Return: success or error code
 */
 int CudaBase::cuda_executeFunction() {
  std::cout << "Execute function" << std::endl;
  return DKS_SUCCESS;
 }
 /*
  Info: clean up
  Return: success or error code
 */
 int CudaBase::cuda_cleanUp() {
  std::cout << "clean up" << std::endl;
  return DKS_SUCCESS;
 }
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@@ -16,6 +16,11 @@
 #define BLOCK_SIZE 128
 /**
 * CUDA base class handles device setup and basic communication with the device.
 * Handles devicew setup, memory manegement, data transfers and stream setup for 
 * asynchronous data transfers and kernel executions.
 */
 class CudaBase {
 private:
@@ -48,17 +53,17 @@ public:
  /**
   * Delete curandState.
   * Delete curandState array on the GPU and free memory.
-   *  Return success or error code
+   * Return success or error code
   */
  int cuda_deleteCurandStates();
-  /** Create 'size' random numbers on the device and save in mem_ptr array
+  /** 
-   *
+   * Create 'size' random numbers on the device and save in mem_ptr array.
   */
  int cuda_createRandomNumbers(void *mem_ptr, int size);
-  /** Get a pointer to curand states
+  /** 
-   *
+   * Get a pointer to curand states.
   */
  curandState* cuda_getCurandStates();
@@ -75,93 +80,98 @@ public:
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);
  /**
-   * delete cuda stream
+   * delete cuda stream.
   * success or error code
   */
  int cuda_deleteStream(int id);
  /**
-   * delete all streams
+   * delete all streams.
   * success or error code
   */
  int cuda_deleteStreams();
  /**
-   * set stream to use
+   * set stream to use.
   * success or error code
   */
  int cuda_setStream(int id);
  /**
-   * Info: get stream that is used
+   * get stream that is used.
-   *  Return: return id of curretn stream
+   * Return: return id of curretn stream
   */
  int cuda_getStreamId();
  /**
-   * Info: reset to default stream
+   * reset to default stream.
   * Return: success or error code
   */
  int cuda_defaultStream();
  /**
-   * Info: get number of streams
+   * get number of streams.
   * Return: success or error code
   */
  int cuda_numberOfStreams();
  /**
-   * Info: get stream
+   * get stream.
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);
  /**
-   * Get default cublass handle
+   * Get default cublass handle.
   */
  cublasHandle_t cuda_getCublas();
  /**
-   * Info: get information on cuda devices
+   * get information on cuda devices.
   * Return: success or error code
   */
  int cuda_getDevices();
-  /** Get CUDA device count.
+  /** 
-   *  Sets the number of devices on the platform that can use CUDA.
+   * Get CUDA device count.
-   *  Returns DKS_SUCCESS
+   * Sets the number of devices on the platform that can use CUDA.
   */
  int cuda_getDeviceCount(int &ndev);
-  /** Get the name of the device.
+  /** 
-   *  QUery the device properties of the used device and set the string device_name
+   * Get the name of the device.
   * QUery the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);
-  /** Set CUDA device to use.
+  /** 
-   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
+   * Set CUDA device to use.
   * If device passed in is larger than the number of devices use 
   * the default:0 and return DKS_ERROR 
   */
  int cuda_setDevice(int device);
-  /** Get unique devices
+  /** 
-   *  Get array of indeces with the unique CUDA devices available on the paltform
+   * Get unique devices.
   * Get array of indeces with the unique CUDA devices available on the paltform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);
  /**
-   * Info: init device
+   * Initialize connection to the device.
   * Only needed when runtime compilation is used.
   * Return: success or error code
   */
  int cuda_setUp();
  /**
-   * Info: allocate memory on cuda device
+   * Allocate memory on cuda device.
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);
  /**
-   * Info: allocate host memory in pinned memory
+   * Allocate host memory in pinned memory
   * Return: success or error code
   */
  template<typename T>
@@ -174,8 +184,9 @@ public:
    return DKS_SUCCESS;
  }		
-  /** Zero CUDA memory.
+  /** 
-   *  Set all the elements of the array on the device to zero.
+   * Zero CUDA memory.
   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
@@ -189,8 +200,9 @@ public:
    return DKS_SUCCESS;
  }
-  /** Zero CUDA memory.
+  /** 
-   *  Set all the elements of the array on the device to zero.
+   * Zero CUDA memory async.
   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
@@ -209,7 +221,7 @@ public:
  }
  /** 
-   * Info: write data to memory
+   * Write data to memory
   * Retrun: success or error code
   */
  template<typename T>
@@ -226,7 +238,7 @@ public:
  }
  /**
-   * Info: write data assynchonuously
+   * Write data assynchonuously
   * Return: success or error code
   */
  template<typename T>
@@ -258,7 +270,7 @@ public:
  }
  /**
-   * Info: read data from memory
+   * Read data from memory
   * Return: success or error code
   */
  template<typename T>
@@ -275,7 +287,7 @@ public:
  }
  /**
-   * Info: read data async from device memory
+   * Read data async from device memory
   * Return: success or error code
   */
  template<typename T>
@@ -307,19 +319,19 @@ public:
  }
  /**
-   * Info: free memory on device
+   * Free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);
  /**
-   * Info: free page locked memory on host
+   * Free page locked memory on host
   * Return: success or erro code
   */
  int cuda_freeHostMemory(void * mem_ptr);
  /**
-   * Info: allcate memory and write data (push)
+   * Allcate memory and write data (push)
   * Return: pointer to memory object
   */
  template<typename T>
@@ -335,7 +347,7 @@ public:
  }
  /**
-   * Info: read data and free memory (pull)
+   * Read data and free memory (pull)
   * Return: success or error code
   */
  template<typename T>
@@ -353,21 +365,10 @@ public:
    else
      return DKS_ERROR;
  }
  /**
   * Info: execute function
   * Return: success or error code
   */
  int cuda_executeFunction();
  /**
   * Info: clean up
   * Return: success or error code
   */
  int cuda_cleanUp();
  /**
-   * Info: sync cuda device
+   * Sync cuda device.
   * Waits till all the tasks on the GPU are finished.
   * Return: success or error code
   */
  int cuda_syncDevice() {
@@ -376,7 +377,7 @@ public:
  }
  /**
-   * Page-lock host memory
+   * Page-lock host memory.
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
@@ -390,7 +391,7 @@ public:
  }
  /**
-   * Release page locked memory
+   * Release page locked memory.
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
@@ -403,7 +404,7 @@ public:
  }
  /**
-   * Info: print device memory info (total, used, avail)
+   * Print device memory info (total, used, avail)
   * Return: success or error code
   */
  int cuda_memInfo() {
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@@ -8,6 +8,7 @@
 #include "CudaBase.cuh"
 /** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {
 private:
--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
 const std::string cudaFunctFooter = "}\n";
 /**
 * CUDA implementation of ChiSquareRuntime class.
 * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
 */
 class CudaChiSquareRuntime : public ChiSquareRuntime{
 private:
@@ -29,65 +33,72 @@ private:
  cublasHandle_t defaultCublasRT;
-  /** Setup to init device
+  /** 
-   *  Create context and init device for RT compilation
+   * Setup to init device.
   * Create context and init device for RT compilation
   */
  void setUpContext();
-  /** Private function to add function to kernel string
+  /** 
-   *
+   * Private function to add function to kernel string.
   */
  std::string buildProgram(std::string function);
 public:
-  /** Constructor with CudaBase argument
+  /** 
-   *
+   * Constructor with CudaBase argument
   */
  CudaChiSquareRuntime(CudaBase *base);
-  /** Default constructor init cuda device
+  /** 
-   *
+   * Default constructor init cuda device
   */
  CudaChiSquareRuntime();
-  /** Default destructor
+  /** 
-   *
+   * Default destructor.
   */
  ~CudaChiSquareRuntime();
-  /** Compile program and save ptx.
+  /** 
   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);
-  /** Launch selected kernel
+  /** 
   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable
+   * Result is put in &result variable.
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);
-  /** Write params to device.
+  /** 
   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 
-  /** Write functions to device.
+  /** 
   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);
-  /** Write maps to device.
+  /** 
   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);
-  /** Allocate temporary memory needed for chi square.
+  /** 
   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@@ -96,14 +107,16 @@ public:
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
-  /** Free temporary memory allocated for chi square.
+  /** 
   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();
-  /** Check if CUDA device is able to run the chi square kernel.
+  /** 
-   *  Redundant - all new CUDA devices that support RT compilation will also support 
+   * Check if CUDA device is able to run the chi square kernel.
-   *  double precision, there are no other requirements to run chi square on GPU
+   * Redundant - all new CUDA devices that support RT compilation will also support 
   * double precision, there are no other requirements to run chi square on GPU
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
    return DKS_SUCCESS;
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@@ -1,5 +1,6 @@
 #include "CudaCollimatorPhysics.cuh"
 //constants used in OPAL
 //#define M_P 0.93827231e+00
 #define M_P 0.93827204e+00
 #define C 299792458.0
@@ -11,6 +12,7 @@
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
 //parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@@ -28,12 +30,18 @@
 #define BLOCK_SIZE 128
 #define NUMPAR 13
 /**
 * CUDA device function for calculating dot product.
 */
 __device__ inline double dot(double3 &d1, double3 &d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }
 /**
 * CUDA devce function to calculate cross product.
 */
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  double3 tmp;
  tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  return tmp;
 }
 /**
 * CUDA device function to calculate arbitrary rotation.
 */
 __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
  double c=cos(Theta);
  double s=sin(Theta);
@@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
  return tmp;
 } 
 /**
 * CUDA device function to check if particle is still in material.
 * z - particle position, par - parameter array. Particle is considered inside the
 * material if z is > material starting position and z < material starting position - mat size.
 */
 __device__ inline bool checkHit(double &z, double *par) {
  /* check if particle is in the degrader material */
@@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
 }
 /**
 * CUDA device function to calculate energyLoss for one particle.
 * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
 * algorith are available in OPAL user guide.
 */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {
@@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
 }
 /**
 * CUDA device function for rotation in 2 dimensions.
 * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
 * "Passage of particles through matter"
 */
 __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
 			   double &normP, double &thetacou, double &deltas, int coord,
 			   double *par) 
@@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
 /**
 * CUDA device function to calculate Coulomb scattering for one particle.
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * For details on the algorithm see OPAL user guide.
 */
 __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
 				   bool enableRutherfordScattering) 
 {
@@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
 }
 /**
 * CUDA kernel that performs one step in particle movement trough mater.
 * One thread is launched for each particle in the simulation. The kernel checks if the particle
 * is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
 * particles that are exiting the material.
 * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
 * @param[in] *par array of material properties, always constant size - 13
 * @param[in] *state array holding cuRand states to preserve states between kernel launches
 * @param[in] numparticles number of particles in the simulation
 * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
 */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
 					int numparticles, bool enableRutherfordScattering)
@@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;
-  //transfer params to shared memory
+  //transfer params and particle positions to shared memory
  //R is kept in shared memory in order to reduce register pressure for the kernel
  extern __shared__ double smem[];
  double *p = (double*)smem;
-  double3 *R = (double3*)&smem[NUMPAR];
+  double3 *R = (double3*)&smem[NUMPAR]; 
-  curandState s; 
+  curandState s; //each tread gets its own cuRand state for random number generation
  double3 P;
  //load parameters to shared memory
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];
  //sync threads to ensure that parameters are finished loading
  __syncthreads();
  //there might be some empty threads that do no work
  if (idx < numparticles) {
-    s = state[idx];
+    s = state[idx]; //load cuRand state to local memory
-    R[tid] = data[idx].Rincol;
+    R[tid] = data[idx].Rincol; //load position to shared memory
-    P = data[idx].Pincol;
+    P = data[idx].Pincol; //load momentum to local memory
    bool pdead = false;  
    volatile double sq = sqrt(1.0 + dot(P, P));
    double Eng;
    //check if particle is still in the material
    if (checkHit(R[tid].z, p)) {      
      //calculate enery loss
      Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);
      //check if particle is not dead
      if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));
 	//caluclate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
 	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
 	//update particle momentum
 	data[idx].Pincol = P;
      } else {
 	//mark particle as dead (-1)
 	data[idx].label = -1;
      }
-    
+      
      //update cuRand state
      state[idx] = s;
    } else {
-    
+      //particle exits material - drift and mark as exiting (-2)
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
    }
    //update particle position
    data[idx].Rincol = R[tid];
  }
 }
-__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
+/**
-					 curandState *state, int numparticles,
+ * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
-					 bool enableRutherfordScattering)
+ * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
 * Deprecated - GPU version does not use SoA.
 * @param[in] data structure of arrays containing particle data
 * @param[in] *par array of material properties, always constant size - 13
 * @param[in] *state array holding cuRand states to preserve states between kernel launches
 * @param[in] numparticles number of particles in the simulation
 * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
 */
 __global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
 					   curandState *state, int numparticles,
 					   bool enableRutherfordScattering)
 {
  //get global id and thread id
@@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
 }
-
+/**
 * Device function to swich off unitless positions.
 */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x *= c;
  a.y *= c;
  a.z *= c;
 }
 /**
 * Device function to swich on unitless positions.
 */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x /= c;
  a.y /= c;
  a.z /= c;
 }
 //swithch to unitless positions with dtc
 __global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    unitlessOn(R, dtc);
    unitlessOn(X, dtc);
    gR[idx] = R;
    gX[idx] = X;
  }
 }
 //swithc to unitless positions with dt*c
 __global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    double dt = gdt[idx];
    unitlessOff(R, dt*c);
    unitlessOff(X, dt*c);
    gR[idx] = R;
    gX[idx] = X;
  }
 }
 //swithc off unitless positions with dtc
 __global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    unitlessOff(R, dtc);
    unitlessOff(X, dtc);
    gR[idx] = R;
    gX[idx] = X;
  }
 }
 //switch off unitelss positions with dt*c
 __global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    double dt = gdt[idx];
    unitlessOff(R, dt*c);
    unitlessOff(X, dt*c);
    gR[idx] = R;
    gX[idx] = X;
  }
 }
 /**
 * CUDA kernel to perform particle push.
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] npart number of particles
 * @param[in] dtc dt*c
 */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  //get global id and thread id
@@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  }
 }
-
+/**
 * CUDA kernel to perform particle push.
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] *gdt array of time steps for each particle
 * @param[in] npart number of particles
 * @param[in] c speed of light
 */
 __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
  //get global id and thread id
@@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
  }
 }
 /** 
 * CUDA kernel to perform particle kick.
 * @param[in] *gR array of particle positions
 * @param[in] *gP array of particle momentums
 * @param[in] *gEf 
 * @param[in] *gBf
 * @param[in] *gdt array of time steps for each particle
 * @param[in] npart number of particles
 * @param[in] c speed of light
 */
 __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
 			   double3 *gBf, double *gdt, double charge, 
 			   double mass, int npart, double c)
@@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
 }
 struct compare_particle
 {
  int threshold;
  compare_particle() {
    threshold = 0;
  }
  void set_threshold(int t) {
    threshold = t;
  }
  __host__ __device__
  bool operator()(CUDA_PART p1, CUDA_PART p2) {
    return p1.label > p2.label;
  }
  __host__  __device__
  bool operator()(CUDA_PART p1) {
    return p1.label < threshold;
  }
 };
 struct compare_particle_small
 {
  int threshold;
  compare_particle_small() {
    threshold = 0;
  }
  void set_threshold(int t) {
    threshold = t;
  }
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    return p1.label > p2.label;
  }
  __host__  __device__
  bool operator()(CUDA_PART_SMALL p1) {
    return p1.label < threshold;
  }
 };
 struct less_then
 {
  __host__ __device__
  bool operator()(int x)
  {
    return x < 0;
  }
 };
 int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
 					     bool enableRutherfordScattering)
 {
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@@ -20,7 +20,8 @@
 #include "CudaBase.cuh"
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU or MIC as AoS.
 * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
 */
 typedef struct __align__(16) {
  int label;
@@ -37,7 +38,10 @@ typedef struct __align__(16) {
 } CUDA_PART;
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as AoS
 * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
 * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
 * on the host side.
 */
 typedef struct {
  int label;
@@ -47,7 +51,8 @@ typedef struct {
 } CUDA_PART_SMALL;
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as SoA.
 * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
 */
 typedef struct {
  int *label;
@@ -65,6 +70,9 @@ typedef struct {
 /**
 * Structure for storing particle on GPU
 * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
 * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
 * on the host side.
 */
 typedef struct {
  int *label;
@@ -73,11 +81,39 @@ typedef struct {
  double3 *Pincol;
 } CUDA_PART2_SMALL;
-/** CudaCollimatorPhysics class.
+/** 
- * Contains kerenls that execute CollimatorPhysics functions form OPAL.
+ * Operator used in thrust sort to compare particles by label.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation
+ * Used to move dead particles to the end of array, since they have label -1 or -2.
 */
-class CudaCollimatorPhysics : public DKSCollimatorPhysics{
+struct compare_particle_small
 {
  int threshold;
  compare_particle_small() {
    threshold = 0;
  }
  void set_threshold(int t) {
    threshold = t;
  }
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    return p1.label > p2.label;
  }
  __host__  __device__
  bool operator()(CUDA_PART_SMALL p1) {
    return p1.label < threshold;
  }
 };
 /** 
 * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Contains kerenls that execute CollimatorPhysics functions form OPAL.
 * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
 */
 class CudaCollimatorPhysics : public DKSCollimatorPhysics {
 private:
@@ -86,32 +122,44 @@ private:
 public:
-  /** Constructor with CudaBase argument
+  /** 
-   *
+   * Constructor with CudaBase as argument.
   * Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
   */
  CudaCollimatorPhysics(CudaBase *base) {
    m_base = base;
    base_create = false;
  }
-  /** Constructor - empty. */
+  /** 
   * Empty constructor.
   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
   */
  CudaCollimatorPhysics() { 
    m_base = new CudaBase();
    base_create = true;
  }
-  /** Destructor - empty */
+  /** 
   * Destructor.
   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
   */
  ~CudaCollimatorPhysics() { 
    if (base_create)
      delete m_base;
  };
-  /** Execute collimator physics kernel.
+  /** 
   * Execute collimator physics kernel.
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			int numpartices, bool enableRutherforScattering = true);
  /** 
   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@@ -120,12 +168,17 @@ public:
      return DKS_ERROR;
    }
-  /** Sort particle array on GPU.
+  /** 
   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  /** 
   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
   * Used only on the MIC side, was not implemented on the GPU.
   */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@@ -134,18 +187,25 @@ public:
      return DKS_ERROR;
    }
-  /** BorisPusher push function for integration from OPAL.
+  /** 
   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);
  /** 
   * BorisPusher kick function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 
-  /** BorisPusher push function with transformto function form OPAL
+  /** 
   * BorisPusher push function with transformto function form OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"
 /**
 * Cuda FFT class based on BaseFFT interface.
 * Uses cuFFT library to perform FFTs on nvidias GPUs.
 */
 class CudaFFT : public BaseFFT {
 private:
@@ -34,7 +38,7 @@ public:
  ~CudaFFT();
  /**
-   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Init cufftPlans witch can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);
@@ -42,45 +46,21 @@ public:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
  /**
-   * Info: destroy default FFT plans
+   * Destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();
  /*
    Info: execute complex to complex double precision fft using cufft library
    Return: success or error code
  */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
  /*
    Info: execute ifft 
    Return: success or error code
  */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute normalize using cuda kernel for complex to complex iFFT
    Return: success or error code
  */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute real to complex double precision FFT
    Return: success or error code
  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: exectue complex to real double precision FFT
    Return: success or error code
  */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
  /*
    Info: execute normalize for complex to real iFFT
    Return: success or error code
  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
 };
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@@ -12,6 +12,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "CudaBase.cuh"
 /** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class CudaGreensFunction : public GreensFunction{
 private:
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@@ -10,6 +10,7 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"
 /** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {
 private:
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@@ -1,11 +1,3 @@
 /** DKSBase class.
 * DKSBase.h
 * Author: Uldis Locans
 * Date: 15.09.2014
 * Base class of Dynamic Kernel Scheduler that handles the function calls
 * from host application to DKS
 */
 #ifndef H_DKS_BASE
 #define H_DKS_BASE
@@ -41,7 +33,12 @@
 #include "AutoTuning/DKSConfig.h"
-/** DKSBase class for handling function calls to DKS library */
+/** 
 * API for handling communication function calls to DKS library.
 * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
 * memory manegement, data transfer and other basic communication functions between
 * the host and device.
 */
 class DKSBase {
 private:
@@ -74,7 +71,7 @@ protected:
  DKSConfig dksconfig;
  /** 
-   * Check if current API is set to OpenCL
+   * Check if current API is set to OpenCL.
   * Return true/false wether current api is opencl
   */
  bool apiOpenCL();
@@ -91,11 +88,11 @@ protected:
   */
  bool apiOpenMP();
-  /** Check if device is GPU */
+  /** Check if device is GPU. */
  bool deviceGPU();
-  /** Check if device is CPU */
+  /** Check if device is CPU. */
  bool deviceCPU();
-  /** Check if device is MIC */
+  /** Check if device is MIC. */
  bool deviceMIC();
  /**
--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@@ -20,6 +20,11 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif
 /**
 * API to handle musrfit calls to DKS library.
 * Using ChiSquareRuntime interface allows to call chi square functions on the 
 * GPU or CPU using CUDA or OpenCL.
 */
 class DKSBaseMuSR : public DKSFFT {
 private:
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@@ -24,6 +24,10 @@
 #include "MIC/MICFFT.h"
 #endif
 /**
 * API to handel calls to DKSFFT.
 * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
 */
 class DKSFFT : public DKSBase {
 private:
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@@ -10,6 +10,9 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif
 /**
 * API to handle PET image reconstruction calls.
 */
 class DKSImageRecon : public DKSBase {
 private:
@@ -22,87 +25,88 @@ public:
  ~DKSImageRecon();
-  /** Image reconstruction analaysis calculate source.
+  /** 
-   * 
+   * Image reconstruction analaysis calculate source.
   *
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);
-  /** Image reconstruction analaysis calculate source.
+  /** 
-   * 
+   * Image reconstruction analaysis calculate source.
   *
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);
-  /** Image reconstruction analaysis calculate source.
+  /** 
-   * 
+   * Image reconstruction analaysis calculate source.
   *
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);
-  /** Image reconstruction analaysis calculate source.
+  /** 
-   * 
+   * Image reconstruction analaysis calculate source.
   *
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);
-  /** Image reconstruction - generate normalization.
+  /** 
-   * 
+   * Image reconstruction - generate normalization.
   */
  int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);
-  /** Image reconstruction - forward correction.
+  /** 
-   * 
+   * Image reconstruction - forward correction.
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);
-  /** Image reconstruction - backward projection.
+  /** 
-   * 
+   * Image reconstruction - backward projection.
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);
-  /** Set the voxel dimensins on device.
+  /** 
   * Set the voxel dimensins on device.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
-  /** Set the image edge.
+  /** 
   * Set the image edge.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);
-  /** Set the image edge1.
+  /** 
   * Set the image edge1.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
-  /** Set the minimum crystan in one ring values.
+  /** 
   * Set the minimum crystal in one ring values.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
-  /** Set all other required parameters for reconstruction.
+  /** 
   * Set all other required parameters for reconstruction.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@@ -33,6 +33,11 @@
 #include "MIC/MICCollimatorPhysics.h"
 #endif
 /**
 * API to handle OPAL calls to DKS library.
 * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
 * functions.
 */
 class DKSOPAL : public DKSFFT {
 private: 
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@@ -26,6 +26,9 @@
 #define MIC_WIDTH 128
 /** MIC Base class handles device setup and basic communication with the device.
 * Handles devicew setup, memory manegement and  data transfers.
 */
 class MICBase {
 private:
@@ -45,58 +48,60 @@ public:
  int m_device_id;
-  /* constructor */
+  /** constructor */
  MICBase();
-  /* destructor */
+  /** destructor */
  ~MICBase();
-  /*
+  /**
-    Info: create MKL rand streams for each thread
+   * Create MKL rand streams for each thread
-    Return: success or error code
+   *  Return: success or error code
-  */
+   */
  int mic_createRandStreams(int size);
-  /*
+  /**
-    Info: delete MKL rand streams
+   * Delete MKL rand streams
-    Return: succes or error code
+   * Return: succes or error code
-  */
+   */
  int mic_deleteRandStreams();
-  /*
+  /**
-    Info: create a new signal for the mic
+   * Create a new signal for the mic.
-    Return: success or error code
+   * Signals can be used for assynchronous data transfers.
-  */
+   * Return: success or error code
   */
  int mic_createStream(int & streamId);
-  /*
+  /**
-    Info: get the signal from the vector
+   * Info: get the signal from the vector.
-    Return: mic signal
+   * Return: mic signal
  */
  int& mic_getStream(int id);
-  /*
+  /**
-    Info: delete streams
+   * Info: delete streams.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  int mic_deleteStreams();
-  /*
+  /**
-    Info: set device id
+   * Info: set device id.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  int mic_setDeviceId(int id);
-  /*
+  /**
-    Info: get mic devices
+   * Info: get mic devices.
-    Return: success or error code
+   * Prints information about mic devices.
-  */
+   * Return: success or error code
   */
  int mic_getDevices();
-  /*
+  /**
-    Info: allocate memory on MIC device
+   * Allocate memory on MIC device.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  void * mic_allocateMemory(int size) {
@@ -109,10 +114,10 @@ public:
    return tmp;
  }
-  /*
+  /**
-    Info: transfer data to device
+   * Transfer data to device.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@@ -123,10 +128,10 @@ public:
    return DKS_SUCCESS;
  }
-  /*
+  /**
-    Info: write data to device, non-blocking
+   * Write data to device, non-blocking.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
  {
@@ -139,10 +144,10 @@ public:
  }
-  /*
+  /**
-    Info: read data from device
+   * Read data from device
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@@ -154,10 +159,10 @@ public:
    return DKS_SUCCESS;
  }
-  /*
+  /**
-    Info: read data from device waiting for signal
+   * Read data from device waiting for signal
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
 			int streamId = -1, int offset = 0) {
@@ -172,10 +177,10 @@ public:
  }
-  /* 
+  /**
-     Info: wait till all the signals are complete
+   * Wait till all the signals are complete
-     Return siccess or error code
+   * Return siccess or error code
-  */
+   */
  int mic_syncDevice() {
    //empty offload to wait for all the signals to finish and launch a new empy signal
@@ -193,10 +198,10 @@ public:
  }
-  /*
+  /**
-    Info: free memory on device
+   * Free memory on device
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  int mic_freeMemory(void * data_ptr, int size) {
@@ -210,10 +215,10 @@ public:
    return DKS_SUCCESS;
  }
-  /*
+  /**
-    Info: allocate memory and write data to device
+   * Allocate memory and write data to device
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  template<typename T>
  void * mic_pushData(const void * data, int size) {
    T* tmp_ptr = new T[size];
@@ -227,10 +232,10 @@ public:
  return tmp_ptr;
 }
-/*
+  /**
-  Info: read data and free memory on device
+   * Read data and free memory on device
-  Return: success or erro code
+   * Return: success or erro code
-*/
+   */
  template<typename T>
  int mic_pullData(void * data_ptr, void * result, int size) {
    T* tmp_ptr = (T*)data_ptr;
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@@ -14,6 +14,9 @@
 #include <offload.h>
 #include "MICBase.h"
 /** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
 * Not complete and untested because of the poor performance of first MIC devices.
 */
 class MICChiSquare {
  MICBase *m_micbase;
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@@ -22,22 +22,34 @@
 #define I_M 10
 #define DT_M 11
 /**
 * MIC device function for calculating dot product.
 */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }
 /**
 * MIC device function for calculating dot product.
 */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
  return (dx * dx + dy * dy + dz * dz);
 }
 /**
 * MIC device function to check if particle is still in material.
 */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
  return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }
 /**
 * MIC device function to calculate arbitrary rotation.
 */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
 /**
 * MIC device function to calculate Coulomb scattering for one particle.
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * Uses AoS to store particle positions and momentum, paralelized using OpenMP.
 * For details on the algorithm see OPAL user guide.
 * Deprecated on favor of SoA data layout.
 */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
 }
 /**
 * MIC device function to calculate Coulomb scattering for one particle.
 * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
 * Uses SoA to store particle positions and momentum, paralelized using OpenMP.
 * For details on the algorithm see OPAL user guide.
 */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+void coulombScat(double *rx, double *ry, double *rz, 
 		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {
- 
+
  //arrays for temporary storage, each core proceses MIC_WIDTH particles
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
  //simd instruction tells the compiler its safe to vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
@@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  //vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
@@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
 }
 /**
 * MIC device function to calculate energyLoss for one particle.
 * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
 * algorith are available in OPAL user guide.
 */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
@@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
    pdead = 1;
 }
 /**
 * MIC device function to calculate energyLoss for one particle.
 * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
 * algorith are available in OPAL user guide.
 */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
@@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
  double *par = (double*) par_ptr;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
@@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
      //for loop trough particles if not checkhit set label to -2 and update R.x
 #pragma omp for simd
      for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {
-
+  //cast device memory pointers to appropriate types
  int *label = (int*)label_ptr;
  unsigned *localID = (unsigned*)localID_ptr;
  double *rx = (double*)rx_ptr;
@@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@@ -26,6 +26,12 @@ typedef struct {
 } MIC_PART_SMALL;
 /**
 * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Implementes OPALs collimator physics class for particle matter interactions using OpenMP
 * and offload mode targetomg Intel Xeon Phi processors.
 * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
 */
 class MICCollimatorPhysics : public DKSCollimatorPhysics {
 private:
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "MICBase.h"
 /** 
 * MIC FFT based on BaseFFT interface.
 * uses MKL library to offload FFT on Intel Xeon Phi devices.
 */
 class MICFFT : public BaseFFT {
 private:
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@@ -15,6 +15,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
 /** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class MICGreensFunction : public GreensFunction {
 private:
--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  return p;
 }
 /**
 * Merge sort implementation for intel MIC.
 * Paralellized over all the MIC cores using OpenMP tasks.
 */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
@@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
  }
 }
 /**
 * Quicksort algorithm, developed for use on Intel MIC devices.
 */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
@@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
 }
 /** 
 * Insertion sort of @p list, developed for use on Intel MIC.
 * Used by quick_sort to sort small lists.
 */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@@ -1,16 +1,3 @@
 /*
  Name: OpenCLBase
  Author: Uldis Locans
  Info: OpenCL base class to handle all the common details associated 
  with kernel launch on OpenCL device
  Date: 2014.09.18
 */
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE
@@ -32,7 +19,7 @@
 #include "../DKSDefinitions.h"
-/* struct for random number state */
+/** struct for random number state. */
 typedef struct {
  double s10;
  double s11;
@@ -44,168 +31,194 @@ typedef struct {
  bool gen;
 } RNDState;
 /**
 * OpenCL base class to handle device setup and basic communication wiht the device.
 * Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
 * The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
 * class contains methods to read the kernel files, compile the kernel codes and launch kernels
 * from the compiled codes. Which kernel file needs to be loaded for the specif functin is 
 * handled by the base class that is launching the kernel.
 */
 class OpenCLBase {
 private:
-	
+
  //variables containig OpenCL device and platform ids
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;
  //variables containit compiled OpenCL program and kernel
  cl_context_properties m_context_properties[3];
  cl_program m_program;
  cl_kernel m_kernel;
  //variables for tracking OpenCL events
  static cl_event m_last_event;
  cl_int m_num_events;
  std::vector<cl_event> m_events;
  //currently load kernel file
  char * m_kernel_file;
  //type of device used by OpenCL
  cl_device_type m_device_type;
-  /*
+  /**
-    Name: getPlatforms
+   * Get all available OpenCL platforms.
-    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
+   * Get all avaialble platforms and save in m_platform_ids, save number of platforms
-    Return: success or error code
+   *  Return: success or error code
-  */
+   */
  int ocl_getPlatforms();
-  /*
+  /**
-    Name: getDevice
+   * Get first available OpenCL device of specified type.
-    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
+   * Get first avaialble devices and save device id and platform id for this device, 
-    ReturnL success or error code
+   * device name: (-gpu, -mic, -cpu)
-  */
+   *  ReturnL success or error code
   */
  int ocl_getDevice(const char* device_name);
-  /*
+  /**
-    Name getDeviceType
+   * Get cl_device_type from the specified device name.
-    Info: get device type from device name (-gpu, -cpu, -mic)
+   * get device type from device name (-gpu, -cpu, -mic)
-    Return: success or error code
+   *  Return: success or error code
-  */
+   */
  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
-  /*
+  /**
-    Name: createContext
+   * Create OpenCL context with specified device.
-    Info: create context with specified device
+   *  Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_createContext();
-  /*
+  /**
-    Name: buildProgram
+   * Build program from specified kernel file.
-    Info: build program from specified kernel file
+   * Return: success or error code.
    Return: success or error code
  */
  int ocl_buildProgram(const char* kernel_file);
-  /** Compile program from kernel source string
+  /** 
-   *
+   * Compile program from kernel source string.
   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
   * OpenCL program, that can be then executed on the device.
   * opts is a string specifiend additional compiler flags.
   */
  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
 protected:
  //memory for random number states
  int defaultRndSet;
  cl_mem defaultRndState;
 public:
  //OpenCL context and commad queue
  static cl_context m_context;
-  static cl_command_queue m_command_queue;
+  static cl_command_queue m_command_queue; 
-  /*
+  /**
-    constructor
+   * constructor
-  */
+   */
  OpenCLBase();
-  /*
+  /**
-    destructor
+   * destructor
-  */
+   */
  ~OpenCLBase();
-  /*
+  /**
-    Create RND states
+   * Allocate memory for size random number states and init the rnd states.
-    Return: success or error code
+   * Uses AMD clRng library for random numbers. 
-  */
+   * This library is only compatible with AMD devices.
   */
  int ocl_createRndStates(int size);
-  /* Create an array of random numbers on the device
+  /** 
-   *
+   * Create an array of random numbers on the device.
   * Filles hte mem_ptr with random numbers.
   */
  int ocl_createRandomNumbers(void *mem_ptr, int size);
-  /*
+  /**
-    Destroy rnd states
+   * Destroy rnd states and free device memory.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  int ocl_deleteRndStates();
-  /*
+  /**
-    Name: getAllDevices
+   * Prints info about all the available platforms and devices.
-    Info: get all available devices
+   * Can be used for information purposes to see what devices are available on the system.
-    ReturnL success or error code
+   * ReturnL success or error code.
  */
  int ocl_getAllDevices();
-  /** Get the OpenCL device count for the set type of device
+  /** 
-   *
+   * Get the OpenCL device count for the set type of device.
   * Device count is set in ndev parameter, returns success or error code.
   */
  int ocl_getDeviceCount(int &ndev);
-  /** Get the name of the device used
+  /** 
   * Get the name of the device currently us use.
   */
  int ocl_getDeviceName(std::string &device_name);
-  /** Set the device to use for OpenCL kernels.
+  /** 
-   *  device id to use is passed as integer.
+   * Set the device to use for OpenCL kernels.
   * Device id to use is passed as integer.
   */
  int ocl_setDevice(int device);
-  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
+  /** 
-   *  Used when GPUs of different types might be pressent on the system.
+   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
   * Used when GPUs of different types might be pressent on the system.
   */
  int ocl_getUniqueDevices(std::vector<int> &devices);
-  /*
+  /**
-    Name: setUp
+   * Initialize OpenCL connection with a device of specified type.
-    Info: set up opencl resources
+   * Find if specified device is avaialble, creates a contex and command queue.
-    Return: success or error code
+   * Returns success or error code.
-  */
+   */
  int ocl_setUp(const char* device_name);
-  /*
+  /**
-    Name: loadKernel
+   * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
-    Info: load and compile opencl kernel file if it has changed
+   * Load and compile opencl kernel file if it has changed.
-    Return: success or error code
+   * Return: success or error code
  */
  int ocl_loadKernel(const char* kernel_file);
-  /** Build program from kernel source.
+  /** 
   * Build program from kernel source.
   * Builds a program from source code provided in kernel_source.
   * If compilation fails will return DKS_ERROR
   */
  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
-  /*
+  /**
-    Name: allocateMemory
+   * Allocate memory on the device.
-    Info: allocate memory on device
+   * Return: return pointer to memory
    Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);
-  /*
+  /**
-    Name: allocateMemory
+   * Allocate memory of specific type on device.
-    Info: allocate memory on device
+   * The availabel types are cl_mem_flags type listed in OpenCL documentation:
-    Return: return pointer to memory
+   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
-  /** Zero OpenCL memory buffer
+  /** 
-   *  Set all the elemetns in the device array to zero
+   * Zero OpenCL memory buffer.
   * Set all the elemetns in the device array to zero.
   */
  template <typename T>
  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@@ -218,92 +231,90 @@ public:
    return DKS_SUCCESS;
  }
-  /*
+  /**
-    Name: writeData
+   * Write data to device memory (needs ptr to mem object)
-    Info: write data to device memory (needs ptr to mem object)
+   * Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
-  /*
+  /** 
-    Name: copyData
+   * Copy data from one buffer on the device to another
-    Info: copy data from one buffer on the device to another
+   * Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
-  /*
+  /** 
-    Name: createKernel
+   * Create kernel from compiled OpenCL program.
-    Info: create kernel from program
+   * Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_createKernel(const char* kernel_name);
-  /*
+  /**
-    Name: setKernelArgs
+   * Set argiments for the kernel that will be launched.
-    Info: set opencl kernel arguments
+   * Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
-  /*
+  /**
-    Name: executeKernel
+   * Execute selected kernel.
-    Info: execute selected kernel (needs kernel parameters)
+   * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
-    Return: success or error code
+   * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
   * kernel arguments must be set.
   * Return: success or error code
  */
  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
-  /*
+  /**
-    Name: readData
+   * Read data from device (needs pointer to mem object).
-    Info: read data from device (needs pointer to mem object)
+   * Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
-  /*
+  /**
-    Name: freeMemory
+   * Free device memory (needs ptr to mem object).
-    Info: free device memory (needs ptr to mem object)
+   *  Return: success or error code
-    Return: success or error code
+   */
  */
  int ocl_freeMemory(cl_mem mem_ptr);
-  /*
+  /**
-    Name: cleanUp
+   * Free opencl resources.
-    Info: free opencl resources
+   * Deletes the kernel, compiled program, command queue and colese the connection
-    Return: success or error code
+   * to device by releasing the context.
-  */
+   * Return: success or error code
   */
  int ocl_cleanUp();
-  /*
+  /**
-    Name: deviceInfo
+   * Print info of currently selected device.
-    Info: print device info (mostly for debugging purposes)
+   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
-    Return: success or error code
+   * Return: success or error code
-  */
+   */
  int ocl_deviceInfo(bool verbose = true);
-  /* Check OpenCL kernel.
+  /* 
-   * Query device and check if it can run the kernel with required parameters
+   * Check OpenCL kernel.
   * Query device and check if it can run the kernel with required parameters.
   * Also check the available OpenCL extensions - usefull for checking the supported device
   * features, like double precission.
   */
  int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
-  /*
+  /**
-    Name: clearEvents
+   * Clear the event list.
-    Info: clear saved events (for debuging purposes)
+   * Events can be used for timing and synchronization purposes.
-    Return: nothing
+   */
  */
  void ocl_clearEvents();
-  /*
+  /**
-    Name: eventInfo
+   * print information about kernel timings from event list.
-    Info: print information about kernel timings (for debuging purposes)
+   * for debuging purposes
-    Return: nothing
+   */
  */
  void ocl_eventInfo();
-  /*
+  /**
-    Return current command queue
+   * Return current command queue.
-  */
+   */
  cl_command_queue ocl_getQueue() { return m_command_queue; }
 };
--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
-
+/** Deprecated, SimpleFit implementation of ChiSquare. */
 class OpenCLChiSquare {
 private:
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }
 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
  //write params to gpu
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
  return ierr;
 }
@@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  if (numfunc == 0)
    return DKS_SUCCESS;
  //write function values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
  return ierr;
 }
@@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  if (nummap == 0)
    return DKS_SUCCESS;
  //wrtie map values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
  return ierr;
 }
@@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
    freeChiSquare();
  }
-  //allocate temporary memory
+  //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
  if (size_func == 0)
@@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
  int ierr = DKS_ERROR;
  if (initDone_m) {
-    //free memory
+    //free GPU memory
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
    return DKS_ERROR;
  }
  //check the GPU kernel
  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
  return ierr;
--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
 const std::string openclFunctFooter = "}\n";
 /**
 * OpenCL implementation of ChiSquareRuntime class.
 * Implements ChiSquareRuntime interface to allow musrfit to target devices that
 * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
 */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {
 private:
  OpenCLBase *m_oclbase;
-  /** Private function to add user defined function to kernel string
+  /** 
-   *
+   * Private function to add user defined function to kernel string.
   */
  std::string buildProgram(std::string function);
  /**
   * Launch parallel reduction kernel to calculate the sum of data array
   */
  double calculateSum(cl_mem data, int length);
 public:
-  /** Constructor wiht openclbase argument
+  /** 
-   *
+   * Constructor wiht openclbase argument.
   */
  OpenCLChiSquareRuntime(OpenCLBase *base);
-  /** Default constructor
+  /** 
-   *
+   * Default constructor
   */
  OpenCLChiSquareRuntime();
-  /** Default destructor
+  /** 
-   *
+   * Default destructor
   */
  ~OpenCLChiSquareRuntime();
-    /** Compile program and save ptx.
+  /** 
   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);
-  /** Launch selected kernel
+  /** 
   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
   * Result is put in &result variable
   */
@@ -64,22 +74,26 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);
-  /** Write params to device.
+  /** 
   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 
-  /** Write functions to device.
+  /** 
   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);
-  /** Write maps to device.
+  /** 
   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);
-  /** Allocate temporary memory needed for chi square.
+  /** 
   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@@ -87,14 +101,16 @@ public:
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
-  /** Free temporary memory allocated for chi square.
+  /** 
   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();
-  /** Check MuSR kernels for necessary resources.
+  /** 
   * Check MuSR kernels for necessary resources.
   * Query device properties to get if sufficient resources are
-   * available to run the kernels
+   * available to run the kernels. Also checks if double precission is enabled on the device.
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock);
--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@@ -17,12 +17,16 @@
 #include "boost/compute/core.hpp"
 */
 /** Double3 structure for use in OpenCL code. */
 typedef struct {
  double x;
  double y;
  double z;
 } Double3;
 /**
 * Structure for stroing particles in OpenCL code.
 */
 typedef struct {
  int label;
  unsigned localID;
@@ -35,6 +39,10 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
 /**
 * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
 * Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
 */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
 private:
@@ -42,16 +50,20 @@ private:
 public:
-  /* constructor */
+  /** 
   * Constructor with OpenCLBase as argument.
   * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
   */
  OpenCLCollimatorPhysics(OpenCLBase *base) { 
    m_oclbase = base;
  }
-  /* destructor */
+  /** 
   * Destructor.
   */
  ~OpenCLCollimatorPhysics() { 
  }
  /* execute degrader code on device */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherforScattering = true);
--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@@ -1,14 +1,3 @@
 /*
  Name: OpenCLFFT
  Author: Uldis Locans
  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
  Data: 19.09.2014
 */
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT
@@ -22,6 +11,12 @@
 #include "clFFT.h"
 /**
 * OpenCL FFT class based on BaseFFT interface.
 * Uses clFFT library to perform FFTs on AMD gpus.
 * clFFT library works also on nvida GPUs and other devices that
 * support OpenCL.
 */
 class OpenCLFFT : public BaseFFT {
 private:
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@@ -7,6 +7,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"
 /** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class OpenCLGreensFunction : public GreensFunction {
 private:
@@ -31,7 +32,7 @@ public:
  int buildProgram();
  /**
-    Info: calc itegral on device memory (taken from OPAL src code)
+    Info: calc itegral on device memory (taken from OPAL src code).
    Return: success or error code
  */
  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
@@ -39,20 +40,20 @@ public:
 		       int streamId = -1);
  /**
-    Info: integration of rho2_m field (taken from OPAL src code)
+    Info: integration of rho2_m field (taken from OPAL src code).
    Return: success or error code
  */
  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
  /**
-    Info: mirror rho field (taken from OPAL src code)
+    Info: mirror rho field (taken from OPAL src code).
    Return: succes or error code
  */
  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
  /**
-    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
    Return: success or error code
  */
  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@@ -5,6 +5,10 @@
 #include <string>
 #include <sys/time.h>
 /**
 * Custom timer class.
 * Allows to insert timers in the code to get function exectution times.
 */
 class DKSTimer {
 private:
@@ -17,39 +21,45 @@ private:
 public:
-  /** Init DKSTimer by seting timer to zero  */
+  /** Init DKSTimer by seting timer to zero. */
  DKSTimer();
  ~DKSTimer();
-  /** Init the timer
+  /** 
-   *  Set the name for timer and clear all values
+   * Init the timer.
   * Set the name for timer and clear all values
   */
  void init(std::string n);
-  /** Start the timer.
+  /** 
-   *  Get the curret time with gettimeofday and save in timeStart
+   * Start the timer.
   * Get the curret time with gettimeofday and save in timeStart
   */
  void start();
-  /** Stop the timer 
+  /** 
-   *  Get the curretn time with gettimeofday and save in timeEnd
+   * Stop the timer.
-   *  Calculate elapsed time by timeEnd - timeStart and add to timervalue
+   * Get the curretn time with gettimeofday and save in timeEnd
   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
   */
  void stop();
-  /** Reset timervalue to zero.
+  /** 
-   *  Set timervalue, timeStart and timeEnd to zero
+   * Reset timervalue to zero.
   * Set timervalue, timeStart and timeEnd to zero
   */
  void reset();
-  /** Return elapsed time in seconds.
+  /** 
-   *  Return the value of timervalue
+   * Return elapsed time in seconds.
   * Return the value of timervalue
   */
  double gettime();
-  /** Print timer.
+  /** 
-   *  Print the elapsed time of the timer
+   * Print timer.
   * Print the elapsed time of the timer
   */
  void print();