diff --git a/src/Algorithms/ChiSquareRuntime.h b/src/Algorithms/ChiSquareRuntime.h
index 85e8d9f..68e794b 100644
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@@ -15,6 +15,9 @@
 
 class DKSBaseMuSR;
 
+/** 
+ * Interface to implement ChiSquareRuntime class for musrfit.
+ */
 class ChiSquareRuntime {
   friend class DKSBaseMuSR;
 
@@ -63,23 +66,54 @@ public:
   /** Default constructor */
   //ChiSquareRuntime();
 
-  /** Default destructor */
+  /** Default destructor. */
   virtual ~ChiSquareRuntime() { };
 
+  /**
+   * Compile GPU programm generated at runtime.
+   */
   virtual int compileProgram(std::string function, bool mlh = false) = 0;
+
+  /**
+   * Launche the compiled chiSquare kernel.
+   */
   virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;
 
+  /** 
+   * Write the parameter values to the GPU.
+   */
   virtual int writeParams(const double *params, int numparams) = 0;
+
+  /**
+   * Write the function values to the GPU.
+   */
   virtual int writeFunc(const double *func, int numfunc) = 0;
+
+  /**
+   * Write map values to the GPU.
+   */
   virtual int writeMap(const int *map, int nummap) = 0;
+
+  /**
+   * Allocate temporary memory needed for the chi square calucaltios on the device.
+   */
   virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
+
+  /**
+   * Free device memory allocated for chi square calculations.
+   */
   virtual int freeChiSquare() = 0;
+
+  /**
+   * Check if available device can run the chi square GPU code.
+   */
   virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
 
-  /** Set N0, tau and bgk values to use for the kernel.
+  /** 
+   * Set N0, tau and bgk values to use for the kernel.
    * If values changes between data sets this needs to be called before
    * every kernel call. Returns DKS_SUCCESS.
    */
@@ -91,7 +125,8 @@ public:
     return DKS_SUCCESS;
   }
 
-  /** Set alpha and beta values to use for the kernel.
+  /** 
+   * Set alpha and beta values to use for the kernel.
    * If values changes between data sets this needs to be called before
    * every kernel call. Returns DKS_SUCCESS.
    */
@@ -101,8 +136,9 @@ public:
     return DKS_SUCCESS;
   }
 
-  /** Set number of blocks and threads.
-   *  Used to set parameters obtained from auto-tuning
+  /** 
+   * Set number of blocks and threads.
+   * Used to set parameters obtained from auto-tuning
    */
   int setKernelParams(int numBlocks, int blockSize) {
     int ierr = DKS_ERROR;
@@ -118,8 +154,9 @@ public:
     return ierr;
   }
 
-  /** Get the number of operations in compiled kernel.
-   *  Count the number of operation in the ptx file for the compiled program.
+  /** 
+   * Get the number of operations in compiled kernel.
+   * Count the number of operation in the ptx file for the compiled program.
    */
   int getOperations(int &oper) {
 
diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h
index b6c4c61..cbc497d 100644
--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@@ -5,6 +5,9 @@
 #include <string>
 #include "../DKSDefinitions.h"
 
+/**
+ * Interface to impelment particle matter interaction for OPAL.
+ */
 class DKSCollimatorPhysics {
 
 protected:
@@ -15,29 +18,61 @@ protected:
 public:
   
   virtual ~DKSCollimatorPhysics() { }
-  
+
+  /** 
+   * Execute collimator physics kernel.
+   *
+   */  
   virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
 				bool enableRutherforScattering = true) = 0;
 
+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
   virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
   
+  /** 
+   * Sort particle array on GPU.
+   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
+   * array so these particles are at the end of array
+   */
   virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
 
+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
   virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;
 
+  /** 
+   * BorisPusher push function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
   virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;
 
+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
   virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 
 
+  /** 
+   * BorisPusher push function with transformto function form OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
   virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h
index 6af8d35..6bf12fc 100644
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@@ -6,12 +6,21 @@
 
 #include "../DKSDefinitions.h"
 
+/**
+ * Abstract class defining methods for DKS FFT class.
+ * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
+ */
 class BaseFFT {
 
 protected:
   int defaultN[3];
   int defaultNdim;
 
+  /**
+   * Check if FFT plan is created for the needed dimension and FFT size.
+   * Returns true if the plan has been created and false if no plan for specified dimension
+   * and size exists.
+   */
   bool useDefaultPlan(int ndim, int N[3]) {
     if (ndim != defaultNdim)
       return false;
@@ -24,18 +33,57 @@ public:
 
   virtual ~BaseFFT() { }
 
+  /** Setup FFT - init FFT library used by chosen device. */
   virtual int setupFFT(int ndim, int N[3]) = 0;
+
+  /** Setup real to complex FFT - init FFT library used by chosen device. */
   virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
   virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Clean up. */
   virtual int destroyFFT() = 0;
+
+  /** 
+   * Exectute C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
   virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
+
+  /** 
+   * Exectute inverse C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
   virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /**
+   * Normalize the FFT or IFFT.
+   * mem_ptr - memory to complex data.
+   */
   virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /** 
+   * Exectute R2C FFT.
+   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
+   * results for the FFT are stored as complex numbers.
+   */
   virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /** 
+   * Exectute C2R FFT.
+   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
+   */
   virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /**
+   * Normalize CR FFT.
+   */
   virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
 
 };
diff --git a/src/Algorithms/GreensFunction.h b/src/Algorithms/GreensFunction.h
index 45674ec..8cc8ccc 100644
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@@ -4,24 +4,27 @@
 #include <iostream>
 #include <cmath>
 
+/**
+ * Interface to implement Greens function calculations for OPAL.
+ */
 class GreensFunction {
 
 public:
   
   virtual ~GreensFunction() { }
 
-  /** calc greens integral, as defined in OPAL */
+  /** calc greens integral, as defined in OPAL. */
   virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
 
-  /** integration if rho2_m, see OPAL for more details */
+  /** integration if rho2_m, see OPAL for more details. */
   virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;
 
-  /** mirror rho2_m field */
+  /** mirror rho2_m field. */
   virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
 
-  /** multiply two complex fields from device memory */
+  /** multiply two complex fields from device memory. */
   virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
 
 };
diff --git a/src/Algorithms/ImageReconstruction.h b/src/Algorithms/ImageReconstruction.h
index 3a6266e..1c63082 100644
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@@ -5,17 +5,22 @@
 
 #define BLOCK_SIZE 128
 
+/** Struct to hold voxel position for PET image. */
 struct VoxelPosition {
   float x;
   float y;
   float z;
 };
 
+/** Struct that holds pair of detectors that registered an envent. */
 struct ListEvent {
   unsigned detA : 16;
   unsigned detB : 16;
 };
 
+/**
+ * Interface to implement PET image reconstruction.
+ */
 class ImageReconstruction {
 
 protected:
@@ -25,7 +30,8 @@ public:
 
   virtual ~ImageReconstruction() { }
   
-  /** Caluclate source.
+  /** 
+   * Caluclate source.
    *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
    *  that are inside this sphere. All the sphere used have the same diameter.
    */
@@ -33,7 +39,8 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;
 
-  /** Calculate background.
+  /** 
+   * Calculate background.
    * Places two sphere at each voxel position, calculates the avg value and std value of pixels
    * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
    * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@@ -42,7 +49,8 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;
 
-  /** Caluclate source using differente sources.
+  /** 
+   * Caluclate source using differente sources.
    * Places two sphere at each voxel position, calculates the avg value and std value of pixels
    * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
    * each sphere is given by *diameter array.
@@ -52,7 +60,7 @@ public:
 			       int total_sources, int start = 0) = 0;
 
   /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
    * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
    * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
    * smaller sphere.
@@ -61,7 +69,8 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;
 
-  /** Generate normalization.
+  /** 
+   * Generate normalization.
    * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
    * that updates voxel values in the image on the slope between these two detectors.
    */
@@ -69,14 +78,16 @@ public:
 				 void *det_position, int total_det) = 0; 
 
 
-  /** Calculate forward projection.
+  /** 
+   * Calculate forward projection.
    * For image reconstruction calculates forward projections.
    * see recon.cpp for details
    */
   virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;
 
-  /** Calculate backward projection.
+  /** 
+   * Calculate backward projection.
    * For image reconstruction calculates backward projections.
    * see recon.cpp for details
    */
@@ -84,29 +95,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;
 
-  /** Set the voxel dimensins on device.
-   * 
+  /** 
+   *Set the voxel dimensins on device. 
    */
   virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
 
-  /** Set the image edge variables on the device.
-   * 
+  /** 
+   * Set the image edge variables on the device.
    */
   virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
 
-  /** Set the image edge1 on the device.
-   * 
+  /** 
+   * Set the image edge1 on the device.
    */
   virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
 
-  /** Set the minimum crystan in one ring values on the device.
-   * 
+  /** 
+   * Set the minimum crystan in one ring values on the device.
    */
   virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;
 
-  /** Set all other required parameters for reconstruction.
-   * 
+  /** 
+   * Set all other required parameters for reconstruction.
    */
   virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
diff --git a/src/AutoTuning/DKSAutoTuning.h b/src/AutoTuning/DKSAutoTuning.h
index ca8f3a3..c14e7c1 100644
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@@ -18,6 +18,17 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 
+/** 
+ * DKS autotuning class, allows to auto-tune the defince function.
+ * Executes the defined function for auto-tuning and searches for optimal parameters to improve
+ * the function execution time. The function that is auto-tuned, parameters and the ranges
+ * need to be set. Includes multiple search methods, that searches the parameter space to finde 
+ * the optimal solution.
+ *  1) exaustive search
+ *  2) line search
+ *  3) hill climbimg
+ *  4) simulated annealing
+ */
 class DKSAutoTuning {
 
 private:
@@ -36,12 +47,13 @@ private:
 
   int loops_m;
 
-  /** Update parameters from a state */
+  /** Update parameters from a state. */
   int setParameterValues(States states);
 
-  /** Evaluate the function and set execution time 
-   *  Returns DKS_ERROR if errors occured during function execution. 
-   *  Returns DKS_SUCCESS if function executed as planned. 
+  /** 
+   * Evaluate the function and set execution time 
+   * Returns DKS_ERROR if errors occured during function execution. 
+   * Returns DKS_SUCCESS if function executed as planned. 
    */
   int evaluateFunction(double &value);
 
@@ -50,12 +62,13 @@ public:
   /** Constructor */
   DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
 
-  /** Destructor */
+  /** Destructor. */
   ~DKSAutoTuning();
 
-  /** Set function to auto tune.
-   *  Caller of setFunction is responsible to bind the correct parameters 
-   *  to the function with std::bind.
+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
    */
   void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
     f_m = f;
@@ -63,15 +76,21 @@ public:
     evaluate_time_m = evaluate_time;
   }
 
+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
+   */
   void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
     fd_m = f;
     function_name_m = name;
     evaluate_time_m = evaluate_time;
   }
 
-  /** Set parameter for auto tuning.
-   *  Provide a pointer to a parameter that will be changed during auto-tuning
-   *  and a min-max value for this element
+  /** 
+   * Set parameter for auto tuning.
+   * Provide a pointer to a parameter that will be changed during auto-tuning
+   * and a min-max value for this element
    */
   template <typename T1>
   void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@@ -85,9 +104,9 @@ public:
   /** Perform exaustive search evaluating all the parameter configurations */
   void exaustiveSearch();
 
-  /** Perform auto-tuning.
-   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
-   *  parameters constant.
+  /**
+   * Perform line-search auto-tuning by variying parameters one at a time.
+   * After one parameter is auto-tuned the next on is varied
    */
   void lineSearch();  
 
diff --git a/src/AutoTuning/DKSAutoTuningTester.h b/src/AutoTuning/DKSAutoTuningTester.h
index 9c44309..96de6ba 100644
--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <cmath>
 
+/** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {
 
   friend class DKSBaseMuSR;
diff --git a/src/AutoTuning/DKSConfig.h b/src/AutoTuning/DKSConfig.h
index 037a282..c747067 100644
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@@ -1,9 +1,3 @@
-/** Class to save and load DKS autotunning configs.
- * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
- * Uses boost xml_parser to read and write the xml file and boost property tree to store
- * the xml content.
- */
-
 #ifndef DKS_CONFIG
 #define DKS_CONFIG
 
@@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";
 
+/** Class to save and load DKS autotunning configs.
+ * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
+ * Uses boost xml_parser to read and write the xml file and boost property tree to store
+ * the xml content.
+ * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
+ * so the auto-tuning has no effect.
+ */
 class DKSConfig {
 
 private:
diff --git a/src/AutoTuning/DKSSearchStates.h b/src/AutoTuning/DKSSearchStates.h
index cdd8fb0..dbc8d23 100644
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@@ -9,6 +9,9 @@
 
 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
 
+/** 
+ * Parameter class allows to change the searchable parameters during the auto-tuning.
+ */
 class Parameter {
 
 private:
@@ -64,6 +67,10 @@ public:
 
 };
 
+/**
+ * Struct to hold a auto-tuning state.
+ * Holds the current value, min, max and a step to witch a state can change.
+ */ 
 struct State {
   double value;
   double min;
@@ -74,6 +81,12 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;
 
+/** 
+ * Used by auto-tuning search algorithms to move between parameter configurations.
+ * Allows to move from one parameter stat to another, get neighboring states, 
+ * move to neighboring states and save state information. Print functions are available
+ * for debugging purposes, to follow how algorithm muves between sates.
+ */
 class DKSSearchStates {
 
 private:
diff --git a/src/CUDA/CudaBase.cu b/src/CUDA/CudaBase.cu
index 6317567..51f2729 100644
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
 	
   return DKS_SUCCESS;
 }
-		
-/*
-  Info: allcate memory and write data (push)
-  Return: pointer to memory object
-*/
-/*
-  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
-
-  void * mem_ptr;
-  mem_ptr = cuda_allocateMemory(size, ierr);
-	
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_writeData(mem_ptr, in_data, size);
-		
-  return mem_ptr;
-  }
-*/
-		
-/*
-  Info: read data and free memory (pull)
-  Return: success or error code
-*/
-/*
-  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
-
-  ierr = cuda_readData(mem_ptr, out_data, size);
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_freeMemory(mem_ptr);	
-  else
-  return DKS_ERROR;
-	
-	
-  if (ierr == DKS_SUCCESS)	
-  return DKS_SUCCESS;
-  else
-  return DKS_ERROR;
-  }
-*/
-
-/*
-  Info: execute function
-  Return: success or error code
-*/
-int CudaBase::cuda_executeFunction() {
-
-  std::cout << "Execute function" << std::endl;
-  return DKS_SUCCESS;
-}
-		
-/*
-  Info: clean up
-  Return: success or error code
-*/
-int CudaBase::cuda_cleanUp() {
-
-  std::cout << "clean up" << std::endl;
-  return DKS_SUCCESS;
-	
-}
diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh
index 8c8dbd6..ce1da4b 100644
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@@ -16,6 +16,11 @@
 
 #define BLOCK_SIZE 128
 
+/**
+ * CUDA base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement, data transfers and stream setup for 
+ * asynchronous data transfers and kernel executions.
+ */
 class CudaBase {
 
 private:
@@ -48,17 +53,17 @@ public:
   /**
    * Delete curandState.
    * Delete curandState array on the GPU and free memory.
-   *  Return success or error code
+   * Return success or error code
    */
   int cuda_deleteCurandStates();
 
-  /** Create 'size' random numbers on the device and save in mem_ptr array
-   *
+  /** 
+   * Create 'size' random numbers on the device and save in mem_ptr array.
    */
   int cuda_createRandomNumbers(void *mem_ptr, int size);
 
-  /** Get a pointer to curand states
-   *
+  /** 
+   * Get a pointer to curand states.
    */
   curandState* cuda_getCurandStates();
 	
@@ -75,93 +80,98 @@ public:
   int cuda_addStream(cudaStream_t tmpStream, int &streamId);
 
   /**
-   * delete cuda stream
+   * delete cuda stream.
    * success or error code
    */
   int cuda_deleteStream(int id);
 
   /**
-   * delete all streams
+   * delete all streams.
    * success or error code
    */
   int cuda_deleteStreams();
 
   /**
-   * set stream to use
+   * set stream to use.
    * success or error code
    */
   int cuda_setStream(int id);
 
   /**
-   * Info: get stream that is used
-   *  Return: return id of curretn stream
+   * get stream that is used.
+   * Return: return id of curretn stream
    */
   int cuda_getStreamId();
 
   /**
-   * Info: reset to default stream
+   * reset to default stream.
    * Return: success or error code
    */
   int cuda_defaultStream();
 
   /**
-   * Info: get number of streams
+   * get number of streams.
    * Return: success or error code
    */
   int cuda_numberOfStreams();
 
   /**
-   * Info: get stream
+   * get stream.
    * Return: stream
    */
   cudaStream_t cuda_getStream(int id);
 
   /**
-   * Get default cublass handle
+   * Get default cublass handle.
    */
   cublasHandle_t cuda_getCublas();
 
   /**
-   * Info: get information on cuda devices
+   * get information on cuda devices.
    * Return: success or error code
    */
   int cuda_getDevices();
 
-  /** Get CUDA device count.
-   *  Sets the number of devices on the platform that can use CUDA.
-   *  Returns DKS_SUCCESS
+  /** 
+   * Get CUDA device count.
+   * Sets the number of devices on the platform that can use CUDA.
    */
   int cuda_getDeviceCount(int &ndev);
 
-  /** Get the name of the device.
-   *  QUery the device properties of the used device and set the string device_name
+  /** 
+   * Get the name of the device.
+   * QUery the device properties of the used device and set the string device_name
    */
   int cuda_getDeviceName(std::string &device_name);
 
-  /** Set CUDA device to use.
-   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
+  /** 
+   * Set CUDA device to use.
+   * If device passed in is larger than the number of devices use 
+   * the default:0 and return DKS_ERROR 
    */
   int cuda_setDevice(int device);
 
-  /** Get unique devices
-   *  Get array of indeces with the unique CUDA devices available on the paltform
+  /** 
+   * Get unique devices.
+   * Get array of indeces with the unique CUDA devices available on the paltform
    */
   int cuda_getUniqueDevices(std::vector<int> &devices);
 	
   /**
-   * Info: init device
+   * Initialize connection to the device.
+   * Only needed when runtime compilation is used.
    * Return: success or error code
    */
   int cuda_setUp();
 	
   /**
-   * Info: allocate memory on cuda device
+   * Allocate memory on cuda device.
    * Return: pointer to memory object
    */
   void * cuda_allocateMemory(size_t size, int &ierr);
 
   /**
-   * Info: allocate host memory in pinned memory
+   * Allocate host memory in pinned memory
    * Return: success or error code
    */
   template<typename T>
@@ -174,8 +184,9 @@ public:
     return DKS_SUCCESS;
   }		
 
-  /** Zero CUDA memory.
-   *  Set all the elements of the array on the device to zero.
+  /** 
+   * Zero CUDA memory.
+   * Set all the elements of the array on the device to zero.
    */
   template<typename T>
   int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
@@ -189,8 +200,9 @@ public:
     return DKS_SUCCESS;
   }
 
-  /** Zero CUDA memory.
-   *  Set all the elements of the array on the device to zero.
+  /** 
+   * Zero CUDA memory async.
+   * Set all the elements of the array on the device to zero.
    */
   template<typename T>
   int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
@@ -209,7 +221,7 @@ public:
   }
 
   /** 
-   * Info: write data to memory
+   * Write data to memory
    * Retrun: success or error code
    */
   template<typename T>
@@ -226,7 +238,7 @@ public:
   }
 
   /**
-   * Info: write data assynchonuously
+   * Write data assynchonuously
    * Return: success or error code
    */
   template<typename T>
@@ -258,7 +270,7 @@ public:
   }
 		
   /**
-   * Info: read data from memory
+   * Read data from memory
    * Return: success or error code
    */
   template<typename T>
@@ -275,7 +287,7 @@ public:
   }
 
   /**
-   * Info: read data async from device memory
+   * Read data async from device memory
    * Return: success or error code
    */
   template<typename T>
@@ -307,19 +319,19 @@ public:
   }
 		
   /**
-   * Info: free memory on device
+   * Free memory on device
    * Return: success or error code
    */
   int cuda_freeMemory(void * mem_ptr);
 	
   /**
-   * Info: free page locked memory on host
+   * Free page locked memory on host
    * Return: success or erro code
    */
   int cuda_freeHostMemory(void * mem_ptr);
 	
   /**
-   * Info: allcate memory and write data (push)
+   * Allcate memory and write data (push)
    * Return: pointer to memory object
    */
   template<typename T>
@@ -335,7 +347,7 @@ public:
   }
 		
   /**
-   * Info: read data and free memory (pull)
+   * Read data and free memory (pull)
    * Return: success or error code
    */
   template<typename T>
@@ -353,21 +365,10 @@ public:
     else
       return DKS_ERROR;
   }
-
-  /**
-   * Info: execute function
-   * Return: success or error code
-   */
-  int cuda_executeFunction();
-		
-  /**
-   * Info: clean up
-   * Return: success or error code
-   */
-  int cuda_cleanUp();
   
   /**
-   * Info: sync cuda device
+   * Sync cuda device.
+   * Waits till all the tasks on the GPU are finished.
    * Return: success or error code
    */
   int cuda_syncDevice() {
@@ -376,7 +377,7 @@ public:
   }
 
   /**
-   * Page-lock host memory
+   * Page-lock host memory.
    */
   template<typename T>
   int cuda_hostRegister(T *ptr, int size) {
@@ -390,7 +391,7 @@ public:
   }
 
   /**
-   * Release page locked memory
+   * Release page locked memory.
    */
   template<typename T>
   int cuda_hostUnregister(T *ptr) {
@@ -403,7 +404,7 @@ public:
   }
 
   /**
-   * Info: print device memory info (total, used, avail)
+   * Print device memory info (total, used, avail)
    * Return: success or error code
    */
   int cuda_memInfo() {
diff --git a/src/CUDA/CudaChiSquare.cuh b/src/CUDA/CudaChiSquare.cuh
index 588dec5..692362c 100644
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@@ -8,6 +8,7 @@
 
 #include "CudaBase.cuh"
 
+/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {
 
 private:
diff --git a/src/CUDA/CudaChiSquareRuntime.cuh b/src/CUDA/CudaChiSquareRuntime.cuh
index 79a9af5..581132e 100644
--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
 
 const std::string cudaFunctFooter = "}\n";
 
+/**
+ * CUDA implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
+ */
 class CudaChiSquareRuntime : public ChiSquareRuntime{
 
 private:
@@ -29,65 +33,72 @@ private:
 
   cublasHandle_t defaultCublasRT;
 
-  /** Setup to init device
-   *  Create context and init device for RT compilation
+  /** 
+   * Setup to init device.
+   * Create context and init device for RT compilation
    */
   void setUpContext();
 
-  /** Private function to add function to kernel string
-   *
+  /** 
+   * Private function to add function to kernel string.
    */
   std::string buildProgram(std::string function);
 
 public:
 
-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase argument
    */
   CudaChiSquareRuntime(CudaBase *base);
 
-  /** Default constructor init cuda device
-   *
+  /** 
+   * Default constructor init cuda device
    */
   CudaChiSquareRuntime();
   
-  /** Default destructor
-   *
+  /** 
+   * Default destructor.
    */
   ~CudaChiSquareRuntime();
 
-  /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
    * Add function string to the calcFunction kernel and compile the program
    * Function must be valid C math expression. Parameters can be addressed in
    * a form par[map[idx]]
    */
   int compileProgram(std::string function, bool mlh = false);
 
-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
    * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable
+   * Result is put in &result variable.
    */
   int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);
 
-  /** Write params to device.
+  /** 
+   * Write params to device.
    * Write params from double array to mem_param_m memory on the device.
    */
   int writeParams(const double *params, int numparams); 
 
-  /** Write functions to device.
+  /** 
+   * Write functions to device.
    * Write function values from double array to mem_func_m memory on the device.
    */
   int writeFunc(const double *func, int numfunc);
 
-  /** Write maps to device.
+  /** 
+   * Write maps to device.
    * Write map values from int array to mem_map_m memory on the device.
    */
   int writeMap(const int *map, int nummap);
 
-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
    * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
    * the maximum number of elements in any datasets that will be used for calculations. Size_param,
    * size_func and size_map are the maximum number of parameters, functions and maps used in 
@@ -96,14 +107,16 @@ public:
   int initChiSquare(int size_data, int size_param, int size_func, int size_map);
 
 
-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
    * Frees the chisq temporary memory and memory for params, functions and maps
    */
   int freeChiSquare();
 
-  /** Check if CUDA device is able to run the chi square kernel.
-   *  Redundant - all new CUDA devices that support RT compilation will also support 
-   *  double precision, there are no other requirements to run chi square on GPU
+  /** 
+   * Check if CUDA device is able to run the chi square kernel.
+   * Redundant - all new CUDA devices that support RT compilation will also support 
+   * double precision, there are no other requirements to run chi square on GPU
    */
   int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
     return DKS_SUCCESS;
diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu
index 4aca093..2c600d5 100644
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@@ -1,5 +1,6 @@
 #include "CudaCollimatorPhysics.cuh"
 
+//constants used in OPAL
 //#define M_P 0.93827231e+00
 #define M_P 0.93827204e+00
 #define C 299792458.0
@@ -11,6 +12,7 @@
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
 
+//parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@@ -28,12 +30,18 @@
 #define BLOCK_SIZE 128
 #define NUMPAR 13
 
+/**
+ * CUDA device function for calculating dot product.
+ */
 __device__ inline double dot(double3 &d1, double3 &d2) {
 
   return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 
 }
 
+/**
+ * CUDA devce function to calculate cross product.
+ */
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
   double3 tmp;
   tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
   return tmp;
 }
 
+/**
+ * CUDA device function to calculate arbitrary rotation.
+ */
 __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
   double c=cos(Theta);
   double s=sin(Theta);
@@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
   return tmp;
 } 
 
+/**
+ * CUDA device function to check if particle is still in material.
+ * z - particle position, par - parameter array. Particle is considered inside the
+ * material if z is > material starting position and z < material starting position - mat size.
+ */
 __device__ inline bool checkHit(double &z, double *par) {
 
   /* check if particle is in the degrader material */
@@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
 }
 
 
+/**
+ * CUDA device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {
 
@@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
 
 }
 
+/**
+ * CUDA device function for rotation in 2 dimensions.
+ * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
+ * "Passage of particles through matter"
+ */
 __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
 			   double &normP, double &thetacou, double &deltas, int coord,
 			   double *par) 
@@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
   pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
 
+/**
+ * CUDA device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * For details on the algorithm see OPAL user guide.
+ */
 __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
 				   bool enableRutherfordScattering) 
 {
@@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
 }
 
 
+/**
+ * CUDA kernel that performs one step in particle movement trough mater.
+ * One thread is launched for each particle in the simulation. The kernel checks if the particle
+ * is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
+ * particles that are exiting the material.
+ * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
 					int numparticles, bool enableRutherfordScattering)
@@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
   volatile int tid = threadIdx.x;
   volatile int idx = blockIdx.x * blockDim.x + tid;
 
-  //transfer params to shared memory
+  //transfer params and particle positions to shared memory
+  //R is kept in shared memory in order to reduce register pressure for the kernel
   extern __shared__ double smem[];
   double *p = (double*)smem;
-  double3 *R = (double3*)&smem[NUMPAR];
+  double3 *R = (double3*)&smem[NUMPAR]; 
 
-  curandState s; 
+  curandState s; //each tread gets its own cuRand state for random number generation
   double3 P;
 
+  //load parameters to shared memory
   for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
     p[tt] = par[tt];
 
+  //sync threads to ensure that parameters are finished loading
   __syncthreads();
 
+  //there might be some empty threads that do no work
   if (idx < numparticles) {
-    s = state[idx];
-    R[tid] = data[idx].Rincol;
-    P = data[idx].Pincol;
+    s = state[idx]; //load cuRand state to local memory
+    R[tid] = data[idx].Rincol; //load position to shared memory
+    P = data[idx].Pincol; //load momentum to local memory
 
     bool pdead = false;  
     volatile double sq = sqrt(1.0 + dot(P, P));
 
     double Eng;
     
+    //check if particle is still in the material
     if (checkHit(R[tid].z, p)) {      
 
+      //calculate enery loss
       Eng = (sq - 1) * M_P;
       energyLoss(Eng, pdead, s, p);
 
+      //check if particle is not dead
       if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));
 
+	//caluclate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
 	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 
 
+	//update particle momentum
 	data[idx].Pincol = P;
       } else {
+	//mark particle as dead (-1)
 	data[idx].label = -1;
       }
-    
+      
+      //update cuRand state
       state[idx] = s;
     } else {
-    
+      //particle exits material - drift and mark as exiting (-2)
       R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
       R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
       R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
       
     }
  
+    //update particle position
     data[idx].Rincol = R[tid];
   }
 
 }
 
-__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
-					 curandState *state, int numparticles,
-					 bool enableRutherfordScattering)
+/**
+ * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
+ * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
+ * Deprecated - GPU version does not use SoA.
+ * @param[in] data structure of arrays containing particle data
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
+__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
+					   curandState *state, int numparticles,
+					   bool enableRutherfordScattering)
 {
 
   //get global id and thread id
@@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
 
 }
 
-
+/**
+ * Device function to swich off unitless positions.
+ */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
   a.x *= c;
   a.y *= c;
   a.z *= c;
 }
 
+/**
+ * Device function to swich on unitless positions.
+ */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
   a.x /= c;
   a.y /= c;
   a.z /= c;
 }
 
-//swithch to unitless positions with dtc
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOn(R, dtc);
-    unitlessOn(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//swithc to unitless positions with dt*c
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-
-//swithc off unitless positions with dtc
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOff(R, dtc);
-    unitlessOff(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//switch off unitelss positions with dt*c
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-
 
+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] npart number of particles
+ * @param[in] dtc dt*c
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
 
   //get global id and thread id
@@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
   }
 }
 
-
+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
 
   //get global id and thread id
@@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
   }
 }
 
+/** 
+ * CUDA kernel to perform particle kick.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gEf 
+ * @param[in] *gBf
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
 			   double3 *gBf, double *gdt, double charge, 
 			   double mass, int npart, double c)
@@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
 
 }
 
-struct compare_particle
-{
-  int threshold;
-
-  compare_particle() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART p1, CUDA_PART p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct compare_particle_small
-{
-  int threshold;
-
-  compare_particle_small() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART_SMALL p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct less_then
-{
-  __host__ __device__
-  bool operator()(int x)
-  {
-    return x < 0;
-  }
-};
-
 int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
 					     bool enableRutherfordScattering)
 {
diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh
index fe2f275..fda8f46 100644
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@@ -20,7 +20,8 @@
 #include "CudaBase.cuh"
 
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU or MIC as AoS.
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
  */
 typedef struct __align__(16) {
   int label;
@@ -37,7 +38,10 @@ typedef struct __align__(16) {
 } CUDA_PART;
 
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as AoS
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
  */
 typedef struct {
   int label;
@@ -47,7 +51,8 @@ typedef struct {
 } CUDA_PART_SMALL;
 
 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as SoA.
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
  */
 typedef struct {
   int *label;
@@ -65,6 +70,9 @@ typedef struct {
 
 /**
  * Structure for storing particle on GPU
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
  */
 typedef struct {
   int *label;
@@ -73,11 +81,39 @@ typedef struct {
   double3 *Pincol;
 } CUDA_PART2_SMALL;
 
-/** CudaCollimatorPhysics class.
- * Contains kerenls that execute CollimatorPhysics functions form OPAL.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation
+/** 
+ * Operator used in thrust sort to compare particles by label.
+ * Used to move dead particles to the end of array, since they have label -1 or -2.
  */
-class CudaCollimatorPhysics : public DKSCollimatorPhysics{
+struct compare_particle_small
+{
+  int threshold;
+
+  compare_particle_small() {
+    threshold = 0;
+  }
+
+  void set_threshold(int t) {
+    threshold = t;
+  }
+
+  __host__ __device__
+  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
+    return p1.label > p2.label;
+  }
+
+  __host__  __device__
+  bool operator()(CUDA_PART_SMALL p1) {
+    return p1.label < threshold;
+  }
+};
+
+/** 
+ * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Contains kerenls that execute CollimatorPhysics functions form OPAL.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
+class CudaCollimatorPhysics : public DKSCollimatorPhysics {
 
 private:
 
@@ -86,32 +122,44 @@ private:
 
 public:
 
-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase as argument.
+   * Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
    */
   CudaCollimatorPhysics(CudaBase *base) {
     m_base = base;
     base_create = false;
   }
 
-  /** Constructor - empty. */
+  /** 
+   * Empty constructor.
+   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
+   */
   CudaCollimatorPhysics() { 
     m_base = new CudaBase();
     base_create = true;
   }
 
-  /** Destructor - empty */
+  /** 
+   * Destructor.
+   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
+   */
   ~CudaCollimatorPhysics() { 
     if (base_create)
       delete m_base;
   };
 
-  /** Execute collimator physics kernel.
+  /** 
+   * Execute collimator physics kernel.
    *
    */
   int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			int numpartices, bool enableRutherforScattering = true);
 
+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
   int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@@ -120,12 +168,17 @@ public:
       return DKS_ERROR;
     }
 
-  /** Sort particle array on GPU.
+  /** 
+   * Sort particle array on GPU.
    * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
    * array so these particles are at the end of array
    */
   int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
   
+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
   int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@@ -134,18 +187,25 @@ public:
       return DKS_ERROR;
     }
 
-  /** BorisPusher push function for integration from OPAL.
+  /** 
+   * BorisPusher push function for integration from OPAL.
    * ParallelTTracker integration from OPAL implemented in cuda.
    * For more details see ParallelTTracler docomentation in opal
    */
   int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);
 
+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
   int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 
 
-  /** BorisPusher push function with transformto function form OPAL
+  /** 
+   * BorisPusher push function with transformto function form OPAL.
    * ParallelTTracker integration from OPAL implemented in cuda.
    * For more details see ParallelTTracler docomentation in opal
    */
diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh
index 2fcbbec..f52ae9f 100644
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"
 
+/**
+ * Cuda FFT class based on BaseFFT interface.
+ * Uses cuFFT library to perform FFTs on nvidias GPUs.
+ */
 class CudaFFT : public BaseFFT {
 
 private:
@@ -34,7 +38,7 @@ public:
   ~CudaFFT();
 		
   /**
-   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Init cufftPlans witch can be reused for all FFTs of the same size and type
    * Return: success or error code
    */
   int setupFFT(int ndim, int N[3]);
@@ -42,45 +46,21 @@ public:
   int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
 
   /**
-   * Info: destroy default FFT plans
+   * Destroy default FFT plans
    * Return: success or error code
    */
   int destroyFFT();
 
-  /*
-    Info: execute complex to complex double precision fft using cufft library
-    Return: success or error code
-  */
   int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
 		
-  /*
-    Info: execute ifft 
-    Return: success or error code
-  */
   int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute normalize using cuda kernel for complex to complex iFFT
-    Return: success or error code
-  */
   int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute real to complex double precision FFT
-    Return: success or error code
-  */
   int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: exectue complex to real double precision FFT
-    Return: success or error code
-  */
   int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
 
-  /*
-    Info: execute normalize for complex to real iFFT
-    Return: success or error code
-  */
   int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
 
 };
diff --git a/src/CUDA/CudaGreensFunction.cuh b/src/CUDA/CudaGreensFunction.cuh
index 69f29de..57e8cd4 100644
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@@ -12,6 +12,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "CudaBase.cuh"
 
+/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class CudaGreensFunction : public GreensFunction{
 
 private:
diff --git a/src/CUDA/CudaImageReconstruction.cuh b/src/CUDA/CudaImageReconstruction.cuh
index 4cf532c..ab10c2c 100644
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@@ -10,6 +10,7 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"
 
+/** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {
 
 private:
diff --git a/src/DKSBase.h b/src/DKSBase.h
index be10898..d8eb708 100644
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@@ -1,11 +1,3 @@
-/** DKSBase class.
- * DKSBase.h
- * Author: Uldis Locans
- * Date: 15.09.2014
- * Base class of Dynamic Kernel Scheduler that handles the function calls
- * from host application to DKS
- */
-
 #ifndef H_DKS_BASE
 #define H_DKS_BASE
 
@@ -41,7 +33,12 @@
 
 #include "AutoTuning/DKSConfig.h"
 
-/** DKSBase class for handling function calls to DKS library */
+/** 
+ * API for handling communication function calls to DKS library.
+ * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
+ * memory manegement, data transfer and other basic communication functions between
+ * the host and device.
+ */
 class DKSBase {
 
 private:
@@ -74,7 +71,7 @@ protected:
   DKSConfig dksconfig;
 
   /** 
-   * Check if current API is set to OpenCL
+   * Check if current API is set to OpenCL.
    * Return true/false wether current api is opencl
    */
   bool apiOpenCL();
@@ -91,11 +88,11 @@ protected:
    */
   bool apiOpenMP();
 
-  /** Check if device is GPU */
+  /** Check if device is GPU. */
   bool deviceGPU();
-  /** Check if device is CPU */
+  /** Check if device is CPU. */
   bool deviceCPU();
-  /** Check if device is MIC */
+  /** Check if device is MIC. */
   bool deviceMIC();
 
   /**
diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h
index 9f640b6..00e95dc 100644
--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@@ -20,6 +20,11 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif
 
+/**
+ * API to handle musrfit calls to DKS library.
+ * Using ChiSquareRuntime interface allows to call chi square functions on the 
+ * GPU or CPU using CUDA or OpenCL.
+ */
 class DKSBaseMuSR : public DKSFFT {
 
 private:
diff --git a/src/DKSFFT.h b/src/DKSFFT.h
index cd87523..0e062c1 100644
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@@ -24,6 +24,10 @@
 #include "MIC/MICFFT.h"
 #endif
 
+/**
+ * API to handel calls to DKSFFT.
+ * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
+ */
 class DKSFFT : public DKSBase {
 
 private:
diff --git a/src/DKSImageReconstruction.h b/src/DKSImageReconstruction.h
index 32f67ef..2388309 100644
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@@ -10,6 +10,9 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif
 
+/**
+ * API to handle PET image reconstruction calls.
+ */
 class DKSImageRecon : public DKSBase {
 
 private:
@@ -22,87 +25,88 @@ public:
 
   ~DKSImageRecon();
 
-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
    */
   int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);
 
-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
    */
   int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);
 
 
-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
    */
   int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);
 
-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
    */
   int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);
 
-  /** Image reconstruction - generate normalization.
-   * 
+  /** 
+   * Image reconstruction - generate normalization.
    */
   int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);
 
-  /** Image reconstruction - forward correction.
-   * 
+  /** 
+   * Image reconstruction - forward correction.
    */
   int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);
 
-  /** Image reconstruction - backward projection.
-   * 
+  /** 
+   * Image reconstruction - backward projection.
    */
   int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);
 
-  /** Set the voxel dimensins on device.
+  /** 
+   * Set the voxel dimensins on device.
    * Values are stored in GPU memory and used in forward and backward projection calculations.
    * Call set function once to transfer the values from host side to GPU.
    * If value changes on the host side set functions needs to be called again to update GPU values.
    */
   int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
 
-  /** Set the image edge.
+  /** 
+   * Set the image edge.
    * Values are stored in GPU memory and used in forward and backward projection calculations.
    * Call set function once to transfer the values from host side to GPU.
    * If value changes on the host side set functions needs to be called again to update GPU values.
    */
   int setEdge(float x_edge, float y_edge, float z_edge);
 
-  /** Set the image edge1.
+  /** 
+   * Set the image edge1.
    * Values are stored in GPU memory and used in forward and backward projection calculations.
    * Call set function once to transfer the values from host side to GPU.
    * If value changes on the host side set functions needs to be called again to update GPU values.
    */
   int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
 
-  /** Set the minimum crystan in one ring values.
+  /** 
+   * Set the minimum crystal in one ring values.
    * Values are stored in GPU memory and used in forward and backward projection calculations.
    * Call set function once to transfer the values from host side to GPU.
    * If value changes on the host side set functions needs to be called again to update GPU values.
    */
   int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
 
-  /** Set all other required parameters for reconstruction.
+  /** 
+   * Set all other required parameters for reconstruction.
    * Values are stored in GPU memory and used in forward and backward projection calculations.
    * Call set function once to transfer the values from host side to GPU.
    * If value changes on the host side set functions needs to be called again to update GPU values.
diff --git a/src/DKSOPAL.h b/src/DKSOPAL.h
index 0577f2d..202654e 100644
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@@ -33,6 +33,11 @@
 #include "MIC/MICCollimatorPhysics.h"
 #endif
 
+/**
+ * API to handle OPAL calls to DKS library.
+ * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
+ * functions.
+ */
 class DKSOPAL : public DKSFFT {
 
 private: 
diff --git a/src/MIC/MICBase.h b/src/MIC/MICBase.h
index 0087778..9371dd5 100644
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@@ -26,6 +26,9 @@
 
 #define MIC_WIDTH 128
 
+/** MIC Base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement and  data transfers.
+ */
 class MICBase {
 
 private:
@@ -45,58 +48,60 @@ public:
 
   int m_device_id;
 
-  /* constructor */
+  /** constructor */
   MICBase();
 
-  /* destructor */
+  /** destructor */
   ~MICBase();
 
-  /*
-    Info: create MKL rand streams for each thread
-    Return: success or error code
-  */
+  /**
+   * Create MKL rand streams for each thread
+   *  Return: success or error code
+   */
   int mic_createRandStreams(int size);
 
-  /*
-    Info: delete MKL rand streams
-    Return: succes or error code
-  */
+  /**
+   * Delete MKL rand streams
+   * Return: succes or error code
+   */
   int mic_deleteRandStreams();
 
-  /*
-    Info: create a new signal for the mic
-    Return: success or error code
-  */
+  /**
+   * Create a new signal for the mic.
+   * Signals can be used for assynchronous data transfers.
+   * Return: success or error code
+   */
   int mic_createStream(int & streamId);
 
-  /*
-    Info: get the signal from the vector
-    Return: mic signal
+  /**
+   * Info: get the signal from the vector.
+   * Return: mic signal
   */
   int& mic_getStream(int id);
 
-  /*
-    Info: delete streams
-    Return: success or error code
-  */
+  /**
+   * Info: delete streams.
+   * Return: success or error code
+   */
   int mic_deleteStreams();
 
-  /*
-    Info: set device id
-    Return: success or error code
-  */
+  /**
+   * Info: set device id.
+   * Return: success or error code
+   */
   int mic_setDeviceId(int id);
 
-  /*
-    Info: get mic devices
-    Return: success or error code
-  */
+  /**
+   * Info: get mic devices.
+   * Prints information about mic devices.
+   * Return: success or error code
+   */
   int mic_getDevices();
 
-  /*
-    Info: allocate memory on MIC device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory on MIC device.
+   * Return: success or error code
+   */
   template<typename T>
   void * mic_allocateMemory(int size) {
 
@@ -109,10 +114,10 @@ public:
     return tmp;
   }
 
-  /*
-    Info: transfer data to device
-    Return: success or error code
-  */
+  /**
+   * Transfer data to device.
+   * Return: success or error code
+   */
   template<typename T>
   int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
     T* tmp_ptr = (T*)data_ptr;
@@ -123,10 +128,10 @@ public:
     return DKS_SUCCESS;
   }
 
-  /*
-    Info: write data to device, non-blocking
-    Return: success or error code
-  */
+  /**
+   * Write data to device, non-blocking.
+   * Return: success or error code
+   */
   template<typename T>
   int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
   {
@@ -139,10 +144,10 @@ public:
   }
   
 
-  /*
-    Info: read data from device
-    Return: success or error code
-  */
+  /**
+   * Read data from device
+   * Return: success or error code
+   */
   template<typename T>
   int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
     T* tmp_ptr = (T*)data_ptr;
@@ -154,10 +159,10 @@ public:
     return DKS_SUCCESS;
   }
 
-  /*
-    Info: read data from device waiting for signal
-    Return: success or error code
-  */
+  /**
+   * Read data from device waiting for signal
+   * Return: success or error code
+   */
   template<typename T>
   int mic_readDataAsync(const void * data_ptr, void * result, int size, 
 			int streamId = -1, int offset = 0) {
@@ -172,10 +177,10 @@ public:
 
   }
 
-  /* 
-     Info: wait till all the signals are complete
-     Return siccess or error code
-  */
+  /**
+   * Wait till all the signals are complete
+   * Return siccess or error code
+   */
   int mic_syncDevice() {
     
     //empty offload to wait for all the signals to finish and launch a new empy signal
@@ -193,10 +198,10 @@ public:
 
   }
 
-  /*
-    Info: free memory on device
-    Return: success or error code
-  */
+  /**
+   * Free memory on device
+   * Return: success or error code
+   */
   template<typename T>
   int mic_freeMemory(void * data_ptr, int size) {
 
@@ -210,10 +215,10 @@ public:
     return DKS_SUCCESS;
   }
 
-  /*
-    Info: allocate memory and write data to device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory and write data to device
+   * Return: success or error code
+   */
   template<typename T>
   void * mic_pushData(const void * data, int size) {
     T* tmp_ptr = new T[size];
@@ -227,10 +232,10 @@ public:
   return tmp_ptr;
 }
 
-/*
-  Info: read data and free memory on device
-  Return: success or erro code
-*/
+  /**
+   * Read data and free memory on device
+   * Return: success or erro code
+   */
   template<typename T>
   int mic_pullData(void * data_ptr, void * result, int size) {
     T* tmp_ptr = (T*)data_ptr;
diff --git a/src/MIC/MICChiSquare.h b/src/MIC/MICChiSquare.h
index c62de0b..56c0e99 100644
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@@ -14,6 +14,9 @@
 #include <offload.h>
 #include "MICBase.h"
 
+/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
+ * Not complete and untested because of the poor performance of first MIC devices.
+ */
 class MICChiSquare {
 
   MICBase *m_micbase;
diff --git a/src/MIC/MICCollimatorPhysics.cpp b/src/MIC/MICCollimatorPhysics.cpp
index f334d0b..5a8929c 100644
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@@ -22,22 +22,34 @@
 #define I_M 10
 #define DT_M 11
 
+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
   return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }
 
+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
   return (dx * dx + dy * dy + dz * dz);
 }
 
+/**
+ * MIC device function to check if particle is still in material.
+ */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
   return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }
 
 
+/**
+ * MIC device function to calculate arbitrary rotation.
+ */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
   pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }
 
+
+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses AoS to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ * Deprecated on favor of SoA data layout.
+ */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
   double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
 
 }
 
+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses SoA to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+void coulombScat(double *rx, double *ry, double *rz, 
+		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {
- 
+
+  //arrays for temporary storage, each core proceses MIC_WIDTH particles
   double normP[MIC_WIDTH] __attribute__((aligned(64)));
   double deltas[MIC_WIDTH] __attribute__((aligned(64)));
   double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
   double z2[MIC_WIDTH] __attribute__((aligned(64)));
   double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
 
+  //simd instruction tells the compiler its safe to vectorize the loop
   #pragma vector aligned
   #pragma simd
   for (int i = ii; i < ii + MIC_WIDTH; i++) {
@@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
     }
   }
   
+  //vectorize the loop
   #pragma vector aligned
   #pragma simd
   for (int i = ii; i < ii + size; i++) {
@@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
     }
   }
   
-  
   //generate array of random numbers
   vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
   vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
   
 }
 
+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
 
@@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
     pdead = 1;
 }
 
+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
 
@@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
   double *par = (double*) par_ptr;
   VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
 
+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target(mic:m_micbase->m_device_id)		\
   inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
   in(par:length(0) DKS_RETAIN DKS_REUSE)	\
@@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
       VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
       
       //for loop trough particles if not checkhit set label to -2 and update R.x
-
 #pragma omp for simd
       for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {
 
 
-
+  //cast device memory pointers to appropriate types
   int *label = (int*)label_ptr;
   unsigned *localID = (unsigned*)localID_ptr;
   double *rx = (double*)rx_ptr;
@@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 
   VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
 
+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target (mic:0) \
   in(label:length(0) DKS_REUSE DKS_RETAIN)	\
   in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
diff --git a/src/MIC/MICCollimatorPhysics.h b/src/MIC/MICCollimatorPhysics.h
index 910bc99..30bcccd 100644
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@@ -26,6 +26,12 @@ typedef struct {
 } MIC_PART_SMALL;
 
 
+/**
+ * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes OPALs collimator physics class for particle matter interactions using OpenMP
+ * and offload mode targetomg Intel Xeon Phi processors.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
 class MICCollimatorPhysics : public DKSCollimatorPhysics {
 
 private:
diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h
index 491ea06..38c87e9 100644
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "MICBase.h"
 
+/** 
+ * MIC FFT based on BaseFFT interface.
+ * uses MKL library to offload FFT on Intel Xeon Phi devices.
+ */
 class MICFFT : public BaseFFT {
 
 private:
diff --git a/src/MIC/MICGreensFunction.hpp b/src/MIC/MICGreensFunction.hpp
index db3944f..a1e00fa 100644
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@@ -15,6 +15,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
 
+/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class MICGreensFunction : public GreensFunction {
 
 private:
diff --git a/src/MIC/MICMergeSort.h b/src/MIC/MICMergeSort.h
index 408037b..3af4e75 100644
--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
   return p;
 }
 
+/**
+ * Merge sort implementation for intel MIC.
+ * Paralellized over all the MIC cores using OpenMP tasks.
+ */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
 
@@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
   }
 }
 
+/**
+ * Quicksort algorithm, developed for use on Intel MIC devices.
+ */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
 
@@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
 
 }
 
+/** 
+ * Insertion sort of @p list, developed for use on Intel MIC.
+ * Used by quick_sort to sort small lists.
+ */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
 
diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h
index e8b9d16..b811ae4 100644
--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@@ -1,16 +1,3 @@
-/*
-
-  Name: OpenCLBase
-
-  Author: Uldis Locans
-
-  Info: OpenCL base class to handle all the common details associated 
-  with kernel launch on OpenCL device
-
-  Date: 2014.09.18
-
-*/
-
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE
 
@@ -32,7 +19,7 @@
 
 #include "../DKSDefinitions.h"
 
-/* struct for random number state */
+/** struct for random number state. */
 typedef struct {
   double s10;
   double s11;
@@ -44,168 +31,194 @@ typedef struct {
   bool gen;
 } RNDState;
 
+/**
+ * OpenCL base class to handle device setup and basic communication wiht the device.
+ * Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
+ * The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
+ * class contains methods to read the kernel files, compile the kernel codes and launch kernels
+ * from the compiled codes. Which kernel file needs to be loaded for the specif functin is 
+ * handled by the base class that is launching the kernel.
+ */
 class OpenCLBase {
 
 private:
-	
+
+  //variables containig OpenCL device and platform ids
   static cl_platform_id m_platform_id;
   static cl_device_id m_device_id;
 
+  //variables containit compiled OpenCL program and kernel
   cl_context_properties m_context_properties[3];
   cl_program m_program;
   cl_kernel m_kernel;
 	
+  //variables for tracking OpenCL events
   static cl_event m_last_event;
   cl_int m_num_events;
   std::vector<cl_event> m_events;
 	
+  //currently load kernel file
   char * m_kernel_file;
 
+  //type of device used by OpenCL
   cl_device_type m_device_type;
 	
-  /*
-    Name: getPlatforms
-    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
-    Return: success or error code
-  */
+  /**
+   * Get all available OpenCL platforms.
+   * Get all avaialble platforms and save in m_platform_ids, save number of platforms
+   *  Return: success or error code
+   */
   int ocl_getPlatforms();
 	
 	
-  /*
-    Name: getDevice
-    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
-    ReturnL success or error code
-  */
+  /**
+   * Get first available OpenCL device of specified type.
+   * Get first avaialble devices and save device id and platform id for this device, 
+   * device name: (-gpu, -mic, -cpu)
+   *  ReturnL success or error code
+   */
   int ocl_getDevice(const char* device_name);
 	
-  /*
-    Name getDeviceType
-    Info: get device type from device name (-gpu, -cpu, -mic)
-    Return: success or error code
-  */
+  /**
+   * Get cl_device_type from the specified device name.
+   * get device type from device name (-gpu, -cpu, -mic)
+   *  Return: success or error code
+   */
   int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
 	
-  /*
-    Name: createContext
-    Info: create context with specified device
-    Return: success or error code
-  */
+  /**
+   * Create OpenCL context with specified device.
+   *  Return: success or error code
+   */
   int ocl_createContext();
 	
-  /*
-    Name: buildProgram
-    Info: build program from specified kernel file
-    Return: success or error code
+  /**
+   * Build program from specified kernel file.
+   * Return: success or error code.
   */
   int ocl_buildProgram(const char* kernel_file);
 
-  /** Compile program from kernel source string
-   *
+  /** 
+   * Compile program from kernel source string.
+   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
+   * OpenCL program, that can be then executed on the device.
+   * opts is a string specifiend additional compiler flags.
    */
   int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
 
 protected:
 
+  //memory for random number states
   int defaultRndSet;
   cl_mem defaultRndState;
 	
 	
 public:
 
+  //OpenCL context and commad queue
   static cl_context m_context;
-  static cl_command_queue m_command_queue;
+  static cl_command_queue m_command_queue; 
     
-  /*
-    constructor
-  */
+  /**
+   * constructor
+   */
   OpenCLBase();
     
-  /*
-    destructor
-  */
+  /**
+   * destructor
+   */
   ~OpenCLBase();
     
-  /*
-    Create RND states
-    Return: success or error code
-  */
+  /**
+   * Allocate memory for size random number states and init the rnd states.
+   * Uses AMD clRng library for random numbers. 
+   * This library is only compatible with AMD devices.
+   */
   int ocl_createRndStates(int size);
 
-  /* Create an array of random numbers on the device
-   *
+  /** 
+   * Create an array of random numbers on the device.
+   * Filles hte mem_ptr with random numbers.
    */
   int ocl_createRandomNumbers(void *mem_ptr, int size);
 
-  /*
-    Destroy rnd states
-    Return: success or error code
-  */
+  /**
+   * Destroy rnd states and free device memory.
+   * Return: success or error code
+   */
   int ocl_deleteRndStates();
 
 
-  /*
-    Name: getAllDevices
-    Info: get all available devices
-    ReturnL success or error code
+  /**
+   * Prints info about all the available platforms and devices.
+   * Can be used for information purposes to see what devices are available on the system.
+   * ReturnL success or error code.
   */
   int ocl_getAllDevices();
 
-  /** Get the OpenCL device count for the set type of device
-   *
+  /** 
+   * Get the OpenCL device count for the set type of device.
+   * Device count is set in ndev parameter, returns success or error code.
    */
   int ocl_getDeviceCount(int &ndev);
 
-  /** Get the name of the device used
+  /** 
+   * Get the name of the device currently us use.
    */
   int ocl_getDeviceName(std::string &device_name);
 
-  /** Set the device to use for OpenCL kernels.
-   *  device id to use is passed as integer.
+  /** 
+   * Set the device to use for OpenCL kernels.
+   * Device id to use is passed as integer.
    */
   int ocl_setDevice(int device);
 
-  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
-   *  Used when GPUs of different types might be pressent on the system.
+  /** 
+   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
+   * Used when GPUs of different types might be pressent on the system.
    */
   int ocl_getUniqueDevices(std::vector<int> &devices);
     
-  /*
-    Name: setUp
-    Info: set up opencl resources
-    Return: success or error code
-  */
+  /**
+   * Initialize OpenCL connection with a device of specified type.
+   * Find if specified device is avaialble, creates a contex and command queue.
+   * Returns success or error code.
+   */
   int ocl_setUp(const char* device_name);
 	
-  /*
-    Name: loadKernel
-    Info: load and compile opencl kernel file if it has changed
-    Return: success or error code
+  /**
+   * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
+   * Load and compile opencl kernel file if it has changed.
+   * Return: success or error code
   */
   int ocl_loadKernel(const char* kernel_file);
 
 
-  /** Build program from kernel source.
+  /** 
+   * Build program from kernel source.
    * Builds a program from source code provided in kernel_source.
    * If compilation fails will return DKS_ERROR
    */
   int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
 	
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory on the device.
+   * Return: return pointer to memory
   */
   cl_mem ocl_allocateMemory(size_t size, int &ierr);
 
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory of specific type on device.
+   * The availabel types are cl_mem_flags type listed in OpenCL documentation:
+   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
+   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
+   * Return: return pointer to memory
   */
   cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
 	
-  /** Zero OpenCL memory buffer
-   *  Set all the elemetns in the device array to zero
+  /** 
+   * Zero OpenCL memory buffer.
+   * Set all the elemetns in the device array to zero.
    */
   template <typename T>
   int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@@ -218,92 +231,90 @@ public:
     return DKS_SUCCESS;
   }
 
-  /*
-    Name: writeData
-    Info: write data to device memory (needs ptr to mem object)
-    Return: success or error code
-  */
+  /**
+   * Write data to device memory (needs ptr to mem object)
+   * Return: success or error code
+   */
   int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: copyData
-    Info: copy data from one buffer on the device to another
-    Return: success or error code
-  */
+  /** 
+   * Copy data from one buffer on the device to another
+   * Return: success or error code
+   */
   int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
 	
-  /*
-    Name: createKernel
-    Info: create kernel from program
-    Return: success or error code
-  */
+  /** 
+   * Create kernel from compiled OpenCL program.
+   * Return: success or error code
+   */
   int ocl_createKernel(const char* kernel_name);
 	
-  /*
-    Name: setKernelArgs
-    Info: set opencl kernel arguments
-    Return: success or error code
-  */
+  /**
+   * Set argiments for the kernel that will be launched.
+   * Return: success or error code
+   */
   int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
 	
-  /*
-    Name: executeKernel
-    Info: execute selected kernel (needs kernel parameters)
-    Return: success or error code
+  /**
+   * Execute selected kernel.
+   * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
+   * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
+   * kernel arguments must be set.
+   * Return: success or error code
   */
   int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
 	
-  /*
-    Name: readData
-    Info: read data from device (needs pointer to mem object)
-    Return: success or error code
-  */
+  /**
+   * Read data from device (needs pointer to mem object).
+   * Return: success or error code
+   */
   int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: freeMemory
-    Info: free device memory (needs ptr to mem object)
-    Return: success or error code
-  */
+  /**
+   * Free device memory (needs ptr to mem object).
+   *  Return: success or error code
+   */
   int ocl_freeMemory(cl_mem mem_ptr);
 	
-  /*
-    Name: cleanUp
-    Info: free opencl resources
-    Return: success or error code
-  */
+  /**
+   * Free opencl resources.
+   * Deletes the kernel, compiled program, command queue and colese the connection
+   * to device by releasing the context.
+   * Return: success or error code
+   */
   int ocl_cleanUp();
 	
-  /*
-    Name: deviceInfo
-    Info: print device info (mostly for debugging purposes)
-    Return: success or error code
-  */
+  /**
+   * Print info of currently selected device.
+   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
+   * Return: success or error code
+   */
   int ocl_deviceInfo(bool verbose = true);
 
-  /* Check OpenCL kernel.
-   * Query device and check if it can run the kernel with required parameters
+  /* 
+   * Check OpenCL kernel.
+   * Query device and check if it can run the kernel with required parameters.
+   * Also check the available OpenCL extensions - usefull for checking the supported device
+   * features, like double precission.
    */
   int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
 	
-  /*
-    Name: clearEvents
-    Info: clear saved events (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * Clear the event list.
+   * Events can be used for timing and synchronization purposes.
+   */
   void ocl_clearEvents();
 
-  /*
-    Name: eventInfo
-    Info: print information about kernel timings (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * print information about kernel timings from event list.
+   * for debuging purposes
+   */
   void ocl_eventInfo();
 
-  /*
-    Return current command queue
-  */
+  /**
+   * Return current command queue.
+   */
   cl_command_queue ocl_getQueue() { return m_command_queue; }
 };
 
diff --git a/src/OpenCL/OpenCLChiSquare.h b/src/OpenCL/OpenCLChiSquare.h
index bbc5da6..5c7ad51 100644
--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1
 
-
+/** Deprecated, SimpleFit implementation of ChiSquare. */
 class OpenCLChiSquare {
 
 private:
diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp
index f8e21a6..2b97b87 100644
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }
 
 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
+  //write params to gpu
   int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
   return ierr;
 }
@@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
   if (numfunc == 0)
     return DKS_SUCCESS;
 
+  //write function values to the GPU
   int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
   return ierr;
 }
@@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
   if (nummap == 0)
     return DKS_SUCCESS;
 
+  //wrtie map values to the GPU
   int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
   return ierr;
 }
@@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
     freeChiSquare();
   }
 
-  //allocate temporary memory
+  //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
   mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
   mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
   if (size_func == 0)
@@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
   int ierr = DKS_ERROR;
   if (initDone_m) {
 
-    //free memory
+    //free GPU memory
     ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
     ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
     ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
     return DKS_ERROR;
   }
 
+  //check the GPU kernel
   ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
 
   return ierr;
diff --git a/src/OpenCL/OpenCLChiSquareRuntime.h b/src/OpenCL/OpenCLChiSquareRuntime.h
index 90b5c7c..350b6df 100644
--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
 
 const std::string openclFunctFooter = "}\n";
 
+/**
+ * OpenCL implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to target devices that
+ * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
+ */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {
 
 private:
 
   OpenCLBase *m_oclbase;
 
-  /** Private function to add user defined function to kernel string
-   *
+  /** 
+   * Private function to add user defined function to kernel string.
    */
   std::string buildProgram(std::string function);
 
+  /**
+   * Launch parallel reduction kernel to calculate the sum of data array
+   */
   double calculateSum(cl_mem data, int length);
 
 public:
 
-  /** Constructor wiht openclbase argument
-   *
+  /** 
+   * Constructor wiht openclbase argument.
    */
   OpenCLChiSquareRuntime(OpenCLBase *base);
   
-  /** Default constructor
-   *
+  /** 
+   * Default constructor
    */
   OpenCLChiSquareRuntime();
 
-  /** Default destructor
-   *
+  /** 
+   * Default destructor
    */
   ~OpenCLChiSquareRuntime();
 
-    /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
    * Add function string to the calcFunction kernel and compile the program
    * Function must be valid C math expression. Parameters can be addressed in
    * a form par[map[idx]]
    */
   int compileProgram(std::string function, bool mlh = false);
 
-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
    * Launched the selected kernel from the compiled code.
    * Result is put in &result variable
    */
@@ -64,22 +74,26 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);
 
-  /** Write params to device.
+  /** 
+   * Write params to device.
    * Write params from double array to mem_param_m memory on the device.
    */
   int writeParams(const double *params, int numparams); 
 
-  /** Write functions to device.
+  /** 
+   * Write functions to device.
    * Write function values from double array to mem_func_m memory on the device.
    */
   int writeFunc(const double *func, int numfunc);
 
-  /** Write maps to device.
+  /** 
+   * Write maps to device.
    * Write map values from int array to mem_map_m memory on the device.
    */
   int writeMap(const int *map, int nummap);
 
-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
    * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
    * the maximum number of elements in any datasets that will be used for calculations. Size_param,
    * size_func and size_map are the maximum number of parameters, functions and maps used in 
@@ -87,14 +101,16 @@ public:
    */
   int initChiSquare(int size_data, int size_param, int size_func, int size_map);
 
-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
    * Frees the chisq temporary memory and memory for params, functions and maps
    */
   int freeChiSquare();
 
-  /** Check MuSR kernels for necessary resources.
+  /** 
+   * Check MuSR kernels for necessary resources.
    * Query device properties to get if sufficient resources are
-   * available to run the kernels
+   * available to run the kernels. Also checks if double precission is enabled on the device.
    */
   int checkChiSquareKernels(int fitType, int &threadsPerBlock);
 
diff --git a/src/OpenCL/OpenCLCollimatorPhysics.h b/src/OpenCL/OpenCLCollimatorPhysics.h
index 0f8accc..2de1408 100644
--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@@ -17,12 +17,16 @@
 #include "boost/compute/core.hpp"
 */
 
+/** Double3 structure for use in OpenCL code. */
 typedef struct {
   double x;
   double y;
   double z;
 } Double3;
 
+/**
+ * Structure for stroing particles in OpenCL code.
+ */
 typedef struct {
   int label;
   unsigned localID;
@@ -35,6 +39,10 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
 
+/**
+ * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
+ */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
 
 private:
@@ -42,16 +50,20 @@ private:
 
 public:
 
-  /* constructor */
+  /** 
+   * Constructor with OpenCLBase as argument.
+   * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
+   */
   OpenCLCollimatorPhysics(OpenCLBase *base) { 
     m_oclbase = base;
   }
 
-  /* destructor */
+  /** 
+   * Destructor.
+   */
   ~OpenCLCollimatorPhysics() { 
   }
 
-  /* execute degrader code on device */
   int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherforScattering = true);
 
diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h
index 6ab8c88..8f930a2 100644
--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@@ -1,14 +1,3 @@
-/*
-
-  Name: OpenCLFFT
-
-  Author: Uldis Locans
-
-  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
-
-  Data: 19.09.2014
-
-*/
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT
 
@@ -22,6 +11,12 @@
 
 #include "clFFT.h"
 
+/**
+ * OpenCL FFT class based on BaseFFT interface.
+ * Uses clFFT library to perform FFTs on AMD gpus.
+ * clFFT library works also on nvida GPUs and other devices that
+ * support OpenCL.
+ */
 class OpenCLFFT : public BaseFFT {
 
 private:
diff --git a/src/OpenCL/OpenCLGreensFunction.h b/src/OpenCL/OpenCLGreensFunction.h
index 3e90fba..9e352ac 100644
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@@ -7,6 +7,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"
 
+/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class OpenCLGreensFunction : public GreensFunction {
 
 private:
@@ -31,7 +32,7 @@ public:
   int buildProgram();
 
   /**
-    Info: calc itegral on device memory (taken from OPAL src code)
+    Info: calc itegral on device memory (taken from OPAL src code).
     Return: success or error code
   */
   int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
@@ -39,20 +40,20 @@ public:
 		       int streamId = -1);
 		
   /**
-    Info: integration of rho2_m field (taken from OPAL src code)
+    Info: integration of rho2_m field (taken from OPAL src code).
     Return: success or error code
   */
   int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
 		
   /**
-    Info: mirror rho field (taken from OPAL src code)
+    Info: mirror rho field (taken from OPAL src code).
     Return: succes or error code
   */
   int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
 
   /**
-    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
     Return: success or error code
   */
   int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
diff --git a/src/Utility/DKSTimer.h b/src/Utility/DKSTimer.h
index 80025c0..fcfce4a 100644
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@@ -5,6 +5,10 @@
 #include <string>
 #include <sys/time.h>
 
+/**
+ * Custom timer class.
+ * Allows to insert timers in the code to get function exectution times.
+ */
 class DKSTimer {
 
 private:
@@ -17,39 +21,45 @@ private:
 
 public:
 
-  /** Init DKSTimer by seting timer to zero  */
+  /** Init DKSTimer by seting timer to zero. */
   DKSTimer();
 
   ~DKSTimer();
 
-  /** Init the timer
-   *  Set the name for timer and clear all values
+  /** 
+   * Init the timer.
+   * Set the name for timer and clear all values
    */
   void init(std::string n);
 
-  /** Start the timer.
-   *  Get the curret time with gettimeofday and save in timeStart
+  /** 
+   * Start the timer.
+   * Get the curret time with gettimeofday and save in timeStart
    */
   void start();
 
-  /** Stop the timer 
-   *  Get the curretn time with gettimeofday and save in timeEnd
-   *  Calculate elapsed time by timeEnd - timeStart and add to timervalue
+  /** 
+   * Stop the timer.
+   * Get the curretn time with gettimeofday and save in timeEnd
+   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
    */
   void stop();
 
-  /** Reset timervalue to zero.
-   *  Set timervalue, timeStart and timeEnd to zero
+  /** 
+   * Reset timervalue to zero.
+   * Set timervalue, timeStart and timeEnd to zero
    */
   void reset();
 
-  /** Return elapsed time in seconds.
-   *  Return the value of timervalue
+  /** 
+   * Return elapsed time in seconds.
+   * Return the value of timervalue
    */
   double gettime();
 
-  /** Print timer.
-   *  Print the elapsed time of the timer
+  /** 
+   * Print timer.
+   * Print the elapsed time of the timer
    */
   void print();