updated documentation

2017-08-10 14:57:48 +02:00
parent 7ca93a3a49
commit ccc4329bef
38 changed files with 939 additions and 673 deletions
--- a/src/Algorithms/ChiSquareRuntime.h
+++ b/src/Algorithms/ChiSquareRuntime.h
@ -15,6 +15,9 @@

 class DKSBaseMuSR;

+/** 
+ * Interface to implement ChiSquareRuntime class for musrfit.
+ */
 class ChiSquareRuntime {
  friend class DKSBaseMuSR;

@ -63,23 +66,54 @@ public:
  /** Default constructor */
  //ChiSquareRuntime();

-  /** Default destructor */
+  /** Default destructor. */
  virtual ~ChiSquareRuntime() { };

+  /**
+   * Compile GPU programm generated at runtime.
+   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;
+
+  /**
+   * Launche the compiled chiSquare kernel.
+   */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, 
 			      int numpar, int numfunc, int nummap,
 			      double timeStart, double timeStep,
 			      double &result) = 0;

+  /** 
+   * Write the parameter values to the GPU.
+   */
  virtual int writeParams(const double *params, int numparams) = 0;
+
+  /**
+   * Write the function values to the GPU.
+   */
  virtual int writeFunc(const double *func, int numfunc) = 0;
+
+  /**
+   * Write map values to the GPU.
+   */
  virtual int writeMap(const int *map, int nummap) = 0;
+
+  /**
+   * Allocate temporary memory needed for the chi square calucaltios on the device.
+   */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
+
+  /**
+   * Free device memory allocated for chi square calculations.
+   */
  virtual int freeChiSquare() = 0;
+
+  /**
+   * Check if available device can run the chi square GPU code.
+   */
  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;

-  /** Set N0, tau and bgk values to use for the kernel.
+  /** 
+   * Set N0, tau and bgk values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -91,7 +125,8 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set alpha and beta values to use for the kernel.
+  /** 
+   * Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
@ -101,8 +136,9 @@ public:
    return DKS_SUCCESS;
  }

-  /** Set number of blocks and threads.
-   *  Used to set parameters obtained from auto-tuning
+  /** 
+   * Set number of blocks and threads.
+   * Used to set parameters obtained from auto-tuning
   */
  int setKernelParams(int numBlocks, int blockSize) {
    int ierr = DKS_ERROR;
@ -118,8 +154,9 @@ public:
    return ierr;
  }

-  /** Get the number of operations in compiled kernel.
-   *  Count the number of operation in the ptx file for the compiled program.
+  /** 
+   * Get the number of operations in compiled kernel.
+   * Count the number of operation in the ptx file for the compiled program.
   */
  int getOperations(int &oper) {

--- a/src/Algorithms/CollimatorPhysics.h
+++ b/src/Algorithms/CollimatorPhysics.h
@ -5,6 +5,9 @@
 #include <string>
 #include "../DKSDefinitions.h"

+/**
+ * Interface to impelment particle matter interaction for OPAL.
+ */
 class DKSCollimatorPhysics {

 protected:
@ -16,28 +19,60 @@ public:
  
  virtual ~DKSCollimatorPhysics() { }

+  /** 
+   * Execute collimator physics kernel.
+   *
+   */  
  virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, 
 				bool enableRutherforScattering = true) = 0;

+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 				   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				   void *px_ptr, void *py_ptr, void *pz_ptr,
 				   void *par_ptr, int numparticles) = 0;
  
+  /** 
+   * Sort particle array on GPU.
+   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
+   * array so these particles are at the end of array
+   */
  virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 				       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 				       void *px_ptr, void *py_ptr, void *pz_ptr,
 				       void *par_ptr, int numparticles, int &numaddback) = 0;

+  /** 
+   * BorisPusher push function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 				   double dt, double c, bool usedt = false, int streamId = -1) = 0;

+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 				   void *bf_ptr, void *dt_ptr, double charge,
 				   double mass, int npart, double c, int streamId = -1) = 0; 

+  /** 
+   * BorisPusher push function with transformto function form OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, 
 					    void *orient_ptr, int npart, int nsec, void *dt_ptr, 
 					    double dt, double c, bool usedt = false, 
--- a/src/Algorithms/FFT.h
+++ b/src/Algorithms/FFT.h
@ -6,12 +6,21 @@

 #include "../DKSDefinitions.h"

+/**
+ * Abstract class defining methods for DKS FFT class.
+ * Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
+ */
 class BaseFFT {

 protected:
  int defaultN[3];
  int defaultNdim;

+  /**
+   * Check if FFT plan is created for the needed dimension and FFT size.
+   * Returns true if the plan has been created and false if no plan for specified dimension
+   * and size exists.
+   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
@ -24,18 +33,57 @@ public:

  virtual ~BaseFFT() { }

+  /** Setup FFT - init FFT library used by chosen device. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
+
+  /** Setup real to complex FFT - init FFT library used by chosen device. */
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
+
+  /** Clean up. */
  virtual int destroyFFT() = 0;
+
+  /** 
+   * Exectute C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3], 
 			 int streamId = -1, bool forward = true) = 0;
+
+  /** 
+   * Exectute inverse C2C FFT.
+   * mem_ptr - memory ptr on the device for complex data.
+   * Performs in place FFT.
+   */
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /**
+   * Normalize the FFT or IFFT.
+   * mem_ptr - memory to complex data.
+   */
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
+
+  /** 
+   * Exectute R2C FFT.
+   * real_ptr - real input data for FFT, comp_ptr - memory on the device where
+   * results for the FFT are stored as complex numbers.
+   */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /** 
+   * Exectute C2R FFT.
+   * real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
+   */
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], 
 				int streamId = -1) = 0;
+
+  /**
+   * Normalize CR FFT.
+   */
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;

 };
--- a/src/Algorithms/GreensFunction.h
+++ b/src/Algorithms/GreensFunction.h
@ -4,24 +4,27 @@
 #include <iostream>
 #include <cmath>

+/**
+ * Interface to implement Greens function calculations for OPAL.
+ */
 class GreensFunction {

 public:
  
  virtual ~GreensFunction() { }

-  /** calc greens integral, as defined in OPAL */
+  /** calc greens integral, as defined in OPAL. */
  virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
 			     double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;

-  /** integration if rho2_m, see OPAL for more details */
+  /** integration if rho2_m, see OPAL for more details. */
  virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, 
 					int streamId = -1) = 0;

-  /** mirror rho2_m field */
+  /** mirror rho2_m field. */
  virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;

-  /** multiply two complex fields from device memory */
+  /** multiply two complex fields from device memory. */
  virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;

 };
--- a/src/Algorithms/ImageReconstruction.h
+++ b/src/Algorithms/ImageReconstruction.h
@ -5,17 +5,22 @@

 #define BLOCK_SIZE 128

+/** Struct to hold voxel position for PET image. */
 struct VoxelPosition {
  float x;
  float y;
  float z;
 };

+/** Struct that holds pair of detectors that registered an envent. */
 struct ListEvent {
  unsigned detA : 16;
  unsigned detB : 16;
 };

+/**
+ * Interface to implement PET image reconstruction.
+ */
 class ImageReconstruction {

 protected:
@ -25,7 +30,8 @@ public:

  virtual ~ImageReconstruction() { }
  
-  /** Caluclate source.
+  /** 
+   * Caluclate source.
   *  Places a sphere at each voxel position and calculate the avg value and std value of pixels 
   *  that are inside this sphere. All the sphere used have the same diameter.
   */
@ -33,7 +39,8 @@ public:
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0) = 0;

-  /** Calculate background.
+  /** 
+   * Calculate background.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
 				  void *avg, void *std, float diameter, int total_voxels, 
 				  int total_sources, int start = 0) = 0;

-  /** Caluclate source using differente sources.
+  /** 
+   * Caluclate source using differente sources.
   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * each sphere is given by *diameter array.
@ -52,7 +60,7 @@ public:
 			       int total_sources, int start = 0) = 0;

  /**
-   * Places two sphere at each voxel position, calculates the avg value and std value of pixels
+   * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
   * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
   * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
   * smaller sphere.
@ -61,7 +69,8 @@ public:
 				   void *avg, void *std, void *diameter, int total_voxels, 
 				   int total_sources, int start = 0) = 0;

-  /** Generate normalization.
+  /** 
+   * Generate normalization.
   * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
   * that updates voxel values in the image on the slope between these two detectors.
   */
@ -69,14 +78,16 @@ public:
 				 void *det_position, int total_det) = 0; 


-  /** Calculate forward projection.
+  /** 
+   * Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 				void *image_position, int num_events) = 0;

-  /** Calculate backward projection.
+  /** 
+   * Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
@ -84,29 +95,29 @@ public:
 				 void *det_position, void *image_position, 
 				 int num_events, int num_voxels) = 0;

-  /** Set the voxel dimensins on device.
-   * 
+  /** 
+   *Set the voxel dimensins on device. 
   */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;

-  /** Set the image edge variables on the device.
-   * 
+  /** 
+   * Set the image edge variables on the device.
   */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;

-  /** Set the image edge1 on the device.
-   * 
+  /** 
+   * Set the image edge1 on the device.
   */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;

-  /** Set the minimum crystan in one ring values on the device.
-   * 
+  /** 
+   * Set the minimum crystan in one ring values on the device.
   */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, 
 				  float min_CrystalDist_InOneRing1) = 0;

-  /** Set all other required parameters for reconstruction.
-   * 
+  /** 
+   * Set all other required parameters for reconstruction.
   */
  virtual int setParams(float matrix_distance_factor, float phantom_diameter,
 			float atten_per_mm, float ring_diameter) = 0;
--- a/src/AutoTuning/DKSAutoTuning.h
+++ b/src/AutoTuning/DKSAutoTuning.h
@ -18,6 +18,17 @@
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * DKS autotuning class, allows to auto-tune the defince function.
+ * Executes the defined function for auto-tuning and searches for optimal parameters to improve
+ * the function execution time. The function that is auto-tuned, parameters and the ranges
+ * need to be set. Includes multiple search methods, that searches the parameter space to finde 
+ * the optimal solution.
+ *  1) exaustive search
+ *  2) line search
+ *  3) hill climbimg
+ *  4) simulated annealing
+ */
 class DKSAutoTuning {

 private:
@ -36,12 +47,13 @@ private:

  int loops_m;

-  /** Update parameters from a state */
+  /** Update parameters from a state. */
  int setParameterValues(States states);

-  /** Evaluate the function and set execution time 
-   *  Returns DKS_ERROR if errors occured during function execution. 
-   *  Returns DKS_SUCCESS if function executed as planned. 
+  /** 
+   * Evaluate the function and set execution time 
+   * Returns DKS_ERROR if errors occured during function execution. 
+   * Returns DKS_SUCCESS if function executed as planned. 
   */
  int evaluateFunction(double &value);

@ -50,12 +62,13 @@ public:
  /** Constructor */
  DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);

-  /** Destructor */
+  /** Destructor. */
  ~DKSAutoTuning();

-  /** Set function to auto tune.
-   *  Caller of setFunction is responsible to bind the correct parameters 
-   *  to the function with std::bind.
+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
   */
  void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
    f_m = f;
@ -63,15 +76,21 @@ public:
    evaluate_time_m = evaluate_time;
  }

+  /** 
+   * Set function to auto tune.
+   * Caller of setFunction is responsible to bind the correct parameters 
+   * to the function with std::bind.
+   */
  void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
    fd_m = f;
    function_name_m = name;
    evaluate_time_m = evaluate_time;
  }

-  /** Set parameter for auto tuning.
-   *  Provide a pointer to a parameter that will be changed during auto-tuning
-   *  and a min-max value for this element
+  /** 
+   * Set parameter for auto tuning.
+   * Provide a pointer to a parameter that will be changed during auto-tuning
+   * and a min-max value for this element
   */
  template <typename T1>
  void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
@ -85,9 +104,9 @@ public:
  /** Perform exaustive search evaluating all the parameter configurations */
  void exaustiveSearch();

-  /** Perform auto-tuning.
-   *  Perform line-search auto-tuning by variying parameters one at a time and keeping other 
-   *  parameters constant.
+  /**
+   * Perform line-search auto-tuning by variying parameters one at a time.
+   * After one parameter is auto-tuned the next on is varied
   */
  void lineSearch();  

--- a/src/AutoTuning/DKSAutoTuningTester.h
+++ b/src/AutoTuning/DKSAutoTuningTester.h
@ -4,6 +4,7 @@
 #include <iostream>
 #include <cmath>

+/** Tester class for auto-tuning search algorithms. */
 class DKSAutoTuningTester {

  friend class DKSBaseMuSR;
--- a/src/AutoTuning/DKSConfig.h
+++ b/src/AutoTuning/DKSConfig.h
@ -1,9 +1,3 @@
-/** Class to save and load DKS autotunning configs.
- * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
- * Uses boost xml_parser to read and write the xml file and boost property tree to store
- * the xml content.
- */
-
 #ifndef DKS_CONFIG
 #define DKS_CONFIG

@ -29,6 +23,13 @@ namespace pt = boost::property_tree;
 const std::string config_dir = "/.config/DKS";
 const std::string config_file = "/autotuning.xml";

+/** Class to save and load DKS autotunning configs.
+ * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
+ * Uses boost xml_parser to read and write the xml file and boost property tree to store
+ * the xml content.
+ * TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
+ * so the auto-tuning has no effect.
+ */
 class DKSConfig {

 private:
--- a/src/AutoTuning/DKSSearchStates.h
+++ b/src/AutoTuning/DKSSearchStates.h
@ -9,6 +9,9 @@

 enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

+/** 
+ * Parameter class allows to change the searchable parameters during the auto-tuning.
+ */
 class Parameter {

 private:
@ -64,6 +67,10 @@ public:

 };

+/**
+ * Struct to hold a auto-tuning state.
+ * Holds the current value, min, max and a step to witch a state can change.
+ */ 
 struct State {
  double value;
  double min;
@ -74,6 +81,12 @@ struct State {
 typedef std::vector<Parameter> Parameters;
 typedef std::vector<State> States;

+/** 
+ * Used by auto-tuning search algorithms to move between parameter configurations.
+ * Allows to move from one parameter stat to another, get neighboring states, 
+ * move to neighboring states and save state information. Print functions are available
+ * for debugging purposes, to follow how algorithm muves between sates.
+ */
 class DKSSearchStates {

 private:
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
 	
  return DKS_SUCCESS;
 }
-		
-/*
-  Info: allcate memory and write data (push)
-  Return: pointer to memory object
-*/
-/*
-  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
-
-  void * mem_ptr;
-  mem_ptr = cuda_allocateMemory(size, ierr);
-	
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_writeData(mem_ptr, in_data, size);
-		
-  return mem_ptr;
-  }
-*/
-		
-/*
-  Info: read data and free memory (pull)
-  Return: success or error code
-*/
-/*
-  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
-
-  ierr = cuda_readData(mem_ptr, out_data, size);
-  if (ierr == DKS_SUCCESS)
-  ierr = cuda_freeMemory(mem_ptr);	
-  else
-  return DKS_ERROR;
-	
-	
-  if (ierr == DKS_SUCCESS)	
-  return DKS_SUCCESS;
-  else
-  return DKS_ERROR;
-  }
-*/
-
-/*
-  Info: execute function
-  Return: success or error code
-*/
-int CudaBase::cuda_executeFunction() {
-
-  std::cout << "Execute function" << std::endl;
-  return DKS_SUCCESS;
-}
-		
-/*
-  Info: clean up
-  Return: success or error code
-*/
-int CudaBase::cuda_cleanUp() {
-
-  std::cout << "clean up" << std::endl;
-  return DKS_SUCCESS;
-	
-}
--- a/src/CUDA/CudaBase.cuh
+++ b/src/CUDA/CudaBase.cuh
@ -16,6 +16,11 @@

 #define BLOCK_SIZE 128

+/**
+ * CUDA base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement, data transfers and stream setup for 
+ * asynchronous data transfers and kernel executions.
+ */
 class CudaBase {

 private:
@ -48,17 +53,17 @@ public:
  /**
   * Delete curandState.
   * Delete curandState array on the GPU and free memory.
-   *  Return success or error code
+   * Return success or error code
   */
  int cuda_deleteCurandStates();

-  /** Create 'size' random numbers on the device and save in mem_ptr array
-   *
+  /** 
+   * Create 'size' random numbers on the device and save in mem_ptr array.
   */
  int cuda_createRandomNumbers(void *mem_ptr, int size);

-  /** Get a pointer to curand states
-   *
+  /** 
+   * Get a pointer to curand states.
   */
  curandState* cuda_getCurandStates();
 	
@ -75,93 +80,98 @@ public:
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);

  /**
-   * delete cuda stream
+   * delete cuda stream.
   * success or error code
   */
  int cuda_deleteStream(int id);

  /**
-   * delete all streams
+   * delete all streams.
   * success or error code
   */
  int cuda_deleteStreams();

  /**
-   * set stream to use
+   * set stream to use.
   * success or error code
   */
  int cuda_setStream(int id);

  /**
-   * Info: get stream that is used
-   *  Return: return id of curretn stream
+   * get stream that is used.
+   * Return: return id of curretn stream
   */
  int cuda_getStreamId();

  /**
-   * Info: reset to default stream
+   * reset to default stream.
   * Return: success or error code
   */
  int cuda_defaultStream();

  /**
-   * Info: get number of streams
+   * get number of streams.
   * Return: success or error code
   */
  int cuda_numberOfStreams();

  /**
-   * Info: get stream
+   * get stream.
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);

  /**
-   * Get default cublass handle
+   * Get default cublass handle.
   */
  cublasHandle_t cuda_getCublas();

  /**
-   * Info: get information on cuda devices
+   * get information on cuda devices.
   * Return: success or error code
   */
  int cuda_getDevices();

-  /** Get CUDA device count.
-   *  Sets the number of devices on the platform that can use CUDA.
-   *  Returns DKS_SUCCESS
+  /** 
+   * Get CUDA device count.
+   * Sets the number of devices on the platform that can use CUDA.
   */
  int cuda_getDeviceCount(int &ndev);

-  /** Get the name of the device.
-   *  QUery the device properties of the used device and set the string device_name
+  /** 
+   * Get the name of the device.
+   * QUery the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);

-  /** Set CUDA device to use.
-   *  If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR 
+  /** 
+   * Set CUDA device to use.
+   * If device passed in is larger than the number of devices use 
+   * the default:0 and return DKS_ERROR 
   */
  int cuda_setDevice(int device);

-  /** Get unique devices
-   *  Get array of indeces with the unique CUDA devices available on the paltform
+  /** 
+   * Get unique devices.
+   * Get array of indeces with the unique CUDA devices available on the paltform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);
 	
  /**
-   * Info: init device
+   * Initialize connection to the device.
+   * Only needed when runtime compilation is used.
   * Return: success or error code
   */
  int cuda_setUp();
 	
  /**
-   * Info: allocate memory on cuda device
+   * Allocate memory on cuda device.
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);

  /**
-   * Info: allocate host memory in pinned memory
+   * Allocate host memory in pinned memory
   * Return: success or error code
   */
  template<typename T>
@ -174,8 +184,9 @@ public:
    return DKS_SUCCESS;
  }		

-  /** Zero CUDA memory.
-   *  Set all the elements of the array on the device to zero.
+  /** 
+   * Zero CUDA memory.
+   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
@ -189,8 +200,9 @@ public:
    return DKS_SUCCESS;
  }

-  /** Zero CUDA memory.
-   *  Set all the elements of the array on the device to zero.
+  /** 
+   * Zero CUDA memory async.
+   * Set all the elements of the array on the device to zero.
   */
  template<typename T>
  int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
@ -209,7 +221,7 @@ public:
  }

  /** 
-   * Info: write data to memory
+   * Write data to memory
   * Retrun: success or error code
   */
  template<typename T>
@ -226,7 +238,7 @@ public:
  }

  /**
-   * Info: write data assynchonuously
+   * Write data assynchonuously
   * Return: success or error code
   */
  template<typename T>
@ -258,7 +270,7 @@ public:
  }
 		
  /**
-   * Info: read data from memory
+   * Read data from memory
   * Return: success or error code
   */
  template<typename T>
@ -275,7 +287,7 @@ public:
  }

  /**
-   * Info: read data async from device memory
+   * Read data async from device memory
   * Return: success or error code
   */
  template<typename T>
@ -307,19 +319,19 @@ public:
  }
 		
  /**
-   * Info: free memory on device
+   * Free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);
 	
  /**
-   * Info: free page locked memory on host
+   * Free page locked memory on host
   * Return: success or erro code
   */
  int cuda_freeHostMemory(void * mem_ptr);
 	
  /**
-   * Info: allcate memory and write data (push)
+   * Allcate memory and write data (push)
   * Return: pointer to memory object
   */
  template<typename T>
@ -335,7 +347,7 @@ public:
  }
 		
  /**
-   * Info: read data and free memory (pull)
+   * Read data and free memory (pull)
   * Return: success or error code
   */
  template<typename T>
@ -355,19 +367,8 @@ public:
  }
  
  /**
-   * Info: execute function
-   * Return: success or error code
-   */
-  int cuda_executeFunction();
-		
-  /**
-   * Info: clean up
-   * Return: success or error code
-   */
-  int cuda_cleanUp();
-  
-  /**
-   * Info: sync cuda device
+   * Sync cuda device.
+   * Waits till all the tasks on the GPU are finished.
   * Return: success or error code
   */
  int cuda_syncDevice() {
@ -376,7 +377,7 @@ public:
  }

  /**
-   * Page-lock host memory
+   * Page-lock host memory.
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
@ -390,7 +391,7 @@ public:
  }

  /**
-   * Release page locked memory
+   * Release page locked memory.
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
@ -403,7 +404,7 @@ public:
  }

  /**
-   * Info: print device memory info (total, used, avail)
+   * Print device memory info (total, used, avail)
   * Return: success or error code
   */
  int cuda_memInfo() {
--- a/src/CUDA/CudaChiSquare.cuh
+++ b/src/CUDA/CudaChiSquare.cuh
@ -8,6 +8,7 @@

 #include "CudaBase.cuh"

+/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
 class CudaChiSquare {

 private:
--- a/src/CUDA/CudaChiSquareRuntime.cuh
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double

 const std::string cudaFunctFooter = "}\n";

+/**
+ * CUDA implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
+ */
 class CudaChiSquareRuntime : public ChiSquareRuntime{

 private:
@ -29,65 +33,72 @@ private:

  cublasHandle_t defaultCublasRT;

-  /** Setup to init device
-   *  Create context and init device for RT compilation
+  /** 
+   * Setup to init device.
+   * Create context and init device for RT compilation
   */
  void setUpContext();

-  /** Private function to add function to kernel string
-   *
+  /** 
+   * Private function to add function to kernel string.
   */
  std::string buildProgram(std::string function);

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase argument
   */
  CudaChiSquareRuntime(CudaBase *base);

-  /** Default constructor init cuda device
-   *
+  /** 
+   * Default constructor init cuda device
   */
  CudaChiSquareRuntime();
  
-  /** Default destructor
-   *
+  /** 
+   * Default destructor.
   */
  ~CudaChiSquareRuntime();

-  /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
-   * Result is put in &result variable
+   * Result is put in &result variable.
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
 		      int numpar, int numfunc, int nummap,
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -96,14 +107,16 @@ public:
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);


-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check if CUDA device is able to run the chi square kernel.
-   *  Redundant - all new CUDA devices that support RT compilation will also support 
-   *  double precision, there are no other requirements to run chi square on GPU
+  /** 
+   * Check if CUDA device is able to run the chi square kernel.
+   * Redundant - all new CUDA devices that support RT compilation will also support 
+   * double precision, there are no other requirements to run chi square on GPU
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
    return DKS_SUCCESS;
--- a/src/CUDA/CudaCollimatorPhysics.cu
+++ b/src/CUDA/CudaCollimatorPhysics.cu
@ -1,5 +1,6 @@
 #include "CudaCollimatorPhysics.cuh"

+//constants used in OPAL
 //#define M_P 0.93827231e+00
 #define M_P 0.93827204e+00
 #define C 299792458.0
@ -11,6 +12,7 @@
 #define Z_P 1
 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7

+//parameter array indexes
 #define POSITION 0 
 #define ZSIZE 1
 #define RHO_M 2
@ -28,12 +30,18 @@
 #define BLOCK_SIZE 128
 #define NUMPAR 13

+/**
+ * CUDA device function for calculating dot product.
+ */
 __device__ inline double dot(double3 &d1, double3 &d2) {

  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);

 }

+/**
+ * CUDA devce function to calculate cross product.
+ */
 __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  double3 tmp;
  tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@ -42,6 +50,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
  return tmp;
 }

+/**
+ * CUDA device function to calculate arbitrary rotation.
+ */
 __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
  double c=cos(Theta);
  double s=sin(Theta);
@ -59,6 +70,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
  return tmp;
 } 

+/**
+ * CUDA device function to check if particle is still in material.
+ * z - particle position, par - parameter array. Particle is considered inside the
+ * material if z is > material starting position and z < material starting position - mat size.
+ */
 __device__ inline bool checkHit(double &z, double *par) {

  /* check if particle is in the degrader material */
@ -67,6 +83,11 @@ __device__ inline bool checkHit(double &z, double *par) {
 }


+/**
+ * CUDA device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) 
 {

@ -111,6 +132,11 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,

 }

+/**
+ * CUDA device function for rotation in 2 dimensions.
+ * For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),  
+ * "Passage of particles through matter"
+ */
 __device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, 
 			   double &normP, double &thetacou, double &deltas, int coord,
 			   double *par) 
@ -145,6 +171,11 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }

+/**
+ * CUDA device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * For details on the algorithm see OPAL user guide.
+ */
 __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
 				   bool enableRutherfordScattering) 
 {
@ -211,6 +242,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
 }


+/**
+ * CUDA kernel that performs one step in particle movement trough mater.
+ * One thread is launched for each particle in the simulation. The kernel checks if the particle
+ * is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
+ * particles that are exiting the material.
+ * @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
 template <typename T>
 __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
 					int numparticles, bool enableRutherfordScattering)
@ -220,51 +262,62 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

-  //transfer params to shared memory
+  //transfer params and particle positions to shared memory
+  //R is kept in shared memory in order to reduce register pressure for the kernel
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double3 *R = (double3*)&smem[NUMPAR]; 

-  curandState s; 
+  curandState s; //each tread gets its own cuRand state for random number generation
  double3 P;

+  //load parameters to shared memory
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

+  //sync threads to ensure that parameters are finished loading
  __syncthreads();

+  //there might be some empty threads that do no work
  if (idx < numparticles) {
-    s = state[idx];
-    R[tid] = data[idx].Rincol;
-    P = data[idx].Pincol;
+    s = state[idx]; //load cuRand state to local memory
+    R[tid] = data[idx].Rincol; //load position to shared memory
+    P = data[idx].Pincol; //load momentum to local memory

    bool pdead = false;  
    volatile double sq = sqrt(1.0 + dot(P, P));

    double Eng;
    
+    //check if particle is still in the material
    if (checkHit(R[tid].z, p)) {      

+      //calculate enery loss
      Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);

+      //check if particle is not dead
      if (!pdead) {
 	double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
 	sq = sqrt(dot(P, P));

+	//caluclate Coulomb scattering
 	P.x = P.x * ptot / sq;
 	P.y = P.y * ptot / sq;
 	P.z = P.z * ptot / sq;
 	coulombScat(R[tid], P, s, p, enableRutherfordScattering); 

+	//update particle momentum
 	data[idx].Pincol = P;
      } else {
+	//mark particle as dead (-1)
 	data[idx].label = -1;
      }
      
+      //update cuRand state
      state[idx] = s;
    } else {
-    
+      //particle exits material - drift and mark as exiting (-2)
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -272,14 +325,25 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
      
    }
 
+    //update particle position
    data[idx].Rincol = R[tid];
  }

 }

-__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, 
-					 curandState *state, int numparticles,
-					 bool enableRutherfordScattering)
+/**
+ * CUDA kernel that performs one step in particle movement trough mater using SoA particles.
+ * Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
+ * Deprecated - GPU version does not use SoA.
+ * @param[in] data structure of arrays containing particle data
+ * @param[in] *par array of material properties, always constant size - 13
+ * @param[in] *state array holding cuRand states to preserve states between kernel launches
+ * @param[in] numparticles number of particles in the simulation
+ * @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
+ */
+__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par, 
+					   curandState *state, int numparticles,
+					   bool enableRutherfordScattering)
 {

  //get global id and thread id
@ -338,92 +402,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,

 }

-
+/**
+ * Device function to swich off unitless positions.
+ */
 inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x *= c;
  a.y *= c;
  a.z *= c;
 }

+/**
+ * Device function to swich on unitless positions.
+ */
 inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x /= c;
  a.y /= c;
  a.z /= c;
 }

-//swithch to unitless positions with dtc
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOn(R, dtc);
-    unitlessOn(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//swithc to unitless positions with dt*c
-__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-
-//swithc off unitless positions with dtc
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-
-    unitlessOff(R, dtc);
-    unitlessOff(X, dtc);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-
-}
-
-//switch off unitelss positions with dt*c
-__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
-
-  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (idx < npart) {
-    double3 R = gR[idx];
-    double3 X = gX[idx];
-    double dt = gdt[idx];
-
-    unitlessOff(R, dt*c);
-    unitlessOff(X, dt*c);
-
-    gR[idx] = R;
-    gX[idx] = X;
-  }
-}
-

+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] npart number of particles
+ * @param[in] dtc dt*c
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {

  //get global id and thread id
@ -451,7 +455,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
  }
 }

-
+/**
+ * CUDA kernel to perform particle push.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {

  //get global id and thread id
@ -478,6 +489,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
  }
 }

+/** 
+ * CUDA kernel to perform particle kick.
+ * @param[in] *gR array of particle positions
+ * @param[in] *gP array of particle momentums
+ * @param[in] *gEf 
+ * @param[in] *gBf
+ * @param[in] *gdt array of time steps for each particle
+ * @param[in] npart number of particles
+ * @param[in] c speed of light
+ */
 __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, 
 			   double3 *gBf, double *gdt, double charge, 
 			   double mass, int npart, double c)
@ -627,63 +648,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection

 }

-struct compare_particle
-{
-  int threshold;
-
-  compare_particle() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART p1, CUDA_PART p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct compare_particle_small
-{
-  int threshold;
-
-  compare_particle_small() {
-    threshold = 0;
-  }
-
-  void set_threshold(int t) {
-    threshold = t;
-  }
-
-  __host__ __device__
-  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
-    return p1.label > p2.label;
-  }
-
-  __host__  __device__
-  bool operator()(CUDA_PART_SMALL p1) {
-    return p1.label < threshold;
-  }
-};
-
-
-struct less_then
-{
-  __host__ __device__
-  bool operator()(int x)
-  {
-    return x < 0;
-  }
-};
-
 int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
 					     bool enableRutherfordScattering)
 {
--- a/src/CUDA/CudaCollimatorPhysics.cuh
+++ b/src/CUDA/CudaCollimatorPhysics.cuh
@ -20,7 +20,8 @@
 #include "CudaBase.cuh"

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU or MIC as AoS.
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
 */
 typedef struct __align__(16) {
  int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
 } CUDA_PART;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as AoS
+ * Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int label;
@ -47,7 +51,8 @@ typedef struct {
 } CUDA_PART_SMALL;

 /**
- * Structure for storing particle on GPU
+ * Structure for storing particle on GPU as SoA.
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
 */
 typedef struct {
  int *label;
@ -65,6 +70,9 @@ typedef struct {

 /**
 * Structure for storing particle on GPU
+ * Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
+ * contains only data that are used by the GPU kernels, the rest of the particle data must be kept
+ * on the host side.
 */
 typedef struct {
  int *label;
@ -73,11 +81,39 @@ typedef struct {
  double3 *Pincol;
 } CUDA_PART2_SMALL;

-/** CudaCollimatorPhysics class.
- * Contains kerenls that execute CollimatorPhysics functions form OPAL.
- * For detailed documentation on CollimatorPhysics functions see OPAL documentation
+/** 
+ * Operator used in thrust sort to compare particles by label.
+ * Used to move dead particles to the end of array, since they have label -1 or -2.
 */
-class CudaCollimatorPhysics : public DKSCollimatorPhysics{
+struct compare_particle_small
+{
+  int threshold;
+
+  compare_particle_small() {
+    threshold = 0;
+  }
+
+  void set_threshold(int t) {
+    threshold = t;
+  }
+
+  __host__ __device__
+  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
+    return p1.label > p2.label;
+  }
+
+  __host__  __device__
+  bool operator()(CUDA_PART_SMALL p1) {
+    return p1.label < threshold;
+  }
+};
+
+/** 
+ * CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Contains kerenls that execute CollimatorPhysics functions form OPAL.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
+class CudaCollimatorPhysics : public DKSCollimatorPhysics {

 private:

@ -86,32 +122,44 @@ private:

 public:

-  /** Constructor with CudaBase argument
-   *
+  /** 
+   * Constructor with CudaBase as argument.
+   * Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
   */
  CudaCollimatorPhysics(CudaBase *base) {
    m_base = base;
    base_create = false;
  }

-  /** Constructor - empty. */
+  /** 
+   * Empty constructor.
+   * Create a new instance of CudaCollimatorPhysics with its own CudaBase. 
+   */
  CudaCollimatorPhysics() { 
    m_base = new CudaBase();
    base_create = true;
  }

-  /** Destructor - empty */
+  /** 
+   * Destructor.
+   * Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
+   */
  ~CudaCollimatorPhysics() { 
    if (base_create)
      delete m_base;
  };

-  /** Execute collimator physics kernel.
+  /** 
+   * Execute collimator physics kernel.
   *
   */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, 
 			int numpartices, bool enableRutherforScattering = true);

+  /** 
+   * Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, 
 			   void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			   void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
      return DKS_ERROR;
    }

-  /** Sort particle array on GPU.
+  /** 
+   * Sort particle array on GPU.
   * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
   * array so these particles are at the end of array
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
  
+  /** 
+   * Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
+   * Used only on the MIC side, was not implemented on the GPU.
+   */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, 
 			       void *rx_ptr, void *ry_ptr, void *rz_ptr, 
 			       void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,18 +187,25 @@ public:
      return DKS_ERROR;
    }

-  /** BorisPusher push function for integration from OPAL.
+  /** 
+   * BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, 
 			   double dt, double c, bool usedt = false, int streamId = -1);

+  /** 
+   * BorisPusher kick function for integration from OPAL.
+   * ParallelTTracker integration from OPAL implemented in cuda.
+   * For more details see ParallelTTracler docomentation in opal
+   */
  int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
 			   void *bf_ptr, void *dt_ptr, double charge, double mass,
 			   int npart, double c, int streamId = -1); 

-  /** BorisPusher push function with transformto function form OPAL
+  /** 
+   * BorisPusher push function with transformto function form OPAL.
   * ParallelTTracker integration from OPAL implemented in cuda.
   * For more details see ParallelTTracler docomentation in opal
   */
--- a/src/CUDA/CudaFFT.cuh
+++ b/src/CUDA/CudaFFT.cuh
@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "CudaBase.cuh"

+/**
+ * Cuda FFT class based on BaseFFT interface.
+ * Uses cuFFT library to perform FFTs on nvidias GPUs.
+ */
 class CudaFFT : public BaseFFT {

 private:
@ -34,7 +38,7 @@ public:
  ~CudaFFT();
 		
  /**
-   * Info: init cufftPlans witch can be reused for all FFTs of the same size and type
+   * Init cufftPlans witch can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }

  /**
-   * Info: destroy default FFT plans
+   * Destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();

-  /*
-    Info: execute complex to complex double precision fft using cufft library
-    Return: success or error code
-  */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
 		
-  /*
-    Info: execute ifft 
-    Return: success or error code
-  */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute normalize using cuda kernel for complex to complex iFFT
-    Return: success or error code
-  */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: execute real to complex double precision FFT
-    Return: success or error code
-  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
 		
-  /*
-    Info: exectue complex to real double precision FFT
-    Return: success or error code
-  */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);

-  /*
-    Info: execute normalize for complex to real iFFT
-    Return: success or error code
-  */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);

 };
--- a/src/CUDA/CudaGreensFunction.cuh
+++ b/src/CUDA/CudaGreensFunction.cuh
@ -12,6 +12,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "CudaBase.cuh"

+/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class CudaGreensFunction : public GreensFunction{

 private:
--- a/src/CUDA/CudaImageReconstruction.cuh
+++ b/src/CUDA/CudaImageReconstruction.cuh
@ -10,6 +10,7 @@
 #include "../Algorithms/ImageReconstruction.h"
 #include "CudaBase.cuh"

+/** CUDA implementation of ImageReconstruction interface. */
 class CudaImageReconstruction : public ImageReconstruction {

 private:
--- a/src/DKSBase.h
+++ b/src/DKSBase.h
@ -1,11 +1,3 @@
-/** DKSBase class.
- * DKSBase.h
- * Author: Uldis Locans
- * Date: 15.09.2014
- * Base class of Dynamic Kernel Scheduler that handles the function calls
- * from host application to DKS
- */
-
 #ifndef H_DKS_BASE
 #define H_DKS_BASE

@ -41,7 +33,12 @@

 #include "AutoTuning/DKSConfig.h"

-/** DKSBase class for handling function calls to DKS library */
+/** 
+ * API for handling communication function calls to DKS library.
+ * DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
+ * memory manegement, data transfer and other basic communication functions between
+ * the host and device.
+ */
 class DKSBase {

 private:
@ -74,7 +71,7 @@ protected:
  DKSConfig dksconfig;

  /** 
-   * Check if current API is set to OpenCL
+   * Check if current API is set to OpenCL.
   * Return true/false wether current api is opencl
   */
  bool apiOpenCL();
@ -91,11 +88,11 @@ protected:
   */
  bool apiOpenMP();

-  /** Check if device is GPU */
+  /** Check if device is GPU. */
  bool deviceGPU();
-  /** Check if device is CPU */
+  /** Check if device is CPU. */
  bool deviceCPU();
-  /** Check if device is MIC */
+  /** Check if device is MIC. */
  bool deviceMIC();

  /**
--- a/src/DKSBaseMuSR.h
+++ b/src/DKSBaseMuSR.h
@ -20,6 +20,11 @@
 #include "OpenCL/OpenCLChiSquareRuntime.h"
 #endif

+/**
+ * API to handle musrfit calls to DKS library.
+ * Using ChiSquareRuntime interface allows to call chi square functions on the 
+ * GPU or CPU using CUDA or OpenCL.
+ */
 class DKSBaseMuSR : public DKSFFT {

 private:
--- a/src/DKSFFT.h
+++ b/src/DKSFFT.h
@ -24,6 +24,10 @@
 #include "MIC/MICFFT.h"
 #endif

+/**
+ * API to handel calls to DKSFFT.
+ * Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
+ */
 class DKSFFT : public DKSBase {

 private:
--- a/src/DKSImageReconstruction.h
+++ b/src/DKSImageReconstruction.h
@ -10,6 +10,9 @@
 #include "CUDA/CudaImageReconstruction.cuh"
 #endif

+/**
+ * API to handle PET image reconstruction calls.
+ */
 class DKSImageRecon : public DKSBase {

 private:
@ -22,87 +25,88 @@ public:

  ~DKSImageRecon();

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position, 
 			  void *avg, void *std, float diameter, int total_voxels, 
 			  int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position, 
 			      void *avg, void *std, float diameter, int total_voxels, 
 			      int total_sources, int start = 0);


-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position, 
 			   void *avg, void *std, void *diameter, int total_voxels, 
 			   int total_sources, int start = 0);

-  /** Image reconstruction analaysis calculate source.
-   * 
-   *
+  /** 
+   * Image reconstruction analaysis calculate source.
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, 
 			       void *avg, void *std, void *diameter, int total_voxels, 
 			       int total_sources, int start = 0);

-  /** Image reconstruction - generate normalization.
-   * 
+  /** 
+   * Image reconstruction - generate normalization.
   */
  int callGenerateNormalization(void *recon, void *image_position, 
 				void *det_position, int total_det);

-  /** Image reconstruction - forward correction.
-   * 
+  /** 
+   * Image reconstruction - forward correction.
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, 
 			    void *image_position, int num_events);

-  /** Image reconstruction - backward projection.
-   * 
+  /** 
+   * Image reconstruction - backward projection.
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, 
 			     void *det_position, void *image_position, 
 			     int num_events, int num_voxels);

-  /** Set the voxel dimensins on device.
+  /** 
+   * Set the voxel dimensins on device.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);

-  /** Set the image edge.
+  /** 
+   * Set the image edge.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);

-  /** Set the image edge1.
+  /** 
+   * Set the image edge1.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);

-  /** Set the minimum crystan in one ring values.
+  /** 
+   * Set the minimum crystal in one ring values.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);

-  /** Set all other required parameters for reconstruction.
+  /** 
+   * Set all other required parameters for reconstruction.
   * Values are stored in GPU memory and used in forward and backward projection calculations.
   * Call set function once to transfer the values from host side to GPU.
   * If value changes on the host side set functions needs to be called again to update GPU values.
--- a/src/DKSOPAL.h
+++ b/src/DKSOPAL.h
@ -33,6 +33,11 @@
 #include "MIC/MICCollimatorPhysics.h"
 #endif

+/**
+ * API to handle OPAL calls to DKS library.
+ * Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
+ * functions.
+ */
 class DKSOPAL : public DKSFFT {

 private: 
--- a/src/MIC/MICBase.h
+++ b/src/MIC/MICBase.h
@ -26,6 +26,9 @@

 #define MIC_WIDTH 128

+/** MIC Base class handles device setup and basic communication with the device.
+ * Handles devicew setup, memory manegement and  data transfers.
+ */
 class MICBase {

 private:
@ -45,58 +48,60 @@ public:

  int m_device_id;

-  /* constructor */
+  /** constructor */
  MICBase();

-  /* destructor */
+  /** destructor */
  ~MICBase();

-  /*
-    Info: create MKL rand streams for each thread
-    Return: success or error code
-  */
+  /**
+   * Create MKL rand streams for each thread
+   *  Return: success or error code
+   */
  int mic_createRandStreams(int size);

-  /*
-    Info: delete MKL rand streams
-    Return: succes or error code
-  */
+  /**
+   * Delete MKL rand streams
+   * Return: succes or error code
+   */
  int mic_deleteRandStreams();

-  /*
-    Info: create a new signal for the mic
-    Return: success or error code
-  */
+  /**
+   * Create a new signal for the mic.
+   * Signals can be used for assynchronous data transfers.
+   * Return: success or error code
+   */
  int mic_createStream(int & streamId);

-  /*
-    Info: get the signal from the vector
-    Return: mic signal
+  /**
+   * Info: get the signal from the vector.
+   * Return: mic signal
  */
  int& mic_getStream(int id);

-  /*
-    Info: delete streams
-    Return: success or error code
-  */
+  /**
+   * Info: delete streams.
+   * Return: success or error code
+   */
  int mic_deleteStreams();

-  /*
-    Info: set device id
-    Return: success or error code
-  */
+  /**
+   * Info: set device id.
+   * Return: success or error code
+   */
  int mic_setDeviceId(int id);

-  /*
-    Info: get mic devices
-    Return: success or error code
-  */
+  /**
+   * Info: get mic devices.
+   * Prints information about mic devices.
+   * Return: success or error code
+   */
  int mic_getDevices();

-  /*
-    Info: allocate memory on MIC device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory on MIC device.
+   * Return: success or error code
+   */
  template<typename T>
  void * mic_allocateMemory(int size) {

@ -109,10 +114,10 @@ public:
    return tmp;
  }

-  /*
-    Info: transfer data to device
-    Return: success or error code
-  */
+  /**
+   * Transfer data to device.
+   * Return: success or error code
+   */
  template<typename T>
  int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -123,10 +128,10 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: write data to device, non-blocking
-    Return: success or error code
-  */
+  /**
+   * Write data to device, non-blocking.
+   * Return: success or error code
+   */
  template<typename T>
  int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) 
  {
@ -139,10 +144,10 @@ public:
  }
  

-  /*
-    Info: read data from device
-    Return: success or error code
-  */
+  /**
+   * Read data from device
+   * Return: success or error code
+   */
  template<typename T>
  int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
    T* tmp_ptr = (T*)data_ptr;
@ -154,10 +159,10 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: read data from device waiting for signal
-    Return: success or error code
-  */
+  /**
+   * Read data from device waiting for signal
+   * Return: success or error code
+   */
  template<typename T>
  int mic_readDataAsync(const void * data_ptr, void * result, int size, 
 			int streamId = -1, int offset = 0) {
@ -172,10 +177,10 @@ public:

  }

-  /* 
-     Info: wait till all the signals are complete
-     Return siccess or error code
-  */
+  /**
+   * Wait till all the signals are complete
+   * Return siccess or error code
+   */
  int mic_syncDevice() {
    
    //empty offload to wait for all the signals to finish and launch a new empy signal
@ -193,10 +198,10 @@ public:

  }

-  /*
-    Info: free memory on device
-    Return: success or error code
-  */
+  /**
+   * Free memory on device
+   * Return: success or error code
+   */
  template<typename T>
  int mic_freeMemory(void * data_ptr, int size) {

@ -210,10 +215,10 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Info: allocate memory and write data to device
-    Return: success or error code
-  */
+  /**
+   * Allocate memory and write data to device
+   * Return: success or error code
+   */
  template<typename T>
  void * mic_pushData(const void * data, int size) {
    T* tmp_ptr = new T[size];
@ -227,10 +232,10 @@ public:
  return tmp_ptr;
 }

-/*
-  Info: read data and free memory on device
-  Return: success or erro code
-*/
+  /**
+   * Read data and free memory on device
+   * Return: success or erro code
+   */
  template<typename T>
  int mic_pullData(void * data_ptr, void * result, int size) {
    T* tmp_ptr = (T*)data_ptr;
--- a/src/MIC/MICChiSquare.h
+++ b/src/MIC/MICChiSquare.h
@ -14,6 +14,9 @@
 #include <offload.h>
 #include "MICBase.h"

+/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices. 
+ * Not complete and untested because of the poor performance of first MIC devices.
+ */
 class MICChiSquare {

  MICBase *m_micbase;
--- a/src/MIC/MICCollimatorPhysics.cpp
+++ b/src/MIC/MICCollimatorPhysics.cpp
@ -22,22 +22,34 @@
 #define I_M 10
 #define DT_M 11

+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(mic_double3 d1, mic_double3 d2) {
  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
 }

+/**
+ * MIC device function for calculating dot product.
+ */
 __declspec(target(mic))
 double dot(double dx, double dy, double dz) {
  return (dx * dx + dy * dy + dz * dz);
 }

+/**
+ * MIC device function to check if particle is still in material.
+ */
 __declspec(target(mic))
 bool checkHit(double &z, double *par) {
  return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
 }


+/**
+ * MIC device function to calculate arbitrary rotation.
+ */
 __declspec(target(mic))
 void Rot(double &px, double &pz, double &x, double &z, double xplane, 
 	 double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
 }

+
+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses AoS to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ * Deprecated on favor of SoA data layout.
+ */
 __declspec(target(mic))
 void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr

 }

+/**
+ * MIC device function to calculate Coulomb scattering for one particle.
+ * Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
+ * Uses SoA to store particle positions and momentum, paralelized using OpenMP.
+ * For details on the algorithm see OPAL user guide.
+ */
 __declspec(target(mic))
-void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
+void coulombScat(double *rx, double *ry, double *rz, 
+		 double *px, double *py, double *pz, int *label,
 		 double *par, VSLStreamStatePtr &stream, int ii, int size) 
 {

+  //arrays for temporary storage, each core proceses MIC_WIDTH particles
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

+  //simd instruction tells the compiler its safe to vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
+  //vectorize the loop
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
    }
  }
  
-  
  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
  
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {

@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
    pdead = 1;
 }

+/**
+ * MIC device function to calculate energyLoss for one particle.
+ * Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
+ * algorith are available in OPAL user guide.
+ */
 __declspec(target(mic))
 void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
  double *par = (double*) par_ptr;
  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;

+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target(mic:m_micbase->m_device_id)		\
  inout(data:length(0) DKS_RETAIN DKS_REUSE)	\
  in(par:length(0) DKS_RETAIN DKS_REUSE)	\
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
      VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
      
      //for loop trough particles if not checkhit set label to -2 and update R.x
-
 #pragma omp for simd
      for (int i = 0; i < numparticles; i++) {
 	if ( !checkHit(data[i].Rincol.z, par) ) {
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
 {


-
+  //cast device memory pointers to appropriate types
  int *label = (int*)label_ptr;
  unsigned *localID = (unsigned*)localID_ptr;
  double *rx = (double*)rx_ptr;
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt

  VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;

+  /* offload the computation to the MIC, reuses the memory already allocated on the mic.
+     the memory allocation and data trasnfer need to be handled before */
 #pragma offload target (mic:0) \
  in(label:length(0) DKS_REUSE DKS_RETAIN)	\
  in(localID:length(0) DKS_REUSE DKS_RETAIN)	\
--- a/src/MIC/MICCollimatorPhysics.h
+++ b/src/MIC/MICCollimatorPhysics.h
@ -26,6 +26,12 @@ typedef struct {
 } MIC_PART_SMALL;


+/**
+ * MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes OPALs collimator physics class for particle matter interactions using OpenMP
+ * and offload mode targetomg Intel Xeon Phi processors.
+ * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
+ */
 class MICCollimatorPhysics : public DKSCollimatorPhysics {

 private:
--- a/src/MIC/MICFFT.h
+++ b/src/MIC/MICFFT.h
@ -10,6 +10,10 @@
 #include "../Algorithms/FFT.h"
 #include "MICBase.h"

+/** 
+ * MIC FFT based on BaseFFT interface.
+ * uses MKL library to offload FFT on Intel Xeon Phi devices.
+ */
 class MICFFT : public BaseFFT {

 private:
--- a/src/MIC/MICGreensFunction.hpp
+++ b/src/MIC/MICGreensFunction.hpp
@ -15,6 +15,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

+/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class MICGreensFunction : public GreensFunction {

 private:
--- a/src/MIC/MICMergeSort.h
+++ b/src/MIC/MICMergeSort.h
@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  return p;
 }

+/**
+ * Merge sort implementation for intel MIC.
+ * Paralellized over all the MIC cores using OpenMP tasks.
+ */
 template <typename T>
 void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {

@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
  }
 }

+/**
+ * Quicksort algorithm, developed for use on Intel MIC devices.
+ */
 template <typename T>
 void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

 }

+/** 
+ * Insertion sort of @p list, developed for use on Intel MIC.
+ * Used by quick_sort to sort small lists.
+ */
 template <typename T>
 void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

--- a/src/OpenCL/OpenCLBase.h
+++ b/src/OpenCL/OpenCLBase.h
@ -1,16 +1,3 @@
-/*
-
-  Name: OpenCLBase
-
-  Author: Uldis Locans
-
-  Info: OpenCL base class to handle all the common details associated 
-  with kernel launch on OpenCL device
-
-  Date: 2014.09.18
-
-*/
-
 #ifndef H_OPENCL_BASE
 #define H_OPENCL_BASE

@ -32,7 +19,7 @@

 #include "../DKSDefinitions.h"

-/* struct for random number state */
+/** struct for random number state. */
 typedef struct {
  double s10;
  double s11;
@ -44,168 +31,194 @@ typedef struct {
  bool gen;
 } RNDState;

+/**
+ * OpenCL base class to handle device setup and basic communication wiht the device.
+ * Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
+ * The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
+ * class contains methods to read the kernel files, compile the kernel codes and launch kernels
+ * from the compiled codes. Which kernel file needs to be loaded for the specif functin is 
+ * handled by the base class that is launching the kernel.
+ */
 class OpenCLBase {

 private:

+  //variables containig OpenCL device and platform ids
  static cl_platform_id m_platform_id;
  static cl_device_id m_device_id;

+  //variables containit compiled OpenCL program and kernel
  cl_context_properties m_context_properties[3];
  cl_program m_program;
  cl_kernel m_kernel;
 	
+  //variables for tracking OpenCL events
  static cl_event m_last_event;
  cl_int m_num_events;
  std::vector<cl_event> m_events;
 	
+  //currently load kernel file
  char * m_kernel_file;

+  //type of device used by OpenCL
  cl_device_type m_device_type;
 	
-  /*
-    Name: getPlatforms
-    Info: get all avaialble platforms and save in m_platform_ids, save number of platforms
-    Return: success or error code
-  */
+  /**
+   * Get all available OpenCL platforms.
+   * Get all avaialble platforms and save in m_platform_ids, save number of platforms
+   *  Return: success or error code
+   */
  int ocl_getPlatforms();
 	
 	
-  /*
-    Name: getDevice
-    Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
-    ReturnL success or error code
-  */
+  /**
+   * Get first available OpenCL device of specified type.
+   * Get first avaialble devices and save device id and platform id for this device, 
+   * device name: (-gpu, -mic, -cpu)
+   *  ReturnL success or error code
+   */
  int ocl_getDevice(const char* device_name);
 	
-  /*
-    Name getDeviceType
-    Info: get device type from device name (-gpu, -cpu, -mic)
-    Return: success or error code
-  */
+  /**
+   * Get cl_device_type from the specified device name.
+   * get device type from device name (-gpu, -cpu, -mic)
+   *  Return: success or error code
+   */
  int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
 	
-  /*
-    Name: createContext
-    Info: create context with specified device
-    Return: success or error code
-  */
+  /**
+   * Create OpenCL context with specified device.
+   *  Return: success or error code
+   */
  int ocl_createContext();
 	
-  /*
-    Name: buildProgram
-    Info: build program from specified kernel file
-    Return: success or error code
+  /**
+   * Build program from specified kernel file.
+   * Return: success or error code.
  */
  int ocl_buildProgram(const char* kernel_file);

-  /** Compile program from kernel source string
-   *
+  /** 
+   * Compile program from kernel source string.
+   * Takes a string read from OpenCL kernel file saved in kernel_source and compiles the 
+   * OpenCL program, that can be then executed on the device.
+   * opts is a string specifiend additional compiler flags.
   */
  int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);

 protected:

+  //memory for random number states
  int defaultRndSet;
  cl_mem defaultRndState;
 	
 	
 public:

+  //OpenCL context and commad queue
  static cl_context m_context;
  static cl_command_queue m_command_queue; 
    
-  /*
-    constructor
-  */
+  /**
+   * constructor
+   */
  OpenCLBase();
    
-  /*
-    destructor
-  */
+  /**
+   * destructor
+   */
  ~OpenCLBase();
    
-  /*
-    Create RND states
-    Return: success or error code
-  */
+  /**
+   * Allocate memory for size random number states and init the rnd states.
+   * Uses AMD clRng library for random numbers. 
+   * This library is only compatible with AMD devices.
+   */
  int ocl_createRndStates(int size);

-  /* Create an array of random numbers on the device
-   *
+  /** 
+   * Create an array of random numbers on the device.
+   * Filles hte mem_ptr with random numbers.
   */
  int ocl_createRandomNumbers(void *mem_ptr, int size);

-  /*
-    Destroy rnd states
-    Return: success or error code
-  */
+  /**
+   * Destroy rnd states and free device memory.
+   * Return: success or error code
+   */
  int ocl_deleteRndStates();


-  /*
-    Name: getAllDevices
-    Info: get all available devices
-    ReturnL success or error code
+  /**
+   * Prints info about all the available platforms and devices.
+   * Can be used for information purposes to see what devices are available on the system.
+   * ReturnL success or error code.
  */
  int ocl_getAllDevices();

-  /** Get the OpenCL device count for the set type of device
-   *
+  /** 
+   * Get the OpenCL device count for the set type of device.
+   * Device count is set in ndev parameter, returns success or error code.
   */
  int ocl_getDeviceCount(int &ndev);

-  /** Get the name of the device used
+  /** 
+   * Get the name of the device currently us use.
   */
  int ocl_getDeviceName(std::string &device_name);

-  /** Set the device to use for OpenCL kernels.
-   *  device id to use is passed as integer.
+  /** 
+   * Set the device to use for OpenCL kernels.
+   * Device id to use is passed as integer.
   */
  int ocl_setDevice(int device);

-  /** Get a list of all the unique devices of the same type that can run OpenCL kernels
-   *  Used when GPUs of different types might be pressent on the system.
+  /** 
+   * Get a list of all the unique devices of the same type that can run OpenCL kernels.
+   * Used when GPUs of different types might be pressent on the system.
   */
  int ocl_getUniqueDevices(std::vector<int> &devices);
    
-  /*
-    Name: setUp
-    Info: set up opencl resources
-    Return: success or error code
-  */
+  /**
+   * Initialize OpenCL connection with a device of specified type.
+   * Find if specified device is avaialble, creates a contex and command queue.
+   * Returns success or error code.
+   */
  int ocl_setUp(const char* device_name);
 	
-  /*
-    Name: loadKernel
-    Info: load and compile opencl kernel file if it has changed
-    Return: success or error code
+  /**
+   * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
+   * Load and compile opencl kernel file if it has changed.
+   * Return: success or error code
  */
  int ocl_loadKernel(const char* kernel_file);


-  /** Build program from kernel source.
+  /** 
+   * Build program from kernel source.
   * Builds a program from source code provided in kernel_source.
   * If compilation fails will return DKS_ERROR
   */
  int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
 	
-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory on the device.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int &ierr);

-  /*
-    Name: allocateMemory
-    Info: allocate memory on device
-    Return: return pointer to memory
+  /**
+   * Allocate memory of specific type on device.
+   * The availabel types are cl_mem_flags type listed in OpenCL documentation:
+   * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR, 
+   * CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
+   * Return: return pointer to memory
  */
  cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
 	
-  /** Zero OpenCL memory buffer
-   *  Set all the elemetns in the device array to zero
+  /** 
+   * Zero OpenCL memory buffer.
+   * Set all the elemetns in the device array to zero.
   */
  template <typename T>
  int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@ -218,92 +231,90 @@ public:
    return DKS_SUCCESS;
  }

-  /*
-    Name: writeData
-    Info: write data to device memory (needs ptr to mem object)
-    Return: success or error code
-  */
+  /**
+   * Write data to device memory (needs ptr to mem object)
+   * Return: success or error code
+   */
  int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: copyData
-    Info: copy data from one buffer on the device to another
-    Return: success or error code
-  */
+  /** 
+   * Copy data from one buffer on the device to another
+   * Return: success or error code
+   */
  int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
 	
-  /*
-    Name: createKernel
-    Info: create kernel from program
-    Return: success or error code
-  */
+  /** 
+   * Create kernel from compiled OpenCL program.
+   * Return: success or error code
+   */
  int ocl_createKernel(const char* kernel_name);
 	
-  /*
-    Name: setKernelArgs
-    Info: set opencl kernel arguments
-    Return: success or error code
-  */
+  /**
+   * Set argiments for the kernel that will be launched.
+   * Return: success or error code
+   */
  int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
 	
-  /*
-    Name: executeKernel
-    Info: execute selected kernel (needs kernel parameters)
-    Return: success or error code
+  /**
+   * Execute selected kernel.
+   * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
+   * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
+   * kernel arguments must be set.
+   * Return: success or error code
  */
  int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
 	
-  /*
-    Name: readData
-    Info: read data from device (needs pointer to mem object)
-    Return: success or error code
-  */
+  /**
+   * Read data from device (needs pointer to mem object).
+   * Return: success or error code
+   */
  int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
 	
-  /*
-    Name: freeMemory
-    Info: free device memory (needs ptr to mem object)
-    Return: success or error code
-  */
+  /**
+   * Free device memory (needs ptr to mem object).
+   *  Return: success or error code
+   */
  int ocl_freeMemory(cl_mem mem_ptr);
 	
-  /*
-    Name: cleanUp
-    Info: free opencl resources
-    Return: success or error code
-  */
+  /**
+   * Free opencl resources.
+   * Deletes the kernel, compiled program, command queue and colese the connection
+   * to device by releasing the context.
+   * Return: success or error code
+   */
  int ocl_cleanUp();
 	
-  /*
-    Name: deviceInfo
-    Info: print device info (mostly for debugging purposes)
-    Return: success or error code
-  */
+  /**
+   * Print info of currently selected device.
+   * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
+   * Return: success or error code
+   */
  int ocl_deviceInfo(bool verbose = true);

-  /* Check OpenCL kernel.
-   * Query device and check if it can run the kernel with required parameters
+  /* 
+   * Check OpenCL kernel.
+   * Query device and check if it can run the kernel with required parameters.
+   * Also check the available OpenCL extensions - usefull for checking the supported device
+   * features, like double precission.
   */
  int ocl_checkKernel(const char* kernel_name, int work_group_size,
 		      bool double_precision, int &threadsPerBlock);
 	
-  /*
-    Name: clearEvents
-    Info: clear saved events (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * Clear the event list.
+   * Events can be used for timing and synchronization purposes.
+   */
  void ocl_clearEvents();

-  /*
-    Name: eventInfo
-    Info: print information about kernel timings (for debuging purposes)
-    Return: nothing
-  */
+  /**
+   * print information about kernel timings from event list.
+   * for debuging purposes
+   */
  void ocl_eventInfo();

-  /*
-    Return current command queue
-  */
+  /**
+   * Return current command queue.
+   */
  cl_command_queue ocl_getQueue() { return m_command_queue; }
 };

--- a/src/OpenCL/OpenCLChiSquare.h
+++ b/src/OpenCL/OpenCLChiSquare.h
@ -14,7 +14,7 @@
 #define DKS_SUCCESS 0
 #define DKS_ERROR 1

-
+/** Deprecated, SimpleFit implementation of ChiSquare. */
 class OpenCLChiSquare {

 private:
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
 }

 int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
+  //write params to gpu
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
  return ierr;
 }
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  if (numfunc == 0)
    return DKS_SUCCESS;

+  //write function values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
  return ierr;
 }
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  if (nummap == 0)
    return DKS_SUCCESS;

+  //wrtie map values to the GPU
  int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
  return ierr;
 }
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
    freeChiSquare();
  }

-  //allocate temporary memory
+  //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
  mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
  mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
  if (size_func == 0)
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
  int ierr = DKS_ERROR;
  if (initDone_m) {

-    //free memory
+    //free GPU memory
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@ -308,6 +311,7 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
    return DKS_ERROR;
  }

+  //check the GPU kernel
  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);

  return ierr;
--- a/src/OpenCL/OpenCLChiSquareRuntime.h
+++ b/src/OpenCL/OpenCLChiSquareRuntime.h
@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *

 const std::string openclFunctFooter = "}\n";

+/**
+ * OpenCL implementation of ChiSquareRuntime class.
+ * Implements ChiSquareRuntime interface to allow musrfit to target devices that
+ * support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
+ */
 class OpenCLChiSquareRuntime : public ChiSquareRuntime {

 private:

  OpenCLBase *m_oclbase;

-  /** Private function to add user defined function to kernel string
-   *
+  /** 
+   * Private function to add user defined function to kernel string.
   */
  std::string buildProgram(std::string function);

+  /**
+   * Launch parallel reduction kernel to calculate the sum of data array
+   */
  double calculateSum(cl_mem data, int length);

 public:

-  /** Constructor wiht openclbase argument
-   *
+  /** 
+   * Constructor wiht openclbase argument.
   */
  OpenCLChiSquareRuntime(OpenCLBase *base);
  
-  /** Default constructor
-   *
+  /** 
+   * Default constructor
   */
  OpenCLChiSquareRuntime();

-  /** Default destructor
-   *
+  /** 
+   * Default destructor
   */
  ~OpenCLChiSquareRuntime();

-    /** Compile program and save ptx.
+  /** 
+   * Compile program and save ptx.
   * Add function string to the calcFunction kernel and compile the program
   * Function must be valid C math expression. Parameters can be addressed in
   * a form par[map[idx]]
   */
  int compileProgram(std::string function, bool mlh = false);

-  /** Launch selected kernel
+  /** 
+   * Launch selected kernel.
   * Launched the selected kernel from the compiled code.
   * Result is put in &result variable
   */
@ -64,22 +74,26 @@ public:
 		      double timeStart, double timeStep,
 		      double &result);

-  /** Write params to device.
+  /** 
+   * Write params to device.
   * Write params from double array to mem_param_m memory on the device.
   */
  int writeParams(const double *params, int numparams); 

-  /** Write functions to device.
+  /** 
+   * Write functions to device.
   * Write function values from double array to mem_func_m memory on the device.
   */
  int writeFunc(const double *func, int numfunc);

-  /** Write maps to device.
+  /** 
+   * Write maps to device.
   * Write map values from int array to mem_map_m memory on the device.
   */
  int writeMap(const int *map, int nummap);

-  /** Allocate temporary memory needed for chi square.
+  /** 
+   * Allocate temporary memory needed for chi square.
   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
   * the maximum number of elements in any datasets that will be used for calculations. Size_param,
   * size_func and size_map are the maximum number of parameters, functions and maps used in 
@ -87,14 +101,16 @@ public:
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);

-  /** Free temporary memory allocated for chi square.
+  /** 
+   * Free temporary memory allocated for chi square.
   * Frees the chisq temporary memory and memory for params, functions and maps
   */
  int freeChiSquare();

-  /** Check MuSR kernels for necessary resources.
+  /** 
+   * Check MuSR kernels for necessary resources.
   * Query device properties to get if sufficient resources are
-   * available to run the kernels
+   * available to run the kernels. Also checks if double precission is enabled on the device.
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock);

--- a/src/OpenCL/OpenCLCollimatorPhysics.h
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@ -17,12 +17,16 @@
 #include "boost/compute/core.hpp"
 */

+/** Double3 structure for use in OpenCL code. */
 typedef struct {
  double x;
  double y;
  double z;
 } Double3;

+/**
+ * Structure for stroing particles in OpenCL code.
+ */
 typedef struct {
  int label;
  unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
 //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
 //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));

+/**
+ * OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
+ * Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
+ */
 class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {

 private:
@ -42,16 +50,20 @@ private:

 public:

-  /* constructor */
+  /** 
+   * Constructor with OpenCLBase as argument.
+   * Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
+   */
  OpenCLCollimatorPhysics(OpenCLBase *base) { 
    m_oclbase = base;
  }

-  /* destructor */
+  /** 
+   * Destructor.
+   */
  ~OpenCLCollimatorPhysics() { 
  }

-  /* execute degrader code on device */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, 
 			bool enableRutherforScattering = true);

--- a/src/OpenCL/OpenCLFFT.h
+++ b/src/OpenCL/OpenCLFFT.h
@ -1,14 +1,3 @@
-/*
-
-  Name: OpenCLFFT
-
-  Author: Uldis Locans
-
-  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
-
-  Data: 19.09.2014
-
-*/
 #ifndef H_OPENCL_FFT
 #define H_OPENCL_FFT

@ -22,6 +11,12 @@

 #include "clFFT.h"

+/**
+ * OpenCL FFT class based on BaseFFT interface.
+ * Uses clFFT library to perform FFTs on AMD gpus.
+ * clFFT library works also on nvida GPUs and other devices that
+ * support OpenCL.
+ */
 class OpenCLFFT : public BaseFFT {

 private:
--- a/src/OpenCL/OpenCLGreensFunction.h
+++ b/src/OpenCL/OpenCLGreensFunction.h
@ -7,6 +7,7 @@
 #include "../Algorithms/GreensFunction.h"
 #include "OpenCLBase.h"

+/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
 class OpenCLGreensFunction : public GreensFunction {

 private:
@ -31,7 +32,7 @@ public:
  int buildProgram();

  /**
-    Info: calc itegral on device memory (taken from OPAL src code)
+    Info: calc itegral on device memory (taken from OPAL src code).
    Return: success or error code
  */
  int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, 
@ -39,20 +40,20 @@ public:
 		       int streamId = -1);
 		
  /**
-    Info: integration of rho2_m field (taken from OPAL src code)
+    Info: integration of rho2_m field (taken from OPAL src code).
    Return: success or error code
  */
  int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
 				  int streamId = -1);
 		
  /**
-    Info: mirror rho field (taken from OPAL src code)
+    Info: mirror rho field (taken from OPAL src code).
    Return: succes or error code
  */
  int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);

  /**
-    Info: multiply complex fields already on the GPU memory, result will be put in ptr1
+    Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
    Return: success or error code
  */
  int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
--- a/src/Utility/DKSTimer.h
+++ b/src/Utility/DKSTimer.h
@ -5,6 +5,10 @@
 #include <string>
 #include <sys/time.h>

+/**
+ * Custom timer class.
+ * Allows to insert timers in the code to get function exectution times.
+ */
 class DKSTimer {

 private:
@ -17,39 +21,45 @@ private:

 public:

-  /** Init DKSTimer by seting timer to zero  */
+  /** Init DKSTimer by seting timer to zero. */
  DKSTimer();

  ~DKSTimer();

-  /** Init the timer
-   *  Set the name for timer and clear all values
+  /** 
+   * Init the timer.
+   * Set the name for timer and clear all values
   */
  void init(std::string n);

-  /** Start the timer.
-   *  Get the curret time with gettimeofday and save in timeStart
+  /** 
+   * Start the timer.
+   * Get the curret time with gettimeofday and save in timeStart
   */
  void start();

-  /** Stop the timer 
-   *  Get the curretn time with gettimeofday and save in timeEnd
-   *  Calculate elapsed time by timeEnd - timeStart and add to timervalue
+  /** 
+   * Stop the timer.
+   * Get the curretn time with gettimeofday and save in timeEnd
+   * Calculate elapsed time by timeEnd - timeStart and add to timervalue
   */
  void stop();

-  /** Reset timervalue to zero.
-   *  Set timervalue, timeStart and timeEnd to zero
+  /** 
+   * Reset timervalue to zero.
+   * Set timervalue, timeStart and timeEnd to zero
   */
  void reset();

-  /** Return elapsed time in seconds.
-   *  Return the value of timervalue
+  /** 
+   * Return elapsed time in seconds.
+   * Return the value of timervalue
   */
  double gettime();

-  /** Print timer.
-   *  Print the elapsed time of the timer
+  /** 
+   * Print timer.
+   * Print the elapsed time of the timer
   */
  void print();