snapshot of svn

2016-10-10 14:49:32 +02:00
commit 4fa529aaea
122 changed files with 23153 additions and 0 deletions
--- a/auto-tuning/CMakeLists.txt
+++ b/auto-tuning/CMakeLists.txt
@ -0,0 +1,19 @@
+INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
+LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
+
+#chi square kernel tests
+ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
+TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
+
+ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
+TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
+
+IF (USE_UQTK)
+  ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
+  TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
+ENDIF (USE_UQTK)
+#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
+
+#test to verify search functions
+ADD_EXECUTABLE(testSearch testSearch.cpp)
+TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
--- a/auto-tuning/testChiSquareRT.cpp
+++ b/auto-tuning/testChiSquareRT.cpp
@ -0,0 +1,385 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+#define N0 0.25
+#define TAU 2.197019
+#define BKG 1.0
+
+#define ALPHA 1.0
+#define BETA 1.0
+
+using namespace std;
+
+void randData(double *data, int N, int scale = 1) {
+  for (int i = 0; i < N; i++)
+    data[i] = ((double)rand() / RAND_MAX ) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is alwats time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double t, double lamda) {
+  return exp( -lamda*t );
+}
+
+double ge(double t, double lamda, double beta) {
+  return exp( -pow(lamda*t, beta) );
+}
+
+double sg(double t, double sigma) {
+  return exp( -0.5 * pow(sigma*t, 2) );
+}
+
+double stg(double t, double sigma) {
+  double sigmatsq = pow(sigma*t,2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+double sekt(double t, double lambda) {
+  double lambdat = lambda*t;
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+double lgkt(double t, double lambda, double sigma) {
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+double skt(double t, double sigma, double beta) {
+  if (beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+double spg(double t, double lambda, double gamma, double q) {
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q;
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+double rahf(double t, double nu, double lambda) {
+  double nut  = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+double tf(double t, double phi, double nu) {
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD * phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double b(double t, double phi, double nu) {
+  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi;
+
+  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+double snkzf(double t, double Delta0, double Rb) {
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double t, double phi, double nu, double Delta0, double Rb) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+double dnkzf(double t, double Delta0, double Rb, double nuc) {
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+
+double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, 
+		double timeStart, double timeStep, bool mlh = false) 
+{
+
+  double result = 0.0;
+  for (int i = 0; i < Ndata; i++) {
+    
+    double t = timeStart + i*timeStep;
+    double d = data[i];
+    double e = data[i];
+
+    double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
+    double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG;
+
+    if (mlh) {
+      if ((d > 1.0e-9) && (fabs(theo) > 1.0e-9))
+	result += 2.0 * ((theo - d) + d * log(d / theo));
+      else
+	result += 2.0 * (theo - d);
+    } else {
+      if (e != 0.0)
+	result += ( (theo - d) * (theo - d) ) / (e * e);
+      else
+	result += theo * theo;
+    }
+  }
+
+  return result;
+}
+
+double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, 
+		    double timeStart, double timeStep, bool mlh = false) 
+{
+
+  double result = 0.0;
+  for (int i = 0; i < Ndata; i++) {
+    
+    double t = timeStart + i*timeStep;
+    double d = data[i];
+    double e = data[i];
+
+    double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
+    double ab = ALPHA * BETA;
+    
+
+    double theo = ((ab+1.0)*theoVal - (ALPHA-1.0))/((ALPHA+1.0) - (ab-1.0)*theoVal);
+
+    if (mlh) {
+      result += 0.0; //log max likelihood not defined here
+    } else {
+      if (e != 0.0)
+	result += ( (theo - d) * (theo - d) ) / (e  * e);
+      else
+	result += theo * theo;
+    }
+  }
+
+  return result;
+}
+
+int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) {
+
+  int ierr;
+
+  /* 
+   * Histogram size used in tests. If autotune run kernes with sizes from 1e5 to 1e6.
+   * If autotune is off just run the test once (used for debuging to test the kernel) 
+   */ 
+  int Nstart = 1e5;
+  int Nstep = 1e5;
+  int Nend = (autotune) ? 1e6 : 1e5; 
+
+  //parameter, function and map sizes used in tests
+  int Npar = 66;
+  int Nfnc = 2;
+  int Nmap = 5;
+
+  //print test info
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Max log likelihood: " << std::boolalpha << mlh << endl;
+  cout << "Asymetry fit: " << std::boolalpha << asym << endl;
+
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name);
+  dksbase.setDevice(device_name);
+  ierr = dksbase.initDevice();
+  if (ierr != DKS_SUCCESS) {
+    std::cout << "Device not supported!" << std::endl;
+    return DKS_ERROR;
+  }
+
+  //get the list of different devices
+  std::vector<int> devices;
+  dksbase.getDeviceList(devices);
+  std::cout << "Unique devices: " << devices.size() << std::endl;
+
+  //create the function string to use in test
+  string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])";
+  int map[5] = {0, 0, 1, 2, 1};
+
+  //runt tests from 100k to 1mil data points
+  for (unsigned int device = 0; device < devices.size(); device++) {
+    for (int Ndata = Nstart; Ndata <= Nend; Ndata += Nstep) {
+
+      dksbase.setDefaultDevice(device);
+
+      std::cout << "Ndata: " << Ndata << std::endl;
+
+      //init the chi square calculations
+      dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap);
+
+      //create random arrays for data, parameter and function storage
+      double *data = new double[Ndata];  
+      double *par = new double[Npar];
+      double *fnc = new double[Nfnc];
+  
+      randData(data, Ndata);
+      randData(par, Npar);
+      randData(fnc, Nfnc, 100);
+
+      //allocate memory on device
+      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+
+      //write data, params, functions and maps to the device
+      dksbase.writeData<double>(data_ptr, data, Ndata);
+      dksbase.writeParams(par, Npar);
+      dksbase.writeFunctions(fnc, Nfnc);
+      dksbase.writeMaps(map, Nmap);
+    
+      //set musrfit constants
+      dksbase.callSetConsts(N0, TAU, BKG);
+      dksbase.callSetConsts(ALPHA, BETA);
+
+      //compile the program created with the function string
+      dksbase.callCompileProgram(sFnc, mlh);
+
+      //set autotuning on/off
+      if (autotune)
+	dksbase.setAutoTuningOn();
+      
+      //tmp values to store results and tmp values for time steps and start time
+      double result_gpu = 0.0;
+      double result_cpu = 0.0;
+      double dt = 1e-12;
+      double ts = 1e-7;
+
+      //execute kernel on the GPU and execute the same function on the cpu
+      if (!asym) {
+	dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, Npar, Nfnc, 
+				    Nmap, ts, dt, result_gpu);
+	result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);   
+      } else {
+	dksbase.callLaunchChiSquare(2, data_ptr, data_ptr, Ndata, Npar, Nfnc, 
+				    Nmap, ts, dt, result_gpu);
+	result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);
+      }
+
+      //check the results 
+      cout << "DKS: " << result_gpu << endl;
+      cout << "CPU: " << result_cpu << endl;
+
+      //free CPU and GPU memory
+      dksbase.freeMemory<double>(data_ptr, Ndata);
+      dksbase.freeChiSquare();
+  
+      delete[] data;
+      delete[] par;
+      delete[] fnc;
+      cout << "------------------------------------------------------------" << endl;
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+int main(int argc, char* argv[]) {
+
+  bool asym = false;
+  bool mlh = false;
+  bool autotune = false;
+
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; ++i) {
+    
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    }
+    
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    }
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-mic");
+    }
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+
+    if (argv[i] == string("-mlh"))
+      mlh = true;
+
+    if (argv[i] == string("-asym"))
+      asym = true;
+
+    if (argv[i] == string("-autotune"))
+      autotune = true;
+
+  }
+
+  int numPlatforms = 2;
+  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
+  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
+
+  for (int i = 0; i < numPlatforms; i++) {
+    runTest(api[i], device[i], autotune, mlh, asym);
+  }
+
+  return 0;
+}
--- a/auto-tuning/testChiSquareRTRandom.cpp
+++ b/auto-tuning/testChiSquareRTRandom.cpp
@ -0,0 +1,450 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+//#define N0 0.25
+#define N0 1e-10
+#define TAU 2.197019
+#define BKG 0.05
+
+using namespace std;
+
+typedef std::function<double()> doubleF;
+
+void randData(double *data, int N, int scale = 1) {
+  for (int i = 0; i < N; i++)
+    data[i] = ((double)rand() / RAND_MAX ) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is alwats time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double *t, double *lamda) {
+  return exp( -*lamda**t );
+}
+
+double ge(double *t, double *lamda, double *beta) {
+  return exp( -pow( (*lamda)*(*t), *beta) );
+}
+
+double sg(double *t, double *sigma) {
+  return exp( -0.5 * pow((*sigma)*(*t), 2) );
+}
+
+double stg(double *t, double *sigma) {
+  double sigmatsq = pow((*sigma)*(*t),2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+double sekt(double *t, double *lambda) {
+  double lambdat = *lambda*(*t);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+double lgkt(double *t, double *lambda, double *sigma) {
+  double lambdat = *lambda*(*t);
+  double sigmatsq = pow(*sigma*(*t), 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+double skt(double *t, double *sigma, double *beta) {
+  if (*beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(*sigma*(*t), (*beta));
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta));
+}
+
+double spg(double *t, double *lambda, double *gamma, double *q) {
+  double lam2 = (*lambda)*(*lambda);
+  double lamt2q = (*t)*(*t)*lam2*(*q);
+  double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma);
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+double rahf(double *t, double *nu, double *lambda) {
+  double nut  = *nu*(*t);
+  double nuth = *nu*(*t)/2.0;
+  double lamt = *lambda*(*t);
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+double tf(double *t, double *phi, double *nu) {
+  double tmp_nu = TWO_PI**nu**t;
+  double tmp_phi = DEG_TO_RAD * *phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+
+  return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
+}
+
+double b(double *t, double *phi, double *nu) {
+  return j0(TWO_PI**nu**t + DEG_TO_RAD**phi);
+}
+
+double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  double wt = TWO_PI * *nu * *t;
+  double ph = DEG_TO_RAD * *phi;
+
+  return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
+}
+
+double ab(double *t, double *sigma, double *gamma) {
+  double gt = *gamma**t;
+
+  return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+double snkzf(double *t, double *Delta0, double *Rb) {
+  double D0t2 = pow(*Delta0**t, 2.0);
+  double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+  double D0t2 = pow(*Delta0**t, 2.0);
+  double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa);
+}
+
+double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph);
+}
+
+double evalf(std::vector< std::pair<int, doubleF> > func) {
+
+  double result = 0.0;
+  for (auto f : func) {
+    switch (f.first) {
+    case 0: result += f.second(); break;
+    case 1: result -= f.second(); break;
+    default: result += f.second(); break;
+    }
+  }
+
+  return result;
+}
+
+double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
+
+  double result = 0.0;
+  double ts = *t;
+
+  for (int i = 0; i < ndata; i++) {
+
+    *t = ts + i*dt;
+    double d = data[i];
+    double e = data[i];
+
+    double vf = evalf(func);
+    double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
+    
+    if (e != 0.0)
+      result += ( (theo - d) * (theo - d) ) / (e*e);
+    else
+      result += theo * theo;
+
+  }
+  return result;
+}
+
+//create a random length from 50 - 1000 array and fill with random values from 0 to 1
+void randomParams(double *p, int np) {
+  for (int i = 0; i < np; i++)
+    p[i] = (double)rand() / RAND_MAX;
+}
+
+//create map array of random size and fill with indexes from 0 to max, max < size of param array
+void randomMaps(int *m, int nm, int max) {
+  for (int i = 0; i < nm; i++)
+    m[i] = rand() % max;
+}
+
+int generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc, 
+			   double *t, double *p, int *m, int np, int nm) 
+{
+
+  //nf defines the number of functions to generate (from 1 to 25)
+  int nf = rand() % 25 + 1;
+
+  for (int n = 0; n < nf; n++) {
+    std::string sf = "";
+    doubleF f;
+
+    int r = rand() % 18; //choose random function to use
+
+    int id1 = rand() % nm;
+    int id2 = rand() % nm;
+    int id3 = rand() % nm;
+    int id4 = rand() % nm;
+    int id5 = rand() % nm;
+
+    std::string p1 = "p[m[" + to_string(id1) + "]])";
+    std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
+    std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]])";
+    std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
+    std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
+
+    //get a random index from maps and use it to get the parameter value, bind function and parameter
+    //values to f, and create string for gpu in sfunc
+    switch (r) {
+    case 0:
+      f = std::bind(se, t, &p[m[id1]]);
+      sf = "se(t," + p1;
+      break;
+    case 1:
+      f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ge(t," + p2;
+      break;
+    case 2:
+      f = std::bind(sg, t, &p[m[id1]]);
+      sf = "sg(t, " + p1;
+      break;
+    case 3:
+      f = std::bind(stg, t, &p[m[id1]]);
+      sf = "stg(t, " + p1;
+      break;
+    case 4:
+      f = std::bind(sekt, t, &p[m[id1]]);
+      sf = "sekt(t, " + p1;
+      break;
+    case 5:
+      f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "lgkt(t, " + p2;
+      break;
+    case 6:
+      f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "skt(t, " + p2;
+      break;
+    case 7:
+      f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "spg(t, " + p3;
+      break;
+    case 8:
+      f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "rahf(t, " + p2;
+      break;
+    case 9:
+      f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "tf(t, " + p2;
+      break;
+    case 10:
+      f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ifld(t, " + p5;
+      break;
+    case 11:
+      f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
+      sf = "b(t, " + p2;
+      break;
+    case 12: 
+      f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ib(t, " + p5;
+      break;
+    case 13:
+      f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ab(t, " + p2;
+      break;
+    case 14:
+      f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "snkzf(t, " + p2;
+      break;
+    case 15:
+      f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
+      sf = "snktf(t, " + p4;
+      break;
+    case 16: 
+      f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "dnkzf(t, " + p3;
+      break;
+    case 17: 
+      f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "dnktf(t, " + p5;
+      break;
+    }
+    
+
+    int sign = rand() % 2;
+    if (n == 0) sign = 0;
+    func.push_back( std::make_pair(sign, f) );
+    if (n == 0)
+      sfunc = sf;
+    else {
+      switch(sign) {
+      case 0: sfunc += " + " + sf; break;
+      case 1: sfunc += " - " + sf; break;
+      default: sfunc += " + " + sf; break;
+      }
+
+    }
+  }
+
+  return nf;
+}
+
+int main(int argc, char *argv[]) {
+
+
+  srand(time(NULL));
+
+  int ierr;
+  int Ndata = 1e6;
+
+  bool autotune = false;
+
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; ++i) {
+
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    }
+    
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    }
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-mic");
+    }
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+
+    if (argv[i] == string("-autotune")) {
+      autotune = true;
+    }
+
+  }
+
+  //create a random number of parameters
+  int np = ( rand() % (1000 - 50) ) + 50;
+  int nm = ( rand() % (50 - 5) ) + 5;
+  int nf = ( rand() % (50 - 5) ) + 5;
+
+  int *m = new int[nm];
+  double *p = new double[np];
+  double *f = new double[nf];
+
+  randomParams(p, np);
+  randomMaps(m, nm, np);
+  randomParams(f, nf);
+
+  double dt = 1e-10;
+  double t = 1e-10;
+  std::vector< std::pair<int, doubleF> > func;
+  std::string sfunc;
+  int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm);
+
+  //create DKS base object, set and init device / framework
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name);
+  dksbase.setDevice(device_name);
+
+  dksbase.initDevice();
+  dksbase.initChiSquare(Ndata, np, nf, nm);
+
+  dksbase.writeParams(p, np);
+  dksbase.writeFunctions(f, nf);
+  dksbase.writeMaps(m, nm);
+
+  dksbase.callSetConsts(N0, TAU, BKG);
+
+  dksbase.callCompileProgram(sfunc);
+
+  if (autotune) 
+    dksbase.setAutoTuningOn();
+
+  int oper = 0;
+  dksbase.getOperations(oper);
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of params: " << np << endl;
+  cout << "Number of maps: " << nm << endl;
+  cout << "Number of predefined functions: " << nfunc << endl;
+  cout << "Number of ptx instructions: " << oper << endl;
+  cout << "------------------------------------------------------------" << endl;
+  cout << sfunc << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //allocate memory on host and device device
+  double *data = new double[Ndata];
+  randomParams(data, Ndata);
+  void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+  dksbase.writeData<double>(data_ptr, data, Ndata);
+
+  for (int N = 1e5; N < Ndata + 1; N += 1e5) {
+    double result_dks, result_cpu;
+
+    t = 1e-10;
+
+    dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks);
+    result_cpu = cpuChiSq(data, func, N, &t, dt);
+
+    cout << "Npart: " << N << endl;
+    cout << "DKS: " << result_dks << endl;
+    cout << "CPU: " << result_cpu << endl;
+
+  }
+
+  dksbase.freeMemory<double>(data_ptr, Ndata);
+  dksbase.freeChiSquare();
+  delete[] data;
+  delete[] p;
+  delete[] f;
+  delete[] m;
+
+  return 0;
+}
--- a/auto-tuning/testChiSquareRTUQTK.cpp
+++ b/auto-tuning/testChiSquareRTUQTK.cpp
@ -0,0 +1,618 @@
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <fstream>
+
+#include <cstdio>
+#include <stddef.h>
+#include <fstream>
+#include <math.h>
+#include <time.h>
+#include <getopt.h>
+#include <unistd.h>
+
+#include "DKSBaseMuSR.h"
+#include "Utility/DKSTimer.h"
+
+#include "Array1D.h"
+#include "Array2D.h"
+#include "Array3D.h"
+#include "error_handlers.h"
+#include "PCSet.h"
+#include "fast_laplace.h"
+#include "uqtktools.h"
+#include "lreg.h"
+
+#define PI 3.14159265358979323846
+#define TWO_PI 6.283185307179586231996
+#define DEG_TO_RAD 1.7453292519943295474371681e-2
+
+//#define N0 0.25
+#define N0 1e-10
+#define TAU 2.197019
+#define BKG 0.05
+
+using namespace std;
+
+typedef std::function<double()> doubleF;
+
+void randData(double *data, int N, int scale = 1) {
+  for (int i = 0; i < N; i++)
+    data[i] = ((double)rand() / RAND_MAX ) * scale;
+}
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is alwats time - t, rest of the parameters depend
+ * on the function.
+ */
+double se(double *t, double *lamda) {
+  return exp( -*lamda**t );
+}
+//math func + math oper + memory loads
+//1 + 1 + 2
+
+
+double ge(double *t, double *lamda, double *beta) {
+  return exp( -pow( (*lamda)*(*t), *beta) );
+}
+//2 + 1 + 3
+
+double sg(double *t, double *sigma) {
+  return exp( -0.5 * pow((*sigma)*(*t), 2) );
+}
+//2 + 2 + 2
+
+double stg(double *t, double *sigma) {
+  double sigmatsq = pow((*sigma)*(*t),2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+double sekt(double *t, double *lambda) {
+  double lambdat = *lambda*(*t);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+double lgkt(double *t, double *lambda, double *sigma) {
+  double lambdat = *lambda*(*t);
+  double sigmatsq = pow(*sigma*(*t), 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+double skt(double *t, double *sigma, double *beta) {
+  if (*beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(*sigma*(*t), (*beta));
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta));
+}
+
+double spg(double *t, double *lambda, double *gamma, double *q) {
+  double lam2 = (*lambda)*(*lambda);
+  double lamt2q = (*t)*(*t)*lam2*(*q);
+  double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma);
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+double rahf(double *t, double *nu, double *lambda) {
+  double nut  = *nu*(*t);
+  double nuth = *nu*(*t)/2.0;
+  double lamt = *lambda*(*t);
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+double tf(double *t, double *phi, double *nu) {
+  double tmp_nu = TWO_PI**nu**t;
+  double tmp_phi = DEG_TO_RAD * *phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+
+  return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
+}
+
+double b(double *t, double *phi, double *nu) {
+  return j0(TWO_PI**nu**t + DEG_TO_RAD**phi);
+}
+
+double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
+  double wt = TWO_PI * *nu * *t;
+  double ph = DEG_TO_RAD * *phi;
+
+  return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
+}
+
+double ab(double *t, double *sigma, double *gamma) {
+  double gt = *gamma**t;
+
+  return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+double snkzf(double *t, double *Delta0, double *Rb) {
+  double D0t2 = pow(*Delta0**t, 2.0);
+  double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+  double D0t2 = pow(*Delta0**t, 2.0);
+  double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa);
+}
+
+double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
+  double wt = TWO_PI**nu**t;
+  double ph = DEG_TO_RAD**phi;
+  double nuct = *nuc**t;
+  double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph);
+}
+
+double evalf(std::vector< std::pair<int, doubleF> > func) {
+
+  double result = 0.0;
+  for (auto f : func) {
+    switch (f.first) {
+    case 0: result += f.second(); break;
+    case 1: result -= f.second(); break;
+    default: result += f.second(); break;
+    }
+  }
+
+  return result;
+}
+
+double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
+
+  double result = 0.0;
+  double ts = *t;
+
+  for (int i = 0; i < ndata; i++) {
+
+    *t = ts + i*dt;
+    double d = data[i];
+    double e = data[i];
+
+    double vf = evalf(func);
+    double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
+    
+    if (e != 0.0)
+      result += ( (theo - d) * (theo - d) ) / (e * e);
+    else
+      result += theo * theo;
+
+  }
+  return result;
+}
+
+//create a random length from 50 - 1000 array and fill with random values from 0 to 1
+void randomParams(double *p, int np) {
+  for (int i = 0; i < np; i++)
+    p[i] = (double)rand() / RAND_MAX;
+}
+
+//create map array of random size and fill with indexes from 0 to max, max < size of param array
+void randomMaps(int *m, int nm, int max) {
+  for (int i = 0; i < nm; i++)
+    m[i] = rand() % max;
+}
+
+void generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc, 
+			    double *t, double *p, int *m, int np, int nm, int nfunc) 
+{
+
+  for (int n = 0; n < nfunc; n++) {
+    std::string sf = "";
+    doubleF f;
+
+    int r = rand() % 18; //randomly choose one of the predefined functions to use
+
+    int id1 = rand() % nm; //randomly select parameters to use in the function
+    int id2 = rand() % nm;
+    int id3 = rand() % nm;
+    int id4 = rand() % nm;
+    int id5 = rand() % nm;
+
+    std::string p1 = "p[m[" + to_string(id1) + "]])";
+    std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
+    std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]])";
+    std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
+    std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + 
+      to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
+
+    //get a random index from maps and use it to get the parameter value, bind function and parameter
+    //values to f, and create string for gpu in sfunc
+    switch (r) {
+    case 0:
+      f = std::bind(se, t, &p[m[id1]]);
+      sf = "se(t," + p1;
+      break;
+    case 1:
+      f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ge(t," + p2;
+      break;
+    case 2:
+      f = std::bind(sg, t, &p[m[id1]]);
+      sf = "sg(t, " + p1;
+      break;
+    case 3:
+      f = std::bind(stg, t, &p[m[id1]]);
+      sf = "stg(t, " + p1;
+      break;
+    case 4:
+      f = std::bind(sekt, t, &p[m[id1]]);
+      sf = "sekt(t, " + p1;
+      break;
+    case 5:
+      f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "lgkt(t, " + p2;
+      break;
+    case 6:
+      f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
+      sf = "skt(t, " + p2;
+      break;
+    case 7:
+      f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "spg(t, " + p3;
+      break;
+    case 8:
+      f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "rahf(t, " + p2;
+      break;
+    case 9:
+      f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "tf(t, " + p2;
+      break;
+    case 10:
+      f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ifld(t, " + p5;
+      break;
+    case 11:
+      f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
+      sf = "b(t, " + p2;
+      break;
+    case 12: 
+      f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "ib(t, " + p5;
+      break;
+    case 13:
+      f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
+      sf = "ab(t, " + p2;
+      break;
+    case 14:
+      f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
+      sf = "snkzf(t, " + p2;
+      break;
+    case 15:
+      f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
+      sf = "snktf(t, " + p4;
+      break;
+    case 16: 
+      f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
+      sf = "dnkzf(t, " + p3;
+      break;
+    case 17: 
+      f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
+      sf = "dnktf(t, " + p5;
+      break;
+    }
+    
+
+    int sign = rand() % 2;
+    if (n == 0) sign = 0;
+    func.push_back( std::make_pair(sign, f) );
+    if (n == 0)
+      sfunc = sf;
+    else {
+      switch(sign) {
+      case 0: sfunc += " + " + sf; break;
+      case 1: sfunc += " - " + sf; break;
+      default: sfunc += " + " + sf; break;
+      }
+
+    }
+  }
+}
+
+int main(int argc, char *argv[]) {
+
+
+  srand(time(NULL));
+
+  bool autotune = false;
+  bool eval = false;
+  bool test = false;
+
+  char *api_name = new char[10];
+  char *device_name = new char[10];
+
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  int nord = 15; //the order of the initial, overcomplete basis
+  int loop = 100;
+
+  for (int i = 1; i < argc; ++i) {
+
+    if (argv[i] == string("-cuda")) {
+      strcpy(api_name, "Cuda");
+      strcpy(device_name, "-gpu");
+    }
+    
+    if (argv[i] == string("-opencl")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-gpu");
+    }
+
+    if (argv[i] == string("-mic")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-mic");
+    }
+
+    if (argv[i] == string("-cpu")) {
+      strcpy(api_name, "OpenCL");
+      strcpy(device_name, "-cpu");
+    }
+
+    if (argv[i] == string("-autotune")) {
+      autotune = true;
+    }
+
+    if (argv[i] == string("-eval"))
+      eval = true;
+    
+    if (argv[i] == string("-test"))
+      test = true;
+
+    if (argv[i] == string("-nord"))
+      nord = atoi(argv[i+1]);
+
+    if (argv[i] == string("-loop"))
+      loop = atoi(argv[i+1]);
+
+  }
+
+  //init dks and set chi^2 constants
+  DKSBaseMuSR dksbase;
+  dksbase.setAPI(api_name);
+  dksbase.setDevice(device_name);
+  dksbase.initDevice();
+
+  if (autotune)
+    dksbase.setAutoTuningOn();
+
+  int nydim = 2; //the dimensionality of input
+  int nxdim = 5;
+  //UQTk arrays
+  Array2D<double> xdata(loop, nxdim, 0.0);
+  Array2D<double> ydata(loop, nydim, 0.0);
+  
+  Array2D<double> xdata_pce(loop, nxdim, 0.0);
+  Array2D<double> ydata_pce(loop, nydim, 0.0);
+
+  int size = 10000;
+  Array2D<double> xtmp(size, nxdim, 0.0);
+  Array2D<double> ytmp(size, nydim, 0.0);
+
+  if (eval || test) {
+    for (int l = 0; l < loop; l++) {
+
+      int ierr;
+
+      //create a random number of parameters
+      int n = rand() % 9 + 1;
+      int Ndata = n * 100000; //number of data points 100k to 1milj, with 100k incr.
+      int np = ( rand() % (1000 - 50) ) + 50; //from 50 to 1000 for different shared memory needs
+      int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 50 of the parameters, for different memory access
+      int nf = ( rand() % (50 - 5) ) + 5; //not used in the test case, but changes the shared memory
+      int nfunc = (rand() % (10 - 1) ) + 1; //1 to 10 user defined functions
+    
+      //allocate storage for parameters, maps and functions
+      int *m = new int[nm];
+      double *p = new double[np];
+      double *f = new double[nf];
+
+      //fill with random numbers
+      randomParams(p, np);
+      randomMaps(m, nm, np);
+      randomParams(f, nf);
+
+      //create a random user function that can be passed to GPU kernel and evaluated on the host
+      double dt = 1e-10;
+      double t = 1e-10;
+      std::vector< std::pair<int, doubleF> > func;
+      std::string sfunc;
+      generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc);
+      
+      //create a data array and fill with random values
+      double *data = new double[Ndata];
+      randomParams(data, Ndata);
+
+      
+      //allocate device memory for the data and transfer to the GPU
+      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
+      dksbase.writeData<double>(data_ptr, data, Ndata);
+
+      //init chi^2 
+      dksbase.initChiSquare(Ndata, np, nf, nm);
+      dksbase.callSetConsts(N0, TAU, BKG);
+
+      //write params to the devic
+      dksbase.writeParams(p, np);
+      dksbase.writeFunctions(f, nf);
+      dksbase.writeMaps(m, nm);
+
+      //compile the kernel with the new function
+      dksbase.callCompileProgram(sfunc);
+    
+      //run the kernel on the GPU and evaluate the function on the host
+      double result_dks, result_cpu, tmp_result;
+      
+      ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, 
+					 t, dt, result_dks);
+      
+      if (ierr == DKS_SUCCESS) {
+	result_cpu = cpuChiSq(data, func, Ndata, &t, dt);
+
+	std::vector<int> config;
+	dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, 
+					t, dt, tmp_result, config);
+      
+	cout << "DKS: " << result_dks << endl;
+	cout << "CPU: " << result_cpu << endl;
+	cout << "Launch parameters: " << config[0] << ", " << config[1] << endl;
+	cout << sfunc << endl;
+	cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
+
+	xdata(l,0) = np;
+	xdata(l,1) = nm;
+	xdata(l,2) = nf;
+	xdata(l,3) = nfunc;
+	xdata(l,4) = Ndata;
+	
+	ydata(l,0) = config[0];
+	ydata(l,1) = config[1];
+
+	std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl;
+      } else {
+	cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
+	cout << sfunc << endl;
+      }
+      
+      
+      //free temporary resources
+      delete[] m;
+      delete[] p;
+      delete[] f;
+      delete[] data;
+      dksbase.freeChiSquare();
+      dksbase.freeMemory<double>(data_ptr, Ndata);
+    }
+  } else {
+    //read_datafileVS(xdata, "xdata.dat");
+    //read_datafileVS(ydata, "ydata.dat");
+    xtmp.SetValue(0.0);
+    ytmp.SetValue(0.0);
+    read_datafileVS(xtmp, "xdata_pce.dat");
+    read_datafileVS(ytmp, "ydata_pce.dat");
+    for (int i = 0; i < loop; i++) {
+      for (int j = 0; j < nxdim; j++)
+	xdata(i,j) = xtmp(i,j);
+      for (int j = 0; j < nydim; j++)
+	ydata(i,j) = ytmp(i,j);
+    }
+  }
+  
+  
+  if (eval) {
+    for (int i = 0; i < nxdim; i++) {
+      for (int j = 0; j < loop; j++) {
+	xdata_pce(j,i) = xdata(j,i);
+	ydata_pce(j,i) = ydata(j,i);
+      }
+    }
+
+    for (int i = 0; i < nydim; i++) {
+      for (int j = 0; j < loop; j++) {
+	xdata_pce(j,i) = xdata(j,i);
+	ydata_pce(j,i) = ydata(j,i);
+      }
+    }
+  } else {
+    //read_datafileVS(xdata_pce, "xdata_pce.dat");
+    //read_datafileVS(ydata_pce, "ydata_pce.dat");
+    xtmp.SetValue(0.0);
+    ytmp.SetValue(0.0);
+    read_datafileVS(xtmp, "xdata_pce.dat");
+    read_datafileVS(ytmp, "ydata_pce.dat");
+    for (int i = 0; i < loop; i++) {
+      for (int j = 0; j < nxdim; j++)
+	xdata_pce(i,j) = xtmp(i,j);
+      for (int j = 0; j < nydim; j++)
+	ydata_pce(i,j) = ytmp(i,j);
+    }
+    std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl;
+  }
+
+  //default input settings
+  string which_chaos="LU"; //PC type
+  string msc="m";
+  
+  Lreg* reg;
+  reg = new PCreg(which_chaos,nord,nxdim);
+  int nbas = reg->GetNbas();  
+
+  Array2D<double> ypc_data(xdata.XSize(), nydim, 0.0);
+  for (int i = 0; i < nydim; i++) {
+    
+    std::cout << "start dim " << i+1 << std::endl;
+
+    Array1D<double> ydata_1d(xdata_pce.XSize(), 0.0);
+    for (unsigned int j = 0; j < xdata_pce.XSize(); j++)
+      ydata_1d(j) = ydata_pce(j,i);
+
+    std::cout << "setup data" << std::endl;
+    reg->SetupData(xdata_pce,ydata_1d);
+    
+    std::cout << "Comput best lambda" << std::endl;
+    double lambda=reg->LSQ_computeBestLambda();
+    Array1D<double> lam(nbas,lambda);
+
+
+    reg->SetWeights(lam);
+
+    std::cout << "LSQ build regr" << std::endl;
+
+    reg->LSQ_BuildRegr();
+    std::cout << std::endl << "Lambda : " << lambda << std::endl;
+
+    Array1D<double> ypc;
+    Array1D<double> ycheck;
+    Array2D<double> ycheck_cov;
+
+    reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov);
+    std::cout << std::endl << "Eval" << std::endl;
+    
+    for (unsigned int j = 0; j < xdata.XSize(); j++)
+      ypc_data(j,i) = ypc(j);
+
+  }
+
+  if (eval) {
+    write_datafile(xdata_pce, "xdata_pce.dat");
+    write_datafile(ydata_pce, "ydata_pce.dat");
+  }
+
+  write_datafile(xdata, "xdata.dat");
+  write_datafile(ydata, "ydata.dat");
+  write_datafile(ypc_data, "ypc_data.dat");
+
+  return 0;
+}
--- a/auto-tuning/testSearch.cpp
+++ b/auto-tuning/testSearch.cpp
@ -0,0 +1,22 @@
+#include <iostream>
+
+#include "DKSBaseMuSR.h"
+
+/** No accelerator device is used, this test is used to confirm, that search functions
+ * used for auto-tuning work properly
+ */
+
+int main() {
+
+  DKSBaseMuSR base;
+
+  std::cout << "Start test" << std::endl;
+
+  base.testAutoTuning();
+
+  std::cout << "Test finished" << std::endl;
+    
+  
+
+  return 0;
+}