#include #include #include #include #include #include "DKSBaseMuSR.h" #include "Utility/DKSTimer.h" void initData(double *data, int N, bool ones = false) { for (int i = 0; i < N; i++) { if (ones) data[i] = 1.0; else data[i] = (double)rand() / RAND_MAX; } } template void printData(T *data, int N) { for (int i = 0; i < N; i++) std::cout << data[i] << "\t"; std::cout << std::endl; } const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])"; //std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])"; //const std::string funct = "p[m[0]] * se(t, p[m[1]])"; //const std::string funct = "p[m[1]] + p[m[0]]"; double fTheory(double time, double *par, double *func, int *map) { return cos(time*par[0]) - exp(-time*par[map[0]]); } double testFunctionSerial(double *data, double *par, double *func, int *map, double N0, double tau, double bkg, double timeStep, int startTimeBin, int endTimeBin) { double time, diff, theo; double chisq = 0; for (int i = startTimeBin; i < endTimeBin; ++i) { time = i * timeStep; theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; diff = data[i] - theo; chisq += diff * diff / data[i]; } return chisq; } double testFunctionParallel(double *data, double *par, double *func, int *map, double N0, double tau, double bkg, double timeStep, int startTimeBin, int endTimeBin) { int i, chunk; double time, diff, theo; double chisq = 0; chunk = (endTimeBin - startTimeBin) / omp_get_num_procs(); if (chunk < 10) chunk = 10; #pragma omp parallel for default(shared) private (i,time,diff) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq) for (i = startTimeBin; i < endTimeBin; ++i) { time = i * timeStep; theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; diff = data[i] - theo; chisq += diff * diff / data[i]; } return chisq; } int main(int argc, char *argv[]) { int Loop = 100; //init test data on the host int Ndata = 8; if (argc > 1) Ndata = atoi(argv[1]); int api = 1; if (argc > 2) api = atoi(argv[2]); int Npar = 66; int Nfunc = 1; int Nmap = 4; double *data = new double[Ndata]; double *par = new double[Npar]; double *func = new double[Nfunc]; int *map = new int[Nmap]; initData(data, Ndata); initData(par, Npar); initData(func, Nfunc); map[0] = 1; map[1] = 2; map[2] = 3; map[3] = 4; //create timers DKSTimer serialTimer; DKSTimer cudaTimer; DKSTimer ompTimer; DKSTimer gpuOverhead; serialTimer.init("Serial timer"); cudaTimer.init("Cuda timer"); ompTimer.init("OpenMP timer"); gpuOverhead.init("Overhead for gpu"); //serial version double resultSerial; serialTimer.start(); for (int i = 0; i < Loop; i++) resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); serialTimer.stop(); //openmp version double resultOMP = 0.0; ompTimer.start(); //for (int i = 0; i < Loop; i++) // resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); ompTimer.stop(); //create and init dkabase gpuOverhead.start(); DKSBaseMuSR dksbase; if (api == 1) dksbase.setAPI("Cuda"); else dksbase.setAPI("OpenCL"); dksbase.setDevice("-gpu"); dksbase.initDevice(); dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap); //allocate memory on the device int ierr; void *data_ptr; data_ptr = dksbase.allocateMemory(Ndata, ierr); dksbase.writeData(data_ptr, data, Ndata); dksbase.writeFunctions(func, Nfunc); dksbase.writeMaps(map, Nmap); dksbase.callCompileProgram(funct); gpuOverhead.stop(); double resultCuda; cudaTimer.start(); for (int i = 0; i < Loop; i++) { dksbase.writeParams(par, Npar); int ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap, 0.0, 0.1, 0, resultCuda); if (ierr != 0) exit (EXIT_FAILURE); } cudaTimer.stop(); std::cout << std::endl; std::cout << "=======================Results=======================" << std::endl; std::cout << "Result serial = " << resultSerial << std::endl; std::cout << "Result prallel = " << resultOMP << std::endl; std::cout << "Result cuda = " << resultCuda << std::endl; std::cout << std::endl; std::cout << "=======================Timings=======================" << std::endl; serialTimer.print(); ompTimer.print(); cudaTimer.print(); gpuOverhead.print(); std::cout << std::endl; dksbase.freeMemory(data_ptr, Ndata); return 0; }