changed version to 1.1.4

adapted for CUDA 11
added the two new muSR functions ifgk and ifll (CUDA/OpenCL).
2020-06-09 13:04:48 +02:00 · 2020-06-09 12:55:55 +02:00 · 2019-01-22 14:10:02 +01:00 · 2018-12-11 11:35:32 +01:00
6 changed files with 158 additions and 29 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
 SET (DKS_VERSION_MINOR 1)
-SET (DKS_VERSION_PATCH 2)
+SET (DKS_VERSION_PATCH 4)
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
--- a/ReadMe.first
+++ b/ReadMe.first
@ -1,7 +1,7 @@
 ##################################################################
 #
 # Name:		Dynamic Kernel Scheduler
-# Version:	1.0
+# Version:	1.1
 # Author: 	Uldis Locans
 # Contacts:	locans.uldis@psi.ch
 #
--- a/src/CUDA/CudaChiSquareRuntime.cu
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@ -86,15 +86,19 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {

  //create program
  nvrtcProgram prog;
-  //std::cout << cudaProg.c_str() << std::endl;
-  nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+//  std::cout << cudaProg.c_str() << std::endl;
+  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+  if (createResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Program creation failed!");
+    return DKS_ERROR;
+  }

  //compile program
-  const char *opts[] = {"-fmad=false", ""};
-  int numopts = 1;
+  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+  int numopts = 2;
  if (mlh) {
-    opts[1] = "-DMLH";
-    numopts = 2;
+    opts[2] = "-DMLH";
+    numopts = 3;
  }

  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
@ -118,7 +122,11 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  if (ptx_m != NULL)
    delete[] ptx_m;
  size_t ptxSize; 
-  nvrtcGetPTXSize(prog, &ptxSize); 
+  nvrtcResult ptxSizeResult = nvrtcGetPTXSize(prog, &ptxSize);
+  if (ptxSizeResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("PTX get size error!");
+    return DKS_ERROR;
+  }
  ptx_m = new char[ptxSize]; 
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);  
  
@ -127,10 +135,26 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
    return DKS_ERROR;
  }

+  // add some additional diagnostics
+  const int buffer_size = 8192;
+  CUjit_option options[3];
+  void* values[3];
+  char error_log[buffer_size];
+  int err;
+  options[0] = CU_JIT_ERROR_LOG_BUFFER;
+  values[0]  = (void*)error_log;
+  options[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  values[1]  = (void*)buffer_size;
+  options[2] = CU_JIT_TARGET_FROM_CUCONTEXT;
+  values[2]  = 0;
  //load module from ptx
-  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); 
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 3, options, values); 
  if (loadResult != CUDA_SUCCESS) {
-    DEBUG_MSG("Load module from ptx failed!");
+    const char *err_msg;
+    cuGetErrorString(loadResult, &err_msg);
+    std::string msg = "Load module from ptx failed! (" + std::to_string(loadResult) + ") : " + err_msg;
+    DEBUG_MSG(msg);
+    DEBUG_MSG(error_log);
    return DKS_ERROR;
  }

--- a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@ -83,6 +83,56 @@ __device__ double ifld(double t, double alpha, double phi, double nu, double lam
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+__device__ double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double rate2 = sigma*sigma*t*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
+  }
+
+  return result;
+}
+
+__device__ double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double at = a*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
+  }
+
+  return result;
+}
+
 __device__ double b(double t, double phi, double nu) {
  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -42,7 +42,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  if (!fp)
    DEBUG_MSG("Can't open kernel file" << kernel_file);

-  //get file size and allocate memory	
+  //get file size and allocate memory
  fseek(fp, 0, SEEK_END);
  fsize = ftell(fp);
  kernel_source = new char[fsize+1];
@ -52,7 +52,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  fread(kernel_source, 1, sizeof(char)*fsize, fp);
  kernel_source[fsize] = '\0';
  fclose(fp);
-  
+
  std::string kernel_string (kernel_source);
  return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;

@ -76,7 +76,6 @@ int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {

 double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

-  
  int ierr;
  //calc number of threads per workgroup and nr of work groups
  size_t work_size_sum = (size_t)blockSize_m;
@ -87,7 +86,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
    work_items = (length / work_size_sum + 1) * work_size_sum;
  int work_groups = length / work_size_sum + 1;
  */
-  
+
  size_t work_items = 80 * work_size_sum;
  int work_groups = 80;

@ -96,19 +95,19 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

  double *partial_sums = new double[work_groups];
  tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
-  
+
  //execute sum kernel
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
-  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); 
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
  
-  //read partial sums and free temp mempry
+  //read partial sums and free temp memory
  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);
-  
+
  //sumup partial sums on the host
  double result = 0;
  for (int i = 0; i < work_groups; i++)
@ -157,6 +156,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -172,20 +172,23 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
    m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(17, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    //create kernel
    ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
-
    if (ierr != DKS_SUCCESS)
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -200,9 +203,12 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
-    m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(14, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
@ -250,7 +256,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  return ierr;
 }

-int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, 
+int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
 					  int size_func, int size_map)
 {

@ -285,7 +291,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
-    
+
    initDone_m = false;
  }

@ -321,4 +327,3 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
  return ierr;

 }
-
--- a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@ -106,6 +106,56 @@ double ifld(double t, double alpha, double phi, double nu, double lambdaT, doubl
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double rate2 = sigma*sigma*t*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
+  }
+
+  return result;
+}
+
+double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double at = a*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
+  }
+
+  return result;
+}
+
 double b(double t, double phi, double nu) {
  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
Author	SHA1	Message	Date
Andreas Suter	9381b14b87	changed version to 1.1.4	2020-06-09 13:04:48 +02:00
Andreas Suter	43cb9020c4	adapted for CUDA 11	2020-06-09 12:55:55 +02:00
Andreas Suter	3d946f666b	added the two new muSR functions ifgk and ifll (CUDA/OpenCL).	2019-01-22 14:10:02 +01:00
Andreas Suter	e6021eb6e3	Set kernel argument size to a value > 0 For the case that map or fun is not used in the msr-file, the corresponding call to the setKernelArg still needs a 2nd argument > 0, otherwise macOS crashes.	2018-12-11 11:35:32 +01:00