added a couple new theory functions (dglktfzf, dglktflf, fmuf).

adopted DKS to Cuda12 and GV100. For Tesla K40c code lines are still present but commented by as35.
increased minimum required cmake version to 3.2
2025-06-10 14:20:45 +02:00 · 2023-01-30 12:38:52 +01:00 · 2023-01-30 12:27:37 +01:00 · 2022-10-31 17:49:20 +01:00 · 2022-04-08 16:53:34 +02:00 · 2020-12-28 18:17:41 +01:00
11 changed files with 236 additions and 51 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,8 +1,8 @@
 CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
-SET (DKS_VERSION_MINOR 1)
-SET (DKS_VERSION_PATCH 2)
+SET (DKS_VERSION_MINOR 2)
+SET (DKS_VERSION_PATCH 0)
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -148,7 +148,8 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

-    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+#as35: Tesla K40c    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+    SET (CUDA_NVCC_FLAGS "-arch=sm_70; -gencode=arch=compute_70,code=sm_70 -DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")

    IF (NOT STATIC_CUDA)
--- a/ReadMe.first
+++ b/ReadMe.first
@ -1,7 +1,7 @@
 ##################################################################
 #
 # Name:		Dynamic Kernel Scheduler
-# Version:	1.0
+# Version:	1.1
 # Author: 	Uldis Locans
 # Contacts:	locans.uldis@psi.ch
 #
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -1,7 +1,7 @@
 SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
 SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
-SET(${PROJECT_NAME}_LIBRARY "dks")
+SET(${PROJECT_NAME}_LIBRARY "dksshared")
 SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
 SET(DKS_CUDA_STATIC ${STATIC_CUDA})
 SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.2)

 SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 MACRO (ADD_SOURCES )
--- a/src/CUDA/CMakeListsLibcuda.txt
+++ b/src/CUDA/CMakeListsLibcuda.txt
@ -1,8 +1,9 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.2)

 FIND_PACKAGE(CUDA REQUIRED)

-SET (CUDA_NVCC_FLAGS "-arch=sm_30")
+#as35: Tesla K40c SET (CUDA_NVCC_FLAGS "-arch=sm_30")
+SET (CUDA_NVCC_FLAGS "-arch=sm_70")

 SET(LIB_TYPE STATIC)

@ -22,4 +23,4 @@ INCLUDE_DIRECTORIES (
  ${CMAKE_CURRENT_SOURCE_DIR}
 )

-CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS})
+CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS})
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -245,7 +245,7 @@ int CudaBase::cuda_setDevice(int device) {
  std::cout << "Init: " << device << "\t" << ndev << std::endl;

  if (device < ndev) {
-    std::cout << "set device to: " << ndev << std::endl;
+    std::cout << "set device to: " << device << std::endl;
    cudaSetDevice(device);
  } else {
    if (ndev > 0)
--- a/src/CUDA/CudaChiSquareRuntime.cu
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@ -86,15 +86,20 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {

  //create program
  nvrtcProgram prog;
-  //std::cout << cudaProg.c_str() << std::endl;
-  nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+//  std::cout << cudaProg.c_str() << std::endl;
+  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
+  if (createResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Program creation failed!");
+    return DKS_ERROR;
+  }

  //compile program
-  const char *opts[] = {"-fmad=false", ""};
-  int numopts = 1;
+//as35: for Tesla K40c  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+  const char *opts[] = {"-arch=compute_70", "-fmad=false", ""};
+  int numopts = 2;
  if (mlh) {
-    opts[1] = "-DMLH";
-    numopts = 2;
+    opts[2] = "-DMLH";
+    numopts = 3;
  }

  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
@ -118,7 +123,11 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  if (ptx_m != NULL)
    delete[] ptx_m;
  size_t ptxSize; 
-  nvrtcGetPTXSize(prog, &ptxSize); 
+  nvrtcResult ptxSizeResult = nvrtcGetPTXSize(prog, &ptxSize);
+  if (ptxSizeResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("PTX get size error!");
+    return DKS_ERROR;
+  }
  ptx_m = new char[ptxSize]; 
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);  
  
@ -127,10 +136,26 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
    return DKS_ERROR;
  }

+  // add some additional diagnostics
+  const int buffer_size = 8192;
+  CUjit_option options[3];
+  void* values[3];
+  char error_log[buffer_size];
+  int err;
+  options[0] = CU_JIT_ERROR_LOG_BUFFER;
+  values[0]  = (void*)error_log;
+  options[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+  values[1]  = (void*)buffer_size;
+  options[2] = CU_JIT_TARGET_FROM_CUCONTEXT;
+  values[2]  = 0;
  //load module from ptx
-  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); 
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 3, options, values); 
  if (loadResult != CUDA_SUCCESS) {
-    DEBUG_MSG("Load module from ptx failed!");
+    const char *err_msg;
+    cuGetErrorString(loadResult, &err_msg);
+    std::string msg = "Load module from ptx failed! (" + std::to_string(loadResult) + ") : " + err_msg;
+    DEBUG_MSG(msg);
+    DEBUG_MSG(error_log);
    return DKS_ERROR;
  }

--- a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@ -36,6 +36,25 @@ __device__ double sekt(double t, double lambda) {
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
 }

+__device__ double dglktfzf(double t, double sigma, double hopp) {
+  double nut = hopp*t;
+
+  return exp(-sqrt(4.0*pow(sigma/hopp,2.0)*(exp(-nut)-1.0+nut)));
+}
+
+__device__ double dglktflf(double t, double nu0, double sigma, double hopp) {
+  double w0 = TWO_PI*nu0;
+  double w0_2 = w0*w0;
+  double w0_t = w0*t;
+  double nu_2 = hopp*hopp;
+  double nu_t = hopp*t;
+  double Gamma_t = ((w0_2+nu_2)*nu_t+(w0_2-nu_2)*(1.0-exp(-nu_t)*cos(w0_t))-2.0*hopp*w0*exp(-nu_t)*sin(w0_t))/pow(w0_2+nu_2,2.0);
+  if (Gamma_t < 0.0)
+    Gamma_t = 0.0;
+
+  return exp(-sqrt(4.0*sigma*hopp*Gamma_t));
+}
+
 __device__ double lgkt(double t, double lambda, double sigma) {
  double lambdat = lambda*t;
  double sigmatsq = pow(sigma*t, 2.0);
@ -69,6 +88,12 @@ __device__ double rahf(double t, double nu, double lambda) {
  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
 }

+__device__ double ab(double t, double sigma, double gamma) {
+    double gt = gamma*t;
+
+    return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
 __device__ double tf(double t, double phi, double nu) {
  double tmp_nu = TWO_PI*nu*t;
  double tmp_phi = DEG_TO_RAD*phi;
@ -83,6 +108,56 @@ __device__ double ifld(double t, double alpha, double phi, double nu, double lam
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+__device__ double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double rate2 = sigma*sigma*t*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
+  }
+
+  return result;
+}
+
+__device__ double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double at = a*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
+  }
+
+  return result;
+}
+
 __device__ double b(double t, double phi, double nu) {
  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
@ -94,12 +169,6 @@ __device__ double ib(double t, double alpha, double phi, double nu, double lambd
  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

-__device__ double ab(double t, double sigma, double gamma) {
-  double gt = gamma*t;
-
-  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
-}
-
 __device__ double snkzf(double t, double Delta0, double Rb) {
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
@ -134,6 +203,13 @@ __device__ double dnktf(double t, double phi, double nu, double Delta0, double R
  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
 }

+__device__ double fmuf(double t, double wd) {
+  double sqrt3 = sqrt(3.0);
+  double wd_t = wd*t;
+
+  return (3.0+cos(sqrt3*wd_t)+(1.0-1.0/sqrt3)*cos(((3.0-sqrt3)/2.0)*wd_t)+(1.0+1.0/sqrt3)*cos(((3.0 + sqrt3)/2.0)*wd_t))/6.0;
+}
+
 /** Theory and chisquare functions.
 * Based on the compiler flags set theory is calculated either in single hist mode or asymetric.
 * Based on the compiler flags calculate either chisq or MLE
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@ -146,10 +146,10 @@ int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int

  if (apiCuda()) {
    ierr = CUDA_SAFECALL( DKS_SUCCESS );
-    chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
+    chiSq = (ChiSquareRuntime*) CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
  } else {
    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
-    chiSq = OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
+    chiSq = (ChiSquareRuntime*) OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
  }

  if (ierr == DKS_SUCCESS) {
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@ -42,7 +42,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  if (!fp)
    DEBUG_MSG("Can't open kernel file" << kernel_file);

-  //get file size and allocate memory	
+  //get file size and allocate memory
  fseek(fp, 0, SEEK_END);
  fsize = ftell(fp);
  kernel_source = new char[fsize+1];
@ -52,7 +52,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
  fread(kernel_source, 1, sizeof(char)*fsize, fp);
  kernel_source[fsize] = '\0';
  fclose(fp);
-  
+
  std::string kernel_string (kernel_source);
  return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;

@ -76,7 +76,6 @@ int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {

 double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

-  
  int ierr;
  //calc number of threads per workgroup and nr of work groups
  size_t work_size_sum = (size_t)blockSize_m;
@ -87,7 +86,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
    work_items = (length / work_size_sum + 1) * work_size_sum;
  int work_groups = length / work_size_sum + 1;
  */
-  
+
  size_t work_items = 80 * work_size_sum;
  int work_groups = 80;

@ -96,19 +95,19 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

  double *partial_sums = new double[work_groups];
  tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
-  
+
  //execute sum kernel
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
-  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); 
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
  
-  //read partial sums and free temp mempry
+  //read partial sums and free temp memory
  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);
-  
+
  //sumup partial sums on the host
  double result = 0;
  for (int i = 0; i < work_groups; i++)
@ -157,6 +156,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -172,20 +172,23 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
    m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(17, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    //create kernel
    ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
-
    if (ierr != DKS_SUCCESS)
      return ierr;

    //set kernel args
+    size_t num=1;
    m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
    m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
    m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
@ -200,9 +203,12 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
    m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
    m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
    m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
-    m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
-    m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
-    m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
+    num = numpar; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(14, sizeof(double)*num, NULL);
+    num = numfunc; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
+    num = nummap; if (num == 0) num = 1;
+    m_oclbase->ocl_setKernelArg(16, sizeof(int)*num, NULL);

    if (ierr != DKS_SUCCESS)
      return ierr;
@ -250,7 +256,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  return ierr;
 }

-int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, 
+int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
 					  int size_func, int size_map)
 {

@ -285,7 +291,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
    ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
-    
+
    initDone_m = false;
  }

@ -321,4 +327,3 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
  return ierr;

 }
-
--- a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@ -59,6 +59,25 @@ double sekt(double t, double lambda) {
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
 }

+double dglktfzf(double t, double sigma, double hopp) {
+  double nut = hopp*t;
+
+  return exp(-sqrt(4.0*pow(sigma/hopp,2.0)*(exp(-nut)-1.0+nut)));
+}
+
+double dglktflf(double t, double nu0, double sigma, double hopp) {
+  double w0 = TWO_PI*nu0;
+  double w0_2 = w0*w0;
+  double w0_t = w0*t;
+  double nu_2 = hopp*hopp;
+  double nu_t = hopp*t;
+  double Gamma_t = ((w0_2+nu_2)*nu_t+(w0_2-nu_2)*(1.0-exp(-nu_t)*cos(w0_t))-2.0*hopp*w0*exp(-nu_t)*sin(w0_t))/pow(w0_2+nu_2,2.0);
+  if (Gamma_t < 0.0)
+    Gamma_t = 0.0;
+
+  return exp(-sqrt(4.0*sigma*hopp*Gamma_t));
+}
+
 double lgkt(double t, double lambda, double sigma) {
  double lambdat = lambda*t;
  double sigmatsq = pow(sigma*t, 2.0);
@ -92,6 +111,12 @@ double rahf(double t, double nu, double lambda) {
  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
 }

+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
 double tf(double t, double phi, double nu) {
  double tmp_nu = TWO_PI*nu*t;
  double tmp_phi = DEG_TO_RAD * phi;
@ -106,6 +131,56 @@ double ifld(double t, double alpha, double phi, double nu, double lambdaT, doubl
  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

+double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double rate2 = sigma*sigma*t*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
+  }
+
+  return result;
+}
+
+double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
+  double wt = TWO_PI*nu*t;
+  double at = a*t;
+  double rateL = 0.0;
+  double result = 0.0;
+
+  // make sure lambda > 0
+  if (lambda < 0.0)
+    return 0.0;
+
+  if (beta < 0.001) {
+    rateL = 1.0;
+  } else {
+    rateL = pow(lambda*t, beta);
+  }
+
+  if (nu < 0.01) {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
+  } else {
+    result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
+  }
+
+  return result;
+}
+
 double b(double t, double phi, double nu) {
  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
 }
@ -117,12 +192,6 @@ double ib(double t, double alpha, double phi, double nu, double lambdaT, double
  return alpha*bessj0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

-double ab(double t, double sigma, double gamma) {
-  double gt = gamma*t;
-
-  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
-}
-
 double snkzf(double t, double Delta0, double Rb) {
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
@ -157,6 +226,14 @@ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double n
  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
 }

+double fmuf(double t, double wd) {
+  double sqrt3 = sqrt(3.0);
+  double wd_t = wd*t;
+
+  return (3.0+cos(sqrt3*wd_t)+(1.0-1.0/sqrt3)*cos(((3.0-sqrt3)/2.0)*wd_t)+(1.0+1.0/sqrt3)*cos(((3.0 + sqrt3)/2.0)*wd_t))/6.0;
+}
+
+
 __kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
            __global double *par, __global double *chisq, __global  int *map, __global double *funcv,
 			      int length, int numpar, int numfunc, int nummap,
Author	SHA1	Message	Date
Andreas Suter	9d21fc5400	added a couple new theory functions (dglktfzf, dglktflf, fmuf).	2025-06-10 14:20:45 +02:00
Andreas Suter	21b4c591b9	adopted DKS to Cuda12 and GV100. For Tesla K40c code lines are still present but commented by as35.	2023-01-30 12:38:52 +01:00
Andreas Suter	efa3311b45	increased minimum required cmake version to 3.2	2023-01-30 12:27:37 +01:00
Andreas Suter	ee33aacdd6	fix a minor bug on reporting the selected device.	2022-10-31 17:49:20 +01:00
Andreas Suter	48f6f9c25e	make newer compilers happy.	2022-04-08 16:53:34 +02:00
Andreas Suter	db79798da5	switch to the shared dks lib.	2020-12-28 18:17:41 +01:00
Andreas Suter	9381b14b87	changed version to 1.1.4	2020-06-09 13:04:48 +02:00
Andreas Suter	43cb9020c4	adapted for CUDA 11	2020-06-09 12:55:55 +02:00
Andreas Suter	3d946f666b	added the two new muSR functions ifgk and ifll (CUDA/OpenCL).	2019-01-22 14:10:02 +01:00
Andreas Suter	e6021eb6e3	Set kernel argument size to a value > 0 For the case that map or fun is not used in the msr-file, the corresponding call to the setKernelArg still needs a 2nd argument > 0, otherwise macOS crashes.	2018-12-11 11:35:32 +01:00