Added a couple of new theory functions (dglktfzf, dglktflf, fmuf).

Adapted DKS to CUDA 12 and GV100. The code lines for the Tesla K40c are still present, but commented out and tagged with "as35".
increased minimum required cmake version to 3.2
2025-06-10 14:20:45 +02:00 · 2023-01-30 12:38:52 +01:00 · 2023-01-30 12:27:37 +01:00 · 2022-10-31 17:49:20 +01:00 · 2022-04-08 16:53:34 +02:00 · 2020-12-28 18:17:41 +01:00
9 changed files with 80 additions and 24 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,8 +1,8 @@
 CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
 PROJECT (DKS)
 SET (DKS_VERSION_MAJOR 1)
-SET (DKS_VERSION_MINOR 1)
-SET (DKS_VERSION_PATCH 4)
+SET (DKS_VERSION_MINOR 2)
+SET (DKS_VERSION_PATCH 0)
 set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
 SET (PACKAGE \"dks\")
 SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -148,7 +148,8 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
    SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

-    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+#as35: Tesla K40c    SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")    
+    SET (CUDA_NVCC_FLAGS "-arch=sm_70;-gencode=arch=compute_70,code=sm_70;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")  # one nvcc option per ';'-separated list item (no embedded spaces)
    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")

    IF (NOT STATIC_CUDA)
--- a/cmake/DKSConfig.cmake.in
+++ b/cmake/DKSConfig.cmake.in
@ -1,7 +1,7 @@
 SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
 SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
-SET(${PROJECT_NAME}_LIBRARY "dks")
+SET(${PROJECT_NAME}_LIBRARY "dksshared")
 SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
 SET(DKS_CUDA_STATIC ${STATIC_CUDA})
 SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.2)

 SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 MACRO (ADD_SOURCES )
--- a/src/CUDA/CMakeListsLibcuda.txt
+++ b/src/CUDA/CMakeListsLibcuda.txt
@ -1,8 +1,9 @@
-CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
+CMAKE_MINIMUM_REQUIRED (VERSION 3.2)

 FIND_PACKAGE(CUDA REQUIRED)

-SET (CUDA_NVCC_FLAGS "-arch=sm_30")
+#as35: Tesla K40c SET (CUDA_NVCC_FLAGS "-arch=sm_30")
+SET (CUDA_NVCC_FLAGS "-arch=sm_70")

 SET(LIB_TYPE STATIC)

@ -22,4 +23,4 @@ INCLUDE_DIRECTORIES (
  ${CMAKE_CURRENT_SOURCE_DIR}
 )

-CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS})
+CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS})
--- a/src/CUDA/CudaBase.cu
+++ b/src/CUDA/CudaBase.cu
@ -245,7 +245,7 @@ int CudaBase::cuda_setDevice(int device) {
  std::cout << "Init: " << device << "\t" << ndev << std::endl;

  if (device < ndev) {
-    std::cout << "set device to: " << ndev << std::endl;
+    std::cout << "set device to: " << device << std::endl;
    cudaSetDevice(device);
  } else {
    if (ndev > 0)
--- a/src/CUDA/CudaChiSquareRuntime.cu
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@ -94,7 +94,8 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
  }

  //compile program
-  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+//as35: for Tesla K40c  const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
+  const char *opts[] = {"-arch=compute_70", "-fmad=false", ""};
  int numopts = 2;
  if (mlh) {
    opts[2] = "-DMLH";
--- a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
+++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
@ -36,6 +36,25 @@ __device__ double sekt(double t, double lambda) {
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
 }

+__device__ double dglktfzf(double t, double sigma, double hopp) {
+  double nut = hopp*t;
+
+  return exp(-sqrt(4.0*pow(sigma/hopp,2.0)*(exp(-nut)-1.0+nut)));
+}
+
+__device__ double dglktflf(double t, double nu0, double sigma, double hopp) {
+  double w0 = TWO_PI*nu0;
+  double w0_2 = w0*w0;
+  double w0_t = w0*t;
+  double nu_2 = hopp*hopp;
+  double nu_t = hopp*t;
+  double Gamma_t = ((w0_2+nu_2)*nu_t+(w0_2-nu_2)*(1.0-exp(-nu_t)*cos(w0_t))-2.0*hopp*w0*exp(-nu_t)*sin(w0_t))/pow(w0_2+nu_2,2.0);
+  if (Gamma_t < 0.0)
+    Gamma_t = 0.0;
+
+  return exp(-sqrt(4.0*sigma*hopp*Gamma_t));
+}
+
 __device__ double lgkt(double t, double lambda, double sigma) {
  double lambdat = lambda*t;
  double sigmatsq = pow(sigma*t, 2.0);
@ -69,6 +88,12 @@ __device__ double rahf(double t, double nu, double lambda) {
  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
 }

+__device__ double ab(double t, double sigma, double gamma) {
+    double gt = gamma*t;
+
+    return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
 __device__ double tf(double t, double phi, double nu) {
  double tmp_nu = TWO_PI*nu*t;
  double tmp_phi = DEG_TO_RAD*phi;
@ -144,12 +169,6 @@ __device__ double ib(double t, double alpha, double phi, double nu, double lambd
  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

-__device__ double ab(double t, double sigma, double gamma) {
-  double gt = gamma*t;
-
-  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
-}
-
 __device__ double snkzf(double t, double Delta0, double Rb) {
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
@ -184,6 +203,13 @@ __device__ double dnktf(double t, double phi, double nu, double Delta0, double R
  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
 }

+__device__ double fmuf(double t, double wd) {
+  double sqrt3 = sqrt(3.0);
+  double wd_t = wd*t;
+
+  return (3.0+cos(sqrt3*wd_t)+(1.0-1.0/sqrt3)*cos(((3.0-sqrt3)/2.0)*wd_t)+(1.0+1.0/sqrt3)*cos(((3.0 + sqrt3)/2.0)*wd_t))/6.0;
+}
+
 /** Theory and chisquare functions.
 * Based on the compiler flags set theory is calculated either in single hist mode or asymetric.
 * Based on the compiler flags calculate either chisq or MLE
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@ -146,10 +146,10 @@ int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int

  if (apiCuda()) {
    ierr = CUDA_SAFECALL( DKS_SUCCESS );
-    chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
+    chiSq = (ChiSquareRuntime*) CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
  } else {
    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
-    chiSq = OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
+    chiSq = (ChiSquareRuntime*) OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
  }

  if (ierr == DKS_SUCCESS) {
--- a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
+++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
@ -59,6 +59,25 @@ double sekt(double t, double lambda) {
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
 }

+double dglktfzf(double t, double sigma, double hopp) {
+  double nut = hopp*t;
+
+  return exp(-sqrt(4.0*pow(sigma/hopp,2.0)*(exp(-nut)-1.0+nut)));
+}
+
+double dglktflf(double t, double nu0, double sigma, double hopp) {
+  double w0 = TWO_PI*nu0;
+  double w0_2 = w0*w0;
+  double w0_t = w0*t;
+  double nu_2 = hopp*hopp;
+  double nu_t = hopp*t;
+  double Gamma_t = ((w0_2+nu_2)*nu_t+(w0_2-nu_2)*(1.0-exp(-nu_t)*cos(w0_t))-2.0*hopp*w0*exp(-nu_t)*sin(w0_t))/pow(w0_2+nu_2,2.0);
+  if (Gamma_t < 0.0)
+    Gamma_t = 0.0;
+
+  return exp(-sqrt(4.0*sigma*hopp*Gamma_t));
+}
+
 double lgkt(double t, double lambda, double sigma) {
  double lambdat = lambda*t;
  double sigmatsq = pow(sigma*t, 2.0);
@ -92,6 +111,12 @@ double rahf(double t, double nu, double lambda) {
  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
 }

+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
 double tf(double t, double phi, double nu) {
  double tmp_nu = TWO_PI*nu*t;
  double tmp_phi = DEG_TO_RAD * phi;
@ -167,12 +192,6 @@ double ib(double t, double alpha, double phi, double nu, double lambdaT, double
  return alpha*bessj0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
 }

-double ab(double t, double sigma, double gamma) {
-  double gt = gamma*t;
-
-  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
-}
-
 double snkzf(double t, double Delta0, double Rb) {
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
@ -207,6 +226,14 @@ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double n
  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
 }

+double fmuf(double t, double wd) {
+  double sqrt3 = sqrt(3.0);
+  double wd_t = wd*t;
+
+  return (3.0+cos(sqrt3*wd_t)+(1.0-1.0/sqrt3)*cos(((3.0-sqrt3)/2.0)*wd_t)+(1.0+1.0/sqrt3)*cos(((3.0 + sqrt3)/2.0)*wd_t))/6.0;
+}
+
+
 __kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
            __global double *par, __global double *chisq, __global  int *map, __global double *funcv,
 			      int length, int numpar, int numfunc, int nummap,
Author	SHA1	Message	Date
Andreas Suter	9d21fc5400	Added a couple of new theory functions (dglktfzf, dglktflf, fmuf).	2025-06-10 14:20:45 +02:00
Andreas Suter	21b4c591b9	Adapted DKS to CUDA 12 and GV100. The code lines for the Tesla K40c are still present, but commented out and tagged with "as35".	2023-01-30 12:38:52 +01:00
Andreas Suter	efa3311b45	increased minimum required cmake version to 3.2	2023-01-30 12:27:37 +01:00
Andreas Suter	ee33aacdd6	fix a minor bug on reporting the selected device.	2022-10-31 17:49:20 +01:00
Andreas Suter	48f6f9c25e	make newer compilers happy.	2022-04-08 16:53:34 +02:00
Andreas Suter	db79798da5	switch to the shared dks lib.	2020-12-28 18:17:41 +01:00