snapshot of svn
This commit is contained in:
174
CMakeLists.txt
Normal file
174
CMakeLists.txt
Normal file
@ -0,0 +1,174 @@
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS)

#project version and package metadata; the escaped quotes are part of the
#values, so every PACKAGE_* variable expands to a quoted string
SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 0.1)
SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
SET (PACKAGE_NAME \"DKS\")
SET (PACKAGE_STRING \"DKS\ 1.0.1\")
SET (PACKAGE_TARNAME \"dks\")
SET (PACKAGE_VERSION \"1.0.1\")
SET (VERSION \"1.0.1\")

SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")

#get compiler name: strip the directory part of the compiler path
#(quoted so that an unusual path is still passed as a single argument)
STRING (REGEX REPLACE ".*/" "" COMPILER_NAME "${CMAKE_CXX_COMPILER}")
MESSAGE (STATUS "Your compiler is: ${COMPILER_NAME}")
MESSAGE (STATUS "Your compiler is: ${CMAKE_CXX_COMPILER}")

MESSAGE (STATUS "C compiler: ${CMAKE_C_COMPILER_ID}")
MESSAGE (STATUS "CXX compiler: ${CMAKE_CXX_COMPILER_ID}")

#opencl and cuda kernel files are in the builds include directory
SET (OPENCL_KERNELS -DOPENCL_KERNELS=\\"${CMAKE_INSTALL_PREFIX}/include/\\")
MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")

#find boost
#only take the hint from the environment when BOOST_DIR is actually set, so an
#empty value does not shadow a BOOSTROOT given on the cmake command line
IF (NOT "$ENV{BOOST_DIR}" STREQUAL "")
  SET (BOOSTROOT $ENV{BOOST_DIR})
ENDIF ()
SET (Boost_USE_STATIC_LIBS OFF)
SET (Boost_USE_STATIC_RUNTIME OFF)
FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
IF (Boost_FOUND)
  MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
  MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
  MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
  INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
  LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND)

#enable UQTK
OPTION (USE_UQTK "Use UQTK" OFF)
|
||||
|
||||
|
||||
#intel icpc compiler specific flags
|
||||
#the compiler id is quoted so the comparison stays valid even when it is empty
IF ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" OR USE_INTEL)

  #for intel compiler turn on openmp and opencl
  OPTION (USE_OPENCL "Use OpenCL" ON)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" ON)

  #find xiar and xild and set flags for offload build on mic
  FIND_PROGRAM(XIAR xiar)
  IF(XIAR)
    MESSAGE(STATUS "xiar found: ${XIAR}")
    SET(CMAKE_AR "${XIAR}")
    #the offload archive rule is only installed when xiar is used as archiver;
    #otherwise the default archive rule of CMake is kept
    SET(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")
    SET(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")
  ENDIF(XIAR)
  MARK_AS_ADVANCED(XIAR)

  FIND_PROGRAM(XILD xild)
  IF(XILD)
    SET(CMAKE_LINKER "${XILD}")
  ENDIF(XILD)
  MARK_AS_ADVANCED(XILD)

  #set flags for openmp and opencl
  #TODO: check which opencl to use: nvidia, amd, intel, apple
  #appended to the existing flags (as in the GNU branch) so user supplied flags are kept
  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -offload -mkl -openmp -lOpenCL -lpthread -DDKS_MIC -DDKS_OPENCL -qopt-report=5 -qopt-report-phase=vec -std=c++11")

  #if an mpi compiler wrapper is used set the mpi flag
  IF ("${COMPILER_NAME}" STREQUAL "mpicxx" OR "${COMPILER_NAME}" STREQUAL "mpiicpc")
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
  ENDIF ("${COMPILER_NAME}" STREQUAL "mpicxx" OR "${COMPILER_NAME}" STREQUAL "mpiicpc")

ENDIF ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" OR USE_INTEL)
|
||||
|
||||
#gnu compiler specific flags
|
||||
#the compiler id is quoted so the comparison stays valid even when it is empty
IF ( ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") AND NOT USE_INTEL)

  OPTION (USE_OPENCL "Use OpenCL" ON)
  OPTION (USE_CUDA "Use CUDA" OFF)
  OPTION (USE_MIC "Use intel MIC" OFF)

  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")

  FIND_PACKAGE(CUDA)
  IF (CUDA_FOUND)
    SET (USE_CUDA ON)
    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
    LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)

    MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}")
    MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64")
    MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")

    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
    SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")

    SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")

    #if cuda version >= 7.0 add runtime compilation flags
    IF (NOT CUDA_VERSION VERSION_LESS "7.0")
      SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
    ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")

    MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")

    SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
    #set(CUDA_SEPARABLE_COMPILATION ON)
    SET(BUILD_SHARED_LIBS OFF)

  ENDIF (CUDA_FOUND)

  IF (NOT CUDA_FOUND)

    MESSAGE(STATUS "CUDA not found, looking for OpenCL")

    FIND_PACKAGE(OpenCL)
    IF (OpenCL_FOUND)
      #OpenCL_LIBRARY is the path of the library itself; the linker search
      #path needs the directory that contains it
      GET_FILENAME_COMPONENT (OpenCL_LIBRARY_DIR "${OpenCL_LIBRARY}" DIRECTORY)
      MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
      MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
      MESSAGE(STATUS "OpenCL library dir: ${OpenCL_LIBRARY_DIR}")
      INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR})
      LINK_DIRECTORIES(${OpenCL_LIBRARY_DIR})
    ENDIF (OpenCL_FOUND)

  ENDIF (NOT CUDA_FOUND)

  #select exactly one set of opencl flags:
  # - cuda found: link against the OpenCL library
  # - mac OS without cuda: use the OpenCL framework (no -lOpenCL on top of it)
  # - otherwise, if an OpenCL package was found: link against the OpenCL library
  IF (CUDA_FOUND)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
  ELSEIF (APPLE)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -framework OpenCL -lpthread -DDKS_OPENCL")
  ELSEIF (OpenCL_FOUND)
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
  ENDIF ()

  #if mpi compiler used set mpi flag
  IF ("${COMPILER_NAME}" STREQUAL "mpicxx")
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
  ENDIF ("${COMPILER_NAME}" STREQUAL "mpicxx")

ENDIF ( ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") AND NOT USE_INTEL)
|
||||
|
||||
#the kernel directory define is appended last so it applies to every compiler branch
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")

#declared as a real option (default OFF, same as the previously undefined
#variable) so it shows up in the cache and in cmake GUIs
OPTION (ENABLE_TESTS "Build the tests in the test directory" OFF)

ADD_SUBDIRECTORY (src)

IF (ENABLE_TESTS)
  ADD_SUBDIRECTORY (test)
ENDIF (ENABLE_TESTS)

ADD_SUBDIRECTORY (auto-tuning)

### write configure files ###
CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake )

### install files ###
#a relative DESTINATION is resolved against CMAKE_INSTALL_PREFIX at install time
INSTALL (
  FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake
  DESTINATION "lib/cmake/${PROJECT_NAME}"
  RENAME ${PROJECT_NAME}Config.cmake
  )
|
82
ReadMe.first
Normal file
82
ReadMe.first
Normal file
@ -0,0 +1,82 @@
|
||||
##################################################################
|
||||
#
|
||||
# Name: Dynamic Kernel Scheduler
|
||||
# Version: 1.0
|
||||
# Author: Uldis Locans
|
||||
# Contacts: locans.uldis@psi.ch
|
||||
#
|
||||
##################################################################
|
||||
|
||||
Dynamic Kernel Scheduler is a library that provides a software layer between host application
|
||||
and hardware accelerators. DKS handles communication between host and device and schedules task
|
||||
execution using predefined algorithms written using CUDA and OpenCL for GPUs, and OpenMP with
|
||||
offload pragmas for IntelMIC. See DKSBase class documentation for full list of functions provided
|
||||
by DKS.
|
||||
|
||||
#####Requirements#####
|
||||
|
||||
OpenMPI (Cuda aware OpenMPI enabled for full compatibility)
|
||||
g++ or icpc compiler
|
||||
Cuda 7.0 or higher (optional)
|
||||
Nvidia or Intel OpenCL SDK (optional)
|
||||
Intel MIC compilers (optional)
|
||||
|
||||
|
||||
######Install######
|
||||
|
||||
#check out DKS
|
||||
svn co svn+ssh://YOULOGIN@savannah02.psi.ch/repos/amas/users/adelmann/Ph.D-students/Locans/work/DKS/trunk DKS
|
||||
|
||||
#set compilers to use
|
||||
#supported c++ compilers: g++, icpc, mpicxx with g++
|
||||
#supported c compilers: gcc, icc, mpicc with gcc
|
||||
export CXX_COMPILER=cpp_compiler_name
|
||||
export CC_COMPILER=c_compiler_name
|
||||
|
||||
#set dks root directory
|
||||
cd DKS
|
||||
export DKS_ROOT=$PWD
|
||||
|
||||
#set build directory (export DKS_BUILD_DIR first, e.g. export DKS_BUILD_DIR=$DKS_ROOT/build)
|
||||
mkdir $DKS_BUILD_DIR
|
||||
cd $DKS_BUILD_DIR
|
||||
|
||||
#set install directory
|
||||
export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/
|
||||
|
||||
CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_INSTALL_DIR $DKS_ROOT
|
||||
|
||||
make
|
||||
make install
|
||||
|
||||
|
||||
######DKS usage######
|
||||
Make install copies the include files and library files to $DKS_BUILD_DIR/build folder, lib folder
|
||||
in the build directory contains libdks.a and libdksshared.so, one of these libraries can be used to link
|
||||
with DKS. All the necessary include files are located in $DKS_BUILD_DIR/build/include.
|
||||
|
||||
Additional flags needed for CUDA and OpenCL mode:
|
||||
-lcudart -lcufft -lcublas -lnvToolsExt -lOpenCL -lnvrtc -lcuda -DDKS_CUDA -DDKS_OPENCL
|
||||
|
||||
Additional flags needed for IntelMIC and OpenCL mode:
|
||||
-offload -mkl -openmp -lOpenCL -DDKS_MIC -DDKS_OPENCL
|
||||
|
||||
Note: always run make install, during runtime OpenCL and CUDA will search for kernel files in
|
||||
$DKS_INSTALL_DIR/build/include directory for runtime compilation.
|
||||
|
||||
######Running DKS######
|
||||
|
||||
#running with cuda
|
||||
#nvidia multi process service started for better CUDA and MPI execution
|
||||
|
||||
#to start mps service (if multiple users use DKS start MPS as root)
|
||||
nvidia-cuda-mps-control -d
|
||||
#to stop mps service
|
||||
echo quit | nvidia-cuda-mps-control
|
||||
|
||||
|
||||
#running dks with MIC
|
||||
#Intel Manycore Platform Software Stack (mpss) service started
|
||||
|
||||
#to start mpss
|
||||
service mpss start
|
19
auto-tuning/CMakeLists.txt
Normal file
19
auto-tuning/CMakeLists.txt
Normal file
@ -0,0 +1,19 @@
|
||||
#headers and library search path of the DKS sources
INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )

#chi square kernel tests and the test to verify search functions:
#each driver is built from <name>.cpp and linked with dks and boost
FOREACH (TUNING_DRIVER testChiSquareRT testChiSquareRTRandom testSearch)
  ADD_EXECUTABLE(${TUNING_DRIVER} ${TUNING_DRIVER}.cpp)
  TARGET_LINK_LIBRARIES(${TUNING_DRIVER} dks ${Boost_LIBRARIES})
ENDFOREACH (TUNING_DRIVER)

#the UQTK variant additionally links the UQTK libraries and is only built on request
IF (USE_UQTK)
  ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
  TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
|
385
auto-tuning/testChiSquareRT.cpp
Normal file
385
auto-tuning/testChiSquareRT.cpp
Normal file
@ -0,0 +1,385 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
#include "Utility/DKSTimer.h"
|
||||
|
||||
#define PI 3.14159265358979323846
|
||||
#define TWO_PI 6.283185307179586231996
|
||||
#define DEG_TO_RAD 1.7453292519943295474371681e-2
|
||||
|
||||
#define N0 0.25
|
||||
#define TAU 2.197019
|
||||
#define BKG 1.0
|
||||
|
||||
#define ALPHA 1.0
|
||||
#define BETA 1.0
|
||||
|
||||
using namespace std;
|
||||
|
||||
/** Fill data[0..N-1] with rand()-based values in the range [0, scale]. */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    const double unit = (double)rand() / RAND_MAX;
    data[idx] = unit * scale;
    ++idx;
  }
}
|
||||
|
||||
/** MusrFit predefined functions.
|
||||
* Predefined functions from MusrFit that can be used to define the theory function.
|
||||
 * First parameter in all the functions is always time - t, rest of the parameters depend
|
||||
* on the function.
|
||||
*/
|
||||
/** Returns exp(-lamda * t). */
double se(double t, double lamda) {
  const double exponent = -lamda*t;
  return exp(exponent);
}
|
||||
|
||||
/** Returns exp(-(lamda * t)^beta). */
double ge(double t, double lamda, double beta) {
  const double stretched = pow(lamda*t, beta);
  return exp( -stretched );
}
|
||||
|
||||
/** Returns exp(-0.5 * (sigma * t)^2). */
double sg(double t, double sigma) {
  const double squared = pow(sigma*t, 2);
  return exp( -0.5 * squared );
}
|
||||
|
||||
/** Returns 1/3 + 2/3 * (1 - s) * exp(-s/2) with s = (sigma * t)^2. */
double stg(double t, double sigma) {
  const double s = pow(sigma*t,2);
  const double envelope = exp(-0.5 * s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - s) * envelope;
}
|
||||
|
||||
/** Returns 1/3 + 2/3 * (1 - x) * exp(-x) with x = lambda * t. */
double sekt(double t, double lambda) {
  const double x = lambda*t;
  const double envelope = exp(-x);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * envelope;
}
|
||||
|
||||
/** Returns 1/3 + 2/3 * (1 - x - s) * exp(-x - s/2), x = lambda * t, s = (sigma * t)^2. */
double lgkt(double t, double lambda, double sigma) {
  const double x = lambda*t;
  const double s = pow(sigma*t, 2.0);
  const double envelope = exp(-x - 0.5*s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x - s) * envelope;
}
|
||||
|
||||
/**
 * Returns 1/3 + 2/3 * (1 - x) * exp(-x / beta) with x = (sigma * t)^beta.
 * For beta below 1e-3 the function returns 0.0 instead.
 */
double skt(double t, double sigma, double beta) {
  const bool betaTooSmall = beta < 1.0e-3;
  if (!betaTooSmall) {
    const double x = pow(sigma*t, beta);
    return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * exp(-x/beta);
  }
  return 0.0;
}
|
||||
|
||||
/**
 * Returns 1/3 * exp(-rateL) + 2/3 * (1 - a / rateT) * exp(-rateT), where
 * a = t^2 * lambda^2 * q, r = |4 * lambda^2 * (1 - q) * t / gamma|,
 * rateL = sqrt(r) and rateT = sqrt(r + a).
 */
double spg(double t, double lambda, double gamma, double q) {
  const double lambdaSq = lambda*lambda;
  const double a = t*t*lambdaSq*q;
  const double r = fabs(4.0*lambdaSq*(1.0-q)*t/gamma);
  const double rateL = sqrt(r);
  const double rateT = sqrt(r+a);
  const double longitudinal = (1.0/3.0)*exp(-rateL);
  const double transverse = (2.0/3.0)*(1.0 - a / rateT)*exp(-rateT);
  return longitudinal + transverse;
}
|
||||
|
||||
/**
 * Returns 1/6 * (1 - x/2) * exp(-x/2) + 1/3 * (1 - x/4) * exp(-0.25 * (x + 2.44949 * y))
 * with x = nu * t and y = lambda * t.
 */
double rahf(double t, double nu, double lambda) {
  const double x = nu*t;
  const double halfX = nu*t/2.0;
  const double y = lambda*t;
  const double first = (1.0/6.0)*(1.0-halfX)*exp(-halfX);
  const double second = (1.0/3.0)*(1.0-x/4.0)*exp(-0.25*(x+2.44949*y));
  return first + second;
}
|
||||
|
||||
double tf(double t, double phi, double nu) {
|
||||
double tmp_nu = TWO_PI*nu*t;
|
||||
double tmp_phi = DEG_TO_RAD * phi;
|
||||
|
||||
return cos(tmp_nu + tmp_phi);
|
||||
}
|
||||
|
||||
double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
|
||||
double wt = TWO_PI*nu*t;
|
||||
double ph = DEG_TO_RAD*phi;
|
||||
|
||||
return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
||||
}
|
||||
|
||||
double b(double t, double phi, double nu) {
|
||||
return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
|
||||
}
|
||||
|
||||
double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
|
||||
double wt = TWO_PI * nu * t;
|
||||
double ph = DEG_TO_RAD * phi;
|
||||
|
||||
return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
||||
}
|
||||
|
||||
/** Returns exp(-(sigma / gamma)^2 * (exp(-g) - 1 + g)) with g = gamma * t. */
double ab(double t, double sigma, double gamma) {
  const double g = gamma*t;
  const double ratioSq = pow(sigma/gamma,2.0);
  return exp(-ratioSq*(exp(-g) - 1.0 + g));
}
|
||||
|
||||
/**
 * Returns 1/3 + 2/3 * aa^1.5 * (1 - d * aa) * exp(-0.5 * d * aa)
 * with d = (Delta0 * t)^2 and aa = 1 / (1 + Rb^2 * d).
 */
double snkzf(double t, double Delta0, double Rb) {
  const double d = pow(Delta0*t, 2.0);
  const double aa = 1.0/(1.0+pow(Rb,2.0)*d);
  const double envelope = exp(-0.5*d*aa);
  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-d*aa)*envelope;
}
|
||||
|
||||
double snktf(double t, double phi, double nu, double Delta0, double Rb) {
|
||||
double wt = TWO_PI*nu*t;
|
||||
double ph = DEG_TO_RAD*phi;
|
||||
double D0t2 = pow(Delta0*t, 2.0);
|
||||
double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
|
||||
|
||||
return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
/**
 * Returns sqrt(aa) * exp(-2 * Delta0^2 * theta * aa) with x = nuc * t,
 * theta = (exp(-x) - 1 - x) / nuc^2 and aa = 1 / (1 + 4 * (Rb * Delta0)^2 * theta).
 */
double dnkzf(double t, double Delta0, double Rb, double nuc) {
  const double x = nuc*t;
  const double theta = (exp(-x) - 1.0 -x)/pow(nuc, 2.0);
  const double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
  const double exponent = -2.0*Delta0*Delta0*theta*aa;
  return sqrt(aa)*exp(exponent);
}
|
||||
|
||||
double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
|
||||
double wt = TWO_PI*nu*t;
|
||||
double ph = DEG_TO_RAD*phi;
|
||||
double nuct = nuc*t;
|
||||
double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
|
||||
double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
|
||||
|
||||
return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
|
||||
double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
|
||||
double timeStart, double timeStep, bool mlh = false)
|
||||
{
|
||||
|
||||
double result = 0.0;
|
||||
for (int i = 0; i < Ndata; i++) {
|
||||
|
||||
double t = timeStart + i*timeStep;
|
||||
double d = data[i];
|
||||
double e = data[i];
|
||||
|
||||
double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
|
||||
double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG;
|
||||
|
||||
if (mlh) {
|
||||
if ((d > 1.0e-9) && (fabs(theo) > 1.0e-9))
|
||||
result += 2.0 * ((theo - d) + d * log(d / theo));
|
||||
else
|
||||
result += 2.0 * (theo - d);
|
||||
} else {
|
||||
if (e != 0.0)
|
||||
result += ( (theo - d) * (theo - d) ) / (e * e);
|
||||
else
|
||||
result += theo * theo;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
|
||||
double timeStart, double timeStep, bool mlh = false)
|
||||
{
|
||||
|
||||
double result = 0.0;
|
||||
for (int i = 0; i < Ndata; i++) {
|
||||
|
||||
double t = timeStart + i*timeStep;
|
||||
double d = data[i];
|
||||
double e = data[i];
|
||||
|
||||
double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
|
||||
double ab = ALPHA * BETA;
|
||||
|
||||
|
||||
double theo = ((ab+1.0)*theoVal - (ALPHA-1.0))/((ALPHA+1.0) - (ab-1.0)*theoVal);
|
||||
|
||||
if (mlh) {
|
||||
result += 0.0; //log max likelihood not defined here
|
||||
} else {
|
||||
if (e != 0.0)
|
||||
result += ( (theo - d) * (theo - d) ) / (e * e);
|
||||
else
|
||||
result += theo * theo;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) {
|
||||
|
||||
int ierr;
|
||||
|
||||
/*
|
||||
* Histogram size used in tests. If autotune run kernes with sizes from 1e5 to 1e6.
|
||||
* If autotune is off just run the test once (used for debuging to test the kernel)
|
||||
*/
|
||||
int Nstart = 1e5;
|
||||
int Nstep = 1e5;
|
||||
int Nend = (autotune) ? 1e6 : 1e5;
|
||||
|
||||
//parameter, function and map sizes used in tests
|
||||
int Npar = 66;
|
||||
int Nfnc = 2;
|
||||
int Nmap = 5;
|
||||
|
||||
//print test info
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Max log likelihood: " << std::boolalpha << mlh << endl;
|
||||
cout << "Asymetry fit: " << std::boolalpha << asym << endl;
|
||||
|
||||
DKSBaseMuSR dksbase;
|
||||
dksbase.setAPI(api_name);
|
||||
dksbase.setDevice(device_name);
|
||||
ierr = dksbase.initDevice();
|
||||
if (ierr != DKS_SUCCESS) {
|
||||
std::cout << "Device not supported!" << std::endl;
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
//get the list of different devices
|
||||
std::vector<int> devices;
|
||||
dksbase.getDeviceList(devices);
|
||||
std::cout << "Unique devices: " << devices.size() << std::endl;
|
||||
|
||||
//create the function string to use in test
|
||||
string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])";
|
||||
int map[5] = {0, 0, 1, 2, 1};
|
||||
|
||||
//runt tests from 100k to 1mil data points
|
||||
for (unsigned int device = 0; device < devices.size(); device++) {
|
||||
for (int Ndata = Nstart; Ndata <= Nend; Ndata += Nstep) {
|
||||
|
||||
dksbase.setDefaultDevice(device);
|
||||
|
||||
std::cout << "Ndata: " << Ndata << std::endl;
|
||||
|
||||
//init the chi square calculations
|
||||
dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap);
|
||||
|
||||
//create random arrays for data, parameter and function storage
|
||||
double *data = new double[Ndata];
|
||||
double *par = new double[Npar];
|
||||
double *fnc = new double[Nfnc];
|
||||
|
||||
randData(data, Ndata);
|
||||
randData(par, Npar);
|
||||
randData(fnc, Nfnc, 100);
|
||||
|
||||
//allocate memory on device
|
||||
void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
|
||||
|
||||
//write data, params, functions and maps to the device
|
||||
dksbase.writeData<double>(data_ptr, data, Ndata);
|
||||
dksbase.writeParams(par, Npar);
|
||||
dksbase.writeFunctions(fnc, Nfnc);
|
||||
dksbase.writeMaps(map, Nmap);
|
||||
|
||||
//set musrfit constants
|
||||
dksbase.callSetConsts(N0, TAU, BKG);
|
||||
dksbase.callSetConsts(ALPHA, BETA);
|
||||
|
||||
//compile the program created with the function string
|
||||
dksbase.callCompileProgram(sFnc, mlh);
|
||||
|
||||
//set autotuning on/off
|
||||
if (autotune)
|
||||
dksbase.setAutoTuningOn();
|
||||
|
||||
//tmp values to store results and tmp values for time steps and start time
|
||||
double result_gpu = 0.0;
|
||||
double result_cpu = 0.0;
|
||||
double dt = 1e-12;
|
||||
double ts = 1e-7;
|
||||
|
||||
//execute kernel on the GPU and execute the same function on the cpu
|
||||
if (!asym) {
|
||||
dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, Npar, Nfnc,
|
||||
Nmap, ts, dt, result_gpu);
|
||||
result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);
|
||||
} else {
|
||||
dksbase.callLaunchChiSquare(2, data_ptr, data_ptr, Ndata, Npar, Nfnc,
|
||||
Nmap, ts, dt, result_gpu);
|
||||
result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);
|
||||
}
|
||||
|
||||
//check the results
|
||||
cout << "DKS: " << result_gpu << endl;
|
||||
cout << "CPU: " << result_cpu << endl;
|
||||
|
||||
//free CPU and GPU memory
|
||||
dksbase.freeMemory<double>(data_ptr, Ndata);
|
||||
dksbase.freeChiSquare();
|
||||
|
||||
delete[] data;
|
||||
delete[] par;
|
||||
delete[] fnc;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
bool asym = false;
|
||||
bool mlh = false;
|
||||
bool autotune = false;
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
|
||||
if (argv[i] == string("-cuda")) {
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-opencl")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-cpu")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-cpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-mlh"))
|
||||
mlh = true;
|
||||
|
||||
if (argv[i] == string("-asym"))
|
||||
asym = true;
|
||||
|
||||
if (argv[i] == string("-autotune"))
|
||||
autotune = true;
|
||||
|
||||
}
|
||||
|
||||
int numPlatforms = 2;
|
||||
const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
|
||||
const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
|
||||
|
||||
for (int i = 0; i < numPlatforms; i++) {
|
||||
runTest(api[i], device[i], autotune, mlh, asym);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
450
auto-tuning/testChiSquareRTRandom.cpp
Normal file
450
auto-tuning/testChiSquareRTRandom.cpp
Normal file
@ -0,0 +1,450 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
#include "Utility/DKSTimer.h"
|
||||
|
||||
#define PI 3.14159265358979323846
|
||||
#define TWO_PI 6.283185307179586231996
|
||||
#define DEG_TO_RAD 1.7453292519943295474371681e-2
|
||||
|
||||
//#define N0 0.25
|
||||
#define N0 1e-10
|
||||
#define TAU 2.197019
|
||||
#define BKG 0.05
|
||||
|
||||
using namespace std;
|
||||
|
||||
typedef std::function<double()> doubleF;
|
||||
|
||||
/** Fill data[0..N-1] with rand()-based values in the range [0, scale]. */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    const double unit = (double)rand() / RAND_MAX;
    data[idx] = unit * scale;
    ++idx;
  }
}
|
||||
|
||||
/** MusrFit predefined functions.
|
||||
* Predefined functions from MusrFit that can be used to define the theory function.
|
||||
 * First parameter in all the functions is always time - t, rest of the parameters depend
|
||||
* on the function.
|
||||
*/
|
||||
/** Returns exp(-lamda * t) for the values pointed to by lamda and t. */
double se(double *t, double *lamda) {
  const double tv = *t;
  const double rate = *lamda;
  return exp( -rate*tv );
}
|
||||
|
||||
/** Returns exp(-(lamda * t)^beta) for the pointed-to values. */
double ge(double *t, double *lamda, double *beta) {
  const double stretched = pow( (*lamda)*(*t), *beta);
  return exp( -stretched );
}
|
||||
|
||||
/** Returns exp(-0.5 * (sigma * t)^2) for the pointed-to values. */
double sg(double *t, double *sigma) {
  const double squared = pow((*sigma)*(*t), 2);
  return exp( -0.5 * squared );
}
|
||||
|
||||
/** Returns 1/3 + 2/3 * (1 - s) * exp(-s/2) with s = (sigma * t)^2 for the pointed-to values. */
double stg(double *t, double *sigma) {
  const double s = pow((*sigma)*(*t),2);
  const double envelope = exp(-0.5 * s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - s) * envelope;
}
|
||||
|
||||
/** Returns 1/3 + 2/3 * (1 - x) * exp(-x) with x = lambda * t for the pointed-to values. */
double sekt(double *t, double *lambda) {
  const double x = *lambda*(*t);
  const double envelope = exp(-x);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * envelope;
}
|
||||
|
||||
/**
 * Returns 1/3 + 2/3 * (1 - x - s) * exp(-x - s/2) with x = lambda * t and
 * s = (sigma * t)^2 for the pointed-to values.
 */
double lgkt(double *t, double *lambda, double *sigma) {
  const double x = *lambda*(*t);
  const double s = pow(*sigma*(*t), 2.0);
  const double envelope = exp(-x - 0.5*s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x - s) * envelope;
}
|
||||
|
||||
/**
 * Returns 1/3 + 2/3 * (1 - x) * exp(-x / beta) with x = (sigma * t)^beta for the
 * pointed-to values. For beta below 1e-3 the function returns 0.0 instead.
 */
double skt(double *t, double *sigma, double *beta) {
  const bool betaTooSmall = *beta < 1.0e-3;
  if (!betaTooSmall) {
    const double x = pow(*sigma*(*t), (*beta));
    return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * exp(-x/(*beta));
  }
  return 0.0;
}
|
||||
|
||||
/**
 * Returns 1/3 * exp(-rateL) + 2/3 * (1 - a / rateT) * exp(-rateT) for the
 * pointed-to values, where a = t^2 * lambda^2 * q,
 * r = |4 * lambda^2 * (1 - q) * t / gamma|, rateL = sqrt(r), rateT = sqrt(r + a).
 */
double spg(double *t, double *lambda, double *gamma, double *q) {
  const double lambdaSq = (*lambda)*(*lambda);
  const double a = (*t)*(*t)*lambdaSq*(*q);
  const double r = fabs(4.0*lambdaSq*(1.0-*q)*(*t)/(*gamma));
  const double rateL = sqrt(r);
  const double rateT = sqrt(r+a);
  const double longitudinal = (1.0/3.0)*exp(-rateL);
  const double transverse = (2.0/3.0)*(1.0 - a / rateT)*exp(-rateT);
  return longitudinal + transverse;
}
|
||||
|
||||
/**
 * Returns 1/6 * (1 - x/2) * exp(-x/2) + 1/3 * (1 - x/4) * exp(-0.25 * (x + 2.44949 * y))
 * with x = nu * t and y = lambda * t for the pointed-to values.
 */
double rahf(double *t, double *nu, double *lambda) {
  const double x = *nu*(*t);
  const double halfX = *nu*(*t)/2.0;
  const double y = *lambda*(*t);
  const double first = (1.0/6.0)*(1.0-halfX)*exp(-halfX);
  const double second = (1.0/3.0)*(1.0-x/4.0)*exp(-0.25*(x+2.44949*y));
  return first + second;
}
|
||||
|
||||
double tf(double *t, double *phi, double *nu) {
|
||||
double tmp_nu = TWO_PI**nu**t;
|
||||
double tmp_phi = DEG_TO_RAD * *phi;
|
||||
|
||||
return cos(tmp_nu + tmp_phi);
|
||||
}
|
||||
|
||||
double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
|
||||
return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
|
||||
}
|
||||
|
||||
double b(double *t, double *phi, double *nu) {
|
||||
return j0(TWO_PI**nu**t + DEG_TO_RAD**phi);
|
||||
}
|
||||
|
||||
double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
|
||||
double wt = TWO_PI * *nu * *t;
|
||||
double ph = DEG_TO_RAD * *phi;
|
||||
|
||||
return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
|
||||
}
|
||||
|
||||
/** Returns exp(-(sigma / gamma)^2 * (exp(-g) - 1 + g)) with g = gamma * t for the pointed-to values. */
double ab(double *t, double *sigma, double *gamma) {
  const double g = *gamma * *t;
  const double ratioSq = pow(*sigma/(*gamma),2.0);
  return exp(-ratioSq*(exp(-g) - 1.0 + g));
}
|
||||
|
||||
/**
 * Returns 1/3 + 2/3 * aa^1.5 * (1 - d * aa) * exp(-0.5 * d * aa) with
 * d = (Delta0 * t)^2 and aa = 1 / (1 + Rb^2 * d) for the pointed-to values.
 */
double snkzf(double *t, double *Delta0, double *Rb) {
  const double d = pow(*Delta0 * *t, 2.0);
  const double aa = 1.0/(1.0+pow(*Rb,2.0)*d);
  const double envelope = exp(-0.5*d*aa);
  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-d*aa)*envelope;
}
|
||||
|
||||
double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
double D0t2 = pow(*Delta0**t, 2.0);
|
||||
double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
|
||||
|
||||
return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
/**
 * Returns sqrt(aa) * exp(-2 * Delta0^2 * theta * aa) for the pointed-to values with
 * x = nuc * t, theta = (exp(-x) - 1 - x) / nuc^2, aa = 1 / (1 + 4 * (Rb * Delta0)^2 * theta).
 */
double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
  const double x = *nuc * *t;
  const double theta = (exp(-x) - 1.0 -x)/pow(*nuc, 2.0);
  const double aa = 1.0/(1.0+4.0*pow(*Rb * *Delta0,2.0)*theta);
  const double exponent = -2.0 * *Delta0 * *Delta0*theta*aa;
  return sqrt(aa)*exp(exponent);
}
|
||||
|
||||
double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
double nuct = *nuc**t;
|
||||
double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
|
||||
double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta);
|
||||
|
||||
return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
double evalf(std::vector< std::pair<int, doubleF> > func) {
|
||||
|
||||
double result = 0.0;
|
||||
for (auto f : func) {
|
||||
switch (f.first) {
|
||||
case 0: result += f.second(); break;
|
||||
case 1: result -= f.second(); break;
|
||||
default: result += f.second(); break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
|
||||
|
||||
double result = 0.0;
|
||||
double ts = *t;
|
||||
|
||||
for (int i = 0; i < ndata; i++) {
|
||||
|
||||
*t = ts + i*dt;
|
||||
double d = data[i];
|
||||
double e = data[i];
|
||||
|
||||
double vf = evalf(func);
|
||||
double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
|
||||
|
||||
if (e != 0.0)
|
||||
result += ( (theo - d) * (theo - d) ) / (e*e);
|
||||
else
|
||||
result += theo * theo;
|
||||
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//fill the parameter array p of length np with random values from 0 to 1
|
||||
/** Fill p[0..np-1] with rand()-based values in the range [0, 1]. */
void randomParams(double *p, int np) {
  int idx = 0;
  while (idx < np) {
    p[idx] = (double)rand() / RAND_MAX;
    ++idx;
  }
}
|
||||
|
||||
//fill the map array m of length nm with random indexes from 0 to max - 1, max <= size of param array
|
||||
/** Fill m[0..nm-1] with rand() % max, i.e. indexes in the range [0, max). */
void randomMaps(int *m, int nm, int max) {
  int idx = 0;
  while (idx < nm) {
    m[idx] = rand() % max;
    ++idx;
  }
}
|
||||
|
||||
int generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc,
|
||||
double *t, double *p, int *m, int np, int nm)
|
||||
{
|
||||
|
||||
//nf defines the number of functions to generate (from 1 to 25)
|
||||
int nf = rand() % 25 + 1;
|
||||
|
||||
for (int n = 0; n < nf; n++) {
|
||||
std::string sf = "";
|
||||
doubleF f;
|
||||
|
||||
int r = rand() % 18; //choose random function to use
|
||||
|
||||
int id1 = rand() % nm;
|
||||
int id2 = rand() % nm;
|
||||
int id3 = rand() % nm;
|
||||
int id4 = rand() % nm;
|
||||
int id5 = rand() % nm;
|
||||
|
||||
std::string p1 = "p[m[" + to_string(id1) + "]])";
|
||||
std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
|
||||
std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]])";
|
||||
std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
|
||||
std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
|
||||
|
||||
//get a random index from maps and use it to get the parameter value, bind function and parameter
|
||||
//values to f, and create string for gpu in sfunc
|
||||
switch (r) {
|
||||
case 0:
|
||||
f = std::bind(se, t, &p[m[id1]]);
|
||||
sf = "se(t," + p1;
|
||||
break;
|
||||
case 1:
|
||||
f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "ge(t," + p2;
|
||||
break;
|
||||
case 2:
|
||||
f = std::bind(sg, t, &p[m[id1]]);
|
||||
sf = "sg(t, " + p1;
|
||||
break;
|
||||
case 3:
|
||||
f = std::bind(stg, t, &p[m[id1]]);
|
||||
sf = "stg(t, " + p1;
|
||||
break;
|
||||
case 4:
|
||||
f = std::bind(sekt, t, &p[m[id1]]);
|
||||
sf = "sekt(t, " + p1;
|
||||
break;
|
||||
case 5:
|
||||
f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "lgkt(t, " + p2;
|
||||
break;
|
||||
case 6:
|
||||
f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "skt(t, " + p2;
|
||||
break;
|
||||
case 7:
|
||||
f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
|
||||
sf = "spg(t, " + p3;
|
||||
break;
|
||||
case 8:
|
||||
f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "rahf(t, " + p2;
|
||||
break;
|
||||
case 9:
|
||||
f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "tf(t, " + p2;
|
||||
break;
|
||||
case 10:
|
||||
f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "ifld(t, " + p5;
|
||||
break;
|
||||
case 11:
|
||||
f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "b(t, " + p2;
|
||||
break;
|
||||
case 12:
|
||||
f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "ib(t, " + p5;
|
||||
break;
|
||||
case 13:
|
||||
f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "ab(t, " + p2;
|
||||
break;
|
||||
case 14:
|
||||
f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "snkzf(t, " + p2;
|
||||
break;
|
||||
case 15:
|
||||
f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
|
||||
sf = "snktf(t, " + p4;
|
||||
break;
|
||||
case 16:
|
||||
f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
|
||||
sf = "dnkzf(t, " + p3;
|
||||
break;
|
||||
case 17:
|
||||
f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "dnktf(t, " + p5;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
int sign = rand() % 2;
|
||||
if (n == 0) sign = 0;
|
||||
func.push_back( std::make_pair(sign, f) );
|
||||
if (n == 0)
|
||||
sfunc = sf;
|
||||
else {
|
||||
switch(sign) {
|
||||
case 0: sfunc += " + " + sf; break;
|
||||
case 1: sfunc += " - " + sf; break;
|
||||
default: sfunc += " + " + sf; break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return nf;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
|
||||
srand(time(NULL));
|
||||
|
||||
int ierr;
|
||||
int Ndata = 1e6;
|
||||
|
||||
bool autotune = false;
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
|
||||
if (argv[i] == string("-cuda")) {
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-opencl")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-cpu")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-cpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-autotune")) {
|
||||
autotune = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//create a random number of parameters
|
||||
int np = ( rand() % (1000 - 50) ) + 50;
|
||||
int nm = ( rand() % (50 - 5) ) + 5;
|
||||
int nf = ( rand() % (50 - 5) ) + 5;
|
||||
|
||||
int *m = new int[nm];
|
||||
double *p = new double[np];
|
||||
double *f = new double[nf];
|
||||
|
||||
randomParams(p, np);
|
||||
randomMaps(m, nm, np);
|
||||
randomParams(f, nf);
|
||||
|
||||
double dt = 1e-10;
|
||||
double t = 1e-10;
|
||||
std::vector< std::pair<int, doubleF> > func;
|
||||
std::string sfunc;
|
||||
int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm);
|
||||
|
||||
//create DKS base object, set and init device / framework
|
||||
DKSBaseMuSR dksbase;
|
||||
dksbase.setAPI(api_name);
|
||||
dksbase.setDevice(device_name);
|
||||
|
||||
dksbase.initDevice();
|
||||
dksbase.initChiSquare(Ndata, np, nf, nm);
|
||||
|
||||
dksbase.writeParams(p, np);
|
||||
dksbase.writeFunctions(f, nf);
|
||||
dksbase.writeMaps(m, nm);
|
||||
|
||||
dksbase.callSetConsts(N0, TAU, BKG);
|
||||
|
||||
dksbase.callCompileProgram(sfunc);
|
||||
|
||||
if (autotune)
|
||||
dksbase.setAutoTuningOn();
|
||||
|
||||
int oper = 0;
|
||||
dksbase.getOperations(oper);
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of params: " << np << endl;
|
||||
cout << "Number of maps: " << nm << endl;
|
||||
cout << "Number of predefined functions: " << nfunc << endl;
|
||||
cout << "Number of ptx instructions: " << oper << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
cout << sfunc << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//allocate memory on host and device device
|
||||
double *data = new double[Ndata];
|
||||
randomParams(data, Ndata);
|
||||
void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
|
||||
dksbase.writeData<double>(data_ptr, data, Ndata);
|
||||
|
||||
for (int N = 1e5; N < Ndata + 1; N += 1e5) {
|
||||
double result_dks, result_cpu;
|
||||
|
||||
t = 1e-10;
|
||||
|
||||
dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks);
|
||||
result_cpu = cpuChiSq(data, func, N, &t, dt);
|
||||
|
||||
cout << "Npart: " << N << endl;
|
||||
cout << "DKS: " << result_dks << endl;
|
||||
cout << "CPU: " << result_cpu << endl;
|
||||
|
||||
}
|
||||
|
||||
dksbase.freeMemory<double>(data_ptr, Ndata);
|
||||
dksbase.freeChiSquare();
|
||||
delete[] data;
|
||||
delete[] p;
|
||||
delete[] f;
|
||||
delete[] m;
|
||||
|
||||
return 0;
|
||||
}
|
618
auto-tuning/testChiSquareRTUQTK.cpp
Normal file
618
auto-tuning/testChiSquareRTUQTK.cpp
Normal file
@ -0,0 +1,618 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#include <cstdio>
|
||||
#include <stddef.h>
|
||||
#include <fstream>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <getopt.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
#include "Utility/DKSTimer.h"
|
||||
|
||||
#include "Array1D.h"
|
||||
#include "Array2D.h"
|
||||
#include "Array3D.h"
|
||||
#include "error_handlers.h"
|
||||
#include "PCSet.h"
|
||||
#include "fast_laplace.h"
|
||||
#include "uqtktools.h"
|
||||
#include "lreg.h"
|
||||
|
||||
#define PI 3.14159265358979323846
|
||||
#define TWO_PI 6.283185307179586231996
|
||||
#define DEG_TO_RAD 1.7453292519943295474371681e-2
|
||||
|
||||
//#define N0 0.25
|
||||
#define N0 1e-10
|
||||
#define TAU 2.197019
|
||||
#define BKG 0.05
|
||||
|
||||
using namespace std;
|
||||
|
||||
typedef std::function<double()> doubleF;
|
||||
|
||||
/**
 * Fill an existing array with pseudo-random values scaled by an integer factor.
 * Every element becomes (rand() / RAND_MAX) * scale.
 *
 * @param data   array to fill (allocated by the caller)
 * @param N      number of elements to write
 * @param scale  factor applied to every value (default 1)
 */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    const double unit = static_cast<double>(rand()) / RAND_MAX;
    data[idx] = unit * scale;
    ++idx;
  }
}
|
||||
|
||||
/** MusrFit predefined functions.
|
||||
* Predefined functions from MusrFit that can be used to define the theory function.
|
||||
 * The first parameter of every function is always the time t; the remaining parameters depend
|
||||
* on the function.
|
||||
*/
|
||||
/**
 * Evaluates exp(-lamda * t).
 * @param t      pointer to the time value
 * @param lamda  pointer to the rate
 */
double se(double *t, double *lamda) {
  const double rate = *lamda;
  const double time = *t;
  return exp( -rate * time );
}
|
||||
//math func + math oper + memory loads
|
||||
//1 + 1 + 2
|
||||
|
||||
|
||||
/**
 * Evaluates exp(-(lamda * t)^beta).
 * @param t      pointer to the time value
 * @param lamda  pointer to the rate
 * @param beta   pointer to the exponent
 */
double ge(double *t, double *lamda, double *beta) {
  const double scaled = (*lamda) * (*t);
  return exp( -pow(scaled, *beta) );
}
|
||||
//2 + 1 + 3
|
||||
|
||||
/**
 * Evaluates exp(-0.5 * (sigma * t)^2).
 * @param t      pointer to the time value
 * @param sigma  pointer to the width parameter
 */
double sg(double *t, double *sigma) {
  const double scaled = (*sigma) * (*t);
  return exp( -0.5 * pow(scaled, 2) );
}
|
||||
//2 + 2 + 2
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * (1 - s) * exp(-s / 2) with s = (sigma * t)^2.
 * @param t      pointer to the time value
 * @param sigma  pointer to the width parameter
 */
double stg(double *t, double *sigma) {
  const double s = pow((*sigma) * (*t), 2);
  const double decay = exp(-0.5 * s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - s) * decay;
}
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * (1 - x) * exp(-x) with x = lambda * t.
 * @param t       pointer to the time value
 * @param lambda  pointer to the rate
 */
double sekt(double *t, double *lambda) {
  const double x = *lambda * (*t);
  const double decay = exp(-x);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * decay;
}
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * (1 - x - s) * exp(-x - s / 2)
 * with x = lambda * t and s = (sigma * t)^2.
 * @param t       pointer to the time value
 * @param lambda  pointer to the rate of the linear term
 * @param sigma   pointer to the width of the quadratic term
 */
double lgkt(double *t, double *lambda, double *sigma) {
  const double x = *lambda * (*t);
  const double s = pow(*sigma * (*t), 2.0);
  const double decay = exp(-x - 0.5*s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x - s) * decay;
}
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * (1 - x) * exp(-x / beta) with x = (sigma * t)^beta.
 * Returns 0 when beta is below 1e-3, which avoids dividing by a tiny exponent.
 * @param t      pointer to the time value
 * @param sigma  pointer to the width parameter
 * @param beta   pointer to the exponent
 */
double skt(double *t, double *sigma, double *beta) {
  const double expo = *beta;
  if (expo < 1.0e-3) {
    return 0.0;
  }

  const double x = pow(*sigma * (*t), expo);
  const double decay = exp(-x / expo);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * decay;
}
|
||||
|
||||
/**
 * Evaluates 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q / rateT) * exp(-rateT) where
 *   lamt2q = t^2 * lambda^2 * q,
 *   rate2  = 4 * lambda^2 * (1 - q) * t / gamma,
 *   rateL  = sqrt(|rate2|),  rateT = sqrt(|rate2| + lamt2q).
 * @param t       pointer to the time value
 * @param lambda  pointer to the rate
 * @param gamma   pointer to the divisor of rate2 (no check for zero)
 * @param q       pointer to the weight between the two rate terms
 */
double spg(double *t, double *lambda, double *gamma, double *q) {
  const double lamSq = (*lambda)*(*lambda);
  const double lamt2q = (*t)*(*t)*lamSq*(*q);
  const double rate2 = 4.0*lamSq*(1.0-*q)*(*t)/(*gamma);
  const double absRate = fabs(rate2);

  const double rateL = sqrt(absRate);
  const double rateT = sqrt(absRate+lamt2q);

  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
}
|
||||
|
||||
/**
 * Evaluates
 *   1/6 * (1 - nu*t/2) * exp(-nu*t/2)
 *   + 1/3 * (1 - nu*t/4) * exp(-0.25 * (nu*t + 2.44949 * lambda*t)).
 * @param t       pointer to the time value
 * @param nu      pointer to the first rate
 * @param lambda  pointer to the second rate
 */
double rahf(double *t, double *nu, double *lambda) {
  const double nut = *nu * (*t);
  const double halfNut = nut / 2.0;
  const double lamt = *lambda * (*t);

  const double first = (1.0/6.0)*(1.0-halfNut)*exp(-halfNut);
  const double second = (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
  return first + second;
}
|
||||
|
||||
double tf(double *t, double *phi, double *nu) {
|
||||
double tmp_nu = TWO_PI**nu**t;
|
||||
double tmp_phi = DEG_TO_RAD * *phi;
|
||||
|
||||
return cos(tmp_nu + tmp_phi);
|
||||
}
|
||||
|
||||
double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
|
||||
return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
|
||||
}
|
||||
|
||||
double b(double *t, double *phi, double *nu) {
|
||||
return j0(TWO_PI**nu**t + DEG_TO_RAD**phi);
|
||||
}
|
||||
|
||||
double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
|
||||
double wt = TWO_PI * *nu * *t;
|
||||
double ph = DEG_TO_RAD * *phi;
|
||||
|
||||
return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t);
|
||||
}
|
||||
|
||||
/**
 * Evaluates exp(-(sigma / gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)).
 * @param t      pointer to the time value
 * @param sigma  pointer to the width parameter
 * @param gamma  pointer to the rate; also used as divisor (no check for zero)
 */
double ab(double *t, double *sigma, double *gamma) {
  const double gt = *gamma * *t;
  const double ratioSq = pow(*sigma/(*gamma),2.0);
  return exp(-ratioSq*(exp(-gt) - 1.0 + gt));
}
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * a^1.5 * (1 - d*a) * exp(-0.5 * d * a) where
 *   d = (Delta0 * t)^2 and a = 1 / (1 + Rb^2 * d).
 * @param t       pointer to the time value
 * @param Delta0  pointer to the width parameter
 * @param Rb      pointer to the ratio parameter entering a
 */
double snkzf(double *t, double *Delta0, double *Rb) {
  const double d = pow(*Delta0 * *t, 2.0);
  const double a = 1.0/(1.0+pow(*Rb,2.0)*d);

  return (1.0/3.0) + (2.0/3.0)*pow(a,1.5)*(1.0-d*a)*exp(-0.5*d*a);
}
|
||||
|
||||
double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
double D0t2 = pow(*Delta0**t, 2.0);
|
||||
double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2);
|
||||
|
||||
return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
/**
 * Evaluates sqrt(a) * exp(-2 * Delta0^2 * theta * a) where
 *   theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and
 *   a     = 1 / (1 + 4 * (Rb * Delta0)^2 * theta).
 * @param t       pointer to the time value
 * @param Delta0  pointer to the width parameter
 * @param Rb      pointer to the ratio parameter entering a
 * @param nuc     pointer to the rate; also used as divisor (no check for zero)
 */
double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
  const double nuct = *nuc * *t;
  // NOTE(review): theta subtracts nuct, whereas the analogous term in ab() adds
  // gamma*t -- confirm the sign against the device implementation.
  const double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
  const double a = 1.0/(1.0+4.0*pow(*Rb * *Delta0,2.0)*theta);

  return sqrt(a)*exp(-2.0 * *Delta0 * *Delta0*theta*a);
}
|
||||
|
||||
double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
|
||||
double wt = TWO_PI**nu**t;
|
||||
double ph = DEG_TO_RAD**phi;
|
||||
double nuct = *nuc**t;
|
||||
double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
|
||||
double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta);
|
||||
|
||||
return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph);
|
||||
}
|
||||
|
||||
double evalf(std::vector< std::pair<int, doubleF> > func) {
|
||||
|
||||
double result = 0.0;
|
||||
for (auto f : func) {
|
||||
switch (f.first) {
|
||||
case 0: result += f.second(); break;
|
||||
case 1: result -= f.second(); break;
|
||||
default: result += f.second(); break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
|
||||
|
||||
double result = 0.0;
|
||||
double ts = *t;
|
||||
|
||||
for (int i = 0; i < ndata; i++) {
|
||||
|
||||
*t = ts + i*dt;
|
||||
double d = data[i];
|
||||
double e = data[i];
|
||||
|
||||
double vf = evalf(func);
|
||||
double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
|
||||
|
||||
if (e != 0.0)
|
||||
result += ( (theo - d) * (theo - d) ) / (e * e);
|
||||
else
|
||||
result += theo * theo;
|
||||
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Fill an existing array with pseudo-random values.
 * Every element becomes rand() / RAND_MAX, i.e. a value in [0, 1].
 *
 * @param p   array to fill (allocated by the caller)
 * @param np  number of elements to write
 */
void randomParams(double *p, int np) {
  int idx = 0;
  while (idx < np) {
    p[idx] = static_cast<double>(rand()) / RAND_MAX;
    ++idx;
  }
}
|
||||
|
||||
/**
 * Fill an existing map array with pseudo-random indexes.
 * Every element becomes rand() % max, i.e. an index in [0, max - 1].
 *
 * @param m    array to fill (allocated by the caller)
 * @param nm   number of elements to write
 * @param max  exclusive upper bound for the generated indexes
 */
void randomMaps(int *m, int nm, int max) {
  int idx = 0;
  while (idx < nm) {
    m[idx] = rand() % max;
    ++idx;
  }
}
|
||||
|
||||
void generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc,
|
||||
double *t, double *p, int *m, int np, int nm, int nfunc)
|
||||
{
|
||||
|
||||
for (int n = 0; n < nfunc; n++) {
|
||||
std::string sf = "";
|
||||
doubleF f;
|
||||
|
||||
int r = rand() % 18; //randomly choose one of the predefined functions to use
|
||||
|
||||
int id1 = rand() % nm; //randomly select parameters to use in the function
|
||||
int id2 = rand() % nm;
|
||||
int id3 = rand() % nm;
|
||||
int id4 = rand() % nm;
|
||||
int id5 = rand() % nm;
|
||||
|
||||
std::string p1 = "p[m[" + to_string(id1) + "]])";
|
||||
std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
|
||||
std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]])";
|
||||
std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
|
||||
std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
|
||||
to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
|
||||
|
||||
//get a random index from maps and use it to get the parameter value, bind function and parameter
|
||||
//values to f, and create string for gpu in sfunc
|
||||
switch (r) {
|
||||
case 0:
|
||||
f = std::bind(se, t, &p[m[id1]]);
|
||||
sf = "se(t," + p1;
|
||||
break;
|
||||
case 1:
|
||||
f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "ge(t," + p2;
|
||||
break;
|
||||
case 2:
|
||||
f = std::bind(sg, t, &p[m[id1]]);
|
||||
sf = "sg(t, " + p1;
|
||||
break;
|
||||
case 3:
|
||||
f = std::bind(stg, t, &p[m[id1]]);
|
||||
sf = "stg(t, " + p1;
|
||||
break;
|
||||
case 4:
|
||||
f = std::bind(sekt, t, &p[m[id1]]);
|
||||
sf = "sekt(t, " + p1;
|
||||
break;
|
||||
case 5:
|
||||
f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "lgkt(t, " + p2;
|
||||
break;
|
||||
case 6:
|
||||
f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "skt(t, " + p2;
|
||||
break;
|
||||
case 7:
|
||||
f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
|
||||
sf = "spg(t, " + p3;
|
||||
break;
|
||||
case 8:
|
||||
f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "rahf(t, " + p2;
|
||||
break;
|
||||
case 9:
|
||||
f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "tf(t, " + p2;
|
||||
break;
|
||||
case 10:
|
||||
f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "ifld(t, " + p5;
|
||||
break;
|
||||
case 11:
|
||||
f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "b(t, " + p2;
|
||||
break;
|
||||
case 12:
|
||||
f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "ib(t, " + p5;
|
||||
break;
|
||||
case 13:
|
||||
f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "ab(t, " + p2;
|
||||
break;
|
||||
case 14:
|
||||
f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
|
||||
sf = "snkzf(t, " + p2;
|
||||
break;
|
||||
case 15:
|
||||
f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
|
||||
sf = "snktf(t, " + p4;
|
||||
break;
|
||||
case 16:
|
||||
f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
|
||||
sf = "dnkzf(t, " + p3;
|
||||
break;
|
||||
case 17:
|
||||
f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
|
||||
sf = "dnktf(t, " + p5;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
int sign = rand() % 2;
|
||||
if (n == 0) sign = 0;
|
||||
func.push_back( std::make_pair(sign, f) );
|
||||
if (n == 0)
|
||||
sfunc = sf;
|
||||
else {
|
||||
switch(sign) {
|
||||
case 0: sfunc += " + " + sf; break;
|
||||
case 1: sfunc += " - " + sf; break;
|
||||
default: sfunc += " + " + sf; break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
|
||||
srand(time(NULL));
|
||||
|
||||
bool autotune = false;
|
||||
bool eval = false;
|
||||
bool test = false;
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
int nord = 15; //the order of the initial, overcomplete basis
|
||||
int loop = 100;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
|
||||
if (argv[i] == string("-cuda")) {
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-opencl")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-cpu")) {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-cpu");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-autotune")) {
|
||||
autotune = true;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-eval"))
|
||||
eval = true;
|
||||
|
||||
if (argv[i] == string("-test"))
|
||||
test = true;
|
||||
|
||||
if (argv[i] == string("-nord"))
|
||||
nord = atoi(argv[i+1]);
|
||||
|
||||
if (argv[i] == string("-loop"))
|
||||
loop = atoi(argv[i+1]);
|
||||
|
||||
}
|
||||
|
||||
//init dks and set chi^2 constants
|
||||
DKSBaseMuSR dksbase;
|
||||
dksbase.setAPI(api_name);
|
||||
dksbase.setDevice(device_name);
|
||||
dksbase.initDevice();
|
||||
|
||||
if (autotune)
|
||||
dksbase.setAutoTuningOn();
|
||||
|
||||
int nydim = 2; //the dimensionality of input
|
||||
int nxdim = 5;
|
||||
//UQTk arrays
|
||||
Array2D<double> xdata(loop, nxdim, 0.0);
|
||||
Array2D<double> ydata(loop, nydim, 0.0);
|
||||
|
||||
Array2D<double> xdata_pce(loop, nxdim, 0.0);
|
||||
Array2D<double> ydata_pce(loop, nydim, 0.0);
|
||||
|
||||
int size = 10000;
|
||||
Array2D<double> xtmp(size, nxdim, 0.0);
|
||||
Array2D<double> ytmp(size, nydim, 0.0);
|
||||
|
||||
if (eval || test) {
|
||||
for (int l = 0; l < loop; l++) {
|
||||
|
||||
int ierr;
|
||||
|
||||
//create a random number of parameters
|
||||
int n = rand() % 9 + 1;
|
||||
int Ndata = n * 100000; //number of data points 100k to 1milj, with 100k incr.
|
||||
int np = ( rand() % (1000 - 50) ) + 50; //from 50 to 1000 for different shared memory needs
|
||||
int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 50 of the parameters, for different memory access
|
||||
int nf = ( rand() % (50 - 5) ) + 5; //not used in the test case, but changes the shared memory
|
||||
int nfunc = (rand() % (10 - 1) ) + 1; //1 to 10 user defined functions
|
||||
|
||||
//allocate storage for parameters, maps and functions
|
||||
int *m = new int[nm];
|
||||
double *p = new double[np];
|
||||
double *f = new double[nf];
|
||||
|
||||
//fill with random numbers
|
||||
randomParams(p, np);
|
||||
randomMaps(m, nm, np);
|
||||
randomParams(f, nf);
|
||||
|
||||
//create a random user function that can be passed to GPU kernel and evaluated on the host
|
||||
double dt = 1e-10;
|
||||
double t = 1e-10;
|
||||
std::vector< std::pair<int, doubleF> > func;
|
||||
std::string sfunc;
|
||||
generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc);
|
||||
|
||||
//create a data array and fill with random values
|
||||
double *data = new double[Ndata];
|
||||
randomParams(data, Ndata);
|
||||
|
||||
|
||||
//allocate device memory for the data and transfer to the GPU
|
||||
void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
|
||||
dksbase.writeData<double>(data_ptr, data, Ndata);
|
||||
|
||||
//init chi^2
|
||||
dksbase.initChiSquare(Ndata, np, nf, nm);
|
||||
dksbase.callSetConsts(N0, TAU, BKG);
|
||||
|
||||
//write params to the devic
|
||||
dksbase.writeParams(p, np);
|
||||
dksbase.writeFunctions(f, nf);
|
||||
dksbase.writeMaps(m, nm);
|
||||
|
||||
//compile the kernel with the new function
|
||||
dksbase.callCompileProgram(sfunc);
|
||||
|
||||
//run the kernel on the GPU and evaluate the function on the host
|
||||
double result_dks, result_cpu, tmp_result;
|
||||
|
||||
ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm,
|
||||
t, dt, result_dks);
|
||||
|
||||
if (ierr == DKS_SUCCESS) {
|
||||
result_cpu = cpuChiSq(data, func, Ndata, &t, dt);
|
||||
|
||||
std::vector<int> config;
|
||||
dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm,
|
||||
t, dt, tmp_result, config);
|
||||
|
||||
cout << "DKS: " << result_dks << endl;
|
||||
cout << "CPU: " << result_cpu << endl;
|
||||
cout << "Launch parameters: " << config[0] << ", " << config[1] << endl;
|
||||
cout << sfunc << endl;
|
||||
cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
|
||||
|
||||
xdata(l,0) = np;
|
||||
xdata(l,1) = nm;
|
||||
xdata(l,2) = nf;
|
||||
xdata(l,3) = nfunc;
|
||||
xdata(l,4) = Ndata;
|
||||
|
||||
ydata(l,0) = config[0];
|
||||
ydata(l,1) = config[1];
|
||||
|
||||
std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl;
|
||||
} else {
|
||||
cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
|
||||
cout << sfunc << endl;
|
||||
}
|
||||
|
||||
|
||||
//free temporary resources
|
||||
delete[] m;
|
||||
delete[] p;
|
||||
delete[] f;
|
||||
delete[] data;
|
||||
dksbase.freeChiSquare();
|
||||
dksbase.freeMemory<double>(data_ptr, Ndata);
|
||||
}
|
||||
} else {
|
||||
//read_datafileVS(xdata, "xdata.dat");
|
||||
//read_datafileVS(ydata, "ydata.dat");
|
||||
xtmp.SetValue(0.0);
|
||||
ytmp.SetValue(0.0);
|
||||
read_datafileVS(xtmp, "xdata_pce.dat");
|
||||
read_datafileVS(ytmp, "ydata_pce.dat");
|
||||
for (int i = 0; i < loop; i++) {
|
||||
for (int j = 0; j < nxdim; j++)
|
||||
xdata(i,j) = xtmp(i,j);
|
||||
for (int j = 0; j < nydim; j++)
|
||||
ydata(i,j) = ytmp(i,j);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (eval) {
|
||||
for (int i = 0; i < nxdim; i++) {
|
||||
for (int j = 0; j < loop; j++) {
|
||||
xdata_pce(j,i) = xdata(j,i);
|
||||
ydata_pce(j,i) = ydata(j,i);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < nydim; i++) {
|
||||
for (int j = 0; j < loop; j++) {
|
||||
xdata_pce(j,i) = xdata(j,i);
|
||||
ydata_pce(j,i) = ydata(j,i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//read_datafileVS(xdata_pce, "xdata_pce.dat");
|
||||
//read_datafileVS(ydata_pce, "ydata_pce.dat");
|
||||
xtmp.SetValue(0.0);
|
||||
ytmp.SetValue(0.0);
|
||||
read_datafileVS(xtmp, "xdata_pce.dat");
|
||||
read_datafileVS(ytmp, "ydata_pce.dat");
|
||||
for (int i = 0; i < loop; i++) {
|
||||
for (int j = 0; j < nxdim; j++)
|
||||
xdata_pce(i,j) = xtmp(i,j);
|
||||
for (int j = 0; j < nydim; j++)
|
||||
ydata_pce(i,j) = ytmp(i,j);
|
||||
}
|
||||
std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl;
|
||||
}
|
||||
|
||||
//default input settings
|
||||
string which_chaos="LU"; //PC type
|
||||
string msc="m";
|
||||
|
||||
Lreg* reg;
|
||||
reg = new PCreg(which_chaos,nord,nxdim);
|
||||
int nbas = reg->GetNbas();
|
||||
|
||||
Array2D<double> ypc_data(xdata.XSize(), nydim, 0.0);
|
||||
for (int i = 0; i < nydim; i++) {
|
||||
|
||||
std::cout << "start dim " << i+1 << std::endl;
|
||||
|
||||
Array1D<double> ydata_1d(xdata_pce.XSize(), 0.0);
|
||||
for (unsigned int j = 0; j < xdata_pce.XSize(); j++)
|
||||
ydata_1d(j) = ydata_pce(j,i);
|
||||
|
||||
std::cout << "setup data" << std::endl;
|
||||
reg->SetupData(xdata_pce,ydata_1d);
|
||||
|
||||
std::cout << "Comput best lambda" << std::endl;
|
||||
double lambda=reg->LSQ_computeBestLambda();
|
||||
Array1D<double> lam(nbas,lambda);
|
||||
|
||||
|
||||
reg->SetWeights(lam);
|
||||
|
||||
std::cout << "LSQ build regr" << std::endl;
|
||||
|
||||
reg->LSQ_BuildRegr();
|
||||
std::cout << std::endl << "Lambda : " << lambda << std::endl;
|
||||
|
||||
Array1D<double> ypc;
|
||||
Array1D<double> ycheck;
|
||||
Array2D<double> ycheck_cov;
|
||||
|
||||
reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov);
|
||||
std::cout << std::endl << "Eval" << std::endl;
|
||||
|
||||
for (unsigned int j = 0; j < xdata.XSize(); j++)
|
||||
ypc_data(j,i) = ypc(j);
|
||||
|
||||
}
|
||||
|
||||
if (eval) {
|
||||
write_datafile(xdata_pce, "xdata_pce.dat");
|
||||
write_datafile(ydata_pce, "ydata_pce.dat");
|
||||
}
|
||||
|
||||
write_datafile(xdata, "xdata.dat");
|
||||
write_datafile(ydata, "ydata.dat");
|
||||
write_datafile(ypc_data, "ypc_data.dat");
|
||||
|
||||
return 0;
|
||||
}
|
22
auto-tuning/testSearch.cpp
Normal file
22
auto-tuning/testSearch.cpp
Normal file
@ -0,0 +1,22 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
|
||||
/** No accelerator device is used; this test confirms that the search functions
|
||||
* used for auto-tuning work properly
|
||||
*/
|
||||
|
||||
int main() {
|
||||
|
||||
DKSBaseMuSR base;
|
||||
|
||||
std::cout << "Start test" << std::endl;
|
||||
|
||||
base.testAutoTuning();
|
||||
|
||||
std::cout << "Test finished" << std::endl;
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
4
cmake/DKSConfig.cmake.in
Normal file
4
cmake/DKSConfig.cmake.in
Normal file
@ -0,0 +1,4 @@
|
||||
# Package configuration template. Every variable is prefixed with the project name.
# NOTE(review): presumably expanded by configure_file() in the top-level CMakeLists -- confirm.

# compiler flags the library was built with
set(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}")

# header and library directories below the install prefix
set(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
set(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")

# name of the library to link against
set(${PROJECT_NAME}_LIBRARY "dks")
|
139
cmake/Modules/FindOpenCL.cmake
Normal file
139
cmake/Modules/FindOpenCL.cmake
Normal file
@ -0,0 +1,139 @@
|
||||
#.rst:
|
||||
# FindOpenCL
|
||||
# ----------
|
||||
#
|
||||
# Try to find OpenCL
|
||||
#
|
||||
# Once done this will define::
|
||||
#
|
||||
# OpenCL_FOUND - True if OpenCL was found
|
||||
# OpenCL_INCLUDE_DIRS - include directories for OpenCL
|
||||
# OpenCL_LIBRARIES - link against this library to use OpenCL
|
||||
# OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2)
|
||||
# OpenCL_VERSION_MAJOR - The major version of the OpenCL implementation
|
||||
# OpenCL_VERSION_MINOR - The minor version of the OpenCL implementation
|
||||
#
|
||||
# The module will also define two cache variables::
|
||||
#
|
||||
# OpenCL_INCLUDE_DIR - the OpenCL include directory
|
||||
# OpenCL_LIBRARY - the path to the OpenCL library
|
||||
#
|
||||
|
||||
#=============================================================================
|
||||
# Copyright 2014 Matthaeus G. Chajdas
|
||||
#
|
||||
# Distributed under the OSI-approved BSD License (the "License");
|
||||
# see accompanying file Copyright.txt for details.
|
||||
#
|
||||
# This software is distributed WITHOUT ANY WARRANTY; without even the
|
||||
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See the License for more information.
|
||||
#=============================================================================
|
||||
# (To distribute this file outside of CMake, substitute the full
|
||||
# License text for the above reference.)
|
||||
|
||||
# _FIND_OPENCL_VERSION()
#
# Probes cl.h below OpenCL_INCLUDE_DIR for the CL_VERSION_<major>_<minor> macros,
# newest first, and stops at the first one that exists. On a hit it sets, in the
# caller's scope:
#   OpenCL_VERSION_STRING  e.g. "1.2"
#   OpenCL_VERSION_MAJOR   e.g. "1"
#   OpenCL_VERSION_MINOR   e.g. "2"
# Nothing is set when none of the macros is found.
function(_FIND_OPENCL_VERSION)
  include(CheckSymbolExists)
  include(CMakePushCheckState)
  set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY})

  cmake_push_check_state()

  # the header to probe is the same for every version
  if(APPLE)
    # prefer the header from the Framework
    if(EXISTS "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
      set(cl_header "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
    else()
      set(cl_header "${OpenCL_INCLUDE_DIR}/Headers/cl.h")
    endif()
  else()
    set(cl_header "${OpenCL_INCLUDE_DIR}/CL/cl.h")
  endif()

  set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}")

  foreach(candidate "2_0" "1_2" "1_1" "1_0")
    # the result is cached as OPENCL_VERSION_<candidate>
    check_symbol_exists(
      CL_VERSION_${candidate}
      "${cl_header}"
      OPENCL_VERSION_${candidate})

    if(OPENCL_VERSION_${candidate})
      # "1_2" -> "1.2", then split into the numeric components
      string(REPLACE "_" "." dotted "${candidate}")
      string(REGEX MATCHALL "[0-9]+" version_components "${dotted}")
      list(GET version_components 0 major_version)
      list(GET version_components 1 minor_version)

      set(OpenCL_VERSION_STRING ${dotted} PARENT_SCOPE)
      set(OpenCL_VERSION_MAJOR ${major_version} PARENT_SCOPE)
      set(OpenCL_VERSION_MINOR ${minor_version} PARENT_SCOPE)
      break()
    endif()
  endforeach()

  cmake_pop_check_state()
endfunction()
|
||||
|
||||
# Directory that provides CL/cl.h (or OpenCL/cl.h); vendor SDK locations are taken
# from the environment.
find_path(OpenCL_INCLUDE_DIR
  NAMES CL/cl.h OpenCL/cl.h
  PATHS
    ENV "PROGRAMFILES(X86)"
    ENV AMDAPPSDKROOT
    ENV INTELOCLSDKROOT
    ENV NVSDKCOMPUTE_ROOT
    ENV CUDA_PATH
    ENV ATISTREAMSDKROOT
  PATH_SUFFIXES include OpenCL/common/inc "AMD APP/include")

# Sets OpenCL_VERSION_STRING / _MAJOR / _MINOR from the header found above.
_FIND_OPENCL_VERSION()

# find_path() stores the directory that contains libOpenCL.so, not the library file.
# NOTE(review): confirm that consumers of OpenCL_LIBRARY / OpenCL_LIBRARIES expect a
# directory. The two branches differ only in the 32 / 64 bit path suffixes.
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
  find_path(OpenCL_LIBRARY
    NAMES libOpenCL.so
    PATHS
      ENV "PROGRAMFILES(X86)"
      ENV AMDAPPSDKROOT
      ENV INTELOCLSDKROOT
      ENV CUDA_PATH
      ENV NVSDKCOMPUTE_ROOT
      ENV ATISTREAMSDKROOT
    PATH_SUFFIXES "AMD APP/lib/x86" lib/x86 lib/Win32 OpenCL/common/lib/Win32)
elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
  find_path(OpenCL_LIBRARY
    NAMES libOpenCL.so
    PATHS
      ENV "PROGRAMFILES(X86)"
      ENV AMDAPPSDKROOT
      ENV INTELOCLSDKROOT
      ENV CUDA_PATH
      ENV NVSDKCOMPUTE_ROOT
      ENV ATISTREAMSDKROOT
    PATH_SUFFIXES "AMD APP/lib/x86_64" lib/x86_64 lib/x64 OpenCL/common/lib/x64)
endif()

# plural result variables mirror the cache entries
set(OpenCL_LIBRARIES ${OpenCL_LIBRARY})
set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR})

include(FindPackageHandleStandardArgs)
# NOTE(review): an earlier remark here said that old CMake releases (Ubuntu 12.04 /
# Travis CI) do not support "FOUND_VAR OpenCL_FOUND", yet the call below passes it --
# confirm against the minimum CMake version the project requires.
find_package_handle_standard_args(OpenCL
  FOUND_VAR OpenCL_FOUND
  REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
  VERSION_VAR OpenCL_VERSION_STRING)

# keep the two cache entries out of the default cache view
mark_as_advanced(OpenCL_INCLUDE_DIR OpenCL_LIBRARY)
|
BIN
doc/refman.pdf
Normal file
BIN
doc/refman.pdf
Normal file
Binary file not shown.
97
run_tuning_tests.sh
Executable file
97
run_tuning_tests.sh
Executable file
@ -0,0 +1,97 @@
|
||||
#!/bin/bash
# Exports the MIC_* environment variables used for tuning, echoes every value
# that is set, and runs ./testFFT3DRC for the core-count scaling series.
# Test invocations that are currently disabled are listed as comments.

# Export the variable named $1 with value $2 and print the value.
set_and_show() {
    export "$1=$2"
    echo "$2"
}

set_and_show MIC_ENV_PREFIX MIC
set_and_show MIC_OMP_NUM_THREADS 236
set_and_show MIC_KMP_PLACE_THREADS 59c4t0o
set_and_show MIC_USE_2MB_BUFFERS 64K
set_and_show MIC_KMP_AFFINITY scatter

# disabled: ./testFFT3DRC 256 256 256

echo 'real strides divisible by 4 but not by 8'
# disabled: ./testFFT3DRC with all six orderings of 257 244 268

echo 'real strides divisible by 8 but not by 16'
# disabled: ./testFFT3DRC with all six orderings of 257 248 263

echo 'complex strides divisible by 4 but not by 8'
# disabled: ./testFFT3DRC with all six orderings of 257 246 268

echo 'complex strides divisible by 8 but not by 16'
# disabled: ./testFFT3DRC with all six orderings of 257 206 317

echo 'perform scaling tests'

# One core, 1 to 4 threads on it (the test run itself is disabled).
for threads in 1 2 3 4; do
    set_and_show MIC_OMP_NUM_THREADS "$threads"
    set_and_show MIC_KMP_PLACE_THREADS "1c${threads}t0o"
    # disabled: ./testFFT3DRC 256 256 256
done

# Increasing core counts with 4 threads per core.
for cores in 2 4 8 16 32 59; do
    total=$((cores * 4))
    echo "$total"
    set_and_show MIC_OMP_NUM_THREADS "$total"
    set_and_show MIC_KMP_PLACE_THREADS "${cores}c4t0o"
    ./testFFT3DRC 256 256 256
done
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
14
src/Algorithms/CMakeLists.txt
Normal file
14
src/Algorithms/CMakeLists.txt
Normal file
@ -0,0 +1,14 @@
|
||||
# The Algorithms directory contributes no source files, only headers.
set(_SRCS)

set(_HDRS
  ChiSquareRuntime.h
  ImageReconstruction.h
  CollimatorPhysics.h
  FFT.h)

# Register the files with the project-level source/header lists.
add_sources(${_SRCS})
add_headers(${_HDRS})

install(FILES ${_HDRS} DESTINATION include/Algorithms)
|
158
src/Algorithms/ChiSquareRuntime.h
Normal file
158
src/Algorithms/ChiSquareRuntime.h
Normal file
@ -0,0 +1,158 @@
|
||||
#ifndef H_CHISQUARE_RUNTIME
|
||||
#define H_CHISQUARE_RUNTIME
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
#define BLOCK_SIZE 128
|
||||
|
||||
#define FITTYPE_UNDEFINED 0
|
||||
#define FITTYPE_SINGLE_HISTO 1
|
||||
#define FITTYPE_ASYMMETRY 2
|
||||
#define FITTYPE_MU_MINUS 3
|
||||
|
||||
class DKSBaseMuSR;
|
||||
|
||||
class ChiSquareRuntime {
|
||||
friend class DKSBaseMuSR;
|
||||
|
||||
protected:
|
||||
// single histo fit parameter
|
||||
double N0_m;
|
||||
double tau_m;
|
||||
double bkg_m;
|
||||
// asymmetry fit parameter
|
||||
double alpha_m;
|
||||
double beta_m;
|
||||
|
||||
bool initDone_m;
|
||||
void *mem_chisq_m;
|
||||
void *mem_param_m;
|
||||
void *mem_func_m;
|
||||
void *mem_map_m;
|
||||
|
||||
int numBlocks_m;
|
||||
int blockSize_m;
|
||||
|
||||
char *ptx_m;
|
||||
|
||||
void setN0(double value) {
|
||||
N0_m = value;
|
||||
}
|
||||
|
||||
void setTau(double value) {
|
||||
tau_m = value;
|
||||
}
|
||||
|
||||
void setBKG(double value) {
|
||||
bkg_m = value;
|
||||
}
|
||||
|
||||
void setAlpha(double value) {
|
||||
alpha_m = value;
|
||||
}
|
||||
|
||||
void setBeta(double value) {
|
||||
beta_m = value;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/** Default constructor */
|
||||
//ChiSquareRuntime();
|
||||
|
||||
/** Default destructor */
|
||||
virtual ~ChiSquareRuntime() { };
|
||||
|
||||
virtual int compileProgram(std::string function, bool mlh = false) = 0;
|
||||
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result) = 0;
|
||||
|
||||
virtual int writeParams(const double *params, int numparams) = 0;
|
||||
virtual int writeFunc(const double *func, int numfunc) = 0;
|
||||
virtual int writeMap(const int *map, int nummap) = 0;
|
||||
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
|
||||
virtual int freeChiSquare() = 0;
|
||||
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
|
||||
|
||||
/** Set N0, tau and bgk values to use for the kernel.
|
||||
* If values changes between data sets this needs to be called before
|
||||
* every kernel call. Returns DKS_SUCCESS.
|
||||
*/
|
||||
int setConsts(double N0, double tau, double bkg) {
|
||||
setN0(N0);
|
||||
setTau(tau);
|
||||
setBKG(bkg);
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Set alpha and beta values to use for the kernel.
|
||||
* If values changes between data sets this needs to be called before
|
||||
* every kernel call. Returns DKS_SUCCESS.
|
||||
*/
|
||||
int setConsts(double alpha, double beta) {
|
||||
setAlpha(alpha);
|
||||
setBeta(beta);
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Set number of blocks and threads.
|
||||
* Used to set parameters obtained from auto-tuning
|
||||
*/
|
||||
int setKernelParams(int numBlocks, int blockSize) {
|
||||
int ierr = DKS_ERROR;
|
||||
if (numBlocks > 0) {
|
||||
numBlocks_m = numBlocks;
|
||||
ierr = DKS_SUCCESS;
|
||||
}
|
||||
if (blockSize > 0) {
|
||||
blockSize_m = blockSize;
|
||||
ierr = DKS_SUCCESS;
|
||||
}
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
/** Get the number of operations in compiled kernel.
|
||||
* Count the number of operation in the ptx file for the compiled program.
|
||||
*/
|
||||
int getOperations(int &oper) {
|
||||
|
||||
std::string ptx_str(ptx_m);
|
||||
std::istringstream is(ptx_str);
|
||||
|
||||
std::string line;
|
||||
bool start = false;
|
||||
int count = 0;
|
||||
while(std::getline(is, line)) {
|
||||
|
||||
//when fTheory start enable counting of operations
|
||||
size_t f1 = line.find("fTheory");
|
||||
size_t f2 = line.find(".visible");
|
||||
size_t f3 = line.find(";");
|
||||
if (f1 != std::string::npos && f2 != std::string::npos) {
|
||||
start = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
//exit when the new functions begins
|
||||
if (start && f2 != std::string::npos)
|
||||
break;
|
||||
|
||||
//count opertations
|
||||
if (start && f3 != std::string::npos)
|
||||
count++;
|
||||
}
|
||||
|
||||
oper = count;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
47
src/Algorithms/CollimatorPhysics.h
Normal file
47
src/Algorithms/CollimatorPhysics.h
Normal file
@ -0,0 +1,47 @@
|
||||
#ifndef H_COLLIMATOR_PHYSICS
|
||||
#define H_COLLIMATOR_PHYSICS
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
class DKSBaseMuSR;
|
||||
|
||||
/**
 * Abstract interface for the collimator physics and particle push kernels.
 * Every operation is pure virtual and returns an int status code
 * (presumably DKS_SUCCESS / DKS_ERROR -- confirm against the implementations).
 * All data buffers are passed as opaque pointers.
 */
class DKSCollimatorPhysics {
  friend class DKSBaseMuSR;

protected:

  // Launch configuration kept by the implementations.
  int numBlocks_m;
  int blockSize_m;

public:

  virtual ~DKSCollimatorPhysics() { }

  /** Collimator physics on particles stored in a single buffer. */
  virtual int CollimatorPhysics(void *mem_ptr,
                                void *par_ptr,
                                int numpartices) = 0;

  /** Collimator physics on particles stored as separate per-component buffers. */
  virtual int CollimatorPhysicsSoA(void *label_ptr,
                                   void *localID_ptr,
                                   void *rx_ptr,
                                   void *ry_ptr,
                                   void *rz_ptr,
                                   void *px_ptr,
                                   void *py_ptr,
                                   void *pz_ptr,
                                   void *par_ptr,
                                   int numparticles) = 0;

  /** Sort step for the single-buffer layout; numaddback is an output value. */
  virtual int CollimatorPhysicsSort(void *mem_ptr,
                                    int numparticles,
                                    int &numaddback) = 0;

  /** Sort step for the per-component layout; numaddback is an output value. */
  virtual int CollimatorPhysicsSortSoA(void *label_ptr,
                                       void *localID_ptr,
                                       void *rx_ptr,
                                       void *ry_ptr,
                                       void *rz_ptr,
                                       void *px_ptr,
                                       void *py_ptr,
                                       void *pz_ptr,
                                       void *par_ptr,
                                       int numparticles,
                                       int &numaddback) = 0;

  /** Particle push; streamId defaults to -1 and usedt to false. */
  virtual int ParallelTTrackerPush(void *r_ptr,
                                   void *p_ptr,
                                   int npart,
                                   void *dt_ptr,
                                   double dt,
                                   double c,
                                   bool usedt = false,
                                   int streamId = -1) = 0;

  /** Particle push with transformation; streamId defaults to -1 and usedt to false. */
  virtual int ParallelTTrackerPushTransform(void *x_ptr,
                                            void *p_ptr,
                                            void *lastSec_ptr,
                                            void *orient_ptr,
                                            int npart,
                                            int nsec,
                                            void *dt_ptr,
                                            double dt,
                                            double c,
                                            bool usedt = false,
                                            int streamId = -1) = 0;

};
|
||||
|
||||
#endif
|
43
src/Algorithms/FFT.h
Normal file
43
src/Algorithms/FFT.h
Normal file
@ -0,0 +1,43 @@
|
||||
#ifndef H_DKS_FFT
|
||||
#define H_DKS_FFT
|
||||
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/**
 * Abstract interface for the FFT back ends. The base class only stores the
 * dimensions of a default plan and offers a helper that tells whether a
 * requested transform matches that plan.
 */
class DKSFFT {

protected:
  int defaultN[3];   // sizes of the default plan
  int defaultNdim;   // number of dimensions of the default plan

  /**
   * Return true only when the requested transform has the same number of
   * dimensions and the same size in all three entries as the default plan.
   * Any single mismatch rejects the default plan.
   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;
    // every dimension has to match, not just one of them
    if (N[0] != defaultN[0] || N[1] != defaultN[1] || N[2] != defaultN[2])
      return false;
    return true;
  }

public:

  virtual ~DKSFFT() { }

  virtual int setupFFT(int ndim, int N[3]) = 0;
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
  virtual int destroyFFT() = 0;
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
                         int streamId = -1, bool forward = true) = 0;
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                           int streamId = -1) = 0;
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                           int streamId = -1) = 0;
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;

};
|
||||
|
||||
#endif
|
117
src/Algorithms/ImageReconstruction.h
Normal file
117
src/Algorithms/ImageReconstruction.h
Normal file
@ -0,0 +1,117 @@
|
||||
#ifndef H_IMAGERECONSTRUCTION
|
||||
#define H_IMAGERECONSTRUCTION
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
#define BLOCK_SIZE 128
|
||||
|
||||
/** Three single-precision coordinates of a voxel. */
struct VoxelPosition {
  float x, y, z;
};
|
||||
|
||||
/** One list-mode event: two 16-bit detector fields packed as bit-fields. */
struct ListEvent {
  unsigned int detA : 16;
  unsigned int detB : 16;
};
|
||||
|
||||
/**
 * Abstract interface for the image reconstruction kernels. All data buffers
 * are passed as opaque pointers and every operation returns an int status
 * code (presumably DKS_SUCCESS / DKS_ERROR -- confirm against implementations).
 */
class ImageReconstruction {

protected:
  void *m_event_branch;

public:

  virtual ~ImageReconstruction() { }

  /** Calculate source.
   *  Places a sphere at each voxel position and calculates the avg value and
   *  std value of the pixels that are inside this sphere. All the spheres
   *  used have the same diameter.
   */
  virtual int calculateSource(void *image_space,
                              void *image_position,
                              void *source_position,
                              void *avg,
                              void *std,
                              float diameter,
                              int total_voxels,
                              int total_sources,
                              int start = 0) = 0;

  /** Calculate background.
   *  Places two spheres at each voxel position and calculates the avg value
   *  and std value of the pixels that are inside the larger sphere but
   *  outside of the smaller sphere. The diameter of the smaller sphere is
   *  given by the parameter diameter, the diameter of the larger sphere is
   *  2*diameter.
   */
  virtual int calculateBackground(void *image_space,
                                  void *image_position,
                                  void *source_position,
                                  void *avg,
                                  void *std,
                                  float diameter,
                                  int total_voxels,
                                  int total_sources,
                                  int start = 0) = 0;

  /** Calculate source using different sources.
   *  Places two spheres at each voxel position and calculates the avg value
   *  and std value of the pixels that are inside the larger sphere but
   *  outside of the smaller sphere. The diameter of each sphere is given by
   *  the *diameter array.
   *  NOTE(review): this text reads like the background description; confirm
   *  against the implementation.
   */
  virtual int calculateSources(void *image_space,
                               void *image_position,
                               void *source_position,
                               void *avg,
                               void *std,
                               void *diameter,
                               int total_voxels,
                               int total_sources,
                               int start = 0) = 0;

  /** Calculate backgrounds using per-source diameters.
   *  Places two spheres at each voxel position and calculates the avg value
   *  and std value of the pixels that are inside the larger sphere but
   *  outside of the smaller sphere. The diameter of the smaller sphere is
   *  given by the *diameter array, the diameter of the larger sphere is
   *  2*diameter of the smaller sphere.
   */
  virtual int calculateBackgrounds(void *image_space,
                                   void *image_position,
                                   void *source_position,
                                   void *avg,
                                   void *std,
                                   void *diameter,
                                   int total_voxels,
                                   int total_sources,
                                   int start = 0) = 0;

  /** Generate normalization.
   *  Goes through the detector pairs and, if a detector pair crosses the
   *  image, launches a separate kernel that updates the voxel values in the
   *  image on the slope between these two detectors.
   */
  virtual int generateNormalization(void *recon,
                                    void *image_position,
                                    void *det_position,
                                    int total_det) = 0;

  /** Calculate forward projection.
   *  For image reconstruction calculates forward projections,
   *  see recon.cpp for details.
   */
  virtual int forwardProjection(void *correction,
                                void *recon,
                                void *list_data,
                                void *det_position,
                                void *image_position,
                                int num_events) = 0;

  /** Calculate backward projection.
   *  For image reconstruction calculates backward projections,
   *  see recon.cpp for details.
   */
  virtual int backwardProjection(void *correction,
                                 void *recon_corrector,
                                 void *list_data,
                                 void *det_position,
                                 void *image_position,
                                 int num_events,
                                 int num_voxels) = 0;

  /** Set the voxel dimensions on the device. */
  virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;

  /** Set the image edge variables on the device. */
  virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;

  /** Set the image edge1 on the device. */
  virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;

  /** Set the minimum crystal in one ring values on the device. */
  virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
                                  float min_CrystalDist_InOneRing1) = 0;

  /** Set all other required parameters for reconstruction. */
  virtual int setParams(float matrix_distance_factor,
                        float phantom_diameter,
                        float atten_per_mm,
                        float ring_diameter) = 0;

};
|
||||
|
||||
#endif
|
21
src/AutoTuning/CMakeLists.txt
Normal file
21
src/AutoTuning/CMakeLists.txt
Normal file
@ -0,0 +1,21 @@
|
||||
# Source and header files of the auto-tuning module.
set(_SRCS
  DKSAutoTuning.cpp
  DKSSearchStates.cpp
  DKSConfig.cpp)

set(_HDRS
  DKSAutoTuning.h
  DKSSearchStates.h
  DKSAutoTuningTester.h
  DKSConfig.h)

# Register the files with the project-level source/header lists.
add_sources(${_SRCS})
add_headers(${_HDRS})

install(FILES ${_HDRS} DESTINATION include/AutoTuning)
|
302
src/AutoTuning/DKSAutoTuning.cpp
Normal file
302
src/AutoTuning/DKSAutoTuning.cpp
Normal file
@ -0,0 +1,302 @@
|
||||
#include "DKSAutoTuning.h"
|
||||
|
||||
/**
 * Store the DKS base object, the api and device names and the number of
 * timing loops. Timing evaluation is enabled until setFunction() changes it.
 */
DKSAutoTuning::DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops)
  : evaluate_time_m(true),
    api_name_m(api),
    device_name_m(device),
    base_m(base),
    loops_m(loops)
{
}
|
||||
|
||||
/** Drop every registered parameter before the object goes away. */
DKSAutoTuning::~DKSAutoTuning() {
  params_m.erase(params_m.begin(), params_m.end());
}
|
||||
|
||||
/**
 * Copy the values stored in a search state into the registered parameters.
 * Returns DKS_ERROR when the state and the parameter list differ in size,
 * DKS_SUCCESS otherwise.
 */
int DKSAutoTuning::setParameterValues(States state) {

  const bool sizesMatch = (state.size() == params_m.size());
  if (!sizesMatch) {
    // a state must carry exactly one entry per parameter
    DEBUG_MSG("Parameters and states don't match!");
    return DKS_ERROR;
  }

  // entry k of the state belongs to parameter k
  unsigned int k = 0;
  while (k < params_m.size()) {
    params_m[k].setValue(state[k].value);
    ++k;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/** TODO: might need a better timing for GPU code */
|
||||
int DKSAutoTuning::evaluateFunction(double &value) {
|
||||
|
||||
int ierr = DKS_ERROR;
|
||||
DKSTimer t;
|
||||
|
||||
t.init(function_name_m);
|
||||
|
||||
if (evaluate_time_m) {
|
||||
//run for "loop" times and return the average time.
|
||||
//syncDevice() is used to make sure that nothing is running on the device before the timer starts
|
||||
// and to make sure the function has completed on the device before the time stops
|
||||
for (int j = 0; j < loops_m; j++) {
|
||||
base_m->syncDevice();
|
||||
t.start();
|
||||
ierr = f_m();
|
||||
base_m->syncDevice();
|
||||
t.stop();
|
||||
if (ierr != DKS_SUCCESS) //exit loop if kernel execution fials
|
||||
break;
|
||||
}
|
||||
|
||||
//returns
|
||||
value = t.gettime() / loops_m;
|
||||
} else {
|
||||
value = fd_m();
|
||||
ierr = DKS_SUCCESS;
|
||||
}
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
/** Remove all parameters that were registered with addParameter(). */
void DKSAutoTuning::clearParameters() {
  params_m.erase(params_m.begin(), params_m.end());
}
|
||||
|
||||
void DKSAutoTuning::exaustiveSearch() {
|
||||
|
||||
DKSTimer t;
|
||||
t.init("exaustive search");
|
||||
t.start();
|
||||
|
||||
if (params_m.size() < 2)
|
||||
return;
|
||||
|
||||
Parameter p1 = params_m[0];
|
||||
Parameter p2 = params_m[1];
|
||||
|
||||
double time;
|
||||
double mint = 1000000.0;
|
||||
int minv1 = 0;
|
||||
int minv2 = 0;
|
||||
|
||||
//std::ofstream myfile;
|
||||
//std::string filename;
|
||||
//filename = "search_" + api_name_m + "_" + device_name_m + ".dat";
|
||||
//myfile.open(filename);
|
||||
|
||||
for (double v1 = p1.min; v1 <= p1.max; v1 += p1.step) {
|
||||
for (double v2 = p2.min; v2 <= p2.max; v2 += p2.step) {
|
||||
p1.setValue(v1);
|
||||
p2.setValue(v2);
|
||||
|
||||
int ierr = evaluateFunction(time);
|
||||
|
||||
if (ierr == DKS_SUCCESS && time < mint) {
|
||||
mint = time;
|
||||
minv1 = v1;
|
||||
minv2 = v2;
|
||||
}
|
||||
if (ierr == DKS_ERROR)
|
||||
time = 1;
|
||||
|
||||
//myfile << time << "\t";
|
||||
}
|
||||
//myfile << "\n";
|
||||
}
|
||||
//myfile.close();
|
||||
|
||||
//std::cout << "Optimal launch parameters:" << std::endl;
|
||||
//std::cout << mint << "\t" << minv1 << "\t" << minv2 << std::endl;
|
||||
p1.setValue(minv1);
|
||||
p2.setValue(minv2);
|
||||
|
||||
t.stop();
|
||||
//std::cout << "exaustive search: " << t.gettime() << std::endl;
|
||||
}
|
||||
|
||||
void DKSAutoTuning::lineSearch() {
|
||||
DKSTimer t;
|
||||
t.init("line search");
|
||||
t.start();
|
||||
|
||||
double time;
|
||||
int ierr = DKS_ERROR;
|
||||
|
||||
if (params_m.size() < 1) {
|
||||
DEBUG_MSG("Need some parameters to autotune!");
|
||||
return;
|
||||
}
|
||||
|
||||
double mint = 1000000.0;
|
||||
//loop trough parameters one parameter at a time
|
||||
for (auto param : params_m) {
|
||||
int minv = param.getValue();
|
||||
|
||||
//go trough all the values of the parameter, while keeping other parameters const
|
||||
for (double i = param.min; i <= param.max; i += param.step) {
|
||||
//adjust parameters
|
||||
param.setValue(i);
|
||||
|
||||
//run for "loop" times and get average
|
||||
ierr = evaluateFunction(time);
|
||||
|
||||
//if there was no error executing the function and time is better than previou
|
||||
//min time, save the parameter configuration
|
||||
if (ierr == DKS_SUCCESS && time < mint) {
|
||||
mint = time;
|
||||
minv = i;
|
||||
}
|
||||
|
||||
} //repeat
|
||||
|
||||
param.setValue(minv);
|
||||
}
|
||||
|
||||
//DEBUG: print out the found best parameters
|
||||
for (auto param : params_m)
|
||||
std::cout << "Parameter " << param.name << " set to " << param.getValue() << std::endl;
|
||||
|
||||
std::cout << "Best time: " << mint << std::endl;
|
||||
|
||||
t.stop();
|
||||
std::cout << "Line search time: " << t.gettime() << std::endl;
|
||||
|
||||
}
|
||||
|
||||
void DKSAutoTuning::hillClimbing(int restart_loops) {
|
||||
|
||||
DKSTimer t;
|
||||
t.init("hill climbing");
|
||||
t.start();
|
||||
|
||||
std::cout << "hill climbing" << std::endl;
|
||||
|
||||
int ierr;
|
||||
double time_current;
|
||||
double time_next;
|
||||
DKSSearchStates search(params_m);
|
||||
|
||||
std::cout << "start " << restart_loops << std::endl;
|
||||
|
||||
for (int i = 0; i < restart_loops; i++) {
|
||||
|
||||
|
||||
//init random current state
|
||||
search.initCurrentState();
|
||||
|
||||
//evaluate current state
|
||||
setParameterValues(search.getCurrentState());
|
||||
ierr = evaluateFunction(time_current);
|
||||
|
||||
//std::cout << "Start iteration " << i+1 << std::endl;
|
||||
//search.printCurrentState(time_current);
|
||||
|
||||
if (ierr == DKS_ERROR)
|
||||
continue;
|
||||
|
||||
//statr the loop
|
||||
bool topReached = false;
|
||||
while(!topReached) {
|
||||
|
||||
search.getNeighbours();
|
||||
|
||||
//get all the neighbors of the current state
|
||||
bool neighbourFound = false;
|
||||
while (!neighbourFound && search.nextNeighbourExists()) {
|
||||
|
||||
//evaluate all the neighbors of the current state
|
||||
setParameterValues(search.getNextNeighbour());
|
||||
ierr = evaluateFunction(time_next);
|
||||
|
||||
//search.printNeighbour(time_next);
|
||||
|
||||
if (ierr == DKS_ERROR)
|
||||
std::cout << "Error evaluating function" << std::endl;
|
||||
|
||||
//move to the first option that improives the solution
|
||||
if (ierr == DKS_SUCCESS && time_next < time_current) {
|
||||
time_current = time_next;
|
||||
search.moveToNeighbour();
|
||||
neighbourFound = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//if no better option is found save the state and move to step 1
|
||||
if (!neighbourFound) {
|
||||
search.saveCurrentState(time_current);
|
||||
topReached = true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
search.printBest();
|
||||
|
||||
t.stop();
|
||||
std::cout << "hill climbing: " << t.gettime() << std::endl;
|
||||
}
|
||||
|
||||
void DKSAutoTuning::simulatedAnnealing(double Tstart, double Tstep) {
|
||||
|
||||
DKSTimer t;
|
||||
t.init("simulated annealing");
|
||||
t.start();
|
||||
|
||||
int ierr;
|
||||
double time_current;
|
||||
double time_next;
|
||||
|
||||
DKSSearchStates search(params_m);
|
||||
|
||||
//make a random guess
|
||||
search.initCurrentState();
|
||||
|
||||
//evaluate current state
|
||||
setParameterValues(search.getCurrentState());
|
||||
ierr = evaluateFunction(time_current);
|
||||
|
||||
if (ierr == DKS_ERROR)
|
||||
return;
|
||||
|
||||
for (double Temp = Tstart; Temp > 0; Temp -= Tstep) {
|
||||
|
||||
search.printCurrentState(time_current);
|
||||
|
||||
//calucate all the neighbours of current state
|
||||
search.getNeighbours(10);
|
||||
|
||||
//make a move to random neighbour and evaluate the runtime
|
||||
setParameterValues(search.getRandomNeighbour());
|
||||
ierr = evaluateFunction(time_next);
|
||||
|
||||
if (ierr == DKS_ERROR)
|
||||
return;
|
||||
|
||||
//if the solution improves move to this point else move to this point with probabily exp(-dE/T)
|
||||
if (time_next < time_current) {
|
||||
time_current = time_next;
|
||||
search.moveToNeighbour();
|
||||
} else {
|
||||
double p = (double)rand() / RAND_MAX;
|
||||
double dE = time_next - time_current;
|
||||
double P = exp(-dE/Temp);
|
||||
|
||||
if (P > p) {
|
||||
time_current = time_next;
|
||||
search.moveToNeighbour();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
search.printCurrentState(time_current);
|
||||
|
||||
t.stop();
|
||||
std::cout << "Simulated annealing: " << t.gettime() << std::endl;
|
||||
|
||||
}
|
||||
|
103
src/AutoTuning/DKSAutoTuning.h
Normal file
103
src/AutoTuning/DKSAutoTuning.h
Normal file
@ -0,0 +1,103 @@
|
||||
#ifndef DKS_AUTOTUNIG
|
||||
#define DKS_AUTOTUNIG
|
||||
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <cstdlib>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
|
||||
|
||||
#include "../DKSBase.h"
|
||||
#include "../Utility/DKSTimer.h"
|
||||
#include "DKSSearchStates.h"
|
||||
|
||||
typedef std::vector<Parameter> Parameters;
|
||||
typedef std::vector<State> States;
|
||||
|
||||
class DKSAutoTuning {
|
||||
|
||||
private:
|
||||
|
||||
bool evaluate_time_m;
|
||||
|
||||
std::string api_name_m;
|
||||
std::string device_name_m;
|
||||
std::string function_name_m;
|
||||
|
||||
std::function<int()> f_m;
|
||||
std::function<double()> fd_m;
|
||||
Parameters params_m;
|
||||
|
||||
DKSBase *base_m;
|
||||
|
||||
int loops_m;
|
||||
|
||||
/** Update parameters from a state */
|
||||
int setParameterValues(States states);
|
||||
|
||||
/** Evaluate the function and set execution time
|
||||
* Returns DKS_ERROR if errors occured during function execution.
|
||||
* Returns DKS_SUCCESS if function executed as planned.
|
||||
*/
|
||||
int evaluateFunction(double &value);
|
||||
|
||||
public:
|
||||
|
||||
/** Constructor */
|
||||
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
|
||||
|
||||
/** Destructor */
|
||||
~DKSAutoTuning();
|
||||
|
||||
/** Set function to auto tune.
|
||||
* Caller of setFunction is responsible to bind the correct parameters
|
||||
* to the function with std::bind.
|
||||
*/
|
||||
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
|
||||
f_m = f;
|
||||
function_name_m = name;
|
||||
evaluate_time_m = evaluate_time;
|
||||
}
|
||||
|
||||
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
|
||||
fd_m = f;
|
||||
function_name_m = name;
|
||||
evaluate_time_m = evaluate_time;
|
||||
}
|
||||
|
||||
/** Set parameter for auto tuning.
|
||||
* Provide a pointer to a parameter that will be changed during auto-tuning
|
||||
* and a min-max value for this element
|
||||
*/
|
||||
template <typename T1>
|
||||
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
|
||||
Parameter p(value, min, max, step, name);
|
||||
params_m.push_back(p);
|
||||
}
|
||||
|
||||
/** Delete all added parameters */
|
||||
void clearParameters();
|
||||
|
||||
/** Perform exaustive search evaluating all the parameter configurations */
|
||||
void exaustiveSearch();
|
||||
|
||||
/** Perform auto-tuning.
|
||||
* Perform line-search auto-tuning by variying parameters one at a time and keeping other
|
||||
* parameters constant.
|
||||
*/
|
||||
void lineSearch();
|
||||
|
||||
/** Perform hill climbing
|
||||
*/
|
||||
void hillClimbing(int restart_loops = 1);
|
||||
|
||||
/** Perfor simulated annealing to find the parameters */
|
||||
void simulatedAnnealing(double Tstart, double Tstep);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
33
src/AutoTuning/DKSAutoTuningTester.h
Normal file
33
src/AutoTuning/DKSAutoTuningTester.h
Normal file
@ -0,0 +1,33 @@
|
||||
#ifndef DKS_TESTAUTOTUNING
|
||||
#define DKS_TESTAUTOTUNING
|
||||
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
/**
 * Small test object for the auto-tuner: holds a point (x, y) and evaluates
 * an analytic two-variable function at it.
 */
class DKSAutoTuningTester {

  friend class DKSBaseMuSR;

private:

  double x;
  double y;

public:

  /** Start at the origin. */
  DKSAutoTuningTester() {
    x = 0.0;
    y = 0.0;
  }

  /** Defined inline: the class owns no resources and has no source file. */
  ~DKSAutoTuningTester() { }

  /** Evaluate the test function at the stored (x, y). */
  double peaksZ() {

    double z = 3 * pow(1-x,2) * exp(-pow(x,2) - pow(y+1,2)) - 10 * (x/5 - pow(x,3) - pow(y,5)) * exp(-pow(x,2) - pow(y,2)) - (1.0/3.0) * exp( - pow(x+1,2) - pow(y,2));
    return z;
  }

};
|
||||
|
||||
#endif
|
163
src/AutoTuning/DKSConfig.cpp
Normal file
163
src/AutoTuning/DKSConfig.cpp
Normal file
@ -0,0 +1,163 @@
|
||||
#include "DKSConfig.h"
|
||||
|
||||
/**
 * Read the HOME environment variable, remember whether it was set, and try
 * to load the configuration file.
 */
DKSConfig::DKSConfig() {

  // getenv yields NULL when HOME is not defined
  homedir_m = getenv("HOME");
  homeset_m = (homedir_m != NULL);

  loadConfigFile();

}
|
||||
|
||||
/** Try to save the configuration when the object is destroyed. */
DKSConfig::~DKSConfig() {
  // the status code of the save is intentionally not inspected
  saveConfigFile();
}
|
||||
|
||||
|
||||
int DKSConfig::loadConfigFile() {
|
||||
|
||||
int ierr = DKS_ERROR;
|
||||
/*
|
||||
if (homeset_m) {
|
||||
//check if $HOME/.config/DKS exists
|
||||
std::string filename = homedir_m + config_dir + config_file;
|
||||
std::cout << "Check for: " << filename << std::endl;
|
||||
if (fs::exists(filename)) {
|
||||
try {
|
||||
pt::read_xml(filename, tree_m);
|
||||
treeloaded_m = true;
|
||||
ierr = DKS_SUCCESS;
|
||||
} catch (std::exception &e) {
|
||||
DEBUG_MSG("Error loading autotuning file!");
|
||||
treeloaded_m = false;
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
int DKSConfig::saveConfigFile() {
|
||||
|
||||
int ierr = DKS_ERROR;
|
||||
/*
|
||||
std::string savedir = homedir_m + config_dir;
|
||||
std::string savefile = homedir_m + config_dir + config_file;
|
||||
|
||||
std::cout << savedir << std::endl;
|
||||
std::cout << savefile << std::endl;
|
||||
|
||||
if (homeset_m) {
|
||||
//check if $HOME/.config/DKS directory exists, if not create
|
||||
bool homecreated = false;
|
||||
fs::path p (savedir);
|
||||
if (!fs::is_directory(p))
|
||||
homecreated = fs::create_directory(p);
|
||||
|
||||
try {
|
||||
if (homecreated) {
|
||||
pt::write_xml(savefile, tree_m);
|
||||
ierr = DKS_SUCCESS;
|
||||
}
|
||||
} catch(std::exception &e) {
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
int DKSConfig::addConfigParameter(const std::string api, const std::string device,
|
||||
const std::string name, const std::string func,
|
||||
int size, std::string param, int value) {
|
||||
|
||||
|
||||
//keys to acces data in the tree
|
||||
std::string device_name = name;
|
||||
device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
|
||||
std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
|
||||
std::string parameter = key + ".parameter";
|
||||
std::string attr_size = "<xmlattr>.size";
|
||||
std::string attr_param = "<xmlattr>." + param;
|
||||
|
||||
//tmp node where new attributes are cteated in case the attribute doesn't exist in the tree
|
||||
pt::ptree *tmp;
|
||||
bool newNode = true;
|
||||
|
||||
//loop trough all the items in the node and see if new param needs to be created
|
||||
//or old one updated
|
||||
boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
|
||||
if (child) {
|
||||
BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
|
||||
int oldsize = v.second.get<int>(attr_size,-1);
|
||||
|
||||
//if param with the same size already exists in the tree save pointer to this
|
||||
if (size == oldsize) {
|
||||
tmp = &v.second;
|
||||
newNode = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//if parameter doesnt exist with this size, create a new parameter
|
||||
if (newNode) {
|
||||
tmp = new pt::ptree();
|
||||
tmp->add(attr_size, size);
|
||||
tmp->add(attr_param, value);
|
||||
tree_m.add_child(parameter, *tmp);
|
||||
} else {
|
||||
//if parameter exists update the parameter value
|
||||
tmp->put(attr_param, value);
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int DKSConfig::getConfigParameter(const std::string api, const std::string device,
|
||||
const std::string name, const std::string func,
|
||||
int size, std::string param, int &value) {
|
||||
|
||||
//get the value of the tree, default to -1 if value doesn't exist
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
//define key and attribute values to find parameters in the tree
|
||||
std::string device_name = name;
|
||||
device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
|
||||
std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
|
||||
std::string attr_size = "<xmlattr>.size";
|
||||
std::string attr_param = "<xmlattr>." + param;
|
||||
|
||||
float maxDist = std::numeric_limits<float>::max();
|
||||
|
||||
//check if the parameters exist
|
||||
boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
|
||||
if (child) {
|
||||
//loop trough parameters and get the one that is closes to the size specified
|
||||
BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
|
||||
int param_size = v.second.get<int>(attr_size,-1); //get parameter size
|
||||
if (param_size > 0) { // if param_size is -1 param is not defined correctly and not usable
|
||||
float dist = abs(param_size - size);
|
||||
if (dist < maxDist) {
|
||||
value = v.second.get<int>(attr_param,-1);
|
||||
maxDist = dist;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
value = -1;
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
|
69
src/AutoTuning/DKSConfig.h
Normal file
69
src/AutoTuning/DKSConfig.h
Normal file
@ -0,0 +1,69 @@
|
||||
/** Class to save and load DKS autotuning configs.
|
||||
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
|
||||
* Uses boost xml_parser to read and write the xml file and boost property tree to store
|
||||
* the xml content.
|
||||
*/
|
||||
|
||||
#ifndef DKS_CONFIG
|
||||
#define DKS_CONFIG
|
||||
|
||||
#include <boost/property_tree/ptree.hpp>
|
||||
#include <boost/optional/optional.hpp>
|
||||
#include <boost/property_tree/xml_parser.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <exception>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
namespace pt = boost::property_tree;
|
||||
namespace fs = boost::filesystem;
|
||||
|
||||
const std::string config_dir = "/.config/DKS";
|
||||
const std::string config_file = "/autotuning.xml";
|
||||
|
||||
class DKSConfig {

private:

  pt::ptree tree_m;       // property tree holding the autotuning settings
  const char *homedir_m;  // home directory path; the config file path is built from it
  bool homeset_m;         // presumably true when the home directory is known -- TODO confirm in the constructor
  bool treeloaded_m;      // presumably true once the config file has been loaded -- TODO confirm

public:

  /** Constructor, set home variable.
   *  If home directory is not set config file can not be read or saved
   */
  DKSConfig();

  ~DKSConfig();

  /** Load autotuning.xml into tree variable if this file exists */
  int loadConfigFile();

  /** Save autotuning.xml file */
  int saveConfigFile();

  /** Add config parameter to tree.
   *  The entry is stored under DKS.autotune.<api>.<device>.<name>.<func> with a
   *  "size" attribute and an attribute named param holding value. An existing
   *  entry with the same size is updated instead of adding a new one.
   */
  int addConfigParameter(const std::string api, const std::string device,
                         const std::string name, const std::string func,
                         int size, std::string param, int value);

  /** Get config parameter from the tree.
   *  value receives the param attribute of the entry whose size is closest to
   *  size. If the key does not exist value is set to -1 and DKS_ERROR is returned.
   */
  int getConfigParameter(const std::string api, const std::string device,
                         const std::string name, const std::string func,
                         int size, std::string param, int &value);

};
|
||||
|
||||
#endif
|
233
src/AutoTuning/DKSSearchStates.cpp
Normal file
233
src/AutoTuning/DKSSearchStates.cpp
Normal file
@ -0,0 +1,233 @@
|
||||
#include "DKSSearchStates.h"
|
||||
|
||||
/** set the current state so that number of parameters and parameter bounds are known */
|
||||
DKSSearchStates::DKSSearchStates(Parameters params) {
|
||||
|
||||
for (auto p : params) {
|
||||
State s;
|
||||
s.value = p.getValue();
|
||||
s.min = p.min;
|
||||
s.max = p.max;
|
||||
s.step = p.step;
|
||||
current_state_m.push_back(s);
|
||||
}
|
||||
|
||||
neighbour_state_m.resize(current_state_m.size());
|
||||
best_state_m.resize(current_state_m.size());
|
||||
|
||||
best_time_m = std::numeric_limits<double>::max();
|
||||
|
||||
next_neighbour_m = -1;
|
||||
|
||||
srand(time(NULL));
|
||||
|
||||
}
|
||||
|
||||
/** Empty all state containers held by the search. */
DKSSearchStates::~DKSSearchStates() {
  neighbours_m.clear();
  best_state_m.clear();
  neighbour_state_m.clear();
  current_state_m.clear();
}
|
||||
|
||||
/** Get all the possible neighbours of the current state */
|
||||
void DKSSearchStates::getNeighbours(int dist) {
|
||||
|
||||
std::vector< std::vector<double> > values;
|
||||
|
||||
for (auto state : current_state_m) {
|
||||
std::vector<double> s;
|
||||
|
||||
for (int d = dist; d > 0; d--) {
|
||||
if (state.value - d*state.step >= state.min)
|
||||
s.push_back(state.value - state.step);
|
||||
}
|
||||
|
||||
s.push_back(state.value);
|
||||
|
||||
for (int d = 1; d < dist + 1; d++) {
|
||||
if (state.value + d*state.step <= state.max)
|
||||
s.push_back(state.value + state.step);
|
||||
}
|
||||
|
||||
values.push_back(s);
|
||||
}
|
||||
|
||||
|
||||
std::vector< std::vector<double> > s {{}};
|
||||
for (auto& u : values) {
|
||||
std::vector< std::vector<double> > r;
|
||||
for(auto& x : s) {
|
||||
for( auto y : u) {
|
||||
r.push_back(x);
|
||||
r.back().push_back(y);
|
||||
}
|
||||
}
|
||||
s.swap(r);
|
||||
}
|
||||
|
||||
//get current state values
|
||||
std::vector<double> current;
|
||||
for (auto state : current_state_m)
|
||||
current.push_back(state.value);
|
||||
s.erase(std::remove(s.begin(), s.end(), current));
|
||||
|
||||
neighbours_m.clear();
|
||||
neighbours_m = s;
|
||||
next_neighbour_m = 0;
|
||||
}
|
||||
|
||||
void DKSSearchStates::setCurrentState(std::vector<Parameter> current_state) {
|
||||
|
||||
current_state_m.clear();
|
||||
for (auto& p : current_state) {
|
||||
State s;
|
||||
s.value = p.getValue();
|
||||
s.min = p.min;
|
||||
s.max = p.max;
|
||||
s.step = p.step;
|
||||
current_state_m.push_back(s);
|
||||
}
|
||||
}
|
||||
|
||||
void DKSSearchStates::setCurrentState(std::vector<State> current_state) {
|
||||
|
||||
current_state_m.clear();
|
||||
for (auto& p : current_state) {
|
||||
State s;
|
||||
s.value = p.value;
|
||||
s.min = p.min;
|
||||
s.max = p.max;
|
||||
s.step = p.step;
|
||||
current_state_m.push_back(s);
|
||||
}
|
||||
}
|
||||
|
||||
void DKSSearchStates::initCurrentState() {
|
||||
|
||||
//go trough parameters in current state and generate a new random value
|
||||
for (auto& s : current_state_m) {
|
||||
//get number of total values
|
||||
int values = (s.max - s.min) / s.step + 1;
|
||||
|
||||
int r = rand() % values;
|
||||
|
||||
s.value = s.min + r * s.step;
|
||||
}
|
||||
|
||||
getNeighbours();
|
||||
}
|
||||
|
||||
/** Return a copy of the current state. */
States DKSSearchStates::getCurrentState() {
  const States& snapshot = current_state_m;
  return snapshot;
}
|
||||
|
||||
/** Load the next neighbour into the neighbour state and return it.
 *
 * If the cursor does not point at a valid neighbour (the list is exhausted,
 * or getNeighbours() has not been called yet so the cursor is still -1) the
 * neighbour state is returned unchanged. The cursor is advanced in every case.
 */
States DKSSearchStates::getNextNeighbour() {

  //check if there are any neighbours to move on; the lower bound matters
  //because the cursor starts at -1 before the neighbours are calculated
  if (next_neighbour_m >= 0 && next_neighbour_m < (int)neighbours_m.size()) {

    //get the vector of values for each parameter in the neighbour cell
    const std::vector<double>& neighbour_values = neighbours_m[next_neighbour_m];

    //set the values to neighbour_state_m, never reading past either vector
    const std::size_t count = std::min<std::size_t>(neighbour_state_m.size(),
                                                    neighbour_values.size());
    for (std::size_t n = 0; n < count; n++)
      neighbour_state_m[n].value = neighbour_values[n];
  }

  next_neighbour_m++;
  return neighbour_state_m;
}
|
||||
|
||||
/** Load a randomly chosen neighbour into the neighbour state and return it.
 *
 * The neighbour cursor is moved just behind the chosen neighbour. If there
 * are no neighbours the neighbour state is returned unchanged.
 */
States DKSSearchStates::getRandomNeighbour() {

  //nothing to choose from; also avoids the undefined rand() % 0
  if (neighbours_m.empty())
    return neighbour_state_m;

  int rand_neighbour = rand() % (int)neighbours_m.size();

  //get the vector of values for each parameter in the neighbour cell
  const std::vector<double>& neighbour_values = neighbours_m[rand_neighbour];

  //set the values to neighbour_state_m, never reading past either vector
  const std::size_t count = std::min<std::size_t>(neighbour_state_m.size(),
                                                  neighbour_values.size());
  for (std::size_t n = 0; n < count; n++)
    neighbour_state_m[n].value = neighbour_values[n];

  next_neighbour_m = rand_neighbour + 1;
  return neighbour_state_m;
}
|
||||
|
||||
bool DKSSearchStates::nextNeighbourExists() {
|
||||
bool neighbourExists = false;
|
||||
if (next_neighbour_m < (int)neighbours_m.size())
|
||||
neighbourExists = true;
|
||||
|
||||
return neighbourExists;
|
||||
}
|
||||
|
||||
void DKSSearchStates::moveToNeighbour() {
|
||||
|
||||
for (unsigned int i = 0; i < current_state_m.size(); i++)
|
||||
current_state_m[i].value = neighbour_state_m[i].value;
|
||||
|
||||
//getNeighbours();
|
||||
|
||||
}
|
||||
|
||||
void DKSSearchStates::saveCurrentState(double current_time) {
|
||||
|
||||
if (current_time < best_time_m) {
|
||||
for (unsigned int i = 0; i < current_state_m.size(); i++) {
|
||||
best_state_m[i].value = current_state_m[i].value;
|
||||
best_state_m[i].min = current_state_m[i].min;
|
||||
best_state_m[i].max = current_state_m[i].max;
|
||||
best_state_m[i].step = current_state_m[i].step;
|
||||
}
|
||||
|
||||
best_time_m = current_time;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void DKSSearchStates::printCurrentState(double time) {
|
||||
std::cout << "Current state: ";
|
||||
for (auto s : current_state_m)
|
||||
std::cout << s.value << "\t";
|
||||
std::cout << time << std::endl;
|
||||
|
||||
}
|
||||
|
||||
void DKSSearchStates::printInfo() {
|
||||
|
||||
std::cout << "Current state: ";
|
||||
for (auto s : current_state_m)
|
||||
std::cout << s.value << "\t";
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
|
||||
if (next_neighbour_m > 0) {
|
||||
for (auto s : neighbour_state_m)
|
||||
std::cout << s.value << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
}
|
||||
|
||||
void DKSSearchStates::printNeighbour(double time) {
|
||||
std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
|
||||
if (next_neighbour_m > 0) {
|
||||
for (auto s : neighbour_state_m)
|
||||
std::cout << s.value << "\t";
|
||||
}
|
||||
std::cout << time << std::endl;
|
||||
}
|
||||
|
||||
void DKSSearchStates::printBest() {
|
||||
std::cout << "Best state (" << best_time_m << "): ";
|
||||
if (best_time_m > 0) {
|
||||
for (auto s : best_state_m)
|
||||
std::cout << s.value << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
162
src/AutoTuning/DKSSearchStates.h
Normal file
162
src/AutoTuning/DKSSearchStates.h
Normal file
@ -0,0 +1,162 @@
|
||||
#ifndef DKS_SEARCHSTATES
|
||||
#define DKS_SEARCHSTATES
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
/** Type of the variable a Parameter refers to. */
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

/** A tunable parameter: a pointer to the caller's variable plus its bounds,
 *  step size and name.
 *
 *  The parameter does not own the variable it points to; the pointer passed
 *  to the constructor must stay valid for as long as getValue/setValue are
 *  used. Only the pointer matching the parameter type is set, the other one
 *  is NULL.
 */
class Parameter {

private:
  int *ivalue;      // target variable when type is DKS_INT, NULL otherwise
  double *dvalue;   // target variable when type is DKS_DOUBLE, NULL otherwise
  VALUE_TYPE type;

public:
  double min;
  double max;
  double step;
  std::string name;

  /** Parameter backed by an int variable; bounds and step are stored as double. */
  Parameter(int *_value, int _min, int _max, int _step, std::string _name)
    : ivalue(_value), dvalue(NULL), type(DKS_INT),
      min((double)_min), max((double)_max), step((double)_step), name(_name) {
  }

  /** Parameter backed by a double variable. */
  Parameter(double *_value, double _min, double _max, double _step, std::string _name)
    : ivalue(NULL), dvalue(_value), type(DKS_DOUBLE),
      min(_min), max(_max), step(_step), name(_name) {
  }

  /** Write v to the target variable, converted to the parameter's type. */
  template <typename T>
  void setValue(T v) {
    if (type == DKS_INT && ivalue != NULL)
      *ivalue = (int)v;
    if (type == DKS_DOUBLE && dvalue != NULL)
      *dvalue = (double)v;
  }

  /** Read the target variable as double; -1.0 if the type is not known. */
  double getValue() const {
    switch (type) {
    case DKS_INT:
      return (double)*ivalue;
    case DKS_DOUBLE:
      return *dvalue;
    }
    return -1.0;
  }

};
|
||||
|
||||
/** Plain snapshot of one parameter used by the search. */
struct State {
  double value;  // value of the parameter in this state
  double min;    // lower bound of the parameter
  double max;    // upper bound of the parameter
  double step;   // distance between two neighbouring values
};

// list of parameters handed to the search
typedef std::vector<Parameter> Parameters;
// one State per parameter
typedef std::vector<State> States;
|
||||
|
||||
class DKSSearchStates {

private:

  States current_state_m;    // state the search is currently at
  States neighbour_state_m;  // neighbour most recently fetched

  States best_state_m;       // state saved with the lowest time
  double best_time_m;        // time that belongs to best_state_m

  std::vector< std::vector<double> > neighbours_m;  // parameter values of every neighbour
  int next_neighbour_m;      // index of the next neighbour to fetch, -1 before neighbours are calculated

public:

  /** Constructor always takes the params array as variable.
   * Params array is needed to know how many params will be searched and what are the bounds
   * of each parameter.
   */
  DKSSearchStates(Parameters params);

  ~DKSSearchStates();

  /** Set current state using parameter vector */
  void setCurrentState(Parameters current_state);

  /** set current state using the state vector */
  void setCurrentState(States current_state);

  /** init random current state and calculate its neighbours */
  void initCurrentState();

  /** get current state */
  States getCurrentState();

  /** get next neighbour state.
   * if there is no next neighbour the previously fetched neighbour state is returned again
   */
  States getNextNeighbour();

  /** get random neighbour state */
  States getRandomNeighbour();

  /** calculate all the neighbour states */
  void getNeighbours(int dist = 1);

  /** Check if there are more neighbours to evaluate.
   * Return true if more neighbours exist, false if we are at the last neighbour
   */
  bool nextNeighbourExists();

  /** move to the neighbour.
   * copy the values of the neighbour state into the current state.
   * The neighbours of the new current state are not recalculated by this call.
   */
  void moveToNeighbour();

  /** Save the current state and the evaluation time of the current state.
   * If evaluation time of the current state is better than the evaluation time of the
   * best state, save the current state as best.
   */
  void saveCurrentState(double current_time);


  //Print functions - mostly useful for debugging purposes, or for benchmark runs to print the
  //status of the search

  /** Print current state.
   * cout the current state. Mostly used for debugging purposes
   */
  void printCurrentState(double time = 0.0);

  /** Print current neighbour info */
  void printNeighbour(double time = 0.0);

  /** Print info.
   * Print the whole info about the search: current state, current neighbour, total neighbours
   */
  void printInfo();

  /** Print the best saved state */
  void printBest();

};
|
||||
|
||||
#endif
|
130
src/CMakeLists.txt
Normal file
130
src/CMakeLists.txt
Normal file
@ -0,0 +1,130 @@
|
||||
CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
|
||||
|
||||
SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
# ADD_SOURCES(<file>...)
# Append the given files to DKS_SRCS, prefixed with the path of the calling
# directory relative to DKS_SRC_DIR. When called from a subdirectory the
# updated list is also written to the parent scope. This has to stay a macro
# so that DKS_SRCS is modified in the caller's scope.
macro(ADD_SOURCES)
  file(RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
  if(_relPath)
    set(_srcPrefix "${_relPath}/")
  else()
    set(_srcPrefix "")
  endif()
  foreach(_src ${ARGN})
    list(APPEND DKS_SRCS "${_srcPrefix}${_src}")
  endforeach()
  if(_relPath)
    # propagate SRCS to parent directory
    set(DKS_SRCS ${DKS_SRCS} PARENT_SCOPE)
  endif()
endmacro()
|
||||
|
||||
# ADD_HEADERS(<file>...)
# Append the given files to DKS_HDRS, prefixed with the path of the calling
# directory relative to DKS_SRC_DIR. When called from a subdirectory the
# updated list is also written to the parent scope. This has to stay a macro
# so that DKS_HDRS is modified in the caller's scope.
macro(ADD_HEADERS)
  file(RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
  if(_relPath)
    set(_hdrPrefix "${_relPath}/")
  else()
    set(_hdrPrefix "")
  endif()
  foreach(_hdr ${ARGN})
    list(APPEND DKS_HDRS "${_hdrPrefix}${_hdr}")
  endforeach()
  if(_relPath)
    # propagate HDRS to parent directory
    set(DKS_HDRS ${DKS_HDRS} PARENT_SCOPE)
  endif()
endmacro()
|
||||
|
||||
|
||||
# Headers and sources of the base directory that are always built
set(DKS_BASEDIR_HDRS DKSBase.h DKSDefinitions.h)
set(DKS_BASEDIR_SRCS DKSBase.cpp)

# MuSR base class is built for the CUDA and the OpenCL backend
if(USE_CUDA OR USE_OPENCL)
  list(APPEND DKS_BASEDIR_HDRS DKSBaseMuSR.h)
  list(APPEND DKS_BASEDIR_SRCS DKSBaseMuSR.cpp)
endif()

# image reconstruction is built for the CUDA backend only
if(USE_CUDA)
  list(APPEND DKS_BASEDIR_HDRS DKSImageReconstruction.h)
  list(APPEND DKS_BASEDIR_SRCS DKSImageReconstruction.cpp)
endif()

ADD_HEADERS(${DKS_BASEDIR_HDRS})
ADD_SOURCES(${DKS_BASEDIR_SRCS})

message(STATUS "HEADERS: ${DKS_BASEDIR_HDRS}")
message(STATUS "SOURCES: ${DKS_BASEDIR_SRCS}")
|
||||
|
||||
# Backend directories: only the enabled ones contribute source files
if(USE_OPENCL)
  message(STATUS "Add OpenCL source files")
  add_subdirectory(OpenCL)
endif()

if(USE_CUDA)
  message(STATUS "Add CUDA source files")
  add_subdirectory(CUDA)
endif()

if(USE_MIC)
  message(STATUS "Add MIC source files")
  add_subdirectory(MIC)
endif()

# Directories that are always part of the build
add_subdirectory(Utility)
add_subdirectory(AutoTuning)
add_subdirectory(Algorithms)
|
||||
|
||||
# Static (dks) and shared (dksshared) library built from the collected sources.
# The libraries to link are gathered first so that both targets get the same list.
set(_dks_link_libs)

if(USE_CUDA)
  cuda_add_library(dks ${DKS_SRCS})
  cuda_add_library(dksshared SHARED ${DKS_SRCS})
  list(APPEND _dks_link_libs cudadevrt)
else()
  message(STATUS "DKS srcs: ${DKS_SRCS}")
  add_library(dks ${DKS_SRCS})
  add_library(dksshared SHARED ${DKS_SRCS})
endif()

if(USE_UQTK)
  list(APPEND _dks_link_libs
    lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs
    uqtklapack uqtkslatec uqtkblas gfortran)
endif()

# The plain (keyword-less) signature is kept on purpose: it must not be mixed
# with the keyword signature on the same target -- NOTE(review): confirm what
# cuda_add_library uses internally before adding PRIVATE/PUBLIC here.
target_link_libraries(dks ${_dks_link_libs})
target_link_libraries(dksshared ${_dks_link_libs})
|
||||
|
||||
# Install both libraries and the headers of the base directory
install(TARGETS dks dksshared DESTINATION lib)
install(FILES ${DKS_BASEDIR_HDRS} DESTINATION include)
|
||||
|
||||
#IF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc"))
|
||||
# INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/libdksMIC.a DESTINATION build/lib)
|
||||
#ENDIF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc"))
|
||||
|
||||
|
||||
|
35
src/CUDA/CMakeLists.txt
Normal file
35
src/CUDA/CMakeLists.txt
Normal file
@ -0,0 +1,35 @@
|
||||
# Headers of the CUDA backend
set(_HDRS
  CudaBase.cuh
  CudaFFT.cuh
  CudaGreensFunction.cuh
  CudaChiSquare.cuh
  CudaCollimatorPhysics.cuh
  CudaImageReconstruction.cuh
  CudaChiSquareRuntime.cuh
)

# Sources of the CUDA backend
set(_SRCS
  CudaBase.cu
  CudaFFT.cu
  CudaGreensFunction.cu
  CudaChiSquare.cu
  CudaCollimatorPhysics.cu
  CudaImageReconstruction.cu
  CudaChiSquareRuntime.cu
)

#include_directories(
#  ${CMAKE_CURRENT_SOURCE_DIR}
#)

# Register the files with the parent directory's source and header lists
ADD_SOURCES(${_SRCS})
ADD_HEADERS(${_HDRS})

install(FILES ${_HDRS} DESTINATION include/CUDA)

# Kernel sources that are installed alongside the headers
set(_KERNELS
  NVRTCKernels/CudaChiSquareKernel.cu
)

install(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
|
||||
|
25
src/CUDA/CMakeListsLibcuda.txt
Normal file
25
src/CUDA/CMakeListsLibcuda.txt
Normal file
@ -0,0 +1,25 @@
|
||||
# Build description for a library (cudadks) that contains only the CUDA sources
cmake_minimum_required(VERSION 2.8)

find_package(CUDA REQUIRED)

# nvcc flags: target compute architecture sm_30
set(CUDA_NVCC_FLAGS "-arch=sm_30")

set(LIB_TYPE STATIC)

set(DKS_CUDA_HDRS
  CudaBase.cuh
  CudaFFT.cuh
  CudaGreensFunction.cuh
)

set(DKS_CUDA_SRCS
  CudaBase.cu
  CudaFFT.cu
  CudaGreensFunction.cu
)

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

cuda_add_library(cudadks ${DKS_CUDA_SRCS})
|
386
src/CUDA/CudaBase.cu
Normal file
386
src/CUDA/CudaBase.cu
Normal file
@ -0,0 +1,386 @@
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
//=====================================//
|
||||
//============Cuda kernels=============//
|
||||
//=====================================//
|
||||
|
||||
/** Kernel: initialise one curandState per thread.
 *  Thread tid initialises state[tid] with seed + tid, sequence 0 and
 *  offset 0; threads with tid >= size do nothing.
 */
__global__ void initcuRandState(curandState *state, int size, int seed = 0) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= size)
    return;

  curand_init(seed + tid, 0, 0, &state[tid]);
}
|
||||
|
||||
|
||||
//=====================================//
|
||||
//==========Private functions==========//
|
||||
//=====================================//
|
||||
|
||||
|
||||
//====================================//
|
||||
//==========Public functions==========//
|
||||
//====================================//
|
||||
|
||||
/** Start on the default stream (-1) with no curand states allocated.
 *  The curand state pointer and the cuBLAS handle are set to NULL so that the
 *  getters return a defined value before anything has been created.
 */
CudaBase::CudaBase() {

  currentStream = -1;
  cudaStreams.reserve(10);

  defaultCublas = NULL;

  defaultRndState = NULL;
  defaultRndSet = -1;

}
|
||||
|
||||
/** Destroy all streams in the list, then free the curand states. */
CudaBase::~CudaBase() {
  //return codes are of no use in a destructor
  (void)cuda_deleteStreams();
  (void)cuda_deleteCurandStates();
}
|
||||
|
||||
/*
|
||||
create curandStates
|
||||
*/
|
||||
int CudaBase::cuda_createCurandStates(int size) {
|
||||
|
||||
if (defaultRndSet == 1)
|
||||
cuda_deleteCurandStates();
|
||||
|
||||
int threads = 128;
|
||||
int blocks = size / threads + 1;
|
||||
int seed = time(NULL);
|
||||
|
||||
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
|
||||
cudaMalloc(&defaultRndState, sizeof(curandState)*size);
|
||||
initcuRandState<<<blocks, threads>>>(defaultRndState, size, seed);
|
||||
|
||||
defaultRndSet = 1;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_deleteCurandStates() {
|
||||
if (defaultRndSet == 1) {
|
||||
cudaFree(defaultRndState);
|
||||
defaultRndSet = -1;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/** Return the pointer to the curand states stored in this object. */
curandState* CudaBase::cuda_getCurandStates() {
  curandState *states = defaultRndState;
  return states;
}
|
||||
|
||||
/*
|
||||
add cuda stream
|
||||
*/
|
||||
int CudaBase::cuda_createStream(int &streamId) {
|
||||
|
||||
cudaStream_t tmpStream;
|
||||
cudaError_t cerror;
|
||||
|
||||
cerror = cudaStreamCreate(&tmpStream);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Failed to create new CUDA stream, cuda error: " << cerror);
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
cudaStreams.push_back(tmpStream);
|
||||
streamId = cudaStreams.size() - 1;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
add existing stream to list
|
||||
*/
|
||||
int CudaBase::cuda_addStream(cudaStream_t tmpStream, int &streamId) {
|
||||
cudaStreams.push_back(tmpStream);
|
||||
streamId = cudaStreams.size() - 1;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
delete stream
|
||||
*/
|
||||
int CudaBase::cuda_deleteStream(int id) {
|
||||
//TODO: lets see if this is necessary, currently do nothing
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
delete all streams
|
||||
*/
|
||||
int CudaBase::cuda_deleteStreams() {
|
||||
|
||||
//delete all cuda streams
|
||||
for (unsigned int i = 0; i < cudaStreams.size(); i++) {
|
||||
cudaStreamDestroy(cudaStreams[i]);
|
||||
}
|
||||
cudaStreams.clear();
|
||||
currentStream = -1;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
set stream id
|
||||
*/
|
||||
int CudaBase::cuda_setStream(int id) {
|
||||
currentStream = id;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
return stream id
|
||||
*/
|
||||
int CudaBase::cuda_getStreamId() {
|
||||
return currentStream;
|
||||
}
|
||||
|
||||
/*
|
||||
set default stream as the stream to use
|
||||
*/
|
||||
int CudaBase::cuda_defaultStream() {
|
||||
currentStream = -1;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_numberOfStreams() {
|
||||
return cudaStreams.size();
|
||||
}
|
||||
|
||||
/** Return the stream stored under id.
 *  For an id outside the stream list, which includes -1 (the marker this
 *  class uses for the default stream), the CUDA default stream (0) is
 *  returned instead of reading outside the list.
 */
cudaStream_t CudaBase::cuda_getStream(int id) {
  if (id < 0 || id >= (int)cudaStreams.size())
    return (cudaStream_t)0;

  return cudaStreams[id];
}
|
||||
|
||||
/** Return the cuBLAS handle stored in this object. */
cublasHandle_t CudaBase::cuda_getCublas() {
  cublasHandle_t handle = defaultCublas;
  return handle;
}
|
||||
|
||||
/*
|
||||
get all available cuda devices
|
||||
*/
|
||||
int CudaBase::cuda_getDevices() {
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "==============================" << std::endl;
|
||||
std::cout << "=============CUDA=============" << std::endl;
|
||||
std::cout << "==============================" << std::endl;
|
||||
|
||||
int ndev;
|
||||
cudaGetDeviceCount(&ndev);
|
||||
|
||||
std::cout << ndev << std::endl;
|
||||
|
||||
|
||||
for (int i = 0; i < ndev; i++) {
|
||||
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
|
||||
std::cout << "Device " << i+1 << ":" << std::endl;
|
||||
std::cout << "Name: " << prop.name << std::endl;
|
||||
std::cout << "PCI bus id: " << prop.pciBusID << std::endl;
|
||||
std::cout << "PCI device id: " << prop.pciDeviceID << std::endl;
|
||||
std::cout << "PCI domain id: " << prop.pciDomainID << std::endl;
|
||||
std::cout << "==============================" << std::endl;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int CudaBase::cuda_getDeviceCount(int &ndev) {
|
||||
cudaGetDeviceCount(&ndev);
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_getDeviceName(std::string &device_name) {
|
||||
|
||||
int ierr = DKS_SUCCESS;
|
||||
|
||||
int ndev = 0;
|
||||
cudaGetDeviceCount(&ndev);
|
||||
if (ndev > 0) {
|
||||
int device = 0;
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDevice(&device);
|
||||
cudaGetDeviceProperties(&prop, device);
|
||||
|
||||
device_name = prop.name;
|
||||
} else {
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_setDevice(int device) {
|
||||
int ierr = DKS_SUCCESS;
|
||||
int ndev = 0;
|
||||
cudaGetDeviceCount(&ndev);
|
||||
|
||||
std::cout << "Init: " << device << "\t" << ndev << std::endl;
|
||||
|
||||
if (device < ndev) {
|
||||
std::cout << "set device to: " << ndev << std::endl;
|
||||
cudaSetDevice(device);
|
||||
} else {
|
||||
if (ndev > 0)
|
||||
cudaSetDevice(0);
|
||||
else
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_getUniqueDevices(std::vector<int> &devices) {
|
||||
|
||||
std::vector< std::string > names;
|
||||
|
||||
int ndev;
|
||||
cudaGetDeviceCount(&ndev);
|
||||
|
||||
for (int i = 0; i < ndev; i++) {
|
||||
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
|
||||
//add first device to the list, for other devices check if the name is already in the list
|
||||
if (i == 0) {
|
||||
devices.push_back(i);
|
||||
names.push_back(prop.name);
|
||||
} else {
|
||||
std::string target = prop.name;
|
||||
bool isPresent = (std::find(names.begin(), names.end(), target) != names.end());
|
||||
if (!isPresent) {
|
||||
devices.push_back(i);
|
||||
names.push_back(prop.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
set up cuda device
|
||||
*/
|
||||
int CudaBase::cuda_setUp() {
|
||||
|
||||
std::cout << "set up" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
allocate memory on cuda device
|
||||
*/
|
||||
void * CudaBase::cuda_allocateMemory(size_t size, int &ierr) {
|
||||
|
||||
cudaError cerror;
|
||||
void * mem_ptr = NULL;
|
||||
|
||||
cerror = cudaMalloc((void **) &mem_ptr, size);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Failed to allocate memory, cuda error: " << cerror);
|
||||
std::cout << "Error: " << cudaGetErrorString(cerror) << std::endl;
|
||||
ierr = DKS_ERROR;
|
||||
} else {
|
||||
ierr = DKS_SUCCESS;
|
||||
}
|
||||
|
||||
return mem_ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: free memory on device
|
||||
Return: success or error code
|
||||
*/
|
||||
int CudaBase::cuda_freeMemory(void * mem_ptr) {
|
||||
cudaError cerror;
|
||||
|
||||
cerror = cudaFree(mem_ptr);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error freeing memory, cuda error: " << cerror);
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
|
||||
cudaError cerror;
|
||||
|
||||
cerror = cudaFreeHost(mem_ptr);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error freeing host memory, cuda error: " << cerror);
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: allcate memory and write data (push)
|
||||
Return: pointer to memory object
|
||||
*/
|
||||
/*
|
||||
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
|
||||
|
||||
void * mem_ptr;
|
||||
mem_ptr = cuda_allocateMemory(size, ierr);
|
||||
|
||||
if (ierr == DKS_SUCCESS)
|
||||
ierr = cuda_writeData(mem_ptr, in_data, size);
|
||||
|
||||
return mem_ptr;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Info: read data and free memory (pull)
|
||||
Return: success or error code
|
||||
*/
|
||||
/*
|
||||
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
|
||||
|
||||
ierr = cuda_readData(mem_ptr, out_data, size);
|
||||
if (ierr == DKS_SUCCESS)
|
||||
ierr = cuda_freeMemory(mem_ptr);
|
||||
else
|
||||
return DKS_ERROR;
|
||||
|
||||
|
||||
if (ierr == DKS_SUCCESS)
|
||||
return DKS_SUCCESS;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Info: execute function
|
||||
Return: success or error code
|
||||
*/
|
||||
int CudaBase::cuda_executeFunction() {
|
||||
|
||||
std::cout << "Execute function" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: clean up
|
||||
Return: success or error code
|
||||
*/
|
||||
int CudaBase::cuda_cleanUp() {
|
||||
|
||||
std::cout << "clean up" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
390
src/CUDA/CudaBase.cuh
Normal file
390
src/CUDA/CudaBase.cuh
Normal file
@ -0,0 +1,390 @@
|
||||
#ifndef H_CUDA_BASE
|
||||
#define H_CUDA_BASE
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cufft.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <curand_kernel.h>
|
||||
#include <nvToolsExt.h>
|
||||
#include <time.h>
|
||||
|
||||
class CudaBase {
|
||||
|
||||
private:
|
||||
|
||||
int currentStream;
|
||||
std::vector<cudaStream_t> cudaStreams;
|
||||
|
||||
protected:
|
||||
|
||||
cublasHandle_t defaultCublas;
|
||||
|
||||
curandState *defaultRndState;
|
||||
int defaultRndSet;
|
||||
|
||||
public:
|
||||
|
||||
CudaBase();
|
||||
|
||||
~CudaBase();
|
||||
|
||||
/**
|
||||
* Init cuda random number (cuRand) states.
|
||||
* Create an array of type curandState with "size" elements on the GPU
|
||||
* and create a curandState with different seed for each array entry.
|
||||
* Return success or error code
|
||||
*/
|
||||
int cuda_createCurandStates(int size);
|
||||
|
||||
/**
|
||||
* Delete curandState.
|
||||
* Delete curandState array on the GPU and free memory.
|
||||
* Return success or error code
|
||||
*/
|
||||
int cuda_deleteCurandStates();
|
||||
|
||||
/** Get a pointer to curand states
|
||||
*
|
||||
*/
|
||||
curandState* cuda_getCurandStates();
|
||||
|
||||
/**
|
||||
* Create a cuda stream and set streamId to the index referring to this stream.
|
||||
* Return success or error code
|
||||
*/
|
||||
int cuda_createStream(int &streamId);
|
||||
|
||||
/**
|
||||
* add existing cuda stream to the list.
|
||||
* Return: success or error code.
|
||||
*/
|
||||
int cuda_addStream(cudaStream_t tmpStream, int &streamId);
|
||||
|
||||
/**
|
||||
* delete cuda stream
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_deleteStream(int id);
|
||||
|
||||
/**
|
||||
* delete all streams
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_deleteStreams();
|
||||
|
||||
/**
|
||||
* set stream to use
|
||||
* success or error code
|
||||
*/
|
||||
int cuda_setStream(int id);
|
||||
|
||||
/**
|
||||
* Info: get stream that is used
|
||||
* Return: id of the current stream
|
||||
*/
|
||||
int cuda_getStreamId();
|
||||
|
||||
/**
|
||||
* Info: reset to default stream
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_defaultStream();
|
||||
|
||||
/**
|
||||
* Info: get number of streams
|
||||
* Return: number of streams
|
||||
*/
|
||||
int cuda_numberOfStreams();
|
||||
|
||||
/**
|
||||
* Info: get stream
|
||||
* Return: stream
|
||||
*/
|
||||
cudaStream_t cuda_getStream(int id);
|
||||
|
||||
/**
|
||||
* Get default cuBLAS handle
|
||||
*/
|
||||
cublasHandle_t cuda_getCublas();
|
||||
|
||||
/**
|
||||
* Info: get information on cuda devices
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_getDevices();
|
||||
|
||||
/** Get CUDA device count.
|
||||
* Sets the number of devices on the platform that can use CUDA.
|
||||
* Returns DKS_SUCCESS
|
||||
*/
|
||||
int cuda_getDeviceCount(int &ndev);
|
||||
|
||||
/** Get the name of the device.
|
||||
* Query the device properties of the used device and set the string device_name
|
||||
*/
|
||||
int cuda_getDeviceName(std::string &device_name);
|
||||
|
||||
/** Set CUDA device to use.
|
||||
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR
|
||||
*/
|
||||
int cuda_setDevice(int device);
|
||||
|
||||
/** Get unique devices
|
||||
* Get array of indices with the unique CUDA devices available on the platform
|
||||
*/
|
||||
int cuda_getUniqueDevices(std::vector<int> &devices);
|
||||
|
||||
/**
|
||||
* Info: init device
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_setUp();
|
||||
|
||||
/**
|
||||
* Info: allocate memory on cuda device
|
||||
* Return: pointer to memory object
|
||||
*/
|
||||
void * cuda_allocateMemory(size_t size, int &ierr);
|
||||
|
||||
/**
 * Info: allocate host memory for "size" elements of type T with cudaMallocHost
 * Return: DKS_SUCCESS, or DKS_ERROR if the allocation fails
 */
template<typename T>
int cuda_allocateHostMemory(T *&ptr, size_t size) {
  const size_t bytes = sizeof(T)*size;
  const cudaError status = cudaMallocHost((void**)&ptr, bytes);
  return (status == cudaSuccess) ? DKS_SUCCESS : DKS_ERROR;
}
|
||||
|
||||
/**
 * Info: blocking copy of "size" bytes from host memory in_data to
 * device memory at mem_ptr + offset (offset counts elements of T)
 * Return: success or error code
 */
template<typename T>
int cuda_writeData(T * mem_ptr, const void * in_data, size_t size, int offset = 0) {
  T *destination = mem_ptr + offset;
  cudaError cerror = cudaMemcpy(destination, in_data, size, cudaMemcpyHostToDevice);

  if (cerror == cudaSuccess)
    return DKS_SUCCESS;

  DEBUG_MSG("Error copying data to device, cuda error: " << cerror);
  return DKS_ERROR;
}
|
||||
|
||||
/**
 * Info: write data asynchronously.
 * Copies "size" bytes from host memory in_data to device memory at
 * mem_ptr + offset (offset counts elements of T) on the stream streamId.
 * With streamId == -1 the blocking cuda_writeData is used instead and its
 * status is forwarded to the caller.
 * Return: DKS_SUCCESS, or DKS_ERROR if the stream id is not a valid index
 * or the copy could not be issued
 */
template<typename T>
int cuda_writeDataAsync(T *mem_ptr, const void *in_data, size_t size, int streamId = -1, int offset = 0) {

  //if default stream or no stream specified, use default write method
  //and report its result instead of unconditionally returning success
  if (streamId == -1)
    return cuda_writeData(mem_ptr, in_data, size, offset);

  //reject ids outside [0, number of streams), including negative ids other than -1
  if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
    DEBUG_MSG("Error invalid stream id: " << streamId);
    return DKS_ERROR;
  }

  //call async write
  cudaError cerror = cudaMemcpyAsync(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice,
                                     cuda_getStream(streamId));

  if (cerror != cudaSuccess) {
    DEBUG_MSG("Error async data copy, cuda error: " << cerror);
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
 * Info: blocking copy of "size" bytes from device memory at
 * mem_ptr + offset (offset counts elements of T) to host memory out_data
 * Return: success or error code
 */
template<typename T>
int cuda_readData(const T * mem_ptr, void * out_data, size_t size, int offset = 0) {
  const T *source = mem_ptr + offset;
  cudaError cerror = cudaMemcpy(out_data, source, size, cudaMemcpyDeviceToHost);

  if (cerror == cudaSuccess)
    return DKS_SUCCESS;

  DEBUG_MSG("Error reading data from device");
  return DKS_ERROR;
}
|
||||
|
||||
/**
 * Info: read data async from device memory.
 * Copies "size" bytes from device memory at mem_ptr + offset (offset counts
 * elements of T) to host memory out_data. With streamId == -1 the copy is
 * issued on stream 0, otherwise on the stream with index streamId.
 * Return: DKS_SUCCESS, or DKS_ERROR if the stream id is not a valid index
 * or the copy could not be issued
 */
template<typename T>
int cuda_readDataAsync(const T *mem_ptr, void *out_data, size_t size, int streamId = -1, int offset = 0) {
  cudaError cerror;

  if (streamId == -1) {
    cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, 0);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error async read from devie default stream");
      return DKS_ERROR;
    }
    return DKS_SUCCESS;
  }

  //reject ids outside [0, number of streams), including negative ids other than -1
  if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
    DEBUG_MSG("Error invalid stream id: " << streamId);
    return DKS_ERROR;
  }

  cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost,
                           cuda_getStream(streamId));
  if (cerror != cudaSuccess) {
    DEBUG_MSG("Error async read from device, cuda error: " << cerror);
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Info: free memory on device
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_freeMemory(void * mem_ptr);
|
||||
|
||||
/**
|
||||
* Info: free page locked memory on host
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_freeHostMemory(void * mem_ptr);
|
||||
|
||||
/**
 * Info: allocate "size" bytes of device memory and, if the allocation
 * succeeded, write in_data to it (push). ierr receives the status of the
 * allocation or, when the allocation succeeded, of the write.
 * Return: pointer to memory object
 */
template<typename T>
void * cuda_pushData(const void * in_data, size_t size, int &ierr) {
  void *device_mem = cuda_allocateMemory(size, ierr);

  //skip the copy when the allocation already failed
  if (ierr != DKS_SUCCESS)
    return device_mem;

  ierr = cuda_writeData((T*)device_mem, in_data, size);
  return device_mem;
}
|
||||
|
||||
/**
 * Info: read "size" bytes from device memory into out_data and, if the
 * read succeeded, free the device memory (pull). ierr receives the status
 * of the last operation that was attempted.
 * Return: success or error code
 */
template<typename T>
int cuda_pullData(T * mem_ptr, void * out_data, size_t size, int &ierr) {
  ierr = cuda_readData(mem_ptr, out_data, size);

  //memory is kept when the read failed
  if (ierr != DKS_SUCCESS)
    return DKS_ERROR;

  ierr = cuda_freeMemory(mem_ptr);
  return (ierr == DKS_SUCCESS) ? DKS_SUCCESS : DKS_ERROR;
}
|
||||
|
||||
/**
|
||||
* Info: execute function
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_executeFunction();
|
||||
|
||||
/**
|
||||
* Info: clean up
|
||||
* Return: success or error code
|
||||
*/
|
||||
int cuda_cleanUp();
|
||||
|
||||
/**
 * Info: sync cuda device (calls cudaDeviceSynchronize)
 * Return: DKS_SUCCESS, or DKS_ERROR if the synchronization reports an error
 */
int cuda_syncDevice() {
  //the status is checked so that a failed synchronization is not reported as success
  cudaError cerror = cudaDeviceSynchronize();
  if (cerror != cudaSuccess) {
    DEBUG_MSG("Error synchronizing device, cuda error: " << cerror);
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
 * Page-lock host memory: registers size*sizeof(T) bytes starting at ptr
 * with cudaHostRegister using the cudaHostRegisterPortable flag.
 * Return: success or error code
 */
template<typename T>
int cuda_hostRegister(T *ptr, int size) {
  int cerr = cudaHostRegister(ptr, size*sizeof(T), cudaHostRegisterPortable);

  if (cerr != cudaSuccess) {
    DEBUG_MSG("Host memroy was not page locked");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
 * Release page locked memory: unregisters ptr with cudaHostUnregister.
 * Return: success or error code
 */
template<typename T>
int cuda_hostUnregister(T *ptr) {
  const int cerr = cudaHostUnregister(ptr);
  return (cerr == cudaSuccess) ? DKS_SUCCESS : DKS_ERROR;
}
|
||||
|
||||
/**
 * Info: print device memory info (total, used, avail) to stdout,
 * values are divided by 1000000.0 and labelled MB
 * Return: success or error code
 */
int cuda_memInfo() {
  const double mb = 1000000.0;
  size_t free_bytes;
  size_t total_bytes;

  int ierr = cudaMemGetInfo(&free_bytes, &total_bytes);
  if (ierr != cudaSuccess) {
    DEBUG_MSG("Device mem info could not be obtained!");
    return DKS_ERROR;
  }

  std::cout << "Device memory info, total: " << total_bytes / mb << "MB,\t"
            << "used: " << (total_bytes - free_bytes) / mb << "MB,\t"
            << "avail: " << free_bytes / mb << "MB" << std::endl;

  return DKS_SUCCESS;
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
287
src/CUDA/CudaChiSquare.cu
Normal file
287
src/CUDA/CudaChiSquare.cu
Normal file
@ -0,0 +1,287 @@
|
||||
#include "CudaChiSquare.cuh"
|
||||
|
||||
//simple kernel version: one block per (bin, sensor) pair,
//blockIdx.x selects the bin and blockIdx.y the sensor;
//each sensor uses four parameters starting at par[2 + 4*sensor]
__global__ void kernelPHistoTFFcn(double *data, double *par, double *chisq,
                                  double fTimeResolution, double fRebin, int n) {

  const int bin = blockIdx.x;
  const int sensor = blockIdx.y;
  const int idx = sensor * n + bin;
  const int pbase = sensor * 4;

  const double tau = 2.197019;
  const double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
  const double time = dt0 + fTimeResolution * fRebin * bin;
  const double w = par[0]*0.08516155035269027;
  const double ldata = data[idx];

  // 1.74532925199432955e-2 = pi/180
  const double theo = par[2 + pbase] * exp(-time/tau) * (1.0 + par[3 + pbase] * exp(-0.5 * pow(par[1]*time,2.0) ) * cos(w * time+par[4+pbase] * 1.74532925199432955e-2) ) + par[5+pbase];

  //bins without data contribute theo^2 to avoid the division by zero
  if (ldata == 0.0)
    chisq[idx] = theo * theo;
  else
    chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;

}
|
||||
|
||||
//one block per bin (blockIdx.x); the sensor independent factors are computed
//once and reused while looping over all s sensors
__global__ void kernelPHistoTFFcn_2(double *data, double *par, double *chisq,
                                    double fTimeResolution, double fRebin, int n, int s) {

  const int bin = blockIdx.x;

  const double tau = 2.197019;
  const double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
  const double time = dt0 + fTimeResolution * fRebin * bin;
  const double w = par[0]*0.08516155035269027;
  const double tt = exp(-time/tau);
  const double pp = exp(-0.5 * par[1] * time * par[1] * time);
  const double wt = w * time;

  for (int sensor = 0; sensor < s; sensor++) {
    const int idx = sensor * n + bin;
    const int pbase = sensor * 4;
    const double ldata = data[idx];

    // 1.74532925199432955e-2 = pi/180
    const double theo = par[2 + pbase] * tt * (1.0 + par[3 + pbase] * pp * cos(wt + par[4+pbase] * 1.74532925199432955e-2) ) + par[5+pbase];

    //bins without data contribute theo^2 to avoid the division by zero
    if (ldata == 0.0)
      chisq[idx] = theo * theo;
    else
      chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
  }

}
|
||||
|
||||
#define TAU 2.197019
|
||||
|
||||
/**
 * Info: chi square term per bin, parameters staged in shared memory.
 * Each thread handles one bin index j < length and loops over all sensors,
 * writing chisq[i * length + j]. Sensor i uses p[2+4i] .. p[5+4i].
 * The kernel must be launched with at least numpar*sizeof(double) bytes of
 * dynamic shared memory.
 */
__global__ void kernelPHistoTFFcn_3(double *data, double *par, double *chisq,
                                    double fTimeResolution, double fRebin,
                                    int length, int sensors, int numpar) {

  //define shared variable for parameters
  extern __shared__ double p[];

  //calc global id
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the loop advances by
  //blockDim.x so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = threadIdx.x; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {

    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
    double time = dt0 + fTimeResolution * fRebin * j;
    double w = p[0]*0.08516155035269027;
    double tt = exp(-time/TAU);
    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
    double wt = w * time;

    int idx;
    double ldata, theo;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      ldata = data[idx];

      // 1.74532925199432955e-2 = pi/180
      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];

      //bins without data contribute theo^2 to avoid the division by zero
      if (ldata != 0.0)
        chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
      else
        chisq[idx] = theo * theo;
    }
  }

}
|
||||
|
||||
/**
 * Info: per bin result term for the single Gauss TF theory function,
 * parameters staged in shared memory.
 * Each thread handles one bin index j < length and loops over all sensors.
 * Bins with j < t0[i] + fGoodBinOffset/fRebin get result 0. Sensor i uses
 * p[2+4i] .. p[5+4i]. The kernel must be launched with at least
 * numpar*sizeof(double) bytes of dynamic shared memory.
 */
__global__ void kernelSingleGaussTF(double *data, unsigned int *t0, double *par, double *result,
                                    double fTimeResolution, double fRebin, double fGoodBinOffset,
                                    int length, int sensors, int numpar)
{

  //define shared variable for parameters
  extern __shared__ double p[];

  //calc global id
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the loop advances by
  //blockDim.x so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = threadIdx.x; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {
    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    //read from the shared copy like the other kernels (same value as par[0])
    double w1 = p[0]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0 + fTimeResolution * fRebin* (j - lft0);
        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
               *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
        // 1.74532925199432955e-2 = pi/180

        //the log term is only evaluated when both values are clearly non-zero
        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }

}
|
||||
|
||||
/**
 * Info: per bin result term for the double Lorentz TF theory function,
 * parameters staged in shared memory.
 * Each thread handles one bin index j < length and loops over all sensors.
 * Bins with j < t0[i] + fGoodBinOffset/fRebin get result 0. p[0] .. p[3] are
 * shared by all sensors, sensor i uses p[4+5i] .. p[8+5i]. The kernel must be
 * launched with at least numpar*sizeof(double) bytes of dynamic shared memory.
 */
__global__ void kernelDoubleLorentzTF(double *data, unsigned int *t0, double *par, double *result,
                                      double fTimeResolution, double fRebin, double fGoodBinOffset,
                                      int length, int sensors, int numpar)
{

  //define shared variable for parameters
  extern __shared__ double p[];

  //calc global id
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the loop advances by
  //blockDim.x so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = threadIdx.x; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {
    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    double w1 = p[0]*0.08516155035269027;
    double w2 = p[2]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {

      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0+fTimeResolution*fRebin*(j-lft0);

        theo = p[4+i*5]*exp(-time/TAU)*
               (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
                cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
                (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
                cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5];
        // 1.74532925199432955e-2 = pi/180

        //the log term is only evaluated when both values are clearly non-zero
        if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }
}
|
||||
|
||||
|
||||
|
||||
/**
 * Info: launch kernelPHistoTFFcn_3 with 128 threads per block and
 * numpar*sizeof(double) bytes of shared memory, then add up the
 * sensors*length entries of mem_chisq with cublasDasum into result
 * Return: DKS_SUCCESS, or DKS_ERROR if the cublas call fails
 */
int CudaChiSquare::cuda_PHistoTFFcn(void *mem_data, void *mem_ptr, void *mem_chisq,
                                    double fTimeResolution, double fRebin,
                                    int sensors, int length, int numpar,
                                    double &result)
{
  const int threads = 128;
  const int blocks = length / threads + 1;

  double *d_data = (double*)mem_data;
  double *d_par = (double*)mem_ptr;
  double *d_chisq = (double*)mem_chisq;

  kernelPHistoTFFcn_3<<<blocks, threads, numpar*sizeof(double) >>>(d_data, d_par, d_chisq,
                                                                   fTimeResolution,
                                                                   fRebin, length,
                                                                   sensors, numpar);

  cublasStatus_t status = cublasDasum(m_base->cuda_getCublas(), sensors*length, d_chisq, 1, &result);
  if (status == CUBLAS_STATUS_SUCCESS)
    return DKS_SUCCESS;

  DEBUG_MSG("cublas asum failed");
  return DKS_ERROR;
}
|
||||
|
||||
|
||||
/**
 * Info: launch kernelSingleGaussTF with 128 threads per block and
 * numpar*sizeof(double) bytes of shared memory, add up the sensors*length
 * entries of mem_result with cublasDasum and store twice that sum in result
 * Return: DKS_SUCCESS, or DKS_ERROR if the cublas call fails (result is then
 * not doubled and should not be used)
 */
int CudaChiSquare::cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                                      double fTimeResolution, double fRebin, double fGoodBinOffset,
                                      int sensors, int length, int numpar,
                                      double &result)
{

  int threads = 128;
  int blocks = length / threads + 1;

  kernelSingleGaussTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
                                                                    (unsigned int*)mem_t0,
                                                                    (double*)mem_par,
                                                                    (double*)mem_result,
                                                                    fTimeResolution,
                                                                    fRebin,
                                                                    fGoodBinOffset,
                                                                    length, sensors, numpar);

  //check the reduction like cuda_PHistoTFFcn does instead of ignoring its status
  cublasStatus_t status;
  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("cublas asum failed");
    return DKS_ERROR;
  }

  result = 2.0 * result;

  return DKS_SUCCESS;

}
|
||||
|
||||
|
||||
/**
 * Info: launch kernelDoubleLorentzTF with 128 threads per block and
 * numpar*sizeof(double) bytes of shared memory, add up the sensors*length
 * entries of mem_result with cublasDasum and store twice that sum in result
 * Return: DKS_SUCCESS, or DKS_ERROR if the cublas call fails (result is then
 * not doubled and should not be used)
 */
int CudaChiSquare::cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                                        double fTimeResolution, double fRebin, double fGoodBinOffset,
                                        int sensors, int length, int numpar,
                                        double &result)
{

  int threads = 128;
  int blocks = length / threads + 1;

  kernelDoubleLorentzTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
                                                                      (unsigned int*)mem_t0,
                                                                      (double*)mem_par,
                                                                      (double*)mem_result,
                                                                      fTimeResolution,
                                                                      fRebin,
                                                                      fGoodBinOffset,
                                                                      length, sensors, numpar);

  //check the reduction like cuda_PHistoTFFcn does instead of ignoring its status
  cublasStatus_t status;
  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("cublas asum failed");
    return DKS_ERROR;
  }

  result = 2.0 * result;

  return DKS_SUCCESS;

}
|
59
src/CUDA/CudaChiSquare.cuh
Normal file
59
src/CUDA/CudaChiSquare.cuh
Normal file
@ -0,0 +1,59 @@
|
||||
#ifndef H_CUDA_CHISQUARE
|
||||
#define H_CUDA_CHISQUARE
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
class CudaChiSquare {

private:

  //true when m_base was created by this object and has to be deleted by it
  bool base_create;
  CudaBase *m_base;

public:

  /**
   * Constructor which gets CudaBase as argument,
   * the passed object is used but not deleted by this class
   */
  CudaChiSquare(CudaBase *base) : base_create(false), m_base(base) {}

  /* constructor which creates its own CudaBase */
  CudaChiSquare() : base_create(true), m_base(new CudaBase()) {}

  /* destructor, deletes the CudaBase only if it was created by this object */
  ~CudaChiSquare() {
    if (!base_create)
      return;

    delete m_base;
  }

  /* PHistoTFFcn calculation */
  int cuda_PHistoTFFcn(void * mem_data, void * mem_par, void * mem_chisq,
                       double fTimeResolution, double fRebin,
                       int sensors, int length, int numpar,
                       double &result);

  /* single Gauss TF calculation */
  int cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                         double fTimeResolution, double fRebin, double fGoodBinOffset,
                         int sensors, int length, int numpar,
                         double &result);

  /* double Lorentz TF calculation */
  int cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                           double fTimeResolution, double fRebin, double fGoodBinOffset,
                           int sensors, int length, int numpar,
                           double &result);

};
|
||||
|
||||
#endif
|
313
src/CUDA/CudaChiSquareRuntime.cu
Normal file
313
src/CUDA/CudaChiSquareRuntime.cu
Normal file
@ -0,0 +1,313 @@
|
||||
#include "CudaChiSquareRuntime.cuh"
|
||||
|
||||
//constructor using an existing CudaBase (not deleted by this object), creates context
CudaChiSquareRuntime::CudaChiSquareRuntime(CudaBase *base) {
  m_base = base;
  base_create = false;

  ptx_m = NULL;
  numBlocks_m = -1;
  blockSize_m = BLOCK_SIZE;

  setUpContext();
}
|
||||
|
||||
//constructor creating its own CudaBase (deleted in the destructor), creates context
CudaChiSquareRuntime::CudaChiSquareRuntime() {
  m_base = new CudaBase();
  base_create = true;

  ptx_m = NULL;
  numBlocks_m = -1;
  blockSize_m = BLOCK_SIZE;

  setUpContext();
}
|
||||
|
||||
//free resources
CudaChiSquareRuntime::~CudaChiSquareRuntime() {
  //release the temporary device buffers and the cublas handle first,
  //they were created after setUpContext() and must not outlive the context
  freeChiSquare();

  delete[] ptx_m;
  ptx_m = NULL;

  cuCtxDestroy(context_m);

  //the base is only deleted when it was created by this object
  if (base_create)
    delete m_base;
}
|
||||
|
||||
//init the driver API, create a context on device 0 and reset the
//N0, tau and bkg values to 1.0; failures are reported with DEBUG_MSG
void CudaChiSquareRuntime::setUpContext() {
  //each step is only attempted when the previous one succeeded
  CUresult cuStatus = cuInit(0);
  if (cuStatus == CUDA_SUCCESS)
    cuStatus = cuDeviceGet(&cuDevice_m, 0);
  if (cuStatus == CUDA_SUCCESS)
    cuStatus = cuCtxCreate(&context_m, 0, cuDevice_m);

  if (cuStatus != CUDA_SUCCESS) {
    DEBUG_MSG("Failed to set up CUDA context, driver error: " << cuStatus);
    //leave a defined value behind for the cuCtxDestroy call in the destructor
    context_m = NULL;
  }

  N0_m = 1.0;
  tau_m = 1.0;
  bkg_m = 1.0;

  initDone_m = false;
}
|
||||
|
||||
//build program string: content of the kernel file found under OPENCL_KERNELS
//followed by the definition of fTheory that returns the given expression.
//If the kernel file can not be opened a message is logged and only the
//fTheory definition is returned.
std::string CudaChiSquareRuntime::buildProgram(std::string function) {

  //std::string grows as needed, no fixed size path buffer and nothing to free
  const std::string kernel_file =
    std::string(OPENCL_KERNELS) + "CUDA/NVRTCKernels/CudaChiSquareKernel.cu";

  std::string kernel_string;

  //read kernels from file
  FILE *fp = fopen(kernel_file.c_str(), "rb");
  if (!fp) {
    DEBUG_MSG("Can't open kernel file" << kernel_file);
  } else {
    //get file size
    fseek(fp, 0, SEEK_END);
    long fsize = ftell(fp);
    rewind(fp);

    //ftell returns -1 on error, in that case nothing is read
    if (fsize > 0) {
      kernel_string.resize(fsize);
      size_t nread = fread(&kernel_string[0], 1, fsize, fp);
      //keep only the bytes that were really read
      kernel_string.resize(nread);
    }

    fclose(fp);
  }

  return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter;
}
|
||||
|
||||
//compile the program built from the function string with NVRTC, keep the
//resulting PTX in ptx_m and load it as module_m. With mlh == true the
//program is additionally compiled with -DMLH.
//Returns DKS_SUCCESS or DKS_ERROR, the NVRTC program is destroyed on every path.
int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {

  //build program string
  std::string cudaProg = buildProgram(function);

  //create program
  nvrtcProgram prog;
  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu",
                                                0, NULL, NULL);
  if (createResult != NVRTC_SUCCESS) {
    DEBUG_MSG("Create program failed!");
    return DKS_ERROR;
  }

  //compile program, the second option is only passed when mlh is set
  const char *opts[] = {"-fmad=false", "-DMLH"};
  int numopts = mlh ? 2 : 1;

  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);

  if (compileResults != NVRTC_SUCCESS) {
    //obtain compilation log
    size_t logSize = 0;
    nvrtcGetProgramLogSize(prog, &logSize);
    //one spare byte so that the log is a valid string even if no log is written
    char *log = new char[logSize + 1];
    log[0] = '\0';
    nvrtcGetProgramLog(prog, log);
    DEBUG_MSG("Compilation failed!");
    DEBUG_MSG(log);
    delete[] log;

    nvrtcDestroyProgram(&prog);
    return DKS_ERROR;
  }

  DEBUG_MSG("Compilation successfull!");

  //obtain PTX from program, a PTX from an earlier compilation is dropped
  delete[] ptx_m;
  ptx_m = NULL;

  size_t ptxSize = 0;
  nvrtcGetPTXSize(prog, &ptxSize);
  ptx_m = new char[ptxSize];
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);

  //the program is not used after the PTX was copied out
  nvrtcDestroyProgram(&prog);

  if (nvrtcPTXResult != NVRTC_SUCCESS) {
    DEBUG_MSG("Get PTX failed!");
    return DKS_ERROR;
  }

  //load module from ptx
  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0);
  if (loadResult != CUDA_SUCCESS) {
    DEBUG_MSG("Load module from ptx failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
//launch the kernel selected by fitType from the loaded module and add up the
//first "length" entries of mem_chisq_m with cublasDasum into result.
//The kernel is launched with blockSize_m threads per block and numBlocks_m
//blocks (length / threads + 1 blocks when numBlocks_m is negative).
//Returns DKS_SUCCESS or DKS_ERROR (init not done, unknown or unimplemented
//fit type, kernel not found, launch failure, cublas failure).
int CudaChiSquareRuntime::launchChiSquare(int fitType,
                                          void *mem_data, void *mem_err, int length,
                                          int numpar, int numfunc, int nummap,
                                          double timeStart, double timeStep, double &result)
{

  if (!initDone_m) {
    DEBUG_MSG("ChiSquare init needs to be called at some point!");
    return DKS_ERROR;
  }

  int threads = blockSize_m;
  int blocks = (numBlocks_m < 0) ? length / threads + 1 : numBlocks_m;

  //kernel arguments are kept in a local array, so nothing has to be freed on
  //the error paths; the first twelve entries are the same for all kernels
  void *args[15] = { &mem_data, &mem_err, &mem_param_m, &mem_chisq_m,
                     &mem_map_m, &mem_func_m, &length, &numpar,
                     &numfunc, &nummap, &timeStart, &timeStep,
                     NULL, NULL, NULL };

  const char *kernelName = NULL;
  if (fitType == FITTYPE_SINGLE_HISTO) {
    kernelName = "kernelChiSquareSingleHisto";
    args[12] = &tau_m;
    args[13] = &N0_m;
    args[14] = &bkg_m;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    kernelName = "kernelChiSquareAsymmetry";
    args[12] = &alpha_m;
    args[13] = &beta_m;
  } else if (fitType == FITTYPE_MU_MINUS) {
    DEBUG_MSG("Not Yet Implemented!");
    return DKS_ERROR;
  } else {
    DEBUG_MSG("Undefined Fit Type!");
    return DKS_ERROR;
  }

  CUresult cuStatus = cuModuleGetFunction(&kernel_m, module_m, kernelName);
  if (cuStatus != CUDA_SUCCESS) {
    DEBUG_MSG("Failed to get function from module!");
    return DKS_ERROR;
  }

  //shared memory holds the parameters, the functions and the maps
  cuStatus = cuLaunchKernel(kernel_m,
                            blocks, 1, 1,
                            threads, 1, 1,
                            (numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL,
                            args, 0);

  if (cuStatus != CUDA_SUCCESS) {
    std::string msg;
    msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")";
    DEBUG_MSG(msg);
    const char *desc;
    cuGetErrorString(cuStatus, &desc);
    std::cout << desc << std::endl;
    return DKS_ERROR;
  }

  cublasStatus_t status;
  status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("cublas sum failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;

}
|
||||
|
||||
//copy numparams doubles from params to the device buffer mem_param_m
int CudaChiSquareRuntime::writeParams(const double *params, int numparams) {
  double *device_params = (double*)mem_param_m;
  return m_base->cuda_writeData(device_params, params, sizeof(double)*numparams);
}
|
||||
|
||||
//copy numfunc doubles from func to the device buffer mem_func_m
int CudaChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  double *device_func = (double*)mem_func_m;
  return m_base->cuda_writeData(device_func, func, sizeof(double)*numfunc);
}
|
||||
|
||||
//copy nummap ints from map to the device buffer mem_map_m
int CudaChiSquareRuntime::writeMap(const int *map, int nummap) {
  int *device_map = (int*)mem_map_m;
  return m_base->cuda_writeData(device_map, map, sizeof(int)*nummap);
}
|
||||
|
||||
//create the cublas handle and allocate the temporary device buffers for
//chisq (size_data doubles), params (size_param doubles), functions
//(size_func doubles) and maps (size_map ints). An earlier initialization is
//freed first.
//Returns DKS_SUCCESS only if the handle and all four buffers were created.
int CudaChiSquareRuntime::initChiSquare(int size_data, int size_param, int size_func,
                                        int size_map) {

  if (initDone_m) {
    DEBUG_MSG("Reinitializing ChiSquare");
    freeChiSquare();
  }

  //init cublas, without the handle the chi square can not be summed up
  cublasStatus_t status = cublasCreate(&defaultCublasRT);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS create default handle failed!");
    return DKS_ERROR;
  }

  //allocate temporary memory, every allocation has its own status so that
  //a failure is not hidden by a later successful allocation
  int err_chisq = DKS_ERROR;
  int err_param = DKS_ERROR;
  int err_func = DKS_ERROR;
  int err_map = DKS_ERROR;

  mem_chisq_m = m_base->cuda_allocateMemory(size_data*sizeof(double), err_chisq);
  mem_param_m = m_base->cuda_allocateMemory(size_param*sizeof(double), err_param);
  mem_func_m = m_base->cuda_allocateMemory(size_func*sizeof(double), err_func);
  mem_map_m = m_base->cuda_allocateMemory(size_map*sizeof(int), err_map);

  //set as before even on failure, so that freeChiSquare releases the handle
  //and the buffers
  initDone_m = true;

  if (err_chisq != DKS_SUCCESS || err_param != DKS_SUCCESS ||
      err_func != DKS_SUCCESS || err_map != DKS_SUCCESS) {
    DEBUG_MSG("ChiSquare temporary memory allocation failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
//destroy the cublas handle and free the temporary device buffers allocated by
//initChiSquare. All resources are released even if one of the steps fails.
//Returns DKS_ERROR if nothing was initialized or if any step failed,
//DKS_SUCCESS otherwise.
int CudaChiSquareRuntime::freeChiSquare() {

  //nothing to free, reported as error like before
  if (!initDone_m)
    return DKS_ERROR;

  int ierr = DKS_SUCCESS;

  //delete cublas, a failure is remembered but does not stop the clean up
  cublasStatus_t status = cublasDestroy(defaultCublasRT);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS delete default handle failed!");
    ierr = DKS_ERROR;
  }

  //free memory
  if (m_base->cuda_freeMemory(mem_chisq_m) != DKS_SUCCESS)
    ierr = DKS_ERROR;
  if (m_base->cuda_freeMemory(mem_param_m) != DKS_SUCCESS)
    ierr = DKS_ERROR;
  if (m_base->cuda_freeMemory(mem_func_m) != DKS_SUCCESS)
    ierr = DKS_ERROR;
  if (m_base->cuda_freeMemory(mem_map_m) != DKS_SUCCESS)
    ierr = DKS_ERROR;

  initDone_m = false;

  return ierr;
}
|
114
src/CUDA/CudaChiSquareRuntime.cuh
Normal file
114
src/CUDA/CudaChiSquareRuntime.cuh
Normal file
@ -0,0 +1,114 @@
|
||||
#ifndef H_CUDA_CHISQUARE_RUNTIME
|
||||
#define H_CUDA_CHISQUARE_RUNTIME
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <nvrtc.h>
|
||||
|
||||
#include "../Algorithms/ChiSquareRuntime.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
const std::string cudaFunctHeader = "__device__ double fTheory(double t, double *p, double *f, int *m) {";
|
||||
|
||||
const std::string cudaFunctFooter = "}\n";
|
||||
|
||||
/**
 * CUDA implementation of the ChiSquareRuntime interface.
 * Holds the driver-API handles (device, context, module, kernel) and a cuBLAS handle
 * used by the chi-square calculation whose fit function is compiled at run time.
 */
class CudaChiSquareRuntime : public ChiSquareRuntime {

private:

  // true when this object allocated m_base itself — presumably so the destructor
  // knows whether to release it (definition not visible here)
  bool base_create;
  CudaBase *m_base;

  // CUDA driver API handles
  CUdevice cuDevice_m;
  CUcontext context_m;
  CUmodule module_m;
  CUfunction kernel_m;

  cublasHandle_t defaultCublasRT;

  /** Device set-up.
   * Creates the context and initializes the device for run-time compilation.
   */
  void setUpContext();

  /** Insert the given function string into the kernel source.
   * Returns the resulting program string.
   */
  std::string buildProgram(std::string function);

public:

  /** Construct on top of an existing CudaBase object. */
  CudaChiSquareRuntime(CudaBase *base);

  /** Default constructor; initializes the CUDA device. */
  CudaChiSquareRuntime();

  /** Destructor. */
  ~CudaChiSquareRuntime();

  /** Compile the program and save the ptx.
   * The function string is added to the calcFunction kernel and the program is compiled.
   * The function must be a valid C math expression; parameters can be addressed
   * in the form par[map[idx]].
   */
  int compileProgram(std::string function, bool mlh = false);

  /** Launch the selected kernel from the compiled code.
   * The value is returned through the result reference.
   */
  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
                      int numpar, int numfunc, int nummap,
                      double timeStart, double timeStep,
                      double &result);

  /** Copy parameters from a host double array to mem_param_m on the device. */
  int writeParams(const double *params, int numparams);

  /** Copy function values from a host double array to mem_func_m on the device. */
  int writeFunc(const double *func, int numfunc);

  /** Copy map values from a host int array to mem_map_m on the device. */
  int writeMap(const int *map, int nummap);

  /** Allocate the temporary memory needed for chi square.
   * size_data needs to be the maximum number of elements in any data set that will be
   * used for calculations. size_param, size_func and size_map are the maximum number of
   * parameters, functions and maps used in calculations.
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);

  /** Free the chi-square temporary memory and the memory for params, functions and maps. */
  int freeChiSquare();

  /** Check whether the CUDA device can run the chi-square kernel.
   * Redundant: every CUDA device that supports run-time compilation also supports
   * double precision, and there are no other requirements. Always reports success
   * and leaves threadsPerBlock untouched.
   */
  int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
    return DKS_SUCCESS;
  }

};
|
||||
|
||||
#endif
|
728
src/CUDA/CudaCollimatorPhysics.cu
Normal file
728
src/CUDA/CudaCollimatorPhysics.cu
Normal file
@ -0,0 +1,728 @@
|
||||
#include "CudaCollimatorPhysics.cuh"
|
||||
|
||||
// Physical constants used by the collimator-physics kernels.
// NOTE(review): units are not stated in this file (M_P and eM_E look like rest masses
// in GeV, C like the speed of light in m/s) — confirm against OPAL's CollimatorPhysics.
//#define M_P 0.93827231e+00
#define M_P 0.93827204e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
//#define eM_E 0.51099906e-03
#define eM_E 0.51099892e-03
#define Z_P 1
// Parenthesized so the product expands as a single value in any expression
// (without the parentheses "x / K" would divide by 4.0 only and multiply by the rest).
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)

// Indices into the material parameter array passed to the kernels
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11

// Threads per block used by the host launchers, and number of entries in the parameter array
#define BLOCK_SIZE 128
#define NUMPAR 12
|
||||
|
||||
// Inner product of two 3-component vectors.
__device__ inline double dot(double3 &d1, double3 &d2) {
  const double sum = d1.x * d2.x + d1.y * d2.y + d1.z * d2.z;
  return sum;
}
|
||||
|
||||
// Returns true when z lies inside the material, i.e. in the half-open interval
// (par[POSITION], par[POSITION] + par[ZSIZE]].
__device__ inline bool checkHit(double &z, double *par) {
  const bool pastEntry = z > par[POSITION];
  const bool beforeExit = z <= par[POSITION] + par[ZSIZE];
  return pastEntry && beforeExit;
}
|
||||
|
||||
|
||||
/**
 * Apply one time step of energy loss to a particle inside the material.
 *
 * Eng is updated in place. pdead is set when the resulting energy is below 1E-4 or
 * the computed dEdx is positive. Each energy branch that executes draws one normally
 * distributed random number from state.
 *
 * NOTE(review): the two energy ranges are tested one after the other, so a particle
 * lifted to Eng >= 0.0006 by the first branch also runs the second branch in the same
 * call — confirm this matches the reference implementation.
 */
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{
  volatile double dEdx = 0.0;

  // relativistic factors derived from the kinetic energy
  volatile double gamma = (Eng + M_P) / M_P;
  volatile double gamma2 = gamma * gamma;
  double beta = sqrt(1.0 - 1.0 / gamma2);
  volatile double beta2 = beta * beta;

  // path length of this step and the quantities derived from it
  double deltas = par[DT_M] * beta * C;
  volatile double deltasrho = deltas * 100 * par[RHO_M];
  volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);

  // low-energy range: fit built from the A2..A5 material coefficients
  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    double Ts = (Eng * 1E6) / 1.0073;
    double epsLow = par[A2_C] * pow(Ts, 0.45);
    double epsHigh = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
    double eps = (epsLow * epsHigh) / (epsLow + epsHigh);

    dEdx = -eps / (1E21 * (par[A_M] / AVO) );

    double dE = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
    Eng = Eng + dE / 1E3;
  }

  // high-energy range
  if (Eng >= 0.0006) {
    double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));

    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
                       Tmax / par[I_M] / par[I_M]) - beta2);

    double dE = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
    Eng = Eng + dE / 1E3;
  }

  pdead = ((Eng<1E-4) || (dEdx>0));
}
|
||||
|
||||
/**
 * Rotate the momentum components (px, pz) by the angle thetacou in their plane and,
 * depending on coord, shift the matching position components (x, z).
 *
 * coord == 1: x gets the drift plus the xplane offset, z only the xplane offset.
 * coord == 2: as coord == 1, and z additionally gets the drift deltas * pz / normP.
 * any other value: positions are left untouched.
 *
 * par is not used by this function.
 */
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
                           double &normP, double &thetacou, double &deltas, int coord,
                           double *par)
{
  // angle of the momentum in the plane, shifted by quadrant
  double Psixz = atan(px/pz);
  if (px>=0 && pz>=0) {
    // angle used as computed
  } else if (px>0 && pz<0) {
    Psixz = Psixz + PI;
  } else if (px<0 && pz>0) {
    Psixz = Psixz + 2*PI;
  } else {
    Psixz = Psixz + PI;
  }

  double pxz = sqrt(px*px + pz*pz);

  if (coord == 1 || coord == 2) {
    x = x + deltas * px/normP + xplane*cos(Psixz);
    if (coord == 1)
      z = z - xplane * sin(Psixz);
    else
      z = z - xplane * sin(Psixz) + deltas * pz / normP;
  }

  px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
  pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
|
||||
|
||||
/**
 * Scatter a particle: position R and momentum P are modified in place, first in the
 * x-z plane and then in the y-z plane, using random numbers drawn from state.
 *
 * For each plane a scattering angle is drawn and redrawn until it lies within
 * 3.5 * theta0; with probability 0.0047 an additional large-angle rotation
 * (momentum only, coord = 0) is applied.
 */
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {

  double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
  double gamma = (Eng + M_P) / M_P;
  double normP = sqrt(dot(P, P));
  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
  double deltas = par[DT_M] * beta * C;

  double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));

  double z1, z2, thetacou;

  // x-direction: See Physical Review, "Multiple Scattering"
  do {
    z1 = curand_normal_double(&state);
    z2 = curand_normal_double(&state);
    thetacou = z2 * theta0;
  } while (fabs(thetacou) > 3.5 * theta0);

  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);

  double P2 = curand_uniform_double(&state);
  if (P2 < 0.0047) {
    double P3 = curand_uniform_double(&state);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = curand_uniform_double(&state);
    if (P4 > 0.5)
      thetaru = -thetaru;
    Rot(P.x, P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0, par);
  }

  // y-direction: See Physical Review, "Multiple Scattering"
  do {
    z1 = curand_normal_double(&state);
    z2 = curand_normal_double(&state);
    thetacou = z2 * theta0;
  } while (fabs(thetacou) > 3.5 * theta0);

  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2, par);

  P2 = curand_uniform_double(&state);
  if (P2 < 0.0047) {
    double P3 = curand_uniform_double(&state);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = curand_uniform_double(&state);
    if (P4 > 0.5)
      thetaru = -thetaru;
    Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0, par);
  }

}
|
||||
|
||||
|
||||
/**
 * Collimator physics for an array of particle structs (one thread per particle).
 *
 * Dynamic shared memory layout: NUMPAR doubles (copy of par) followed by one double3
 * position per thread of the block.
 *
 * Particles inside the material lose energy and are scattered; dead particles get
 * label -1. Particles outside the material are drifted by one step and get label -2.
 * The random state is written back only for particles that were inside the material.
 */
template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
                                        int numparticles)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  //carve the parameter copy and the per-thread positions out of shared memory
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double3 *R = (double3*)&smem[NUMPAR];

  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

  __syncthreads();

  //no barrier follows, so surplus threads can leave here
  if (idx >= numparticles)
    return;

  curandState s = state[idx];
  R[tid] = data[idx].Rincol;
  double3 P = data[idx].Pincol;

  bool pdead = false;
  volatile double sq = sqrt(1.0 + dot(P, P));

  if (!checkHit(R[tid].z, p)) {

    //outside the material: drift and mark the particle
    R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
    R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
    R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
    data[idx].label = -2;

  } else {

    double Eng = (sq - 1) * M_P;
    energyLoss(Eng, pdead, s, p);

    if (pdead) {
      data[idx].label = -1;
    } else {
      //rescale the momentum to the new energy, then scatter
      double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
      sq = sqrt(dot(P, P));

      P.x = P.x * ptot / sq;
      P.y = P.y * ptot / sq;
      P.z = P.z * ptot / sq;
      coulombScat(R[tid], P, s, p);

      data[idx].Pincol = P;
    }

    state[idx] = s;
  }

  data[idx].Rincol = R[tid];
}
|
||||
|
||||
/**
 * Collimator physics for particles stored as separate arrays (CUDA_PART2_SMALL).
 *
 * Uses statically sized shared memory: the block must have at least NUMPAR and at most
 * BLOCK_SIZE threads for the parameter copy and the position buffer to be valid.
 *
 * Same per-particle logic as kernelCollimatorPhysics, except that the random state is
 * written back for every processed particle.
 */
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
                                         curandState *state, int numparticles)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  //transfer params to shared memory
  __shared__ double p[NUMPAR];
  __shared__ double3 R[BLOCK_SIZE];

  if (tid < NUMPAR)
    p[tid] = par[tid];

  __syncthreads();

  //no barrier follows, so surplus threads can leave here
  if (idx >= numparticles)
    return;

  R[tid] = data.Rincol[idx];
  double3 P = data.Pincol[idx];
  curandState s = state[idx];

  double sq = sqrt(1.0 + dot(P, P));
  bool pdead = false;

  if (!checkHit(R[tid].z, p)) {

    //outside the material: drift and mark the particle
    R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
    R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
    R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;

    data.label[idx] = -2;

  } else {

    double Eng = (sq - 1) * M_P;
    energyLoss(Eng, pdead, s, p);

    if (pdead) {
      data.label[idx] = -1;
    } else {
      //rescale the momentum to the new energy, then scatter
      double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
      sq = sqrt(dot(P, P));
      P.x = P.x * ptot / sq;
      P.y = P.y * ptot / sq;
      P.z = P.z * ptot / sq;
      coulombScat(R[tid], P, s, p);

      data.Pincol[idx] = P;
    }

  }

  data.Rincol[idx] = R[tid];
  state[idx] = s;
}
|
||||
|
||||
|
||||
// Leave unitless coordinates: multiply every component of a by c.
inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x = a.x * c;
  a.y = a.y * c;
  a.z = a.z * c;
}
|
||||
|
||||
// Enter unitless coordinates: divide every component of a by c.
inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x = a.x / c;
  a.y = a.y / c;
  a.z = a.z / c;
}
|
||||
|
||||
//switch to unitless positions: divide every gR and gX entry by the common factor dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx >= npart)
    return;

  double3 pos = gR[idx];
  double3 aux = gX[idx];

  unitlessOn(pos, dtc);
  unitlessOn(aux, dtc);

  gR[idx] = pos;
  gX[idx] = aux;
}
|
||||
|
||||
//switch to unitless positions with a per-particle time step: divide every gR and gX
//entry by gdt[idx] * c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    double dt = gdt[idx];

    // Switching *to* unitless positions divides by dt*c (unitlessOn), as the scalar
    // overload does; the previous unitlessOff call multiplied instead, making this
    // kernel identical to kernelSwitchOffUnitlessPositions.
    unitlessOn(R, dt*c);
    unitlessOn(X, dt*c);

    gR[idx] = R;
    gX[idx] = X;
  }
}
|
||||
|
||||
//switch off unitless positions: multiply every gR and gX entry by the common factor dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx >= npart)
    return;

  double3 pos = gR[idx];
  double3 aux = gX[idx];

  unitlessOff(pos, dtc);
  unitlessOff(aux, dtc);

  gR[idx] = pos;
  gX[idx] = aux;
}
|
||||
|
||||
//switch off unitless positions with a per-particle time step: multiply every gR and gX
//entry by gdt[idx] * c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx >= npart)
    return;

  double3 pos = gR[idx];
  double3 aux = gX[idx];
  double dt = gdt[idx];

  unitlessOff(pos, dt*c);
  unitlessOff(aux, dt*c);

  gR[idx] = pos;
  gX[idx] = aux;
}
|
||||
|
||||
|
||||
// Half-step position push with a common factor dtc: R is converted to unitless
// coordinates, advanced by 0.5 * P / sqrt(1 + P.P), and converted back.
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {

  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx >= npart)
    return;

  double3 pos = gR[idx];
  double3 mom = gP[idx];

  //switch to unitless positions
  unitlessOn(pos, dtc);

  //push
  double denom = sqrt(1.0 + dot(mom, mom));
  pos.x += 0.5 * mom.x / denom;
  pos.y += 0.5 * mom.y / denom;
  pos.z += 0.5 * mom.z / denom;

  //switch off unitless positions
  unitlessOff(pos, dtc);

  gR[idx] = pos;
}
|
||||
|
||||
|
||||
// Half-step position push with a per-particle time step: R is converted to unitless
// coordinates using gdt[idx] * c, advanced by 0.5 * P / sqrt(1 + P.P), and converted back.
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {

  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {

    double3 R = gR[idx];
    double3 P = gP[idx];
    double dt = gdt[idx];

    //switch to unitless positions with dt*c
    unitlessOn(R, dt*c);

    //push; the common denominator is computed once, as in the scalar-dtc overload
    double tmp = sqrt(1.0 + dot(P, P));
    R.x += 0.5 * P.x / tmp;
    R.y += 0.5 * P.y / tmp;
    R.z += 0.5 * P.z / tmp;

    //switch off unitless positions with dt*c
    unitlessOff(R, dt*c);

    gR[idx] = R;
  }
}
|
||||
|
||||
//TODO: kernel for push with switch off unitless positions with dt[i]*c
|
||||
|
||||
// Returns vec multiplied by the 3x3 matrix built from the sines and cosines of the
// three angles stored in ori (ori.x, ori.y, ori.z).
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {

  const double sina = sin(ori.x);
  const double cosa = cos(ori.x);
  const double sinb = sin(ori.y);
  const double cosb = cos(ori.y);
  const double sinc = sin(ori.z);
  const double cosc = cos(ori.z);

  double3 out;

  out.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;
  out.y = (-cosb * sinc - sina * sinb * cosc) * vec.x +
    (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;
  out.z = (-sinb * sinc + sina * cosb * cosc) * vec.x +
    (sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;

  return out;
}
|
||||
|
||||
/**
 * Half-step push of X using the momentum transformed by the orientation of the
 * particle's last section (common factor dtc).
 *
 * gLastSection[idx] selects the entry of gOrient; an index outside [0, nsect) falls
 * back to a zero orientation.
 */
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
                                    int npart, int nsect, double dtc)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {

    double3 X = gX[idx];
    double3 P = gP[idx];
    long lLastSection = gLastSection[idx];

    double3 ori;
    if (lLastSection > -1 && lLastSection < nsect) {
      ori = gOrient[lLastSection];
    } else {
      ori.x = 0.0;
      ori.y = 0.0;
      ori.z = 0.0;
    }

    double3 tmp = deviceTransformTo(P, ori);

    unitlessOn(X, dtc);

    //the denominator is identical for all three components, so compute it once
    double denom = sqrt(1.0 + dot(tmp, tmp));
    X.x += 0.5 * tmp.x / denom;
    X.y += 0.5 * tmp.y / denom;
    X.z += 0.5 * tmp.z / denom;

    unitlessOff(X, dtc);

    gX[idx] = X;
  }
}
|
||||
|
||||
/**
 * Half-step push of X using the momentum transformed by the orientation of the
 * particle's last section, with a per-particle time step (factor gdt[idx] * c).
 *
 * gLastSection[idx] selects the entry of gOrient; an index outside [0, nsect) falls
 * back to a zero orientation.
 */
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
                                    int npart, int nsect, double *gdt, double c)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {

    double3 X = gX[idx];
    double3 P = gP[idx];
    long lLastSection = gLastSection[idx];
    double dt = gdt[idx];

    double3 ori;
    if (lLastSection > -1 && lLastSection < nsect) {
      ori = gOrient[lLastSection];
    } else {
      ori.x = 0.0;
      ori.y = 0.0;
      ori.z = 0.0;
    }

    double3 tmp = deviceTransformTo(P, ori);

    unitlessOn(X, dt*c);

    //the denominator is identical for all three components, so compute it once
    double denom = sqrt(1.0 + dot(tmp, tmp));
    X.x += 0.5 * tmp.x / denom;
    X.y += 0.5 * tmp.y / denom;
    X.z += 0.5 * tmp.z / denom;

    unitlessOff(X, dt*c);

    gX[idx] = X;
  }
}
|
||||
|
||||
// Functor over CUDA_PART labels: the binary form orders particles by descending
// label, the unary form selects particles whose label is below the threshold.
struct compare_particle
{
  int threshold;

  // threshold starts at 0
  compare_particle() : threshold(0) { }

  void set_threshold(int t) {
    threshold = t;
  }

  // true when p1 has to come before p2 (larger label first)
  __host__ __device__
  bool operator()(CUDA_PART p1, CUDA_PART p2) {
    return p1.label > p2.label;
  }

  // true when the particle's label is below the threshold
  __host__ __device__
  bool operator()(CUDA_PART p1) {
    return p1.label < threshold;
  }
};
|
||||
|
||||
|
||||
// Functor over CUDA_PART_SMALL labels: the binary form orders particles by descending
// label, the unary form selects particles whose label is below the threshold.
struct compare_particle_small
{
  int threshold;

  // threshold starts at 0
  compare_particle_small() : threshold(0) { }

  void set_threshold(int t) {
    threshold = t;
  }

  // true when p1 has to come before p2 (larger label first)
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    return p1.label > p2.label;
  }

  // true when the particle's label is below the threshold
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1) {
    return p1.label < threshold;
  }
};
|
||||
|
||||
|
||||
// Unary predicate: true for negative integers.
struct less_then
{
  __host__ __device__
  bool operator()(int x)
  {
    const bool negative = (x < 0);
    return negative;
  }
};
|
||||
|
||||
/**
 * Launch the collimator physics kernel on an array of CUDA_PART_SMALL particles.
 *
 * mem_ptr and par_ptr are passed to the kernel as the particle array and the
 * parameter array. Returns DKS_SUCCESS when the kernel was launched, DKS_ERROR when
 * the launch itself was rejected. The launch is asynchronous, so errors raised while
 * the kernel runs are not detected here.
 */
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
{

  int threads = BLOCK_SIZE;
  int blocks = numparticles / threads + 1;

  //shared memory: parameter copy followed by one position per thread
  size_t smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE;

  //call kernel
  kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
                                                          (double*)par_ptr,
                                                          m_base->cuda_getCurandStates(),
                                                          numparticles);

  //report a failed launch to the caller instead of only printing it
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cout << "Err2: " << cudaGetErrorString(err) << std::endl;
    return DKS_ERROR;
  }

  return DKS_SUCCESS;

}
|
||||
|
||||
/**
 * Count the particles with a negative label (-1 and -2) and, if there are any, sort
 * the particle array by descending label so that they end up at the back.
 * The count is returned through numaddback.
 */
int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
                                                 int &numaddback)
{
  //wrap mem_ptr with thrust device pointers marking the range
  thrust::device_ptr<CUDA_PART_SMALL> first( (CUDA_PART_SMALL*)mem_ptr);
  thrust::device_ptr<CUDA_PART_SMALL> last = first + numparticles;

  //count -2 and -1 particles
  compare_particle_small byLabel;
  byLabel.set_threshold(0);
  numaddback = thrust::count_if(first, last, byLabel);

  //nothing to move when every label is non-negative
  if (numaddback > 0) {
    thrust::sort(first, last, byLabel);
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
 * Launch the push kernel for npart particles.
 *
 * usedt == false: the common factor dt*c is used for every particle.
 * usedt == true:  per-particle time steps are read from dt_ptr and combined with c.
 * streamId == -1 launches on the default stream, any other value on the stream
 * returned by m_base->cuda_getStream(streamId).
 */
int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
                                                void *dt_ptr, double dt, double c, bool usedt,
                                                int streamId)
{
  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;

  //stream 0 is the default stream, so one launch per variant covers both cases
  cudaStream_t cs = 0;
  if (streamId != -1)
    cs = m_base->cuda_getStream(streamId);

  //call kernel
  if (usedt) {
    kernelPush<<<blocks, threads, 0, cs>>>((double3*)r_ptr, (double3*)p_ptr, npart,
                                           (double*)dt_ptr, c);
  } else {
    kernelPush<<<blocks, threads, 0, cs>>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c);
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/**
 * Launch the push-with-transform kernel for npart particles and nsec sections.
 *
 * usedt == false: the common factor dt*c is used for every particle.
 * usedt == true:  per-particle time steps are read from dt_ptr and combined with c.
 * streamId == -1 launches on the default stream, any other value on the stream
 * returned by m_base->cuda_getStream(streamId).
 */
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
                                                         void *lastSec_ptr, void *orient_ptr,
                                                         int npart, int nsec,
                                                         void *dt_ptr, double dt,
                                                         double c, bool usedt,
                                                         int streamId)
{

  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;

  //kernelPushTransform reads the orientations straight from global memory and declares
  //no dynamic shared memory, so none is requested (requesting sizeof(double3) * nsec
  //only lowered occupancy and could make the launch fail for large nsec)
  size_t smem = 0;

  //stream 0 is the default stream
  cudaStream_t cs = 0;
  if (streamId != -1)
    cs = m_base->cuda_getStream(streamId);

  //call kernel
  if (!usedt) {
    kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
                                                       (long*)lastSec_ptr, (double3*)orient_ptr,
                                                       npart, nsec, dt*c);
  } else {
    kernelPushTransform<<<blocks, threads, smem, cs>>>((double3*)x_ptr, (double3*)p_ptr,
                                                       (long*)lastSec_ptr, (double3*)orient_ptr,
                                                       npart, nsec, (double*)dt_ptr, c);
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
|
||||
|
155
src/CUDA/CudaCollimatorPhysics.cuh
Normal file
155
src/CUDA/CudaCollimatorPhysics.cuh
Normal file
@ -0,0 +1,155 @@
|
||||
#ifndef H_CUDA_COLLIMATORPHYSICS
|
||||
#define H_CUDA_COLLIMATORPHYSICS
|
||||
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <vector_types.h>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/count.h>
|
||||
|
||||
#include <cublas_v2.h>
|
||||
|
||||
#include "../Algorithms/CollimatorPhysics.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
/**
 * Full particle record stored on the GPU (one struct per particle, 16-byte aligned).
 * The field order defines the memory layout and must not be changed.
 */
typedef struct __align__(16) {
  int label;            // particle label; the sort functors order and count by this field
  unsigned int localID;
  double3 Rincol;       // position
  double3 Pincol;       // momentum
  long IDincol;
  int Binincol;
  double DTincol;
  double Qincol;
  long LastSecincol;
  double3 Bfincol;
  double3 Efincol;
} CUDA_PART;
|
||||
|
||||
/**
 * Reduced particle record stored on the GPU (one struct per particle): label, local id,
 * position and momentum only. This is the type the collimator kernel launcher and the
 * sort routine cast the particle buffer to.
 */
typedef struct {
  int label;            // -1 and -2 are written by the collimator kernels
  unsigned int localID;
  double3 Rincol;       // position
  double3 Pincol;       // momentum
} CUDA_PART_SMALL;
|
||||
|
||||
/**
 * Particle data stored on the GPU as separate arrays: every member is a pointer to the
 * array holding that attribute for all particles. Members mirror those of CUDA_PART.
 */
typedef struct {
  int *label;
  unsigned int *localID;
  double3 *Rincol;
  double3 *Pincol;
  long *IDincol;
  int *Binincol;
  double *DTincol;
  double *Qincol;
  long *LastSecincol;
  double3 *Bfincol;
  double3 *Efincol;
} CUDA_PART2;
|
||||
|
||||
/**
 * Reduced particle data stored on the GPU as separate arrays (label, local id, position,
 * momentum). Passed by value to kernelCollimatorPhysics2, which indexes each array by
 * particle.
 */
typedef struct {
  int *label;
  unsigned int *localID;
  double3 *Rincol;
  double3 *Pincol;
} CUDA_PART2_SMALL;
|
||||
|
||||
/** CudaCollimatorPhysics class.
 * Contains kernels that execute CollimatorPhysics functions from OPAL.
 * For detailed documentation on the CollimatorPhysics functions see the OPAL documentation.
 */
class CudaCollimatorPhysics : public DKSCollimatorPhysics {

private:

  bool base_create;   // true when m_base was allocated by this object
  CudaBase *m_base;

public:

  /** Construct on top of an existing CudaBase; the object is not owned. */
  CudaCollimatorPhysics(CudaBase *base)
    : base_create(false), m_base(base)
  { }

  /** Default constructor: allocates its own CudaBase. */
  CudaCollimatorPhysics()
    : base_create(true), m_base(new CudaBase())
  { }

  /** Destructor: releases the CudaBase only when this object allocated it. */
  ~CudaCollimatorPhysics() {
    if (base_create) {
      delete m_base;
    }
  }

  /** Execute the collimator physics kernel on an array of particle structs. */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr,
                        int numpartices);

  /** Struct-of-arrays variant; not implemented for CUDA, always returns DKS_ERROR. */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                           void *rx_ptr, void *ry_ptr, void *rz_ptr,
                           void *px_ptr, void *py_ptr, void *pz_ptr,
                           void *par_ptr, int numparticles)
  {
    return DKS_ERROR;
  }

  /** Sort the particle array on the GPU.
   * Counts particles that are dead (label -1) or leaving the material (label -2) and
   * sorts the particle array so that these particles are at the end of the array.
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);

  /** Struct-of-arrays variant; not implemented for CUDA, always returns DKS_ERROR. */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                               void *rx_ptr, void *ry_ptr, void *rz_ptr,
                               void *px_ptr, void *py_ptr, void *pz_ptr,
                               void *par_ptr, int numparticles, int &numaddback)
  {
    return DKS_ERROR;
  }

  /** BorisPusher push function for integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in CUDA.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
                           double dt, double c, bool usedt = false, int streamId = -1);

  /** BorisPusher push function with the transformTo function from OPAL.
   * ParallelTTracker integration from OPAL implemented in CUDA.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
                                    void *orient_ptr, int npart, int nsec,
                                    void *dt_ptr, double dt, double c,
                                    bool usedt = false, int streamId = -1);

};
|
||||
|
||||
#endif
|
376
src/CUDA/CudaFFT.cu
Normal file
376
src/CUDA/CudaFFT.cu
Normal file
@ -0,0 +1,376 @@
|
||||
#include "CudaFFT.cuh"
|
||||
|
||||
// Divide element blockIdx.x of a complex array by N.
// Only the block index selects the element; the per-thread offset is disabled, so every
// thread of a block addresses the same element.
// NOTE(review): presumably meant to be launched with one thread per block — confirm at the call site.
__global__ void normalize(cufftDoubleComplex *in, int N) {

  const int id = blockIdx.x; //*blockDim.x + threadIdx.x;
  if (id >= N)
    return;

  in[id].x = in[id].x / N;
  in[id].y = in[id].y / N;
}
|
||||
|
||||
/* constructor on top of an existing CudaBase, which stays owned by the caller */
CudaFFT::CudaFFT(CudaBase *base) {
  base_create = false;
  m_base = base;
}
|
||||
|
||||
/* default constructor: allocates its own CudaBase and records that it owns it */
CudaFFT::CudaFFT() {
  base_create = true;
  m_base = new CudaBase();
}
|
||||
|
||||
/* destructor: releases the CudaBase only when this object allocated it */
CudaFFT::~CudaFFT() {
  if (!base_create)
    return;

  delete m_base;
}
|
||||
|
||||
/*
  Info: execute an in-place complex-to-complex double precision FFT using the cufft
  library. The shared default plan is used when useDefaultPlan(ndim, N) accepts the
  size; otherwise a temporary plan is created for 1, 2 or 3 dimensions and destroyed
  before returning. forward selects CUFFT_FORWARD or CUFFT_INVERSE.
  Return: success or error code
*/
int CudaFFT::executeFFT(void * mem_ptr, int ndim, int N[3], int streamId, bool forward) {

  cufftResult cresult;
  cufftHandle plan;

  //decided once: a default plan is shared and must never be destroyed here
  const bool defaultPlan = useDefaultPlan(ndim, N);

  if (defaultPlan) {
    plan = defaultPlanZ2Z;
  } else {
    //create fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2Z, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2Z);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2Z);
      break;
    default:
      //no plan exists for other dimensions; continuing would use an uninitialized handle
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }

    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      if (cresult == CUFFT_SETUP_FAILED) {
        DEBUG_MSG("Setup failed");
      }
      if (cresult == CUFFT_INVALID_SIZE) {
        DEBUG_MSG("Invalid size");
      }
      if (cresult == CUFFT_INVALID_TYPE) {
        DEBUG_MSG("Invalid type");
      }
      if (cresult == CUFFT_ALLOC_FAILED) {
        DEBUG_MSG("Alloc failed");
      }
      return DKS_ERROR;
    }
  }

  //select the stream; the default stream is set explicitly because a shared plan may
  //still carry the stream of an earlier call
  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //perform the in-place FFT on the selected plan
  const int direction = forward ? CUFFT_FORWARD : CUFFT_INVERSE;
  cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr,
                         (cufftDoubleComplex*)mem_ptr, direction);

  //clean up resources: only a plan created in this call is destroyed
  if (!defaultPlan)
    cufftDestroy(plan);

  if (cresult != CUFFT_SUCCESS) {
    if (forward) {
      DEBUG_MSG("Error executing fft, cuda error: " << cresult);
    } else {
      DEBUG_MSG("Error executing ifft, cuda error: " << cresult);
    }
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: execute the inverse FFT by forwarding to executeFFT with forward = false
  Return: success or error code
*/
int CudaFFT::executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId) {
  const bool forward = false;
  return executeFFT(mem_ptr, ndim, N, streamId, forward);
}
|
||||
|
||||
/*
  Info: normalize the complex array by scaling every element with 1/size using
  cublasZscal, where size = N[0]*N[1]*N[2].
  NOTE(review): ndim is not used, so N[1] and N[2] presumably have to be 1 for
  lower-dimensional data; confirm with the callers.
  Return: success or error code
*/
int CudaFFT::normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId) {

  cublasStatus_t status;
  unsigned int size = N[0]*N[1]*N[2];
  cuDoubleComplex alpha = make_cuDoubleComplex(1.0/size, 0);

  //select the stream; the default stream is restored explicitly, otherwise the handle
  //would keep the stream set by an earlier call (executeFFT handles its plan the same way)
  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
  else
    cublasSetStream(defaultCublasFFT, 0);

  status = cublasZscal(defaultCublasFFT, size, &alpha, (cuDoubleComplex*)mem_ptr, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS exec Zscal failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: execute real to complex double precision FFT from real_ptr into comp_ptr.
  The shared default plan is used when useDefaultPlan(ndim, N) accepts the size;
  otherwise a temporary plan is created for 1, 2 or 3 dimensions and destroyed before
  returning, on the error path as well.
  Return: success or error code
*/
int CudaFFT::executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {

  cufftResult cresult;
  cufftHandle plan;

  //decided once: a default plan is shared and must never be destroyed here
  const bool defaultPlan = useDefaultPlan(ndim, N);

  if (defaultPlan) {
    plan = defaultPlanD2Z;
  } else {
    //create fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_D2Z, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_D2Z);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_D2Z);
      break;
    default:
      //no plan exists for other dimensions; continuing would use an uninitialized handle
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }
    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //execute the FFT on the selected plan
  cresult = cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr, (cufftDoubleComplex*)comp_ptr);

  if (cresult != CUFFT_SUCCESS) {
    DEBUG_MSG("Error executing fft, cuda error: " << cresult);
    if (cresult == CUFFT_INVALID_PLAN) {
      DEBUG_MSG("invalid plan");
    }
    if (cresult == CUFFT_INVALID_VALUE) {
      DEBUG_MSG("invalid value");
    }
    if (cresult == CUFFT_INTERNAL_ERROR) {
      DEBUG_MSG("internal error");
    }
    if (cresult == CUFFT_EXEC_FAILED) {
      DEBUG_MSG("exec failed");
    }
    if (cresult == CUFFT_SETUP_FAILED) {
      DEBUG_MSG("setup failed");
    }

    //do not leak the temporary plan when the transform failed
    if (!defaultPlan)
      cufftDestroy(plan);

    return DKS_ERROR;
  }

  //clean up resources
  if (!defaultPlan) {
    cresult = cufftDestroy(plan);
    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: execute complex to real double precision FFT (cufftExecZ2D) from comp_ptr into real_ptr.
  The cached default plan is used when useDefaultPlan(ndim, N) accepts the request, otherwise
  a temporary 1d/2d/3d plan is created for this call and destroyed again before returning.
  streamId selects the stream, any id that does not name an existing stream (including the
  default -1) selects the default stream.
  Return: DKS_SUCCESS, or DKS_ERROR if ndim is unsupported or any cufft call fails
*/
int CudaFFT::executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {

  cufftResult cresult;
  cufftHandle plan;

  //decide once whether the cached plan applies, the same answer drives the clean up below
  const bool defaultPlan = useDefaultPlan(ndim, N);

  if (defaultPlan) {
    plan = defaultPlanZ2D;
  } else {
    //create a temporary fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2D, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2D);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2D);
      break;
    default:
      //no plan can be built, and executing an uninitialized handle must be avoided
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }

    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  if (streamId >= 0 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //execute the transform on the selected plan
  cresult = cufftExecZ2D(plan, (cufftDoubleComplex*)comp_ptr, (cufftDoubleReal*)real_ptr);

  if (cresult != CUFFT_SUCCESS) {
    DEBUG_MSG("Error executing fft, cuda error: " << cresult);

    //only a temporary plan may be released here: destroying the cached default plan
    //would invalidate it for every later call and for destroyFFT
    if (!defaultPlan)
      cufftDestroy(plan);

    return DKS_ERROR;
  }

  //clean up resources
  if (!defaultPlan) {
    cresult = cufftDestroy(plan);
    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: normalize the real output of a complex to real iFFT in place by scaling every
  element with 1 / (N[0]*N[1]*N[2]) using cublasDscal on the shared cuBLAS handle.
  ndim is accepted for interface symmetry but not read; all three entries of N are multiplied.
  streamId selects the stream the scaling is queued on, any id that does not name an
  existing stream (including the default -1) selects the default stream.
  Return: DKS_SUCCESS, or DKS_ERROR if the cuBLAS call fails
*/
int CudaFFT::normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId) {

  cublasStatus_t status;
  unsigned int size = N[0]*N[1]*N[2];
  double alpha = 1.0/size;

  //the cublas handle is shared between calls, so always (re)bind its stream: without the
  //else branch a stream selected by an earlier call would silently be reused
  if (streamId >= 0 && streamId < m_base->cuda_numberOfStreams())
    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
  else
    cublasSetStream(defaultCublasFFT, 0);

  status = cublasDscal(defaultCublasFFT, size, &alpha, (double*)real_ptr, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
    //report the routine that was actually called
    DEBUG_MSG("CUBLAS exec Dscal failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: init cufft plans which can be reused for all FFTs of the same size and type.
  For ndim 1, 2 or 3 a Z2Z, a D2Z and a Z2D plan are created; for any other ndim no cufft
  plan is created. A cuBLAS handle (used by the normalize functions) is created in every case.
  On success ndim is stored as the default dimension and, for ndim > 0, N as the default size.
  On failure everything this call created is released again.
  Return: DKS_SUCCESS, or DKS_ERROR if a plan or the cuBLAS handle could not be created
*/
int CudaFFT::setupFFT(int ndim, int N[3]) {

  cufftResult cr1 = CUFFT_SUCCESS;
  cufftResult cr2 = CUFFT_SUCCESS;
  cufftResult cr3 = CUFFT_SUCCESS;

  //plans are only requested for these dimensions, remember it for the failure paths
  const bool plansRequested = (ndim >= 1 && ndim <= 3);

  //create default fft plans
  switch (ndim) {
  case 1:
    cr1 = cufftPlan1d(&defaultPlanZ2Z, N[0], CUFFT_Z2Z, 1);
    cr2 = cufftPlan1d(&defaultPlanD2Z, N[0], CUFFT_D2Z, 1);
    cr3 = cufftPlan1d(&defaultPlanZ2D, N[0], CUFFT_Z2D, 1);
    break;
  case 2:
    cr1 = cufftPlan2d(&defaultPlanZ2Z, N[1], N[0], CUFFT_Z2Z);
    cr2 = cufftPlan2d(&defaultPlanD2Z, N[1], N[0], CUFFT_D2Z);
    cr3 = cufftPlan2d(&defaultPlanZ2D, N[1], N[0], CUFFT_Z2D);
    break;
  case 3:
    cr1 = cufftPlan3d(&defaultPlanZ2Z, N[2], N[1], N[0], CUFFT_Z2Z);
    cr2 = cufftPlan3d(&defaultPlanD2Z, N[2], N[1], N[0], CUFFT_D2Z);
    cr3 = cufftPlan3d(&defaultPlanZ2D, N[2], N[1], N[0], CUFFT_Z2D);
    break;
  default:
    break;
  }

  if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) {
    DEBUG_MSG("Error creating default plan");
    //release the plans that were created before the failing one, nothing else
    //will destroy them because the default state is not updated on failure
    if (cr1 == CUFFT_SUCCESS) cufftDestroy(defaultPlanZ2Z);
    if (cr2 == CUFFT_SUCCESS) cufftDestroy(defaultPlanD2Z);
    if (cr3 == CUFFT_SUCCESS) cufftDestroy(defaultPlanZ2D);
    return DKS_ERROR;
  }

  //create cublas handle
  cublasStatus_t status = cublasCreate(&defaultCublasFFT);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS create default handle failed!");
    //the plans created above would otherwise leak
    if (plansRequested) {
      cufftDestroy(defaultPlanZ2Z);
      cufftDestroy(defaultPlanD2Z);
      cufftDestroy(defaultPlanZ2D);
    }
    return DKS_ERROR;
  }

  defaultNdim = ndim;
  if (ndim > 0) {
    defaultN[0] = N[0];
    defaultN[1] = N[1];
    defaultN[2] = N[2];
  }

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: destroy default FFT plans and the cuBLAS handle.
  The three cufft plans are destroyed when defaultNdim > 0, the cuBLAS handle when
  defaultNdim > -1. Every release is attempted even if an earlier one fails, and the
  default state is always reset to -1 afterwards so that a repeated call does not
  destroy the same handles twice.
  Return: DKS_SUCCESS, or DKS_ERROR if any release failed
*/
int CudaFFT::destroyFFT() {

  bool failed = false;

  if (defaultNdim > 0) {
    //clean up cufft plans
    cufftResult cr1 = cufftDestroy(defaultPlanZ2Z);
    cufftResult cr2 = cufftDestroy(defaultPlanD2Z);
    cufftResult cr3 = cufftDestroy(defaultPlanZ2D);

    if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) {
      DEBUG_MSG("Error destroying default cufft plans");
      failed = true;
    }
  }

  if (defaultNdim > -1) {
    cublasStatus_t status = cublasDestroy(defaultCublasFFT);
    if (status != CUBLAS_STATUS_SUCCESS) {
      DEBUG_MSG("CUBLAS delete default handle failed!");
      failed = true;
    }
  }

  //mark everything as released, also after a failure
  defaultN[0] = -1;
  defaultN[1] = -1;
  defaultN[2] = -1;
  defaultNdim = -1;

  return failed ? DKS_ERROR : DKS_SUCCESS;
}
|
||||
|
||||
|
||||
|
88
src/CUDA/CudaFFT.cuh
Normal file
88
src/CUDA/CudaFFT.cuh
Normal file
@ -0,0 +1,88 @@
|
||||
#ifndef H_CUDA_FFT
|
||||
#define H_CUDA_FFT
|
||||
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cufft.h>
|
||||
#include "cublas_v2.h"
|
||||
|
||||
#include "../Algorithms/FFT.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
class CudaFFT : public DKSFFT {

private:

  /** true when this object allocated m_base itself */
  bool base_create;
  /** CUDA base object used to query and fetch streams */
  CudaBase *m_base;

  /** reusable cufft plans, one per transform type */
  cufftHandle defaultPlanZ2Z;
  cufftHandle defaultPlanD2Z;
  cufftHandle defaultPlanZ2D;
  /** cuBLAS handle used by the normalize functions */
  cublasHandle_t defaultCublasFFT;

public:

  /** Constructor with CudaBase as argument */
  CudaFFT(CudaBase *base);

  /** Default constructor */
  CudaFFT();

  /** Destructor */
  ~CudaFFT();

  /**
   * Info: init cufft plans which can be reused for all FFTs of the same size and type
   * Return: success or error code
   */
  int setupFFT(int ndim, int N[3]);

  /** No-op in the CUDA implementation, always returns DKS_SUCCESS */
  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }

  /** No-op in the CUDA implementation, always returns DKS_SUCCESS */
  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }

  /**
   * Info: destroy default FFT plans
   * Return: success or error code
   */
  int destroyFFT();

  /**
   * Info: execute complex to complex double precision fft using cufft library
   * Return: success or error code
   */
  int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);

  /**
   * Info: execute ifft
   * Return: success or error code
   */
  int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);

  /**
   * Info: execute normalize for complex to complex iFFT
   * Return: success or error code
   */
  int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);

  /**
   * Info: execute real to complex double precision FFT
   * Return: success or error code
   */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);

  /**
   * Info: execute complex to real double precision FFT
   * Return: success or error code
   */
  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);

  /**
   * Info: execute normalize for complex to real iFFT
   * Return: success or error code
   */
  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);

};
|
||||
|
||||
#endif
|
469
src/CUDA/CudaGreensFunction.cu
Normal file
469
src/CUDA/CudaGreensFunction.cu
Normal file
@ -0,0 +1,469 @@
|
||||
#include "CudaGreensFunction.cuh"
|
||||
|
||||
//evaluate the greens integrand for one grid cell per block, the cell is given by the block index
__global__ void kernelTmpgreen(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ) {

  const int ix = blockIdx.x;
  const int iy = blockIdx.y;
  const int iz = blockIdx.z;

  //cell coordinate shifted by half a mesh spacing in every direction
  const double x = ix * hr_m0 - hr_m0 / 2;
  const double y = iy * hr_m1 - hr_m1 / 2;
  const double z = iz * hr_m2 - hr_m2 / 2;

  const double dist = sqrt(x * x + y * y + z * z);

  //arctangent terms, halved before the logarithmic terms are added
  double acc = -z*z * atan(x * y / (z * dist) );
  acc += -y*y * atan(x * z / (y * dist) );
  acc += -x*x * atan(y * z / (x * dist) );

  acc = acc / 2;

  acc += y * z * log(x + dist);
  acc += x * z * log(y + dist);
  acc += x * y * log(z + dist);

  const double cellVolume = hr_m0 * hr_m1 * hr_m2;
  tmpgreen[ix + iy * NI + iz * NI * NJ] = acc / cellVolume;
}
|
||||
|
||||
//evaluate the greens integrand with one thread per grid cell, cells are addressed by a linear id
__global__ void kernelTmpgreen_2(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ, int NK) {

  const int id = blockIdx.x * blockDim.x + threadIdx.x;

  //threads past the end of the grid have nothing to do
  if (id >= NI * NJ * NK)
    return;

  //recover the 3d cell index from the linear id (i runs fastest)
  const int ix = id % NI;
  const int iz = id / (NI * NJ);
  const int iy = (id - iz * NI * NJ) / NI;

  //cell coordinate shifted by half a mesh spacing in every direction
  const double x = ix * hr_m0 - hr_m0 / 2;
  const double y = iy * hr_m1 - hr_m1 / 2;
  const double z = iz * hr_m2 - hr_m2 / 2;

  const double dist = sqrt(x * x + y * y + z * z);

  //arctangent terms, halved before the logarithmic terms are added
  double acc = -z*z * atan(x * y / (z * dist) );
  acc += -y*y * atan(x * z / (y * dist) );
  acc += -x*x * atan(y * z / (x * dist) );

  acc = acc / 2;

  acc += y * z * log(x + dist);
  acc += x * z * log(y + dist);
  acc += x * y * log(z + dist);

  const double cellVolume = hr_m0 * hr_m1 * hr_m2;
  tmpgreen[id] = acc / cellVolume;
}
|
||||
|
||||
//calculate greens integral on the cpu
//tmpgreen must hold NI*NJ*NK doubles, element (i, j, k) is stored at i + j*NI + k*NI*NJ,
//the same layout the gpu kernels use
void kernelTmpgreenCPU(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
                       int NI, int NJ, int NK)
{

  double cellVolume = hr_m0 * hr_m1 * hr_m2;

  for (int k = 0; k < NK; k++) {
    for (int j = 0; j < NJ; j++) {
      for (int i = 0; i < NI; i++) {

        //cell coordinate shifted by half a mesh spacing in every direction
        double vv0 = i * hr_m0 - hr_m0 / 2;
        double vv1 = j * hr_m1 - hr_m1 / 2;
        double vv2 = k * hr_m2 - hr_m2 / 2;

        double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

        double tmpgrn = 0;
        tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
        tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
        tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );

        tmpgrn = tmpgrn / 2;

        tmpgrn += vv1 * vv2 * log(vv0 + r);
        tmpgrn += vv0 * vv2 * log(vv1 + r);
        tmpgrn += vv0 * vv1 * log(vv2 + r);

        tmpgrn = tmpgrn / cellVolume;

        //rows are NI elements long: a row stride of NJ overlaps rows or runs past the
        //end of the buffer whenever NI != NJ
        tmpgreen[k*NJ*NI + j*NI + i] = tmpgrn;
      }
    }
  }

}
|
||||
|
||||
|
||||
//combine the eight corner values of tmpgreen around cell (i, j, k) into rho2_m,
//one block per cell, the cell is given by the block index
__global__ void kernelIngration(double *rho2_m, double *tmpgreen, int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  //strides of the tmpgreen layout and the linear index of cell (i, j, k)
  const int rowStride = NI_tmp;
  const int sliceStride = NI_tmp * NJ_tmp;
  const int cell = i + j * rowStride + k * sliceStride;

  //a neighbour only contributes when it lies inside the tmpgreen grid, otherwise it counts as 0
  const bool hasI = i+1 < NI_tmp;
  const bool hasJ = j+1 < NJ_tmp;
  const bool hasK = k+1 < NK_tmp;

  const double gIJK = (hasI && hasJ && hasK) ? tmpgreen[cell + 1 + rowStride + sliceStride] : 0.0;
  const double gI   = hasI                   ? tmpgreen[cell + 1] : 0.0;
  const double gJ   = hasJ                   ? tmpgreen[cell + rowStride] : 0.0;
  const double gK   = hasK                   ? tmpgreen[cell + sliceStride] : 0.0;
  const double gIJ  = (hasI && hasJ)         ? tmpgreen[cell + 1 + rowStride] : 0.0;
  const double gIK  = (hasI && hasK)         ? tmpgreen[cell + 1 + sliceStride] : 0.0;
  const double gJK  = (hasJ && hasK)         ? tmpgreen[cell + rowStride + sliceStride] : 0.0;
  const double g0   = tmpgreen[cell];

  //rho2_m uses its own row and slice lengths NI and NJ
  rho2_m[i + j*NI + k*NI*NJ] = gIJK + gI + gJ + gK - gIJ - gIK - gJK - g0;
}
|
||||
|
||||
//combine the eight corner values of tmpgreen around each cell into rho2_m,
//one thread per cell of the tmpgreen grid, cells are addressed by a linear id
__global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
                                  int NI, int NJ,
                                  int NI_tmp, int NJ_tmp, int NK_tmp) {

  const int id = blockIdx.x * blockDim.x + threadIdx.x;

  //threads past the end of the tmpgreen grid have nothing to do
  if (id >= NI_tmp * NJ_tmp * NK_tmp)
    return;

  //recover the 3d cell index from the linear id (i runs fastest)
  const int i = id % NI_tmp;
  const int k = id / (NI_tmp * NJ_tmp);
  const int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;

  //strides of the tmpgreen layout and the linear index of cell (i, j, k)
  const int rowStride = NI_tmp;
  const int sliceStride = NI_tmp * NJ_tmp;
  const int cell = i + j * rowStride + k * sliceStride;

  //a neighbour only contributes when it lies inside the tmpgreen grid, otherwise it counts as 0
  const bool hasI = i+1 < NI_tmp;
  const bool hasJ = j+1 < NJ_tmp;
  const bool hasK = k+1 < NK_tmp;

  const double gIJK = (hasI && hasJ && hasK) ? tmpgreen[cell + 1 + rowStride + sliceStride] : 0.0;
  const double gI   = hasI                   ? tmpgreen[cell + 1] : 0.0;
  const double gJ   = hasJ                   ? tmpgreen[cell + rowStride] : 0.0;
  const double gK   = hasK                   ? tmpgreen[cell + sliceStride] : 0.0;
  const double gIJ  = (hasI && hasJ)         ? tmpgreen[cell + 1 + rowStride] : 0.0;
  const double gIK  = (hasI && hasK)         ? tmpgreen[cell + 1 + sliceStride] : 0.0;
  const double gJK  = (hasJ && hasK)         ? tmpgreen[cell + rowStride + sliceStride] : 0.0;
  const double g0   = tmpgreen[cell];

  //rho2_m uses its own row and slice lengths NI and NJ
  rho2_m[i + j*NI + k*NI*NJ] = gIJK + gI + gJ + gK - gIJ - gIK - gJK - g0;
}
|
||||
|
||||
|
||||
//meant to be launched with a single thread: copies element NI*NJ of rho2_m into element 0
__global__ void mirroredRhoField0(double *rho2_m, int NI, int NJ) {
  const int src = NI*NJ;
  rho2_m[0] = rho2_m[src];
}
|
||||
|
||||
//mirror rho2_m along the first index: cell (i, j, k) is copied to (NI-i, j, k), one block per cell
__global__ void mirroredRhoFieldI(double *rho2_m, int NI, int NJ) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  const int rowOffset = j*NI + k*NI*NJ;
  const int src = i + rowOffset;
  const int dst = (NI-i) + rowOffset;

  //i == 0 would map onto index NI, which is outside the row
  if (i > 0)
    rho2_m[dst] = rho2_m[src];
}
|
||||
|
||||
//mirror rho2_m along the second index: cell (i, j, k) is copied to (i, NJ-j, k), one block per cell
__global__ void mirroredRhoFieldJ(double *rho2_m, int NI, int NJ) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  const int sliceOffset = k*NI*NJ;
  const int src = i + j*NI + sliceOffset;
  const int dst = i + (NJ-j)*NI + sliceOffset;

  //j == 0 would map onto row NJ, which is outside the slice
  if (j > 0)
    rho2_m[dst] = rho2_m[src];
}
|
||||
|
||||
//mirror rho2_m along the third index: cell (i, j, k) is copied to (i, j, NK-k), one block per cell
__global__ void mirroredRhoFieldK(double *rho2_m, int NI, int NJ, int NK) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  const int inSlice = i + j*NI;
  const int src = inSlice + k*NI*NJ;
  const int dst = inSlice + (NK-k)*NI*NJ;

  //k == 0 would map onto slice NK, which is outside the field
  if (k > 0)
    rho2_m[dst] = rho2_m[src];
}
|
||||
|
||||
//mirror the field in all three directions at once, one thread per cell of the
//NI_tmp x NJ_tmp x NK_tmp source region; rho2_m itself is laid out with row length NI
//and slice size NI*NJ. Cell (i, j, k) is copied to every combination of (i | NI-i),
//(j | NJ-j), (k | NK-k); a direction is skipped when its index is 0, because the
//mirrored index would then lie outside the field.
__global__ void mirroredRhoField(double *rho2_m,
                                 int NI, int NJ, int NK,
                                 int NI_tmp, int NJ_tmp, int NK_tmp) {

  int tid = threadIdx.x;
  int id = blockIdx.x * blockDim.x + tid;

  int id1, id2, id3, id4, id5, id6, id7, id8;

  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    //recover the 3d cell index from the linear id (i runs fastest)
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;

    //mirrored indices
    int ri = NI - i;
    int rj = NJ - j;
    int rk = NK - k;

    id1 = k * NI * NJ + j * NI + i;
    id2 = k * NI * NJ + j * NI + ri;
    id3 = k * NI * NJ + rj * NI + i;
    id4 = k * NI * NJ + rj * NI + ri;

    id5 = rk * NI * NJ + j * NI + i;
    id6 = rk * NI * NJ + j * NI + ri;
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;

    double data = rho2_m[id1];
    if (i != 0)
      rho2_m[id2] = data;

    if (j != 0)
      rho2_m[id3] = data;

    if (i != 0 && j != 0)
      rho2_m[id4] = data;

    if (k != 0)
      rho2_m[id5] = data;

    if (k != 0 && i != 0)
      rho2_m[id6] = data;

    if (k != 0 && j != 0)
      rho2_m[id7] = data;

    //logical and: the bitwise & used previously gave the same result only because
    //both operands are comparison results
    if (k != 0 && j != 0 && i != 0)
      rho2_m[id8] = data;
  }

}
|
||||
|
||||
//product of two double precision complex numbers, x holds the real and y the imaginary part
__device__ inline cuDoubleComplex ComplexMul(cuDoubleComplex a, cuDoubleComplex b) {
  const double re = a.x * b.x - a.y * b.y;
  const double im = a.x * b.y + a.y * b.x;
  return make_cuDoubleComplex(re, im);
}
|
||||
|
||||
//element wise complex product ptr1 = ptr1 * ptr2, one block per element
__global__ void multiplyComplexFields(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2) {
  const int element = blockIdx.x;
  const cuDoubleComplex lhs = ptr1[element];
  const cuDoubleComplex rhs = ptr2[element];
  ptr1[element] = ComplexMul(lhs, rhs);
}
|
||||
|
||||
|
||||
/*
  Element wise complex product ptr1 = ptr1 * ptr2 for the first size elements, one thread
  per element. Both operands are staged in dynamically sized shared memory first (two
  cuDoubleComplex slots per thread, so the launch must provide
  2 * blockDim.x * sizeof(cuDoubleComplex) bytes); with so few global memory accesses this
  may bring no improvement. Several threads per block are used to improve occupancy of the
  hardware (test for best block and thread sizes).
*/
__global__ void multiplyComplexFields_2(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2,
                                        int size)
{
  extern __shared__ cuDoubleComplex data[];

  const int lane = threadIdx.x;
  const int element = blockIdx.x * blockDim.x + threadIdx.x;
  const bool active = element < size;

  //slots of this thread inside the shared staging buffer
  const int lhsSlot = 2*lane;
  const int rhsSlot = 2*lane + 1;

  if (active) {
    data[lhsSlot] = ptr1[element];
    data[rhsSlot] = ptr2[element];
  }

  //reached by every thread of the block, also the inactive ones
  __syncthreads();

  if (active)
    ptr1[element] = ComplexMul(data[lhsSlot], data[rhsSlot]);
}
|
||||
|
||||
|
||||
//use a CudaBase owned by the caller, it is not deleted by the destructor
CudaGreensFunction::CudaGreensFunction(CudaBase *base)
  : base_create(false), m_base(base)
{
}
|
||||
|
||||
/* constructor: creates its own CudaBase, which the destructor deletes again */
CudaGreensFunction::CudaGreensFunction()
  : base_create(true), m_base(new CudaBase())
{
}
|
||||
|
||||
/* destructor: deletes the CudaBase only when this object created it */
CudaGreensFunction::~CudaGreensFunction() {
  if (!base_create)
    return;

  delete m_base;
}
|
||||
|
||||
/*
  Info: launch kernelTmpgreen_2 to fill tmpptr (device memory, I*J*K doubles) with the
  greens integrand for mesh spacings hr_m0, hr_m1, hr_m2. NI and NJ are part of the
  interface but not read here. streamId -1 queues the kernel on the default stream, any
  other id must name an existing stream. The launch itself is not checked for errors.
  Return: DKS_SUCCESS after the launch, DKS_ERROR for an invalid stream id
*/
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
                                            double hr_m0, double hr_m1, double hr_m2,
                                            int streamId)
{

  //if no stream specified use default stream
  cudaStream_t cs = 0;
  if (streamId != -1) {
    //reject ids below -1 as well, they would otherwise be forwarded to cuda_getStream
    if (streamId < 0 || streamId >= m_base->cuda_numberOfStreams())
      return DKS_ERROR;
    cs = m_base->cuda_getStream(streamId);
  }

  //one thread per cell, rounded up to whole blocks
  int thread = 128;
  int block = (I * J * K / thread) + 1;

  kernelTmpgreen_2<<< block, thread, 0, cs >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: launch kernelIngration_2 to integrate tmpgreen (I x J x K doubles) into rho2_m,
  which is addressed with row length 2*(I-1) and slice size 2*(I-1) * 2*(J-1).
  streamId -1 queues the kernel on the default stream, any other id must name an existing
  stream. The launch itself is not checked for errors.
  Return: DKS_SUCCESS after the launch, DKS_ERROR for an invalid stream id
*/
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
                                                       int I, int J, int K,
                                                       int streamId)
{

  //if no stream specified use default stream
  cudaStream_t cs = 0;
  if (streamId != -1) {
    //reject ids below -1 as well, they would otherwise be forwarded to cuda_getStream
    if (streamId < 0 || streamId >= m_base->cuda_numberOfStreams())
      return DKS_ERROR;
    cs = m_base->cuda_getStream(streamId);
  }

  //one thread per tmpgreen cell, rounded up to whole blocks
  int thread = 128;
  int block = (I * J * K / thread) + 1;

  kernelIngration_2<<< block, thread, 0, cs >>>( (double*)rho2_m, (double*)tmpgreen,
                                                 2*(I - 1), 2*(J - 1), I, J, K);

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: mirror the rho field stored in mem_ptr, which is addressed as a 2I x 2J x 2K field.
  First mirroredRhoField0 is launched with a single thread, then mirroredRhoField with one
  thread per cell of the (I+1) x (J+1) x (K+1) source region, both on the same stream.
  streamId -1 selects the default stream, any other id must name an existing stream.
  The launches themselves are not checked for errors.
  Return: DKS_SUCCESS after the launches, DKS_ERROR for an invalid stream id
*/
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {

  //if no stream specified use default stream
  cudaStream_t cs = 0;
  if (streamId != -1) {
    //reject ids below -1 as well, they would otherwise be forwarded to cuda_getStream
    if (streamId < 0 || streamId >= m_base->cuda_numberOfStreams())
      return DKS_ERROR;
    cs = m_base->cuda_getStream(streamId);
  }

  //one thread per source cell, rounded up to whole blocks
  int thread = 128;
  int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;

  mirroredRhoField0<<< 1, 1, 0, cs >>>( (double *)mem_ptr, 2*I, 2*J);
  mirroredRhoField<<< block, thread, 0, cs >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);

  return DKS_SUCCESS;
}
|
||||
|
||||
/*
  Info: multiply two complex fields of size elements that are already in device memory,
  the result is written to ptr1. Launches multiplyComplexFields_2 with the shared memory
  it needs (two cuDoubleComplex per thread). streamId -1 selects the default stream, any
  other id must name an existing stream. The launch itself is not checked for errors.
  Return: DKS_SUCCESS after the launch, DKS_ERROR for an invalid stream id
*/
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
                                                   int size, int streamId) {

  //if no stream specified use default stream
  cudaStream_t cs = 0;
  if (streamId != -1) {
    //reject ids below -1 as well, they would otherwise be forwarded to cuda_getStream
    if (streamId < 0 || streamId >= m_base->cuda_numberOfStreams())
      return DKS_ERROR;
    cs = m_base->cuda_getStream(streamId);
  }

  //one thread per element, rounded up to whole blocks
  int threads = 128;
  int blocks = size / threads + 1;
  int datasize = 2 * threads * sizeof(cuDoubleComplex);

  multiplyComplexFields_2<<<blocks, threads, datasize, cs >>> ( (cuDoubleComplex*)ptr1,
                                                                (cuDoubleComplex*)ptr2,
                                                                size);

  return DKS_SUCCESS;
}
|
||||
|
||||
|
||||
|
63
src/CUDA/CudaGreensFunction.cuh
Normal file
63
src/CUDA/CudaGreensFunction.cuh
Normal file
@ -0,0 +1,63 @@
|
||||
#ifndef H_CUDA_GREENSFUNCTION
|
||||
#define H_CUDA_GREENSFUNCTION
|
||||
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuComplex.h>
|
||||
#include "cublas_v2.h"
|
||||
|
||||
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
class CudaGreensFunction {

private:

  /** true when this object allocated m_base itself and has to delete it */
  bool base_create;
  /** CUDA base object used to query and fetch streams */
  CudaBase *m_base;

public:

  /** Constructor with CudaBase argument */
  CudaGreensFunction(CudaBase *base);

  /** Default constructor */
  CudaGreensFunction();

  /** Destructor */
  ~CudaGreensFunction();

  /**
   * Info: calc integral on device memory (taken from OPAL src code)
   * Return: success or error code
   */
  int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
                          double hr_m0, double hr_m1, double hr_m2,
                          int streamId = -1);

  /**
   * Info: integration of rho2_m field (taken from OPAL src code)
   * Return: success or error code
   */
  int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
                                     int streamId = -1);

  /**
   * Info: mirror rho field (taken from OPAL src code)
   * Return: success or error code
   */
  int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);

  /**
   * Info: multiply complex fields already on the GPU memory, result will be put in ptr1
   * Return: success or error code
   */
  int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);

};
|
||||
|
||||
#endif
|
1221
src/CUDA/CudaImageReconstruction.cu
Normal file
1221
src/CUDA/CudaImageReconstruction.cu
Normal file
File diff suppressed because it is too large
Load Diff
118
src/CUDA/CudaImageReconstruction.cuh
Normal file
118
src/CUDA/CudaImageReconstruction.cuh
Normal file
@ -0,0 +1,118 @@
|
||||
#ifndef H_CUDA_IMAGERECONSTRUCTION
|
||||
#define H_CUDA_IMAGERECONSTRUCTION
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/count.h>
|
||||
|
||||
#include "../Algorithms/ImageReconstruction.h"
|
||||
#include "CudaBase.cuh"
|
||||
|
||||
class CudaImageReconstruction : public ImageReconstruction {

private:

  /** true when this object allocated m_base itself and has to delete it */
  bool base_create;
  /** CUDA base object */
  CudaBase *m_base;

public:

  /** Constructor, creates its own CudaBase */
  CudaImageReconstruction() : base_create(true), m_base(new CudaBase()) {}

  /** Constructor with a CudaBase owned by the caller */
  CudaImageReconstruction(CudaBase *base) : base_create(false), m_base(base) {}

  /** Destructor, deletes the CudaBase only when this object created it */
  ~CudaImageReconstruction() {
    if (base_create)
      delete m_base;
  }

  /** CUDA implementation of calculate source.
   */
  int calculateSource(void *image_space, void *image_position, void *source_position,
                      void *avg, void *std, float diameter, int total_voxels,
                      int total_sources, int start = 0);

  /** CUDA implementation of calculate background.
   */
  int calculateBackground(void *image_space, void *image_position, void *source_position,
                          void *avg, void *std, float diameter, int total_voxels,
                          int total_sources, int start = 0);

  /** Calculate source for different sources.
   */
  int calculateSources(void *image_space, void *image_position, void *source_position,
                       void *avg, void *std, void *diameter, int total_voxels,
                       int total_sources, int start = 0);

  /** Calculate background for different sources.
   */
  int calculateBackgrounds(void *image_space, void *image_position, void *source_position,
                           void *avg, void *std, void *diameter, int total_voxels,
                           int total_sources, int start = 0);

  /** Generate normalization.
   * Goes through detector pairs and if a detector pair crosses the image launches a separate
   * kernel that updates voxel values in the image on the slope between these two detectors.
   */
  int generateNormalization(void *recon, void *image_position,
                            void *det_position, int total_det);

  /** Calculate forward projection.
   * For image reconstruction calculates forward projections.
   * see recon.cpp for details
   */
  int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
                        void *image_position, int num_events);

  /** Calculate backward projection.
   * For image reconstruction calculates backward projections.
   * see recon.cpp for details
   */
  int backwardProjection(void *correction, void *recon_corrector, void *list_data,
                         void *det_position, void *image_position,
                         int num_events, int num_voxels);

  /** Set the voxel dimensions on device.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);

  /** Set the image edge.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);

  /** Set the image edge1.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);

  /** Set the minimum crystal in one ring values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);

  /** Set all other required parameters for reconstruction.
   */
  int setParams(float matrix_distance_factor, float phantom_diameter,
                float atten_per_mm, float ring_diameter);

};
|
||||
|
||||
#endif
|
316
src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
Normal file
316
src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu
Normal file
@ -0,0 +1,316 @@
|
||||
#define PI 3.141592653589793115998
|
||||
#define TWO_PI 6.283185307179586231996
|
||||
#define DEG_TO_RAD 1.7453292519943295474371681e-2
|
||||
|
||||
/** Theory function declaration.
|
||||
* Definition of the theory function will be build during runtime before compilation.
|
||||
*/
|
||||
__device__ double fTheory(double t, double *p, double *f, int *m);
|
||||
|
||||
/** MusrFit predefined functions.
|
||||
* Predefined functions from MusrFit that can be used to define the theory function.
|
||||
 * First parameter in all the functions is always time - t, rest of the parameters depend
|
||||
* on the function.
|
||||
*/
|
||||
/** Simple exponential: exp(-lamda*t). */
__device__ double se(double t, double lamda) {
  const double exponent = -lamda*t;
  return exp(exponent);
}
|
||||
|
||||
/** General (stretched) exponential: exp(-(lamda*t)^beta). */
__device__ double ge(double t, double lamda, double beta) {
  const double stretched = pow(lamda*t, beta);
  return exp( -stretched );
}
|
||||
|
||||
/** Simple Gaussian: exp(-0.5*(sigma*t)^2). */
__device__ double sg(double t, double sigma) {
  const double sigmatsq = pow(sigma*t, 2.0);
  return exp( -0.5*sigmatsq );
}
|
||||
|
||||
/** 1/3 + 2/3 * (1 - (sigma*t)^2) * exp(-0.5*(sigma*t)^2). */
__device__ double stg(double t, double sigma) {
  const double sq = pow(sigma*t, 2.0);
  const double damping = exp(-0.5*sq);

  return (1.0/3.0) + (2.0/3.0)*(1.0 - sq) * damping;
}
|
||||
|
||||
/** 1/3 + 2/3 * (1 - lambda*t) * exp(-lambda*t). */
__device__ double sekt(double t, double lambda) {
  const double lt = lambda*t;
  const double damping = exp(-lt);

  return (1.0/3.0) + (2.0/3.0)*(1.0 - lt) * damping;
}
|
||||
|
||||
/** 1/3 + 2/3 * (1 - lambda*t - (sigma*t)^2) * exp(-lambda*t - 0.5*(sigma*t)^2). */
__device__ double lgkt(double t, double lambda, double sigma) {
  const double lt = lambda*t;
  const double sq = pow(sigma*t, 2.0);
  const double damping = exp(-lt - 0.5*sq);

  return (1.0/3.0) + (2.0/3.0)*(1.0 - lt - sq) * damping;
}
|
||||
|
||||
/** 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta); 0 when beta < 1.0e-3. */
__device__ double skt(double t, double sigma, double beta) {
  // the exponent divides by beta, so tiny beta values are cut off
  const bool betaTooSmall = beta < 1.0e-3;
  if (betaTooSmall) {
    return 0.0;
  }

  const double powered = pow(sigma*t, beta);
  const double damping = exp(-powered/beta);

  return (1.0/3.0) + (2.0/3.0)*(1.0 - powered) * damping;
}
|
||||
|
||||
/** 1/3 * exp(-rateL) + 2/3 * (1 - lambda^2*t^2*q / rateT) * exp(-rateT), with
 * rateL = sqrt(|4*lambda^2*(1-q)*t/gamma|) and rateT = sqrt(rateL^2 + lambda^2*t^2*q).
 * When rateT is 0 (for example t == 0 or lambda == 0) the quotient is taken as 0
 * instead of evaluating 0/0, so the function yields 1 there instead of NaN.
 */
__device__ double spg(double t, double lambda, double gamma, double q) {
  double lam2 = lambda*lambda;
  double lamt2q = t*t*lam2*q;
  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
  double rateL = sqrt(fabs(rate2));
  double rateT = sqrt(fabs(rate2)+lamt2q);

  // guard only the exact zero; a NaN rateT still propagates through exp(-rateT)
  double ratio = (rateT == 0.0) ? 0.0 : lamt2q / rateT;

  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - ratio)*exp(-rateT);
}
|
||||
|
||||
/** 1/6 * (1 - nu*t/2) * exp(-nu*t/2) + 1/3 * (1 - nu*t/4) * exp(-0.25*(nu*t + 2.44949*lambda*t)). */
__device__ double rahf(double t, double nu, double lambda) {
  const double nut = nu*t;
  const double halfNut = nu*t/2.0;
  const double lamt = lambda*t;

  const double first = (1.0/6.0)*(1.0-halfNut)*exp(-halfNut);
  const double second = (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));

  return first + second;
}
|
||||
|
||||
/** cos(TWO_PI*nu*t + DEG_TO_RAD*phi); phi is scaled by DEG_TO_RAD before use. */
__device__ double tf(double t, double phi, double nu) {
  const double wt = TWO_PI*nu*t;
  const double ph = DEG_TO_RAD*phi;
  const double angle = wt + ph;

  return cos(angle);
}
|
||||
|
||||
/** alpha * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t) + (1 - alpha) * exp(-lambdaL*t). */
__device__ double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;

  const double oscillating = alpha*cos(angle+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);

  return oscillating + relaxing;
}
|
||||
|
||||
/** Bessel function j0 of (TWO_PI*nu*t + DEG_TO_RAD*phi). */
__device__ double b(double t, double phi, double nu) {
  const double argument = TWO_PI*nu*t + DEG_TO_RAD*phi;
  return j0(argument);
}
|
||||
|
||||
/** alpha * j0(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t) + (1 - alpha) * exp(-lambdaL*t). */
__device__ double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double angle = TWO_PI * nu * t;
  const double phase = DEG_TO_RAD * phi;

  const double oscillating = alpha*j0(angle+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);

  return oscillating + relaxing;
}
|
||||
|
||||
/**
 * Evaluates exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)).
 * @param t     time at which the function is evaluated
 * @param sigma numerator of the squared prefactor
 * @param gamma rate; also the divisor of sigma (gamma == 0 is not guarded)
 * @return function value; 1.0 at t == 0
 */
__device__ double ab(double t, double sigma, double gamma) {
  const double gammaTime = gamma*t;
  const double prefactor = pow(sigma/gamma,2.0);

  return exp(-prefactor*(exp(-gammaTime) - 1.0 + gammaTime));
}
|
||||
|
||||
/**
 * Evaluates 1/3 + 2/3 * a^(3/2) * (1 - (Delta0*t)^2 * a) * exp(-(Delta0*t)^2 * a / 2)
 * with a = 1 / (1 + Rb^2 * (Delta0*t)^2).
 * @param t      time at which the function is evaluated
 * @param Delta0 rate multiplying t
 * @param Rb     ratio entering squared in the denominator of a
 * @return function value; 1.0 at t == 0
 */
__device__ double snkzf(double t, double Delta0, double Rb) {
  const double deltaTimeSq = pow(Delta0*t, 2.0);
  const double scale = 1.0/(1.0+pow(Rb,2.0)*deltaTimeSq);

  return (1.0/3.0) + (2.0/3.0)*pow(scale,1.5)*(1.0-deltaTimeSq*scale)*exp(-0.5*deltaTimeSq*scale);
}
|
||||
|
||||
/**
 * Evaluates sqrt(a) * exp(-(Delta0*t)^2 * a / 2) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi)
 * with a = 1 / (1 + Rb^2 * (Delta0*t)^2).
 * @param t      time at which the function is evaluated
 * @param phi    phase, scaled by DEG_TO_RAD before use
 * @param nu     frequency, scaled by TWO_PI before use
 * @param Delta0 rate multiplying t
 * @param Rb     ratio entering squared in the denominator of a
 */
__device__ double snktf(double t, double phi, double nu, double Delta0, double Rb) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  const double deltaTimeSq = pow(Delta0*t, 2.0);
  const double scale = 1.0/(1.0+pow(Rb,2.0)*deltaTimeSq);

  return sqrt(scale)*exp(-0.5*deltaTimeSq*scale)*cos(angle+phase);
}
|
||||
|
||||
/**
 * Evaluates sqrt(a) * exp(-2 * Delta0^2 * theta * a) with
 *   theta = (exp(-nuc*t) - 1 + nuc*t) / nuc^2 and
 *   a     = 1 / (1 + 4 * (Rb*Delta0)^2 * theta).
 *
 * theta uses "+ nuc*t": this makes theta non-negative and lets it tend to t^2/2
 * as nuc -> 0. The former "- nuc*t" made theta negative and divergent for small
 * nuc, which could drive the argument of sqrt() negative.
 *
 * @param t      time at which the function is evaluated
 * @param Delta0 rate parameter (enters squared)
 * @param Rb     ratio multiplying Delta0 in the denominator of a
 * @param nuc    rate in theta; nuc == 0 is not guarded (division by nuc^2)
 * @return function value; 1.0 at t == 0
 */
__device__ double dnkzf(double t, double Delta0, double Rb, double nuc) {
  double nuct = nuc*t;
  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);

  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
}
|
||||
|
||||
/**
 * Evaluates sqrt(a) * exp(-Delta0^2 * theta * a) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) with
 *   theta = (exp(-nuc*t) - 1 + nuc*t) / nuc^2 and
 *   a     = 1 / (1 + 2 * (Rb*Delta0)^2 * theta).
 *
 * theta uses "+ nuc*t": this makes theta non-negative and lets it tend to t^2/2
 * as nuc -> 0. The former "- nuc*t" made theta negative and divergent for small
 * nuc, which could drive the argument of sqrt() negative.
 *
 * @param t      time at which the function is evaluated
 * @param phi    phase, scaled by DEG_TO_RAD before use
 * @param nu     frequency, scaled by TWO_PI before use
 * @param Delta0 rate parameter (enters squared)
 * @param Rb     ratio multiplying Delta0 in the denominator of a
 * @param nuc    rate in theta; nuc == 0 is not guarded (division by nuc^2)
 */
__device__ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
  double wt = TWO_PI*nu*t;
  double ph = DEG_TO_RAD*phi;
  double nuct = nuc*t;
  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);

  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
}
|
||||
|
||||
/** Theory and chi-square functions.
 * Depending on the compiler flags, the theory is calculated either in single-histogram or in asymmetry mode.
 * Depending on the compiler flags, either the chi-square or the MLE value is calculated.
 */
|
||||
|
||||
/**
 * Evaluates N0 * exp(-t/tau) * (1 + f) + bkg.
 * @param N0  scale of the exponential term
 * @param tau time constant of the exponential (tau == 0 is not guarded)
 * @param bkg constant added to the result
 * @param f   value added to 1 inside the exponential term
 * @param t   time at which the function is evaluated
 */
__device__ inline double singleHist(double &N0, double &tau, double &bkg, double &f, double &t) {
  const double decay = exp (-t/tau );
  return N0 * decay * (1.0 + f) + bkg;
}
|
||||
|
||||
/**
 * Evaluates (f*(a*b + 1) - (a - 1)) / ((a + 1) - f*(a*b - 1)).
 *
 * This is the same expression kernelChiSquareAsymmetry uses (with a = alpha,
 * b = beta). The numerator previously used f*(a*b), which disagreed with the
 * kernel and did not reduce to f for a = b = 1.
 *
 * @param a first correction factor (alpha in the kernel)
 * @param b second correction factor (beta in the kernel)
 * @param f theory value
 * @return asymmetry value; equals f when a == 1 and b == 1
 */
__device__ inline double asymetry(double &a, double &b, double &f) {
  double ab = a * b;
  return (f * (ab + 1.0) - (a - 1.0)) / ((a + 1.0) - f * (ab - 1.0));
}
|
||||
|
||||
/**
 * Selects the theory function at compile time.
 * Without ASYMETRY defined: singleHist(c1, c2, c3, f, t).
 * With ASYMETRY defined: asymetry(c1, c2, f); c3 and t are unused in that case.
 *
 * The second branch uses #else: the former bare "#elif" has no expression and
 * is a preprocessor error as soon as ASYMETRY is defined.
 */
__device__ inline double getTheory(double &c1, double &c2, double &c3, double &f, double &t) {
#ifndef ASYMETRY
  return singleHist(c1, c2, c3, f, t);
#else
  return asymetry(c1, c2, f);
#endif
}
|
||||
|
||||
/**
 * Squared residual (theo - data)^2, divided by err when err is non-zero.
 * NOTE(review): the kernels in this file divide by err*err and use theo*theo
 * when err == 0; this helper does neither - confirm which convention is intended.
 * @param data measured value
 * @param theo theory value
 * @param err  divisor applied to the squared residual; skipped when 0
 */
__device__ inline double chiSq(double &data, double &theo, double &err) {
  const double squared = (theo - data) * (theo - data);

  //no usable error: return the plain squared residual
  if (err == 0.0)
    return squared;

  return squared / err;
}
|
||||
|
||||
/**
 * Evaluates (theo - data), plus data * log(data / theo) when data > 1e-9 and
 * |theo| > 1e-9. The err argument is not used.
 * NOTE(review): the kernels in this file multiply the same expression by 2.0.
 * @param data measured value
 * @param theo theory value
 * @param err  unused
 */
__device__ inline double mle(double &data, double &theo, double &err) {
  const bool logTermUsable = ( data > 1.0e-9 && fabs(theo) > 1.0e-9 );

  double value = (theo - data);
  if (logTermUsable) {
    value += data * log(data / theo);
  }

  return value;
}
|
||||
|
||||
/**
 * Selects the goodness-of-fit function at compile time.
 * Without MLE defined: chiSq(data, theo, err). With MLE defined: mle(data, theo, err).
 *
 * The second branch uses #else: the former bare "#elif" has no expression and
 * is a preprocessor error as soon as MLE is defined.
 * NOTE(review): the kernels in this file switch on MLH, not MLE - confirm which
 * macro the build actually sets.
 */
__device__ inline double getChiSq(double &data, double &theo, double &err) {
#ifndef MLE
  return chiSq(data, theo, err);
#else
  return mle(data, theo, err);
#endif
}
|
||||
|
||||
//-----------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* Kernel to calculate theory function and chisquare/mle values for single histogram fits.
|
||||
*/
|
||||
extern "C" __global__ void kernelChiSquareSingleHisto(double *data, double *err, double *par,
                                                      double *chisq, int *map, double *funcv, int length,
                                                      int numpar, int numfunc, int nummap,
                                                      double timeStart, double timeStep,
                                                      double tau, double N0, double bkg) {
  //dynamic shared memory, used as: numpar doubles | numfunc doubles | nummap ints
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double *f = (double*)&smem[numpar];
  int *m = (int*)&smem[numpar + numfunc];

  //each thread of the block copies a strided share of par, funcv and map
  for (int i = threadIdx.x; i < numpar; i += blockDim.x)
    p[i] = par[i];

  for (int i = threadIdx.x; i < numfunc; i += blockDim.x)
    f[i] = funcv[i];

  for (int i = threadIdx.x; i < nummap; i += blockDim.x)
    m[i] = map[i];

  //shared copies must be complete before any thread evaluates the theory
  __syncthreads();

  //every thread starts at its global index and advances by the total thread count
  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += gridDim.x * blockDim.x) {

    double t = timeStart + j*timeStep;
    double ldata = data[j];
    double lerr = err[j];

    double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;

#ifdef MLH
    //likelihood term; the log part is skipped for tiny data or theory values
    if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
    else
      chisq[j] = 2.0 * (theo - ldata);
#else
    //chi-square term; bins with zero error store theo^2
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif

  }
}
|
||||
|
||||
//-----------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* Kernel to calculate theory function and chisquare/mle values for asymmetry fits.
|
||||
*/
|
||||
extern "C" __global__ void kernelChiSquareAsymmetry(double *data, double *err, double *par,
                                                    double *chisq, int *map, double *funcv, int length,
                                                    int numpar, int numfunc, int nummap,
                                                    double timeStart, double timeStep,
                                                    double alpha, double beta) {
  //dynamic shared memory, used as: numpar doubles | numfunc doubles | nummap ints
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double *f = (double*)&smem[numpar];
  int *m = (int*)&smem[numpar + numfunc];

  //each thread of the block copies a strided share of par, funcv and map
  for (int i = threadIdx.x; i < numpar; i += blockDim.x)
    p[i] = par[i];

  for (int i = threadIdx.x; i < numfunc; i += blockDim.x)
    f[i] = funcv[i];

  for (int i = threadIdx.x; i < nummap; i += blockDim.x)
    m[i] = map[i];

  //shared copies must be complete before any thread evaluates the theory
  __syncthreads();

  //every thread starts at its global index and advances by the total thread count
  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += gridDim.x * blockDim.x) {

    double t = timeStart + j*timeStep;
    double ldata = data[j];
    double lerr = err[j];

    double theoVal = fTheory(t, p, f, m);
    double alphaBeta = alpha*beta;

    //asymmetry built from the theory value and the alpha/beta factors
    double theo = ((alphaBeta+1.0)*theoVal - (alpha-1.0))/((alpha+1.0) - (alphaBeta-1.0)*theoVal);

#ifdef MLH
    chisq[j] = 0.0; // log max likelihood not defined here
#else
    //chi-square term; bins with zero error store theo^2
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif

  }
}
|
||||
|
861
src/DKSBase.cpp
Normal file
861
src/DKSBase.cpp
Normal file
@ -0,0 +1,861 @@
|
||||
#include "DKSBase.h"
|
||||
|
||||
#define API_OPENCL "OpenCL"
|
||||
#define API_CUDA "Cuda"
|
||||
#define API_OPENMP "OpenMP"
|
||||
|
||||
#define DEVICE_GPU "-gpu"
|
||||
#define DEVICE_CPU "-cpu"
|
||||
#define DEVICE_MIC "-mic"
|
||||
|
||||
//=====================================//
|
||||
//==========Private functions==========//
|
||||
//=====================================//
|
||||
|
||||
bool DKSBase::apiOpenCL() {
|
||||
|
||||
if (!m_api_set)
|
||||
return false;
|
||||
|
||||
if (strcmp(m_api_name, API_OPENCL) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DKSBase::apiCuda() {
|
||||
|
||||
if (!m_api_set)
|
||||
return false;
|
||||
|
||||
if (strcmp(m_api_name, API_CUDA) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DKSBase::apiOpenMP() {
|
||||
if (!m_api_set)
|
||||
return false;
|
||||
|
||||
if (strcmp(m_api_name, API_OPENMP) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DKSBase::deviceGPU() {
|
||||
if (!m_device_set)
|
||||
return false;
|
||||
if (strcmp(m_device_name, DEVICE_GPU) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DKSBase::deviceCPU() {
|
||||
if (!m_device_set)
|
||||
return false;
|
||||
if (strcmp(m_device_name, DEVICE_CPU) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DKSBase::deviceMIC() {
|
||||
if (!m_device_set)
|
||||
return false;
|
||||
if (strcmp(m_device_name, DEVICE_MIC) != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int DKSBase::loadOpenCLKernel(const char *kernel_name) {
|
||||
//load kernel
|
||||
char * kernel_file = new char[500];
|
||||
kernel_file[0] = '\0';
|
||||
strcat(kernel_file, OPENCL_KERNELS);
|
||||
strcat(kernel_file, kernel_name);
|
||||
int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) );
|
||||
delete[] kernel_file;
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
//=====================================//
|
||||
//==========Public functions===========//
|
||||
//=====================================//
|
||||
|
||||
/*
  Default constructor: no device, API or function selected, auto tuning and
  config usage off. Creates the backend objects for every API compiled in.
*/
DKSBase::DKSBase() {

  //nothing selected yet
  m_device_set = false;
  m_device_name = NULL;

  m_api_set = false;
  m_api_name = NULL;

  m_function_set = false;
  m_function_name = NULL;

  m_auto_tuning = false;
  m_use_config = false;

  //the base object of each API is created first; the others are built on it
#ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
#endif

#ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif

#ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
#endif

}
|
||||
|
||||
/*
  Constructor with API and device selection.
  The name pointers and "set" flags are initialized BEFORE setAPI/setDevice run:
  both setters test the flag and delete[] the old name, so calling them on
  uninitialized members is undefined behavior. A NULL argument leaves the
  corresponding selection unset.
*/
DKSBase::DKSBase(const char* api_name, const char* device_name) {

  m_device_name = NULL;
  m_api_name = NULL;
  m_function_name = NULL;

  m_device_set = false;
  m_api_set = false;
  m_function_set = false;

  m_auto_tuning = false;
  m_use_config = false;

  if (api_name != NULL)
    setAPI(api_name, strlen(api_name));
  if (device_name != NULL)
    setDevice(device_name, strlen(device_name));

#ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
#endif

#ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif

#ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
#endif

}
|
||||
|
||||
|
||||
/*
  Destructor: frees the name strings and the backend objects of every API
  compiled in. The base object of each API is deleted after the objects that
  were constructed from it.
*/
DKSBase::~DKSBase() {

  //delete[] on a NULL pointer is a no-op, so no explicit checks are needed
  delete[] m_device_name;
  delete[] m_api_name;
  delete[] m_function_name;

#ifdef DKS_CUDA
  delete cfft;
  delete cgreens;
  delete cchi;
  delete ccol;
  delete cbase;
#endif

#ifdef DKS_OPENCL
  delete oclfft;
  delete oclchi;
  delete oclcol;
  delete oclbase;
#endif

#ifdef DKS_MIC
  delete micfft;
  delete miccol;
  delete micgreens;
  delete micchi;
  delete micbase;
#endif

}
|
||||
|
||||
/*
|
||||
Name: setDevice
|
||||
Info: sets specific device to use. length specifies device_name string length (deprecated)
|
||||
Return: success or error code
|
||||
*/
|
||||
int DKSBase::setDevice(const char* device_name, int length) {
|
||||
|
||||
if (m_device_set)
|
||||
delete[] m_device_name;
|
||||
|
||||
int l = strlen(device_name);
|
||||
m_device_name = new char[l+1];
|
||||
|
||||
for (int i = 0; i < l; i++)
|
||||
m_device_name[i] = device_name[i];
|
||||
m_device_name[l] = '\0';
|
||||
|
||||
m_device_set = true;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Name: setAPI
|
||||
Info: sets specific api (OpenCL, CUDA, OpenACC, OpenMP) to use
|
||||
Return: success or error code
|
||||
*/
|
||||
int DKSBase::setAPI(const char* api_name, int length) {
|
||||
|
||||
if (m_api_set)
|
||||
delete[] m_api_name;
|
||||
|
||||
int l = strlen(api_name);
|
||||
m_api_name = new char[l+1];
|
||||
|
||||
for (int i = 0; i < l; i++)
|
||||
m_api_name[i] = api_name[i];
|
||||
m_api_name[l] = '\0';
|
||||
|
||||
m_api_set = true;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Name: getDevices
|
||||
Info: get all available devices
|
||||
Return: success or error code
|
||||
*/
|
||||
int DKSBase::getDevices() {
|
||||
|
||||
int ierr1 = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() );
|
||||
int ierr2 = CUDA_SAFECALL( cbase->cuda_getDevices() );
|
||||
int ierr3 = MIC_SAFECALL( micbase->mic_getDevices() );
|
||||
|
||||
if (ierr1 + ierr2 + ierr3 != DKS_SUCCESS)
|
||||
return DKS_ERROR;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int DKSBase::getDeviceCount(int &ndev) {
|
||||
ndev = 0;
|
||||
if (apiOpenCL())
|
||||
return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) );
|
||||
else if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) );
|
||||
else if (apiOpenMP())
|
||||
return DKS_ERROR;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::getDeviceName(std::string &device_name) {
|
||||
if (apiOpenCL())
|
||||
return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) );
|
||||
else if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) );
|
||||
else if (apiOpenMP())
|
||||
return DKS_ERROR;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::setDefaultDevice(int device) {
|
||||
std::cout << "Set device " << device << std::endl;
|
||||
if (apiOpenCL())
|
||||
return OPENCL_SAFECALL( oclbase->ocl_setDevice(device) );
|
||||
else if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_setDevice(device) );
|
||||
else if (apiOpenMP())
|
||||
return DKS_ERROR;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::getDeviceList(std::vector<int> &devices) {
|
||||
if (apiOpenCL())
|
||||
return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) );
|
||||
else if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices) );
|
||||
else if (apiOpenMP())
|
||||
return DKS_ERROR;
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
init device
|
||||
*/
|
||||
int DKSBase::initDevice() {
|
||||
|
||||
//if api is not set default is OpenCL
|
||||
if (!m_api_set) {
|
||||
setDevice("-gpu", 4);
|
||||
setAPI(API_OPENCL, 6);
|
||||
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||
} else {
|
||||
if (apiOpenCL()) {
|
||||
if (!m_device_set) {
|
||||
setDevice("-gpu", 4);
|
||||
setAPI(API_OPENCL, 6);
|
||||
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||
} else {
|
||||
setAPI(API_OPENCL, 6);
|
||||
return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
|
||||
}
|
||||
} else if (apiCuda()) {
|
||||
setDevice("-gpu", 4);
|
||||
setAPI(API_CUDA, 4);
|
||||
return CUDA_SAFECALL(DKS_SUCCESS);
|
||||
} else if (apiOpenMP()) {
|
||||
setDevice("-mic", 4);
|
||||
setAPI(API_OPENMP, 6);
|
||||
return MIC_SAFECALL(DKS_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
set up cuda, opencl and mic to allow async data transfer and kernel execution.
|
||||
name stream 'stolen' from cuda. opencl context ~ cuda stream.
|
||||
TODO: implementations for OpenCL and MIC still needed
|
||||
*/
|
||||
int DKSBase::createStream(int &streamId) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_createStream(streamId) );
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL( micbase->mic_createStream(streamId) );
|
||||
|
||||
DEBUG_MSG("Streams not enbled for this platforms jet");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* send device pointer to other processes */
|
||||
#ifdef DKS_MPI
|
||||
int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) {
|
||||
|
||||
if ( apiCuda() ) {
|
||||
#ifdef DKS_CUDA
|
||||
cudaError cerror;
|
||||
cudaIpcMemHandle_t shandle;
|
||||
cerror = cudaIpcGetMemHandle(&shandle, mem_ptr);
|
||||
MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error geting mem handle");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
#endif
|
||||
}
|
||||
else if (apiOpenMP()) {
|
||||
#ifdef DKS_MIC
|
||||
//BENI:
|
||||
DEBUG_MSG("No SendPointer for MIC is implemented");
|
||||
return DKS_ERROR;
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
DEBUG_MSG("Send device pointer not implemented on selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
return DKS_ERROR;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* receive device pointer */
|
||||
#ifdef DKS_MPI
|
||||
void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) {
|
||||
|
||||
void *mem_ptr;
|
||||
if (apiCuda()) {
|
||||
#ifdef DKS_CUDA
|
||||
cudaError cerror;
|
||||
cudaIpcMemHandle_t rhandle;
|
||||
MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm, NULL);
|
||||
cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error opening received handle");
|
||||
ierr = DKS_ERROR;
|
||||
}
|
||||
#endif
|
||||
return mem_ptr;
|
||||
}
|
||||
else if (apiOpenMP()) {
|
||||
#ifdef DKS_MIC
|
||||
//BENI:
|
||||
DEBUG_MSG("No ReceivePointer for MIC is implemented");
|
||||
return DKS_SUCCESS;
|
||||
#endif
|
||||
return mem_ptr;
|
||||
}
|
||||
else {
|
||||
ierr = DKS_ERROR;
|
||||
DEBUG_MSG("Receive device pointer not implemented for selected platform");
|
||||
return mem_ptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* close received handle */
|
||||
int DKSBase::closeHandle(void *mem_ptr) {
|
||||
|
||||
if (apiCuda()) {
|
||||
#ifdef DKS_CUDA
|
||||
cudaError cerror;
|
||||
cerror = cudaIpcCloseMemHandle(mem_ptr);
|
||||
if (cerror != cudaSuccess) {
|
||||
DEBUG_MSG("Error closing memory handle");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
#endif
|
||||
}
|
||||
|
||||
DEBUG_MSG("Memory handles not implemented for selected platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
/* sync device calls */
|
||||
int DKSBase::syncDevice() {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( cbase->cuda_syncDevice() );
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL( micbase->mic_syncDevice() );
|
||||
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* setup fft plans to reuse if multiple ffts of same size are needed */
|
||||
int DKSBase::setupFFT(int ndim, int N[3]) {
|
||||
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
|
||||
} else if (apiOpenMP()) {
|
||||
//micbase.mic_setupFFT(ndim, N);
|
||||
//BENI: setting up RC and CR transformations on MIC
|
||||
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
|
||||
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
|
||||
if (ierr1 != DKS_SUCCESS)
|
||||
return ierr1;
|
||||
if (ierr2 != DKS_SUCCESS)
|
||||
return ierr2;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
//BENI:
|
||||
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
|
||||
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
//BENI:
|
||||
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
|
||||
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
/* call OpenCL FFT function for selected platform */
|
||||
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
|
||||
if (apiOpenCL()) {
|
||||
//load kernel and execute
|
||||
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
|
||||
else
|
||||
return DKS_ERROR;
|
||||
} else if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
|
||||
} else if (apiOpenMP()) {
|
||||
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* call OpenCL IFFT function for selected platform */
|
||||
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
if (apiOpenCL()) {
|
||||
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||
else
|
||||
return DKS_ERROR;
|
||||
} else if (apiCuda()) {
|
||||
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
|
||||
} else if (apiOpenMP()) {
|
||||
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* call normalize FFT function for selected platform */
|
||||
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
|
||||
if (apiOpenCL()) {
|
||||
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||
else
|
||||
return DKS_ERROR;
|
||||
} else if (apiCuda()) {
|
||||
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
|
||||
} else if (apiOpenMP()) {
|
||||
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* call real to complex FFT */
|
||||
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* call complex to real FFT */
|
||||
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
/* normalize complex to real iFFT */
|
||||
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/* call transpose for selected platform (only OpenCL is handled) */
|
||||
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
|
||||
if (apiOpenCL()) {
|
||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
|
||||
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selected platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
||||
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||
hz_m0, hz_m1, hz_m2, streamId) );
|
||||
} else if (apiOpenMP()) {
|
||||
//BENI:
|
||||
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||
int I, int J, int K, int streamId) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
|
||||
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
|
||||
double fTimeResolution, double fRebin,
|
||||
int sensors, int length, int numpar, double &result)
|
||||
{
|
||||
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||
fTimeResolution, fRebin,
|
||||
sensors, length, numpar,
|
||||
result));
|
||||
} else if (apiOpenCL()) {
|
||||
|
||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||
fTimeResolution, fRebin,
|
||||
sensors, length, numpar, result));
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result)
|
||||
{
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||
fTimeResolution, fRebin, fGoodBinOffset,
|
||||
sensors, length, numpar,
|
||||
result));
|
||||
} else if (apiOpenCL()) {
|
||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||
fTimeResolution, fRebin, fGoodBinOffset,
|
||||
sensors, length, numpar, result));
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result)
|
||||
{
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||
fTimeResolution, fRebin, fGoodBinOffset,
|
||||
sensors, length, numpar,
|
||||
result));
|
||||
} else if (apiOpenCL()) {
|
||||
|
||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||
fTimeResolution, fRebin, fGoodBinOffset,
|
||||
sensors, length, numpar, result));
|
||||
else
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||
int numparticles, int numparams,
|
||||
int &numaddback, int &numdead)
|
||||
{
|
||||
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||
} else if (apiOpenCL()) {
|
||||
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
|
||||
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||
else
|
||||
return DKS_ERROR;
|
||||
|
||||
} else if (apiOpenMP()) {
|
||||
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||
}
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
|
||||
{
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles)
|
||||
{
|
||||
|
||||
if (apiOpenMP()) {
|
||||
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
|
||||
rx_ptr, ry_ptr, rz_ptr,
|
||||
px_ptr, py_ptr, pz_ptr,
|
||||
par_ptr, numparticles) );
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
|
||||
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
|
||||
{
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles, int &numaddback)
|
||||
{
|
||||
|
||||
if (apiOpenMP()) {
|
||||
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
|
||||
rx_ptr, ry_ptr, rz_ptr,
|
||||
px_ptr, py_ptr, pz_ptr,
|
||||
par_ptr, numparticles, numaddback));
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int DKSBase::callInitRandoms(int size) {
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(cbase->cuda_createCurandStates(size));
|
||||
else if (apiOpenCL())
|
||||
return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(micbase->mic_createRandStreams(size));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
||||
void *dt_ptr, double dt, double c,
|
||||
bool usedt, int streamId)
|
||||
{
|
||||
|
||||
if (apiCuda())
|
||||
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
|
||||
usedt, streamId));
|
||||
else if (apiOpenMP())
|
||||
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
|
||||
c, usedt, streamId));
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
||||
|
||||
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||
void *lastSec_ptr, void *orient_ptr,
|
||||
int npart, int nsec, void *dt_ptr, double dt,
|
||||
double c, bool usedt, int streamId)
|
||||
{
|
||||
|
||||
if (apiCuda()) {
|
||||
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||
lastSec_ptr, orient_ptr,
|
||||
npart, nsec, dt_ptr, dt,
|
||||
c, usedt, streamId));
|
||||
} else if (apiOpenMP()) {
|
||||
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||
lastSec_ptr, orient_ptr,
|
||||
npart, nsec, dt_ptr, dt,
|
||||
c, usedt, streamId));
|
||||
}
|
||||
|
||||
DEBUG_MSG("No implementation for selceted platform");
|
||||
return DKS_ERROR;
|
||||
|
||||
}
|
1133
src/DKSBase.h
Normal file
1133
src/DKSBase.h
Normal file
File diff suppressed because it is too large
Load Diff
196
src/DKSBaseMuSR.cpp
Normal file
196
src/DKSBaseMuSR.cpp
Normal file
@ -0,0 +1,196 @@
|
||||
#include "DKSBaseMuSR.h"
|
||||
|
||||
/** Start without a chi-square runtime; -1 marks that no problem size has been configured yet. */
DKSBaseMuSR::DKSBaseMuSR()
  : chiSq(nullptr),
    chiSquareSize_m(-1)
{
}
|
||||
|
||||
/** Release the chi-square runtime, if one was created. */
DKSBaseMuSR::~DKSBaseMuSR()
{
  //return value is intentionally ignored, a destructor cannot report it
  freeChiSquare();
}
|
||||
|
||||
int DKSBaseMuSR::callCompileProgram(std::string function, bool mlh) {
|
||||
return chiSq->compileProgram(function, mlh);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::callLaunchChiSquare(int fitType,
|
||||
void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result)
|
||||
{
|
||||
|
||||
|
||||
//if we are not auto tuning and the size of the problem has changed find the new parameters
|
||||
//from autotuning config file
|
||||
if (!isAutoTuningOn() && length != chiSquareSize_m) {
|
||||
int numBlocks, blockSize;
|
||||
std::string device_name;
|
||||
getDeviceName(device_name);
|
||||
dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
|
||||
length, "NumBlocks", numBlocks);
|
||||
dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
|
||||
length, "BlockSize", blockSize);
|
||||
chiSq->setKernelParams(numBlocks, blockSize);
|
||||
|
||||
//std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
|
||||
|
||||
chiSquareSize_m = length;
|
||||
}
|
||||
|
||||
int ierr = chiSq->launchChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc,
|
||||
nummap, timeStart, timeStep, result);
|
||||
|
||||
if ( isAutoTuningOn() ) {
|
||||
std::vector<int> config;
|
||||
callAutoTuningChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, nummap, timeStart,
|
||||
timeStep, result, config);
|
||||
}
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result, std::vector<int> &config)
|
||||
{
|
||||
|
||||
int loops = 100;
|
||||
DKSAutoTuning *autoTuning;
|
||||
if (apiCuda())
|
||||
autoTuning = new DKSAutoTuning(this, API_CUDA, DEVICE_GPU_NEW, loops);
|
||||
else if (apiOpenCL() && deviceGPU())
|
||||
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_GPU_NEW, loops);
|
||||
else if (apiOpenCL() && deviceCPU())
|
||||
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_CPU_NEW, loops);
|
||||
else if (apiOpenCL() && deviceMIC())
|
||||
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_MIC_NEW, loops);
|
||||
else
|
||||
autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW, loops);
|
||||
|
||||
|
||||
int maxThreadsPerBlock = 1024;
|
||||
checkMuSRKernels(fitType, maxThreadsPerBlock);
|
||||
std::cout << "Max threads for autotune " << maxThreadsPerBlock << std::endl;
|
||||
|
||||
//create the function to be timed
|
||||
std::function<int()> f = std::bind(&ChiSquareRuntime::launchChiSquare, chiSq,
|
||||
fitType, mem_data, mem_err, length, numpar, numfunc, nummap,
|
||||
timeStart, timeStep, result);
|
||||
autoTuning->setFunction(f, "launchChiSquare");
|
||||
|
||||
//create the parameters for auto-tuning
|
||||
autoTuning->addParameter(&chiSq->blockSize_m, 32, maxThreadsPerBlock, 32, "BlockSize");
|
||||
autoTuning->addParameter(&chiSq->numBlocks_m, 100, 5000, 100, "NumBlocks");
|
||||
|
||||
autoTuning->lineSearch();
|
||||
|
||||
//autoTuning->hillClimbing(100);
|
||||
|
||||
//autoTuning->simulatedAnnealing(1e-3, 1e-6);
|
||||
|
||||
//autoTuning->exaustiveSearch();
|
||||
|
||||
std::string device_name;
|
||||
getDeviceName(device_name);
|
||||
dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
|
||||
"NumBlocks", chiSq->numBlocks_m);
|
||||
dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
|
||||
"BlockSize", chiSq->blockSize_m);
|
||||
|
||||
|
||||
config.push_back(chiSq->blockSize_m);
|
||||
config.push_back(chiSq->numBlocks_m);
|
||||
|
||||
delete autoTuning;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::testAutoTuning() {
|
||||
|
||||
DKSAutoTuning *autoTuning;
|
||||
DKSAutoTuningTester *tester;
|
||||
|
||||
autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW);
|
||||
tester = new DKSAutoTuningTester();
|
||||
|
||||
std::function<double()> f = std::bind(&DKSAutoTuningTester::peaksZ, tester);
|
||||
autoTuning->setFunction(f, "testAutoTuner", false);
|
||||
|
||||
autoTuning->addParameter(&tester->x, -3.0, 3.0, 0.5, "x");
|
||||
autoTuning->addParameter(&tester->y, -3.0, 3.0, 0.5, "y");
|
||||
|
||||
autoTuning->exaustiveSearch();
|
||||
|
||||
autoTuning->hillClimbing(10);
|
||||
|
||||
autoTuning->simulatedAnnealing(10, 0.0005);
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::callSetConsts(double N0, double tau, double bkg) {
|
||||
return chiSq->setConsts(N0, tau, bkg);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::callSetConsts(double alpha, double beta) {
|
||||
return chiSq->setConsts(alpha, beta);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int size_map) {
|
||||
int ierr;
|
||||
|
||||
if (apiCuda()) {
|
||||
ierr = CUDA_SAFECALL( DKS_SUCCESS );
|
||||
chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
|
||||
} else {
|
||||
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
|
||||
chiSq = OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
|
||||
}
|
||||
|
||||
if (ierr == DKS_SUCCESS) {
|
||||
return chiSq->initChiSquare(size_data, size_param, size_func, size_map);
|
||||
} else {
|
||||
DEBUG_MSG("DKS API not set, or DKS compiled without sellected API support");
|
||||
return DKS_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::freeChiSquare() {
|
||||
int ierr = DKS_SUCCESS;
|
||||
if (chiSq != NULL) {
|
||||
ierr = chiSq->freeChiSquare();
|
||||
delete chiSq;
|
||||
chiSq = NULL;
|
||||
}
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::writeParams(const double *params, int numparams) {
|
||||
return chiSq->writeParams(params, numparams);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::writeFunctions(const double *func, int numfunc) {
|
||||
return chiSq->writeFunc(func, numfunc);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::writeMaps(const int *map, int numfunc) {
|
||||
return chiSq->writeMap(map, numfunc);;
|
||||
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::checkMuSRKernels(int fitType) {
|
||||
int threadsPerBlock = 1;
|
||||
return chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::checkMuSRKernels(int fitType, int &threadsPerBlock) {
|
||||
return chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
|
||||
}
|
||||
|
||||
int DKSBaseMuSR::getOperations(int &oper) {
|
||||
return chiSq->getOperations(oper);
|
||||
}
|
137
src/DKSBaseMuSR.h
Normal file
137
src/DKSBaseMuSR.h
Normal file
@ -0,0 +1,137 @@
|
||||
#ifndef H_DKS_BASEMUSR
|
||||
#define H_DKS_BASEMUSR
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "AutoTuning/DKSAutoTuning.h"
|
||||
#include "AutoTuning/DKSAutoTuningTester.h"
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include "Algorithms/ChiSquareRuntime.h"
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
#include "CUDA/CudaChiSquareRuntime.cuh"
|
||||
#endif
|
||||
|
||||
#ifdef DKS_OPENCL
|
||||
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
||||
#endif
|
||||
|
||||
class DKSBaseMuSR : public DKSBase {
|
||||
|
||||
private:
|
||||
|
||||
ChiSquareRuntime *chiSq;
|
||||
|
||||
int chiSquareSize_m;
|
||||
|
||||
public:
|
||||
|
||||
DKSBaseMuSR();
|
||||
|
||||
~DKSBaseMuSR();
|
||||
|
||||
/** Compile the program with kernels to be run.
|
||||
* String function contains the string that will be added to the code to compile in the
|
||||
* function: __device__ double fTheory(double t, double *p, double *f, int *m);
|
||||
* Function string must be a valid C math expression. It can contain operators, math functions
|
||||
* and predefined functions listed in:
|
||||
* http://lmu.web.psi.ch/musrfit/user/MUSR/MusrFit.html#A_4.3_The_THEORY_Block
|
||||
* Predifined functions can be accessed by the abbreviation given in the table
|
||||
* Parameters can be accesed in form p[idx] or p[m[idx]] - where p represents parameter array
|
||||
* m represents map array and idx is the index to use from the maps. Precalculated function
|
||||
* values can be accessed the same way - f[idx] or f[m[idx]]. Returns DKS_SUCCESS if everythin
|
||||
* runs successfully, otherwise returns DKS_ERROR. If DKS is compiled with debug flag enabled
|
||||
* prints DKS error message in case something fails
|
||||
*/
|
||||
int callCompileProgram(std::string function, bool mlh = false);
|
||||
|
||||
/** Launch chi square calculation on data set writen in mem_data memory on device.
|
||||
* mem_par, mem_map and mem_func hold pointers to parameter, function and map values
|
||||
* for this data set (parameter array is one for all the data sets, maps and functions
|
||||
* change between data sets). Resulting chi square value for this dataset will be put in
|
||||
* result variable. Returns DKS_SUCCESS if everythin runs successfully, otherwise returns
|
||||
* DKS_ERROR. If DKS is compiled with debug flag enabled prints DKS error message in case
|
||||
* something fails
|
||||
*/
|
||||
int callLaunchChiSquare(int fitType,
|
||||
void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result);
|
||||
|
||||
/** Launch auto-tuning of chisquare function for the selected device.
|
||||
* Creates a function pointer to callLaunchChiSquare with necessary arguments bind to
|
||||
* function call. CUDA and OpenCL version - gives AutoTuning class access to numThreads
|
||||
* parameter which is varied to find the optimal value by AutoTuning class. Uses brute force
|
||||
* method to test all the values.
|
||||
*/
|
||||
int callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result, std::vector<int> &config);
|
||||
|
||||
/** Set N0, tau and BKG values for the run.
|
||||
* Needs to be called before kernel launch if these values are changing
|
||||
*/
|
||||
int callSetConsts(double N0, double tau, double bkg);
|
||||
|
||||
/** Set alpha and beta values for the run.
|
||||
* Needs to be called before kernel launch if these values are changing
|
||||
*/
|
||||
int callSetConsts(double alpha, double beta);
|
||||
|
||||
/** Init chisquare calculations.
|
||||
* Size is the maximum number of elements in any of the data sets used.
|
||||
*/
|
||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||
|
||||
/** Free temporary device storage allocated for chi^2 kernel.
|
||||
* Return error code if freeing the device fails.
|
||||
*/
|
||||
int freeChiSquare();
|
||||
|
||||
/** Write params to device.
|
||||
* Write pramas from double array to device, params device memory is managed by DKS.
|
||||
*/
|
||||
int writeParams(const double *params, int numparams);
|
||||
|
||||
/** Write function values to device.
|
||||
* Write precalculated function values to device, memory for functions on device is handled
|
||||
* by DKS.
|
||||
*/
|
||||
int writeFunctions(const double *func, int numfunc);
|
||||
|
||||
/** Write map indexes to device.
|
||||
* Write map indexes to use in defined theory function to devive. Memory for map indexes is
|
||||
* handeld by DKS.
|
||||
*/
|
||||
int writeMaps(const int *map, int numfunc);
|
||||
|
||||
/** Check if device can run necessary kernels.
|
||||
* Check selected device properties to see if device
|
||||
* suports double precision and if device can run the
|
||||
* necessary number of work_items / work_groups to successfully
|
||||
* execute CUDA/OpenCL kernels.
|
||||
*/
|
||||
int checkMuSRKernels(int fitType);
|
||||
|
||||
/** Perform the same check as checkMuSRKernels(int fitType) and return max threads per block.
|
||||
* Used for autotuning to check what is the device limit for threads per block to correctly
|
||||
* set the upper bound when searching the parameter space.
|
||||
*/
|
||||
int checkMuSRKernels(int fitType, int &threadsPerBlock);
|
||||
|
||||
/** Debug function to test auto-tuning search functions
|
||||
*/
|
||||
int testAutoTuning();
|
||||
|
||||
/** Get the number of operations in compiled kernel.
|
||||
*/
|
||||
int getOperations(int &oper);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
71
src/DKSDefinitions.h
Normal file
71
src/DKSDefinitions.h
Normal file
@ -0,0 +1,71 @@
|
||||
#ifndef H_DKS_DEFINITIONS
#define H_DKS_DEFINITIONS

//API names
#define API_OPENCL "OpenCL"
#define API_CUDA "Cuda"
#define API_OPENMP "OpenMP"
#define API_UNKNOWN "Unknown"

//device names
#define DEVICE_GPU_NEW "GPU"
#define DEVICE_CPU_NEW "CPU"
#define DEVICE_MIC_NEW "MIC"
#define DEVICE_UNKNOWN_NEW "Unknown"

//device flags
#define DEVICE_GPU "-gpu"
#define DEVICE_CPU "-cpu"
#define DEVICE_MIC "-mic"

//define macro for printing debug messages if debug flag is set
#ifdef DEBUG
//DEBUG_MSG writes to std::cout, so make sure it is declared wherever the macro is used
#include <iostream>
#define DEBUG_MSG(x) (std::cout << x << std::endl)
#else
#define DEBUG_MSG(x)
#endif

//define DKS error codes
#define DKS_SUCCESS 0
#define DKS_ERROR 1
#define DKS_API_NOT_ENABLED 100

#define OCL_SUCCESS 0
#define OCL_ERROR 1

//define macros to enable or disable calls to specific frameworks
//if the framework specific flag is set execute the statement, if not give DKS_API_NOT_ENABLED error
#ifdef DKS_CUDA
#define CUDA_SAFECALL(...) ( __VA_ARGS__ )
#else
#define CUDA_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif

#ifdef DKS_OPENCL
#define OPENCL_SAFECALL(...) ( __VA_ARGS__ )
#else
#define OPENCL_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif

#ifdef DKS_MIC
#define MIC_SAFECALL(...) ( __VA_ARGS__ )
#else
#define MIC_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif

//define macros for creating framework specific objects
//if the framework specific flag is set evaluate the expression, if not give NULL
//variadic like the SAFECALL macros, so expressions with top-level commas are accepted
#ifdef DKS_CUDA
#define CUDA_SAFEINIT(...) ( __VA_ARGS__ )
#else
#define CUDA_SAFEINIT(...) ( NULL )
#endif

#ifdef DKS_OPENCL
#define OPENCL_SAFEINIT(...) ( __VA_ARGS__ )
#else
#define OPENCL_SAFEINIT(...) ( NULL )
#endif

#ifdef DKS_MIC
#define MIC_SAFEINIT(...) ( __VA_ARGS__ )
#else
#define MIC_SAFEINIT(...) ( NULL )
#endif

#endif
|
0
src/DKSDevice.cpp
Normal file
0
src/DKSDevice.cpp
Normal file
37
src/DKSDevice.h
Normal file
37
src/DKSDevice.h
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
Author: Uldis Locans
|
||||
|
||||
Info: class that holds information about the compute device
|
||||
|
||||
Date: 25.09.2014
|
||||
|
||||
*/
|
||||
|
||||
//include guard: without it a second inclusion redefines class Device
#ifndef H_DKS_DEVICE
#define H_DKS_DEVICE

//device type identifiers
#define DKS_DEVICE_TYPE_GPU 1
#define DKS_DEVICE_TYPE_MIC 2
#define DKS_DEVICE_TYPE_CPU 3

/** Holds information about a compute device. */
class Device {

private:
  int m_device_id;
  int m_device_type;
  char *m_device_name;
  char *m_device_vendor;

  //which frameworks the device supports
  bool m_sup_opencl;
  bool m_sup_cuda;
  bool m_sup_openmp;
  bool m_sup_openacc;

  int m_pci_bus_id;

public:

  Device();
  ~Device();

};

#endif
|
130
src/DKSImageReconstruction.cpp
Normal file
130
src/DKSImageReconstruction.cpp
Normal file
@ -0,0 +1,130 @@
|
||||
#include "DKSImageReconstruction.h"
|
||||
|
||||
/** Set up the base for CUDA on a GPU and create the CUDA image reconstruction object.
 * NOTE(review): without DKS_CUDA, CUDA_SAFEINIT yields NULL, so imageRecon stays
 * NULL and the forwarding methods would dereference it - confirm this is acceptable.
 */
DKSImageRecon::DKSImageRecon() {

  //reconstruction always uses CUDA, so the base is set up for CUDA
  setAPI(API_CUDA);
  setDevice(DEVICE_GPU);
  initDevice();

  imageRecon = CUDA_SAFEINIT( new CudaImageReconstruction(getCudaBase()) );
}
|
||||
|
||||
/** Destroy the image reconstruction object created in the constructor. */
DKSImageRecon::~DKSImageRecon() {
  //allocated with scalar new in the constructor, so it must be released with
  //scalar delete; delete[] here was undefined behaviour
  delete imageRecon;
}
|
||||
|
||||
int DKSImageRecon::callCalculateSource(void *image_space, void *image_position,
|
||||
void *source_position, void *avg, void *std,
|
||||
float diameter, int total_voxels,
|
||||
int total_sources, int start)
|
||||
{
|
||||
int ierr;
|
||||
ierr = imageRecon->calculateSource(image_space, image_position, source_position,
|
||||
avg, std, diameter, total_voxels,
|
||||
total_sources, start);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::callCalculateBackground(void *image_space, void *image_position,
|
||||
void *source_position, void *avg, void *std,
|
||||
float diameter, int total_voxels,
|
||||
int total_sources, int start)
|
||||
{
|
||||
|
||||
int ierr;
|
||||
ierr = imageRecon->calculateBackground(image_space, image_position,
|
||||
source_position, avg, std, diameter,
|
||||
total_voxels, total_sources, start);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::callCalculateSources(void *image_space, void *image_position,
|
||||
void *source_position, void *avg, void *std,
|
||||
void *diameter, int total_voxels,
|
||||
int total_sources, int start)
|
||||
{
|
||||
int ierr;
|
||||
ierr = imageRecon->calculateSources(image_space, image_position,
|
||||
source_position, avg, std, diameter,
|
||||
total_voxels, total_sources, start);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::callCalculateBackgrounds(void *image_space, void *image_position,
|
||||
void *source_position, void *avg, void *std,
|
||||
void *diameter, int total_voxels,
|
||||
int total_sources, int start)
|
||||
{
|
||||
|
||||
int ierr;
|
||||
ierr = imageRecon->calculateBackgrounds(image_space, image_position,
|
||||
source_position, avg, std, diameter,
|
||||
total_voxels, total_sources, start);
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
int DKSImageRecon::callGenerateNormalization(void *recon, void *image_position,
|
||||
void *det_position, int total_det)
|
||||
{
|
||||
|
||||
int ierr = imageRecon->generateNormalization(recon, image_position,
|
||||
det_position, total_det);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
int DKSImageRecon::callForwardProjection(void *correction, void *recon, void *list_data,
|
||||
void *det_position, void *image_position, int num_events)
|
||||
{
|
||||
|
||||
int ierr;
|
||||
ierr = imageRecon->forwardProjection(correction, recon, list_data, det_position,
|
||||
image_position, num_events);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
|
||||
void *det_position, void *image_position,
|
||||
int num_events, int num_voxels)
|
||||
{
|
||||
|
||||
int ierr;
|
||||
ierr = imageRecon->backwardProjection(correction, recon_corrector, list_data,
|
||||
det_position, image_position, num_events,
|
||||
num_voxels);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) {
|
||||
int ierr = imageRecon->setDimensions(voxel_x, voxel_y, voxel_z, voxel_size);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::setEdge(float x_edge, float y_edge, float z_edge) {
|
||||
int ierr = imageRecon->setEdge(x_edge, y_edge, z_edge);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) {
|
||||
int ierr = imageRecon->setEdge1(x_edge1, y_edge1, z_edge1, z_edge2);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::setMinCrystalInRing(float min_CrystalDist_InOneRing,
|
||||
float min_CrystalDist_InOneRing1)
|
||||
{
|
||||
int ierr = imageRecon->setMinCrystalInRing(min_CrystalDist_InOneRing,
|
||||
min_CrystalDist_InOneRing1);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int DKSImageRecon::setParams(float matrix_distance_factor, float phantom_diameter,
|
||||
float atten_per_mm, float ring_diameter)
|
||||
{
|
||||
int ierr = imageRecon->setParams(matrix_distance_factor, phantom_diameter,
|
||||
atten_per_mm, ring_diameter);
|
||||
return ierr;
|
||||
}
|
120
src/DKSImageReconstruction.h
Normal file
120
src/DKSImageReconstruction.h
Normal file
@ -0,0 +1,120 @@
|
||||
#ifndef H_DKS_IMAGERECONSTRUCTION
|
||||
#define H_DKS_IMAGERECONSTRUCTION
|
||||
|
||||
#include <iostream>
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include "Algorithms/ImageReconstruction.h"
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
#include "CUDA/CudaImageReconstruction.cuh"
|
||||
#endif
|
||||
|
||||
class DKSImageRecon : public DKSBase {
|
||||
|
||||
private:
|
||||
|
||||
ImageReconstruction *imageRecon;
|
||||
|
||||
public:
|
||||
|
||||
DKSImageRecon();
|
||||
|
||||
~DKSImageRecon();
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int callCalculateSource(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int callCalculateBackground(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, float diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int callCalculateSources(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, void *diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction analaysis calculate source.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
|
||||
void *avg, void *std, void *diameter, int total_voxels,
|
||||
int total_sources, int start = 0);
|
||||
|
||||
/** Image reconstruction - generate normalization.
|
||||
*
|
||||
*/
|
||||
int callGenerateNormalization(void *recon, void *image_position,
|
||||
void *det_position, int total_det);
|
||||
|
||||
/** Image reconstruction - forward correction.
|
||||
*
|
||||
*/
|
||||
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
||||
void *image_position, int num_events);
|
||||
|
||||
/** Image reconstruction - backward projection.
|
||||
*
|
||||
*/
|
||||
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
|
||||
void *det_position, void *image_position,
|
||||
int num_events, int num_voxels);
|
||||
|
||||
/** Set the voxel dimensins on device.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
|
||||
|
||||
/** Set the image edge.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setEdge(float x_edge, float y_edge, float z_edge);
|
||||
|
||||
/** Set the image edge1.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
|
||||
|
||||
/** Set the minimum crystan in one ring values.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
|
||||
|
||||
/** Set all other required parameters for reconstruction.
|
||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||
* Call set function once to transfer the values from host side to GPU.
|
||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||
*/
|
||||
int setParams(float matrix_distance_factor, float phantom_diameter,
|
||||
float atten_per_mm, float ring_diameter);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
24
src/DKSStream.h
Normal file
24
src/DKSStream.h
Normal file
@ -0,0 +1,24 @@
|
||||
/*
|
||||
Author: Uldis Locans
|
||||
|
||||
Date: 12.12.2014
|
||||
|
||||
Comment: depending on the device used, create different CUDA streams or OpenCL contexts (MIC: not decided yet)
|
||||
that allow handling of asynchronous data transfer and kernel execution on the device
|
||||
|
||||
*/
|
||||
|
||||
#ifndef H_DKSSTREAM
#define H_DKSSTREAM

#define DKS_SUCCESS 0
#define DKS_ERROR 1

#include <iostream>
#include <cuda_runtime.h>

/** Placeholder for the device stream abstraction; no members are defined yet. */
class DKSStream {

};  //the terminating semicolon was missing

//closes H_DKSSTREAM; the include guard was left unterminated
#endif
|
25
src/MIC/CMakeLists.txt
Normal file
25
src/MIC/CMakeLists.txt
Normal file
@ -0,0 +1,25 @@
|
||||
# Source files of the MIC backend.
set(_SRCS
  MICBase.cpp
  MICChiSquare.cpp
  MICFFT.cpp
  MICGreensFunction.cpp
  MICCollimatorPhysics.cpp
)

# Header files of the MIC backend.
set(_HDRS
  MICBase.h
  MICChiSquare.h
  MICFFT.h
  MICCollimatorPhysics.h
  MICGreensFunction.hpp
  MICMergeSort.h
)

# add_sources / add_headers are not built-in CMake commands; they are
# presumably defined by the parent project - confirm there.
add_sources(${_SRCS})
add_headers(${_HDRS})

# Install the headers under include/MIC.
install(FILES ${_HDRS} DESTINATION include/MIC)
|
124
src/MIC/MICBase.cpp
Normal file
124
src/MIC/MICBase.cpp
Normal file
@ -0,0 +1,124 @@
|
||||
#include "MICBase.h"
|
||||
|
||||
//constructor, sets default device id equal to 0
|
||||
MICBase::MICBase()
  : defaultRndSet(-1),  //-1 marks that no default rand streams are allocated
    m_device_id(0)      //default device id
{
}
|
||||
|
||||
//destructor, delete defaultrnd streams if they are set
|
||||
MICBase::~MICBase()
{
  //return value is intentionally ignored, a destructor cannot report it
  mic_deleteRandStreams();
}
|
||||
|
||||
|
||||
//create default rand streams
|
||||
int MICBase::mic_createRandStreams(int size) {
|
||||
|
||||
int seed = time(NULL);
|
||||
|
||||
#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
|
||||
{
|
||||
|
||||
//get the number of threads
|
||||
int numThreads;
|
||||
|
||||
#pragma omp parallel
|
||||
numThreads = omp_get_num_threads();
|
||||
|
||||
//if default rnd stream already allocated delete the array
|
||||
if (defaultRndSet == 1)
|
||||
delete[] defaultRndStream;
|
||||
|
||||
//allocate defaultRndStream array
|
||||
defaultRndStream = new VSLStreamStatePtr[numThreads];
|
||||
|
||||
//create stream states for each thread
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < omp_get_num_threads(); i++)
|
||||
vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
|
||||
|
||||
defaultRndSet = 1;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
//delete default rand streams
|
||||
int MICBase::mic_deleteRandStreams() {
|
||||
|
||||
#pragma offload target(mic:m_device_id) inout(defaultRndSet)
|
||||
{
|
||||
if (defaultRndSet == 1) {
|
||||
delete[] defaultRndStream;
|
||||
defaultRndSet = -1;
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
//create a new signal for the mic
|
||||
int MICBase::mic_createStream(int & streamId) {
|
||||
|
||||
//use int as signal, create a new int in micStreams vector, return the id
|
||||
int tmpStream = micStreams.size();
|
||||
micStreams.push_back(tmpStream);
|
||||
streamId = micStreams.size() - 1;
|
||||
|
||||
//empty offload to create the signal on the mic
|
||||
/*
|
||||
#pragma offload target(mic:m_device_id) signal(mic_getStream(streamId))
|
||||
{
|
||||
}
|
||||
*/
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//get the signal from the vector
|
||||
int& MICBase::mic_getStream(int id) {
|
||||
return micStreams[id];
|
||||
}
|
||||
|
||||
//delete streams
|
||||
int MICBase::mic_deleteStreams() {
|
||||
micStreams.clear();
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
//sets device id
|
||||
int MICBase::mic_setDeviceId(int id) {
|
||||
m_device_id = id;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//get information about all available mic devices
|
||||
//TODO: find a way to check system for available mic devices
|
||||
|
||||
int MICBase::mic_getDevices() {
|
||||
|
||||
int devices = _Offload_number_of_devices();
|
||||
int thread_count = 0;
|
||||
|
||||
std::cout << "==============================" << std::endl;
|
||||
std::cout << "==========Intel MICs==========" << std::endl;
|
||||
std::cout << "==============================" << std::endl;
|
||||
|
||||
std::cout << "Total mic devices: " << devices << std::endl;
|
||||
//std::cout << "Total mic devices: currently cant be found, but it's 1 on kraftwerk" << std::endl;
|
||||
|
||||
#pragma offload target(mic:m_device_id) inout(thread_count)
|
||||
{
|
||||
thread_count = omp_get_max_threads();
|
||||
}
|
||||
|
||||
std::cout << "Max threads: " << thread_count << std::endl;
|
||||
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
244
src/MIC/MICBase.h
Normal file
244
src/MIC/MICBase.h
Normal file
@ -0,0 +1,244 @@
|
||||
/*
|
||||
|
||||
Name: MIC Base
|
||||
Author: Uldis Locans
|
||||
Info: class to handle set up and data transfer from host to Intel MIC devices
|
||||
Date: 29.09.2014
|
||||
|
||||
*/
|
||||
#ifndef H_MIC_BASE
|
||||
#define H_MIC_BASE
|
||||
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <offload.h>
|
||||
#include <mkl_dfti.h>
|
||||
#include <mkl_vsl.h>
|
||||
#include <vector>
|
||||
#include <time.h>
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
#define DKS_ALLOC alloc_if(1)
|
||||
#define DKS_FREE free_if(1)
|
||||
#define DKS_RETAIN free_if(0)
|
||||
#define DKS_REUSE alloc_if(0)
|
||||
|
||||
#define MIC_WIDTH 128
|
||||
|
||||
class MICBase {
|
||||
|
||||
private:
|
||||
std::vector<int> micStreams;
|
||||
|
||||
protected:
|
||||
|
||||
|
||||
int defaultRndSet;
|
||||
|
||||
public:
|
||||
VSLStreamStatePtr *defaultRndStream;
|
||||
int m_device_id;
|
||||
|
||||
/* constructor */
|
||||
MICBase();
|
||||
|
||||
/* destructor */
|
||||
~MICBase();
|
||||
|
||||
/*
|
||||
Info: create MKL rand streams for each thread
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_createRandStreams(int size);
|
||||
|
||||
/*
|
||||
Info: delete MKL rand streams
|
||||
Return: succes or error code
|
||||
*/
|
||||
int mic_deleteRandStreams();
|
||||
|
||||
/*
|
||||
Info: create a new signal for the mic
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_createStream(int & streamId);
|
||||
|
||||
/*
|
||||
Info: get the signal from the vector
|
||||
Return: mic signal
|
||||
*/
|
||||
int& mic_getStream(int id);
|
||||
|
||||
/*
|
||||
Info: delete streams
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_deleteStreams();
|
||||
|
||||
/*
|
||||
Info: set device id
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_setDeviceId(int id);
|
||||
|
||||
/*
|
||||
Info: get mic devices
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_getDevices();
|
||||
|
||||
/*
|
||||
Info: allocate memory on MIC device
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
void * mic_allocateMemory(int size) {
|
||||
|
||||
int padding = size % MIC_WIDTH;
|
||||
int totalsize = size + padding;
|
||||
|
||||
T *tmp = (T*)_mm_malloc(sizeof(T)*totalsize, 64); // = new T[size];
|
||||
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp:length(totalsize) DKS_ALLOC DKS_RETAIN)
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: transfer data to device
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
T* tmp_data = (T*)data;
|
||||
|
||||
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: write data to device, non-blocking
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
|
||||
{
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
T* tmp_data = (T*)data;
|
||||
|
||||
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Info: read data from device
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
T* tmp_result = (T*)result;
|
||||
|
||||
//std::cout << "try to read data with size = " << size << " adn offset = " << offset << std::endl;
|
||||
#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) )
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: read data from device waiting for signal
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_readDataAsync(const void * data_ptr, void * result, int size,
|
||||
int streamId = -1, int offset = 0) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
T* tmp_result = (T*)result;
|
||||
|
||||
#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) )
|
||||
{
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Info: wait till all the signals are complete
|
||||
Return siccess or error code
|
||||
*/
|
||||
int mic_syncDevice() {
|
||||
|
||||
//empty offload to wait for all the signals to finish and launch a new empy signal
|
||||
/*
|
||||
for (int i = 0; i < micStreams.size(); i++) {
|
||||
#pragma offload target(mic:m_device_id) wait(mic_getStream(i)) signal(mic_getStream(i))
|
||||
{
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
//std::cout << "done read data" << std::endl;
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Info: free memory on device
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_freeMemory(void * data_ptr, int size) {
|
||||
|
||||
int padding = size % MIC_WIDTH;
|
||||
int totalsize = size + padding;
|
||||
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
|
||||
{
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: allocate memory and write data to device
|
||||
Return: success or error code
|
||||
*/
|
||||
template<typename T>
|
||||
void * mic_pushData(const void * data, int size) {
|
||||
T* tmp_ptr = new T[size];
|
||||
T* tmp_data = (T*)data;
|
||||
|
||||
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_ALLOC DKS_RETAIN
|
||||
into(tmp_ptr[0:size]) )
|
||||
{
|
||||
}
|
||||
|
||||
return tmp_ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
Info: read data and free memory on device
|
||||
Return: success or erro code
|
||||
*/
|
||||
template<typename T>
|
||||
int mic_pullData(void * data_ptr, void * result, int size) {
|
||||
T* tmp_ptr = (T*)data_ptr;
|
||||
T* tmp_data = (T*)result;
|
||||
|
||||
#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[0:size] : DKS_REUSE DKS_FREE into(tmp_data[0:size]) )
|
||||
{
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
93
src/MIC/MICChiSquare.cpp
Normal file
93
src/MIC/MICChiSquare.cpp
Normal file
@ -0,0 +1,93 @@
|
||||
#include "MICChiSquare.h"
|
||||
|
||||
/*
|
||||
calculate chi^2 on intel mic, use data already loaded on device
|
||||
*/
|
||||
int MICChiSquare::mic_chi2(double *O, double *E, double *result, int size) {
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
in(O:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(E:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(result:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < size; i++) {
|
||||
result[i] = pow(O[i] - E[i], 2) / E[i];
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
calculate function N(t), use data already loaded on device
|
||||
*/
|
||||
int MICChiSquare::mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT) {
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
in(nt:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(p:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(psize) in(nsize) in(jsize) in(deltaT)
|
||||
{
|
||||
|
||||
double gamma = 0.01; //???
|
||||
double tau = 0.01; //???
|
||||
|
||||
for (int j = 0; j < jsize; j++) {
|
||||
|
||||
int pid = j*psize;
|
||||
double N0 = p[pid];
|
||||
double Nbkg = p[pid+1];
|
||||
double A0 = p[pid+2];
|
||||
double phi = p[pid+3];
|
||||
double sigma = p[pid+4];
|
||||
double B = p[pid+5];
|
||||
|
||||
int idj = j*nsize;
|
||||
|
||||
double a1 = -0.5*sigma*sigma;
|
||||
double b1 = gamma*B;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int n = 0; n < nsize; n++) {
|
||||
|
||||
int id = idj + n;
|
||||
double t = n*deltaT;
|
||||
|
||||
double a = a1*t*t;
|
||||
double b = b1*t + phi;
|
||||
double At = A0 * exp2(a) * cos(b);
|
||||
|
||||
double c = -t/tau;
|
||||
double Nt = N0 * exp2(c) * (1 + At) + Nbkg;
|
||||
|
||||
nt[id] = Nt;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
calculate sum of array
|
||||
*/
|
||||
int MICChiSquare::mic_sum(double *data, double *result, int size) {
|
||||
double sum = 0;
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
in(data:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(result:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(size) in(sum)
|
||||
{
|
||||
#pragma omp parallel for reduction(+:sum)
|
||||
for (int i = 0; i < size; i++) {
|
||||
sum += data[i];
|
||||
}
|
||||
result[0] = sum;
|
||||
}
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
51
src/MIC/MICChiSquare.h
Normal file
51
src/MIC/MICChiSquare.h
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
|
||||
Name: MICChiSquare
|
||||
Info: calculate chi^2 using intel mic coprocessor
|
||||
Author: Uldis Locans
|
||||
Date: 29.09.2014
|
||||
|
||||
*/
|
||||
#ifndef H_MIC_CHI_SQUARE
|
||||
#define H_MIC_CHI_SQUARE
|
||||
|
||||
#include <math.h>
|
||||
#include <omp.h>
|
||||
#include <offload.h>
|
||||
#include "MICBase.h"
|
||||
|
||||
class MICChiSquare {
|
||||
|
||||
MICBase *m_micbase;
|
||||
|
||||
public:
|
||||
|
||||
/* constructor */
|
||||
MICChiSquare(MICBase *base) {
|
||||
m_micbase = base;
|
||||
}
|
||||
|
||||
/* destructor */
|
||||
~MICChiSquare() { }
|
||||
|
||||
/*
|
||||
Info: calucate chi square
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_chi2(double *O, double *E, double *result, int size);
|
||||
|
||||
/*
|
||||
Info: calculate Nt function
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT = 1);
|
||||
|
||||
/*
|
||||
Info: calculate sum of array
|
||||
Return: success or error code
|
||||
*/
|
||||
int mic_sum(double *data, double *result, int size);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
876
src/MIC/MICCollimatorPhysics.cpp
Normal file
876
src/MIC/MICCollimatorPhysics.cpp
Normal file
@ -0,0 +1,876 @@
|
||||
#include "MICCollimatorPhysics.h"
|
||||
|
||||
#define M_P 0.93827231e+00
|
||||
#define C 299792458.0
|
||||
#define PI 3.14159265358979323846
|
||||
#define AVO 6.022e23
|
||||
#define R_E 2.81794092e-15
|
||||
#define eM_E 0.51099906e-03
|
||||
#define Z_P 1
|
||||
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
|
||||
|
||||
#define POSITION 0
|
||||
#define ZSIZE 1
|
||||
#define RHO_M 2
|
||||
#define Z_M 3
|
||||
#define A_M 4
|
||||
#define A2_C 5
|
||||
#define A3_C 6
|
||||
#define A4_C 7
|
||||
#define A5_C 8
|
||||
#define X0_M 9
|
||||
#define I_M 10
|
||||
#define DT_M 11
|
||||
|
||||
//scalar product of two 3-vectors
__declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) {
  double sum = d1.x * d2.x;
  sum = sum + d1.y * d2.y;
  sum = sum + d1.z * d2.z;
  return sum;
}
|
||||
|
||||
//squared length of the vector (dx, dy, dz)
__declspec(target(mic))
double dot(double dx, double dy, double dz) {
  double sum = dx * dx;
  sum = sum + dy * dy;
  sum = sum + dz * dz;
  return sum;
}
|
||||
|
||||
//true when z lies in the half open interval (par[POSITION], par[POSITION] + par[ZSIZE]]
__declspec(target(mic))
bool checkHit(double &z, double *par) {
  const double start = par[POSITION];
  if (!(z > start))
    return false;
  return z <= start + par[ZSIZE];
}
|
||||
|
||||
|
||||
/*
  Rotate the momentum components (px, pz) by the scattering angle thetacou and,
  depending on coord, shift the position components (x, z):
    coord == 1: x is advanced by deltas * px / normP and by the projection of xplane, z only by the projection of xplane
    coord == 2: as coord == 1, z is additionally advanced by deltas * pz / normP
    any other value: the position is left untouched
  The position update uses the momentum before the rotation.
*/
__declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane,
         double normP, double thetacou, double deltas, int coord)
{
  //direction angle of the momentum in the plane, shifted depending on the quadrant
  double psi = atan(px/pz);
  if ( px >= 0 && pz >= 0 ) {
    //no shift
  } else if ( px < 0 && pz > 0 ) {
    psi = psi + 2*PI;
  } else {
    psi = psi + PI;
  }

  const double pxz = sqrt(px*px + pz*pz);
  const double cosPsi = cos(psi);
  const double sinPsi = sin(psi);

  switch (coord) {
  case 1:
    x = x + deltas * px / normP + xplane*cosPsi;
    z = z - xplane * sinPsi;
    break;
  case 2:
    x = x + deltas * px / normP + xplane * cosPsi;
    z = z - xplane * sinPsi + deltas * pz / normP;
    break;
  default:
    break;
  }

  const double sinTheta = sin(thetacou);
  const double cosTheta = cos(thetacou);

  px = pxz*cosPsi*sinTheta + pxz*sinPsi*cosTheta;
  pz = -pxz*sinPsi*sinTheta + pxz*cosPsi*cosTheta;
}
|
||||
|
||||
/*
  Coulomb scattering of a single particle with position R and momentum P.

  For the x-z plane and then for the y-z plane: Gaussian pairs are drawn from
  stream until the scattering angle z2 * theta0 lies within 3.5 * theta0, the
  particle is rotated and displaced with Rot, and with probability 0.0047 an
  additional large angle rotation (random sign) is applied.
  See Physical Review, "Multiple Scattering".
  The order of the random number calls is part of the behaviour.
*/
__declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {

  const double pp = dot(P, P);
  const double Eng = sqrt(pp + 1.0) * M_P - M_P;
  const double gamma = (Eng + M_P) / M_P;
  const double normP = sqrt(pp);
  const double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
  const double deltas = par[DT_M] * beta * C;

  const double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
    Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));

  double z1, z2, thetacou;
  double u;

  // x-direction
  do {
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
    thetacou = z2 * theta0;
  } while (fabs(thetacou) > 3.5 * theta0);

  const double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1);

  //rare large angle scattering
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &u, 0, 1);
  if (u < 0.0047) {
    double mag, side;
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &mag, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &side, 0, 1);
    const double big = 2.5 * sqrt(1 / mag) * sqrt(2.0) * theta0;
    const double thetaru = (side > 0.5) ? -big : big;
    Rot(P.x ,P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0);
  }

  // y-direction
  do {
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
    thetacou = z2 * theta0;
  } while (fabs(thetacou) > 3.5 * theta0);

  const double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2);

  //rare large angle scattering
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &u, 0, 1);
  if (u < 0.0047) {
    double mag, side;
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &mag, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &side, 0, 1);
    const double big = 2.5 * sqrt(1 / mag) * sqrt(2.0) * theta0;
    const double thetaru = (side > 0.5) ? -big : big;
    Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0);
  }

}
|
||||
|
||||
/*
  Coulomb scattering for one block of particles stored as separate arrays.

  Handles the particles with indices ii .. ii + size - 1 (at most MIC_WIDTH of
  them, which is the capacity of the scratch arrays); only particles with
  label == 0 are modified. For the x-z plane and then for the y-z plane the
  scattering angle is drawn (redrawn until it lies within 3.5 * theta0), the
  particle is rotated and displaced with Rot, and with probability 0.0047 an
  additional large angle rotation with random sign is applied.
  See Physical Review, "Multiple Scattering".

  The block wise random number requests always ask for MIC_WIDTH values, so the
  amount of numbers consumed from stream per call does not depend on size.
*/
__declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
                 double *par, VSLStreamStatePtr &stream, int ii, int size)
{

  //every loop uses the same bound; before, some loops ran to size and others to MIC_WIDTH
  const int count = (size < MIC_WIDTH) ? size : MIC_WIDTH;

  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
  double P1[MIC_WIDTH] __attribute__((aligned(64)));
  double P2[MIC_WIDTH] __attribute__((aligned(64)));
  double P3[MIC_WIDTH] __attribute__((aligned(64)));

  double z1[MIC_WIDTH] __attribute__((aligned(64)));
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

  //per particle kinematics and width of the scattering angle distribution
#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double dotp = dot(px[i], py[i], pz[i]);
      double Eng = sqrt(dotp + 1.0) * M_P - M_P;
      double gamma = (Eng + M_P) / M_P;
      double beta = sqrt(1.0 - 1.0 / (gamma * gamma));

      normP[idx] = sqrt(dotp);
      deltas[idx] = par[DT_M] * beta * C;
      theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) *
        Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M]));
    } else {
      //inactive lanes get defined values so the loops below never read uninitialized memory
      normP[idx] = 0.0;
      deltas[idx] = 0.0;
      theta0[idx] = 0.0;
    }
  }

  // x-direction
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);
#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    thetacou[idx] = z2[idx] * theta0[idx];
  }

  //unknown number of iterations, cannot vectorize
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
        thetacou[idx] = z2[idx] * theta0[idx];
      }
    }
  }

#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) +
        z2[idx] * deltas[idx] * theta0[idx] / 2.0;
      Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1);
    }
  }

  //random numbers for the large angle scattering:
  //P1 decides whether it happens, P2 sets the size of the angle, P3 its sign
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      if(P1[idx] < 0.0047) {
        double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];

        if(P3[idx] > 0.5)
          thetaru = -thetaru;

        //coord == 0: rotation only, the position arguments are not used
        Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0);
      }
    }
  }

  // y-direction
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);

#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    thetacou[idx] = z2[idx] * theta0[idx];
  }

  //unknown number of iterations, cannot vectorize
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
        thetacou[idx] = z2[idx] * theta0[idx];
      }
    }
  }

#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0)
        + z2[idx] * deltas[idx] * theta0[idx] / 2.0;
      Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2);
    }
  }

  //random numbers for the large angle scattering, same roles as above
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

#pragma vector aligned
#pragma simd
  for (int i = ii; i < ii + count; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      if(P1[idx] < 0.0047) {
        double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
        if(P3[idx] > 0.5)
          thetaru = -thetaru;
        Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0);
      }
    }
  }

}
|
||||
|
||||
/*
  Energy loss of one particle in the material described by par.

  Eng    kinetic energy of the particle, updated in place
  pdead  set to 1 when the particle is considered stopped (Eng < 1E-4 after the
         update, or a positive dEdx); left untouched otherwise
  par    parameter array, read at the indices RHO_M, Z_M, A_M, A2_C..A5_C, I_M and DT_M
  stream random stream used for the Gaussian energy straggling term

  For 0.00001 < Eng < 0.0006 the fitted low energy stopping formula
  (coefficients A2_C..A5_C) is used, for Eng >= 0.0006 the Bethe-Bloch type
  expression. The two checks are made one after the other on the updated Eng.
*/
__declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {

  double dEdx = 0.0;
  const double gamma = (Eng + M_P) / M_P;
  const double gamma2 = gamma * gamma;
  const double beta = sqrt(1.0 - 1.0 / gamma2);
  const double beta2 = beta * beta;

  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
  //Z_M is only the index of the atomic number inside par: the value has to be
  //read as par[Z_M] (the bare macro used here before evaluates to 3)
  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);

  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073;
    const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
    const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
    const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);

    dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );

    double tmprnd;
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
    const double delta_E = deltasrho * dEdx + tmprnd;
    Eng = Eng + delta_E / 1E3;
  }

  if (Eng >= 0.0006) {
    const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));

    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
                       Tmax / par[I_M] / par[I_M]) - beta2);

    double tmprnd;
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
    const double delta_E = deltasrho * dEdx + tmprnd;

    Eng = Eng + delta_E / 1E3;
  }

  if ((Eng<1E-4) || (dEdx>0))
    pdead = 1;
}
|
||||
|
||||
/*
  Energy loss of one particle, variant that takes its random numbers from a
  pre-generated array so that the calling loop can be vectorized.

  Eng    kinetic energy of the particle, updated in place
  dEdx   receives the stopping power that was applied; left untouched when
         neither energy range matches
  par    parameter array, read at the indices RHO_M, Z_M, A_M, A2_C..A5_C, I_M and DT_M
  randv  random values scaled by sigma_E for the straggling term; randv[ri] is
         used in the low energy branch, randv[ri + MIC_WIDTH] in the high
         energy branch
  ri     index of the particle inside its block

  The decision whether the particle is stopped is left to the caller.
*/
__declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {

  const double gamma = (Eng + M_P) / M_P;
  const double gamma2 = gamma * gamma;
  const double beta = sqrt(1.0 - 1.0 / gamma2);
  const double beta2 = beta * beta;

  const double deltas = par[DT_M] * beta * C;
  const double deltasrho = deltas * 100 * par[RHO_M];
  //Z_M is only the index of the atomic number inside par: the value has to be
  //read as par[Z_M] (the bare macro used here before evaluates to 3)
  const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);

  if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
    const double Ts = (Eng * 1E6) / 1.0073;
    const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
    const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
    const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);

    dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );

    const double delta_E = deltasrho * dEdx + sigma_E * randv[ri];

    Eng = Eng + delta_E / 1E3;
  }

  if (Eng >= 0.0006) {
    const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
      (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));

    dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
      (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
                       Tmax / par[I_M] / par[I_M]) - beta2);

    const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH];

    Eng = Eng + delta_E / 1E3;
  }

}
|
||||
|
||||
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
|
||||
|
||||
//cast device memory pointers to appropriate types
|
||||
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
||||
double *par = (double*) par_ptr;
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(numparticles)
|
||||
{
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||
|
||||
//for loop trough particles if not checkhit set label to -2 and update R.x
|
||||
|
||||
#pragma omp for simd
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
if ( !checkHit(data[i].Rincol.z, par) ) {
|
||||
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
|
||||
data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq;
|
||||
data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq;
|
||||
data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq;
|
||||
data[i].label = -2;
|
||||
}
|
||||
}
|
||||
|
||||
//for loop trough particles if label == 0 eneregy loss and if pdead update label to -1
|
||||
#pragma omp for simd
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
|
||||
int pdead = -1;
|
||||
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
|
||||
double Eng = (sq - 1) * M_P;
|
||||
|
||||
if (data[i].label == 0) {
|
||||
energyLoss(Eng, pdead, par, stream);
|
||||
}
|
||||
|
||||
if (pdead == -1) {
|
||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||
sq = sqrt(dot(data[i].Pincol, data[i].Pincol));
|
||||
data[i].Pincol.x = data[i].Pincol.x * ptot / sq;
|
||||
data[i].Pincol.y = data[i].Pincol.y * ptot / sq;
|
||||
data[i].Pincol.z = data[i].Pincol.z * ptot / sq;
|
||||
}
|
||||
|
||||
if (pdead == 1)
|
||||
data[i].label = -1;
|
||||
}
|
||||
|
||||
//for loop trough particles if label == 0 coulomb scat
|
||||
#pragma omp for
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
if (data[i].label == 0) {
|
||||
coulombScat(data[i].Rincol, data[i].Pincol, par, stream);
|
||||
}
|
||||
}
|
||||
|
||||
} //end omp parallel
|
||||
|
||||
} //end offload
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles)
|
||||
{
|
||||
|
||||
|
||||
|
||||
int *label = (int*)label_ptr;
|
||||
unsigned *localID = (unsigned*)localID_ptr;
|
||||
double *rx = (double*)rx_ptr;
|
||||
double *ry = (double*)ry_ptr;
|
||||
double *rz = (double*)rz_ptr;
|
||||
double *px = (double*)px_ptr;
|
||||
double *py = (double*)py_ptr;
|
||||
double *pz = (double*)pz_ptr;
|
||||
double *par = (double*)par_ptr;
|
||||
|
||||
int padding = numparticles % MIC_WIDTH;
|
||||
int totalpart = numparticles + padding;
|
||||
|
||||
#pragma offload target (mic:0) \
|
||||
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(rx:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(ry:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(rz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(px:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(totalpart)
|
||||
{
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
//every thread gets its own rnd stream state
|
||||
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||
|
||||
|
||||
#pragma omp for nowait
|
||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||
//vectorize main loop
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (int i = ii; i < ii + MIC_WIDTH; i++) {
|
||||
if ( !checkHit(rz[i], par) ) {
|
||||
double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
|
||||
rx[i] = rx[i] + par[DT_M] * C * px[i] / sq;
|
||||
ry[i] = ry[i] + par[DT_M] * C * py[i] / sq;
|
||||
rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq;
|
||||
label[i] = -2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//array of size 2*WIDTH for storing random values for the energyloss function
|
||||
double randv[2*MIC_WIDTH] __attribute__((aligned(64)));
|
||||
|
||||
//for loop trough particles if label == 0 eneregy loss and if pdead update label to -1
|
||||
#pragma omp for nowait
|
||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||
//create array of rand values (2 per thread)
|
||||
vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0);
|
||||
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (int i = ii; i < ii + MIC_WIDTH; i++) {
|
||||
|
||||
double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
|
||||
double Eng = (sq - 1) * M_P;
|
||||
double dEdx = 0;
|
||||
|
||||
if (label[i] == 0) {
|
||||
energyLoss(Eng, dEdx, par, randv, i - ii);
|
||||
}
|
||||
|
||||
if (Eng > 1e-4 && dEdx < 0) {
|
||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||
sq = sqrt(dot(px[i], py[i], pz[i]));
|
||||
px[i] = px[i] * ptot / sq;
|
||||
py[i] = py[i] * ptot / sq;
|
||||
pz[i] = pz[i] * ptot / sq;
|
||||
}
|
||||
|
||||
if (Eng < 1e-4 || dEdx > 0)
|
||||
label[i] = -1;
|
||||
|
||||
} //end inner energy loss loop
|
||||
|
||||
} //end outer energy loss loop
|
||||
|
||||
//vectorize coulomb scattering as much as possible
|
||||
#pragma omp for nowait
|
||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||
coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH);
|
||||
} //end coulomb scattering
|
||||
|
||||
} //end omp parallel
|
||||
|
||||
} //end offload
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
|
||||
int &numaddback)
|
||||
{
|
||||
|
||||
//cast device memory pointers to appropriate types
|
||||
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
||||
int privateback;
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||
in(data:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(numparticles) \
|
||||
out(privateback)
|
||||
{
|
||||
//count dead and addback particles
|
||||
int privateback = 0;
|
||||
#pragma omp parallel for reduction(+:privateback)
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
if (data[i].label < 0)
|
||||
privateback++;
|
||||
}
|
||||
//move particles with label < 0 to the end of the array (serial. can we do this parallel?)
|
||||
if (privateback > 0) {
|
||||
|
||||
int moved = 0;
|
||||
for (int i = numparticles - 1; i > 0; i--) {
|
||||
if (data[i].label < 0) {
|
||||
int idx = numparticles - 1 - moved;
|
||||
if (i != idx) {
|
||||
MIC_PART_SMALL tmp = data[i];
|
||||
data[i] = data[idx];
|
||||
data[idx] = tmp;
|
||||
}
|
||||
moved++;
|
||||
}
|
||||
}
|
||||
}
|
||||
numaddback = privateback;
|
||||
}
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//exchange the values of two doubles (usable inside offloaded code)
__declspec(target(mic))
void micmove(double &a, double &b) {
  const double keep = b;
  b = a;
  a = keep;
}
|
||||
|
||||
//exchange the values of two ints (usable inside offloaded code)
__declspec(target(mic))
void micmove(int &a, int &b) {
  const int keep = b;
  b = a;
  a = keep;
}
|
||||
|
||||
/* Exchange the values of two unsigned ints (also compiled for the coprocessor). */
__declspec(target(mic))
void micmove(unsigned &a, unsigned &b) {
  const unsigned held = b;
  b = a;
  a = held;
}
|
||||
|
||||
|
||||
int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles,
|
||||
int &numaddback)
|
||||
{
|
||||
|
||||
int *label = (int*)label_ptr;
|
||||
unsigned *localID = (unsigned*)localID_ptr;
|
||||
double *rx = (double*)rx_ptr;
|
||||
double *ry = (double*)ry_ptr;
|
||||
double *rz = (double*)rz_ptr;
|
||||
double *px = (double*)px_ptr;
|
||||
double *py = (double*)py_ptr;
|
||||
double *pz = (double*)pz_ptr;
|
||||
double *par = (double*)par_ptr;
|
||||
|
||||
//int padding = numparticles % WIDTH;
|
||||
//int totalpart = numparticles + padding;
|
||||
|
||||
int privateback;
|
||||
|
||||
#pragma offload target (mic:0) \
|
||||
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(rx:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(ry:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(rz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(px:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(numparticles) \
|
||||
out(privateback)
|
||||
{
|
||||
|
||||
//count dead and addback particles
|
||||
int privateback = 0;
|
||||
#pragma omp parallel for reduction(+:privateback)
|
||||
for (int i = 0; i < numparticles; i++) {
|
||||
if (label[i] < 0)
|
||||
privateback++;
|
||||
}
|
||||
|
||||
//move particles with label < 0 to the end of the array (serial. can we do this parallel?)
|
||||
if (privateback > 0) {
|
||||
int moved = 0;
|
||||
for (int i = numparticles - 1; i >= 0; i--) {
|
||||
if (label[i] < 0) {
|
||||
int idx = numparticles - 1 - moved;
|
||||
if (i != idx) {
|
||||
micmove(rx[i], rx[idx]);
|
||||
micmove(ry[i], ry[idx]);
|
||||
micmove(rz[i], rz[idx]);
|
||||
micmove(px[i], px[idx]);
|
||||
micmove(py[i], py[idx]);
|
||||
micmove(pz[i], pz[idx]);
|
||||
micmove(label[i], label[idx]);
|
||||
micmove(localID[i], localID[idx]);
|
||||
}
|
||||
moved++;
|
||||
}
|
||||
}
|
||||
}
|
||||
numaddback = privateback;
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/* Multiply every component of a by the factor c, in place. */
__declspec(target(mic))
inline void unitlessOff(mic_double3 &a, const double c) {
  a.x = a.x * c;
  a.y = a.y * c;
  a.z = a.z * c;
}
|
||||
|
||||
/* Divide every component of a by the factor c, in place. */
__declspec(target(mic))
inline void unitlessOn(mic_double3 &a, const double c) {
  a.x = a.x / c;
  a.y = a.y / c;
  a.z = a.z / c;
}
|
||||
|
||||
/*
 * Apply the rotation described by the three angles in ori (ori.x, ori.y,
 * ori.z) to vec and return the rotated vector. Neither input is modified.
 */
__declspec(target(mic))
mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) {
  // Sines and cosines of the three orientation angles.
  const double sA = sin(ori.x);
  const double cA = cos(ori.x);
  const double sB = sin(ori.y);
  const double cB = cos(ori.y);
  const double sC = sin(ori.z);
  const double cC = cos(ori.z);

  mic_double3 rotated;

  rotated.x = (cA * cC) * vec.x + (cA * sC) * vec.y - sA * vec.z;

  rotated.y = (-cB * sC - sA * sB * cC) * vec.x +
    (cB * cC - sA * sB * sC) * vec.y - cA * sB * vec.z;

  rotated.z = (-sB * sC + sA * cB * cC) * vec.x +
    (sB * cC + sA * cB * sC) * vec.y + cA * cB * vec.z;

  return rotated;
}
|
||||
|
||||
/*
 * In-place position update: each component of R is divided by dtc, advanced
 * by 0.5 * P / dotp, and multiplied by dtc again. P is only read.
 */
__declspec(target(mic))
inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) {
  R.x = (R.x / dtc + 0.5 * P.x / dotp) * dtc;
  R.y = (R.y / dtc + 0.5 * P.y / dotp) * dtc;
  R.z = (R.z / dtc + 0.5 * P.z / dotp) * dtc;
}
|
||||
|
||||
/*
 * Advance the npart positions in r using the momenta in p and one common
 * dtc value for all particles. Only r is written.
 */
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) {
#pragma omp parallel for simd
  for (int n = 0; n < npart; n++) {
    mic_double3 pos = r[n];
    mic_double3 mom = p[n];
    double scale = sqrt(1.0 + dot(mom, mom));
    updateR(pos, mom, scale, dtc);
    r[n] = pos;
  }
}
|
||||
|
||||
/*
 * Advance the npart positions in r using the momenta in p, with a
 * per-particle dtc computed as gdt[n] * c. Only r is written.
 */
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) {
#pragma omp parallel for simd
  for (int n = 0; n < npart; n++) {
    mic_double3 pos = r[n];
    mic_double3 mom = p[n];
    double stepc = gdt[n] * c;
    double scale = sqrt(1.0 + dot(mom, mom));
    updateR(pos, mom, scale, stepc);
    r[n] = pos;
  }
}
|
||||
|
||||
|
||||
int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||
double dt, double c, bool usedt, int streamId)
|
||||
{
|
||||
|
||||
mic_double3 *r = (mic_double3*)r_ptr;
|
||||
mic_double3 *p = (mic_double3*)p_ptr;
|
||||
double *gdt = (double*)dt_ptr;
|
||||
double dtc = dt * c;
|
||||
|
||||
if (!usedt) {
|
||||
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc)
|
||||
{
|
||||
push(r, p, dtc, npart);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c)
|
||||
{
|
||||
push(r, p, gdt, c, npart);
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Push with orientation transform and one common dtc: each momentum is
 * rotated by the orientation of the particle's last section (zero angles
 * when the section index is outside [0, nsec)) before x is updated.
 */
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
                          double dtc, int npart, int nsec)
{

#pragma omp parallel for simd
  for (int n = 0; n < npart; n++) {
    const long sect = gLastSect[n];

    mic_double3 angles;
    if (sect < 0 || sect >= nsec) {
      angles.x = 0.0;
      angles.y = 0.0;
      angles.z = 0.0;
    } else {
      angles = gOrient[sect];
    }

    mic_double3 mom = deviceTransformTo(p[n], angles);
    mic_double3 pos = x[n];
    double scale = sqrt(1.0 + dot(mom, mom));
    updateR(pos, mom, scale, dtc);
    x[n] = pos;
  }

}
|
||||
|
||||
/*
 * Push with orientation transform and per-particle dtc (gdt[n] * c): each
 * momentum is rotated by the orientation of the particle's last section
 * (zero angles when the section index is outside [0, nsec)) before x is
 * updated.
 */
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
                          double *gdt, double c, int npart, int nsec)
{

#pragma omp parallel for simd
  for (int n = 0; n < npart; n++) {
    const long sect = gLastSect[n];

    mic_double3 angles;
    if (sect < 0 || sect >= nsec) {
      angles.x = 0.0;
      angles.y = 0.0;
      angles.z = 0.0;
    } else {
      angles = gOrient[sect];
    }

    mic_double3 mom = deviceTransformTo(p[n], angles);
    mic_double3 pos = x[n];
    double scale = sqrt(1.0 + dot(mom, mom));
    double stepc = gdt[n] * c;

    updateR(pos, mom, scale, stepc);
    x[n] = pos;
  }

}
|
||||
|
||||
int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||
void *lastSec_ptr,
|
||||
void *orient_ptr, int npart,
|
||||
int nsec, void *dt_ptr, double dt,
|
||||
double c, bool usedt, int streamId)
|
||||
{
|
||||
|
||||
mic_double3 *x = (mic_double3*)x_ptr;
|
||||
mic_double3 *p = (mic_double3*)p_ptr;
|
||||
mic_double3 *gOrient = (mic_double3*)orient_ptr;
|
||||
double *gdt = (double*)dt_ptr;
|
||||
long *gLastSect = (long*)lastSec_ptr;
|
||||
double dtc = dt * c;
|
||||
|
||||
if (!usedt) {
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc)
|
||||
{
|
||||
pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \
|
||||
in(npart, nsec, c)
|
||||
{
|
||||
pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec);
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
68
src/MIC/MICCollimatorPhysics.h
Normal file
68
src/MIC/MICCollimatorPhysics.h
Normal file
@ -0,0 +1,68 @@
|
||||
#ifndef H_MIC_COLLIMATORPHYSICS
|
||||
#define H_MIC_COLLIMATORPHYSICS
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
#include "../Algorithms/CollimatorPhysics.h"
|
||||
#include "MICBase.h"
|
||||
|
||||
/* Three-component vector of doubles, usable in offloaded code. */
__declspec(target(mic))
typedef struct {
  double x, y, z;
} mic_double3;
|
||||
|
||||
/*
 * Per-particle record of the array-of-structures layout: label, local id,
 * and the two vectors Rincol and Pincol.
 */
__declspec(target(mic))
typedef struct {
  int label;
  unsigned localID;
  mic_double3 Rincol, Pincol;
} MIC_PART_SMALL;
|
||||
|
||||
|
||||
class MICCollimatorPhysics : DKSAlogorithms{
|
||||
|
||||
private:
|
||||
|
||||
MICBase *m_micbase;
|
||||
|
||||
public:
|
||||
|
||||
MICCollimatorPhysics(MICBase *base) {
|
||||
m_micbase = base;
|
||||
};
|
||||
|
||||
~MICCollimatorPhysics() { };
|
||||
|
||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
|
||||
|
||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles);
|
||||
|
||||
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
||||
|
||||
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles, int &numaddback);
|
||||
|
||||
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false, int streamId = -1);
|
||||
|
||||
int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
||||
void *orient_ptr, int npart, int nsec,
|
||||
void *dt_ptr, double dt, double c,
|
||||
bool usedt = false, int streamId = -1);
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
210
src/MIC/MICFFT.cpp
Normal file
210
src/MIC/MICFFT.cpp
Normal file
@ -0,0 +1,210 @@
|
||||
#include "MICFFT.h"
|
||||
#include<stdio.h>
|
||||
#include<complex>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
/* Remember the MICBase object; no FFT descriptor is created here. */
MICFFT::MICFFT(MICBase *base) : m_micbase(base) {
}
|
||||
|
||||
/*
 * Release DFTI descriptors on the coprocessor.
 * NOTE(review): only FFTHandle_m and handle are released here; rc_handle and
 * cr_handle (committed by setupFFTRC / setupFFTCR) are not -- confirm whether
 * that is intentional.
 */
MICFFT::~MICFFT() {
#pragma offload target(mic:0)
  {
    DftiFreeDescriptor(&(this->getHandle()));
    DftiFreeDescriptor(&(this->getHandle1()));
  }
}
|
||||
|
||||
//setup fft
|
||||
int MICFFT::setupFFT(int ndim, int N[3]) {
|
||||
//set up FFT engine
|
||||
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
|
||||
{
|
||||
|
||||
MKL_LONG sizes[3], strides[4];
|
||||
sizes[0] = N[0]; sizes[1] = N[1]; sizes[2] = N[2];
|
||||
//strides[0] = 0; strides[1] = sizes[1]; strides[2] = 1; strides[3] = sizes[0]*sizes[1];
|
||||
strides[0] = 0; strides[1] = sizes[0]*sizes[1]; strides[2] = sizes[0]; strides[3] = 1;
|
||||
|
||||
MKL_LONG dims = 3;
|
||||
DftiCreateDescriptor(&(this->getHandle()), DFTI_DOUBLE, DFTI_COMPLEX, dims, sizes);
|
||||
DftiSetValue(this->getHandle(), DFTI_INPUT_STRIDES, strides);
|
||||
DftiSetValue(this->getHandle(), DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
|
||||
DftiCommitDescriptor(this->getHandle());
|
||||
|
||||
}
|
||||
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
//BENI:
|
||||
//setup fft
|
||||
int MICFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
||||
|
||||
//set up FFT engine for REAL->COMPLEX
|
||||
|
||||
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
|
||||
{
|
||||
|
||||
MKL_LONG sizes[3], real_strides[4], complex_strides[4];
|
||||
sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0];
|
||||
//real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1;
|
||||
real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1;
|
||||
//real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1];
|
||||
//complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1;
|
||||
complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1;
|
||||
//complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1;
|
||||
|
||||
MKL_LONG dims = 3;
|
||||
DftiCreateDescriptor(&(this->getHandleRC()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
|
||||
DftiSetValue(this->getHandleRC(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
|
||||
DftiSetValue(this->getHandleRC(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
|
||||
DftiSetValue(this->getHandleRC(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
|
||||
DftiSetValue(this->getHandleRC(), DFTI_INPUT_STRIDES, real_strides);
|
||||
DftiSetValue(this->getHandleRC(), DFTI_OUTPUT_STRIDES, complex_strides);
|
||||
DftiSetValue(this->getHandleRC(), DFTI_FORWARD_SCALE, scale);
|
||||
DftiCommitDescriptor(this->getHandleRC());
|
||||
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//BENI:
|
||||
//setup fft
|
||||
int MICFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
||||
|
||||
//set up FFT engine for COMPLEX->REAL
|
||||
|
||||
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
|
||||
{
|
||||
MKL_LONG sizes[3], real_strides[4], complex_strides[4];
|
||||
sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0];
|
||||
//real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1;
|
||||
real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1;
|
||||
//real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1];
|
||||
//complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1;
|
||||
complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1;
|
||||
//complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1;
|
||||
|
||||
MKL_LONG dims = 3;
|
||||
DftiCreateDescriptor(&(this->getHandleCR()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
|
||||
DftiSetValue(this->getHandleCR(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
|
||||
DftiSetValue(this->getHandleCR(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
|
||||
DftiSetValue(this->getHandleCR(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
|
||||
DftiSetValue(this->getHandleCR(), DFTI_INPUT_STRIDES, complex_strides);
|
||||
DftiSetValue(this->getHandleCR(), DFTI_OUTPUT_STRIDES, real_strides);
|
||||
DftiSetValue(this->getHandleCR(), DFTI_BACKWARD_SCALE, scale);
|
||||
DftiCommitDescriptor(this->getHandleCR());
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//execute COMPLEX->COMPLEX FFT
|
||||
int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool forward) {
|
||||
|
||||
_Complex double *ptr = (_Complex double*) mem_ptr;
|
||||
|
||||
#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(forward)
|
||||
{
|
||||
if (forward)
|
||||
DftiComputeForward(this->getHandle(), ptr);
|
||||
else
|
||||
DftiComputeBackward(this->getHandle(), ptr);
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//execute iFFT
|
||||
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
|
||||
return mic_executeFFT(mem_ptr, ndim, N, -1, false);
|
||||
}
|
||||
|
||||
//execute REAL->COMPLEX FFT
|
||||
int MICFFT::executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {
|
||||
|
||||
double *real_ptr = (double*) in_ptr;
|
||||
//std::complex<double> *compl_ptr = (std::complex<double> *) out_ptr;
|
||||
_Complex double *compl_ptr = (_Complex double *) out_ptr;
|
||||
int sizereal = N[0]*N[1]*N[2];
|
||||
int sizecompl = (N[0]/2+1)*N[1]*N[2];
|
||||
|
||||
//std::cout << "start real-compl fft on mic " << std::endl;
|
||||
|
||||
//std::cout << "real_ptr = " << real_ptr << std::endl;
|
||||
//std::cout << "compl_ptr = " << compl_ptr << std::endl;
|
||||
//std::cout << "EXECUTE AVERAGING OVER 10 LOOPS OF FFT" << std::endl;
|
||||
|
||||
#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE)
|
||||
//#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE)
|
||||
{
|
||||
//for (int i=0;i<10;++i){ //loop 10 times for benchmarking
|
||||
DftiComputeForward(this->getHandleRC(), real_ptr, compl_ptr);
|
||||
//}
|
||||
}
|
||||
|
||||
//std::cout << "end real-compl fft on mic " << std::endl;
|
||||
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
//execute COMPLEX->REAL FFT
|
||||
int MICFFT::executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {
|
||||
|
||||
//_Complex double *ptr = (_Complex double*) mem_ptr;
|
||||
|
||||
double *real_ptr = (double*) out_ptr;
|
||||
_Complex double *compl_ptr = (_Complex double *) in_ptr;
|
||||
|
||||
//std::cout << "real_ptr = " << real_ptr << std::endl;
|
||||
//std::cout << "compl_ptr = " << compl_ptr << std::endl;
|
||||
int sizereal = N[0]*N[1]*N[2];
|
||||
int sizecompl = (N[0]/2+1)*N[1]*N[2];
|
||||
|
||||
//std::cout << "offload to perform backward fft ... " << std::endl;
|
||||
//struct timeval start, end;
|
||||
//gettimeofday(&start,NULL);
|
||||
#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE)
|
||||
//#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE)
|
||||
{
|
||||
//for (int i=0;i<10;++i){ //loop 10 times for benchmarking
|
||||
DftiComputeBackward(this->getHandleCR(), compl_ptr, real_ptr);
|
||||
//}
|
||||
}
|
||||
|
||||
// End timing offloaded FFT.
|
||||
//gettimeofday(&end,NULL);
|
||||
// Print execution time of offloaded computational loop.
|
||||
//printf ("Total time for IFFT spent = %f seconds\n",
|
||||
//(double) (end.tv_usec-start.tv_usec) /1000000+(double) (end.tv_sec-start.tv_sec));
|
||||
//std::cout << "IFFT DONE!" << std::endl;
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
//normalize IFFT
|
||||
int MICFFT::normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
|
||||
|
||||
int size = N[0] * N[1] * N[2];
|
||||
|
||||
_Complex double *ptr = (_Complex double*) mem_ptr;
|
||||
#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < size; i++) {
|
||||
__real__ ptr[i] = __real__ ptr[i] / size;
|
||||
__imag__ ptr[i] = __imag__ ptr[i] / size;
|
||||
}
|
||||
}
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
79
src/MIC/MICFFT.h
Normal file
79
src/MIC/MICFFT.h
Normal file
@ -0,0 +1,79 @@
|
||||
#ifndef H_MIC_FFT
|
||||
#define H_MIC_FFT
|
||||
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
#include <offload.h>
|
||||
#include <mkl_dfti.h>
|
||||
|
||||
#include "../Algorithm/DKSFFT.h"
|
||||
#include "MICBase.h"
|
||||
|
||||
class MICFFT : public DKSFFT {
|
||||
|
||||
private:
|
||||
|
||||
MICBase *m_micbase;
|
||||
|
||||
/// Internal FFT object for performing serial FFTs.
|
||||
#pragma offload_attribute(push,target(mic))
|
||||
DFTI_DESCRIPTOR_HANDLE FFTHandle_m; //declspec only works for global variables
|
||||
DFTI_DESCRIPTOR_HANDLE handle;
|
||||
DFTI_DESCRIPTOR_HANDLE rc_handle; //handle for REAL->COMPLEX
|
||||
DFTI_DESCRIPTOR_HANDLE cr_handle; //handle for COMPLEX->REAL
|
||||
|
||||
#pragma offload_attribute(pop)
|
||||
|
||||
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle(void) {
|
||||
return FFTHandle_m;
|
||||
}
|
||||
|
||||
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle1(void) {
|
||||
return handle;
|
||||
}
|
||||
|
||||
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleRC(void) {
|
||||
return rc_handle;
|
||||
}
|
||||
|
||||
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleCR(void) {
|
||||
return cr_handle;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/* constructor */
|
||||
MICFFT(MICBase *base);
|
||||
|
||||
/* destructir */
|
||||
~MICFFT();
|
||||
|
||||
/*
|
||||
Info: setup mkl fft
|
||||
Return: success or error code
|
||||
*/
|
||||
int setupFFT(int ndim, int N[3]);
|
||||
//BENI:
|
||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
||||
//BENI:
|
||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
||||
|
||||
/* execute FFT on MIC */
|
||||
int executeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
|
||||
|
||||
/* execute IFFT on MIC */
|
||||
int executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/* execute REAL->COMPLEX FFT on MIC */
|
||||
int executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/* execute COMPLEX->REAL FFT on MIC */
|
||||
int executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/* normalize IFFT on MIC */
|
||||
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
307
src/MIC/MICGreensFunction.cpp
Normal file
307
src/MIC/MICGreensFunction.cpp
Normal file
@ -0,0 +1,307 @@
|
||||
#include "MICGreensFunction.hpp"
|
||||
#include<stdio.h>
|
||||
#include<complex>
|
||||
#include <cstring>
|
||||
|
||||
/* constructor */
|
||||
MICGreensFunction::MICGreensFunction(MICBase *base) {
|
||||
m_micbase = base;
|
||||
}
|
||||
|
||||
/* destructor */
|
||||
MICGreensFunction::~MICGreensFunction() {
|
||||
}
|
||||
|
||||
|
||||
/* compute greens integral analytically */
|
||||
// Version with extended domain
|
||||
/*
|
||||
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,double hr_m1, double hr_m2) {
|
||||
double *tmp_ptr = (double*) tmp_ptr_;
|
||||
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
|
||||
{
|
||||
std::memset(tmp_ptr,0,(I+1)*(J+1)*(K+1));
|
||||
double cellVolume = hr_m0 * hr_m1 * hr_m2;
|
||||
#pragma omp parallel for collapse(3) schedule(dynamic)
|
||||
for (int k = 0; k < K; k++) {
|
||||
for (int j = 0; j < J; j++) {
|
||||
for (int i = 0; i < I; i++) {
|
||||
|
||||
double vv0 = i * hr_m0 - hr_m0 / 2;
|
||||
double vv1 = j * hr_m1 - hr_m1 / 2;
|
||||
double vv2 = k * hr_m2 - hr_m2 / 2;
|
||||
|
||||
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
|
||||
|
||||
double tmpgrn = 0;
|
||||
tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
|
||||
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
|
||||
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
|
||||
|
||||
tmpgrn = tmpgrn / 2;
|
||||
|
||||
tmpgrn += vv1 * vv2 * log(vv0 + r);
|
||||
tmpgrn += vv0 * vv2 * log(vv1 + r);
|
||||
tmpgrn += vv0 * vv1 * log(vv2 + r);
|
||||
|
||||
tmpgrn = tmpgrn / cellVolume;
|
||||
|
||||
tmp_ptr[k*(J+1)*(I+1) + j*(I+1) + i] = tmpgrn;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/**
 * Evaluate the analytic Green's-function integral on an I x J x K grid on the
 * coprocessor and store it in tmp_ptr_ (index = k*J*I + j*I + i).
 *
 * @param tmp_ptr_  double buffer with at least I*J*K elements (device-resident)
 * @param I,J,K     grid extents
 * @param hr_m0,hr_m1,hr_m2  mesh spacings used for the sample coordinates
 * @return 0
 */
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
                                          double hr_m1, double hr_m2)
{

  double *tmp_ptr = (double*) tmp_ptr_;
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
  {
    // memset counts bytes, so scale the element count by sizeof(double).
    std::memset(tmp_ptr, 0, sizeof(double) * (size_t)I * (size_t)J * (size_t)K);
    double cellVolume = hr_m0 * hr_m1 * hr_m2;
#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int k = 0; k < K; k++) {
      for (int j = 0; j < J; j++) {
        for (int i = 0; i < I; i++) {

          // Sample point shifted by half a cell in each direction.
          double vv0 = i * hr_m0 - hr_m0 / 2;
          double vv1 = j * hr_m1 - hr_m1 / 2;
          double vv2 = k * hr_m2 - hr_m2 / 2;

          double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

          double tmpgrn = 0;
          tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
          tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
          tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );

          tmpgrn = tmpgrn / 2;

          tmpgrn += vv1 * vv2 * log(vv0 + r);
          tmpgrn += vv0 * vv2 * log(vv1 + r);
          tmpgrn += vv0 * vv1 * log(vv2 + r);

          tmpgrn = tmpgrn / cellVolume;

          tmp_ptr[k*(J)*(I) + j*(I) + i] = tmpgrn;
        }
      }
    }
  }
  return 0;
}
|
||||
|
||||
|
||||
|
||||
/* perform the actual integration */
|
||||
// version with extended domain
|
||||
/*
|
||||
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
|
||||
double *tmp_ptr = (double*) tmp_ptr_;
|
||||
double *mem_ptr = (double*) mem_ptr_;
|
||||
|
||||
// the actual integration
|
||||
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||
{
|
||||
int Ii = I;
|
||||
int Jj = J;
|
||||
int Kk = K;
|
||||
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
||||
std::memset(mem_ptr,0,II*JJ*KK);
|
||||
I=I+1; J=J+1; K=K+1;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i=0; i<Ii; i++) {
|
||||
for (int j=0; j<Jj; j++) {
|
||||
for (int k=0; k<Kk; k++) {
|
||||
//mem_ptr[k*JJ*II + j*II + i] = 0.0;
|
||||
mem_ptr[k*JJ*II + j*II + i] = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
|
||||
double *tmp_ptr = (double*) tmp_ptr_;
|
||||
double *mem_ptr = (double*) mem_ptr_;
|
||||
|
||||
// the actual integration
|
||||
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||
{
|
||||
int Ii = I;
|
||||
int Jj = J;
|
||||
int Kk = K;
|
||||
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
||||
std::memset(mem_ptr,0,II*JJ*KK);
|
||||
//I=I+1; J=J+1; K=K+1;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i=0; i<Ii; i++) {
|
||||
for (int j=0; j<Jj; j++) {
|
||||
for (int k=0; k<Kk; k++) {
|
||||
//mem_ptr[k*JJ*II + j*II + i] = 0.0;
|
||||
mem_ptr[k*JJ*II + j*II + i] = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
|
||||
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
//CUDA similar version:
|
||||
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
|
||||
double *tmpgreen = (double*) tmp_ptr_;
|
||||
double *mem_ptr = (double*) mem_ptr_;
|
||||
|
||||
// the actual integration
|
||||
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||
{
|
||||
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
||||
std::memset(mem_ptr,0,II*JJ*KK);
|
||||
//I=I+1; J=J+1; K=K+1;
|
||||
double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
|
||||
int NI_tmp=I;
|
||||
int NJ_tmp=J;
|
||||
int NK_tmp=K;
|
||||
|
||||
#pragma omp parallel for collapse(3)
|
||||
for (int i=0; i<I; i++) {
|
||||
for (int j=0; j<J; j++) {
|
||||
for (int k=0; k<K; k++) {
|
||||
tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
|
||||
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
||||
|
||||
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
if (i+1 < NI_tmp)
|
||||
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
if (j+1 < NJ_tmp)
|
||||
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
if (k+1 < NK_tmp)
|
||||
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
||||
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
||||
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||
|
||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||
|
||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||
|
||||
mem_ptr[i + j*II + k*II*JJ] = tmp_rho;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
 * Mirror the field stored in mem_ptr_ (row length 2I, slab size 2I*2J) along
 * each axis in turn: entries with index > I / J / K on the respective axis
 * are copied from their mirror position 2I-ie / 2J-je / 2K-ke.
 *
 * @param mem_ptr_  double buffer indexed as k*(2I)*(2J) + j*(2I) + i
 * @param I,J,K     half extents of the field
 * @return 0
 */
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
  double *mem_ptr = (double*) mem_ptr_;

#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*I; int JJ = 2*J;
    mem_ptr[0] = mem_ptr[II*JJ];

    // The index temporaries are declared inside each loop body so that every
    // thread has its own copy; at block scope they would be shared.

    // mirror along the first axis
#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ie = I+1; ie<2*I; ++ie) {
      for(int j = 0; j<= J; ++j) {
        for (int k=0; k<= K; ++k) {
          int id = k * II * JJ + j * II + ie;
          int id_mirr = k * II * JJ + j * II + (2*I-ie);
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }

    // mirror along the second axis
#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ai = 0; ai<2*I; ++ai) {
      for(int je = J+1; je< 2*J; ++je) {
        for (int k=0; k<= K; ++k) {
          int id = k * II * JJ + je * II + ai;
          int id_mirr = k * II * JJ + (2*J-je) * II + ai;
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }

    // mirror along the third axis
#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ai = 0; ai<2*I; ++ai) {
      for(int aj = 0; aj< 2*J; ++aj) {
        for (int ke=K+1; ke< 2*K; ++ke) {
          int id = ke * II * JJ + aj * II + ai;
          int id_mirr = (2*K-ke) * II * JJ + aj * II + ai;
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }

  }
  return 0;
}
|
||||
|
||||
/*multiply complex fields*/
|
||||
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
|
||||
// double *mem_ptr1 = (double*) mem_ptr1_;
|
||||
// double *mem_ptr2 = (double*) mem_ptr2_;
|
||||
_Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
|
||||
_Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
|
||||
|
||||
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int i=0; i<size; ++i) {
|
||||
mem_ptr1[i]*=mem_ptr2[i];
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
44
src/MIC/MICGreensFunction.hpp
Normal file
44
src/MIC/MICGreensFunction.hpp
Normal file
@ -0,0 +1,44 @@
|
||||
//AUTHOR: Benjamin Ulmer
|
||||
|
||||
#ifndef H_MIC_GREENS
|
||||
#define H_MIC_GREENS
|
||||
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
#include <offload.h>
|
||||
#include <mkl_dfti.h>
|
||||
|
||||
#include "MICBase.h"
|
||||
|
||||
#define DKS_SUCCESS 0
|
||||
#define DKS_ERROR 1
|
||||
|
||||
class MICGreensFunction {
|
||||
|
||||
private:
|
||||
MICBase *m_micbase;
|
||||
|
||||
public:
|
||||
|
||||
/* constructor */
|
||||
MICGreensFunction(MICBase *base);
|
||||
|
||||
/* destructor */
|
||||
~MICGreensFunction();
|
||||
|
||||
/* compute greens integral analytically */
|
||||
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
|
||||
|
||||
/* perform the actual integration */
|
||||
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
|
||||
|
||||
/* Mirror rho-Field */
|
||||
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
|
||||
|
||||
/*multiply complex fields*/
|
||||
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
116
src/MIC/MICMergeSort.h
Normal file
116
src/MIC/MICMergeSort.h
Normal file
@ -0,0 +1,116 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <omp.h>
|
||||
|
||||
/* Default comparator: true when the first argument is strictly greater than the second. */
template<typename T>
inline bool greaterThan(T x, T y) {
  const bool firstIsGreater = (x > y);
  return firstIsGreater;
}
|
||||
|
||||
/* Exchange the values of a and b through one temporary copy. */
template<typename T>
void mergeswap(T &a, T &b) {
  T held = a;
  a = b;
  b = held;
}
|
||||
|
||||
/* Forward declarations: split_merge calls these templates before their
   definitions appear further down in this header. Without a visible
   declaration the calls do not resolve for non-class element types. */
template <typename T>
void quick_sort(T *list, int start, int end, bool (*comp)(T, T));

template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T));

/*
 * Recursively sort a[ibegin, iend) using b as scratch space of the same extent.
 * Ranges shorter than 500 elements are handed to quick_sort; larger ranges are
 * halved, the first half is sorted in an OpenMP task while the second half is
 * sorted by the current thread, and the two halves are merged afterwards.
 */
template <typename T>
void split_merge(T *a, int ibegin, int iend, T *b, bool (*comp)(T, T) ) {

  if (iend - ibegin < 500) {
    quick_sort(a + ibegin, 0, iend - ibegin - 1, comp);
    return;
  }

  // same midpoint as (iend + ibegin) / 2 but cannot overflow for large indices
  int imiddle = ibegin + (iend - ibegin) / 2;

#pragma omp task
  split_merge(a, ibegin, imiddle, b, comp);
  split_merge(a, imiddle, iend, b, comp);
  // both halves must be complete before they are merged
#pragma omp taskwait

  merge(a, ibegin, imiddle, iend, b, comp);

}
|
||||
|
||||
/*
 * Merge the two adjacent runs a[ibegin, imiddle) and a[imiddle, iend) into
 * b[ibegin, iend) and copy the result back into a. An element of the first run
 * is taken while comp(second-run head, first-run head) holds or the second run
 * is exhausted; otherwise the second-run head is taken.
 */
template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T)) {

  int left = ibegin;
  int right = imiddle;

  // fill the scratch array position by position
  for (int out = ibegin; out < iend; ++out) {
    const bool leftAvailable = (left < imiddle);
    const bool rightExhausted = (right >= iend);
    if (leftAvailable && (rightExhausted || comp(a[right], a[left]))) {
      b[out] = a[left];
      ++left;
    } else {
      b[out] = a[right];
      ++right;
    }
  }

  // write the merged range back to the input array
  int pos = ibegin;
  while (pos < iend) {
    a[pos] = b[pos];
    ++pos;
  }

}
|
||||
|
||||
/*
 * Partition a[start, end] (end inclusive) around the pivot a[start].
 * Every element for which comp(pivot, element) holds is moved in front of the
 * pivot's final position; the pivot is then swapped into that position.
 * Returns the final index of the pivot.
 */
template <typename T>
int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  T pivot = a[start];
  int boundary = start;

  int scan = start + 1;
  while (scan <= end) {
    if ( comp(pivot, a[scan]) ) {
      ++boundary;
      mergeswap(a[scan], a[boundary]);
    }
    ++scan;
  }

  mergeswap(a[boundary], a[start]);
  return boundary;
}
|
||||
|
||||
template <typename T>
|
||||
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp single
|
||||
{
|
||||
T *b = new T[n];
|
||||
split_merge(list, 0, n, b, comp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Forward declaration: insertion_sort is defined after quick_sort in this
   header and must be visible at the call below for non-class element types. */
template <typename T>
void insertion_sort(T *list, int start, int end, bool (*comp)(T, T));

/*
 * Sort list[start, end] (end inclusive) in place.
 * Ranges with fewer than 10 elements are handed to insertion_sort; larger
 * ranges are partitioned around their first element and both sides are
 * sorted recursively.
 */
template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  if (start >= end)
    return;

  // for small ranges move to insertion sort (which takes an exclusive end)
  if ( (end - start) < 9 ) {
    insertion_sort(list, start, end + 1, comp);
    return;
  }

  int part = partition(list, start, end, comp);
  quick_sort(list, start, part - 1, comp);
  quick_sort(list, part + 1, end, comp);

}
|
||||
|
||||
/*
 * Sort list[start, end) (end exclusive) in place by insertion.
 * An element is shifted right while comp(element, key) holds, so with a
 * "greater than" comparator the range ends up in ascending order.
 * Elements outside [start, end) are never touched.
 */
template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  for (int i = start + 1; i < end; i++) {
    T key = list[i];
    int j = i - 1;
    // stop at the lower bound of the range, not at index 0, so that
    // elements in front of start are neither compared nor moved
    while ( j >= start && comp(list[j], key) ) {
      list[j + 1] = list[j];
      j--;
    }
    list[j + 1] = key;
  }

}
|
34
src/OpenCL/CMakeLists.txt
Normal file
34
src/OpenCL/CMakeLists.txt
Normal file
@ -0,0 +1,34 @@
|
||||
# Host-side OpenCL implementation files of this directory.
set(_SRCS
  OpenCLBase.cpp
  OpenCLFFT.cpp
  OpenCLChiSquare.cpp
  OpenCLCollimatorPhysics.cpp
  OpenCLChiSquareRuntime.cpp
)

# Headers that belong to the sources above.
set(_HDRS
  OpenCLBase.h
  OpenCLFFT.h
  OpenCLChiSquare.h
  OpenCLCollimatorPhysics.h
  OpenCLChiSquareRuntime.h
)

# OpenCL kernel files; they are installed as-is next to the headers.
set(_KERNELS
  OpenCLKernels/OpenCLChiSquare.cl
  OpenCLKernels/OpenCLFFT.cl
  OpenCLKernels/OpenCLFFTStockham.cl
  OpenCLKernels/OpenCLTranspose.cl
  OpenCLKernels/OpenCLCollimatorPhysics.cl
  OpenCLKernels/OpenCLChiSquareRuntime.cl
)

# add_sources / add_headers are not defined in this file
# (presumably project helpers that collect the files for the library target).
add_sources(${_SRCS})
add_headers(${_HDRS})

# The kernel destination is the path the host code appends to OPENCL_KERNELS
# ("OpenCL/OpenCLKernels/...") when it opens a kernel file.
install(FILES ${_HDRS} DESTINATION include/OpenCL)
install(FILES ${_KERNELS} DESTINATION include/OpenCL/OpenCLKernels)
|
1132
src/OpenCL/OpenCLBase.cpp
Normal file
1132
src/OpenCL/OpenCLBase.cpp
Normal file
File diff suppressed because it is too large
Load Diff
303
src/OpenCL/OpenCLBase.h
Normal file
303
src/OpenCL/OpenCLBase.h
Normal file
@ -0,0 +1,303 @@
|
||||
/*
|
||||
|
||||
Name: OpenCLBase
|
||||
|
||||
Author: Uldis Locans
|
||||
|
||||
Info: OpenCL base class to handle all the common details associated
|
||||
with kernel launch on OpenCL device
|
||||
|
||||
Date: 2014.09.18
|
||||
|
||||
*/
|
||||
|
||||
#ifndef H_OPENCL_BASE
|
||||
#define H_OPENCL_BASE
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#include <OpenCL/cl_ext.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#include <CL/cl_ext.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#include "../DKSDefinitions.h"
|
||||
|
||||
/*
 * State record of the random number generator.
 * NOTE(review): the field names suggest two three-component state vectors
 * (s10..s12, s20..s22) plus a cached value z with a validity flag gen -
 * confirm against the kernel code that consumes this struct.
 * Field order and types must stay as they are.
 */
typedef struct {

  double s10, s11, s12;
  double s20, s21, s22;
  double z;
  bool gen;

} RNDState;
|
||||
|
||||
class OpenCLBase {
|
||||
|
||||
private:
|
||||
|
||||
static cl_context m_context;
|
||||
static cl_command_queue m_command_queue;
|
||||
|
||||
static cl_platform_id m_platform_id;
|
||||
static cl_device_id m_device_id;
|
||||
|
||||
cl_context_properties m_context_properties[3];
|
||||
cl_program m_program;
|
||||
cl_kernel m_kernel;
|
||||
|
||||
static cl_event m_last_event;
|
||||
cl_int m_num_events;
|
||||
std::vector<cl_event> m_events;
|
||||
|
||||
char * m_kernel_file;
|
||||
|
||||
cl_device_type m_device_type;
|
||||
|
||||
/*
|
||||
Name: getPlatforms
|
||||
Info: get all available platforms and save in m_platform_ids, save number of platforms
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_getPlatforms();
|
||||
|
||||
|
||||
/*
|
||||
Name: getDevice
|
||||
Info: get the first available device and save the device id and platform id for this device, device name: (-gpu, -mic, -cpu)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_getDevice(const char* device_name);
|
||||
|
||||
/*
|
||||
Name: getDeviceType
|
||||
Info: get device type from device name (-gpu, -cpu, -mic)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
|
||||
|
||||
/*
|
||||
Name: createContext
|
||||
Info: create context with specified device
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_createContext();
|
||||
|
||||
/*
|
||||
Name: buildProgram
|
||||
Info: build program from specified kernel file
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_buildProgram(const char* kernel_file);
|
||||
|
||||
/** Compile program from kernel source string
|
||||
*
|
||||
*/
|
||||
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
|
||||
|
||||
protected:
|
||||
|
||||
int defaultRndSet;
|
||||
cl_mem defaultRndState;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/*
|
||||
constructor
|
||||
*/
|
||||
OpenCLBase();
|
||||
|
||||
/*
|
||||
destructor
|
||||
*/
|
||||
~OpenCLBase();
|
||||
|
||||
/*
|
||||
Create RND states
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_createRndStates(int size);
|
||||
|
||||
/*
|
||||
Destroy rnd states
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_deleteRndStates();
|
||||
|
||||
|
||||
/*
|
||||
Name: getAllDevices
|
||||
Info: get all available devices
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_getAllDevices();
|
||||
|
||||
/** Get the OpenCL device count for the set type of device
|
||||
*
|
||||
*/
|
||||
int ocl_getDeviceCount(int &ndev);
|
||||
|
||||
/** Get the name of the device used
|
||||
*/
|
||||
int ocl_getDeviceName(std::string &device_name);
|
||||
|
||||
/** Set the device to use for OpenCL kernels.
|
||||
* device id to use is passed as integer.
|
||||
*/
|
||||
int ocl_setDevice(int device);
|
||||
|
||||
/** Get a list of all the unique devices of the same type that can run OpenCL kernels
|
||||
* Used when GPUs of different types might be present on the system.
|
||||
*/
|
||||
int ocl_getUniqueDevices(std::vector<int> &devices);
|
||||
|
||||
/*
|
||||
Name: setUp
|
||||
Info: set up opencl resources
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_setUp(const char* device_name);
|
||||
|
||||
/*
|
||||
Name: loadKernel
|
||||
Info: load and compile opencl kernel file if it has changed
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_loadKernel(const char* kernel_file);
|
||||
|
||||
|
||||
/** Build program from kernel source.
|
||||
* Builds a program from source code provided in kernel_source.
|
||||
* If compilation fails will return DKS_ERROR
|
||||
*/
|
||||
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
|
||||
|
||||
/*
|
||||
Name: allocateMemory
|
||||
Info: allocate memory on device
|
||||
Return: return pointer to memory
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||
|
||||
/*
|
||||
Name: allocateMemory
|
||||
Info: allocate memory on device
|
||||
Return: return pointer to memory
|
||||
*/
|
||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||
|
||||
/*
|
||||
Name: writeData
|
||||
Info: write data to device memory (needs ptr to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||
|
||||
/*
|
||||
Name: copyData
|
||||
Info: copy data from one buffer on the device to another
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
|
||||
|
||||
/*
|
||||
Name: createKernel
|
||||
Info: create kernel from program
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_createKernel(const char* kernel_name);
|
||||
|
||||
/*
|
||||
Name: setKernelArgs
|
||||
Info: set opencl kernel arguments
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
|
||||
|
||||
/*
|
||||
Name: executeKernel
|
||||
Info: execute selected kernel (needs kernel parameters)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
|
||||
|
||||
/*
|
||||
Name: readData
|
||||
Info: read data from device (needs pointer to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||
|
||||
/*
|
||||
Name: freeMemory
|
||||
Info: free device memory (needs ptr to mem object)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_freeMemory(cl_mem mem_ptr);
|
||||
|
||||
/*
|
||||
Name: cleanUp
|
||||
Info: free opencl resources
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_cleanUp();
|
||||
|
||||
/*
|
||||
Name: deviceInfo
|
||||
Info: print device info (mostly for debugging purposes)
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_deviceInfo(bool verbose = true);
|
||||
|
||||
/* Check OpenCL kernel.
|
||||
* Query device and check if it can run the kernel with required parameters
|
||||
*/
|
||||
int ocl_checkKernel(const char* kernel_name, int work_group_size,
|
||||
bool double_precision, int &threadsPerBlock);
|
||||
|
||||
/*
|
||||
Name: clearEvents
|
||||
Info: clear saved events (for debugging purposes)
|
||||
Return: nothing
|
||||
*/
|
||||
void ocl_clearEvents();
|
||||
|
||||
/*
|
||||
Name: eventInfo
|
||||
Info: print information about kernel timings (for debugging purposes)
|
||||
Return: nothing
|
||||
*/
|
||||
void ocl_eventInfo();
|
||||
|
||||
/*
|
||||
Return current command queue
|
||||
*/
|
||||
cl_command_queue ocl_getQueue() { return m_command_queue; }
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
157
src/OpenCL/OpenCLChiSquare.cpp
Normal file
157
src/OpenCL/OpenCLChiSquare.cpp
Normal file
@ -0,0 +1,157 @@
|
||||
#include "OpenCLChiSquare.h"
|
||||
|
||||
/*
 * Sum the first `length` doubles of a device buffer.
 * The "parallelReductionSum" kernel is launched with work groups of 128 items
 * and a temporary device buffer with one double per work group; that buffer
 * is read back and added up on the host.
 * Returns the sum, or 0 when length is not positive.
 */
double OpenCLChiSquare::ocl_sum(cl_mem data, int length) {

  // nothing to reduce; a launch with zero work items would be invalid anyway
  if (length <= 0)
    return 0.0;

  int ierr;

  // threads per work group, and the number of work groups needed to cover
  // `length` items (rounded up). The host must read back exactly as many
  // partial sums as work groups are launched: one more would add an element
  // of the temporary buffer that was never written.
  const size_t work_size_sum = 128;
  const size_t work_groups = ((size_t)length + work_size_sum - 1) / work_size_sum;
  const size_t work_items = work_groups * work_size_sum;

  // create tmp array for partial sums
  double *partial_sums = new double[work_groups];
  cl_mem tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);

  // execute sum kernel; argument 2 is local scratch memory for one work group
  m_oclbase->ocl_createKernel("parallelReductionSum");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);

  // read partial sums and free temp memory
  m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);

  // sum up partial sums on the host
  double result = 0;
  for (size_t i = 0; i < work_groups; i++)
    result += partial_sums[i];

  delete[] partial_sums;

  return result;

}
|
||||
|
||||
int OpenCLChiSquare::ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin,
|
||||
int sensors, int length, int numpar,
|
||||
double &result)
|
||||
{
|
||||
|
||||
//set number of work items and work group sizes for kernel execution
|
||||
size_t work_size = 128;
|
||||
|
||||
size_t work_items = (size_t)length * sensors;
|
||||
if (length % work_size > 0)
|
||||
work_items = (length / work_size + 1) * work_size;
|
||||
|
||||
cl_mem data = (cl_mem)mem_data;
|
||||
cl_mem par = (cl_mem)mem_par;
|
||||
cl_mem chi = (cl_mem)mem_result;
|
||||
|
||||
//load and execute PHistotFFcn kernel
|
||||
m_oclbase->ocl_createKernel("kernelPHistoTFFcn");
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &par);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &chi);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(double), &fTimeResolution);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(double), &fRebin);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(int), &length);
|
||||
m_oclbase->ocl_setKernelArg(6, sizeof(int), &sensors);
|
||||
m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
|
||||
m_oclbase->ocl_setKernelArg(8, sizeof(double)*numpar, NULL);
|
||||
m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
result = ocl_sum(chi, sensors*length);
|
||||
|
||||
return DKS_SUCCESS;
|
||||
}
|
||||
|
||||
int OpenCLChiSquare::ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result)
|
||||
{
|
||||
|
||||
//set number of work items and work group sizes for kernel execution
|
||||
size_t work_size = 128;
|
||||
size_t work_items = (size_t)length * sensors;
|
||||
if (length % work_size > 0)
|
||||
work_items = (length / work_size + 1) * work_size;
|
||||
|
||||
cl_mem data = (cl_mem)mem_data;
|
||||
cl_mem t0 = (cl_mem)mem_t0;
|
||||
cl_mem par = (cl_mem)mem_par;
|
||||
cl_mem chi = (cl_mem)mem_result;
|
||||
|
||||
//load and execute PHistotFFcn kernel
|
||||
m_oclbase->ocl_createKernel("kernelSingleGaussTF");
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
|
||||
m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
|
||||
m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
|
||||
m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
|
||||
m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
|
||||
m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
|
||||
m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
result = ocl_sum(chi, length);
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int OpenCLChiSquare::ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result)
|
||||
{
|
||||
|
||||
//set number of work items and work group sizes for kernel execution
|
||||
size_t work_size = 128;
|
||||
size_t work_items = (size_t)length * sensors;
|
||||
if (length % work_size > 0)
|
||||
work_items = (length / work_size + 1) * work_size;
|
||||
|
||||
cl_mem data = (cl_mem)mem_data;
|
||||
cl_mem t0 = (cl_mem)mem_t0;
|
||||
cl_mem par = (cl_mem)mem_par;
|
||||
cl_mem chi = (cl_mem)mem_result;
|
||||
|
||||
//load and execute PHistotFFcn kernel
|
||||
m_oclbase->ocl_createKernel("kernelDoubleLorentzTF");
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
|
||||
m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
|
||||
m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
|
||||
m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
|
||||
m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
|
||||
m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
|
||||
m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
result = ocl_sum(chi, length);
|
||||
|
||||
return DKS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
53
src/OpenCL/OpenCLChiSquare.h
Normal file
53
src/OpenCL/OpenCLChiSquare.h
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef H_OPENCL_CHI_SQUARE
|
||||
#define H_OPENCL_CHI_SQUARE
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include "OpenCLBase.h"
|
||||
|
||||
#define DKS_SUCCESS 0
|
||||
#define DKS_ERROR 1
|
||||
|
||||
|
||||
class OpenCLChiSquare {
|
||||
|
||||
private:
|
||||
|
||||
OpenCLBase *m_oclbase;
|
||||
|
||||
double ocl_sum(cl_mem data, int length);
|
||||
|
||||
public:
|
||||
|
||||
OpenCLChiSquare(OpenCLBase *base) {
|
||||
m_oclbase = base;
|
||||
}
|
||||
|
||||
~OpenCLChiSquare() { }
|
||||
|
||||
int ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin,
|
||||
int sensors, int length, int numpar,
|
||||
double &result);
|
||||
|
||||
int ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result);
|
||||
|
||||
int ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||
int sensors, int length, int numpar,
|
||||
double &result);
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
#endif
|
316
src/OpenCL/OpenCLChiSquareRuntime.cpp
Normal file
316
src/OpenCL/OpenCLChiSquareRuntime.cpp
Normal file
@ -0,0 +1,316 @@
|
||||
#include "OpenCLChiSquareRuntime.h"
|
||||
|
||||
/*
 * Set up the runtime chi-square object on top of an existing OpenCLBase.
 * No device memory is allocated here; that happens in initChiSquare.
 */
OpenCLChiSquareRuntime::OpenCLChiSquareRuntime(OpenCLBase *base)
  : m_oclbase(base)
{

  // nothing allocated or compiled yet
  initDone_m = false;
  ptx_m = NULL;

  // launch configuration: a negative block count means "derive it from the data length"
  blockSize_m = BLOCK_SIZE;
  numBlocks_m = -1;

  // constants handed to the kernels, all starting at 1.0
  N0_m = 1.0;
  tau_m = 1.0;
  bkg_m = 1.0;
  alpha_m = 1.0;
  beta_m = 1.0;

}
|
||||
|
||||
//free temporary resources
|
||||
OpenCLChiSquareRuntime::~OpenCLChiSquareRuntime() {
|
||||
delete[] ptx_m;
|
||||
freeChiSquare();
|
||||
}
|
||||
|
||||
//build program string
|
||||
std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
|
||||
|
||||
long fsize;
|
||||
char *kernel_source;
|
||||
|
||||
//get kernel source
|
||||
char * kernel_file = new char[500];
|
||||
kernel_file[0] = '\0';
|
||||
strcat(kernel_file, OPENCL_KERNELS);
|
||||
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl");
|
||||
|
||||
//read kernels from file
|
||||
FILE *fp = fopen(kernel_file, "rb");
|
||||
if (!fp)
|
||||
DEBUG_MSG("Can't open kernel file" << kernel_file);
|
||||
|
||||
//get file size and allocate memory
|
||||
fseek(fp, 0, SEEK_END);
|
||||
fsize = ftell(fp);
|
||||
kernel_source = new char[fsize+1];
|
||||
|
||||
//read file and content in kernel source
|
||||
rewind(fp);
|
||||
fread(kernel_source, 1, sizeof(char)*fsize, fp);
|
||||
kernel_source[fsize] = '\0';
|
||||
fclose(fp);
|
||||
|
||||
std::string kernel_string (kernel_source);
|
||||
return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
|
||||
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {
|
||||
|
||||
//build program string
|
||||
std::string openclProg = buildProgram(function);
|
||||
|
||||
//compile flags
|
||||
std::string opts("");
|
||||
if (mlh)
|
||||
opts = "-DMLH";
|
||||
|
||||
//compile opencl program from source string
|
||||
int ierr = m_oclbase->ocl_loadKernelFromSource(openclProg.c_str(), opts.c_str());
|
||||
|
||||
return ierr;
|
||||
}
|
||||
|
||||
/*
 * Sum `length` doubles of a device buffer.
 * The "parallelReductionTwoPhase" kernel is always launched with 80 work
 * groups of 128 items, independent of length; the 80 values of the temporary
 * device buffer are read back and added up on the host.
 */
double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

  int ierr;

  // fixed launch configuration: 80 work groups of 128 work items
  size_t groupSize = 128;
  int numGroups = 80;
  size_t globalSize = 80 * groupSize;

  // host and device storage for the partial sums
  double *hostPartials = new double[numGroups];
  cl_mem devPartials = m_oclbase->ocl_allocateMemory(numGroups * sizeof(double), ierr);

  // run the reduction; argument 2 is local scratch memory for one work group
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &devPartials);
  m_oclbase->ocl_setKernelArg(2, groupSize*sizeof(double), NULL);
  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &globalSize, &groupSize);

  // fetch the partial sums, then drop the temporary device buffer
  m_oclbase->ocl_readData(devPartials, hostPartials, sizeof(double)*numGroups);
  m_oclbase->ocl_freeMemory(devPartials);

  // finish the reduction on the host, in index order
  double total = 0;
  int g = 0;
  while (g < numGroups) {
    total += hostPartials[g];
    ++g;
  }

  delete[] hostPartials;

  return total;

}
|
||||
|
||||
int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
||||
void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep, double &result)
|
||||
{
|
||||
|
||||
int ierr;
|
||||
|
||||
//convert memory to cl_mem
|
||||
cl_mem cl_mem_data = (cl_mem)mem_data;
|
||||
cl_mem cl_mem_err = (cl_mem)mem_err;
|
||||
|
||||
cl_mem cl_param = (cl_mem)mem_param_m;
|
||||
cl_mem cl_chisq = (cl_mem)mem_chisq_m;
|
||||
cl_mem cl_map = (cl_mem)mem_map_m;
|
||||
cl_mem cl_func = (cl_mem)mem_func_m;
|
||||
|
||||
//set work item size
|
||||
size_t work_items;
|
||||
size_t work_size = (size_t)blockSize_m;
|
||||
if (numBlocks_m < 0)
|
||||
work_items = (size_t)length;
|
||||
else
|
||||
work_items = (size_t)numBlocks_m * (size_t)blockSize_m;
|
||||
|
||||
if (work_items % work_size > 0)
|
||||
work_items = (work_items / work_size + 1) * work_size;
|
||||
|
||||
if (fitType == FITTYPE_SINGLE_HISTO) {
|
||||
//create kernel
|
||||
ierr = m_oclbase->ocl_createKernel("kernelChiSquareSingleHisto");
|
||||
|
||||
if (ierr != DKS_SUCCESS)
|
||||
return ierr;
|
||||
|
||||
//set kernel args
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
|
||||
m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
|
||||
m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
|
||||
m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
|
||||
m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
|
||||
m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
|
||||
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
|
||||
m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
|
||||
m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
|
||||
m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
|
||||
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
|
||||
m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
|
||||
m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
|
||||
|
||||
if (ierr != DKS_SUCCESS)
|
||||
return ierr;
|
||||
} else if (fitType == FITTYPE_ASYMMETRY) {
|
||||
//create kernel
|
||||
ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
|
||||
|
||||
if (ierr != DKS_SUCCESS)
|
||||
return ierr;
|
||||
|
||||
//set kernel args
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
|
||||
m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
|
||||
m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
|
||||
m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
|
||||
m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
|
||||
m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
|
||||
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
|
||||
m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
|
||||
m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
|
||||
m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
|
||||
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
|
||||
m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
|
||||
|
||||
if (ierr != DKS_SUCCESS)
|
||||
return ierr;
|
||||
} else if (fitType == FITTYPE_MU_MINUS) {
|
||||
// not yet implemented
|
||||
} else {
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
//execute kernel
|
||||
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
|
||||
|
||||
if (ierr != DKS_SUCCESS)
|
||||
return ierr;
|
||||
|
||||
//execute sum kernel
|
||||
result = calculateSum((cl_mem)mem_chisq_m, length);
|
||||
|
||||
return ierr;
|
||||
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
|
||||
int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
|
||||
if (numfunc == 0)
|
||||
return DKS_SUCCESS;
|
||||
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
|
||||
if (nummap == 0)
|
||||
return DKS_SUCCESS;
|
||||
|
||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
|
||||
return ierr;
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
|
||||
int size_func, int size_map)
|
||||
{
|
||||
|
||||
int ierr = DKS_ERROR;
|
||||
if (initDone_m) {
|
||||
DEBUG_MSG("Reinitializing ChiSquare");
|
||||
freeChiSquare();
|
||||
}
|
||||
|
||||
//allocate temporary memory
|
||||
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
|
||||
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
|
||||
if (size_func == 0)
|
||||
size_func = 1;
|
||||
mem_func_m = m_oclbase->ocl_allocateMemory(size_func*sizeof(double), ierr);
|
||||
if (size_map == 0)
|
||||
size_map = 1;
|
||||
mem_map_m = m_oclbase->ocl_allocateMemory(size_map*sizeof(int), ierr);
|
||||
initDone_m = true;
|
||||
|
||||
return ierr;
|
||||
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::freeChiSquare() {
|
||||
|
||||
int ierr = DKS_ERROR;
|
||||
if (initDone_m) {
|
||||
|
||||
//free memory
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
|
||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
|
||||
|
||||
initDone_m = false;
|
||||
}
|
||||
|
||||
return ierr;
|
||||
|
||||
}
|
||||
|
||||
int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBlock) {
|
||||
|
||||
int ierr;
|
||||
char kernel[64];
|
||||
|
||||
switch (fitType) {
|
||||
case FITTYPE_SINGLE_HISTO:
|
||||
strncpy(kernel, "kernelChiSquareSingleHisto", sizeof(kernel));
|
||||
break;
|
||||
case FITTYPE_ASYMMETRY:
|
||||
strncpy(kernel, "kernelChiSquareAsymmetry", sizeof(kernel));
|
||||
break;
|
||||
case FITTYPE_MU_MINUS:
|
||||
// not yet implemented
|
||||
default:
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
|
||||
|
||||
return ierr;
|
||||
|
||||
}
|
||||
|
103
src/OpenCL/OpenCLChiSquareRuntime.h
Normal file
103
src/OpenCL/OpenCLChiSquareRuntime.h
Normal file
@ -0,0 +1,103 @@
|
||||
#ifndef H_OPENCL_CHISQUARE_RUNTIME
|
||||
#define H_OPENCL_CHISQUARE_RUNTIME
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include "../Algorithms/ChiSquareRuntime.h"
|
||||
#include "OpenCLBase.h"
|
||||
|
||||
/** Opening line of the fTheory definition; the user supplied expression is expected
 *  to follow it and be closed by openclFunctFooter.
 */
const std::string openclFunctHeader("double fTheory(double t, __local double *p, __local double *f, __local int *m) {");

/** Closing brace of the fTheory definition. */
const std::string openclFunctFooter("}\n");
|
||||
|
||||
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
|
||||
|
||||
private:
|
||||
|
||||
OpenCLBase *m_oclbase;
|
||||
|
||||
/** Private function to add user defined function to kernel string
|
||||
*
|
||||
*/
|
||||
std::string buildProgram(std::string function);
|
||||
|
||||
double calculateSum(cl_mem data, int length);
|
||||
|
||||
public:
|
||||
|
||||
/** Constructor wiht openclbase argument
|
||||
*
|
||||
*/
|
||||
OpenCLChiSquareRuntime(OpenCLBase *base);
|
||||
|
||||
/** Default constructor
|
||||
*
|
||||
*/
|
||||
OpenCLChiSquareRuntime();
|
||||
|
||||
/** Default destructor
|
||||
*
|
||||
*/
|
||||
~OpenCLChiSquareRuntime();
|
||||
|
||||
/** Compile program and save ptx.
|
||||
* Add function string to the calcFunction kernel and compile the program
|
||||
* Function must be valid C math expression. Parameters can be addressed in
|
||||
* a form par[map[idx]]
|
||||
*/
|
||||
int compileProgram(std::string function, bool mlh = false);
|
||||
|
||||
/** Launch selected kernel
|
||||
* Launched the selected kernel from the compiled code.
|
||||
* Result is put in &result variable
|
||||
*/
|
||||
int launchChiSquare(int fitType,
|
||||
void *mem_data, void *mem_err, int length,
|
||||
int numpar, int numfunc, int nummap,
|
||||
double timeStart, double timeStep,
|
||||
double &result);
|
||||
|
||||
/** Write params to device.
|
||||
* Write params from double array to mem_param_m memory on the device.
|
||||
*/
|
||||
int writeParams(const double *params, int numparams);
|
||||
|
||||
/** Write functions to device.
|
||||
* Write function values from double array to mem_func_m memory on the device.
|
||||
*/
|
||||
int writeFunc(const double *func, int numfunc);
|
||||
|
||||
/** Write maps to device.
|
||||
* Write map values from int array to mem_map_m memory on the device.
|
||||
*/
|
||||
int writeMap(const int *map, int nummap);
|
||||
|
||||
/** Allocate temporary memory needed for chi square.
|
||||
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
|
||||
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
||||
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
||||
* calculations.
|
||||
*/
|
||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||
|
||||
/** Free temporary memory allocated for chi square.
|
||||
* Frees the chisq temporary memory and memory for params, functions and maps
|
||||
*/
|
||||
int freeChiSquare();
|
||||
|
||||
/** Check MuSR kernels for necessary resources.
|
||||
* Query device properties to get if sufficient resources are
|
||||
* available to run the kernels
|
||||
*/
|
||||
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
107
src/OpenCL/OpenCLCollimatorPhysics.cpp
Normal file
107
src/OpenCL/OpenCLCollimatorPhysics.cpp
Normal file
@ -0,0 +1,107 @@
|
||||
#include "OpenCLCollimatorPhysics.h"
|
||||
|
||||
//physical constants
#define M_P 0.93827231e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
#define eM_E 0.51099906e-03
#define Z_P 1
//parenthesized so that the product stays one operand when K is used inside a larger expression
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)

//indices into the parameter array
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11

#define BLOCK_SIZE 128
#define NUMPAR 12
|
||||
|
||||
/*
|
||||
TODO:
|
||||
1. test OpenCL kernel
|
||||
- is it launched for all particles
|
||||
- does the random number generator function properly
|
||||
- is particle structure updated correctly in memory
|
||||
2. boost.compute sort for user defined structure crashes
|
||||
*/
|
||||
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||
int numparticles)
|
||||
{
|
||||
/*
|
||||
//set number of total threads, and number threads per block
|
||||
size_t threads = 1;
|
||||
size_t blocks = numparticles;
|
||||
|
||||
//cast void ptrs to cl_mem ptrs
|
||||
cl_mem data = (cl_mem)mem_ptr;
|
||||
cl_mem params = (cl_mem)par_ptr;
|
||||
|
||||
int numparams = 19;
|
||||
|
||||
//set kernel to execute and kernel arguments
|
||||
ocl_createKernel("kernelCollimatorPhysics");
|
||||
ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
||||
ocl_setKernelArg(1, sizeof(cl_mem), ¶ms);
|
||||
ocl_setKernelArg(2, sizeof(cl_mem), &defaultRndState);
|
||||
ocl_setKernelArg(3, sizeof(int), &numparticles);
|
||||
ocl_setKernelArg(4, sizeof(double)*numparams, NULL);
|
||||
|
||||
std::cout << "blocks: " << blocks << ", threads: " << threads << std::endl;
|
||||
|
||||
//execute kernel on device
|
||||
ocl_executeKernel(1, &blocks, &threads);
|
||||
|
||||
//create functions for comparing two particles and counting particles with labels < 0
|
||||
|
||||
BOOST_COMPUTE_FUNCTION(bool, sort_by_label, (PART_OPENCL a, PART_OPENCL b),
|
||||
{
|
||||
return a.label < b.label;
|
||||
});
|
||||
|
||||
|
||||
|
||||
BOOST_COMPUTE_FUNCTION(bool, count_by_label, (PART_OPENCL a),
|
||||
{
|
||||
return a.label < 0;
|
||||
});
|
||||
|
||||
|
||||
//wrap cl_mem memory object in Boost.Compute buffer
|
||||
std::cout << "wrap buffer" << std::endl;
|
||||
boost::compute::buffer buf(data);
|
||||
|
||||
//count particles with labels < 0
|
||||
std::cout << "wrap command queue" << std::endl;
|
||||
boost::compute::command_queue queue(ocl_getQueue());
|
||||
|
||||
std::cout << "count if" << std::endl;
|
||||
|
||||
|
||||
numaddback = boost::compute::count_if(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0),
|
||||
boost::compute::make_buffer_iterator<PART_OPENCL>(buf,numparticles),
|
||||
count_by_label, queue);
|
||||
|
||||
//sort particles with dead and leaving particles at the end using boos::compute
|
||||
numaddback = 0;
|
||||
if (numaddback > 0) {
|
||||
std::cout << "sort" << std::endl;
|
||||
boost::compute::sort(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0),
|
||||
boost::compute::make_buffer_iterator<PART_OPENCL>(buf, numparticles),
|
||||
sort_by_label, queue);
|
||||
}
|
||||
|
||||
|
||||
return DKS_SUCCESS;
|
||||
*/
|
||||
std::cout << "OpenCL implementation disabled" << std::endl;
|
||||
return DKS_ERROR;
|
||||
}
|
85
src/OpenCL/OpenCLCollimatorPhysics.h
Normal file
85
src/OpenCL/OpenCLCollimatorPhysics.h
Normal file
@ -0,0 +1,85 @@
|
||||
#ifndef H_OPENCL_DEGRADER
|
||||
#define H_OPENCL_DEGRADER
|
||||
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
|
||||
#include "../Algorithms/CollimatorPhysics.h"
|
||||
#include "OpenCLBase.h"
|
||||
|
||||
/*
|
||||
#include "boost/compute/types/struct.hpp"
|
||||
#include "boost/compute/type_traits/type_name.hpp"
|
||||
#include "boost/compute/algorithm/count_if.hpp"
|
||||
#include "boost/compute/algorithm/sort.hpp"
|
||||
#include "boost/compute/container/vector.hpp"
|
||||
#include "boost/compute/iterator/buffer_iterator.hpp"
|
||||
#include "boost/compute/core.hpp"
|
||||
*/
|
||||
|
||||
/* three component vector of doubles */
typedef struct {
  double x, y, z;
} Double3;
|
||||
|
||||
typedef struct {
|
||||
int label;
|
||||
unsigned localID;
|
||||
|
||||
Double3 Rincol;
|
||||
Double3 Pincol;
|
||||
} PART_OPENCL;
|
||||
|
||||
//adapt struct PART for use in Boost.Compute
|
||||
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
|
||||
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
|
||||
|
||||
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
|
||||
|
||||
private:
|
||||
OpenCLBase *m_oclbase;
|
||||
|
||||
public:
|
||||
|
||||
/* constructor */
|
||||
OpenCLCollimatorPhysics(OpenCLBase *base) {
|
||||
m_oclbase = base;
|
||||
}
|
||||
|
||||
/* destructor */
|
||||
~OpenCLCollimatorPhysics() {
|
||||
}
|
||||
|
||||
/* execute degrader code on device */
|
||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
|
||||
|
||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles) { return DKS_ERROR; }
|
||||
|
||||
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) { return DKS_ERROR; }
|
||||
|
||||
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||
void *par_ptr, int numparticles, int &numaddback) { return DKS_ERROR; }
|
||||
|
||||
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false, int streamId = -1)
|
||||
{
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
||||
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
||||
double dt, double c, bool usedt = false,
|
||||
int streamId = -1)
|
||||
{
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
303
src/OpenCL/OpenCLFFT.cpp
Normal file
303
src/OpenCL/OpenCLFFT.cpp
Normal file
@ -0,0 +1,303 @@
|
||||
#include "OpenCLFFT.h"
|
||||
|
||||
//=====================================//
|
||||
//==========Private functions==========//
|
||||
//=====================================//
|
||||
|
||||
/*
|
||||
call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform,
|
||||
ndim - totla number of dimmensions, N - size of dimension
|
||||
*/
|
||||
int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) {
|
||||
|
||||
//set the number of work items in each dimension
|
||||
size_t work_items[3];
|
||||
work_items[0] = N;
|
||||
work_items[1] = (ndim > 1) ? N : 1;
|
||||
work_items[2] = (ndim > 1) ? N : 1;
|
||||
work_items[cdim] = N / 2;
|
||||
|
||||
int f = (forward) ? 1 : 0;
|
||||
|
||||
//create kernel and set kernel arguments
|
||||
if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
|
||||
//execute kernel
|
||||
for (int step = 1; step < N; step <<= 1) {
|
||||
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
}
|
||||
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
call ifft kernel to execute the bit reverse sort data - devevice memory ptr, cdim - current dim to transform,
|
||||
ndim - totla number of dimmensions, N - size of dimension
|
||||
*/
|
||||
int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {
|
||||
//set work item size
|
||||
size_t work_items[3];
|
||||
work_items[0] = N;
|
||||
work_items[1] = (ndim > 1) ? N : 1;
|
||||
work_items[2] = (ndim > 2) ? N : 1;
|
||||
|
||||
//create kernel and set kernel arguments
|
||||
if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
int bits = log2(N);
|
||||
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
//execute kernel
|
||||
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
|
||||
DEBUG_MSG("Error executing kernel");
|
||||
return OCL_ERROR;
|
||||
}
|
||||
|
||||
return OCL_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
//=====================================//
|
||||
//==========Public functions==========//
|
||||
//=====================================//
|
||||
|
||||
/*
|
||||
call fft execution on device for every dimension
|
||||
*/
|
||||
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
||||
int ierr;
|
||||
|
||||
cl_mem inout = (cl_mem)data;
|
||||
int n = N[0];
|
||||
|
||||
for (int dim = 0; dim < ndim; dim++) {
|
||||
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
|
||||
if (ierr != OCL_SUCCESS) {
|
||||
DEBUG_MSG("Error executing bit reverse");
|
||||
return OCL_ERROR;
|
||||
}
|
||||
|
||||
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
|
||||
if (ierr != OCL_SUCCESS) {
|
||||
DEBUG_MSG("Error executing fft reverse");
|
||||
return OCL_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
execute ifft
|
||||
*/
|
||||
int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
|
||||
executeFFT(data, ndim, N, streamId, false);
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
call kernel to normalize fft
|
||||
*/
|
||||
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
||||
|
||||
cl_mem inout = (cl_mem)data;
|
||||
|
||||
int n = N[0];
|
||||
|
||||
//set work item size
|
||||
size_t work_items[3];
|
||||
work_items[0] = n;
|
||||
work_items[1] = (ndim > 1) ? n : 1;
|
||||
work_items[2] = (ndim > 2) ? n : 1;
|
||||
|
||||
//create kernel
|
||||
if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
//set kernel args
|
||||
unsigned int elements = pow(n, ndim);
|
||||
if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
//execute kernel
|
||||
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) {
|
||||
DEBUG_MSG("Error executing kernel");
|
||||
return OCL_ERROR;
|
||||
}
|
||||
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
|
||||
|
||||
int ierr;
|
||||
int size = sizeof(cl_double2)*pow(N,ndim);
|
||||
|
||||
cl_mem mem_tmp;
|
||||
cl_mem mem_src = (cl_mem)src;
|
||||
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
|
||||
|
||||
//set the number of work items in each dimension
|
||||
size_t work_items[3];
|
||||
int p = 1;
|
||||
int threads = N / 2;
|
||||
int f = (forward) ? -1 : 1;
|
||||
|
||||
//execute kernel
|
||||
int n = (int)log2(N);
|
||||
for (int i = 0; i < ndim; i++) {
|
||||
|
||||
int dim = i+1;
|
||||
p = 1;
|
||||
work_items[0] = (dim == 1) ? N/2 : N;
|
||||
work_items[1] = (dim == 2) ? N/2 : N;
|
||||
work_items[2] = (dim == 3) ? N/2 : N;
|
||||
|
||||
//transpose array if calculating dimension larger than 1
|
||||
//if (dim > 1)
|
||||
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||
|
||||
//create kernel and set kernel arguments
|
||||
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
for (int t = 1; t <= log2(N); t++) {
|
||||
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
|
||||
|
||||
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
||||
return OCL_ERROR;
|
||||
|
||||
mem_tmp = mem_src;
|
||||
mem_src = mem_dst;
|
||||
mem_dst = mem_tmp;
|
||||
|
||||
p = 2*p;
|
||||
}
|
||||
|
||||
//transpose array back if calculating dimension larger than 1
|
||||
//if (dim > 1)
|
||||
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||
}
|
||||
|
||||
if (ndim*n % 2 == 1) {
|
||||
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
|
||||
mem_tmp = mem_src;
|
||||
mem_src = mem_dst;
|
||||
mem_dst = mem_tmp;
|
||||
}
|
||||
|
||||
m_oclbase->ocl_freeMemory(mem_dst);
|
||||
|
||||
return OCL_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
|
||||
|
||||
cl_mem mem_src = (cl_mem)src;
|
||||
|
||||
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
|
||||
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
|
||||
|
||||
m_oclbase->ocl_createKernel("fft_batch3D");
|
||||
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
|
||||
|
||||
|
||||
for (int dim = 1; dim < ndim+1; dim++) {
|
||||
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
|
||||
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
|
||||
}
|
||||
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
|
||||
|
||||
cl_mem mem_src = (cl_mem)src;
|
||||
|
||||
if (ndim == 1)
|
||||
return OCL_SUCCESS;
|
||||
|
||||
size_t work_items[3];
|
||||
work_items[0] = N[0];
|
||||
work_items[1] = N[1];
|
||||
work_items[2] = 1;
|
||||
|
||||
size_t work_group_size[3];
|
||||
work_group_size[0] = N[0];
|
||||
work_group_size[1] = N[1];
|
||||
work_group_size[2] = 1;
|
||||
|
||||
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
|
||||
|
||||
m_oclbase->ocl_createKernel("transpose");
|
||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
|
||||
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
|
||||
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
|
||||
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
|
||||
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
|
||||
|
||||
return OCL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
|
||||
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
double d = data[i*N*N + j*N + k].x;
|
||||
if (d > 10e-5 || d < -10e-5)
|
||||
std::cout << d << "\t";
|
||||
else
|
||||
std::cout << 0 << "\t";
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
113
src/OpenCL/OpenCLFFT.h
Normal file
113
src/OpenCL/OpenCLFFT.h
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
|
||||
Name: OpenCLFFT
|
||||
|
||||
Author: Uldis Locans
|
||||
|
||||
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
|
||||
|
||||
Date: 19.09.2014
|
||||
|
||||
*/
|
||||
#ifndef H_OPENCL_FFT
|
||||
#define H_OPENCL_FFT
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <complex>
|
||||
|
||||
#include "../Algorithms/FFT.h"
|
||||
#include "OpenCLBase.h"
|
||||
|
||||
class OpenCLFFT : public DKSFFT {
|
||||
|
||||
private:
|
||||
|
||||
OpenCLBase *m_oclbase;
|
||||
|
||||
/*
|
||||
Info: call fft kernels to execute FFT of the given domain,
|
||||
data - devevice memory ptr, cdim - current dim to transform,
|
||||
ndim - totla number of dimmensions, N - size of dimension
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward = true);
|
||||
|
||||
/*
|
||||
Info: call ifft kernel to execute the bit reverse sort
|
||||
data - devevice memory ptr, cdim - current dim to transform,
|
||||
ndim - totla number of dimmensions, N - size of dimension
|
||||
Return: success or error code
|
||||
*/
|
||||
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
||||
|
||||
public:
|
||||
|
||||
/* constructor - currently does nothing*/
|
||||
OpenCLFFT(OpenCLBase *base) {
|
||||
m_oclbase = base;
|
||||
}
|
||||
|
||||
/* destructor - currently does nothing*/
|
||||
~OpenCLFFT() { }
|
||||
|
||||
/*
|
||||
Info: execute forward fft function with data set on device
|
||||
Return: success or error code
|
||||
*/
|
||||
//int ocl_executeFFT(cl_mem &data, int ndim, int N, bool forward = true);
|
||||
int executeFFT(void *data, int ndim, int N[3], int streamId = -1, bool forward = true);
|
||||
|
||||
/*
|
||||
Info: execute inverse fft with data set on device
|
||||
Return: success or error code
|
||||
*/
|
||||
//int ocl_executeIFFT(cl_mem &data, int ndim, int N);
|
||||
int executeIFFT(void *data, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: execute normalize kernel
|
||||
Return: success or error code
|
||||
*/
|
||||
//int ocl_normalizeFFT(cl_mem &data, int ndim, int N);
|
||||
int normalizeFFT(void *data, int ndim, int N[3], int streamId = -1);
|
||||
|
||||
/*
|
||||
Info: set FFT size
|
||||
Return: success or error code
|
||||
*/
|
||||
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
|
||||
|
||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||
|
||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||
|
||||
int destroyFFT() { return DKS_SUCCESS; }
|
||||
|
||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||
int streamId = -1)
|
||||
{
|
||||
return DKS_ERROR;
|
||||
}
|
||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||
int streamId = -1)
|
||||
{
|
||||
return DKS_ERROR;
|
||||
}
|
||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
|
||||
{
|
||||
return DKS_ERROR;
|
||||
}
|
||||
|
||||
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
|
||||
|
||||
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
|
||||
|
||||
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
|
||||
|
||||
//void printData3DN4(cl_double2* &data, int N);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
175
src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl
Normal file
175
src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl
Normal file
@ -0,0 +1,175 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
#define TAU 2.197019
|
||||
|
||||
|
||||
/* Sum the elements of data_in per work group: every work group writes one
   partial sum to data_out[group id]. data_local is the per-group staging buffer. */
__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
                                   __local double *data_local, int size)
{

  int lid = get_local_id(0);
  int gid = get_global_id(0);
  int lsize = get_local_size(0);

  //stage one element per work item; positions past the end of the input contribute 0
  data_local[lid] = (gid < size) ? data_in[gid] : 0;

  //halve the number of active work items every step
  uint stride = lsize / 2;
  while (stride > 0) {

    //make the previous step's writes visible to the whole work group
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < stride)
      data_local[lid] += data_local[lid + stride];

    stride /= 2;
  }

  //work item 0 holds the partial sum of its work group
  if (lid == 0)
    data_out[get_group_id(0)] = data_local[0];

}
|
||||
|
||||
/* Per-bin chi square for every sensor: work item j handles bin j of each sensor and
   writes chisq[i*length + j]. par holds numpar parameters which are staged in the
   local buffer p before use. */
__kernel void kernelPHistoTFFcn(__global double *data, __global double *par, __global double *chisq,
                                double fTimeResolution, double fRebin,
                                int length, int sensors, int numpar,
                                __local double *p)
{

  //get work item id and calc global id
  int tid = get_local_id(0);
  int j = get_global_id(0);

  //load parameters from global to local memory; the strided loop also covers the case
  //where there are more parameters than work items in the group
  for (int k = tid; k < numpar; k += (int)get_local_size(0))
    p[k] = par[k];

  //sync work items inside work group
  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

  if (j < length) {

    //values that do not depend on the sensor
    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
    double time = dt0 + fTimeResolution * fRebin * j;
    double w = p[0]*0.08516155035269027;
    double tt = exp(-time/TAU);
    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
    double wt = w * time;

    int idx;
    double ldata, theo;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      ldata = data[idx];

      // 1.74532925199432955e-2 = pi/180
      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];

      //empty bins are not divided by the data value
      if (ldata != 0.0)
        chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
      else
        chisq[idx] = theo * theo;
    }
  }
}
|
||||
|
||||
/* Per-bin result for every sensor using the single Gauss theory function: work item j
   handles bin j of each sensor and writes result[i*length + j]. Bins before
   t0[i] + fGoodBinOffset/fRebin are set to 0. par holds numpar parameters which are
   staged in the local buffer p before use. */
__kernel void kernelSingleGaussTF(__global double *data, __global unsigned int *t0,
                                  __global double *par, __global double *result,
                                  double fTimeResolution, double fRebin, double fGoodBinOffset,
                                  int length, int sensors, int numpar, __local double *p)
{

  //get work item id and calc global id
  int tid = get_local_id(0);
  int j = get_global_id(0);

  //load parameters from global to local memory; the strided loop also covers the case
  //where there are more parameters than work items in the group
  for (int k = tid; k < numpar; k += (int)get_local_size(0))
    p[k] = par[k];

  //sync work items inside work group
  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

  if (j < length) {
    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    //read from the staged copy, like every other parameter access in this kernel
    double w1 = p[0]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0 + fTimeResolution * fRebin* (j - lft0);
        // 1.74532925199432955e-2 = pi/180
        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
               *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];

        //the log term is skipped when data or theory is too close to 0
        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }

}
|
||||
|
||||
/* Per-bin result for every sensor using the double Lorentz theory function: work item j
   handles bin j of each sensor and writes result[i*length + j]. Bins before
   t0[i] + fGoodBinOffset/fRebin are set to 0. par holds numpar parameters which are
   staged in the local buffer p before use. */
__kernel void kernelDoubleLorentzTF(__global double *data, __global unsigned int *t0,
                                    __global double *par, __global double *result,
                                    double fTimeResolution, double fRebin, double fGoodBinOffset,
                                    int length, int sensors, int numpar, __local double *p)
{

  //get work item id and calc global id
  int tid = get_local_id(0);
  int j = get_global_id(0);

  //load parameters from global to local memory; the strided loop also covers the case
  //where there are more parameters than work items in the group
  for (int k = tid; k < numpar; k += (int)get_local_size(0))
    p[k] = par[k];

  //sync work items inside work group
  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

  if (j < length) {
    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    double w1 = p[0]*0.08516155035269027;
    double w2 = p[2]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {

      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0+fTimeResolution*fRebin*(j-lft0);

        // 1.74532925199432955e-2 = pi/180
        theo = p[4+i*5]*exp(-time/TAU)*
               (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
               cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
               (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
               cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5];

        //the log term is skipped when data or theory is too close to 0
        if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }

}
|
||||
|
344
src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
Normal file
344
src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl
Normal file
@ -0,0 +1,344 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
#define PI 3.141592653589793115998
|
||||
#define TWO_PI 6.283185307179586231996
|
||||
#define DEG_TO_RAD 1.7453292519943295474371681e-2
|
||||
|
||||
/** From 'Numerical Recipes in C' by Press et.al, 1992. */
|
||||
//Returns the Bessel function J0(x) for any real x.
|
||||
double bessj0(double x) {
|
||||
double ax,z;
|
||||
double xx,y,ans,ans1,ans2; //Accumulate polynomials in double precision.
|
||||
|
||||
if ((ax=fabs(x)) < 8.0) { //Direct rational function fit.
|
||||
y=x*x;
|
||||
ans1=57568490574.0+y*(-13362590354.0+y*(651619640.7+y*(-11214424.18+y*(77392.33017+y*(-184.9052456)))));
|
||||
ans2=57568490411.0+y*(1029532985.0+y*(9494680.718+y*(59272.64853+y*(267.8532712+y*1.0))));
|
||||
ans=ans1/ans2;
|
||||
} else { //Fitting function (6.5.9).
|
||||
z=8.0/ax;
|
||||
y=z*z;
|
||||
xx=ax-0.785398164;
|
||||
ans1=1.0+y*(-0.1098628627e-2+y*(0.2734510407e-4+y*(-0.2073370639e-5+y*0.2093887211e-6)));
|
||||
ans2 = -0.1562499995e-1+y*(0.1430488765e-3+y*(-0.6911147651e-5+y*(0.7621095161e-6-y*0.934945152e-7)));
|
||||
ans=sqrt(0.636619772/ax)*(cos(xx)*ans1-z*sin(xx)*ans2);
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
/** Theory function declaration.
|
||||
* Definition of the theory function will be built at runtime before compilation.
|
||||
*/
|
||||
double fTheory(double t, __local double *p, __local double *f, __local int *m);
|
||||
|
||||
/** MusrFit predefined functions.
|
||||
* Predefined functions from MusrFit that can be used to define the theory function.
|
||||
* First parameter in all the functions is always time - t, rest of the parameters depend
|
||||
* on the function.
|
||||
*/
|
||||
double se(double t, double lamda) {
|
||||
return exp( -lamda*t );
|
||||
}
|
||||
|
||||
double ge(double t, double lamda, double beta) {
|
||||
return exp( -pow(lamda*t, beta) );
|
||||
}
|
||||
|
||||
double sg(double t, double sigma) {
|
||||
return exp( -0.5 * pow(sigma*t, 2) );
|
||||
}
|
||||
|
||||
double stg(double t, double sigma) {
|
||||
double sigmatsq = pow(sigma*t,2);
|
||||
return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
|
||||
}
|
||||
|
||||
double sekt(double t, double lambda) {
|
||||
double lambdat = lambda*t;
|
||||
|
||||
return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
|
||||
}
|
||||
|
||||
// Returns 1/3 + 2/3 * (1 - lambda*t - (sigma*t)^2) * exp(-lambda*t - (sigma*t)^2 / 2).
//   t      - time
//   lambda - rate of the linear term
//   sigma  - rate of the quadratic term
double lgkt(double t, double lambda, double sigma) {
  const double lin = lambda*t;
  const double quad = pow(sigma*t, 2.0);
  const double envelope = exp(-lin - 0.5*quad);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lin - quad) * envelope;
}
|
||||
|
||||
// Returns 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta),
// or 0 when beta is below 1e-3 (the division by beta is then skipped).
//   t     - time
//   sigma - rate multiplying t
//   beta  - exponent
double skt(double t, double sigma, double beta) {
  double result = 0.0;
  if (!(beta < 1.0e-3)) {
    const double x = pow(sigma*t, beta);
    result = (1.0/3.0) + (2.0/3.0)*(1.0 - x) * exp(-x/beta);
  }
  return result;
}
|
||||
|
||||
// Returns 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q / rateT) * exp(-rateT) with
//   lamt2q = q * (lambda*t)^2
//   rate2  = 4 * lambda^2 * (1 - q) * t / gamma
//   rateL  = sqrt(|rate2|),  rateT = sqrt(|rate2| + lamt2q)
//
//   t      - time
//   lambda - rate entering both terms
//   gamma  - divisor of the linear-in-t term
//   q      - weight between the quadratic (q) and linear (1 - q) terms
double spg(double t, double lambda, double gamma, double q) {
  double lam2 = lambda*lambda;
  double lamt2q = t*t*lam2*q;
  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
  double rateL = sqrt(fabs(rate2));
  double rateT = sqrt(fabs(rate2)+lamt2q);

  // At t == 0 or lambda == 0 both lamt2q and rateT are zero and the quotient
  // would be 0/0 = NaN, which then poisons the whole chi-square sum. lamt2q
  // vanishes faster than rateT, so the limit of the quotient is 0.
  double ratio = (lamt2q == 0.0) ? 0.0 : lamt2q / rateT;

  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - ratio)*exp(-rateT);
}
|
||||
|
||||
// Returns 1/6 * (1 - nu*t/2) * exp(-nu*t/2)
//       + 1/3 * (1 - nu*t/4) * exp(-(nu*t + 2.44949*lambda*t) / 4).
//   t      - time
//   nu     - rate of the first term
//   lambda - additional rate in the second exponential
double rahf(double t, double nu, double lambda) {
  const double nut = nu*t;
  const double half_nut = nut/2.0;
  const double lamt = lambda*t;

  const double first = (1.0/6.0)*(1.0-half_nut)*exp(-half_nut);
  const double second = (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
  return first + second;
}
|
||||
|
||||
// Returns cos(TWO_PI*nu*t + DEG_TO_RAD*phi).
//   t   - time
//   phi - phase, scaled by DEG_TO_RAD before use
//   nu  - frequency, scaled by TWO_PI before use
double tf(double t, double phi, double nu) {
  const double phase = DEG_TO_RAD * phi;
  const double omega_t = TWO_PI*nu*t;
  return cos(omega_t + phase);
}
|
||||
|
||||
// Returns alpha * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
//       + (1 - alpha) * exp(-lambdaL*t).
//   t       - time
//   alpha   - weight of the oscillating term
//   phi     - phase, scaled by DEG_TO_RAD
//   nu      - frequency, scaled by TWO_PI
//   lambdaT - damping rate of the oscillating term
//   lambdaL - damping rate of the non-oscillating term
double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double omega_t = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;

  const double oscillating = alpha*cos(omega_t+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
|
||||
|
||||
// Returns bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi).
//   t   - time
//   phi - phase, scaled by DEG_TO_RAD
//   nu  - frequency, scaled by TWO_PI
double b(double t, double phi, double nu) {
  const double arg = TWO_PI*nu*t + DEG_TO_RAD*phi;
  return bessj0(arg);
}
|
||||
|
||||
// Returns alpha * bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
//       + (1 - alpha) * exp(-lambdaL*t).
//   t       - time
//   alpha   - weight of the Bessel term
//   phi     - phase, scaled by DEG_TO_RAD
//   nu      - frequency, scaled by TWO_PI
//   lambdaT - damping rate of the Bessel term
//   lambdaL - damping rate of the remaining term
double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double omega_t = TWO_PI * nu * t;
  const double phase = DEG_TO_RAD * phi;

  const double oscillating = alpha*bessj0(omega_t+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
|
||||
|
||||
// Returns exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)).
//   t     - time
//   sigma - numerator of the squared prefactor
//   gamma - rate multiplying t; also divides sigma, so gamma == 0 is not handled here
double ab(double t, double sigma, double gamma) {
  const double x = gamma*t;
  const double prefactor = pow(sigma/gamma,2.0);
  return exp(-prefactor*(exp(-x) - 1.0 + x));
}
|
||||
|
||||
// With d2 = (Delta0*t)^2 and a = 1 / (1 + Rb^2 * d2), returns
//   1/3 + 2/3 * a^1.5 * (1 - d2*a) * exp(-d2*a / 2).
//   t      - time
//   Delta0 - rate multiplying t
//   Rb     - factor entering the denominator of a
double snkzf(double t, double Delta0, double Rb) {
  const double d2 = pow(Delta0*t, 2.0);
  const double a = 1.0/(1.0+pow(Rb,2.0)*d2);

  const double envelope = exp(-0.5*d2*a);
  return (1.0/3.0) + (2.0/3.0)*pow(a,1.5)*(1.0-d2*a)*envelope;
}
|
||||
|
||||
// With d2 = (Delta0*t)^2 and a = 1 / (1 + Rb^2 * d2), returns
//   sqrt(a) * exp(-d2*a / 2) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi).
//   t      - time
//   phi    - phase, scaled by DEG_TO_RAD
//   nu     - frequency, scaled by TWO_PI
//   Delta0 - rate multiplying t
//   Rb     - factor entering the denominator of a
double snktf(double t, double phi, double nu, double Delta0, double Rb) {
  const double omega_t = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  const double d2 = pow(Delta0*t, 2.0);
  const double a = 1.0/(1.0+pow(Rb,2.0)*d2);

  const double envelope = sqrt(a)*exp(-0.5*d2*a);
  return envelope*cos(omega_t+phase);
}
|
||||
|
||||
// With theta = (exp(-nuc*t) - 1 + nuc*t) / nuc^2 and
// a = 1 / (1 + 4 * (Rb*Delta0)^2 * theta), returns
//   sqrt(a) * exp(-2 * Delta0^2 * theta * a).
//   t      - time
//   Delta0 - width parameter
//   Rb     - factor entering the denominator of a
//   nuc    - rate in theta; nuc == 0 is not handled here (division by nuc^2)
double dnkzf(double t, double Delta0, double Rb, double nuc) {
  double nuct = nuc*t;
  // The linear term must be added, as in ab(): exp(-x) - 1 + x is non-negative
  // and behaves like x^2/2 for small x. With "- nuct" theta is negative for
  // every t > 0, which turns the damping into growth and can make a negative.
  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);

  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
}
|
||||
|
||||
// With theta = (exp(-nuc*t) - 1 + nuc*t) / nuc^2 and
// a = 1 / (1 + 2 * (Rb*Delta0)^2 * theta), returns
//   sqrt(a) * exp(-Delta0^2 * theta * a) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi).
//   t      - time
//   phi    - phase, scaled by DEG_TO_RAD
//   nu     - frequency, scaled by TWO_PI
//   Delta0 - width parameter
//   Rb     - factor entering the denominator of a
//   nuc    - rate in theta; nuc == 0 is not handled here (division by nuc^2)
double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
  double wt = TWO_PI*nu*t;
  double ph = DEG_TO_RAD*phi;
  double nuct = nuc*t;
  // The linear term must be added, as in ab(): exp(-x) - 1 + x is non-negative
  // and behaves like x^2/2 for small x. With "- nuct" theta is negative for
  // every t > 0, which turns the damping into growth and can make a negative.
  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);

  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
}
|
||||
|
||||
// Per-bin chi-square (or, with MLH defined, log-likelihood) terms for a single histogram.
//   data, err   - measured values and their errors, one per bin
//   par, funcv, map - global tables copied into the __local arrays p, f, m
//   chisq       - output, one term per bin
//   length      - number of bins
//   numpar, numfunc, nummap - number of entries in par, funcv, map
//   timeStart, timeStep     - bin j is evaluated at t = timeStart + j*timeStep
//   tau, N0, bkg            - theory is N0 * exp(-t/tau) * (1 + fTheory(...)) + bkg
//   p, f, m     - __local scratch arrays handed to fTheory
__kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
                                         __global double *par, __global double *chisq,
                                         __global int *map, __global double *funcv,
                                         int length, int numpar, int numfunc, int nummap,
                                         double timeStart, double timeStep,
                                         double tau, double N0, double bkg,
                                         __local double *p, __local double *f, __local int *m)
{
  const int lid = get_local_id(0);
  const int lsize = get_local_size(0);

  // every work-item of the group copies a strided share of each table into local memory
  for (int i = lid; i < numpar; i += lsize)
    p[i] = par[i];

  for (int i = lid; i < numfunc; i += lsize)
    f[i] = funcv[i];

  for (int i = lid; i < nummap; i += lsize)
    m[i] = map[i];

  // the tables must be complete before any work-item evaluates the theory
  barrier(CLK_LOCAL_MEM_FENCE);

  // each work-item handles bins j, j + global size, j + 2 * global size, ...
  for (int j = get_global_id(0); j < length; j += get_global_size(0)) {

    const double t = timeStart + j*timeStep;
    const double ldata = data[j];
    const double lerr = err[j];

    const double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;

#ifdef MLH
    if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
    else
      chisq[j] = 2.0 * (theo - ldata);
#else
    // bins without an error estimate contribute theo^2
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif
  }
}
|
||||
|
||||
// Per-bin chi-square terms for an asymmetry fit.
//   data, err   - measured values and their errors, one per bin
//   par, funcv, map - global tables copied into the __local arrays p, f, m
//   chisq       - output, one term per bin (0 for every bin when MLH is defined)
//   length      - number of bins
//   numpar, numfunc, nummap - number of entries in par, funcv, map
//   timeStart, timeStep     - bin j is evaluated at t = timeStart + j*timeStep
//   alpha, beta - with A = fTheory(...), the theory value is
//                 ((alpha*beta + 1)*A - (alpha - 1)) / ((alpha + 1) - (alpha*beta - 1)*A)
//   p, f, m     - __local scratch arrays handed to fTheory
__kernel void kernelChiSquareAsymmetry(__global double *data, __global double *err,
                                       __global double *par, __global double *chisq,
                                       __global int *map, __global double *funcv,
                                       int length, int numpar, int numfunc, int nummap,
                                       double timeStart, double timeStep,
                                       double alpha, double beta,
                                       __local double *p, __local double *f, __local int *m)
{
  //get thread id and calc global id
  int tid = get_local_id(0);
  int j = get_global_id(0);
  int lsize = get_local_size(0);

  //load parameters from global to shared memory
  while (tid < numpar) {
    p[tid] = par[tid];
    tid += lsize;
  }

  //load functions from global to shared memory
  tid = get_local_id(0);
  while (tid < numfunc) {
    f[tid] = funcv[tid];
    tid += lsize;
  }

  //load maps from global memory; this has to be a strided loop like the two
  //above, otherwise entries beyond the work-group size are never copied
  tid = get_local_id(0);
  while (tid < nummap) {
    m[tid] = map[tid];
    tid += lsize;
  }

  //sync threads
  barrier(CLK_LOCAL_MEM_FENCE);

  //loop-invariant product (named so that it does not shadow the function ab())
  const double alphaBeta = alpha*beta;

  while (j < length) {

    double t = timeStart + j*timeStep;
    double ldata = data[j];
    double lerr = err[j];

    double theoVal = fTheory(t, p, f, m);
    double theo = ((alphaBeta+1.0)*theoVal - (alpha-1.0))/((alpha+1.0)-(alphaBeta-1.0)*theoVal);

#ifdef MLH
    chisq[j] = 0.0; // max log likelihood not defined for asymmetry fit
#else
    //bins without an error estimate contribute theo^2
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif

    j += get_global_size(0);
  }
}
|
||||
|
||||
// Sums data_in per work-group: work-group g writes the sum of its elements to data_out[g].
//   data_in    - input values
//   data_out   - one partial sum per work-group
//   data_local - __local scratch, one double per work-item
//   size       - number of valid input elements; work-items past it contribute 0
// The stride is halved with integer division, so every element is only added
// when the work-group size is a power of two.
__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
                                   __local double *data_local, int size)
{
  const int lid = get_local_id(0);
  const int gid = get_global_id(0);
  const int wg_size = get_local_size(0);

  // stage one value per work-item in local memory, padding with zeros
  data_local[lid] = (gid < size) ? data_in[gid] : 0;

  // tree reduction: each pass folds the upper half of the active range onto the lower half
  uint stride = wg_size / 2;
  while (stride > 0) {
    // all writes of the previous pass (or of the staging step) must be visible
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < stride)
      data_local[lid] += data_local[lid + stride];

    stride /= 2;
  }

  // work-item 0 holds the group's partial sum
  if (lid == 0)
    data_out[get_group_id(0)] = data_local[0];
}
|
||||
|
||||
// Two-phase sum: each work-item first accumulates a strided slice of data_in,
// then the work-group reduces those accumulators; group g writes its sum to data_out[g].
//   data_in    - input values
//   data_out   - one partial sum per work-group
//   data_local - __local scratch, one double per work-item
//   size       - number of input elements
// The stride is halved with integer division, so every accumulator is only
// added when the work-group size is a power of two.
__kernel void parallelReductionTwoPhase(__global double *data_in, __global double *data_out,
                                        __local double *data_local, int size)
{
  const int lid = get_local_id(0);
  const int gsize = get_global_size(0);
  const int wg_size = get_local_size(0);

  // phase 1: private accumulation over elements gid, gid + gsize, gid + 2*gsize, ...
  double acc = 0;
  for (int i = get_global_id(0); i < size; i += gsize)
    acc += data_in[i];

  // phase 2: tree reduction of the accumulators inside the work-group
  data_local[lid] = acc;
  barrier(CLK_LOCAL_MEM_FENCE);

  uint stride = wg_size / 2;
  while (stride > 0) {
    // all writes of the previous pass must be visible
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < stride)
      data_local[lid] += data_local[lid + stride];

    stride /= 2;
  }

  // work-item 0 holds the group's partial sum
  if (lid == 0)
    data_out[get_group_id(0)] = data_local[0];
}
|
362
src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
Normal file
362
src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
Normal file
@ -0,0 +1,362 @@
|
||||
// Double precision is used throughout this file.
// (A second "#pragma OPENCL EXTENSION" without an extension name or behaviour
// stood here; an incomplete directive enables nothing, so it was dropped.)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
|
||||
/******Random numbers********/
|
||||
|
||||
/* Per-work-item random number state.
 * Field order and types are unchanged, so the memory layout is the same.
 * NOTE(review): the size of bool inside a buffer shared with the host is
 * implementation-defined in OpenCL - confirm the host-side struct matches.
 */
typedef struct {

  double s10, s11, s12;  /* three state words of generator component 1 (see rand_uniform) */
  double s20, s21, s22;  /* three state words of generator component 2 (see rand_uniform) */
  double z;              /* second normal deviate cached by rand_normal */
  bool gen;              /* false: rand_normal returns the cached z; true: it generates a new pair */

} RNDState;
|
||||
|
||||
#define NORM 2.328306549295728e-10
|
||||
#define M1 4294967087.0
|
||||
#define M2 4294944443.0
|
||||
#define A12 1403580.0
|
||||
#define A13N 810728.0
|
||||
#define A21 527612.0
|
||||
#define A23N 1370589.0
|
||||
|
||||
/* MRG32k3a uniform random number generator.
 * Advances both three-word recurrences stored in *s and returns their combined
 * value scaled by NORM. The result is never 0, because M1 is added whenever
 * the first component does not exceed the second.
 */
double rand_uniform(RNDState *s) {
  long q;
  double c1, c2;

  /* Component 1: c1 = (A12 * s11 - A13N * s10) mod M1 */
  c1 = A12 * s->s11 - A13N * s->s10;
  q = c1 / M1;
  c1 -= q * M1;
  if (c1 < 0.0)
    c1 += M1;
  s->s10 = s->s11;
  s->s11 = s->s12;
  s->s12 = c1;

  /* Component 2: c2 = (A21 * s22 - A23N * s20) mod M2 */
  c2 = A21 * s->s22 - A23N * s->s20;
  q = c2 / M2;
  c2 -= q * M2;
  if (c2 < 0.0)
    c2 += M2;
  s->s20 = s->s21;
  s->s21 = s->s22;
  s->s22 = c2;

  /* Combination */
  return (c1 <= c2) ? ((c1 - c2 + M1) * NORM) : ((c1 - c2) * NORM);
}
|
||||
|
||||
/* Returns a normally distributed value with mean mu and standard deviation sigma.
 * Two uniform numbers are turned into two normal deviates (cos and sin branch);
 * the first is returned immediately, the second is cached in s->z and returned
 * by the next call, as tracked by s->gen.
 */
double rand_normal(RNDState *s, double mu, double sigma) {

  const double two_pi = 2.0 * 3.141592653589793223846;

  /* a cached deviate is pending: hand it out and request a fresh pair next time */
  if (!s->gen) {
    s->gen = true;
    return s->z * sigma + mu;
  }

  const double u1 = rand_uniform(s);
  const double u2 = rand_uniform(s);

  const double radius = sqrt(-2.0 * log(u1));
  const double angle = two_pi * u2;

  const double z0 = radius * cos(angle);
  s->z = radius * sin(angle);
  s->gen = false;

  return z0 * sigma + mu;
}
|
||||
|
||||
/* Initialise the random states s[0 .. N-1], one per work-item.
 * Every state word is a fixed constant plus the work-item's global id.
 * NOTE(review): the seed argument is not used - confirm whether that is intended.
 */
__kernel void initRand(__global RNDState *s, unsigned int seed, int N) {

  const int id = get_global_id(0);
  if (id >= N)
    return;

  const int offset = id;// * 0x100000000ULL;

  RNDState fresh;
  fresh.s10 = fresh.s11 = fresh.s20 = fresh.s21 = 12345 + offset;
  fresh.s12 = fresh.s22 = 123 + offset;

  /* no cached normal deviate yet, so rand_normal starts by generating a pair */
  fresh.z = 0;
  fresh.gen = true;

  s[id] = fresh;
}
|
||||
|
||||
|
||||
/**********Degrader**********/
|
||||
/* Indices into the parameter table handed to the degrader kernel.
 * The comments describe how each entry is used in this file; physical meanings
 * beyond that are presumed from the names - verify against the host code.
 */
enum PARAMS {
  POSITION = 0, /* lower z bound in checkHit */
  ZSIZE,        /* extent in z added to POSITION in checkHit */
  M_P,          /* particle mass term: gamma = (E + M_P) / M_P */
  C,            /* multiplies beta * dt to give the path length */
  RHO_M,        /* multiplies the path length in energyLoss */
  PI,           /* used as pi in Rot and in the constant K */
  AVO,          /* enters K and the low-energy dE/dx */
  R_E,          /* enters K squared */
  eM_E,         /* enters K, Tmax and the straggling width */
  Z_M,          /* numerator of Z_M / A_M */
  A_M,          /* denominator of Z_M / A_M */
  A2_C,         /* low-energy fit coefficient */
  A3_C,         /* low-energy fit coefficient */
  A4_C,         /* low-energy fit coefficient */
  A5_C,         /* low-energy fit coefficient */
  Z_P,          /* enters dE/dx squared and theta0 linearly */
  X0_M,         /* divides the path length in theta0 */
  I_M,          /* divides twice inside the logarithm of dE/dx */
  DT_M          /* time step passed to energyLoss and coulombScat */
};
|
||||
|
||||
|
||||
/* One particle record as stored in the global particle buffer.
 * Field order and types are unchanged, so the memory layout is the same.
 */
typedef struct {
  int label;             /* set by kernelCollimatorPhysics: 0, -1 (dead) or -2 (not in material) */
  unsigned int localID;  /* not touched by the kernels in this file */
  double3 Rincol;        /* position, read and written back by kernelCollimatorPhysics */
  double3 Pincol;        /* momentum, read and written back by kernelCollimatorPhysics */
} PART;
|
||||
|
||||
/* Scalar product of two 3-component vectors, accumulated in x, y, z order. */
double Dot(double3 d1, double3 d2) {
  double acc = d1.x * d2.x;
  acc = acc + d1.y * d2.y;
  acc = acc + d1.z * d2.z;
  return acc;
}
|
||||
|
||||
/* Returns true when z lies in the half-open interval (position, position + zsize],
 * i.e. when the particle is inside the degrader material.
 */
bool checkHit(double z, double position, double zsize) {
  if (!(z > position))
    return false;
  return z <= position + zsize;
}
|
||||
|
||||
/* Apply one time step of energy loss, including a Gaussian straggling term, to a particle.
 *   Eng    - in/out: particle energy
 *   pdead  - out: true when the energy ends below 1e-4 or the computed dE/dx is positive
 *   deltat - time step
 *   s      - random number state used for the straggling term
 *   par    - parameter table indexed by enum PARAMS
 * Energies up to 1e-5 are left untouched; (1e-5, 6e-4) uses the fit with the
 * A2_C..A5_C coefficients; 6e-4 and above uses the logarithmic formula.
 */
void energyLoss(double *Eng, bool *pdead, double deltat, RNDState *s, __local double *par) {

  double E = *Eng;
  double dEdx = 0.0;

  /* kinematics are computed once from the incoming energy and reused by both branches */
  const double gamma = (E + par[M_P]) / par[M_P];
  const double gamma2 = gamma * gamma;
  const double beta = sqrt(1.0 - 1.0 / gamma2);
  const double beta2 = beta * beta;

  /* path length of this step and the common prefactors */
  const double deltas = deltat * beta * par[C];
  const double deltasrho = deltas * 100 * par[RHO_M];
  const double K = 4.0 * par[PI] * par[AVO] * par[R_E] * par[R_E] * par[eM_E] * 1E7;
  const double sigma_E = sqrt(K * par[eM_E] * par[RHO_M] * (par[Z_M]/par[A_M])* deltas * 1E5);

  if ((E > 0.00001) && (E < 0.0006)) {
    const double Ts = (E*1E6)/1.0073;
    const double eps_lo = par[A2_C]*pow(Ts,0.45);
    const double eps_hi = (par[A3_C]/Ts)*log(1+(par[A4_C]/Ts)+(par[A5_C]*Ts));
    const double eps = (eps_lo*eps_hi)/(eps_lo + eps_hi);
    dEdx = - eps /(1E21*(par[A_M]/par[AVO]));

    const double mean_dE = deltasrho * dEdx;
    const double dE = mean_dE + rand_normal(s, 0, sigma_E);
    E = E + dE / 1E3;
  }

  /* tested on the energy as possibly updated by the branch above */
  if (E >= 0.0006) {
    const double Tmax = 2.0 * par[eM_E] * 1e9 * beta2 * gamma2 /
      (1.0 + 2.0 * gamma * par[eM_E] / par[M_P] +
       (par[eM_E] / par[M_P]) * (par[eM_E] / par[M_P]));
    dEdx = -K * par[Z_P] * par[Z_P] * par[Z_M] / (par[A_M] * beta2) *
      (1.0 / 2.0 * log(2 * par[eM_E] * 1e9 * beta2 * gamma2 *
                       Tmax / par[I_M] / par[I_M]) - beta2);

    const double mean_dE = deltasrho * dEdx;
    const double dE = mean_dE + rand_normal(s, 0, sigma_E);
    E = E+dE / 1E3;
  }

  *Eng = E;
  *pdead = ((E<1E-4) || (dEdx>0));
}
|
||||
|
||||
/* Rotate a particle's momentum in the x-z plane and optionally shift its position.
 *   P        - in/out momentum; only the x and z components are read and written
 *   R        - in/out position; only the x and z components are read and written
 *   xplane   - lateral displacement applied perpendicular to the momentum direction
 *   normP    - norm of the momentum, used to normalise the drift terms
 *   thetacou - rotation angle
 *   deltas   - path length of the step
 *   coord    - 1: update x with drift and z without; 2: update x and z with drift;
 *              any other value leaves the position unchanged
 *   par      - parameter table (only par[PI] is used)
 */
void Rot(double3 *P, double3 *R, double xplane,
         double normP, double thetacou, double deltas, int coord,
         __local double *par)
{
  const double px = P->x;
  const double pz = P->z;
  const double x = R->x;
  const double z = R->z;

  /* angle of the momentum in the x-z plane, moved into the right quadrant */
  double Psixz = atan(px/pz);
  if (px>=0 && pz>=0) {
    /* first quadrant: atan already gives the angle */
  } else if (px>0 && pz<0) {
    Psixz = Psixz + par[PI];
  } else if (px<0 && pz>0) {
    Psixz = Psixz + 2*par[PI];
  } else {
    Psixz = Psixz + par[PI];
  }

  const double pxz = sqrt(px*px + pz*pz);
  const double cosPsi = cos(Psixz);
  const double sinPsi = sin(Psixz);

  switch (coord) {
  case 1:
    R->x = x + deltas * px/normP + xplane*cosPsi;
    R->z = z - xplane * sinPsi;
    break;
  case 2:
    R->x = x + deltas * px/normP + xplane*cosPsi;
    R->z = z - xplane * sinPsi + deltas * pz / normP;
    break;
  default:
    break;
  }

  const double cosTheta = cos(thetacou);
  const double sinTheta = sin(thetacou);
  P->x = pxz*cosPsi*sinTheta + pxz*sinPsi*cosTheta;
  P->z = -pxz*sinPsi*sinTheta + pxz*cosPsi*cosTheta;
}
|
||||
|
||||
|
||||
/* Apply multiple Coulomb scattering for one time step.
 *   R      - in/out position
 *   P      - in/out momentum
 *   deltat - time step
 *   s      - random number state
 *   par    - parameter table indexed by enum PARAMS
 * The scattering is applied first in the x-z plane and then in the y-z plane.
 * In each plane a Gaussian angle (redrawn until it lies within 3.5 * theta0) is
 * applied, and with probability 0.0047 an additional large-angle kick.
 */
void coulombScat(double3 *R, double3 *P, double deltat,
                 RNDState *s, __local double* par) {

  double dotP = Dot((*P), (*P));

  double Eng = sqrt(dotP + 1.0) * par[M_P] - par[M_P];
  double gamma = (Eng + par[M_P]) / par[M_P];
  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
  double normP = sqrt(dotP);
  double deltas = deltat * beta * par[C];
  double theta0 = 13.6e6 / (beta * sqrt(dotP) * par[M_P] * 1e9) *
    par[Z_P] * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));

  // x-direction: See Physical Review, "Multiple Scattering"
  double z1 = rand_normal(s, 0, 1);
  double z2 = rand_normal(s, 0, 1);
  double thetacou = z2 * theta0;

  while(fabs(thetacou) > 3.5 * theta0) {
    z1 = rand_normal(s, 0, 1);
    z2 = rand_normal(s, 0, 1);
    thetacou = z2 * theta0;
  }

  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  int coord = 1;
  Rot(P, R, xplane, normP, thetacou, deltas, coord, par);

  double P2 = rand_uniform(s);
  if(P2 < 0.0047) {
    double P3 = rand_uniform(s);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = rand_uniform(s);
    if(P4 > 0.5)
      thetaru = -thetaru;
    coord = 0; // no change in coordinates but one in momenta-direction
    Rot(P, R, xplane, normP, thetaru, deltas, coord, par);
  }

  // y-direction: See Physical Review, "Multiple Scattering"
  z1 = rand_normal(s, 0, 1);
  z2 = rand_normal(s, 0, 1);
  thetacou = z2 * theta0;

  while(fabs(thetacou) > 3.5 * theta0) {
    z1 = rand_normal(s, 0, 1);
    z2 = rand_normal(s, 0, 1);
    thetacou = z2 * theta0;
  }

  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
  coord = 2;

  // Rot() always works on the .x and .z components of its arguments. Calling it
  // on P and R directly would scatter in the x-z plane a second time and never
  // touch y, so the y and z components are passed through temporaries whose .x
  // slot carries y, and copied back afterwards.
  double3 Pyz = (double3)((*P).y, 0.0, (*P).z);
  double3 Ryz = (double3)((*R).y, 0.0, (*R).z);

  Rot(&Pyz, &Ryz, yplane, normP, thetacou, deltas, coord, par);

  P2 = rand_uniform(s);
  if(P2 < 0.0047) {
    double P3 = rand_uniform(s);
    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
    double P4 = rand_uniform(s);
    if(P4 > 0.5)
      thetaru = -thetaru;
    coord = 0; // no change in coordinates but one in momenta-direction
    Rot(&Pyz, &Ryz, yplane, normP, thetaru, deltas, coord, par);
  }

  (*P).y = Pyz.x;
  (*P).z = Pyz.z;
  (*R).y = Ryz.x;
  (*R).z = Ryz.z;
}
|
||||
|
||||
/* number of entries in the parameter table (one per enum PARAMS value) */
#define NUMPARAMS 19

/* Advance every particle by one time step through or past the degrader.
 *   data         - particle records; Rincol, Pincol and label are updated in place
 *   par          - global parameter table with NUMPARAMS entries
 *   state        - one random number state per particle, updated in place
 *   numparticles - number of valid entries in data and state
 *   p            - __local scratch for NUMPARAMS doubles
 * Resulting label: 0 for a particle that was scattered inside the material,
 * -1 for a particle that died inside it, -2 for a particle outside the material
 * (which just drifts for one time step).
 */
__kernel void kernelCollimatorPhysics(__global PART *data, __global double *par,
                                      __global RNDState *state, int numparticles,
                                      __local double *p)
{
  int tid = get_local_id(0);
  int idx = get_global_id(0);
  int lsize = get_local_size(0);

  //transfer params to local memory; strided so that the whole table is copied
  //even when the work-group has fewer than NUMPARAMS work-items
  for (int i = tid; i < NUMPARAMS; i += lsize)
    p[i] = par[i];

  barrier(CLK_LOCAL_MEM_FENCE);

  //work-items without a particle are done; they only had to take part in the
  //copy and the barrier above
  if (idx >= numparticles)
    return;

  double3 R = data[idx].Rincol;
  double3 P = data[idx].Pincol;
  RNDState s = state[idx];
  int l = 0;

  double sq = sqrt(1.0 + Dot(P, P));
  bool pdead = false;
  bool hit = checkHit(R.z, p[POSITION], p[ZSIZE]);
  double Eng;

  if (hit) {
    Eng = (sq - 1) * p[M_P];
    energyLoss(&Eng, &pdead, p[DT_M], &s, p);
  } else {
    //outside the material: free drift for one time step
    R.x = R.x + p[DT_M] * p[C] * P.x / sq;
    R.y = R.y + p[DT_M] * p[C] * P.y / sq;
    R.z = R.z + p[DT_M] * p[C] * P.z / sq;
    l = -2;
  }

  if (hit && !pdead) {
    //rescale the momentum to the energy left after the loss, then scatter
    double ptot = sqrt((p[M_P] + Eng) * (p[M_P] + Eng) - (p[M_P] * p[M_P])) / p[M_P];
    sq = sqrt(Dot(P, P));
    P.x = P.x * ptot / sq;
    P.y = P.y * ptot / sq;
    P.z = P.z * ptot / sq;
    coulombScat(&R, &P, p[DT_M], &s, p);
  }

  if (hit && pdead)
    l = -1;

  data[idx].Rincol = R;
  data[idx].Pincol = P;
  data[idx].label = l;
  state[idx] = s;
}
|
||||
|
||||
|
||||
/* count dead particles and particles leaving material - boost compute? */
|
||||
|
||||
/* sort particles so dead and leaving particles are at the end of PART array - boost compute */
|
||||
|
||||
|
181
src/OpenCL/OpenCLKernels/OpenCLFFT.cl
Normal file
181
src/OpenCL/OpenCLKernels/OpenCLFFT.cl
Normal file
@ -0,0 +1,181 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
/* 3D normalize FFT kernel: divides every element of input by N.
 *   input - complex data, one element per work-item of the 1D/2D/3D range
 *   N     - divisor applied to the real and imaginary part
 * The element index is the row-major index of the work-item in the global
 * range, with dimension 0 running fastest.
 */
__kernel void normalizeFFT(__global double2 *input, int N) {
  int i1 = get_global_id(0);
  int i2 = get_global_id(1);
  int i3 = get_global_id(2);
  int n1 = get_global_size(0);
  int n2 = get_global_size(1);

  /* The strides are n1 and n1*n2. Using n2 and n2*n2 instead is only correct
   * when n1 == n2. For unused dimensions the id is 0, so no special case is
   * needed for 1D or 2D ranges.
   */
  int id = i1 + n1 * (i2 + n2 * i3);

  input[id].x = input[id].x / N;
  input[id].y = input[id].y / N;
}
|
||||
|
||||
/* 3D radix 2 FFT kernel: one butterfly per work-item along dimension dim.
 *   input   - complex data, transformed in place
 *   step    - distance (in elements along dim) between the two butterfly inputs
 *   dim     - 0, 1 or 2: dimension along which this pass works
 *   forward - 1 selects the negative twiddle angle, anything else the positive one
 * NOTE(review): the index arithmetic mixes n1, n2 and n3 as strides and looks
 * like it expects a cubic data set - confirm with the host code.
 */
__kernel void FFT3D(__global double2 *input, int step, int dim, int forward) {

  const int n1 = get_global_size(0);
  const int n2 = get_global_size(1);
  const int n3 = get_global_size(2);
  const int i1 = get_global_id(0);
  const int i2 = get_global_id(1);
  const int i3 = get_global_id(2);

  const int jump = step << 1;

  int idTwidle, id, match;
  switch (dim) {
  case 0: {
    const int d = n1 / step; // n1 >> log2(step)
    const int idLoc = i1 / d;
    const int idGroup = i1 & (d-1); //modulo

    idTwidle = idGroup * jump + idLoc;
    id = i3*n3*n3 + i2*n2 + idTwidle;
    match = id + step;
    break;
  }
  case 1: {
    const int d = n2 / step;
    const int idLoc = i2 / d;
    const int idGroup = i2 & (d-1);

    idTwidle = idGroup * jump + idLoc;
    id = i3*n3*n3 + idTwidle*n1 + i1;
    match = id + step*n1;
    break;
  }
  case 2: {
    const int d = n3 / step;
    const int idLoc = i3 / d;
    const int idGroup = i3 & (d-1);

    idTwidle = idGroup * jump + idLoc;
    id = idTwidle*n1*n1 + i2*n2 + i1;
    match = id + step*n1*n1;
    break;
  }
  }

  /* twiddle factor w = cos(alpha) + i*sin(alpha) */
  const double angle = ( 2 * M_PI / jump ) * idTwidle;
  const double alpha = (forward == 1) ? -angle : angle;

  double wr;
  const double wi = sincos(alpha, &wr);

  const double2 upper = input[id];
  const double2 lower = input[match];

  /* complex product w * lower */
  double2 rotated;
  rotated.x = wr*lower.x - wi*lower.y;
  rotated.y = wr*lower.y + wi*lower.x;

  input[match] = upper - rotated;
  input[id] = upper + rotated;
}
|
||||
|
||||
/* 3D bit reversal sort: swaps each element with the one whose index along dim
 * is the bit-reversed index.
 *   input - complex data, permuted in place
 *   bits  - number of index bits to reverse
 *   dim   - 0, 1 or 2: dimension along which indices are reversed
 * The global size of dimension 0 is used as the edge length for all three
 * dimensions. Only the work-item with the smaller index of a pair does the swap.
 */
__kernel void BitReverseSort3D(__global double2 *input, int bits, int dim) {

  const int n = get_global_size(0);
  const int i1 = get_global_id(0);
  const int i2 = get_global_id(1);
  const int i3 = get_global_id(2);

  /* index along the dimension being sorted */
  int start;
  switch (dim) {
  case 0: start = i1; break;
  case 1: start = i2; break;
  case 2: start = i3; break;
  default: return; /* nothing is swapped for any other dim */
  }

  /* build the reversed index bit by bit, then mask it to the valid range */
  int rev = start;
  int rest = start;
  for (int j = 1; j < bits; j++) {
    rest >>= 1;
    rev <<= 1;
    rev |= rest & 1;
  }
  rev &= n - 1;

  if (start < rev) {
    const int id1 = i3*n*n + i2*n + i1;

    int id2;
    if (dim == 0)        //i1, irev - w, i2 - h, i3 - d
      id2 = i3*n*n + i2*n + rev;
    else if (dim == 1)   // i1 - w, i2, irev - h, i3 - d
      id2 = i3*n*n + rev*n + i1;
    else                 // i1 - w, i2 - h, i3, irev - d
      id2 = rev*n*n + i2*n + i1;

    const double2 tmp = input[id1];
    input[id1] = input[id2];
    input[id2] = tmp;
  }
}
|
||||
|
||||
|
||||
/* 3D FFT kernel based on Stockham's out-of-place algorithm: one radix-2
 * butterfly per work-item, read from src and written to dst.
 *   src, dst - complex input and output arrays
 *   p        - butterfly span of this pass (used as mask p-1 and as output distance)
 *   t        - half the edge length; the two inputs are t elements apart along ndim
 *   ndim     - 1, 2 or 3: dimension along which this pass works
 *   forward  - factor multiplied into the twiddle angle
 * All strides are built from 2*t, i.e. the same edge length in every dimension.
 */
__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim, const int forward) {

  const int gid1 = get_global_id(0);
  const int gid2 = get_global_id(1);
  const int gid3 = get_global_id(2);

  const int t2 = 2*t;

  int k, in2, out1, out2;
  const int in1 = gid3*t2*t2 + gid2*t2 + gid1;
  switch (ndim) {
  case 1: {
    k = gid1 & (p - 1);
    const int m = (gid1 << 1) - k;
    in2 = in1 + t;
    out1 = gid3*t2*t2 + gid2*t2 + m;
    out2 = out1 + p;
    break;
  }
  case 2: {
    k = gid2 & (p - 1);
    const int m = (gid2 << 1) - k;
    in2 = in1 + t2*t;
    out1 = gid3*t2*t2 + m*t2 + gid1;
    out2 = out1 + t2*p;
    break;
  }
  case 3: {
    k = gid3 & (p - 1);
    const int m = (gid3 << 1) - k;
    in2 = in1 + t2*t2*t;
    out1 = m*t2*t2 + gid2*t2 + gid1;
    out2 = out1 + p*t2*t2;
    break;
  }
  }

  const double2 a = src[in1];
  const double2 b = src[in2];

  const double theta = (forward*2*M_PI*k) / (p << 1);

  /* twiddle = cos(theta) + i*sin(theta), multiplied onto the second input */
  double cs;
  const double sn = sincos(theta, &cs);
  const double2 rotated = (double2) (b.x * cs - b.y * sn, b.y * cs + b.x * sn);

  dst[out1] = a + rotated;
  dst[out2] = a - rotated;
}
|
214
src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl
Normal file
214
src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl
Normal file
@ -0,0 +1,214 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
#define TWOPI 6.28318530718
|
||||
|
||||
/* One pass of a 1D radix-2 out-of-place FFT: one butterfly per work-item.
 *   src, dst - complex input and output arrays
 *   p        - butterfly span of this pass (used as mask p-1 and as output distance)
 *   t        - distance between the two inputs of a butterfly
 */
__kernel void fft_radix2(__global double2* src, __global double2* dst, const int p, const int t) {

  const int gid = get_global_id(0);
  const int k = gid & (p - 1);
  const int out = (gid << 1) - k;

  const double2 a = src[gid];
  const double2 b = src[gid+t];

  /* twiddle = cos(theta) + i*sin(theta), multiplied onto the second input */
  const double theta = (-2*M_PI*k) / (p << 1);
  double cs;
  const double sn = sincos(theta, &cs);
  const double2 rotated = (double2) (b.x * cs - b.y * sn, b.y * cs + b.x * sn);

  dst[out] = a + rotated;
  dst[out+p] = a - rotated;
}
|
||||
|
||||
/* One radix-2 out-of-place FFT pass along dimension 0 of a 3D array.
 *   src, dst - complex input and output arrays
 *   p        - butterfly span of this pass (used as mask p-1 and as output distance)
 *   t        - half the edge length; the two inputs are t elements apart
 *   ndim     - not used by this kernel
 * All strides are built from 2*t, i.e. the same edge length in every dimension.
 */
__kernel void fft3d_radix2_transpose(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {

  const int gid1 = get_global_id(0);
  const int gid2 = get_global_id(1);
  const int gid3 = get_global_id(2);

  const int t2 = 2*t;
  const int k = gid1 & (p - 1);
  const int m = (gid1 << 1) - k;

  /* start of the line this work-item belongs to */
  const int line = gid3*t2*t2 + gid2*t2;

  const int in1 = line + gid1;
  const int in2 = in1 + t;
  const int out1 = line + m;
  const int out2 = out1 + p;

  const double2 a = src[in1];
  const double2 b = src[in2];

  /* twiddle = cos(theta) + i*sin(theta), multiplied onto the second input */
  const double theta = (-2*M_PI*k) / (p << 1);
  double cs;
  const double sn = sincos(theta, &cs);
  const double2 rotated = (double2) (b.x * cs - b.y * sn, b.y * cs + b.x * sn);

  dst[out1] = a + rotated;
  dst[out2] = a - rotated;
}
|
||||
|
||||
/* One radix-2 out-of-place FFT pass along dimension ndim of a 3D array.
 *   src, dst - complex input and output arrays
 *   p        - butterfly span of this pass (used as mask p-1 and as output distance)
 *   t        - half the edge length; the two inputs are t elements apart along ndim
 *   ndim     - 1, 2 or 3: dimension along which this pass works
 * All strides are built from 2*t, i.e. the same edge length in every dimension.
 */
__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {

  const int gid1 = get_global_id(0);
  const int gid2 = get_global_id(1);
  const int gid3 = get_global_id(2);

  const int t2 = 2*t;

  int k, in2, out1, out2;
  const int in1 = gid3*t2*t2 + gid2*t2 + gid1;
  switch (ndim) {
  case 1: {
    k = gid1 & (p - 1);
    const int m = (gid1 << 1) - k;
    in2 = in1 + t;
    out1 = gid3*t2*t2 + gid2*t2 + m;
    out2 = out1 + p;
    break;
  }
  case 2: {
    k = gid2 & (p - 1);
    const int m = (gid2 << 1) - k;
    in2 = in1 + t2*t;
    out1 = gid3*t2*t2 + m*t2 + gid1;
    out2 = out1 + t2*p;
    break;
  }
  case 3: {
    k = gid3 & (p - 1);
    const int m = (gid3 << 1) - k;
    in2 = in1 + t2*t2*t;
    out1 = m*t2*t2 + gid2*t2 + gid1;
    out2 = out1 + p*t2*t2;
    break;
  }
  }

  const double2 a = src[in1];
  const double2 b = src[in2];

  /* twiddle = cos(theta) + i*sin(theta), multiplied onto the second input */
  const double theta = (-2*M_PI*k) / (p << 1);
  double cs;
  const double sn = sincos(theta, &cs);
  const double2 rotated = (double2) (b.x * cs - b.y * sn, b.y * cs + b.x * sn);

  dst[out1] = a + rotated;
  dst[out2] = a - rotated;
}
|
||||
|
||||
|
||||
/* In-place transpose of a 3D array: swaps element (i, j, k) with (i, k, j)
 * when dim == 2 and with (k, j, i) otherwise.
 *   data - complex array, permuted in place
 *   ndim - not used by this kernel
 *   dim  - 2 swaps the two fastest indices; any other value swaps the outer ones
 * Only the work-item with the smaller linear index of a pair performs the swap.
 * NOTE(review): the strides ni*ni and nj look like they expect equal sizes in
 * all dimensions - confirm with the host code.
 */
__kernel void transpose(__global double2 *data, int ndim, int dim) {

  const int k = get_global_id(0);
  const int j = get_global_id(1);
  const int i = get_global_id(2);
  const int nk = get_global_size(0);
  const int nj = get_global_size(1);
  const int ni = get_global_size(2);

  const int self = i*ni*ni + j*nj + k;
  const int partner = (dim == 2) ? (i*ni*ni + k*nj + j)
                                 : (k*ni*ni + j*nj + i);

  if (self < partner) {
    const double2 held = data[partner];
    data[partner] = data[self];
    data[self] = held;
  }
}
|
||||
|
||||
#define PI2 6.28318530718
|
||||
|
||||
/* Complete radix-2 FFT of one line of a 3D array, done in local memory.
 * Every work-item handles two elements of the line (i1 and i1 + N/2) in each pass.
 *   data_in - complex array, transformed in place
 *   d, r    - __local ping-pong buffers with N elements each
 *   tmp     - pointer slot used while swapping d and r
 *   N       - line length
 *   dim     - 1, 2 or 3: dimension along which the lines run
 */
__kernel void fft_batch3D(__global double2 *data_in, __local double2 *d, __local double2 *r, __local double2 *tmp, int N, int dim) {

  const int id1 = get_global_id(0);
  const int id2 = get_global_id(1);
  const int id3 = get_global_id(2);

  //start of the line and distance between consecutive line elements
  int sid, offset;
  switch (dim) {
  case 1:
    sid = id3*N*N + id2*N;
    offset = 1;
    break;
  case 2:
    sid = id3*N*N + id2;
    offset = N;
    break;
  case 3:
    sid = id3*N + id2;
    offset = N*N;
    break;
  }

  //copy data from global memory to local
  const int i1 = id1;
  const int i2 = id1+N/2;
  d[i1] = data_in[sid + i1*offset];
  d[i2] = data_in[sid + i2*offset];

  barrier(CLK_LOCAL_MEM_FENCE);

  //one pass per doubling of step; stage - 1 equals log2(step)
  int stage = 1;
  for (int step = 1; step < N; step <<= 1, ++stage) {
    const int jump = step << 1;

    const int j = i1 >> (stage - 1);   // same as i1 / step
    const int k = i2 & (step - 1);     // same as i2 % step
    const int out1 = j * jump + k;

    //twiddle = cos(theta) + i*sin(theta), multiplied onto the second input
    const double theta = -PI2 * k / jump;
    double cs;
    const double sn = sincos(theta, &cs);
    const double2 rotated = (double2) (d[i2].x*cs - d[i2].y*sn, d[i2].y*cs + d[i2].x * sn);

    r[out1] = d[i1] + rotated;
    r[out1+step] = d[i1] - rotated;

    //swap local arrays so that d holds the result of this pass
    tmp = r;
    r = d;
    d = tmp;

    //wait for all threads to finish this iteration
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  //after the last swap the final result is in d
  data_in[sid + i1*offset] = d[i1];
  data_in[sid + i2*offset] = d[i2];
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
41
src/OpenCL/OpenCLKernels/OpenCLTranspose.cl
Normal file
41
src/OpenCL/OpenCLKernels/OpenCLTranspose.cl
Normal file
@ -0,0 +1,41 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
/* Transpose a width x height matrix through a tile in local memory.
 *   input  - source matrix, row length width
 *   output - destination matrix, row length height
 *   width, height - dimensions of the source matrix
 *   block  - __local tile; rows are addressed with a pitch of get_local_size(0) + 1
 * The tile indices use get_local_size(0) for both directions, so the kernel
 * relies on a square work-group.
 */
__kernel void transpose(__global double2 *input, __global double2 *output,
                        int width, int height, __local double2 *block)
{
  const int tile = get_local_size(0);
  const int pitch = tile+1;

  //stage this work-item's element in the tile
  unsigned int col = get_global_id(0);
  unsigned int row = get_global_id(1);

  if ( (col < width) && (row < height) ) {
    const unsigned int src = row * width + col;
    block[get_local_id(1)*pitch+get_local_id(0)] = input[src];
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  //write the tile back with the group coordinates and the local coordinates swapped
  col = get_group_id(1) * tile + get_local_id(0);
  row = get_group_id(0) * tile + get_local_id(1);

  if ( (col < height) && (row < width) ) {
    const unsigned int dst = row * height + col;
    output[dst] = block[get_local_id(0)*pitch+get_local_id(1)];
  }
}
|
||||
|
||||
/*
 * Straightforward transpose without local memory: the work-item at global id
 * (x, y) copies input[x + width * y] to output[y + height * x].
 * Work-items outside the width x height range do nothing.
 */
__kernel void transpose_naive(__global double2 *input, __global double2 *output, int width, int height)
{
    unsigned int col = get_global_id(0);
    unsigned int row = get_global_id(1);

    /* guard against a global range that is larger than the matrix */
    if (col >= width || row >= height)
        return;

    unsigned int src = col + width * row;
    unsigned int dst = row + height * col;
    output[dst] = input[src];
}
|
18
src/Utility/CMakeLists.txt
Normal file
18
src/Utility/CMakeLists.txt
Normal file
@ -0,0 +1,18 @@
|
||||
# Utility component: timing helpers.
# The sources and headers are handed to the ADD_SOURCES / ADD_HEADERS helpers
# (defined elsewhere in the project); the headers are also installed.

set (_SRCS
  TimeStamp.cpp
  DKSTimer.cpp
)

set (_HDRS
  TimeStamp.h
  DKSTimer.h
)

# Kept disabled, as in the original script:
#include_directories (
#  ${CMAKE_CURRENT_SOURCE_DIR}
#)

add_sources (${_SRCS})
add_headers (${_HDRS})

install (FILES ${_HDRS} DESTINATION include/Utility)
|
53
src/Utility/DKSTimer.cpp
Normal file
53
src/Utility/DKSTimer.cpp
Normal file
@ -0,0 +1,53 @@
|
||||
#include "DKSTimer.h"
|
||||
|
||||
//set initial values - running to false, timervalue to zero and name to empty string
|
||||
DKSTimer::DKSTimer() {
|
||||
running = false;
|
||||
timervalue = 0.0;
|
||||
name = "";
|
||||
}
|
||||
|
||||
//destructor does nothing
|
||||
DKSTimer::~DKSTimer() {
|
||||
|
||||
}
|
||||
|
||||
//init the timer by setting name and clearing timervalue, also sets running to false
|
||||
void DKSTimer::init(std::string n) {
|
||||
running = false;
|
||||
timervalue = 0.0;
|
||||
name = n;
|
||||
}
|
||||
|
||||
//if timer is not running get the current time and save to timeStart, set the timer as running
|
||||
void DKSTimer::start() {
|
||||
if (!running) {
|
||||
gettimeofday(&timeStart, NULL);
|
||||
running = true;
|
||||
}
|
||||
}
|
||||
|
||||
//if the timer is running get the current time to timeEnd, calculate the elapsed time befor start
|
||||
//and end, add elapsed time to timervalue, set the timer as not running
|
||||
void DKSTimer::stop() {
|
||||
if (running) {
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
timervalue += ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6;
|
||||
running = false;
|
||||
}
|
||||
}
|
||||
|
||||
//discard the accumulated time and mark the timer as not running;
//the name, timeStart and timeEnd are left untouched
void DKSTimer::reset() {
  timervalue = 0.0;
  running = false;
}
|
||||
|
||||
//return the accumulated value of timervalue
|
||||
double DKSTimer::gettime() {
|
||||
return timervalue;
|
||||
}
|
||||
|
||||
//write one line with the timer name and the accumulated time to std::cout
void DKSTimer::print() {
  std::cout << "DKSTimer " << name;
  std::cout << " elapsed time\t" << timervalue << "s";
  std::cout << std::endl;
}
|
59
src/Utility/DKSTimer.h
Normal file
59
src/Utility/DKSTimer.h
Normal file
@ -0,0 +1,59 @@
|
||||
#ifndef H_DKSTIMER
|
||||
#define H_DKSTIMER
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <sys/time.h>
|
||||
|
||||
/** Accumulating timer built on gettimeofday.
 * Each start()/stop() pair adds the time between the two calls to a running
 * total that can be queried with gettime() or printed with print().
 */
class DKSTimer {

private:

  bool running;        // true between a start() and the matching stop()
  double timervalue;   // accumulated time in seconds
  timeval timeStart;   // time stamp taken by start()
  timeval timeEnd;     // time stamp taken by stop()
  std::string name;    // label printed by print()

public:

  /** Create a stopped timer with zero accumulated time and an empty name. */
  DKSTimer();

  ~DKSTimer();

  /** Init the timer.
   * Set the name of the timer, clear the accumulated time and mark the
   * timer as not running.
   */
  void init(std::string n);

  /** Start the timer.
   * If the timer is not running, get the current time with gettimeofday and
   * save it in timeStart.
   */
  void start();

  /** Stop the timer.
   * If the timer is running, get the current time with gettimeofday, save it
   * in timeEnd and add the elapsed time (timeEnd - timeStart) to timervalue.
   */
  void stop();

  /** Reset the timer.
   * Set timervalue to zero and mark the timer as not running.
   */
  void reset();

  /** Return elapsed time in seconds.
   * Return the value of timervalue.
   */
  double gettime();

  /** Print timer.
   * Print the name and the accumulated time of the timer to std::cout.
   */
  void print();

};
|
||||
|
||||
#endif
|
11
src/Utility/TimeStamp.cpp
Normal file
11
src/Utility/TimeStamp.cpp
Normal file
@ -0,0 +1,11 @@
|
||||
#include "TimeStamp.h"
|
||||
|
||||
// Current wall-clock time from gettimeofday, expressed in microseconds.
timestamp_t get_timestamp() {
  struct timeval tv;
  gettimeofday (&tv, NULL);

  // convert the seconds in the wide timestamp type before scaling
  const timestamp_t secondsAsUsec = (timestamp_t)tv.tv_sec * 1000000;
  return tv.tv_usec + secondsAsUsec;
}
|
||||
|
||||
// Difference t_end - t_start of two microsecond time stamps, in seconds.
// timestamp_t is unsigned, so the subtraction is always done larger-minus-
// smaller and the sign is applied afterwards; a reversed pair therefore
// yields a negative duration instead of a wrapped-around huge value.
double get_secs(timestamp_t t_start, timestamp_t t_end) {
  if (t_end >= t_start)
    return (t_end - t_start) / 1000000.0L;

  return -((t_start - t_end) / 1000000.0L);
}
|
14
src/Utility/TimeStamp.h
Normal file
14
src/Utility/TimeStamp.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef H_TIMESTAMPE
|
||||
#define H_TIMESTAMPE
|
||||
|
||||
#include <iostream>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
typedef unsigned long long timestamp_t;
|
||||
|
||||
timestamp_t get_timestamp();
|
||||
double get_secs(timestamp_t t_start, timestamp_t t_end);
|
||||
|
||||
|
||||
#endif
|
84
test/CMakeLists.txt
Normal file
84
test/CMakeLists.txt
Normal file
@ -0,0 +1,84 @@
|
||||
# Build rules for the DKS test programs.
#
# Each test is a single-source executable linked against the dks library.
# PROJECT_SOURCE_DIR is used instead of CMAKE_SOURCE_DIR so the paths stay
# correct when DKS is built as a subdirectory of another project.

include_directories ("${PROJECT_SOURCE_DIR}/src")

link_directories ("${PROJECT_SOURCE_DIR}/src")

# Tests that are built. To enable another test whose source is <name>.cpp,
# add its name to this list.
set (_DKS_ENABLED_TESTS
  testCollimatorPhysics
)

# Currently disabled tests (source file is <name>.cpp unless noted):
#   testDKS testChi testFFT testMIC testMICOpenCL testFFT3D testFFT3DRC
#   testFFT3DRC_MIC testFFT3DTiming testStockhamFFT testStockFFT3D
#   testMemObjects testRCFFT testOffset testOffsetMPI testMPI testMPIFFT
#   testGather testGatherAsync testTranspose testCollimatorPhysicsSoA
#   testPush testImageReconstruction testChiSquare
#   testFFTSolverMIC   (source: testFFTSolver_MIC.cpp)
#   testIntegration    (source: testTimeIntegration.cpp)
#   testFFT3DSO        (links the shared library target dksshared, not dks)
# Disabled, and originally only built when COMPILER_NAME is "mpicxx":
#   testGatherAsync2 testGreens testFFTSolver testCollimatorPhysicsMPI
# Disabled, and originally only built when CUDA_VERSION is at least 7.0:
#   testChiSquareRT

foreach (_dks_test IN LISTS _DKS_ENABLED_TESTS)
  add_executable (${_dks_test} ${_dks_test}.cpp)
  target_link_libraries (${_dks_test} PRIVATE dks)
endforeach ()
|
141
test/testChi.cpp
Normal file
141
test/testChi.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "Utility/TimeStamp.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[4];
|
||||
|
||||
|
||||
if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else if (argc == 2){
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
/* inti data */
|
||||
int ierr;
|
||||
int nsize = 4000000;
|
||||
int jsize = 16;
|
||||
int psize = 6;
|
||||
double *data = new double[nsize*jsize];
|
||||
double *p = new double[psize*jsize];
|
||||
double data_out = 0;
|
||||
|
||||
srand(time(NULL));
|
||||
for (int i = 0; i < nsize*jsize; i++) {
|
||||
//int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
//data[i] = sign*(double)rand()/RAND_MAX;
|
||||
data[i] = (double)i / (nsize*jsize);
|
||||
//data[i] = 1;
|
||||
}
|
||||
for (int i = 0; i < psize*jsize; i++) {
|
||||
//int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
//p[i] = sign*(double)rand()/RAND_MAX;
|
||||
p[i] = (double)i / (nsize*jsize);
|
||||
//p[i] = 1;
|
||||
}
|
||||
/* end init */
|
||||
|
||||
timestamp_t tstart, tend;
|
||||
//timestamp_t t0, t1;
|
||||
|
||||
tstart = get_timestamp();
|
||||
|
||||
//init dks base class, set API to opencl and init connection with OpenCL device
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
|
||||
//ptrs to hold reference to device memory
|
||||
void *dptr, *ntptr, *pptr;
|
||||
|
||||
//allocate memory on device
|
||||
//t0 = get_timestamp();
|
||||
dptr = base.allocateMemory<double>(nsize*jsize, ierr);
|
||||
ntptr = base.allocateMemory<double>(nsize*jsize, ierr);
|
||||
pptr = base.allocateMemory<double>(psize*jsize, ierr);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Allocate memory: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//write data to device
|
||||
//t0 = get_timestamp();
|
||||
base.writeData<double>(dptr, data, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Write data set: " << get_secs(t0, t1) << endl << endl;
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
//write parameters to device
|
||||
//t0 = get_timestamp();
|
||||
base.writeData<double>(pptr, p, psize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Write parameters: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function to calcNt and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);
|
||||
//t1 = get_timestamp();
|
||||
|
||||
//cout << "Calc N(t): " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function to chi2 and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callChi2<double>(ntptr, dptr, ntptr, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Calc chi^2: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//set function so sum and execute it with necessary parameters
|
||||
//t0 = get_timestamp();
|
||||
base.callSum<double>(ntptr, ntptr, nsize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Calc sum: " << get_secs(t0, t1) << endl;
|
||||
|
||||
//read calculated sum (one value)
|
||||
//t0 = get_timestamp();
|
||||
base.readData<double>(ntptr, &data_out, 1);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Read sum: " << get_secs(t0, t1) << endl;
|
||||
cout << "Sum nt: " << data_out << endl;
|
||||
|
||||
/*
|
||||
for (int i = 0; i < psize*jsize; i++) {
|
||||
int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1;
|
||||
p[i] = sign*(double)rand()/RAND_MAX;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
//cout << endl;
|
||||
}
|
||||
|
||||
//free device memory
|
||||
//t0 = get_timestamp();
|
||||
base.freeMemory<double>(dptr, nsize*jsize);
|
||||
base.freeMemory<double>(ntptr, nsize*jsize);
|
||||
base.freeMemory<double>(pptr, psize*jsize);
|
||||
//t1 = get_timestamp();
|
||||
//cout << "Free memory: " << get_secs(t0, t1) << endl;
|
||||
|
||||
tend = get_timestamp();
|
||||
|
||||
cout << endl << "time: " << get_secs(tstart, tend) << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
168
test/testChiSquare.cpp
Normal file
168
test/testChiSquare.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Append the values 0, 1, ..., length-1 to every row of v.
void initData(std::vector< std::vector<double> > &v, int length) {
  typedef std::vector< std::vector<double> >::iterator RowIt;

  for (RowIt row = v.begin(); row != v.end(); ++row) {
    for (int value = 0; value < length; value++)
      row->push_back(value);
  }
}
|
||||
|
||||
|
||||
// Print v to std::cout: one line per row, every value followed by a tab.
void printData(std::vector< std::vector<double> > &v) {
  typedef std::vector< std::vector<double> >::const_iterator RowIt;
  typedef std::vector<double>::const_iterator ValueIt;

  for (RowIt row = v.begin(); row != v.end(); ++row) {
    for (ValueIt value = row->begin(); value != row->end(); ++value)
      std::cout << *value << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
// Fill a flat array of sensors rows with length values each: the element at
// column j of every row is set to j.
void initData(double *data, int sensors, int length) {
  int sensor = 0;
  while (sensor < sensors) {
    int bin = 0;
    while (bin < length) {
      data[sensor*length + bin] = bin;
      ++bin;
    }
    ++sensor;
  }
}
|
||||
|
||||
|
||||
// Print a flat array of sensors rows with length values each to std::cout:
// one line per row, every value followed by a tab.
void printData(double *data, int sensors, int length) {
  for (int sensor = 0; sensor < sensors; sensor++) {
    const int rowStart = sensor*length;
    for (int bin = 0; bin < length; bin++)
      std::cout << data[rowStart + bin] << "\t";
    std::cout << std::endl;
  }
}
|
||||
|
||||
// Set par[i] = i / npar for the npar entries of par.
void initPar(double *par, int npar) {
  int index = 0;
  while (index < npar) {
    par[index] = (double)index / npar;
    ++index;
  }
}
|
||||
|
||||
// Print a divider line of size '=' characters to std::cout.
void printDiv(int size) {
  int remaining = size;
  while (remaining > 0) {
    std::cout << "=";
    --remaining;
  }
  std::cout << std::endl;
}
|
||||
|
||||
// Host reference for the chi-square calculation; prints every per-bin
// contribution (one line per data row) followed by the total.
//
//   fData            one row of measured values per sensor
//   par              parameters: par[0] and par[1] are shared by all rows,
//                    row i additionally uses par[2 + 4*i] .. par[5 + 4*i]
//   fTimeResolution  time resolution; bin j is evaluated at
//                    dt0 + fTimeResolution * fRebin * j
//   fRebin           rebinning factor
//
// The data set is taken by const reference (no copy per call) and every row is
// traversed with its own length, so rows shorter than row 0 are not read out
// of bounds.
void calcChisq(const std::vector< std::vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
{

  double chisq = 0.0;
  double theo, data;
  const double tau=2.197019;
  const double dt0 = fTimeResolution*0.5*(fRebin-1);
  double time;
  double w = par[0]*0.08516155035269027;

  unsigned int i, j;

  for (i=0; i<fData.size(); i++) {
    for (j=0; j<fData[i].size(); j++) {
      data = fData[i][j];
      time = dt0+fTimeResolution*fRebin*j;

      theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4];

      // an empty bin cannot be used as divisor; use the squared theory value
      const double term = (data != 0.0) ? (theo-data)*(theo-data)/data : theo*theo;
      chisq += term;
      std::cout << term << "\t";
    }
    std::cout << std::endl;
  }

  std::cout << "Chisq: " << chisq << std::endl;

}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
bool useCuda = true;
|
||||
if (argc == 2 && atoi(argv[1]) == 1)
|
||||
useCuda = false;
|
||||
|
||||
int ierr;
|
||||
int sensors = 5;
|
||||
int length = 10;
|
||||
int npar = 4 * sensors + 2;
|
||||
int ndata = sensors * length;
|
||||
|
||||
double result;
|
||||
|
||||
double fTimeResolution = 0.05;
|
||||
double fRebin = 5;
|
||||
|
||||
double *par = new double[npar];
|
||||
initPar(par, npar);
|
||||
|
||||
vector< vector< double > > fData;
|
||||
fData.resize(sensors);
|
||||
initData(fData, length);
|
||||
printData(fData);
|
||||
printDiv(75);
|
||||
|
||||
DKSBase dksbase;
|
||||
if (useCuda)
|
||||
dksbase.setAPI("Cuda", 4);
|
||||
else
|
||||
dksbase.setAPI("OpenCL", 6);
|
||||
dksbase.setDevice("-gpu", 4);
|
||||
dksbase.initDevice();
|
||||
dksbase.setupFFT(0, NULL);
|
||||
|
||||
|
||||
void *mem_data, *mem_par, *mem_chisq;
|
||||
cout << "Allocate memory" << endl;
|
||||
mem_par = dksbase.allocateMemory<double>(npar, ierr);
|
||||
mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
|
||||
mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
|
||||
|
||||
|
||||
cout << "Write data" << endl;
|
||||
dksbase.writeData<double>(mem_par, par, npar);
|
||||
for (int i = 0; i < sensors; i++)
|
||||
dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);
|
||||
|
||||
|
||||
|
||||
cout << "Call PHistoTFFcn" << endl;
|
||||
dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||
fTimeResolution, fRebin,
|
||||
sensors, length, npar, result);
|
||||
cout << "Result: " << result << endl;
|
||||
|
||||
|
||||
double *out_data = new double[ndata];
|
||||
dksbase.readData<double>(mem_chisq, out_data, ndata);
|
||||
printDiv(75);
|
||||
printData(out_data, sensors, length);
|
||||
printDiv(75);
|
||||
|
||||
calcChisq(fData, par, fTimeResolution, fRebin);
|
||||
printDiv(75);
|
||||
|
||||
cout << "Free memory" << endl;
|
||||
dksbase.freeMemory<double>(mem_par, npar);
|
||||
dksbase.freeMemory<double>(mem_data, ndata);
|
||||
dksbase.freeMemory<double>(mem_chisq, ndata);
|
||||
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
193
test/testChiSquareRT.cpp
Normal file
193
test/testChiSquareRT.cpp
Normal file
@ -0,0 +1,193 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <omp.h>
|
||||
|
||||
#include "DKSBaseMuSR.h"
|
||||
#include "Utility/DKSTimer.h"
|
||||
|
||||
// Fill data[0..N-1] either with 1.0 (ones == true) or with values
// rand() / RAND_MAX.
void initData(double *data, int N, bool ones = false) {
  for (double *cur = data; cur < data + N; ++cur)
    *cur = ones ? 1.0 : (double)rand() / RAND_MAX;
}
|
||||
|
||||
// Print the N elements of data on one line of std::cout, each followed by a tab.
template <typename T>
void printData(T *data, int N) {
  int index = 0;
  while (index < N) {
    std::cout << data[index] << "\t";
    ++index;
  }
  std::cout << std::endl;
}
|
||||
|
||||
|
||||
const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])";
|
||||
//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
|
||||
//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
|
||||
//const std::string funct = "p[m[1]] + p[m[0]]";
|
||||
|
||||
// Host version of the theory function "cos(t*p[0]) - exp(-t*p[m[0]])".
// func is accepted for a uniform signature but not used.
double fTheory(double time, double *par, double *func, int *map) {
  const double oscillation = cos(time*par[0]);
  const double decay = exp(-time*par[map[0]]);
  return oscillation - decay;
}
|
||||
|
||||
double testFunctionSerial(double *data, double *par, double *func, int *map,
|
||||
double N0, double tau, double bkg, double timeStep,
|
||||
int startTimeBin, int endTimeBin)
|
||||
{
|
||||
double time, diff, theo;
|
||||
double chisq = 0;
|
||||
for (int i = startTimeBin; i < endTimeBin; ++i) {
|
||||
time = i * timeStep;
|
||||
theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
|
||||
diff = data[i] - theo;
|
||||
|
||||
chisq += diff * diff / data[i];
|
||||
}
|
||||
|
||||
return chisq;
|
||||
}
|
||||
|
||||
double testFunctionParallel(double *data, double *par, double *func, int *map,
|
||||
double N0, double tau, double bkg, double timeStep,
|
||||
int startTimeBin, int endTimeBin)
|
||||
{
|
||||
int i, chunk;
|
||||
double time, diff, theo;
|
||||
double chisq = 0;
|
||||
|
||||
chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
|
||||
if (chunk < 10)
|
||||
chunk = 10;
|
||||
#pragma omp parallel for default(shared) private (i,time,diff) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
|
||||
for (i = startTimeBin; i < endTimeBin; ++i) {
|
||||
time = i * timeStep;
|
||||
theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
|
||||
diff = data[i] - theo;
|
||||
|
||||
chisq += diff * diff / data[i];
|
||||
}
|
||||
|
||||
return chisq;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int Loop = 100;
|
||||
|
||||
//init test data on the host
|
||||
int Ndata = 8;
|
||||
if (argc > 1)
|
||||
Ndata = atoi(argv[1]);
|
||||
|
||||
int api = 1;
|
||||
if (argc > 2)
|
||||
api = atoi(argv[2]);
|
||||
|
||||
int Npar = 66;
|
||||
int Nfunc = 1;
|
||||
int Nmap = 4;
|
||||
|
||||
double *data = new double[Ndata];
|
||||
double *par = new double[Npar];
|
||||
double *func = new double[Nfunc];
|
||||
int *map = new int[Nmap];
|
||||
|
||||
initData(data, Ndata);
|
||||
initData(par, Npar);
|
||||
initData(func, Nfunc);
|
||||
map[0] = 1;
|
||||
map[1] = 2;
|
||||
map[2] = 3;
|
||||
map[3] = 4;
|
||||
|
||||
//create timers
|
||||
DKSTimer serialTimer;
|
||||
DKSTimer cudaTimer;
|
||||
DKSTimer ompTimer;
|
||||
DKSTimer gpuOverhead;
|
||||
serialTimer.init("Serial timer");
|
||||
cudaTimer.init("Cuda timer");
|
||||
ompTimer.init("OpenMP timer");
|
||||
gpuOverhead.init("Overhead for gpu");
|
||||
|
||||
|
||||
//serial version
|
||||
double resultSerial;
|
||||
|
||||
serialTimer.start();
|
||||
for (int i = 0; i < Loop; i++)
|
||||
resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
|
||||
serialTimer.stop();
|
||||
|
||||
//openmp version
|
||||
double resultOMP = 0.0;
|
||||
|
||||
ompTimer.start();
|
||||
//for (int i = 0; i < Loop; i++)
|
||||
// resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
|
||||
ompTimer.stop();
|
||||
|
||||
|
||||
//create and init dkabase
|
||||
gpuOverhead.start();
|
||||
|
||||
DKSBaseMuSR dksbase;
|
||||
if (api == 1)
|
||||
dksbase.setAPI("Cuda");
|
||||
else
|
||||
dksbase.setAPI("OpenCL");
|
||||
|
||||
dksbase.setDevice("-gpu");
|
||||
dksbase.initDevice();
|
||||
dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);
|
||||
|
||||
//allocate memory on the device
|
||||
int ierr;
|
||||
void *data_ptr;
|
||||
|
||||
data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
|
||||
|
||||
dksbase.writeData<double>(data_ptr, data, Ndata);
|
||||
dksbase.writeFunctions(func, Nfunc);
|
||||
dksbase.writeMaps(map, Nmap);
|
||||
|
||||
dksbase.callCompileProgram(funct);
|
||||
gpuOverhead.stop();
|
||||
|
||||
double resultCuda;
|
||||
|
||||
cudaTimer.start();
|
||||
for (int i = 0; i < Loop; i++) {
|
||||
dksbase.writeParams(par, Npar);
|
||||
int ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
|
||||
0.0, 0.1, 0, resultCuda);
|
||||
|
||||
if (ierr != 0)
|
||||
exit (EXIT_FAILURE);
|
||||
|
||||
}
|
||||
cudaTimer.stop();
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "=======================Results=======================" << std::endl;
|
||||
std::cout << "Result serial = " << resultSerial << std::endl;
|
||||
std::cout << "Result prallel = " << resultOMP << std::endl;
|
||||
std::cout << "Result cuda = " << resultCuda << std::endl;
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "=======================Timings=======================" << std::endl;
|
||||
serialTimer.print();
|
||||
ompTimer.print();
|
||||
cudaTimer.print();
|
||||
gpuOverhead.print();
|
||||
std::cout << std::endl;
|
||||
|
||||
dksbase.freeMemory<double>(data_ptr, Ndata);
|
||||
|
||||
return 0;
|
||||
|
||||
|
||||
}
|
248
test/testCollimatorPhysics.cpp
Normal file
248
test/testCollimatorPhysics.cpp
Normal file
@ -0,0 +1,248 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <vector>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Reduced particle record used by this test.
// NOTE(review): the layout presumably has to match the particle structure
// expected by the device code - confirm before reordering or adding members.
struct PART_SMALL {
  int label;
  unsigned localID;
  double Rincol[3];
  double Pincol[3];
};
|
||||
|
||||
// Plain triple of doubles.
struct Vector {
  double x;
  double y;
  double z;
};
|
||||
|
||||
// Build the start particle for local id d: label 0, Rincol (0, 0, 0.02) and
// Pincol (0, 0, 3.9920183237269791e-01).
PART_SMALL initPartSmall(int d) {

  PART_SMALL part;
  part.label = 0;
  part.localID = d;

  // the first two components of both vectors are zero
  for (int axis = 0; axis < 2; axis++) {
    part.Rincol[axis] = 0.0;
    part.Pincol[axis] = 0.0;
  }
  part.Rincol[2] = 0.02;
  part.Pincol[2] = 3.9920183237269791e-01;

  return part;
}
|
||||
|
||||
// Return a Vector with all three components set to 0.5.
Vector initVector() {
  const double component = 0.5;

  Vector result;
  result.x = component;
  result.y = component;
  result.z = component;

  return result;
}
|
||||
|
||||
// Print label, local id, Rincol and Pincol of one particle on a single line.
void printPart(PART_SMALL p) {
  std::cout << "label: " << p.label << ", "
	    << "localid: " << p.localID << ","
	    << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
	    << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]
	    << std::endl;
}
|
||||
|
||||
// Print the three components of v, separated by tabs, on one line.
void printVector(Vector v) {
  std::cout << v.x << "\t";
  std::cout << v.y << "\t";
  std::cout << v.z << std::endl;
}
|
||||
|
||||
// Initialise the N particles of p; particle i is created by initPartSmall(i).
void initParts(PART_SMALL *p, int N) {
  int index = 0;
  while (index < N) {
    p[index] = initPartSmall(index);
    ++index;
  }
}
|
||||
|
||||
// Print the N particles of p, one per line, followed by an empty line.
void printParts(PART_SMALL *p, int N) {
  for (PART_SMALL *cur = p; cur < p + N; ++cur)
    printPart(*cur);
  std::cout << std::endl;
}
|
||||
|
||||
// Set each of the N vectors of v to the value returned by initVector().
void initVectors(Vector *v, int N) {
  int index = 0;
  while (index < N) {
    v[index] = initVector();
    ++index;
  }
}
|
||||
|
||||
// Print the N vectors of v, one per line, followed by an empty line.
void printVectors(Vector *v, int N) {
  for (Vector *cur = v; cur < v + N; ++cur)
    printVector(*cur);
  std::cout << std::endl;
}
|
||||
|
||||
|
||||
// Write the 12 fixed parameters passed to the collimator physics call into
// data[0..11].
void initParams(double *data) {
  static const double values[12] = {
    0.0,                     // was 2.0000000000000000e-02
    1.0,                     // was 1.0000000000000000e-02
    2.2100000000000000e+00,
    6.0000000000000000e+00,
    1.2010700000000000e+01,
    2.6010000000000000e+00,
    1.7010000000000000e+03,
    1.2790000000000000e+03,
    1.6379999999999999e-02,
    1.9321266968325795e-01,
    7.9000000000000000e+01,
    1.0000000000000002e-12
  };

  for (int i = 0; i < 12; i++)
    data[i] = values[i];
}
|
||||
|
||||
// Print the N values of data on one line of std::cout, each followed by a tab.
void printDouble(double *data, int N) {
  for (const double *cur = data; cur < data + N; ++cur)
    std::cout << *cur << "\t";
  std::cout << std::endl;
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int loop = 10;
|
||||
int numpart = 1e5;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenMP");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-npart")) {
|
||||
numpart = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-loop")) {
|
||||
loop = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of particles: " << numpart << endl;
|
||||
cout << "Number of loops: " << loop << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//init part vector to test mc
|
||||
PART_SMALL *parts = new PART_SMALL[numpart];
|
||||
initParts(parts, numpart);
|
||||
|
||||
double *params = new double[12];
|
||||
initParams(params);
|
||||
|
||||
//init dks
|
||||
int ierr;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
//init random
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
//**test collimator physics and sort***//
|
||||
void *part_ptr, *param_ptr;
|
||||
|
||||
//allocate memory for particles
|
||||
part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
|
||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
||||
|
||||
//transfer data to device
|
||||
base.writeData<PART_SMALL>(part_ptr, parts, numpart);
|
||||
base.writeData<double>(param_ptr, params, 12);
|
||||
|
||||
int numaddback;
|
||||
//test calls to do some first executions
|
||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
//std::cout << "particles to add back: " << numaddback << std::endl;
|
||||
|
||||
struct timeval timeStart, timeEnd;
|
||||
std::cout << "Start MC" << std::endl;
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < loop; i++) {
|
||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
std::cout << "addback: " << numaddback << std::endl;
|
||||
|
||||
std::cout << "End MC" << std::endl;
|
||||
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec));
|
||||
|
||||
std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
|
||||
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
|
||||
|
||||
//read data from device
|
||||
base.readData<PART_SMALL>(part_ptr, parts, numpart);
|
||||
|
||||
//free memory
|
||||
base.freeMemory<PART_SMALL>(part_ptr, numpart);
|
||||
base.freeMemory<double>(param_ptr, 12);
|
||||
|
||||
|
||||
std::cout << std::fixed << std::setprecision(4);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
std::cout << parts[i].label << "\t"
|
||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
std:: cout << "..." << std::endl;
|
||||
|
||||
for (int i = numpart - 10; i < numpart; i++) {
|
||||
std::cout << parts[i].label << "\t"
|
||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
double arx = 0, ary = 0, arz = 0;
|
||||
double apx = 0, apy = 0, apz = 0;
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
|
||||
arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
|
||||
ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
|
||||
arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
|
||||
|
||||
apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
|
||||
apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
|
||||
apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
|
||||
|
||||
}
|
||||
|
||||
std::cout << std::fixed << std::setprecision(10);
|
||||
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
|
||||
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
|
||||
|
||||
|
||||
cout << "==========================END TEST==========================" << endl;
|
||||
return 0;
|
||||
|
||||
}
|
126
test/testCollimatorPhysicsMPI.cpp
Normal file
126
test/testCollimatorPhysicsMPI.cpp
Normal file
@ -0,0 +1,126 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "DKSBase.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Per-particle record used by this test: main() uploads an array of these with
// writeData<PART> and reads it back with readData<PART>.
// NOTE(review): the field order and types presumably have to match the layout
// the device kernel expects, so they are kept exactly as declared -- confirm
// before reordering anything.
struct PART {
  int label;            // filled from the particle index by initPart()
  unsigned localID;
  double Rincol[3];     // printed under "Rincol" by printPart()
  double Pincol[3];     // printed under "Pincol" by printPart()
  long IDincol;
  int Binincol;
  double DTincol;
  double Qincol;
  long LastSecincol;
  double Bfincol[3];
  double Efincol[3];
};
|
||||
|
||||
// Builds one test particle from the integer seed d.
// Every scalar field is set to d, the position/momentum components to 0.5 and
// the field components (Bfincol/Efincol) to 1/(d+1).
PART initPart(int d) {

  PART particle;

  // scalar bookkeeping fields all carry the seed value
  particle.label = d;
  particle.localID = d;
  particle.IDincol = d;
  particle.Binincol = d;
  particle.DTincol = d;
  particle.Qincol = d;
  particle.LastSecincol = d;

  const double fieldValue = 1.0 / (d + 1);
  for (int axis = 0; axis < 3; ++axis) {
    particle.Rincol[axis] = 0.5;
    particle.Pincol[axis] = 0.5;
    particle.Bfincol[axis] = fieldValue;
    particle.Efincol[axis] = fieldValue;
  }

  return particle;
}
|
||||
|
||||
// Writes one particle to stdout on a single line.
// Only label, Rincol and Pincol are printed; the remaining fields are omitted.
void printPart(PART p) {

  cout << "label: " << p.label << ", "
       << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
       << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "
       << endl;

}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int ierr;
|
||||
int rank, nprocs;
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
int numpart = 500501;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
PART tmp;
|
||||
vector<PART> p;
|
||||
vector<PART> p_out;
|
||||
p_out.resize(numpart);
|
||||
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
tmp = initPart(i + 1);
|
||||
p.push_back(tmp);
|
||||
}
|
||||
|
||||
if (numpart <= 20) {
|
||||
for (int i = 0; i < 10; i++)
|
||||
printPart(p[i]);
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
double params[19];
|
||||
for (int i = 0; i < 19; i++)
|
||||
params[i] = 0.05;
|
||||
params[0] = 0;
|
||||
params[1] = 1;
|
||||
|
||||
void *mem_ptr, *par_ptr;
|
||||
|
||||
par_ptr = base.allocateMemory<double>(19, ierr);
|
||||
base.writeData<double>(par_ptr, params, 19);
|
||||
|
||||
mem_ptr = base.allocateMemory<PART>(numpart, ierr);
|
||||
base.writeData<PART>(mem_ptr, &p[0], numpart);
|
||||
|
||||
int addback, dead;
|
||||
for (int i = 0; i < 100; i++)
|
||||
base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead);
|
||||
cout << "Add back: " << addback << ", dead: " << dead << endl;
|
||||
|
||||
base.readData<PART>(mem_ptr, &p_out[0], numpart);
|
||||
base.freeMemory<PART>(mem_ptr, ierr);
|
||||
base.freeMemory<double>(par_ptr, ierr);
|
||||
|
||||
if (numpart <= 20) {
|
||||
for (int i = 0; i < numpart; i++)
|
||||
printPart(p_out[i]);
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
|
||||
}
|
250
test/testCollimatorPhysicsSoA.cpp
Normal file
250
test/testCollimatorPhysicsSoA.cpp
Normal file
@ -0,0 +1,250 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#include <vector>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
#include <vector_types.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Structure-of-arrays view of the particle set: one host array per attribute.
// main() allocates every array with numpart entries.
struct PART {
  int *label;
  unsigned *localID;
  // position components
  double *rx;
  double *ry;
  double *rz;
  // momentum components
  double *px;
  double *py;
  double *pz;
};
|
||||
|
||||
|
||||
// Fills the first npart entries of every particle array with the test's
// starting state: label 0, localID equal to the index, position (0, 0, 0.02)
// and momentum (0, 0, 3.9920183237269791e-01).
void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
               double *px, double *py, double *pz, int npart) {

  const double startZ = 0.02;
  const double startPz = 3.9920183237269791e-01;

  int idx = 0;
  while (idx < npart) {
    label[idx] = 0;
    localID[idx] = idx;

    rx[idx] = 0.0;
    ry[idx] = 0.0;
    rz[idx] = startZ;

    px[idx] = 0.0;
    py[idx] = 0.0;
    pz[idx] = startPz;

    ++idx;
  }
}
|
||||
|
||||
// Writes the 12 kernel parameters used by this test into data[0..11].
// data must have room for at least 12 doubles.
void initParams(double *data) {
  static const double values[12] = {
    0.0,                      // source notes 2.0000000000000000e-02 as an alternative
    1.0,                      // source notes 1.0000000000000000e-02 as an alternative
    2.2100000000000000e+00,
    6.0000000000000000e+00,
    1.2010700000000000e+01,
    2.6010000000000000e+00,
    1.7010000000000000e+03,
    1.2790000000000000e+03,
    1.6379999999999999e-02,
    1.9321266968325795e-01,
    7.9000000000000000e+01,
    1.0000000000000002e-12
  };

  for (int slot = 0; slot < 12; ++slot)
    data[slot] = values[slot];
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int loop = 10;
|
||||
int numpart = 1e5;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
|
||||
if (argv[i] == string("-mic")) {
|
||||
strcpy(api_name, "OpenMP");
|
||||
strcpy(device_name, "-mic");
|
||||
}
|
||||
|
||||
if (argv[i] == string("-npart")) {
|
||||
numpart = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argv[i] == string("-loop")) {
|
||||
loop = atoi(argv[i+1]);
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int threads = 0;
|
||||
/*
|
||||
#pragma offload target(mic:0) out(threads)
|
||||
{
|
||||
#pragma omp parallel
|
||||
{
|
||||
threads = omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
cout << "=========================BEGIN TEST=========================" << endl;
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
cout << "Number of particles: " << numpart << endl;
|
||||
cout << "Number of loops: " << loop << endl;
|
||||
cout << "Number of threads: " << threads << endl;
|
||||
cout << "------------------------------------------------------------" << endl;
|
||||
|
||||
//init part vector to test mc
|
||||
//int *label;
|
||||
//unsigned *localID;
|
||||
//double *rx, *ry, *rz, *px, *py, *pz;
|
||||
PART p;
|
||||
p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64);
|
||||
p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64);
|
||||
p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
|
||||
initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
|
||||
|
||||
double *params = new double[12];
|
||||
initParams(params);
|
||||
|
||||
//init dks
|
||||
int ierr;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
//init random
|
||||
base.callInitRandoms(numpart);
|
||||
|
||||
//**test collimator physics and sort***//
|
||||
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
||||
|
||||
//allocate memory for particles
|
||||
label_ptr = base.allocateMemory<int>(numpart, ierr);
|
||||
localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
|
||||
rx_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
ry_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
rz_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
px_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
py_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
pz_ptr = base.allocateMemory<double>(numpart, ierr);
|
||||
|
||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
||||
|
||||
//transfer data to device
|
||||
base.writeData<int>(label_ptr, p.label, numpart);
|
||||
base.writeData<unsigned>(localID_ptr, p.localID, numpart);
|
||||
base.writeData<double>(rx_ptr, p.rx, numpart);
|
||||
base.writeData<double>(ry_ptr, p.ry, numpart);
|
||||
base.writeData<double>(rz_ptr, p.rz, numpart);
|
||||
base.writeData<double>(px_ptr, p.px, numpart);
|
||||
base.writeData<double>(py_ptr, p.py, numpart);
|
||||
base.writeData<double>(pz_ptr, p.pz, numpart);
|
||||
|
||||
//transfer params to device
|
||||
base.writeData<double>(param_ptr, params, 12);
|
||||
|
||||
std::cout << "test runs" << std::endl;
|
||||
|
||||
int numaddback;
|
||||
//test calls to do some first executions
|
||||
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
|
||||
struct timeval timeStart, timeEnd;
|
||||
std::cout << "Start MC" << std::endl;
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i = 0; i < loop; i++) {
|
||||
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart);
|
||||
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
|
||||
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
|
||||
base.syncDevice();
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
std::cout << "addback: " << numaddback << std::endl;
|
||||
|
||||
std::cout << "End MC" << std::endl;
|
||||
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec));
|
||||
|
||||
std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl;
|
||||
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
|
||||
|
||||
//read data from device
|
||||
base.readData<int>(label_ptr, p.label, numpart);
|
||||
base.readData<unsigned>(localID_ptr, p.localID, numpart);
|
||||
base.readData<double>(rx_ptr, p.rx, numpart);
|
||||
base.readData<double>(ry_ptr, p.ry, numpart);
|
||||
base.readData<double>(rz_ptr, p.rz, numpart);
|
||||
base.readData<double>(px_ptr, p.px, numpart);
|
||||
base.readData<double>(py_ptr, p.py, numpart);
|
||||
base.readData<double>(pz_ptr, p.pz, numpart);
|
||||
|
||||
//free memory
|
||||
base.freeMemory<int>(label_ptr, numpart);
|
||||
base.freeMemory<unsigned>(localID_ptr, numpart);
|
||||
base.freeMemory<double>(rx_ptr, numpart);
|
||||
base.freeMemory<double>(ry_ptr, numpart);
|
||||
base.freeMemory<double>(rz_ptr, numpart);
|
||||
base.freeMemory<double>(px_ptr, numpart);
|
||||
base.freeMemory<double>(py_ptr, numpart);
|
||||
base.freeMemory<double>(pz_ptr, numpart);
|
||||
|
||||
base.freeMemory<double>(param_ptr, 12);
|
||||
|
||||
/*
|
||||
std::cout << std::fixed << std::setprecision(4);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
|
||||
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
|
||||
}
|
||||
std:: cout << "..." << std::endl;
|
||||
|
||||
for (int i = numpart - 10; i < numpart; i++) {
|
||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
|
||||
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
|
||||
}
|
||||
|
||||
double arx = 0, ary = 0, arz = 0;
|
||||
double apx = 0, apy = 0, apz = 0;
|
||||
for (int i = 0; i < numpart; i++) {
|
||||
|
||||
arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
|
||||
ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
|
||||
arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
|
||||
|
||||
apx += sqrt(p.px[i] * p.px[i]) / numpart;
|
||||
apy += sqrt(p.py[i] * p.py[i]) / numpart;
|
||||
apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
|
||||
|
||||
}
|
||||
|
||||
std::cout << std::fixed << std::setprecision(10);
|
||||
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
|
||||
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
|
||||
*/
|
||||
cout << "==========================END TEST==========================" << endl;
|
||||
return 0;
|
||||
|
||||
}
|
15
test/testDKS.cpp
Normal file
15
test/testDKS.cpp
Normal file
@ -0,0 +1,15 @@
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
DKSBase base = DKSBase();
|
||||
base.getDevices();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
83
test/testFFT.cpp
Normal file
83
test/testFFT.cpp
Normal file
@ -0,0 +1,83 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests" << endl;
|
||||
|
||||
int N = 2;
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
complex<double> *cdata = new complex<double>[N];
|
||||
complex<double> *cfft = new complex<double>[N];
|
||||
for (int i = 0; i < N; i++) {
|
||||
cdata[i] = complex<double>(0, 0);
|
||||
cfft[i] = complex<double>(0, 0);
|
||||
}
|
||||
|
||||
cdata[0] = complex<double>(1.73205, 1.73205);
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* write data to device */
|
||||
mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 1, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.pullData< complex<double> >(mem_ptr, cfft, N);
|
||||
|
||||
/* print results */
|
||||
|
||||
cout << "Data" << endl;
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << cdata[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
cout << "FFT" << endl;
|
||||
for (int i = 0; i < N; i++)
|
||||
cout << cfft[i] << "\t";
|
||||
cout << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
159
test/testFFT3D.cpp
Normal file
159
test/testFFT3D.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
/* usage - ./testFFT3D */
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 16;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 4) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, argv[3]);
|
||||
} else {
|
||||
N = 16;
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
int dim = 3;
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
base.setupFFT(3, dimsize);
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
|
||||
/* compare results */
|
||||
compareData(cdata, cifft, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Prints a 1D/2D/3D block of complex values to stdout.
// Extents are N along the last index, N along the middle index when dim > 1
// and N along the first index when dim > 2 (otherwise 1).
// With normalize == false each element is written as "real imag<TAB>";
// with normalize == true only real/N is written. A newline ends every row and
// an extra newline ends every outermost slice.
void printData(complex<double>* &data, int N, int dim, bool normalize) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int rowStart = i*ni*ni + j*nj;
      for (int k = 0; k < nk; ++k) {
        const complex<double> &value = data[rowStart + k];
        if (normalize)
          cout << value.real() / N << "\t";
        else
          cout << value.real() << " " << value.imag() << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
|
||||
|
||||
// Prints an N*N*N complex array as "real; imag<TAB>" entries.
// The middle index j selects the output line; within a line the first index i
// is the outer loop and the last index k the inner loop. Components whose
// magnitude is below 10e-5 are printed as 0. dim is not used.
void printData3DN4(complex<double>* &data, int N, int dim) {
  const double cutoff = 10e-5;

  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      const int rowStart = i*N*N + j*N;
      for (int k = 0; k < N; ++k) {
        double re = data[rowStart + k].real();
        double im = data[rowStart + k].imag();

        // flush values inside (-cutoff, cutoff) to exactly zero
        re = (re < cutoff && re > -cutoff) ? 0 : re;
        im = (im < cutoff && im > -cutoff) ? 0 : im;

        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
|
||||
// Sums |real difference| + |imaginary difference| over all elements of two
// complex arrays and prints "Size N CC <--> CC diff: <sum>".
// The extents follow dim exactly as in printData: N per used dimension.
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  double sum = 0;
  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int rowStart = i*ni*ni + j*nj;
      for (int k = 0; k < nk; ++k) {
        const complex<double> &lhs = data1[rowStart + k];
        const complex<double> &rhs = data2[rowStart + k];
        sum += fabs(lhs.real() - rhs.real());
        sum += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
}
|
||||
|
199
test/testFFT3DRC.cpp
Normal file
199
test/testFFT3DRC.cpp
Normal file
@ -0,0 +1,199 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||
void initData(double *data, int dimsize[3]);
|
||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
|
||||
void printHelp();
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N1 = 8;
|
||||
int N2 = 8;
|
||||
int N3 = 8;
|
||||
int dim = 3;
|
||||
int loop = 10;
|
||||
|
||||
if ( readParams(argc, argv, N1, N2, N3, loop) )
|
||||
return 0;
|
||||
|
||||
int dimsize[3] = {N3, N2, N1};
|
||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||
|
||||
double *rdata = new double[sizereal];
|
||||
double *outdata = new double[sizereal];
|
||||
complex<double> *cfft = new complex<double>[sizecomp];
|
||||
|
||||
for (int i=0; i<sizecomp; ++i) {
|
||||
cfft[i].real() = 7.;
|
||||
cfft[i].imag() = 3.33;
|
||||
}
|
||||
initData(rdata, dimsize);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
#ifdef DKS_MIC
|
||||
DKSBase base;
|
||||
base.setAPI("OpenMP", 6);
|
||||
base.setDevice("-mic", 4);
|
||||
base.initDevice();
|
||||
base.setupFFTRC(dim, dimsize);
|
||||
/* setup backward fft (COMPLEX->REAL) */
|
||||
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
||||
#endif
|
||||
|
||||
#ifdef DKS_CUDA
|
||||
DKSBase base;
|
||||
base.setAPI("Cuda", 4);
|
||||
base.setDevice("-gpu", 4);
|
||||
base.initDevice();
|
||||
base.setupFFT(dim, dimsize);
|
||||
#endif
|
||||
|
||||
// allocate memory on device
|
||||
int ierr;
|
||||
void *real_ptr, *comp_ptr, *real_res_ptr;
|
||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);
|
||||
|
||||
// execute one run before starting the timers
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||
|
||||
//timer for total loop time, FFT and IFFT calls
|
||||
struct timeval timeStart, timeEnd;
|
||||
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
|
||||
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
|
||||
|
||||
gettimeofday(&timeStart, NULL);
|
||||
for (int i=0; i<loop; ++i){
|
||||
|
||||
// write data to device
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
|
||||
// execute rcfft
|
||||
gettimeofday(&timeFFTStart[i], NULL);
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
gettimeofday(&timeFFTEnd[i], NULL);
|
||||
|
||||
// execute crfft
|
||||
gettimeofday(&timeIFFTStart[i], NULL);
|
||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||
gettimeofday(&timeIFFTEnd[i], NULL);
|
||||
|
||||
//normalize
|
||||
#ifdef DKS_CUDA
|
||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||
#endif
|
||||
|
||||
// read IFFT data from device
|
||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||
|
||||
}
|
||||
gettimeofday(&timeEnd, NULL);
|
||||
|
||||
// free device memory
|
||||
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
||||
base.freeMemory<double>(real_ptr, sizereal);
|
||||
base.freeMemory<double>(real_res_ptr, sizereal);
|
||||
|
||||
// compare in and out data to see if we get back the same results
|
||||
compareData(rdata, outdata, N1, N2, N3, dim);
|
||||
|
||||
//calculate seconds for total time and fft times
|
||||
double tfft = 0;
|
||||
double tifft = 0;
|
||||
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
|
||||
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
|
||||
|
||||
for (int i = 0; i < loop; i++) {
|
||||
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
|
||||
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
|
||||
|
||||
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
|
||||
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
|
||||
}
|
||||
|
||||
//print timing results
|
||||
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
|
||||
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
|
||||
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
|
||||
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
|
||||
<< "\n\n";
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Sums |data1[id] - data2[id]| over an NI x NJ x NK grid and prints
// "RC <--> CR diff: <sum>". Elements are addressed as k*NI*NJ + j*NI + i and
// visited with i as the outermost and k as the innermost loop. dim is not used.
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  const int planeStride = NI*NJ;
  double sum = 0;

  for (int i = 0; i < NI; ++i) {
    for (int j = 0; j < NJ; ++j) {
      const int inPlane = j*NI + i;
      for (int k = 0; k < NK; ++k) {
        const int id = k*planeStride + inPlane;
        sum += fabs(data1[id] - data2[id]);
      }
    }
  }

  std::cout << "RC <--> CR diff: " << sum << std::endl;
}
|
||||
|
||||
// Fills a dimsize[2] x dimsize[1] x dimsize[0] array (dimsize[0] is the
// fastest-varying extent) so that every element holds its own index along
// that fastest dimension.
void initData(double *data, int dimsize[3]) {
  const int nx = dimsize[0];
  const int ny = dimsize[1];
  const int nz = dimsize[2];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      const int rowStart = z*ny*nx + y*nx;
      for (int x = 0; x < nx; ++x)
        data[rowStart + x] = x;
    }
  }
}
|
||||
|
||||
// Prints the usage text of testFFT3DRC to stdout.
void printHelp() {
  std::cout << std::endl;

  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real"
            << "function on the Intel MIC.\n";
  std::cout << "Operations performed by testRC are: "
            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n";
  std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
            << "-loop $l\n";
  std::cout << "where $x $y $z are number of elements in each dimension and "
            << "$l is the number of times all the operations will be performed.\n";

  std::cout << std::endl;
}

// Parses the command line of testFFT3DRC.
//
//   -grid x y z   stores the three sizes in N1, N2, N3
//   -loop l       stores the iteration count in loop
//   -h | -help    prints the usage text
//
// Outputs are only modified for options that are present and complete.
// Returns true when the caller should exit without running the test (help was
// requested, or an option is missing its values); false otherwise.
// Unknown arguments are ignored.
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {

  for (int i = 1; i < argc; i++) {
    const std::string arg(argv[i]);

    if (arg == "-grid") {
      // all three values must exist before any of them is read
      if (i + 3 >= argc) {
        std::cerr << "-grid expects three values" << std::endl;
        printHelp();
        return true;
      }
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
      N3 = atoi(argv[i + 3]);
      i += 3;
    } else if (arg == "-loop") {
      if (i + 1 >= argc) {
        std::cerr << "-loop expects a value" << std::endl;
        printHelp();
        return true;
      }
      loop = atoi(argv[i + 1]);
      i += 1;
    } else if (arg == "-h" || arg == "-help") {
      printHelp();
      return true;
    }
  }

  return false;
}
|
220
test/testFFT3DRC_MIC.cpp
Normal file
220
test/testFFT3DRC_MIC.cpp
Normal file
@ -0,0 +1,220 @@
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
void printData3DN4(double* data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
void compareData(double* data1, double* data2, int N, int dim);
|
||||
|
||||
/* Returns (K*L) % M as a double. The product is formed in long long so it
   cannot overflow int before the remainder is taken. */
static double moda(int K, int L, int M)
{
  const long long product = static_cast<long long>(K) * L;
  return static_cast<double>(product % M);
}
|
||||
/* Initialize array x(N) to produce unit peaks at x(H) and x(N-H) */
|
||||
static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
|
||||
{
|
||||
double TWOPI = 6.2831853071795864769, phase, factor;
|
||||
int n1, n2, n3, S1, S2, S3, index;
|
||||
|
||||
/* Generalized strides for row-major addressing of x */
|
||||
S3 = 1;
|
||||
S2 = (N3/2+1)*2;
|
||||
S1 = N2*(N3/2+1)*2;
|
||||
|
||||
factor = ((N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0) ? 1.0 : 2.0;
|
||||
for (n1 = 0; n1 < N1; n1++)
|
||||
{
|
||||
for (n2 = 0; n2 < N2; n2++)
|
||||
{
|
||||
for (n3 = 0; n3 < N3; n3++)
|
||||
{
|
||||
phase = moda(n1,H1,N1) / N1;
|
||||
phase += moda(n2,H2,N2) / N2;
|
||||
phase += moda(n3,H3,N3) / N3;
|
||||
index = n1*S1 + n2*S2 + n3*S3;
|
||||
//cout << "index = " << index << endl;
|
||||
x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = atoi(argv[1]);
|
||||
int dim = 3;
|
||||
int dimsize[3] = {N, N, N};
|
||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||
int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];
|
||||
|
||||
//double *rdata = new double[sizereal];
|
||||
//double *outdata = new double[sizereal];
|
||||
//complex<double> *cfft = new complex<double>[sizecomp];
|
||||
double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
|
||||
double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double));
|
||||
complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
|
||||
|
||||
init_r(rdata, N,N,N);
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI("OpenMP", 6);
|
||||
base.setDevice("-mic", 4);
|
||||
base.initDevice();
|
||||
|
||||
/* setup forward fft (REAL->COMPLEX) */
|
||||
base.setupFFTRC(dim, dimsize);
|
||||
|
||||
int ierr;
|
||||
void *real_ptr, *comp_ptr;
|
||||
|
||||
/* allocate memory on device */;
|
||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
||||
|
||||
/* write data to device */
|
||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||
|
||||
//printData3DN4(rdata,N,3);
|
||||
|
||||
/* execute rcfft */
|
||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
|
||||
/* read FFT data from device */
|
||||
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||
base.writeData<double>(comp_ptr, cfft, sizereal);
|
||||
|
||||
|
||||
/* setup backward fft (COMPLEX->REAL) */
|
||||
base.setupFFTCR(dim, dimsize,1./(N*N*N));
|
||||
/* execute crfft */
|
||||
base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||
|
||||
/* normalize */
|
||||
//base.callNormalizeC2RFFT(real_ptr, dim, dimsize);
|
||||
|
||||
/* read FFT data from device */
|
||||
//base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
||||
|
||||
/* read IFFT data from device */
|
||||
base.readData<double>(real_ptr, outdata, sizereal);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
||||
base.freeMemory<double>(real_ptr, sizereal);
|
||||
|
||||
/* compare data */
|
||||
compareData(rdata, outdata, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Prints the real parts of a 1D/2D/3D block of complex values to stdout,
// tab-separated. Extents are N along the last index, N along the middle index
// when dim > 1 and N along the first index when dim > 2 (otherwise 1).
// With normalize == true every value is divided by N before printing.
// A newline ends every row and an extra newline ends every outermost slice.
void printData(complex<double>* &data, int N, int dim, bool normalize) {
  const int ni = (dim > 2) ? N : 1;
  const int nj = (dim > 1) ? N : 1;
  const int nk = N;

  for (int i = 0; i < ni; ++i) {
    for (int j = 0; j < nj; ++j) {
      const int rowStart = i*ni*ni + j*nj;
      for (int k = 0; k < nk; ++k) {
        const double re = data[rowStart + k].real();
        if (normalize)
          cout << re / N << "\t";
        else
          cout << re << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
|
||||
|
||||
/*
 * Print an N*N*N complex array as "real; imag" pairs, tab separated.
 *
 * The outer loop runs over the middle index j, so each output line holds
 * all (i, k) combinations for one j. Components whose value lies strictly
 * between -10e-5 and 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  const double eps = 10e-5;

  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      for (int k = 0; k < N; ++k) {
        const complex<double> &cell = data[i*N*N + j*N + k];
        double re = cell.real();
        double im = cell.imag();

        /* flush tiny values to zero to keep the printout readable */
        if (re > -eps && re < eps)
          re = 0;
        if (im > -eps && im < eps)
          im = 0;

        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
/*
 * Print an N*N*N array of doubles, tab separated.
 *
 * The outer loop runs over the middle index j, so each output line holds
 * all (i, k) combinations for one j. Values lying strictly between -10e-5
 * and 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(double* data, int N, int dim) {
  const double eps = 10e-5;

  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      const int base = i*N*N + j*N;
      for (int k = 0; k < N; ++k) {
        double value = data[base + k];

        /* flush tiny values to zero to keep the printout readable */
        if (value > -eps && value < eps)
          value = 0;

        cout << value << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
/*
 * Report the accumulated absolute difference between two complex arrays.
 *
 * The first N (dim <= 1), N*N (dim == 2) or N*N*N (dim > 2) elements are
 * visited in storage order; the absolute differences of the real parts and
 * of the imaginary parts are added to a running sum, which is printed as
 * "Size <N> CC <--> CC diff: <sum>".
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        /* row-major offset; planes and rows are either N or 1 */
        const int idx = (p * rows + r) * cols + c;
        total += fabs(data1[idx].real() - data2[idx].real());
        total += fabs(data1[idx].imag() - data2[idx].imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}
|
||||
|
||||
/*
 * Report the accumulated absolute difference between two arrays of doubles.
 *
 * The first N (dim <= 1), N*N (dim == 2) or N*N*N (dim > 2) elements are
 * visited in storage order and |data1 - data2| is added to a running sum,
 * which is printed as "Size <N> RC <--> CR diff: <sum>".
 */
void compareData(double* data1, double* data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        /* row-major offset; planes and rows are either N or 1 */
        const int idx = (p * rows + r) * cols + c;
        total += fabs(data1[idx] - data2[idx]);
      }
    }
  }

  cout << "Size " << N << " RC <--> CR diff: " << total << endl;
}
|
159
test/testFFT3DSO.cpp
Normal file
159
test/testFFT3DSO.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
|
||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
/* usage - ./testFFT3D */
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 16;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, "Cuda");
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 3) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc == 4) {
|
||||
N = atoi(argv[1]);
|
||||
strcpy(api_name, argv[2]);
|
||||
strcpy(device_name, argv[3]);
|
||||
} else {
|
||||
N = 16;
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
|
||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
int dim = 3;
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(device_name));
|
||||
base.initDevice();
|
||||
base.setupFFT(3, dimsize);
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
|
||||
/* compare results */
|
||||
compareData(cdata, cifft, N, dim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Print a complex array to stdout.
 *
 * data      - array holding at least N (dim <= 1), N*N (dim == 2) or
 *             N*N*N (dim > 2) elements, stored row-major
 * N         - number of elements along every dimension
 * dim       - number of dimensions to print
 * normalize - when false each element is printed as "real imag" followed by
 *             a tab; when true only real / N is printed followed by a tab
 *
 * A newline is written after every row and one more after every plane.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        /* row-major offset; planes and rows are either N or 1 */
        const complex<double> &cell = data[(p * rows + r) * cols + c];
        if (normalize) {
          cout << cell.real() / N << "\t";
          continue;
        }
        cout << cell.real() << " ";
        cout << cell.imag() << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
|
||||
|
||||
/*
 * Print an N*N*N complex array as "real; imag" pairs, tab separated.
 *
 * The outer loop runs over the middle index j, so each output line holds
 * all (i, k) combinations for one j. Components whose value lies strictly
 * between -10e-5 and 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  const double eps = 10e-5;

  for (int j = 0; j < N; ++j) {
    for (int i = 0; i < N; ++i) {
      for (int k = 0; k < N; ++k) {
        const complex<double> &cell = data[i*N*N + j*N + k];
        double re = cell.real();
        double im = cell.imag();

        /* flush tiny values to zero to keep the printout readable */
        if (re > -eps && re < eps)
          re = 0;
        if (im > -eps && im < eps)
          im = 0;

        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
|
||||
|
||||
/*
 * Report the accumulated absolute difference between two complex arrays.
 *
 * The first N (dim <= 1), N*N (dim == 2) or N*N*N (dim > 2) elements are
 * visited in storage order; the absolute differences of the real parts and
 * of the imaginary parts are added to a running sum, which is printed as
 * "Size <N> CC <--> CC diff: <sum>".
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        /* row-major offset; planes and rows are either N or 1 */
        const int idx = (p * rows + r) * cols + c;
        total += fabs(data1[idx].real() - data2[idx].real());
        total += fabs(data1[idx].imag() - data2[idx].imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}
|
||||
|
130
test/testFFT3DTiming.cpp
Normal file
130
test/testFFT3DTiming.cpp
Normal file
@ -0,0 +1,130 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <complex>
|
||||
|
||||
#include "Utility/TimeStamp.h"
|
||||
#include "DKSBase.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int N = 4;
|
||||
char *api_name = new char[10];
|
||||
char *device_name = new char[10];
|
||||
if (argc == 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, "-gpu");
|
||||
} else if (argc > 2) {
|
||||
strcpy(api_name, argv[1]);
|
||||
strcpy(device_name, argv[2]);
|
||||
N = atoi(argv[3]);
|
||||
} else {
|
||||
strcpy(api_name, "OpenCL");
|
||||
strcpy(device_name, "-gpu");
|
||||
}
|
||||
int dimsize[3] = {N, N, N};
|
||||
|
||||
|
||||
cout << "Use api: " << api_name << endl;
|
||||
|
||||
cout << "Begin DKS Base tests, N = " << N << endl;
|
||||
|
||||
complex<double> *cdata = new complex<double>[N*N*N];
|
||||
complex<double> *cfft = new complex<double>[N*N*N];
|
||||
complex<double> *cifft = new complex<double>[N*N*N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int k = 0; k < N; k++) {
|
||||
cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
|
||||
cfft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
cifft[i*N*N + j*N + k] = complex<double>(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
timestamp_t t0, t1;
|
||||
|
||||
/* init DKSBase */
|
||||
cout << "Init device and set function" << endl;
|
||||
DKSBase base;
|
||||
base.setAPI(api_name, strlen(api_name));
|
||||
base.setDevice(device_name, strlen(api_name));
|
||||
base.initDevice();
|
||||
|
||||
void *mem_ptr;
|
||||
int ierr;
|
||||
|
||||
/* run stest funct to init device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||
/* end test */
|
||||
|
||||
int steps = 10;
|
||||
base.oclClearEvents();
|
||||
t0 = get_timestamp();
|
||||
for (int i = 0; i < steps; i++) {
|
||||
|
||||
/* allocate memory on device */
|
||||
mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
|
||||
|
||||
/* write data to device */
|
||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||
|
||||
/* execute fft */
|
||||
base.callFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute ifft */
|
||||
base.callIFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* execute normalize */
|
||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
||||
|
||||
/* read data from device */
|
||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||
|
||||
/* free device memory */
|
||||
base.freeMemory< complex<double> >(mem_ptr, N);
|
||||
|
||||
//compareData(cdata, cifft, N, 3);
|
||||
}
|
||||
t1 = get_timestamp();
|
||||
|
||||
cout << "=========================" << endl;
|
||||
//base.oclEventInfo();
|
||||
cout << "Average total: " << get_secs(t0, t1) / steps << endl;
|
||||
cout << "=========================" << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Report the accumulated absolute difference between two complex arrays.
 *
 * The first N (dim <= 1), N*N (dim == 2) or N*N*N (dim > 2) elements are
 * visited in storage order; the absolute differences of the real parts and
 * of the imaginary parts are added to a running sum, which is printed as
 * "Size <N> CC <--> CC diff: <sum>".
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int p = 0; p < planes; ++p) {
    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c) {
        /* row-major offset; planes and rows are either N or 1 */
        const int idx = (p * rows + r) * cols + c;
        total += fabs(data1[idx].real() - data2[idx].real());
        total += fabs(data1[idx].imag() - data2[idx].imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user