23 Commits

Author SHA1 Message Date
24604042e7 Coulomb scattering and rot updated in cuda version 2017-09-19 13:36:42 +02:00
5c9048a308 added a macOS specific flag which deals with path variables of shared libs 2017-08-21 14:16:57 +02:00
ba701a5744 remove opencl event tracking since it causes memory leak 2017-08-18 14:11:35 +02:00
74b12ebdc0 release old OpenCL kernel before creating a new one 2017-08-18 13:13:55 +02:00
6d14df5b32 update chiSquare test functions to check the created kernel before execution 2017-08-17 16:57:30 +02:00
79833cf7f5 update work item size correctly for devices where supported size is smaller than DKS default 2017-08-17 16:56:57 +02:00
ce3491740c remove openmp flags for apple machines 2017-08-17 16:54:07 +02:00
fc6d2ccd4a update doxygen HTML output 2017-08-10 15:50:25 +02:00
d1774b84e8 added doxygen files 2017-08-10 14:58:12 +02:00
ccc4329bef updated documentation 2017-08-10 14:57:48 +02:00
7ca93a3a49 dont link with boost filesystem 2017-06-07 11:10:38 +02:00
aa14065994 Merge branch 'master' of gitlab.psi.ch:uldis_l/DKS 2017-06-06 14:24:20 +02:00
50ecb31042 remove any OpenCL calls when DKS compiled without OpenCL 2017-06-06 14:23:24 +02:00
3d130aa01f write CUDA libraries that are needed for linking in config file 2017-06-06 14:22:34 +02:00
5071ea5741 add option to link DKS with static cuda libraries 2017-06-06 14:22:00 +02:00
efe5f0db38 remove nvToolsExt for manual profiling 2017-05-30 11:07:22 +02:00
1d420504cc fix FFT test when OPAL and/or Musr modules are not compiled 2017-05-30 11:05:12 +02:00
cc59f550ab move patch version forward 2017-05-29 13:29:08 +02:00
d20fea2caa readme updated 2017-05-29 13:28:23 +02:00
8b7d824b3a updated documentation 2017-05-29 13:21:34 +02:00
2c9fe4ea6f fix for seperate FFT 2017-05-29 13:18:37 +02:00
e32f9aaff2 include FFT in DKSOPAL and DKSBaseMuSR 2017-05-29 12:49:32 +02:00
f3527969cb seperate FFT from DKSOPAL 2017-05-29 09:39:25 +02:00
57 changed files with 4019 additions and 1053 deletions

View File

@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS) PROJECT (DKS)
SET (DKS_VERSION_MAJOR 1) SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 1) SET (DKS_VERSION_MINOR 1)
SET (DKS_VERSION_PATCH 1) SET (DKS_VERSION_PATCH 2)
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH}) set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
SET (PACKAGE \"dks\") SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\") SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -10,6 +10,9 @@ SET (PACKAGE_NAME \"DKS\")
SET (PACKAGE_TARNAME \"dks\") SET (PACKAGE_TARNAME \"dks\")
SET (DKS_VERSION_STR "\"${DKS_VERSION}\"") SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
if (APPLE)
SET (CMAKE_MACOSX_RPATH TRUE)
endif (APPLE)
#get compiler name #get compiler name
#STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER}) #STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
@ -28,11 +31,13 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
set (BOOSTROOT $ENV{BOOST_DIR}) set (BOOSTROOT $ENV{BOOST_DIR})
SET (Boost_USE_STATIC_LIBS OFF) SET (Boost_USE_STATIC_LIBS OFF)
SET (Boost_USE_STATIC_RUNTIME OFF) SET (Boost_USE_STATIC_RUNTIME OFF)
FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system) #FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
FIND_PACKAGE(Boost 1.41 REQUIRED)
IF (Boost_FOUND) IF (Boost_FOUND)
MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}") MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}") MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}") #MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS}) INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND) ENDIF (Boost_FOUND)
@ -79,7 +84,7 @@ OPTION (USE_UQTK "Use UQTK" OFF)
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
#for intel compiler turn on openmp and opencl #for intel compiler turn on openmp and opencl
OPTION (USE_OPENCL "Use OpenCL" ON) OPTION (USE_OPENCL "Use OpenCL" OFF)
OPTION (USE_CUDA "Use CUDA" OFF) OPTION (USE_CUDA "Use CUDA" OFF)
OPTION (USE_MIC "Use intel MIC" ON) OPTION (USE_MIC "Use intel MIC" ON)
@ -110,18 +115,30 @@ IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
#gnu copmpiler specific flags #gnu copmpiler specific flags
IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
OPTION (USE_OPENCL "Use OpenCL" ON) OPTION (USE_OPENCL "Use OpenCL" OFF)
OPTION (USE_CUDA "Use CUDA" OFF) OPTION (USE_CUDA "Use CUDA" OFF)
OPTION (USE_MIC "Use intel MIC" OFF) OPTION (USE_MIC "Use intel MIC" OFF)
OPTION (STATIC_CUDA "Link static cuda libraries" OFF)
IF (ENABLE_MUSR)
SET (USE_OPENCL ON)
ENDIF (ENABLE_MUSR)
#dont set openmp flag for apple devices
IF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu") SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
ELSE ($CMAKE_C_COMPILER_ID} STREQUAL "GNU")
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -std=c++11 -D__wsu")
ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
FIND_PACKAGE(CUDA) FIND_PACKAGE(CUDA)
IF (CUDA_FOUND) IF (CUDA_FOUND)
SET (USE_CUDA ON) SET (USE_CUDA ON)
OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64) LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs) LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -131,20 +148,27 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}") MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
SET(CUDA_PROPAGATE_HOST_FLAGS OFF) SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA") SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")
SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false") SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -DDEBUG -std=c++11 -D__wsu")
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}") IF (NOT STATIC_CUDA)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
ELSE (NOT STATIC_CUDA)
SET (CUDA_SEPARABLE_COMPILATION ON)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
ENDIF (NOT STATIC_CUDA)
#if cuda version >= 7.0 add runtime commpilation flags #if cuda version >= 7.0 add runtime commpilation flags
IF (NOT CUDA_VERSION VERSION_LESS "7.0") IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda") SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}") MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
#set(CUDA_SEPARABLE_COMPILATION ON)
SET(BUILD_SHARED_LIBS OFF) SET(BUILD_SHARED_LIBS OFF)
ENDIF (CUDA_FOUND) ENDIF (CUDA_FOUND)
@ -154,6 +178,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
MESSAGE(STATUS "CUDA not found, looking for OpenCL") MESSAGE(STATUS "CUDA not found, looking for OpenCL")
FIND_PACKAGE(OpenCL) FIND_PACKAGE(OpenCL)
MESSAGE("after FIND_PACKAGE(OpenCL): version: ${OpenCL_VERSION_STRING}")
MESSAGE("after FIND_PACKAGE(OpenCL): inc dir: ${OpenCL_INCLUDE_DIR}")
MESSAGE("after FIND_PACKAGE(OpenCL): lib dir: ${OpenCL_LIBRARY}")
IF (OpenCL_FOUND) IF (OpenCL_FOUND)
MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}") MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}") MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
@ -171,9 +198,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
ENDIF(APPLE AND NOT CUDA_FOUND) ENDIF(APPLE AND NOT CUDA_FOUND)
#if cuda found set cuda opencl flags #if cuda found set cuda opencl flags
IF (CUDA_FOUND) IF (CUDA_FOUND AND USE_OPENCL)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL") SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
ENDIF (CUDA_FOUND) ENDIF (CUDA_FOUND AND USE_OPENCL)
#if cuda not found but amd opencl found set opencl flags #if cuda not found but amd opencl found set opencl flags
IF (NOT CUDA_FOUND AND OpenCL_FOUND) IF (NOT CUDA_FOUND AND OpenCL_FOUND)
@ -185,7 +212,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI") SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
ENDIF (${COMPILER_NAME} STREQUAL "mpicxx") ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}") SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}") MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")

2356
Doxyfile Normal file

File diff suppressed because it is too large Load Diff

View File

@ -29,30 +29,30 @@ Intel MIC compilers (optional)
######Source###### ######Source######
https://gitlab.psi.ch/uldis_l/DKS https://gitlab.psi.ch/uldis_l/DKS
######Changes from DKS-1.0.x version######
DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
By default only DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary otion needs to be enabled.
Supported options are:
-DENABLE_OPAL option should be enabled if DKS will be used for OPAL
-DENABLE_MUSR option should be enable if DKS will be used for musrfit
-DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
See install instructions for more details on how to enable the necessary options in DKS
######Install###### ######Install######
#consult the https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install isntructions
#clone DKS #clone DKS
git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
#set compilers to use #switch to the desired version (OPTIONAL)
#supported c++ compilers: g++, icpc, mpicxx whith g++ git checkout DKS-1.1.0
#supported c compilers: gcc, icc, mpicc whith gcc
export CXX_COMPILER=cpp_compiler_name
export CC_COMPILER=c_compiler_name
#set dks root directory directory #configure installation in build directory
cd DKS #enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
export DKS_ROOT = $PWD CXX=<c++ compiler> CC=<c compiler> -DCMAKE_INSTALL_PREFIX=<install dir> <path to DKS source> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1]
#set build directory
mkdir $DKS_BUILD_DIR
cd $DKS_BUILD_DIR
#set install directory
export DKS_INSTALL_DIR = $DKS_BUILD_DIR #default is /usr/local/
CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
#install DKS
make make
make install make install

View File

@ -4,28 +4,30 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#chi square kernel tests #chi square kernel tests
IF (ENABLE_MUSR) IF (ENABLE_MUSR)
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})
IF (USE_UQTK) IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp) ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK) ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions #test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp) ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
ENDIF (ENABLE_MUSR) ENDIF (ENABLE_MUSR)
IF (ENABLE_OPAL) IF (ENABLE_OPAL)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testPushKick testPushKick.cpp) ADD_EXECUTABLE(testPushKick testPushKick.cpp)
TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
ENDIF(ENABLE_OPAL) ENDIF(ENABLE_OPAL)
ADD_EXECUTABLE(testFFT testFFT.cpp)
TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})

View File

@ -293,6 +293,9 @@ int runTest(const char *api_name, const char *device_name, bool autotune, bool m
if (autotune) if (autotune)
dksbase.setAutoTuningOn(); dksbase.setAutoTuningOn();
//check kernel
dksbase.checkMuSRKernels(1);
//tmp values to store results and tmp values for time steps and start time //tmp values to store results and tmp values for time steps and start time
double result_gpu = 0.0; double result_gpu = 0.0;
double result_cpu = 0.0; double result_cpu = 0.0;
@ -373,11 +376,11 @@ int main(int argc, char* argv[]) {
} }
int numPlatforms = 2; int numPlatforms = 3;
const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"}; const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"}; const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
for (int i = 0; i < numPlatforms; i++) { for (int i = 2; i < numPlatforms; i++) {
runTest(api[i], device[i], autotune, mlh, asym); runTest(api[i], device[i], autotune, mlh, asym);
} }

View File

@ -392,7 +392,10 @@ int main(int argc, char *argv[]) {
dksbase.setAPI(api_name); dksbase.setAPI(api_name);
dksbase.setDevice(device_name); dksbase.setDevice(device_name);
std::cout << "Init device" << std::endl;
dksbase.initDevice(); dksbase.initDevice();
std::cout << "Init chi square" << std::endl;
dksbase.initChiSquare(Ndata, np, nf, nm); dksbase.initChiSquare(Ndata, np, nf, nm);
dksbase.writeParams(p, np); dksbase.writeParams(p, np);
@ -401,20 +404,24 @@ int main(int argc, char *argv[]) {
dksbase.callSetConsts(N0, TAU, BKG); dksbase.callSetConsts(N0, TAU, BKG);
std::cout << "Compile program" << std::endl;
dksbase.callCompileProgram(sfunc); dksbase.callCompileProgram(sfunc);
dksbase.checkMuSRKernels(1);
if (autotune) if (autotune)
dksbase.setAutoTuningOn(); dksbase.setAutoTuningOn();
int oper = 0; //std::cout << "Get operations" << std::endl;
dksbase.getOperations(oper); //int oper = 0;
//dksbase.getOperations(oper);
cout << "=========================BEGIN TEST=========================" << endl; cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl; cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of params: " << np << endl; cout << "Number of params: " << np << endl;
cout << "Number of maps: " << nm << endl; cout << "Number of maps: " << nm << endl;
cout << "Number of predefined functions: " << nfunc << endl; cout << "Number of predefined functions: " << nfunc << endl;
cout << "Number of ptx instructions: " << oper << endl; //cout << "Number of ptx instructions: " << oper << endl;
cout << "------------------------------------------------------------" << endl; cout << "------------------------------------------------------------" << endl;
cout << sfunc << endl; cout << sfunc << endl;
cout << "------------------------------------------------------------" << endl; cout << "------------------------------------------------------------" << endl;

214
auto-tuning/testFFT.cpp Normal file
View File

@ -0,0 +1,214 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSFFT.h"
using namespace std;
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
void compareData(double* data1, double *data2, int N, int dim);
void initData(complex<double> *data, int dimsize[3], int dim);
void initData(double *data, int dimsize[3], int dim);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
char *api_name, char *device_name);
void printHelp();
int main(int argc, char *argv[]) {
int ierr;
int N1 = 8;
int N2 = 8;
int N3 = 8;
int dim = 3;
char *api_name = new char[10];
char *device_name = new char[10];
if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
return 0;
cout << "Use api: " << api_name << ", " << device_name << endl;
int dimsize[3] = {N1, N2, N3};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal];
double *ordata = new double[sizereal];
complex<double> *cdata = new complex<double>[sizereal];
complex<double> *codata = new complex<double>[sizereal];
initData(rdata, dimsize, 3);
initData(cdata, dimsize, 3);
/* init DKSBase */
cout << "Init device and set function" << endl;
DKSFFT base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(device_name));
cout << "init device" << endl;
base.initDevice();
cout << "setup fft" << endl;
base.setupFFT(dim, dimsize);
//Test RC FFT -> CR FFT
void *real_ptr, *comp_ptr, *res_ptr;
cout << "allocate memory" << endl;
real_ptr = base.allocateMemory<double>(sizereal, ierr);
res_ptr = base.allocateMemory<double>(sizereal, ierr);
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
cout << "write data" << endl;
base.writeData<double>(real_ptr, rdata, sizereal);
cout << "perform fft" << endl;
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
cout << "read data" << endl;
base.readData<double>(res_ptr, ordata, sizereal);
compareData(rdata, ordata, N1, 3);
base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(res_ptr, sizereal);
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
//Test CC FFT
void *mem_ptr;
mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
base.callFFT(mem_ptr, 3, dimsize);
base.callIFFT(mem_ptr, 3, dimsize);
base.callNormalizeFFT(mem_ptr, 3, dimsize);
base.readData< complex<double> >(mem_ptr, codata, sizereal);
compareData(cdata, codata, N1, 3);
base.freeMemory< complex<double> > (mem_ptr, sizereal);
delete[] rdata;
delete[] ordata;
delete[] cdata;
delete[] codata;
}
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim) {
int ni, nj, nk, id;
ni = (dim > 2) ? N : 1;
nj = (dim > 1) ? N : 1;
nk = N;
double sum = 0;
for (int i = 0; i < ni; i++) {
for (int j = 0; j < nj; j++) {
for (int k = 0; k < nk; k++) {
id = i*ni*ni + j*nj + k;
sum += fabs(data1[id].real() - data2[id].real());
sum += fabs(data1[id].imag() - data2[id].imag());
}
}
}
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
}
void compareData(double* data1, double* data2, int N, int dim) {
int ni, nj, nk, id;
ni = (dim > 2) ? N : 1;
nj = (dim > 1) ? N : 1;
nk = N;
double sum = 0;
for (int i = 0; i < ni; i++) {
for (int j = 0; j < nj; j++) {
for (int k = 0; k < nk; k++) {
id = i*ni*ni + j*nj + k;
sum += fabs(data1[id] - data2[id]);
}
}
}
cout << "Size " << N << " RC <--> CR diff: " << sum << endl;
}
void initData(complex<double> *data, int dimsize[3], int dim) {
if (dim == 3) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) {
data[j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
}
}
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = complex<double>(sin(k), 0.0);
}
}
void initData(double *data, int dimsize[3], int dim) {
if (dim == 3) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) {
data[j*dimsize[0] + k] = sin(k);
}
}
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = sin(k);
}
}
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
char *api_name, char *device_name)
{
for (int i = 1; i < argc; i++) {
if ( argv[i] == std::string("-dim")) {
dim = atoi(argv[i + 1]);
i++;
}
if ( argv[i] == std::string("-grid") ) {
N1 = atoi(argv[i + 1]);
N2 = atoi(argv[i + 2]);
N3 = atoi(argv[i + 3]);
i += 3;
}
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
}
return false;
}

View File

@ -3,5 +3,7 @@ SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib") SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
SET(${PROJECT_NAME}_LIBRARY "dks") SET(${PROJECT_NAME}_LIBRARY "dks")
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH}) SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
SET(DKS_CUDA_STATIC ${STATIC_CUDA})
SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
SET(DKS_VERSION ${DKS_VERSION}) SET(DKS_VERSION ${DKS_VERSION})
SET(DKS_VERSION_STR ${DKS_VERSION_STR}) SET(DKS_VERSION_STR ${DKS_VERSION_STR})

Binary file not shown.

View File

@ -15,6 +15,9 @@
class DKSBaseMuSR; class DKSBaseMuSR;
/**
* Interface to implement ChiSquareRuntime class for musrfit.
*/
class ChiSquareRuntime { class ChiSquareRuntime {
friend class DKSBaseMuSR; friend class DKSBaseMuSR;
@ -63,23 +66,54 @@ public:
/** Default constructor */ /** Default constructor */
//ChiSquareRuntime(); //ChiSquareRuntime();
/** Default destructor */ /** Default destructor. */
virtual ~ChiSquareRuntime() { }; virtual ~ChiSquareRuntime() { };
/**
* Compile GPU programm generated at runtime.
*/
virtual int compileProgram(std::string function, bool mlh = false) = 0; virtual int compileProgram(std::string function, bool mlh = false) = 0;
/**
* Launche the compiled chiSquare kernel.
*/
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap, int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double timeStart, double timeStep,
double &result) = 0; double &result) = 0;
/**
* Write the parameter values to the GPU.
*/
virtual int writeParams(const double *params, int numparams) = 0; virtual int writeParams(const double *params, int numparams) = 0;
/**
* Write the function values to the GPU.
*/
virtual int writeFunc(const double *func, int numfunc) = 0; virtual int writeFunc(const double *func, int numfunc) = 0;
/**
* Write map values to the GPU.
*/
virtual int writeMap(const int *map, int nummap) = 0; virtual int writeMap(const int *map, int nummap) = 0;
/**
* Allocate temporary memory needed for the chi square calucaltios on the device.
*/
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0; virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
/**
* Free device memory allocated for chi square calculations.
*/
virtual int freeChiSquare() = 0; virtual int freeChiSquare() = 0;
/**
* Check if available device can run the chi square GPU code.
*/
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0; virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
/** Set N0, tau and bgk values to use for the kernel. /**
* Set N0, tau and bgk values to use for the kernel.
* If values changes between data sets this needs to be called before * If values changes between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS. * every kernel call. Returns DKS_SUCCESS.
*/ */
@ -91,7 +125,8 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Set alpha and beta values to use for the kernel. /**
* Set alpha and beta values to use for the kernel.
* If values changes between data sets this needs to be called before * If values changes between data sets this needs to be called before
* every kernel call. Returns DKS_SUCCESS. * every kernel call. Returns DKS_SUCCESS.
*/ */
@ -101,7 +136,8 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Set number of blocks and threads. /**
* Set number of blocks and threads.
* Used to set parameters obtained from auto-tuning * Used to set parameters obtained from auto-tuning
*/ */
int setKernelParams(int numBlocks, int blockSize) { int setKernelParams(int numBlocks, int blockSize) {
@ -118,7 +154,8 @@ public:
return ierr; return ierr;
} }
/** Get the number of operations in compiled kernel. /**
* Get the number of operations in compiled kernel.
* Count the number of operation in the ptx file for the compiled program. * Count the number of operation in the ptx file for the compiled program.
*/ */
int getOperations(int &oper) { int getOperations(int &oper) {

View File

@ -5,6 +5,9 @@
#include <string> #include <string>
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/**
* Interface to impelment particle matter interaction for OPAL.
*/
class DKSCollimatorPhysics { class DKSCollimatorPhysics {
protected: protected:
@ -16,28 +19,60 @@ public:
virtual ~DKSCollimatorPhysics() { } virtual ~DKSCollimatorPhysics() { }
/**
* Execute collimator physics kernel.
*
*/
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
bool enableRutherforScattering = true) = 0; bool enableRutherforScattering = true) = 0;
/**
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles) = 0; void *par_ptr, int numparticles) = 0;
/**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array
*/
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0; virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
/**
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback) = 0; void *par_ptr, int numparticles, int &numaddback) = 0;
/**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0; double dt, double c, bool usedt = false, int streamId = -1) = 0;
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0; double mass, int npart, double c, int streamId = -1) = 0;
/**
* BorisPusher push function with transformto function form OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false, double dt, double c, bool usedt = false,

View File

@ -6,12 +6,21 @@
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
class DKSFFT { /**
* Abstract class defining methods for DKS FFT class.
* Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
*/
class BaseFFT {
protected: protected:
int defaultN[3]; int defaultN[3];
int defaultNdim; int defaultNdim;
/**
* Check if FFT plan is created for the needed dimension and FFT size.
* Returns true if the plan has been created and false if no plan for specified dimension
* and size exists.
*/
bool useDefaultPlan(int ndim, int N[3]) { bool useDefaultPlan(int ndim, int N[3]) {
if (ndim != defaultNdim) if (ndim != defaultNdim)
return false; return false;
@ -22,20 +31,59 @@ protected:
public: public:
virtual ~DKSFFT() { } virtual ~BaseFFT() { }
/** Setup FFT - init FFT library used by chosen device. */
virtual int setupFFT(int ndim, int N[3]) = 0; virtual int setupFFT(int ndim, int N[3]) = 0;
/** Setup real to complex FFT - init FFT library used by chosen device. */
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
/** Setup real to complex complex to real FFT - init FFT library used by chosen device. */
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0; virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
/** Clean up. */
virtual int destroyFFT() = 0; virtual int destroyFFT() = 0;
/**
* Exectute C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeFFT(void * mem_ptr, int ndim, int N[3], virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
int streamId = -1, bool forward = true) = 0; int streamId = -1, bool forward = true) = 0;
/**
* Exectute inverse C2C FFT.
* mem_ptr - memory ptr on the device for complex data.
* Performs in place FFT.
*/
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Normalize the FFT or IFFT.
* mem_ptr - memory to complex data.
*/
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
/**
* Exectute R2C FFT.
* real_ptr - real input data for FFT, comp_ptr - memory on the device where
* results for the FFT are stored as complex numbers.
*/
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0; int streamId = -1) = 0;
/**
* Exectute C2R FFT.
* real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
*/
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) = 0; int streamId = -1) = 0;
/**
* Normalize CR FFT.
*/
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0; virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
}; };

View File

@ -4,24 +4,27 @@
#include <iostream> #include <iostream>
#include <cmath> #include <cmath>
/**
* Interface to implement Greens function calculations for OPAL.
*/
class GreensFunction { class GreensFunction {
public: public:
virtual ~GreensFunction() { } virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */ /** calc greens integral, as defined in OPAL. */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0; double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */ /** integration if rho2_m, see OPAL for more details. */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0; int streamId = -1) = 0;
/** mirror rho2_m field */ /** mirror rho2_m field. */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0; virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */ /** multiply two complex fields from device memory. */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0; virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
}; };

View File

@ -5,17 +5,22 @@
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
/** Struct to hold voxel position for PET image. */
struct VoxelPosition { struct VoxelPosition {
float x; float x;
float y; float y;
float z; float z;
}; };
/** Struct that holds pair of detectors that registered an envent. */
struct ListEvent { struct ListEvent {
unsigned detA : 16; unsigned detA : 16;
unsigned detB : 16; unsigned detB : 16;
}; };
/**
* Interface to implement PET image reconstruction.
*/
class ImageReconstruction { class ImageReconstruction {
protected: protected:
@ -25,7 +30,8 @@ public:
virtual ~ImageReconstruction() { } virtual ~ImageReconstruction() { }
/** Caluclate source. /**
* Caluclate source.
* Places a sphere at each voxel position and calculate the avg value and std value of pixels * Places a sphere at each voxel position and calculate the avg value and std value of pixels
* that are inside this sphere. All the sphere used have the same diameter. * that are inside this sphere. All the sphere used have the same diameter.
*/ */
@ -33,7 +39,8 @@ public:
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Calculate background. /**
* Calculate background.
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter. * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
@ -42,7 +49,8 @@ public:
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Caluclate source using differente sources. /**
* Caluclate source using differente sources.
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* each sphere is given by *diameter array. * each sphere is given by *diameter array.
@ -52,7 +60,7 @@ public:
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** /**
* Places two sphere at each voxel position, calculates the avg value and std value of pixels * Places two sphere at each voxel position, calculates the avg value and std value of pixels.
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
* smaller sphere. * smaller sphere.
@ -61,7 +69,8 @@ public:
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0) = 0; int total_sources, int start = 0) = 0;
/** Generate normalization. /**
* Generate normalization.
* Goes trough detectors pairs and if detector pair crosses image launches seperate kernel * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
* that updates voxel values in the image on the slope between these two detectors. * that updates voxel values in the image on the slope between these two detectors.
*/ */
@ -69,14 +78,16 @@ public:
void *det_position, int total_det) = 0; void *det_position, int total_det) = 0;
/** Calculate forward projection. /**
* Calculate forward projection.
* For image reconstruction calculates forward projections. * For image reconstruction calculates forward projections.
* see recon.cpp for details * see recon.cpp for details
*/ */
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events) = 0; void *image_position, int num_events) = 0;
/** Calculate backward projection. /**
* Calculate backward projection.
* For image reconstruction calculates backward projections. * For image reconstruction calculates backward projections.
* see recon.cpp for details * see recon.cpp for details
*/ */
@ -84,29 +95,29 @@ public:
void *det_position, void *image_position, void *det_position, void *image_position,
int num_events, int num_voxels) = 0; int num_events, int num_voxels) = 0;
/** Set the voxel dimensins on device. /**
* *Set the voxel dimensins on device.
*/ */
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0; virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
/** Set the image edge variables on the device. /**
* * Set the image edge variables on the device.
*/ */
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0; virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
/** Set the image edge1 on the device. /**
* * Set the image edge1 on the device.
*/ */
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0; virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
/** Set the minimum crystan in one ring values on the device. /**
* * Set the minimum crystan in one ring values on the device.
*/ */
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
float min_CrystalDist_InOneRing1) = 0; float min_CrystalDist_InOneRing1) = 0;
/** Set all other required parameters for reconstruction. /**
* * Set all other required parameters for reconstruction.
*/ */
virtual int setParams(float matrix_distance_factor, float phantom_diameter, virtual int setParams(float matrix_distance_factor, float phantom_diameter,
float atten_per_mm, float ring_diameter) = 0; float atten_per_mm, float ring_diameter) = 0;

View File

@ -18,6 +18,17 @@
typedef std::vector<Parameter> Parameters; typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States; typedef std::vector<State> States;
/**
* DKS autotuning class, allows to auto-tune the defince function.
* Executes the defined function for auto-tuning and searches for optimal parameters to improve
* the function execution time. The function that is auto-tuned, parameters and the ranges
* need to be set. Includes multiple search methods, that searches the parameter space to finde
* the optimal solution.
* 1) exaustive search
* 2) line search
* 3) hill climbimg
* 4) simulated annealing
*/
class DKSAutoTuning { class DKSAutoTuning {
private: private:
@ -36,10 +47,11 @@ private:
int loops_m; int loops_m;
/** Update parameters from a state */ /** Update parameters from a state. */
int setParameterValues(States states); int setParameterValues(States states);
/** Evaluate the function and set execution time /**
* Evaluate the function and set execution time
* Returns DKS_ERROR if errors occured during function execution. * Returns DKS_ERROR if errors occured during function execution.
* Returns DKS_SUCCESS if function executed as planned. * Returns DKS_SUCCESS if function executed as planned.
*/ */
@ -50,10 +62,11 @@ public:
/** Constructor */ /** Constructor */
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100); DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
/** Destructor */ /** Destructor. */
~DKSAutoTuning(); ~DKSAutoTuning();
/** Set function to auto tune. /**
* Set function to auto tune.
* Caller of setFunction is responsible to bind the correct parameters * Caller of setFunction is responsible to bind the correct parameters
* to the function with std::bind. * to the function with std::bind.
*/ */
@ -63,13 +76,19 @@ public:
evaluate_time_m = evaluate_time; evaluate_time_m = evaluate_time;
} }
/**
* Set function to auto tune.
* Caller of setFunction is responsible to bind the correct parameters
* to the function with std::bind.
*/
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) { void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
fd_m = f; fd_m = f;
function_name_m = name; function_name_m = name;
evaluate_time_m = evaluate_time; evaluate_time_m = evaluate_time;
} }
/** Set parameter for auto tuning. /**
* Set parameter for auto tuning.
* Provide a pointer to a parameter that will be changed during auto-tuning * Provide a pointer to a parameter that will be changed during auto-tuning
* and a min-max value for this element * and a min-max value for this element
*/ */
@ -85,9 +104,9 @@ public:
/** Perform exaustive search evaluating all the parameter configurations */ /** Perform exaustive search evaluating all the parameter configurations */
void exaustiveSearch(); void exaustiveSearch();
/** Perform auto-tuning. /**
* Perform line-search auto-tuning by variying parameters one at a time and keeping other * Perform line-search auto-tuning by variying parameters one at a time.
* parameters constant. * After one parameter is auto-tuned the next on is varied
*/ */
void lineSearch(); void lineSearch();

View File

@ -4,6 +4,7 @@
#include <iostream> #include <iostream>
#include <cmath> #include <cmath>
/** Tester class for auto-tuning search algorithms. */
class DKSAutoTuningTester { class DKSAutoTuningTester {
friend class DKSBaseMuSR; friend class DKSBaseMuSR;

View File

@ -1,9 +1,3 @@
/** Class to save and load DKS autotunning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
*/
#ifndef DKS_CONFIG #ifndef DKS_CONFIG
#define DKS_CONFIG #define DKS_CONFIG
@ -11,7 +5,7 @@
#include <boost/optional/optional.hpp> #include <boost/optional/optional.hpp>
#include <boost/property_tree/xml_parser.hpp> #include <boost/property_tree/xml_parser.hpp>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/filesystem.hpp> //#include <boost/filesystem.hpp>
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
@ -24,11 +18,18 @@
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
namespace pt = boost::property_tree; namespace pt = boost::property_tree;
namespace fs = boost::filesystem; //namespace fs = boost::filesystem;
const std::string config_dir = "/.config/DKS"; const std::string config_dir = "/.config/DKS";
const std::string config_file = "/autotuning.xml"; const std::string config_file = "/autotuning.xml";
/** Class to save and load DKS autotunning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
* TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
* so the auto-tuning has no effect.
*/
class DKSConfig { class DKSConfig {
private: private:

View File

@ -9,6 +9,9 @@
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE }; enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
/**
* Parameter class allows to change the searchable parameters during the auto-tuning.
*/
class Parameter { class Parameter {
private: private:
@ -64,6 +67,10 @@ public:
}; };
/**
* Struct to hold a auto-tuning state.
* Holds the current value, min, max and a step to witch a state can change.
*/
struct State { struct State {
double value; double value;
double min; double min;
@ -74,6 +81,12 @@ struct State {
typedef std::vector<Parameter> Parameters; typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States; typedef std::vector<State> States;
/**
* Used by auto-tuning search algorithms to move between parameter configurations.
* Allows to move from one parameter stat to another, get neighboring states,
* move to neighboring states and save state information. Print functions are available
* for debugging purposes, to follow how algorithm muves between sates.
*/
class DKSSearchStates { class DKSSearchStates {
private: private:

View File

@ -35,12 +35,12 @@ ENDMACRO ()
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
DKSBase.h DKSBase.h
DKSDefinitions.h DKSDefinitions.h
DKSOPAL.h DKSFFT.h
) )
SET (DKS_BASEDIR_SRCS SET (DKS_BASEDIR_SRCS
DKSBase.cpp DKSBase.cpp
DKSOPAL.cpp DKSFFT.cpp
) )
#add opal to DKS if enable_opal is set #add opal to DKS if enable_opal is set
@ -112,26 +112,18 @@ IF (USE_CUDA)
CUDA_ADD_LIBRARY(dks ${DKS_SRCS}) CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
IF (USE_UQTK) TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) #TARGET_LINK_LIBRARIES(dks)
ELSE (USE_UQTK) #TARGET_LINK_LIBRARIES(dksshared)
TARGET_LINK_LIBRARIES(dks cudadevrt)
TARGET_LINK_LIBRARIES(dksshared cudadevrt)
ENDIF (USE_UQTK)
ELSE (USE_CUDA) ELSE (USE_CUDA)
MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}") MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
ADD_LIBRARY(dks ${DKS_SRCS}) ADD_LIBRARY(dks ${DKS_SRCS})
ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
IF (USE_UQTK)
TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ELSE (USE_UQTK)
TARGET_LINK_LIBRARIES(dks) TARGET_LINK_LIBRARIES(dks)
TARGET_LINK_LIBRARIES(dksshared) TARGET_LINK_LIBRARIES(dksshared)
ENDIF(USE_UQTK)
ENDIF (USE_CUDA) ENDIF (USE_CUDA)

View File

@ -1,9 +1,9 @@
SET (_HDRS CudaBase.cuh) SET (_HDRS CudaBase.cuh CudaFFT.cuh)
SET (_SRCS CudaBase.cu) SET (_SRCS CudaBase.cu CudaFFT.cu)
IF (ENABLE_OPAL) IF (ENABLE_OPAL)
SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh) SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu) SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
ENDIF (ENABLE_OPAL) ENDIF (ENABLE_OPAL)
IF (ENABLE_MUSR) IF (ENABLE_MUSR)

View File

@ -342,62 +342,3 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/*
Info: allcate memory and write data (push)
Return: pointer to memory object
*/
/*
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
void * mem_ptr;
mem_ptr = cuda_allocateMemory(size, ierr);
if (ierr == DKS_SUCCESS)
ierr = cuda_writeData(mem_ptr, in_data, size);
return mem_ptr;
}
*/
/*
Info: read data and free memory (pull)
Return: success or error code
*/
/*
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
ierr = cuda_readData(mem_ptr, out_data, size);
if (ierr == DKS_SUCCESS)
ierr = cuda_freeMemory(mem_ptr);
else
return DKS_ERROR;
if (ierr == DKS_SUCCESS)
return DKS_SUCCESS;
else
return DKS_ERROR;
}
*/
/*
Info: execute function
Return: success or error code
*/
int CudaBase::cuda_executeFunction() {
std::cout << "Execute function" << std::endl;
return DKS_SUCCESS;
}
/*
Info: clean up
Return: success or error code
*/
int CudaBase::cuda_cleanUp() {
std::cout << "clean up" << std::endl;
return DKS_SUCCESS;
}

View File

@ -12,11 +12,15 @@
#include <cufft.h> #include <cufft.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <curand_kernel.h> #include <curand_kernel.h>
#include <nvToolsExt.h>
#include <time.h> #include <time.h>
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
/**
* CUDA base class handles device setup and basic communication with the device.
* Handles devicew setup, memory manegement, data transfers and stream setup for
* asynchronous data transfers and kernel executions.
*/
class CudaBase { class CudaBase {
private: private:
@ -53,13 +57,13 @@ public:
*/ */
int cuda_deleteCurandStates(); int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array /**
* * Create 'size' random numbers on the device and save in mem_ptr array.
*/ */
int cuda_createRandomNumbers(void *mem_ptr, int size); int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states /**
* * Get a pointer to curand states.
*/ */
curandState* cuda_getCurandStates(); curandState* cuda_getCurandStates();
@ -76,93 +80,98 @@ public:
int cuda_addStream(cudaStream_t tmpStream, int &streamId); int cuda_addStream(cudaStream_t tmpStream, int &streamId);
/** /**
* delete cuda stream * delete cuda stream.
* success or error code * success or error code
*/ */
int cuda_deleteStream(int id); int cuda_deleteStream(int id);
/** /**
* delete all streams * delete all streams.
* success or error code * success or error code
*/ */
int cuda_deleteStreams(); int cuda_deleteStreams();
/** /**
* set stream to use * set stream to use.
* success or error code * success or error code
*/ */
int cuda_setStream(int id); int cuda_setStream(int id);
/** /**
* Info: get stream that is used * get stream that is used.
* Return: return id of curretn stream * Return: return id of curretn stream
*/ */
int cuda_getStreamId(); int cuda_getStreamId();
/** /**
* Info: reset to default stream * reset to default stream.
* Return: success or error code * Return: success or error code
*/ */
int cuda_defaultStream(); int cuda_defaultStream();
/** /**
* Info: get number of streams * get number of streams.
* Return: success or error code * Return: success or error code
*/ */
int cuda_numberOfStreams(); int cuda_numberOfStreams();
/** /**
* Info: get stream * get stream.
* Return: stream * Return: stream
*/ */
cudaStream_t cuda_getStream(int id); cudaStream_t cuda_getStream(int id);
/** /**
* Get default cublass handle * Get default cublass handle.
*/ */
cublasHandle_t cuda_getCublas(); cublasHandle_t cuda_getCublas();
/** /**
* Info: get information on cuda devices * get information on cuda devices.
* Return: success or error code * Return: success or error code
*/ */
int cuda_getDevices(); int cuda_getDevices();
/** Get CUDA device count. /**
* Get CUDA device count.
* Sets the number of devices on the platform that can use CUDA. * Sets the number of devices on the platform that can use CUDA.
* Returns DKS_SUCCESS
*/ */
int cuda_getDeviceCount(int &ndev); int cuda_getDeviceCount(int &ndev);
/** Get the name of the device. /**
* Get the name of the device.
* QUery the device properties of the used device and set the string device_name * QUery the device properties of the used device and set the string device_name
*/ */
int cuda_getDeviceName(std::string &device_name); int cuda_getDeviceName(std::string &device_name);
/** Set CUDA device to use. /**
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR * Set CUDA device to use.
* If device passed in is larger than the number of devices use
* the default:0 and return DKS_ERROR
*/ */
int cuda_setDevice(int device); int cuda_setDevice(int device);
/** Get unique devices /**
* Get unique devices.
* Get array of indeces with the unique CUDA devices available on the paltform * Get array of indeces with the unique CUDA devices available on the paltform
*/ */
int cuda_getUniqueDevices(std::vector<int> &devices); int cuda_getUniqueDevices(std::vector<int> &devices);
/** /**
* Info: init device * Initialize connection to the device.
* Only needed when runtime compilation is used.
* Return: success or error code * Return: success or error code
*/ */
int cuda_setUp(); int cuda_setUp();
/** /**
* Info: allocate memory on cuda device * Allocate memory on cuda device.
* Return: pointer to memory object * Return: pointer to memory object
*/ */
void * cuda_allocateMemory(size_t size, int &ierr); void * cuda_allocateMemory(size_t size, int &ierr);
/** /**
* Info: allocate host memory in pinned memory * Allocate host memory in pinned memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -175,7 +184,8 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory. /**
* Zero CUDA memory.
* Set all the elements of the array on the device to zero. * Set all the elements of the array on the device to zero.
*/ */
template<typename T> template<typename T>
@ -190,7 +200,8 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory. /**
* Zero CUDA memory async.
* Set all the elements of the array on the device to zero. * Set all the elements of the array on the device to zero.
*/ */
template<typename T> template<typename T>
@ -210,7 +221,7 @@ public:
} }
/** /**
* Info: write data to memory * Write data to memory
* Retrun: success or error code * Retrun: success or error code
*/ */
template<typename T> template<typename T>
@ -227,7 +238,7 @@ public:
} }
/** /**
* Info: write data assynchonuously * Write data assynchonuously
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -259,7 +270,7 @@ public:
} }
/** /**
* Info: read data from memory * Read data from memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -276,7 +287,7 @@ public:
} }
/** /**
* Info: read data async from device memory * Read data async from device memory
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -308,19 +319,19 @@ public:
} }
/** /**
* Info: free memory on device * Free memory on device
* Return: success or error code * Return: success or error code
*/ */
int cuda_freeMemory(void * mem_ptr); int cuda_freeMemory(void * mem_ptr);
/** /**
* Info: free page locked memory on host * Free page locked memory on host
* Return: success or erro code * Return: success or erro code
*/ */
int cuda_freeHostMemory(void * mem_ptr); int cuda_freeHostMemory(void * mem_ptr);
/** /**
* Info: allcate memory and write data (push) * Allcate memory and write data (push)
* Return: pointer to memory object * Return: pointer to memory object
*/ */
template<typename T> template<typename T>
@ -336,7 +347,7 @@ public:
} }
/** /**
* Info: read data and free memory (pull) * Read data and free memory (pull)
* Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
@ -356,19 +367,8 @@ public:
} }
/** /**
* Info: execute function * Sync cuda device.
* Return: success or error code * Waits till all the tasks on the GPU are finished.
*/
int cuda_executeFunction();
/**
* Info: clean up
* Return: success or error code
*/
int cuda_cleanUp();
/**
* Info: sync cuda device
* Return: success or error code * Return: success or error code
*/ */
int cuda_syncDevice() { int cuda_syncDevice() {
@ -377,7 +377,7 @@ public:
} }
/** /**
* Page-lock host memory * Page-lock host memory.
*/ */
template<typename T> template<typename T>
int cuda_hostRegister(T *ptr, int size) { int cuda_hostRegister(T *ptr, int size) {
@ -391,7 +391,7 @@ public:
} }
/** /**
* Release page locked memory * Release page locked memory.
*/ */
template<typename T> template<typename T>
int cuda_hostUnregister(T *ptr) { int cuda_hostUnregister(T *ptr) {
@ -404,7 +404,7 @@ public:
} }
/** /**
* Info: print device memory info (total, used, avail) * Print device memory info (total, used, avail)
* Return: success or error code * Return: success or error code
*/ */
int cuda_memInfo() { int cuda_memInfo() {

View File

@ -8,6 +8,7 @@
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
class CudaChiSquare { class CudaChiSquare {
private: private:

View File

@ -15,6 +15,10 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
const std::string cudaFunctFooter = "}\n"; const std::string cudaFunctFooter = "}\n";
/**
* CUDA implementation of ChiSquareRuntime class.
* Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
*/
class CudaChiSquareRuntime : public ChiSquareRuntime{ class CudaChiSquareRuntime : public ChiSquareRuntime{
private: private:
@ -29,65 +33,72 @@ private:
cublasHandle_t defaultCublasRT; cublasHandle_t defaultCublasRT;
/** Setup to init device /**
* Setup to init device.
* Create context and init device for RT compilation * Create context and init device for RT compilation
*/ */
void setUpContext(); void setUpContext();
/** Private function to add function to kernel string /**
* * Private function to add function to kernel string.
*/ */
std::string buildProgram(std::string function); std::string buildProgram(std::string function);
public: public:
/** Constructor with CudaBase argument /**
* * Constructor with CudaBase argument
*/ */
CudaChiSquareRuntime(CudaBase *base); CudaChiSquareRuntime(CudaBase *base);
/** Default constructor init cuda device /**
* * Default constructor init cuda device
*/ */
CudaChiSquareRuntime(); CudaChiSquareRuntime();
/** Default destructor /**
* * Default destructor.
*/ */
~CudaChiSquareRuntime(); ~CudaChiSquareRuntime();
/** Compile program and save ptx. /**
* Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program * Add function string to the calcFunction kernel and compile the program
* Function must be valid C math expression. Parameters can be addressed in * Function must be valid C math expression. Parameters can be addressed in
* a form par[map[idx]] * a form par[map[idx]]
*/ */
int compileProgram(std::string function, bool mlh = false); int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel /**
* Launch selected kernel.
* Launched the selected kernel from the compiled code. * Launched the selected kernel from the compiled code.
* Result is put in &result variable * Result is put in &result variable.
*/ */
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap, int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double timeStart, double timeStep,
double &result); double &result);
/** Write params to device. /**
* Write params to device.
* Write params from double array to mem_param_m memory on the device. * Write params from double array to mem_param_m memory on the device.
*/ */
int writeParams(const double *params, int numparams); int writeParams(const double *params, int numparams);
/** Write functions to device. /**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device. * Write function values from double array to mem_func_m memory on the device.
*/ */
int writeFunc(const double *func, int numfunc); int writeFunc(const double *func, int numfunc);
/** Write maps to device. /**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device. * Write map values from int array to mem_map_m memory on the device.
*/ */
int writeMap(const int *map, int nummap); int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square. /**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
* the maximum number of elements in any datasets that will be used for calculations. Size_param, * the maximum number of elements in any datasets that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in * size_func and size_map are the maximum number of parameters, functions and maps used in
@ -96,12 +107,14 @@ public:
int initChiSquare(int size_data, int size_param, int size_func, int size_map); int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square. /**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps * Frees the chisq temporary memory and memory for params, functions and maps
*/ */
int freeChiSquare(); int freeChiSquare();
/** Check if CUDA device is able to run the chi square kernel. /**
* Check if CUDA device is able to run the chi square kernel.
* Redundant - all new CUDA devices that support RT compilation will also support * Redundant - all new CUDA devices that support RT compilation will also support
* double precision, there are no other requirements to run chi square on GPU * double precision, there are no other requirements to run chi square on GPU
*/ */

View File

@ -1,16 +1,16 @@
#include "CudaCollimatorPhysics.cuh" #include "CudaCollimatorPhysics.cuh"
//#define M_P 0.93827231e+00 //constants used in OPAL
#define M_P 0.93827204e+00 #define M_P 0.93827204e+00
#define C 299792458.0 #define C 299792458.0
#define PI 3.14159265358979323846 #define PI 3.14159265358979323846
#define AVO 6.022e23 #define AVO 6.022e23
#define R_E 2.81794092e-15 #define R_E 2.81794092e-15
//#define eM_E 0.51099906e-03
#define eM_E 0.51099892e-03 #define eM_E 0.51099892e-03
#define Z_P 1 #define Z_P 1
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 #define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
//parameter array indexes
#define POSITION 0 #define POSITION 0
#define ZSIZE 1 #define ZSIZE 1
#define RHO_M 2 #define RHO_M 2
@ -28,12 +28,18 @@
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
#define NUMPAR 13 #define NUMPAR 13
/**
* CUDA device function for calculating dot product.
*/
__device__ inline double dot(double3 &d1, double3 &d2) { __device__ inline double dot(double3 &d1, double3 &d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
} }
/**
* CUDA devce function to calculate cross product.
*/
__device__ inline double3 cross(double3 &lhs, double3 &rhs) { __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp; double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y; tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
@ -42,6 +48,9 @@ __device__ inline double3 cross(double3 &lhs, double3 &rhs) {
return tmp; return tmp;
} }
/**
* CUDA device function to calculate arbitrary rotation.
*/
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) { __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
double c=cos(Theta); double c=cos(Theta);
double s=sin(Theta); double s=sin(Theta);
@ -59,6 +68,11 @@ __device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Th
return tmp; return tmp;
} }
/**
* CUDA device function to check if particle is still in material.
* z - particle position, par - parameter array. Particle is considered inside the
* material if z is > material starting position and z < material starting position - mat size.
*/
__device__ inline bool checkHit(double &z, double *par) { __device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */ /* check if particle is in the degrader material */
@ -67,6 +81,11 @@ __device__ inline bool checkHit(double &z, double *par) {
} }
/**
* CUDA device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{ {
@ -111,51 +130,53 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
} }
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, /**
double &normP, double &thetacou, double &deltas, int coord, * CUDA device function for rotation in 2 dimensions.
* For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),
* "Passage of particles through matter"
*/
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &plane,
double &betaGamma, double &thetacou, double &deltas, int coord,
double *par) double *par)
{ {
double Psixz; // Calculate the angle between the px and pz momenta to change from beam coordinate to lab coordinate
double pxz; const double Psi = atan2(px, pz);
const double pxz = sqrt(px*px + pz*pz);
const double cosPsi = cos(Psi);
const double sinPsi = sin(Psi);
const double cosTheta = cos(thetacou);
const double sinTheta = sin(thetacou);
/* // Apply the rotation about the random angle thetacou & change from beam
if (px>=0 && pz>=0) // coordinate system to the lab coordinate system using Psixz (2 dimensions)
Psixz = atan(px/pz); x += deltas * px / betaGamma + plane * cosPsi;
else if (px>0 && pz<0) z -= plane * sinPsi;
Psixz = atan(px/pz) + PI;
else if (px<0 && pz>0)
Psixz = atan(px/pz) + 2*PI;
else
Psixz = atan(px/pz) + PI;
*/
Psixz = atan2(px, pz);
pxz = sqrt(px*px + pz*pz);
if (coord == 1) { if (coord == 1) {
x = x + deltas * px/normP + xplane*cos(Psixz); z += deltas * pz / betaGamma;
z = z - xplane * sin(Psixz);
} }
if(coord==2) { px = pxz * (cosPsi * sinTheta + sinPsi * cosTheta);
x = x + deltas * px/normP + xplane*cos(Psixz); pz = pxz * (-sinPsi * sinTheta + cosPsi * cosTheta);
z = z - xplane * sin(Psixz) + deltas * pz / normP;
}
px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
/**
* CUDA device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* For details on the algorithm see OPAL user guide.
*/
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par, __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
bool enableRutherfordScattering) bool enableRutherfordScattering)
{ {
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P; double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma)); double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double betaGamma = sqrt(dot(P, P));
double deltas = par[DT_M] * beta * C; double deltas = par[DT_M] * beta * C;
double mass = M_P * 1e9; // in eV
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * double theta0 = 13.6e6 / (beta * betaGamma * mass) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M])); Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering" // x-direction: See Physical Review, "Multiple Scattering"
@ -170,8 +191,9 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
} }
//__syncthreads(); //__syncthreads();
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; //double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par); double xplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
Rot(P.x, P.z, R.x, R.z, xplane, betaGamma, thetacou, deltas, 0, par);
// y-direction: See Physical Review, "Multiple Scattering" // y-direction: See Physical Review, "Multiple Scattering"
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
@ -184,10 +206,9 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
thetacou = z2 * theta0; thetacou = z2 * theta0;
} }
//__syncthreads(); //double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
double yplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; Rot(P.y,P.z,R.y,R.z, yplane, betaGamma, thetacou, deltas, 1, par);
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if( (P2 < 0.0047) && enableRutherfordScattering) { if( (P2 < 0.0047) && enableRutherfordScattering) {
@ -211,6 +232,17 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
} }
/**
* CUDA kernel that performs one step in particle movement trough mater.
* One thread is launched for each particle in the simulation. The kernel checks if the particle
* is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
* particles that are exiting the material.
* @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
template <typename T> template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering) int numparticles, bool enableRutherfordScattering)
@ -220,51 +252,63 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
volatile int tid = threadIdx.x; volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid; volatile int idx = blockIdx.x * blockDim.x + tid;
//transfer params to shared memory //transfer params and particle positions to shared memory
//R is kept in shared memory in order to reduce register pressure for the kernel
extern __shared__ double smem[]; extern __shared__ double smem[];
double *p = (double*)smem; double *p = (double*)smem;
double3 *R = (double3*)&smem[NUMPAR]; double3 *R = (double3*)&smem[NUMPAR];
curandState s; curandState s; //each tread gets its own cuRand state for random number generation
double3 P; double3 P;
//load parameters to shared memory
for (int tt = tid; tt < NUMPAR; tt += blockDim.x) for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
p[tt] = par[tt]; p[tt] = par[tt];
//sync threads to ensure that parameters are finished loading
__syncthreads(); __syncthreads();
//there might be some empty threads that do no work
if (idx < numparticles) { if (idx < numparticles) {
s = state[idx]; s = state[idx]; //load cuRand state to local memory
R[tid] = data[idx].Rincol; R[tid] = data[idx].Rincol; //load position to shared memory
P = data[idx].Pincol; P = data[idx].Pincol; //load momentum to local memory
bool pdead = false; bool pdead = false;
volatile double sq = sqrt(1.0 + dot(P, P)); volatile double sq = sqrt(1.0 + dot(P, P));
double Eng; double Eng;
//check if particle is still in the material
if (checkHit(R[tid].z, p)) { if (checkHit(R[tid].z, p)) {
//calculate enery loss
Eng = (sq - 1) * M_P; Eng = (sq - 1) * M_P;
energyLoss(Eng, pdead, s, p); energyLoss(Eng, pdead, s, p);
//check if particle is not dead
if (!pdead) { if (!pdead) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(P, P)); sq = sqrt(dot(P, P));
//caluclate Coulomb scattering
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering); coulombScat(R[tid], P, s, p, enableRutherfordScattering);
//update particle momentum
data[idx].Pincol = P; data[idx].Pincol = P;
} else { } else {
//mark particle as dead (-1)
data[idx].label = -1; data[idx].label = -1;
} }
//update cuRand state
state[idx] = s; state[idx] = s;
} else { } else {
//particle exits material - drift and mark as exiting (-2)
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
@ -272,12 +316,23 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
} }
//update particle position
data[idx].Rincol = R[tid]; data[idx].Rincol = R[tid];
} }
} }
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, /**
* CUDA kernel that performs one step in particle movement trough mater using SoA particles.
* Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
* Deprecated - GPU version does not use SoA.
* @param[in] data structure of arrays containing particle data
* @param[in] *par array of material properties, always constant size - 13
* @param[in] *state array holding cuRand states to preserve states between kernel launches
* @param[in] numparticles number of particles in the simulation
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
*/
__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles, curandState *state, int numparticles,
bool enableRutherfordScattering) bool enableRutherfordScattering)
{ {
@ -338,92 +393,32 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
} }
/**
* Device function to swich off unitless positions.
*/
inline __device__ void unitlessOff(double3 &a, const double &c) { inline __device__ void unitlessOff(double3 &a, const double &c) {
a.x *= c; a.x *= c;
a.y *= c; a.y *= c;
a.z *= c; a.z *= c;
} }
/**
* Device function to swich on unitless positions.
*/
inline __device__ void unitlessOn(double3 &a, const double &c) { inline __device__ void unitlessOn(double3 &a, const double &c) {
a.x /= c; a.x /= c;
a.y /= c; a.y /= c;
a.z /= c; a.z /= c;
} }
//swithch to unitless positions with dtc
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOn(R, dtc);
unitlessOn(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//swithc to unitless positions with dt*c
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
//swithc off unitless positions with dtc
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
unitlessOff(R, dtc);
unitlessOff(X, dtc);
gR[idx] = R;
gX[idx] = X;
}
}
//switch off unitelss positions with dt*c
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < npart) {
double3 R = gR[idx];
double3 X = gX[idx];
double dt = gdt[idx];
unitlessOff(R, dt*c);
unitlessOff(X, dt*c);
gR[idx] = R;
gX[idx] = X;
}
}
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] npart number of particles
* @param[in] dtc dt*c
*/
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
//get global id and thread id //get global id and thread id
@ -451,7 +446,14 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
} }
} }
/**
* CUDA kernel to perform particle push.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) { __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id //get global id and thread id
@ -478,6 +480,16 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
} }
} }
/**
* CUDA kernel to perform particle kick.
* @param[in] *gR array of particle positions
* @param[in] *gP array of particle momentums
* @param[in] *gEf
* @param[in] *gBf
* @param[in] *gdt array of time steps for each particle
* @param[in] npart number of particles
* @param[in] c speed of light
*/
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge, double3 *gBf, double *gdt, double charge,
double mass, int npart, double c) double mass, int npart, double c)
@ -627,63 +639,6 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
} }
struct compare_particle
{
int threshold;
compare_particle() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART p1, CUDA_PART p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART p1) {
return p1.label < threshold;
}
};
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
struct less_then
{
__host__ __device__
bool operator()(int x)
{
return x < 0;
}
};
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering) bool enableRutherfordScattering)
{ {

View File

@ -20,7 +20,8 @@
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU or MIC as AoS.
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
*/ */
typedef struct __align__(16) { typedef struct __align__(16) {
int label; int label;
@ -37,7 +38,10 @@ typedef struct __align__(16) {
} CUDA_PART; } CUDA_PART;
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU as AoS
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/ */
typedef struct { typedef struct {
int label; int label;
@ -47,7 +51,8 @@ typedef struct {
} CUDA_PART_SMALL; } CUDA_PART_SMALL;
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU as SoA.
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
*/ */
typedef struct { typedef struct {
int *label; int *label;
@ -65,6 +70,9 @@ typedef struct {
/** /**
* Structure for storing particle on GPU * Structure for storing particle on GPU
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
* on the host side.
*/ */
typedef struct { typedef struct {
int *label; int *label;
@ -73,9 +81,37 @@ typedef struct {
double3 *Pincol; double3 *Pincol;
} CUDA_PART2_SMALL; } CUDA_PART2_SMALL;
/** CudaCollimatorPhysics class. /**
* Operator used in thrust sort to compare particles by label.
* Used to move dead particles to the end of array, since they have label -1 or -2.
*/
struct compare_particle_small
{
int threshold;
compare_particle_small() {
threshold = 0;
}
void set_threshold(int t) {
threshold = t;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
return p1.label > p2.label;
}
__host__ __device__
bool operator()(CUDA_PART_SMALL p1) {
return p1.label < threshold;
}
};
/**
* CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Contains kerenls that execute CollimatorPhysics functions form OPAL. * Contains kerenls that execute CollimatorPhysics functions form OPAL.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation * For detailed documentation on CollimatorPhysics functions see OPAL documentation.
*/ */
class CudaCollimatorPhysics : public DKSCollimatorPhysics { class CudaCollimatorPhysics : public DKSCollimatorPhysics {
@ -86,32 +122,44 @@ private:
public: public:
/** Constructor with CudaBase argument /**
* * Constructor with CudaBase as argument.
* Create a new instace of the CudaCollimatorPhysics using existing CudaBase object.
*/ */
CudaCollimatorPhysics(CudaBase *base) { CudaCollimatorPhysics(CudaBase *base) {
m_base = base; m_base = base;
base_create = false; base_create = false;
} }
/** Constructor - empty. */ /**
* Empty constructor.
* Create a new instance of CudaCollimatorPhysics with its own CudaBase.
*/
CudaCollimatorPhysics() { CudaCollimatorPhysics() {
m_base = new CudaBase(); m_base = new CudaBase();
base_create = true; base_create = true;
} }
/** Destructor - empty */ /**
* Destructor.
* Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
*/
~CudaCollimatorPhysics() { ~CudaCollimatorPhysics() {
if (base_create) if (base_create)
delete m_base; delete m_base;
}; };
/** Execute collimator physics kernel. /**
* Execute collimator physics kernel.
* *
*/ */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numpartices, bool enableRutherforScattering = true); int numpartices, bool enableRutherforScattering = true);
/**
* Special calse CollimatorPhysics kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
@ -120,12 +168,17 @@ public:
return DKS_ERROR; return DKS_ERROR;
} }
/** Sort particle array on GPU. /**
* Sort particle array on GPU.
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle * Count particles that are dead (label -1) or leaving material (label -2) and sort particle
* array so these particles are at the end of array * array so these particles are at the end of array
*/ */
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Special calse CollimatorPhysicsSort kernel that uses SoA instead of AoS.
* Used only on the MIC side, was not implemented on the GPU.
*/
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr, void *px_ptr, void *py_ptr, void *pz_ptr,
@ -134,18 +187,25 @@ public:
return DKS_ERROR; return DKS_ERROR;
} }
/** BorisPusher push function for integration from OPAL. /**
* BorisPusher push function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal
*/ */
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1); double dt, double c, bool usedt = false, int streamId = -1);
/**
* BorisPusher kick function for integration from OPAL.
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal
*/
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr, int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass, void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1); int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL /**
* BorisPusher push function with transformto function form OPAL.
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal
*/ */

View File

@ -10,7 +10,11 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
class CudaFFT : public DKSFFT{ /**
* Cuda FFT class based on BaseFFT interface.
* Uses cuFFT library to perform FFTs on nvidias GPUs.
*/
class CudaFFT : public BaseFFT {
private: private:
@ -34,7 +38,7 @@ public:
~CudaFFT(); ~CudaFFT();
/** /**
* Info: init cufftPlans witch can be reused for all FFTs of the same size and type * Init cufftPlans witch can be reused for all FFTs of the same size and type
* Return: success or error code * Return: success or error code
*/ */
int setupFFT(int ndim, int N[3]); int setupFFT(int ndim, int N[3]);
@ -42,45 +46,21 @@ public:
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
/** /**
* Info: destroy default FFT plans * Destroy default FFT plans
* Return: success or error code * Return: success or error code
*/ */
int destroyFFT(); int destroyFFT();
/*
Info: execute complex to complex double precision fft using cufft library
Return: success or error code
*/
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
/*
Info: execute ifft
Return: success or error code
*/
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize using cuda kernel for complex to complex iFFT
Return: success or error code
*/
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute real to complex double precision FFT
Return: success or error code
*/
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: exectue complex to real double precision FFT
Return: success or error code
*/
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1); int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
}; };

View File

@ -12,6 +12,7 @@
#include "../Algorithms/GreensFunction.h" #include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
class CudaGreensFunction : public GreensFunction{ class CudaGreensFunction : public GreensFunction{
private: private:

View File

@ -10,6 +10,7 @@
#include "../Algorithms/ImageReconstruction.h" #include "../Algorithms/ImageReconstruction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
/** CUDA implementation of ImageReconstruction interface. */
class CudaImageReconstruction : public ImageReconstruction { class CudaImageReconstruction : public ImageReconstruction {
private: private:

View File

@ -1,11 +1,3 @@
/** DKSBase class.
* DKSBase.h
* Author: Uldis Locans
* Date: 15.09.2014
* Base class of Dynamic Kernel Scheduler that handles the function calls
* from host application to DKS
*/
#ifndef H_DKS_BASE #ifndef H_DKS_BASE
#define H_DKS_BASE #define H_DKS_BASE
@ -33,7 +25,6 @@
#ifdef DKS_CUDA #ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh" #include "CUDA/CudaBase.cuh"
#include "nvToolsExt.h"
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
@ -42,7 +33,12 @@
#include "AutoTuning/DKSConfig.h" #include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */ /**
* API for handling communication function calls to DKS library.
* DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
* memory manegement, data transfer and other basic communication functions between
* the host and device.
*/
class DKSBase { class DKSBase {
private: private:
@ -75,7 +71,7 @@ protected:
DKSConfig dksconfig; DKSConfig dksconfig;
/** /**
* Check if current API is set to OpenCL * Check if current API is set to OpenCL.
* Return true/false wether current api is opencl * Return true/false wether current api is opencl
*/ */
bool apiOpenCL(); bool apiOpenCL();
@ -92,11 +88,11 @@ protected:
*/ */
bool apiOpenMP(); bool apiOpenMP();
/** Check if device is GPU */ /** Check if device is GPU. */
bool deviceGPU(); bool deviceGPU();
/** Check if device is CPU */ /** Check if device is CPU. */
bool deviceCPU(); bool deviceCPU();
/** Check if device is MIC */ /** Check if device is MIC. */
bool deviceMIC(); bool deviceMIC();
/** /**
@ -889,9 +885,10 @@ public:
* TODO: opencl and mic imlementation * TODO: opencl and mic imlementation
*/ */
int callMemInfo() { int callMemInfo() {
#ifdef DKS_CUDA
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_memInfo()); return CUDA_SAFECALL(cbase->cuda_memInfo());
#endif
return DKS_ERROR; return DKS_ERROR;
} }
@ -900,11 +897,13 @@ public:
* Used for debuging and timing purposes only. * Used for debuging and timing purposes only.
*/ */
void oclEventInfo() { void oclEventInfo() {
#ifdef DKS_OPENCL
if (apiOpenCL()) if (apiOpenCL())
return OPENCL_SAFECALL(oclbase->ocl_eventInfo()); return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
#endif
} }
/** /**
* Test function to profile opencl kernel calls. * Test function to profile opencl kernel calls.
* Used for debuging and timing purposes only. * Used for debuging and timing purposes only.

View File

@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
//if we are not auto tuning and the size of the problem has changed find the new parameters //if we are not auto tuning and the size of the problem has changed find the new parameters
//from autotuning config file //from autotuning config file
if (!isAutoTuningOn() && length != chiSquareSize_m) { if (!isAutoTuningOn() && length != chiSquareSize_m) {
/*
int numBlocks, blockSize; int numBlocks, blockSize;
std::string device_name; std::string device_name;
getDeviceName(device_name); getDeviceName(device_name);
@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
length, "BlockSize", blockSize); length, "BlockSize", blockSize);
chiSq->setKernelParams(numBlocks, blockSize); chiSq->setKernelParams(numBlocks, blockSize);
//std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl; std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
*/
chiSquareSize_m = length; chiSquareSize_m = length;
} }

View File

@ -8,6 +8,7 @@
#include "AutoTuning/DKSAutoTuningTester.h" #include "AutoTuning/DKSAutoTuningTester.h"
#include "DKSBase.h" #include "DKSBase.h"
#include "DKSFFT.h"
#include "Algorithms/ChiSquareRuntime.h" #include "Algorithms/ChiSquareRuntime.h"
@ -19,7 +20,12 @@
#include "OpenCL/OpenCLChiSquareRuntime.h" #include "OpenCL/OpenCLChiSquareRuntime.h"
#endif #endif
class DKSBaseMuSR : public DKSBase { /**
* API to handle musrfit calls to DKS library.
* Using ChiSquareRuntime interface allows to call chi square functions on the
* GPU or CPU using CUDA or OpenCL.
*/
class DKSBaseMuSR : public DKSFFT {
private: private:

147
src/DKSFFT.cpp Normal file
View File

@ -0,0 +1,147 @@
#include "DKSFFT.h"
DKSFFT::DKSFFT() {
dksfft = nullptr;
}
DKSFFT::~DKSFFT() {
delete dksfft;
}
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSFFT::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
return dksfft->setupFFT(ndim, N);
} else if (apiOpenCL()) {
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
int ierr1 = dksfft->setupFFT(ndim, N);
int ierr2 = dksfft->setupFFTRC(ndim, N);
int ierr3 = dksfft->setupFFTCR(ndim, N);
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTRC(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTRC(ndim, N, scale);
return DKS_ERROR;
}
//BENI:
int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTCR(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTCR(ndim, N, scale);
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
else
return DKS_ERROR;
} else if (apiCuda()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
} else if (apiOpenMP()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
else if (apiOpenCL())
return DKS_ERROR;
else if (apiOpenMP())
return DKS_ERROR;
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}

112
src/DKSFFT.h Normal file
View File

@ -0,0 +1,112 @@
#ifndef H_DKSBASE_FFT
#define H_DKSBASE_FFT
#include <iostream>
#include "AutoTuning/DKSAutoTuning.h"
#include "DKSBase.h"
#include "DKSDefinitions.h"
#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaFFT.cuh"
#endif
#ifdef DKS_MIC
#include "MIC/MICFFT.h"
#endif
/**
* API to handel calls to DKSFFT.
* Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
*/
class DKSFFT : public DKSBase {
private:
BaseFFT *dksfft;
int initFFT();
public:
DKSFFT();
~DKSFFT();
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
};
#endif

View File

@ -10,6 +10,9 @@
#include "CUDA/CudaImageReconstruction.cuh" #include "CUDA/CudaImageReconstruction.cuh"
#endif #endif
/**
* API to handle PET image reconstruction calls.
*/
class DKSImageRecon : public DKSBase { class DKSImageRecon : public DKSBase {
private: private:
@ -22,87 +25,88 @@ public:
~DKSImageRecon(); ~DKSImageRecon();
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateSource(void *image_space, void *image_position, void *source_position, int callCalculateSource(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateBackground(void *image_space, void *image_position, void *source_position, int callCalculateBackground(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels, void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateSources(void *image_space, void *image_position, void *source_position, int callCalculateSources(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction analaysis calculate source. /**
* * Image reconstruction analaysis calculate source.
*
*/ */
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels, void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0); int total_sources, int start = 0);
/** Image reconstruction - generate normalization. /**
* * Image reconstruction - generate normalization.
*/ */
int callGenerateNormalization(void *recon, void *image_position, int callGenerateNormalization(void *recon, void *image_position,
void *det_position, int total_det); void *det_position, int total_det);
/** Image reconstruction - forward correction. /**
* * Image reconstruction - forward correction.
*/ */
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events); void *image_position, int num_events);
/** Image reconstruction - backward projection. /**
* * Image reconstruction - backward projection.
*/ */
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
void *det_position, void *image_position, void *det_position, void *image_position,
int num_events, int num_voxels); int num_events, int num_voxels);
/** Set the voxel dimensins on device. /**
* Set the voxel dimensins on device.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
/** Set the image edge. /**
* Set the image edge.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setEdge(float x_edge, float y_edge, float z_edge); int setEdge(float x_edge, float y_edge, float z_edge);
/** Set the image edge1. /**
* Set the image edge1.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
/** Set the minimum crystan in one ring values. /**
* Set the minimum crystal in one ring values.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.
*/ */
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
/** Set all other required parameters for reconstruction. /**
* Set all other required parameters for reconstruction.
* Values are stored in GPU memory and used in forward and backward projection calculations. * Values are stored in GPU memory and used in forward and backward projection calculations.
* Call set function once to transfer the values from host side to GPU. * Call set function once to transfer the values from host side to GPU.
* If value changes on the host side set functions needs to be called again to update GPU values. * If value changes on the host side set functions needs to be called again to update GPU values.

32
src/DKSMainPage.dox Normal file
View File

@ -0,0 +1,32 @@
/**
\mainpage
<P>
<B>
The aim of DKS is to allow the creation of fast fine tuned kernels using device specific frameworks such as CUDA, OpenCL, OpenACC and OpenMP and accelerator libraries such as Thrust, Nvidia CUDA libraries, Intel MKL or others. On top of that, DKS allows the easy use of these kernels in host applications without providing any device or framework specific details. This approach facilitates the integration of different types of devices in the existing applications with minimal code changes and makes the device and the host code a lot more manageable.
</B>
<P>
The main parts of DKS are:
<ul>
<li>DKSBase - provides the basic communication functions between host application and hardware accelerators including memory manegement, data transfer and synchronization.</li>
<li>DKSOPAL - provides functions for Object Oriented Particle Accelerator library to offload FFTPoisson calculations and particle matter interaction using Monte Carlo simulations to GPU and Intel MIC</li>
<li>DKSBaseMuSR - provides functions to perform parameter fitting for musrfit on the GPU</li>
<li>DKSImageRecon - provides functions to perform PET image reconstruction on the GPU</li>
<li>DKSFFT - provides functions to perform FFT on the GPU and Intel MIC</li>
</ul>
<P>
<B>
Developed by
Uldis Locans
</B>
<P>
For further information contact: locans.uldis@psi.ch - Uldis Locans
<P>
<P>
<a href="https://gitlab.psi.ch/uldis_l/DKS">DKS on gitlab</a><br>
*/

View File

@ -1,7 +1,6 @@
#include "DKSOPAL.h" #include "DKSOPAL.h"
DKSOPAL::DKSOPAL() { DKSOPAL::DKSOPAL() {
dksfft = nullptr;
dkscol = nullptr; dkscol = nullptr;
dksgreens = nullptr; dksgreens = nullptr;
} }
@ -12,7 +11,6 @@ DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
} }
DKSOPAL::~DKSOPAL() { DKSOPAL::~DKSOPAL() {
delete dksfft;
delete dkscol; delete dkscol;
delete dksgreens; delete dksgreens;
} }
@ -22,17 +20,14 @@ int DKSOPAL::setupOPAL() {
if (apiOpenCL()) { if (apiOpenCL()) {
ierr = OPENCL_SAFECALL( DKS_SUCCESS ); ierr = OPENCL_SAFECALL( DKS_SUCCESS );
//TODO: only enable if AMD libraries are available //TODO: only enable if AMD libraries are available
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) ); dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) ); dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
} else if (apiCuda()) { } else if (apiCuda()) {
ierr = CUDA_SAFECALL( DKS_SUCCESS ); ierr = CUDA_SAFECALL( DKS_SUCCESS );
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) ); dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) ); dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
ierr = MIC_SAFECALL( DKS_SUCCESS ); ierr = MIC_SAFECALL( DKS_SUCCESS );
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) ); dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) ); dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
} else { } else {
@ -50,139 +45,6 @@ int DKSOPAL::initDevice() {
} }
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSOPAL::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return dksfft->setupFFT(ndim, N);
} else if (apiOpenCL()) {
int ierr1 = dksfft->setupFFT(ndim, N);
int ierr2 = dksfft->setupFFTRC(ndim, N);
int ierr3 = dksfft->setupFFTCR(ndim, N);
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTRC(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTRC(ndim, N, scale);
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTCR(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTCR(ndim, N, scale);
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
else
return DKS_ERROR;
} else if (apiCuda()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
} else if (apiOpenMP()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
else if (apiOpenCL())
return DKS_ERROR;
else if (apiOpenMP())
return DKS_ERROR;
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) { double hz_m0, double hz_m1, double hz_m2, int streamId) {

View File

@ -5,6 +5,7 @@
#include "AutoTuning/DKSAutoTuning.h" #include "AutoTuning/DKSAutoTuning.h"
#include "DKSBase.h" #include "DKSBase.h"
#include "DKSFFT.h"
#include "DKSDefinitions.h" #include "DKSDefinitions.h"
@ -32,11 +33,15 @@
#include "MIC/MICCollimatorPhysics.h" #include "MIC/MICCollimatorPhysics.h"
#endif #endif
class DKSOPAL : public DKSBase { /**
* API to handle OPAL calls to DKS library.
* Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
* functions.
*/
class DKSOPAL : public DKSFFT {
private: private:
DKSFFT *dksfft;
DKSCollimatorPhysics *dkscol; DKSCollimatorPhysics *dkscol;
GreensFunction *dksgreens; GreensFunction *dksgreens;
@ -56,71 +61,6 @@ public:
///////Function library part of dksbase//////// ///////Function library part of dksbase////////
/////////////////////////////////////////////// ///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/** /**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs. * For specifics check OPAL docs.

View File

@ -1,11 +1,10 @@
SET (_SRCS MICBase.cpp) SET (_SRCS MICBase.cpp MICFFT.cpp)
SET (_HDRS MICBase.h) SET (_HDRS MICBase.h MICFFT.h)
IF (ENABLE_OPAL) IF (ENABLE_OPAL)
SET (_SRCS SET (_SRCS
${_SRCS} ${_SRCS}
MICChiSquare.cpp MICChiSquare.cpp
MICFFT.cpp
MICGreensFunction.cpp MICGreensFunction.cpp
MICCollimatorPhysics.cpp MICCollimatorPhysics.cpp
) )
@ -13,7 +12,6 @@ IF (ENABLE_OPAL)
SET (_HDRS SET (_HDRS
${_HDRS} ${_HDRS}
MICChiSquare.h MICChiSquare.h
MICFFT.h
MICCollimatorPhysics.h MICCollimatorPhysics.h
MICGreensFunction.hpp MICGreensFunction.hpp
MICMergeSort.h MICMergeSort.h

View File

@ -26,6 +26,9 @@
#define MIC_WIDTH 128 #define MIC_WIDTH 128
/** MIC Base class handles device setup and basic communication with the device.
* Handles devicew setup, memory manegement and data transfers.
*/
class MICBase { class MICBase {
private: private:
@ -45,57 +48,59 @@ public:
int m_device_id; int m_device_id;
/* constructor */ /** constructor */
MICBase(); MICBase();
/* destructor */ /** destructor */
~MICBase(); ~MICBase();
/* /**
Info: create MKL rand streams for each thread * Create MKL rand streams for each thread
Return: success or error code * Return: success or error code
*/ */
int mic_createRandStreams(int size); int mic_createRandStreams(int size);
/* /**
Info: delete MKL rand streams * Delete MKL rand streams
Return: succes or error code * Return: succes or error code
*/ */
int mic_deleteRandStreams(); int mic_deleteRandStreams();
/* /**
Info: create a new signal for the mic * Create a new signal for the mic.
Return: success or error code * Signals can be used for assynchronous data transfers.
* Return: success or error code
*/ */
int mic_createStream(int & streamId); int mic_createStream(int & streamId);
/* /**
Info: get the signal from the vector * Info: get the signal from the vector.
Return: mic signal * Return: mic signal
*/ */
int& mic_getStream(int id); int& mic_getStream(int id);
/* /**
Info: delete streams * Info: delete streams.
Return: success or error code * Return: success or error code
*/ */
int mic_deleteStreams(); int mic_deleteStreams();
/* /**
Info: set device id * Info: set device id.
Return: success or error code * Return: success or error code
*/ */
int mic_setDeviceId(int id); int mic_setDeviceId(int id);
/* /**
Info: get mic devices * Info: get mic devices.
Return: success or error code * Prints information about mic devices.
* Return: success or error code
*/ */
int mic_getDevices(); int mic_getDevices();
/* /**
Info: allocate memory on MIC device * Allocate memory on MIC device.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
void * mic_allocateMemory(int size) { void * mic_allocateMemory(int size) {
@ -109,9 +114,9 @@ public:
return tmp; return tmp;
} }
/* /**
Info: transfer data to device * Transfer data to device.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) { int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
@ -123,9 +128,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: write data to device, non-blocking * Write data to device, non-blocking.
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
@ -139,9 +144,9 @@ public:
} }
/* /**
Info: read data from device * Read data from device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) { int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
@ -154,9 +159,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: read data from device waiting for signal * Read data from device waiting for signal
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_readDataAsync(const void * data_ptr, void * result, int size, int mic_readDataAsync(const void * data_ptr, void * result, int size,
@ -172,9 +177,9 @@ public:
} }
/* /**
Info: wait till all the signals are complete * Wait till all the signals are complete
Return siccess or error code * Return siccess or error code
*/ */
int mic_syncDevice() { int mic_syncDevice() {
@ -193,9 +198,9 @@ public:
} }
/* /**
Info: free memory on device * Free memory on device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
int mic_freeMemory(void * data_ptr, int size) { int mic_freeMemory(void * data_ptr, int size) {
@ -210,9 +215,9 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Info: allocate memory and write data to device * Allocate memory and write data to device
Return: success or error code * Return: success or error code
*/ */
template<typename T> template<typename T>
void * mic_pushData(const void * data, int size) { void * mic_pushData(const void * data, int size) {
@ -227,9 +232,9 @@ public:
return tmp_ptr; return tmp_ptr;
} }
/* /**
Info: read data and free memory on device * Read data and free memory on device
Return: success or erro code * Return: success or erro code
*/ */
template<typename T> template<typename T>
int mic_pullData(void * data_ptr, void * result, int size) { int mic_pullData(void * data_ptr, void * result, int size) {

View File

@ -14,6 +14,9 @@
#include <offload.h> #include <offload.h>
#include "MICBase.h" #include "MICBase.h"
/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices.
* Not complete and untested because of the poor performance of first MIC devices.
*/
class MICChiSquare { class MICChiSquare {
MICBase *m_micbase; MICBase *m_micbase;

View File

@ -22,22 +22,34 @@
#define I_M 10 #define I_M 10
#define DT_M 11 #define DT_M 11
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic)) __declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) { double dot(mic_double3 d1, mic_double3 d2) {
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
} }
/**
* MIC device function for calculating dot product.
*/
__declspec(target(mic)) __declspec(target(mic))
double dot(double dx, double dy, double dz) { double dot(double dx, double dy, double dz) {
return (dx * dx + dy * dy + dz * dz); return (dx * dx + dy * dy + dz * dz);
} }
/**
* MIC device function to check if particle is still in material.
*/
__declspec(target(mic)) __declspec(target(mic))
bool checkHit(double &z, double *par) { bool checkHit(double &z, double *par) {
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
} }
/**
* MIC device function to calculate arbitrary rotation.
*/
__declspec(target(mic)) __declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane, void Rot(double &px, double &pz, double &x, double &z, double xplane,
double normP, double thetacou, double deltas, int coord) double normP, double thetacou, double deltas, int coord)
@ -70,6 +82,14 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses AoS to store particle positions and momentum, paralelized using OpenMP.
* For details on the algorithm see OPAL user guide.
* Deprecated on favor of SoA data layout.
*/
__declspec(target(mic)) __declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) { void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
@ -136,11 +156,19 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
} }
/**
* MIC device function to calculate Coulomb scattering for one particle.
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
* Uses SoA to store particle positions and momentum, paralelized using OpenMP.
* For details on the algorithm see OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label, void coulombScat(double *rx, double *ry, double *rz,
double *px, double *py, double *pz, int *label,
double *par, VSLStreamStatePtr &stream, int ii, int size) double *par, VSLStreamStatePtr &stream, int ii, int size)
{ {
//arrays for temporary storage, each core proceses MIC_WIDTH particles
double normP[MIC_WIDTH] __attribute__((aligned(64))); double normP[MIC_WIDTH] __attribute__((aligned(64)));
double deltas[MIC_WIDTH] __attribute__((aligned(64))); double deltas[MIC_WIDTH] __attribute__((aligned(64)));
double theta0[MIC_WIDTH] __attribute__((aligned(64))); double theta0[MIC_WIDTH] __attribute__((aligned(64)));
@ -152,6 +180,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
double z2[MIC_WIDTH] __attribute__((aligned(64))); double z2[MIC_WIDTH] __attribute__((aligned(64)));
double thetacou[MIC_WIDTH] __attribute__((aligned(64))); double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
//simd instruction tells the compiler its safe to vectorize the loop
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int i = ii; i < ii + MIC_WIDTH; i++) { for (int i = ii; i < ii + MIC_WIDTH; i++) {
@ -191,6 +220,7 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
} }
//vectorize the loop
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int i = ii; i < ii + size; i++) { for (int i = ii; i < ii + size; i++) {
@ -202,7 +232,6 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
} }
//generate array of random numbers //generate array of random numbers
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
@ -281,6 +310,11 @@ void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, dou
} }
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) { void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
@ -328,6 +362,11 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
pdead = 1; pdead = 1;
} }
/**
* MIC device function to calculate energyLoss for one particle.
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
* algorith are available in OPAL user guide.
*/
__declspec(target(mic)) __declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) { void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
@ -377,6 +416,8 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
double *par = (double*) par_ptr; double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
the memory allocation and data trasnfer need to be handled before */
#pragma offload target(mic:m_micbase->m_device_id) \ #pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \ inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
@ -389,7 +430,6 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()]; VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x //for loop trough particles if not checkhit set label to -2 and update R.x
#pragma omp for simd #pragma omp for simd
for (int i = 0; i < numparticles; i++) { for (int i = 0; i < numparticles; i++) {
if ( !checkHit(data[i].Rincol.z, par) ) { if ( !checkHit(data[i].Rincol.z, par) ) {
@ -449,7 +489,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
{ {
//cast device memory pointers to appropriate types
int *label = (int*)label_ptr; int *label = (int*)label_ptr;
unsigned *localID = (unsigned*)localID_ptr; unsigned *localID = (unsigned*)localID_ptr;
double *rx = (double*)rx_ptr; double *rx = (double*)rx_ptr;
@ -465,6 +505,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream; VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
the memory allocation and data trasnfer need to be handled before */
#pragma offload target (mic:0) \ #pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \ in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \ in(localID:length(0) DKS_REUSE DKS_RETAIN) \

View File

@ -26,6 +26,12 @@ typedef struct {
} MIC_PART_SMALL; } MIC_PART_SMALL;
/**
* MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implementes OPALs collimator physics class for particle matter interactions using OpenMP
* and offload mode targetomg Intel Xeon Phi processors.
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
*/
class MICCollimatorPhysics : public DKSCollimatorPhysics { class MICCollimatorPhysics : public DKSCollimatorPhysics {
private: private:

View File

@ -10,7 +10,11 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "MICBase.h" #include "MICBase.h"
class MICFFT : public DKSFFT { /**
* MIC FFT based on BaseFFT interface.
* uses MKL library to offload FFT on Intel Xeon Phi devices.
*/
class MICFFT : public BaseFFT {
private: private:

View File

@ -15,6 +15,7 @@
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
class MICGreensFunction : public GreensFunction { class MICGreensFunction : public GreensFunction {
private: private:

View File

@ -71,6 +71,10 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
return p; return p;
} }
/**
* Merge sort implementation for intel MIC.
* Paralellized over all the MIC cores using OpenMP tasks.
*/
template <typename T> template <typename T>
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) { void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
@ -84,6 +88,9 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
} }
} }
/**
* Quicksort algorithm, developed for use on Intel MIC devices.
*/
template <typename T> template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) { void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
@ -100,6 +107,10 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
} }
/**
* Insertion sort of @p list, developed for use on Intel MIC.
* Used by quick_sort to sort small lists.
*/
template <typename T> template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) { void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

View File

@ -4,6 +4,25 @@ SET (_HDRS OpenCLBase.h)
SET (_SRCS OpenCLBase.cpp) SET (_SRCS OpenCLBase.cpp)
SET (_KERNELS "") SET (_KERNELS "")
IF (ENABLE_AMD)
SET (_SRCS
${_SRCS}
OpenCLFFT.cpp
)
SET (_HDRS
${_HDRS}
OpenCLFFT.h
)
SET (_KERNELS
${_KERNELS}
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
)
ENDIF (ENABLE_AMD)
IF (ENABLE_MUSR) IF (ENABLE_MUSR)
SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h) SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp) SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
@ -13,23 +32,18 @@ ENDIF (ENABLE_MUSR)
IF (ENABLE_AMD AND ENABLE_OPAL) IF (ENABLE_AMD AND ENABLE_OPAL)
SET (_SRCS SET (_SRCS
${_SRCS} ${_SRCS}
OpenCLFFT.cpp
OpenCLCollimatorPhysics.cpp OpenCLCollimatorPhysics.cpp
OpenCLGreensFunction.cpp OpenCLGreensFunction.cpp
) )
SET (_HDRS SET (_HDRS
${_HDRS} ${_HDRS}
OpenCLFFT.h
OpenCLCollimatorPhysics.h OpenCLCollimatorPhysics.h
OpenCLGreensFunction.h OpenCLGreensFunction.h
) )
SET (_KERNELS SET (_KERNELS
${_KERNELS} ${_KERNELS}
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLGreensFunction.cl OpenCLKernels/OpenCLGreensFunction.cl
) )

View File

@ -675,17 +675,14 @@ cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
/* /*
write data specified by in_data to device memory, device memory space defined by cl_mem write data specified by in_data to device memory, device memory space defined by cl_mem
*/ */
int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) { int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size,
size_t offset, int blocking)
{
cl_int ierr; cl_int ierr;
ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size,
//std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl; in_data, 0, NULL, NULL);
ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event);
//m_events[m_num_events] = m_last_event;
m_events.push_back(m_last_event);
if (ierr != CL_SUCCESS) { if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr); DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
@ -716,6 +713,11 @@ int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
*/ */
int OpenCLBase::ocl_createKernel(const char* kernel_name) { int OpenCLBase::ocl_createKernel(const char* kernel_name) {
cl_int ierr; cl_int ierr;
//release the old kernel
if (m_kernel != NULL)
clReleaseKernel(m_kernel);
//create a new kernel
m_kernel = clCreateKernel(m_program, kernel_name, &ierr); m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
if (ierr != CL_SUCCESS) { if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr); DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
@ -743,23 +745,19 @@ int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
optional: work_group_size - can specify how work items are divided in work groups, optional: work_group_size - can specify how work items are divided in work groups,
if left NULL OpenCL implementation handles this part. if left NULL OpenCL implementation handles this part.
*/ */
int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) { int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items,
const size_t *work_group_size)
{
cl_int ierr; cl_int ierr;
cl_event tmp_event; ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL,
if (m_last_event == NULL) { work_items, work_group_size,
ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, 0, NULL, NULL);
0, NULL, &tmp_event);
} else {
ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size,
1, &m_last_event, &tmp_event);
}
if (ierr != CL_SUCCESS) if (ierr != CL_SUCCESS)
DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr); DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr
<< " work items: " << *work_items << ", "
m_last_event = tmp_event; << " work group: " << *work_group_size);
m_events.push_back(m_last_event);
return ierr; return ierr;
} }
@ -768,12 +766,13 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const
read data from device, mem_ptr points to data on device out_data points to memory in host read data from device, mem_ptr points to data on device out_data points to memory in host
blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE) blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
*/ */
int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) { int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size,
size_t offset, int blocking)
{
cl_int ierr; cl_int ierr;
ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event); ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size,
out_data, 0, NULL, NULL);
m_events.push_back(m_last_event);
if (ierr != CL_SUCCESS) if (ierr != CL_SUCCESS)
DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr); DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
@ -937,22 +936,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
if (ierr != DKS_SUCCESS) if (ierr != DKS_SUCCESS)
return ierr; return ierr;
//get device properties /* get device properties */
//maximum number of work-items in a work group supported by device
size_t max_group_size; size_t max_group_size;
clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0); clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
//maxumum local memory size per work group
cl_ulong local_mem_size; cl_ulong local_mem_size;
clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0); clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
//get the supported extensions
size_t ext_size; size_t ext_size;
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size); clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
char *ext = new char[ext_size]; char *ext = new char[ext_size];
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0); clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
//get kernel properties /* get kernel properties */
//get max work group size that can be used for this kernel
size_t kernel_group_size; size_t kernel_group_size;
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &kernel_group_size, 0); sizeof(size_t), &kernel_group_size, 0);
threadsPerBlock = kernel_group_size; threadsPerBlock = kernel_group_size;
//get max local memory size that can be used for this kernel
cl_ulong kernel_local_mem; cl_ulong kernel_local_mem;
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE, clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
sizeof(cl_ulong), &kernel_local_mem, 0); sizeof(cl_ulong), &kernel_local_mem, 0);
@ -961,16 +965,16 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl; std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
std::cout << "Work groups: device limit " << max_group_size << ", " std::cout << "Work group size: max for device " << max_group_size << " > "
<< "kernel limit " << kernel_group_size << ", " << "max for kernel " << kernel_group_size << " > "
<< "required " << work_group_size << std::endl; << "required " << work_group_size << std::endl;
std::cout << "Local memory: device limit " << local_mem_size << std::endl; std::cout << "Local memory: device limit " << local_mem_size << std::endl;
std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
std::cout << std::endl << "Available extensions: " << ext << std::endl;
std::cout << "Available extensions: " << ext << std::endl;
std::cout << "End " << kernel_name << " check..." << std::endl << std::endl; std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;

View File

@ -1,16 +1,3 @@
/*
Name: OpenCLBase
Author: Uldis Locans
Info: OpenCL base class to handle all the common details associated
with kernel launch on OpenCL device
Date: 2014.09.18
*/
#ifndef H_OPENCL_BASE #ifndef H_OPENCL_BASE
#define H_OPENCL_BASE #define H_OPENCL_BASE
@ -32,7 +19,7 @@
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/* struct for random number state */ /** struct for random number state. */
typedef struct { typedef struct {
double s10; double s10;
double s11; double s11;
@ -44,168 +31,194 @@ typedef struct {
bool gen; bool gen;
} RNDState; } RNDState;
/**
* OpenCL base class to handle device setup and basic communication wiht the device.
* Handles initialization of OpenCL device, memory manegement, data transfer and kernel launch.
* The OpenCL kernels are located in seperate files in OpenCLKernels folder, the OpenCLBase
* class contains methods to read the kernel files, compile the kernel codes and launch kernels
* from the compiled codes. Which kernel file needs to be loaded for the specif functin is
* handled by the base class that is launching the kernel.
*/
class OpenCLBase { class OpenCLBase {
private: private:
//variables containig OpenCL device and platform ids
static cl_platform_id m_platform_id; static cl_platform_id m_platform_id;
static cl_device_id m_device_id; static cl_device_id m_device_id;
//variables containit compiled OpenCL program and kernel
cl_context_properties m_context_properties[3]; cl_context_properties m_context_properties[3];
cl_program m_program; cl_program m_program;
cl_kernel m_kernel; cl_kernel m_kernel;
//variables for tracking OpenCL events
static cl_event m_last_event; static cl_event m_last_event;
cl_int m_num_events; cl_int m_num_events;
std::vector<cl_event> m_events; std::vector<cl_event> m_events;
//currently load kernel file
char * m_kernel_file; char * m_kernel_file;
//type of device used by OpenCL
cl_device_type m_device_type; cl_device_type m_device_type;
/* /**
Name: getPlatforms * Get all available OpenCL platforms.
Info: get all avaialble platforms and save in m_platform_ids, save number of platforms * Get all avaialble platforms and save in m_platform_ids, save number of platforms
Return: success or error code * Return: success or error code
*/ */
int ocl_getPlatforms(); int ocl_getPlatforms();
/* /**
Name: getDevice * Get first available OpenCL device of specified type.
Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu) * Get first avaialble devices and save device id and platform id for this device,
ReturnL success or error code * device name: (-gpu, -mic, -cpu)
* ReturnL success or error code
*/ */
int ocl_getDevice(const char* device_name); int ocl_getDevice(const char* device_name);
/* /**
Name getDeviceType * Get cl_device_type from the specified device name.
Info: get device type from device name (-gpu, -cpu, -mic) * get device type from device name (-gpu, -cpu, -mic)
Return: success or error code * Return: success or error code
*/ */
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type); int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
/* /**
Name: createContext * Create OpenCL context with specified device.
Info: create context with specified device * Return: success or error code
Return: success or error code
*/ */
int ocl_createContext(); int ocl_createContext();
/* /**
Name: buildProgram * Build program from specified kernel file.
Info: build program from specified kernel file * Return: success or error code.
Return: success or error code
*/ */
int ocl_buildProgram(const char* kernel_file); int ocl_buildProgram(const char* kernel_file);
/** Compile program from kernel source string /**
* * Compile program from kernel source string.
* Takes a string read from OpenCL kernel file saved in kernel_source and compiles the
* OpenCL program, that can be then executed on the device.
* opts is a string specifiend additional compiler flags.
*/ */
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL); int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
protected: protected:
//memory for random number states
int defaultRndSet; int defaultRndSet;
cl_mem defaultRndState; cl_mem defaultRndState;
public: public:
//OpenCL context and commad queue
static cl_context m_context; static cl_context m_context;
static cl_command_queue m_command_queue; static cl_command_queue m_command_queue;
/* /**
constructor * constructor
*/ */
OpenCLBase(); OpenCLBase();
/* /**
destructor * destructor
*/ */
~OpenCLBase(); ~OpenCLBase();
/* /**
Create RND states * Allocate memory for size random number states and init the rnd states.
Return: success or error code * Uses AMD clRng library for random numbers.
* This library is only compatible with AMD devices.
*/ */
int ocl_createRndStates(int size); int ocl_createRndStates(int size);
/* Create an array of random numbers on the device /**
* * Create an array of random numbers on the device.
* Filles hte mem_ptr with random numbers.
*/ */
int ocl_createRandomNumbers(void *mem_ptr, int size); int ocl_createRandomNumbers(void *mem_ptr, int size);
/* /**
Destroy rnd states * Destroy rnd states and free device memory.
Return: success or error code * Return: success or error code
*/ */
int ocl_deleteRndStates(); int ocl_deleteRndStates();
/* /**
Name: getAllDevices * Prints info about all the available platforms and devices.
Info: get all available devices * Can be used for information purposes to see what devices are available on the system.
ReturnL success or error code * ReturnL success or error code.
*/ */
int ocl_getAllDevices(); int ocl_getAllDevices();
/** Get the OpenCL device count for the set type of device /**
* * Get the OpenCL device count for the set type of device.
* Device count is set in ndev parameter, returns success or error code.
*/ */
int ocl_getDeviceCount(int &ndev); int ocl_getDeviceCount(int &ndev);
/** Get the name of the device used /**
* Get the name of the device currently us use.
*/ */
int ocl_getDeviceName(std::string &device_name); int ocl_getDeviceName(std::string &device_name);
/** Set the device to use for OpenCL kernels. /**
* device id to use is passed as integer. * Set the device to use for OpenCL kernels.
* Device id to use is passed as integer.
*/ */
int ocl_setDevice(int device); int ocl_setDevice(int device);
/** Get a list of all the unique devices of the same type that can run OpenCL kernels /**
* Get a list of all the unique devices of the same type that can run OpenCL kernels.
* Used when GPUs of different types might be pressent on the system. * Used when GPUs of different types might be pressent on the system.
*/ */
int ocl_getUniqueDevices(std::vector<int> &devices); int ocl_getUniqueDevices(std::vector<int> &devices);
/* /**
Name: setUp * Initialize OpenCL connection with a device of specified type.
Info: set up opencl resources * Find if specified device is avaialble, creates a contex and command queue.
Return: success or error code * Returns success or error code.
*/ */
int ocl_setUp(const char* device_name); int ocl_setUp(const char* device_name);
/* /**
Name: loadKernel * Given a OpenCL kernel file name loads the content and compile the OpenCL code.
Info: load and compile opencl kernel file if it has changed * Load and compile opencl kernel file if it has changed.
Return: success or error code * Return: success or error code
*/ */
int ocl_loadKernel(const char* kernel_file); int ocl_loadKernel(const char* kernel_file);
/** Build program from kernel source. /**
* Build program from kernel source.
* Builds a program from source code provided in kernel_source. * Builds a program from source code provided in kernel_source.
* If compilation fails will return DKS_ERROR * If compilation fails will return DKS_ERROR
*/ */
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL); int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
/* /**
Name: allocateMemory * Allocate memory on the device.
Info: allocate memory on device * Return: return pointer to memory
Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int &ierr); cl_mem ocl_allocateMemory(size_t size, int &ierr);
/* /**
Name: allocateMemory * Allocate memory of specific type on device.
Info: allocate memory on device * The availabel types are cl_mem_flags type listed in OpenCL documentation:
Return: return pointer to memory * CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR,
* CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
* Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer /**
* Set all the elemetns in the device array to zero * Zero OpenCL memory buffer.
* Set all the elemetns in the device array to zero.
*/ */
template <typename T> template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) { int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
@ -218,91 +231,89 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/* /**
Name: writeData * Write data to device memory (needs ptr to mem object)
Info: write data to device memory (needs ptr to mem object) * Return: success or error code
Return: success or error code
*/ */
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/* /**
Name: copyData * Copy data from one buffer on the device to another
Info: copy data from one buffer on the device to another * Return: success or error code
Return: success or error code
*/ */
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size); int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
/* /**
Name: createKernel * Create kernel from compiled OpenCL program.
Info: create kernel from program * Return: success or error code
Return: success or error code
*/ */
int ocl_createKernel(const char* kernel_name); int ocl_createKernel(const char* kernel_name);
/* /**
Name: setKernelArgs * Set argiments for the kernel that will be launched.
Info: set opencl kernel arguments * Return: success or error code
Return: success or error code
*/ */
int ocl_setKernelArg(int idx, size_t size, const void *arg_value); int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
/* /**
Name: executeKernel * Execute selected kernel.
Info: execute selected kernel (needs kernel parameters) * Before kenrel can be executed buildProgram must be executed, create kernel must be executed
Return: success or error code * and kenre specifeid in execute kerenel must be in compiled source, and the necessary
* kernel arguments must be set.
* Return: success or error code
*/ */
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL); int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
/* /**
Name: readData * Read data from device (needs pointer to mem object).
Info: read data from device (needs pointer to mem object) * Return: success or error code
Return: success or error code
*/ */
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/* /**
Name: freeMemory * Free device memory (needs ptr to mem object).
Info: free device memory (needs ptr to mem object) * Return: success or error code
Return: success or error code
*/ */
int ocl_freeMemory(cl_mem mem_ptr); int ocl_freeMemory(cl_mem mem_ptr);
/* /**
Name: cleanUp * Free opencl resources.
Info: free opencl resources * Deletes the kernel, compiled program, command queue and colese the connection
Return: success or error code * to device by releasing the context.
* Return: success or error code
*/ */
int ocl_cleanUp(); int ocl_cleanUp();
/* /**
Name: deviceInfo * Print info of currently selected device.
Info: print device info (mostly for debugging purposes) * Mostly for debugging purposes, but in verbose mode can be used to see device properties.
Return: success or error code * Return: success or error code
*/ */
int ocl_deviceInfo(bool verbose = true); int ocl_deviceInfo(bool verbose = true);
/* Check OpenCL kernel. /*
* Query device and check if it can run the kernel with required parameters * Check OpenCL kernel.
* Query device and check if it can run the kernel with required parameters.
* Also check the available OpenCL extensions - usefull for checking the supported device
* features, like double precission.
*/ */
int ocl_checkKernel(const char* kernel_name, int work_group_size, int ocl_checkKernel(const char* kernel_name, int work_group_size,
bool double_precision, int &threadsPerBlock); bool double_precision, int &threadsPerBlock);
/* /**
Name: clearEvents * Clear the event list.
Info: clear saved events (for debuging purposes) * Events can be used for timing and synchronization purposes.
Return: nothing
*/ */
void ocl_clearEvents(); void ocl_clearEvents();
/* /**
Name: eventInfo * print information about kernel timings from event list.
Info: print information about kernel timings (for debuging purposes) * for debuging purposes
Return: nothing
*/ */
void ocl_eventInfo(); void ocl_eventInfo();
/* /**
Return current command queue * Return current command queue.
*/ */
cl_command_queue ocl_getQueue() { return m_command_queue; } cl_command_queue ocl_getQueue() { return m_command_queue; }
}; };

View File

@ -14,7 +14,7 @@
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
/** Deprecated, SimpleFit implementation of ChiSquare. */
class OpenCLChiSquare { class OpenCLChiSquare {
private: private:

View File

@ -79,7 +79,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
int ierr; int ierr;
//calc number of threads per workgroup and nr of work groups //calc number of threads per workgroup and nr of work groups
size_t work_size_sum = 128; size_t work_size_sum = (size_t)blockSize_m;
/* /*
size_t work_items = (size_t)length; size_t work_items = (size_t)length;
@ -98,7 +98,6 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr); tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
//execute sum kernel //execute sum kernel
//ocl_createKernel("parallelReductionSum");
m_oclbase->ocl_createKernel("parallelReductionTwoPhase"); m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr); m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
@ -141,6 +140,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
//set work item size //set work item size
size_t work_items; size_t work_items;
size_t work_size = (size_t)blockSize_m; size_t work_size = (size_t)blockSize_m;
if (numBlocks_m < 0) if (numBlocks_m < 0)
work_items = (size_t)length; work_items = (size_t)length;
else else
@ -226,6 +226,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
} }
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) { int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
//write params to gpu
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
return ierr; return ierr;
} }
@ -235,6 +236,7 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
if (numfunc == 0) if (numfunc == 0)
return DKS_SUCCESS; return DKS_SUCCESS;
//write function values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
return ierr; return ierr;
} }
@ -243,6 +245,7 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
if (nummap == 0) if (nummap == 0)
return DKS_SUCCESS; return DKS_SUCCESS;
//wrtie map values to the GPU
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap); int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
return ierr; return ierr;
} }
@ -257,7 +260,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
freeChiSquare(); freeChiSquare();
} }
//allocate temporary memory //allocate temporary memory, memory is allocated for the data set, parametrs, functions and maps
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr); mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr); mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
if (size_func == 0) if (size_func == 0)
@ -277,7 +280,7 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
int ierr = DKS_ERROR; int ierr = DKS_ERROR;
if (initDone_m) { if (initDone_m) {
//free memory //free GPU memory
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m); ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
@ -308,7 +311,12 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
return DKS_ERROR; return DKS_ERROR;
} }
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); //check the GPU kernel
ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
if (threadsPerBlock < blockSize_m) {
std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
blockSize_m = threadsPerBlock;
}
return ierr; return ierr;

View File

@ -17,44 +17,54 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
const std::string openclFunctFooter = "}\n"; const std::string openclFunctFooter = "}\n";
/**
* OpenCL implementation of ChiSquareRuntime class.
* Implements ChiSquareRuntime interface to allow musrfit to target devices that
* support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
*/
class OpenCLChiSquareRuntime : public ChiSquareRuntime { class OpenCLChiSquareRuntime : public ChiSquareRuntime {
private: private:
OpenCLBase *m_oclbase; OpenCLBase *m_oclbase;
/** Private function to add user defined function to kernel string /**
* * Private function to add user defined function to kernel string.
*/ */
std::string buildProgram(std::string function); std::string buildProgram(std::string function);
/**
* Launch parallel reduction kernel to calculate the sum of data array
*/
double calculateSum(cl_mem data, int length); double calculateSum(cl_mem data, int length);
public: public:
/** Constructor wiht openclbase argument /**
* * Constructor wiht openclbase argument.
*/ */
OpenCLChiSquareRuntime(OpenCLBase *base); OpenCLChiSquareRuntime(OpenCLBase *base);
/** Default constructor /**
* * Default constructor
*/ */
OpenCLChiSquareRuntime(); OpenCLChiSquareRuntime();
/** Default destructor /**
* * Default destructor
*/ */
~OpenCLChiSquareRuntime(); ~OpenCLChiSquareRuntime();
/** Compile program and save ptx. /**
* Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program * Add function string to the calcFunction kernel and compile the program
* Function must be valid C math expression. Parameters can be addressed in * Function must be valid C math expression. Parameters can be addressed in
* a form par[map[idx]] * a form par[map[idx]]
*/ */
int compileProgram(std::string function, bool mlh = false); int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel /**
* Launch selected kernel.
* Launched the selected kernel from the compiled code. * Launched the selected kernel from the compiled code.
* Result is put in &result variable * Result is put in &result variable
*/ */
@ -64,22 +74,26 @@ public:
double timeStart, double timeStep, double timeStart, double timeStep,
double &result); double &result);
/** Write params to device. /**
* Write params to device.
* Write params from double array to mem_param_m memory on the device. * Write params from double array to mem_param_m memory on the device.
*/ */
int writeParams(const double *params, int numparams); int writeParams(const double *params, int numparams);
/** Write functions to device. /**
* Write functions to device.
* Write function values from double array to mem_func_m memory on the device. * Write function values from double array to mem_func_m memory on the device.
*/ */
int writeFunc(const double *func, int numfunc); int writeFunc(const double *func, int numfunc);
/** Write maps to device. /**
* Write maps to device.
* Write map values from int array to mem_map_m memory on the device. * Write map values from int array to mem_map_m memory on the device.
*/ */
int writeMap(const int *map, int nummap); int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square. /**
* Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
* the maximum number of elements in any datasets that will be used for calculations. Size_param, * the maximum number of elements in any datasets that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in * size_func and size_map are the maximum number of parameters, functions and maps used in
@ -87,14 +101,16 @@ public:
*/ */
int initChiSquare(int size_data, int size_param, int size_func, int size_map); int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square. /**
* Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps * Frees the chisq temporary memory and memory for params, functions and maps
*/ */
int freeChiSquare(); int freeChiSquare();
/** Check MuSR kernels for necessary resources. /**
* Check MuSR kernels for necessary resources.
* Query device properties to get if sufficient resources are * Query device properties to get if sufficient resources are
* available to run the kernels * available to run the kernels. Also checks if double precission is enabled on the device.
*/ */
int checkChiSquareKernels(int fitType, int &threadsPerBlock); int checkChiSquareKernels(int fitType, int &threadsPerBlock);

View File

@ -17,12 +17,16 @@
#include "boost/compute/core.hpp" #include "boost/compute/core.hpp"
*/ */
/** Double3 structure for use in OpenCL code. */
typedef struct { typedef struct {
double x; double x;
double y; double y;
double z; double z;
} Double3; } Double3;
/**
* Structure for stroing particles in OpenCL code.
*/
typedef struct { typedef struct {
int label; int label;
unsigned localID; unsigned localID;
@ -35,6 +39,10 @@ typedef struct {
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z)); //BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol)); //BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
/**
* OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
* Implementes CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
*/
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics { class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
private: private:
@ -42,16 +50,20 @@ private:
public: public:
/* constructor */ /**
* Constructor with OpenCLBase as argument.
* Create a new instace of the OpenCLCollimatorPhysics using existing OpenCLBase object.
*/
OpenCLCollimatorPhysics(OpenCLBase *base) { OpenCLCollimatorPhysics(OpenCLBase *base) {
m_oclbase = base; m_oclbase = base;
} }
/* destructor */ /**
* Destructor.
*/
~OpenCLCollimatorPhysics() { ~OpenCLCollimatorPhysics() {
} }
/* execute degrader code on device */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherforScattering = true); bool enableRutherforScattering = true);

View File

@ -1,14 +1,3 @@
/*
Name: OpenCLFFT
Author: Uldis Locans
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
Data: 19.09.2014
*/
#ifndef H_OPENCL_FFT #ifndef H_OPENCL_FFT
#define H_OPENCL_FFT #define H_OPENCL_FFT
@ -22,7 +11,13 @@
#include "clFFT.h" #include "clFFT.h"
class OpenCLFFT : public DKSFFT { /**
* OpenCL FFT class based on BaseFFT interface.
* Uses clFFT library to perform FFTs on AMD gpus.
* clFFT library works also on nvida GPUs and other devices that
* support OpenCL.
*/
class OpenCLFFT : public BaseFFT {
private: private:
@ -112,8 +107,7 @@ public:
int streamId = -1); int streamId = -1);
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1); int streamId = -1);
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
{
return DKS_ERROR; return DKS_ERROR;
} }

View File

@ -7,6 +7,7 @@
#include "../Algorithms/GreensFunction.h" #include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h" #include "OpenCLBase.h"
/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
class OpenCLGreensFunction : public GreensFunction { class OpenCLGreensFunction : public GreensFunction {
private: private:
@ -31,7 +32,7 @@ public:
int buildProgram(); int buildProgram();
/** /**
Info: calc itegral on device memory (taken from OPAL src code) Info: calc itegral on device memory (taken from OPAL src code).
Return: success or error code Return: success or error code
*/ */
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
@ -39,20 +40,20 @@ public:
int streamId = -1); int streamId = -1);
/** /**
Info: integration of rho2_m field (taken from OPAL src code) Info: integration of rho2_m field (taken from OPAL src code).
Return: success or error code Return: success or error code
*/ */
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1); int streamId = -1);
/** /**
Info: mirror rho field (taken from OPAL src code) Info: mirror rho field (taken from OPAL src code).
Return: succes or error code Return: succes or error code
*/ */
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1); int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/** /**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1 Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
Return: success or error code Return: success or error code
*/ */
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);

View File

@ -5,6 +5,10 @@
#include <string> #include <string>
#include <sys/time.h> #include <sys/time.h>
/**
* Custom timer class.
* Allows to insert timers in the code to get function exectution times.
*/
class DKSTimer { class DKSTimer {
private: private:
@ -17,38 +21,44 @@ private:
public: public:
/** Init DKSTimer by seting timer to zero */ /** Init DKSTimer by seting timer to zero. */
DKSTimer(); DKSTimer();
~DKSTimer(); ~DKSTimer();
/** Init the timer /**
* Init the timer.
* Set the name for timer and clear all values * Set the name for timer and clear all values
*/ */
void init(std::string n); void init(std::string n);
/** Start the timer. /**
* Start the timer.
* Get the curret time with gettimeofday and save in timeStart * Get the curret time with gettimeofday and save in timeStart
*/ */
void start(); void start();
/** Stop the timer /**
* Stop the timer.
* Get the curretn time with gettimeofday and save in timeEnd * Get the curretn time with gettimeofday and save in timeEnd
* Calculate elapsed time by timeEnd - timeStart and add to timervalue * Calculate elapsed time by timeEnd - timeStart and add to timervalue
*/ */
void stop(); void stop();
/** Reset timervalue to zero. /**
* Reset timervalue to zero.
* Set timervalue, timeStart and timeEnd to zero * Set timervalue, timeStart and timeEnd to zero
*/ */
void reset(); void reset();
/** Return elapsed time in seconds. /**
* Return elapsed time in seconds.
* Return the value of timervalue * Return the value of timervalue
*/ */
double gettime(); double gettime();
/** Print timer. /**
* Print timer.
* Print the elapsed time of the timer * Print the elapsed time of the timer
*/ */
void print(); void print();

View File

@ -39,8 +39,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#TARGET_LINK_LIBRARIES(testFFT dks) #TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks) #TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks) #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks) #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks) #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -54,11 +54,11 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testGather dks) #TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks) #TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks) #TARGET_LINK_LIBRARIES(testTranspose dks)
TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testPush dks) #TARGET_LINK_LIBRARIES(testPush dks)
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testIntegration dks) #TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks) #TARGET_LINK_LIBRARIES(testImageReconstruction dks)