Compare commits
3 Commits
DKS-1.1.4
...
dks-1.0.0-
Author | SHA1 | Date | |
---|---|---|---|
24f394c693 | |||
8f00d2a593 | |||
1606b641d4 |
104
CMakeLists.txt
104
CMakeLists.txt
@ -1,8 +1,10 @@
|
|||||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
|
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
|
||||||
PROJECT (DKS)
|
PROJECT (DKS)
|
||||||
SET (DKS_VERSION_MAJOR 1)
|
SET (DKS_VERSION_MAJOR 1)
|
||||||
SET (DKS_VERSION_MINOR 1)
|
SET (DKS_VERSION_MINOR 0)
|
||||||
SET (DKS_VERSION_PATCH 4)
|
SET (DKS_VERSION_PATCH 2)
|
||||||
|
SET (PACKAGE \"dks\")
|
||||||
|
|
||||||
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
|
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
|
||||||
SET (PACKAGE \"dks\")
|
SET (PACKAGE \"dks\")
|
||||||
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
|
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
|
||||||
@ -10,9 +12,6 @@ SET (PACKAGE_NAME \"DKS\")
|
|||||||
SET (PACKAGE_TARNAME \"dks\")
|
SET (PACKAGE_TARNAME \"dks\")
|
||||||
SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
|
SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
|
||||||
SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
||||||
if (APPLE)
|
|
||||||
SET (CMAKE_MACOSX_RPATH TRUE)
|
|
||||||
endif (APPLE)
|
|
||||||
|
|
||||||
#get compiler name
|
#get compiler name
|
||||||
#STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
|
#STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER})
|
||||||
@ -31,60 +30,24 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
|
|||||||
set (BOOSTROOT $ENV{BOOST_DIR})
|
set (BOOSTROOT $ENV{BOOST_DIR})
|
||||||
SET (Boost_USE_STATIC_LIBS OFF)
|
SET (Boost_USE_STATIC_LIBS OFF)
|
||||||
SET (Boost_USE_STATIC_RUNTIME OFF)
|
SET (Boost_USE_STATIC_RUNTIME OFF)
|
||||||
#FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
|
FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
|
||||||
FIND_PACKAGE(Boost 1.41 REQUIRED)
|
|
||||||
IF (Boost_FOUND)
|
IF (Boost_FOUND)
|
||||||
MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
|
|
||||||
MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
|
MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
|
||||||
MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
|
MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
|
||||||
#MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
|
MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
|
||||||
INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
|
INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
|
||||||
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
||||||
ENDIF (Boost_FOUND)
|
ENDIF (Boost_FOUND)
|
||||||
|
|
||||||
#include OPAL, musrfit or pet kernels
|
|
||||||
OPTION(DKS_FULL "Compile DKS with full library" OFF)
|
|
||||||
OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
|
|
||||||
OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
|
|
||||||
OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
|
|
||||||
|
|
||||||
IF (DKS_FULL)
|
|
||||||
SET(ENABLE_OPAL ON)
|
|
||||||
SET(ENABLE_MUSR ON)
|
|
||||||
SET(ENABLE_PET ON)
|
|
||||||
ENDIF(DKS_FULL)
|
|
||||||
|
|
||||||
#find clFFT
|
|
||||||
OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
|
|
||||||
IF (ENABLE_AMD)
|
|
||||||
SET (clFFT_USE_STATIC_LIBS OFF)
|
|
||||||
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
|
|
||||||
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
|
|
||||||
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
|
|
||||||
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
|
|
||||||
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
#find clRNG
|
|
||||||
#SET (clRNG_USE_STATIC_LIBS OFF)
|
|
||||||
#FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
|
|
||||||
#MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
|
|
||||||
#MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
|
|
||||||
#INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
|
|
||||||
#LINK_DIRECTORIES (${CLRNG_LIBRARIES})
|
|
||||||
#find_package(PkgConfig)
|
|
||||||
#pkg_check_modules(clRng REQUIRED)
|
|
||||||
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
|
|
||||||
ENDIF (ENABLE_AMD)
|
|
||||||
|
|
||||||
#enable UQTK
|
#enable UQTK
|
||||||
OPTION (USE_UQTK "Use UQTK" OFF)
|
OPTION (USE_UQTK "Use UQTK" OFF)
|
||||||
|
|
||||||
|
|
||||||
#intel icpc compiler specific flags
|
#intel icpc compiler specific flags
|
||||||
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
||||||
|
|
||||||
#for intel compiler turn on openmp and opencl
|
#for intel compiler turn on openmp and opencl
|
||||||
OPTION (USE_OPENCL "Use OpenCL" OFF)
|
OPTION (USE_OPENCL "Use OpenCL" ON)
|
||||||
OPTION (USE_CUDA "Use CUDA" OFF)
|
OPTION (USE_CUDA "Use CUDA" OFF)
|
||||||
OPTION (USE_MIC "Use intel MIC" ON)
|
OPTION (USE_MIC "Use intel MIC" ON)
|
||||||
|
|
||||||
@ -115,30 +78,18 @@ IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
|||||||
ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
||||||
|
|
||||||
#gnu compiler specific flags
|
#gnu compiler specific flags
|
||||||
IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
|
IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
|
||||||
|
|
||||||
|
|
||||||
OPTION (USE_OPENCL "Use OpenCL" OFF)
|
OPTION (USE_OPENCL "Use OpenCL" ON)
|
||||||
OPTION (USE_CUDA "Use CUDA" OFF)
|
OPTION (USE_CUDA "Use CUDA" OFF)
|
||||||
OPTION (USE_MIC "Use intel MIC" OFF)
|
OPTION (USE_MIC "Use intel MIC" OFF)
|
||||||
OPTION (STATIC_CUDA "Link static cuda libraries" OFF)
|
|
||||||
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
|
||||||
IF (ENABLE_MUSR)
|
|
||||||
SET (USE_OPENCL ON)
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
#dont set openmp flag for apple devices
|
|
||||||
IF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
|
|
||||||
ELSE ($CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -std=c++11 -D__wsu")
|
|
||||||
ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
|
||||||
|
|
||||||
|
|
||||||
FIND_PACKAGE(CUDA)
|
FIND_PACKAGE(CUDA)
|
||||||
IF (CUDA_FOUND)
|
IF (CUDA_FOUND)
|
||||||
SET (USE_CUDA ON)
|
SET (USE_CUDA ON)
|
||||||
OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
|
|
||||||
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
|
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
|
||||||
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
|
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
|
||||||
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
|
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
|
||||||
@ -148,27 +99,20 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
|
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
|
||||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||||
|
|
||||||
SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
|
||||||
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
|
SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
|
||||||
|
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -DDEBUG -std=c++11 -D__wsu")
|
||||||
IF (NOT STATIC_CUDA)
|
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
|
|
||||||
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
|
|
||||||
ELSE (NOT STATIC_CUDA)
|
|
||||||
SET (CUDA_SEPARABLE_COMPILATION ON)
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
|
|
||||||
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
|
|
||||||
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
|
|
||||||
ENDIF (NOT STATIC_CUDA)
|
|
||||||
|
|
||||||
#if cuda version >= 7.0 add runtime compilation flags
|
#if cuda version >= 7.0 add runtime compilation flags
|
||||||
IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
|
IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
|
||||||
ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
|
ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
|
|
||||||
MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
|
MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
|
||||||
|
|
||||||
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
||||||
|
#set(CUDA_SEPARABLE_COMPILATION ON)
|
||||||
SET(BUILD_SHARED_LIBS OFF)
|
SET(BUILD_SHARED_LIBS OFF)
|
||||||
|
|
||||||
ENDIF (CUDA_FOUND)
|
ENDIF (CUDA_FOUND)
|
||||||
@ -178,9 +122,6 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
MESSAGE(STATUS "CUDA not found, looking for OpenCL")
|
MESSAGE(STATUS "CUDA not found, looking for OpenCL")
|
||||||
|
|
||||||
FIND_PACKAGE(OpenCL)
|
FIND_PACKAGE(OpenCL)
|
||||||
MESSAGE("after FIND_PACKAGE(OpenCL): version: ${OpenCL_VERSION_STRING}")
|
|
||||||
MESSAGE("after FIND_PACKAGE(OpenCL): inc dir: ${OpenCL_INCLUDE_DIR}")
|
|
||||||
MESSAGE("after FIND_PACKAGE(OpenCL): lib dir: ${OpenCL_LIBRARY}")
|
|
||||||
IF (OpenCL_FOUND)
|
IF (OpenCL_FOUND)
|
||||||
MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
|
MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
|
||||||
MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
|
MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
|
||||||
@ -198,9 +139,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
ENDIF(APPLE AND NOT CUDA_FOUND)
|
ENDIF(APPLE AND NOT CUDA_FOUND)
|
||||||
|
|
||||||
#if cuda found set cuda opencl flags
|
#if cuda found set cuda opencl flags
|
||||||
IF (CUDA_FOUND AND USE_OPENCL)
|
IF (CUDA_FOUND)
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
|
||||||
ENDIF (CUDA_FOUND AND USE_OPENCL)
|
ENDIF (CUDA_FOUND)
|
||||||
|
|
||||||
#if cuda not found but amd opencl found set opencl flags
|
#if cuda not found but amd opencl found set opencl flags
|
||||||
IF (NOT CUDA_FOUND AND OpenCL_FOUND)
|
IF (NOT CUDA_FOUND AND OpenCL_FOUND)
|
||||||
@ -212,7 +153,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
|
||||||
ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
|
ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
|
||||||
|
|
||||||
ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") AND NOT USE_INTEL)
|
ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
|
||||||
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
|
||||||
MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
|
MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")
|
||||||
@ -244,3 +185,4 @@ INSTALL (
|
|||||||
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
|
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
|
||||||
RENAME ${PROJECT_NAME}ConfigVersion.cmake
|
RENAME ${PROJECT_NAME}ConfigVersion.cmake
|
||||||
)
|
)
|
||||||
|
|
||||||
|
36
ReadMe.first
36
ReadMe.first
@ -1,7 +1,7 @@
|
|||||||
##################################################################
|
##################################################################
|
||||||
#
|
#
|
||||||
# Name: Dynamic Kernel Scheduler
|
# Name: Dynamic Kernel Scheduler
|
||||||
# Version: 1.1
|
# Version: 1.0
|
||||||
# Author: Uldis Locans
|
# Author: Uldis Locans
|
||||||
# Contacts: locans.uldis@psi.ch
|
# Contacts: locans.uldis@psi.ch
|
||||||
#
|
#
|
||||||
@ -29,30 +29,30 @@ Intel MIC compilers (optional)
|
|||||||
######Source######
|
######Source######
|
||||||
https://gitlab.psi.ch/uldis_l/DKS
|
https://gitlab.psi.ch/uldis_l/DKS
|
||||||
|
|
||||||
######Changes from DKS-1.0.x version######
|
|
||||||
DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
|
|
||||||
By default only DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary option needs to be enabled.
|
|
||||||
Supported options are:
|
|
||||||
-DENABLE_OPAL option should be enabled if DKS will be used for OPAL
|
|
||||||
-DENABLE_MUSR option should be enabled if DKS will be used for musrfit
|
|
||||||
-DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
|
|
||||||
|
|
||||||
See install instructions for more details on how to enable the necessary options in DKS
|
|
||||||
|
|
||||||
######Install######
|
######Install######
|
||||||
#consult the https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install instructions
|
|
||||||
|
|
||||||
#clone DKS
|
#clone DKS
|
||||||
git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
|
git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
|
||||||
|
|
||||||
#switch to the desired version (OPTIONAL)
|
#set compilers to use
|
||||||
git checkout DKS-1.1.0
|
#supported c++ compilers: g++, icpc, mpicxx with g++
|
||||||
|
#supported c compilers: gcc, icc, mpicc with gcc
|
||||||
|
export CXX_COMPILER=cpp_compiler_name
|
||||||
|
export CC_COMPILER=c_compiler_name
|
||||||
|
|
||||||
#configure installation in build directory
|
#set dks root directory
|
||||||
#enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
|
cd DKS
|
||||||
CXX=<c++ compiler> CC=<c compiler> cmake -DCMAKE_INSTALL_PREFIX=<install dir> <path to DKS source> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1]
|
export DKS_ROOT=$PWD
|
||||||
|
|
||||||
|
#set build directory
|
||||||
|
mkdir $DKS_BUILD_DIR
|
||||||
|
cd $DKS_BUILD_DIR
|
||||||
|
|
||||||
|
#set install directory
|
||||||
|
export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/
|
||||||
|
|
||||||
|
CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
|
||||||
|
|
||||||
#install DKS
|
|
||||||
make
|
make
|
||||||
make install
|
make install
|
||||||
|
|
||||||
|
@ -2,32 +2,18 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
||||||
|
|
||||||
#chi square kernel tests
|
#chi square kernel tests
|
||||||
IF (ENABLE_MUSR)
|
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
||||||
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
|
||||||
|
|
||||||
IF (USE_UQTK)
|
|
||||||
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
|
||||||
ENDIF (USE_UQTK)
|
|
||||||
|
|
||||||
#test to verify search functions
|
|
||||||
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
|
||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testPushKick testPushKick.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
|
|
||||||
ENDIF(ENABLE_OPAL)
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testFFT testFFT.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
|
IF (USE_UQTK)
|
||||||
|
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
||||||
|
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
ENDIF (USE_UQTK)
|
||||||
|
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
|
||||||
|
|
||||||
|
#test to verify search functions
|
||||||
|
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
||||||
|
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
|
||||||
|
@ -292,9 +292,6 @@ int runTest(const char *api_name, const char *device_name, bool autotune, bool m
|
|||||||
//set autotuning on/off
|
//set autotuning on/off
|
||||||
if (autotune)
|
if (autotune)
|
||||||
dksbase.setAutoTuningOn();
|
dksbase.setAutoTuningOn();
|
||||||
|
|
||||||
//check kernel
|
|
||||||
dksbase.checkMuSRKernels(1);
|
|
||||||
|
|
||||||
//tmp values to store results and tmp values for time steps and start time
|
//tmp values to store results and tmp values for time steps and start time
|
||||||
double result_gpu = 0.0;
|
double result_gpu = 0.0;
|
||||||
@ -376,11 +373,11 @@ int main(int argc, char* argv[]) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int numPlatforms = 3;
|
int numPlatforms = 2;
|
||||||
const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
|
const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
|
||||||
const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
|
const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};
|
||||||
|
|
||||||
for (int i = 2; i < numPlatforms; i++) {
|
for (int i = 0; i < numPlatforms; i++) {
|
||||||
runTest(api[i], device[i], autotune, mlh, asym);
|
runTest(api[i], device[i], autotune, mlh, asym);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -392,10 +392,7 @@ int main(int argc, char *argv[]) {
|
|||||||
dksbase.setAPI(api_name);
|
dksbase.setAPI(api_name);
|
||||||
dksbase.setDevice(device_name);
|
dksbase.setDevice(device_name);
|
||||||
|
|
||||||
std::cout << "Init device" << std::endl;
|
|
||||||
dksbase.initDevice();
|
dksbase.initDevice();
|
||||||
|
|
||||||
std::cout << "Init chi square" << std::endl;
|
|
||||||
dksbase.initChiSquare(Ndata, np, nf, nm);
|
dksbase.initChiSquare(Ndata, np, nf, nm);
|
||||||
|
|
||||||
dksbase.writeParams(p, np);
|
dksbase.writeParams(p, np);
|
||||||
@ -404,24 +401,20 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
dksbase.callSetConsts(N0, TAU, BKG);
|
dksbase.callSetConsts(N0, TAU, BKG);
|
||||||
|
|
||||||
std::cout << "Compile program" << std::endl;
|
|
||||||
dksbase.callCompileProgram(sfunc);
|
dksbase.callCompileProgram(sfunc);
|
||||||
|
|
||||||
dksbase.checkMuSRKernels(1);
|
|
||||||
|
|
||||||
if (autotune)
|
if (autotune)
|
||||||
dksbase.setAutoTuningOn();
|
dksbase.setAutoTuningOn();
|
||||||
|
|
||||||
//std::cout << "Get operations" << std::endl;
|
int oper = 0;
|
||||||
//int oper = 0;
|
dksbase.getOperations(oper);
|
||||||
//dksbase.getOperations(oper);
|
|
||||||
|
|
||||||
cout << "=========================BEGIN TEST=========================" << endl;
|
cout << "=========================BEGIN TEST=========================" << endl;
|
||||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
||||||
cout << "Number of params: " << np << endl;
|
cout << "Number of params: " << np << endl;
|
||||||
cout << "Number of maps: " << nm << endl;
|
cout << "Number of maps: " << nm << endl;
|
||||||
cout << "Number of predefined functions: " << nfunc << endl;
|
cout << "Number of predefined functions: " << nfunc << endl;
|
||||||
//cout << "Number of ptx instructions: " << oper << endl;
|
cout << "Number of ptx instructions: " << oper << endl;
|
||||||
cout << "------------------------------------------------------------" << endl;
|
cout << "------------------------------------------------------------" << endl;
|
||||||
cout << sfunc << endl;
|
cout << sfunc << endl;
|
||||||
cout << "------------------------------------------------------------" << endl;
|
cout << "------------------------------------------------------------" << endl;
|
||||||
|
@ -1,161 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int label;
|
|
||||||
unsigned localID;
|
|
||||||
double Rincol[3];
|
|
||||||
double Pincol[3];
|
|
||||||
} PART;
|
|
||||||
|
|
||||||
PART initPartSmall(int d) {
|
|
||||||
|
|
||||||
PART p;
|
|
||||||
p.label = 0;
|
|
||||||
p.localID = d;
|
|
||||||
|
|
||||||
p.Rincol[0] = 0.0;
|
|
||||||
p.Rincol[1] = 0.0;
|
|
||||||
p.Rincol[2] = 0.02;
|
|
||||||
|
|
||||||
p.Pincol[0] = 0.0;
|
|
||||||
p.Pincol[1] = 0.0;
|
|
||||||
p.Pincol[2] = 3.9920183237269791e-01;
|
|
||||||
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
void printPart(PART p) {
|
|
||||||
std::cout << "label: " << p.label << ", ";
|
|
||||||
std::cout << "localid: " << p.localID << ",";
|
|
||||||
std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
|
|
||||||
std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void initParts(PART *p, int N) {
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
p[i] = initPartSmall(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
void printParts(PART *p, int N) {
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
printPart(p[i]);
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void initParams(double *data) {
|
|
||||||
data[0] = 0.0;//2.0000000000000000e-02;
|
|
||||||
data[1] = 1.0;//1.0000000000000000e-02;
|
|
||||||
data[2] = 2.2100000000000000e+00;
|
|
||||||
data[3] = 6.0000000000000000e+00;
|
|
||||||
data[4] = 1.2010700000000000e+01;
|
|
||||||
data[5] = 2.6010000000000000e+00;
|
|
||||||
data[6] = 1.7010000000000000e+03;
|
|
||||||
data[7] = 1.2790000000000000e+03;
|
|
||||||
data[8] = 1.6379999999999999e-02;
|
|
||||||
data[9] = 1.9321266968325795e-01;
|
|
||||||
data[10] = 7.9000000000000000e+01;
|
|
||||||
data[11] = 1.0000000000000002e-12;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int loop = 10;
|
|
||||||
int numpart = 1e5;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-npart")) {
|
|
||||||
numpart = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-loop")) {
|
|
||||||
loop = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "=========================BEGIN TEST=========================" << std::endl;
|
|
||||||
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
|
|
||||||
std::cout << "Number of particles: " << numpart << std::endl;
|
|
||||||
std::cout << "Number of loops: " << loop << std::endl;
|
|
||||||
std::cout << "------------------------------------------------------------" << std::endl;
|
|
||||||
|
|
||||||
//init part vector to test mc
|
|
||||||
PART *parts = new PART[numpart];
|
|
||||||
initParts(parts, numpart);
|
|
||||||
|
|
||||||
double *params = new double[12];
|
|
||||||
initParams(params);
|
|
||||||
|
|
||||||
//init dks
|
|
||||||
int ierr;
|
|
||||||
DKSOPAL base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(api_name));
|
|
||||||
ierr = base.initDevice();
|
|
||||||
if (ierr != DKS_SUCCESS)
|
|
||||||
std::cout << "Error with init device!" << std::endl;
|
|
||||||
|
|
||||||
//init random
|
|
||||||
base.callInitRandoms(numpart);
|
|
||||||
|
|
||||||
//**test collimator physics and sort***//
|
|
||||||
void *part_ptr, *param_ptr;
|
|
||||||
|
|
||||||
//allocate memory for particles
|
|
||||||
part_ptr = base.allocateMemory<PART>(numpart, ierr);
|
|
||||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
|
||||||
|
|
||||||
//transfer data to device
|
|
||||||
base.writeData<PART>(part_ptr, parts, numpart);
|
|
||||||
base.writeData<double>(param_ptr, params, 12);
|
|
||||||
|
|
||||||
int numaddback;
|
|
||||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
|
||||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
|
||||||
base.syncDevice();
|
|
||||||
|
|
||||||
//read data from device
|
|
||||||
base.readData<PART>(part_ptr, parts, numpart);
|
|
||||||
|
|
||||||
//free memory
|
|
||||||
base.freeMemory<PART>(part_ptr, numpart);
|
|
||||||
base.freeMemory<double>(param_ptr, 12);
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
std::cout << parts[i].label << "\t"
|
|
||||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
|
||||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
|
||||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
|
||||||
<< std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++) {
|
|
||||||
std::cout << parts[i].label << "\t"
|
|
||||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
|
||||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
|
||||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
|
||||||
<< std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,214 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <complex>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
|
|
||||||
void compareData(double* data1, double *data2, int N, int dim);
|
|
||||||
|
|
||||||
void initData(complex<double> *data, int dimsize[3], int dim);
|
|
||||||
void initData(double *data, int dimsize[3], int dim);
|
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
|
|
||||||
char *api_name, char *device_name);
|
|
||||||
|
|
||||||
void printHelp();
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int ierr;
|
|
||||||
int N1 = 8;
|
|
||||||
int N2 = 8;
|
|
||||||
int N3 = 8;
|
|
||||||
int dim = 3;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
|
||||||
|
|
||||||
int dimsize[3] = {N1, N2, N3};
|
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
|
||||||
|
|
||||||
double *rdata = new double[sizereal];
|
|
||||||
double *ordata = new double[sizereal];
|
|
||||||
complex<double> *cdata = new complex<double>[sizereal];
|
|
||||||
complex<double> *codata = new complex<double>[sizereal];
|
|
||||||
|
|
||||||
initData(rdata, dimsize, 3);
|
|
||||||
initData(cdata, dimsize, 3);
|
|
||||||
|
|
||||||
/* init DKSBase */
|
|
||||||
cout << "Init device and set function" << endl;
|
|
||||||
DKSFFT base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(device_name));
|
|
||||||
cout << "init device" << endl;
|
|
||||||
base.initDevice();
|
|
||||||
cout << "setup fft" << endl;
|
|
||||||
base.setupFFT(dim, dimsize);
|
|
||||||
|
|
||||||
//Test RC FFT -> CR FFT
|
|
||||||
void *real_ptr, *comp_ptr, *res_ptr;
|
|
||||||
cout << "allocate memory" << endl;
|
|
||||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
|
||||||
res_ptr = base.allocateMemory<double>(sizereal, ierr);
|
|
||||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
|
||||||
|
|
||||||
cout << "write data" << endl;
|
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
|
||||||
|
|
||||||
cout << "perform fft" << endl;
|
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
|
||||||
base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
|
|
||||||
base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
|
|
||||||
|
|
||||||
cout << "read data" << endl;
|
|
||||||
base.readData<double>(res_ptr, ordata, sizereal);
|
|
||||||
|
|
||||||
compareData(rdata, ordata, N1, 3);
|
|
||||||
|
|
||||||
base.freeMemory<double>(real_ptr, sizereal);
|
|
||||||
base.freeMemory<double>(res_ptr, sizereal);
|
|
||||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
|
||||||
|
|
||||||
//Test CC FFT
|
|
||||||
void *mem_ptr;
|
|
||||||
mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
|
|
||||||
base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
|
|
||||||
base.callFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.callIFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.readData< complex<double> >(mem_ptr, codata, sizereal);
|
|
||||||
|
|
||||||
compareData(cdata, codata, N1, 3);
|
|
||||||
|
|
||||||
base.freeMemory< complex<double> > (mem_ptr, sizereal);
|
|
||||||
|
|
||||||
delete[] rdata;
|
|
||||||
delete[] ordata;
|
|
||||||
delete[] cdata;
|
|
||||||
delete[] codata;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim) {
|
|
||||||
int ni, nj, nk, id;
|
|
||||||
ni = (dim > 2) ? N : 1;
|
|
||||||
nj = (dim > 1) ? N : 1;
|
|
||||||
nk = N;
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < ni; i++) {
|
|
||||||
for (int j = 0; j < nj; j++) {
|
|
||||||
for (int k = 0; k < nk; k++) {
|
|
||||||
id = i*ni*ni + j*nj + k;
|
|
||||||
sum += fabs(data1[id].real() - data2[id].real());
|
|
||||||
sum += fabs(data1[id].imag() - data2[id].imag());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void compareData(double* data1, double* data2, int N, int dim) {
|
|
||||||
int ni, nj, nk, id;
|
|
||||||
ni = (dim > 2) ? N : 1;
|
|
||||||
nj = (dim > 1) ? N : 1;
|
|
||||||
nk = N;
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < ni; i++) {
|
|
||||||
for (int j = 0; j < nj; j++) {
|
|
||||||
for (int k = 0; k < nk; k++) {
|
|
||||||
id = i*ni*ni + j*nj + k;
|
|
||||||
sum += fabs(data1[id] - data2[id]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << "Size " << N << " RC <--> CR diff: " << sum << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Fill a complex grid with sin(k) + 0i, where k is the index along the fastest axis.
 *
 * dimsize[0] is the extent of the fastest axis, dimsize[1] and dimsize[2] the
 * extents of the slower ones. Only the first `dim` extents are used when dim is
 * 2 or 3; any other value of dim fills a single line of dimsize[0] elements.
 */
void initData(complex<double> *data, int dimsize[3], int dim) {
  // Axes the requested dimensionality does not cover collapse to extent 1.
  const int nz = (dim == 3) ? dimsize[2] : 1;
  const int ny = (dim == 3 || dim == 2) ? dimsize[1] : 1;
  const int nx = dimsize[0];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      const int rowStart = (z*ny + y)*nx;
      for (int x = 0; x < nx; ++x)
        data[rowStart + x] = complex<double>(sin(x), 0.0);
    }
  }
}
|
|
||||||
|
|
||||||
/**
 * Fill a real grid with sin(k), where k is the index along the fastest axis.
 *
 * dimsize[0] is the extent of the fastest axis, dimsize[1] and dimsize[2] the
 * extents of the slower ones. Only the first `dim` extents are used when dim is
 * 2 or 3; any other value of dim fills a single line of dimsize[0] elements.
 */
void initData(double *data, int dimsize[3], int dim) {
  // Axes the requested dimensionality does not cover collapse to extent 1.
  const int nz = (dim == 3) ? dimsize[2] : 1;
  const int ny = (dim == 3 || dim == 2) ? dimsize[1] : 1;
  const int nx = dimsize[0];

  for (int z = 0; z < nz; ++z) {
    for (int y = 0; y < ny; ++y) {
      const int rowStart = (z*ny + y)*nx;
      for (int x = 0; x < nx; ++x)
        data[rowStart + x] = sin(x);
    }
  }
}
|
|
||||||
|
|
||||||
/**
 * Parse the command line options of the FFT test.
 *
 * Recognised options:
 *   -dim <d>              number of dimensions, stored in dim
 *   -grid <n1> <n2> <n3>  grid size, stored in N1, N2 and N3
 *   -cuda                 api_name = "Cuda",   device_name = "-gpu"
 *   -opencl               api_name = "OpenCL", device_name = "-gpu"
 *   -mic                  api_name = "OpenMP", device_name = "-mic"
 *   -cpu                  api_name = "OpenCL", device_name = "-cpu"
 *
 * Outputs that are not named on the command line keep the values the caller
 * put in them. api_name and device_name must point to buffers large enough for
 * the strings listed above (at least 7 bytes). Unknown arguments are ignored.
 *
 * An option whose values are missing (for example a trailing "-dim") is
 * reported on stderr and parsing stops; none of its outputs are modified.
 * Tokens consumed as option values are not interpreted as options themselves.
 *
 * @return always false.
 */
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
                char *api_name, char *device_name)
{
  for (int i = 1; i < argc; i++) {
    const std::string arg(argv[i]);

    if (arg == "-dim") {
      // the value must exist: argv[argc] is a null pointer, not a number
      if (i + 1 >= argc) {
        std::cerr << "Missing value for -dim" << std::endl;
        break;
      }
      dim = atoi(argv[i + 1]);
      i++;
    } else if (arg == "-grid") {
      if (i + 3 >= argc) {
        std::cerr << "Missing values for -grid (expected 3)" << std::endl;
        break;
      }
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
      N3 = atoi(argv[i + 3]);
      i += 3;
    } else if (arg == "-cuda") {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
    } else if (arg == "-opencl") {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
    } else if (arg == "-mic") {
      strcpy(api_name, "OpenMP");
      strcpy(device_name, "-mic");
    } else if (arg == "-cpu") {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
    }
  }

  return false;
}
|
|
||||||
|
|
@ -1,132 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
#include <vector_types.h>
|
|
||||||
#include "cuda_runtime.h"
|
|
||||||
|
|
||||||
/**
 * Fill the x, y and z components of the first N elements of data with
 * pseudo-random values in [0, 1], drawn from rand() in x, y, z order.
 */
void initData(double3 *data, int N) {
  int idx = 0;
  while (idx < N) {
    double3 &v = data[idx];
    v.x = (double)rand() / RAND_MAX;
    v.y = (double)rand() / RAND_MAX;
    v.z = (double)rand() / RAND_MAX;
    ++idx;
  }
}
|
|
||||||
|
|
||||||
/** Set the first N entries of data to the fixed time step 0.00001. */
void initDt(double *data, int N) {
  const double step = 0.00001;
  int idx = 0;
  while (idx < N) {
    data[idx] = step;
    ++idx;
  }
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int loop = 10;
|
|
||||||
int numpart = 1e5;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-npart")) {
|
|
||||||
numpart = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-loop")) {
|
|
||||||
loop = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "=========================BEGIN TEST=========================" << std::endl;
|
|
||||||
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
|
|
||||||
std::cout << "Number of particles: " << numpart << std::endl;
|
|
||||||
std::cout << "Number of loops: " << loop << std::endl;
|
|
||||||
std::cout << "------------------------------------------------------------" << std::endl;
|
|
||||||
|
|
||||||
int ierr;
|
|
||||||
DKSOPAL dksbase;
|
|
||||||
dksbase.setAPI(api_name, strlen(api_name));
|
|
||||||
dksbase.setDevice(device_name, strlen(api_name));
|
|
||||||
ierr = dksbase.initDevice();
|
|
||||||
if (ierr != DKS_SUCCESS)
|
|
||||||
std::cout << "Error with init device!" << std::endl;
|
|
||||||
|
|
||||||
double3 *R = new double3[numpart];
|
|
||||||
double3 *P = new double3[numpart];
|
|
||||||
double3 *Ef = new double3[numpart];
|
|
||||||
double3 *Bf = new double3[numpart];
|
|
||||||
double *dt = new double[numpart];
|
|
||||||
|
|
||||||
initData(R, numpart);
|
|
||||||
initData(P, numpart);
|
|
||||||
initData(Ef, numpart);
|
|
||||||
initData(Bf, numpart);
|
|
||||||
initDt(dt, numpart);
|
|
||||||
|
|
||||||
void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
|
|
||||||
|
|
||||||
r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
|
|
||||||
|
|
||||||
|
|
||||||
dksbase.writeData<double3>(r_ptr, R, numpart);
|
|
||||||
dksbase.writeData<double3>(p_ptr, P, numpart);
|
|
||||||
dksbase.writeData<double3>(ef_ptr, Ef, numpart);
|
|
||||||
dksbase.writeData<double3>(bf_ptr, Bf, numpart);
|
|
||||||
dksbase.writeData<double>(dt_ptr, dt, numpart);
|
|
||||||
|
|
||||||
for (int i = 0; i < loop; ++i)
|
|
||||||
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
|
|
||||||
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std::cout << "============" << std::endl;
|
|
||||||
|
|
||||||
dksbase.readData<double3>(r_ptr, R, numpart);
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
dksbase.freeMemory<double3>(r_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(p_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(ef_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(bf_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double>(dt_ptr, numpart);
|
|
||||||
|
|
||||||
delete[] R;
|
|
||||||
delete[] P;
|
|
||||||
delete[] Ef;
|
|
||||||
delete[] Bf;
|
|
||||||
delete[] dt;
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -3,7 +3,5 @@ SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
|
|||||||
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
|
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
|
||||||
SET(${PROJECT_NAME}_LIBRARY "dks")
|
SET(${PROJECT_NAME}_LIBRARY "dks")
|
||||||
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
|
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
|
||||||
SET(DKS_CUDA_STATIC ${STATIC_CUDA})
|
|
||||||
SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
|
|
||||||
SET(DKS_VERSION ${DKS_VERSION})
|
SET(DKS_VERSION ${DKS_VERSION})
|
||||||
SET(DKS_VERSION_STR ${DKS_VERSION_STR})
|
SET(DKS_VERSION_STR ${DKS_VERSION_STR})
|
BIN
doc/refman.pdf
BIN
doc/refman.pdf
Binary file not shown.
@ -6,7 +6,6 @@ SET (_HDRS
|
|||||||
ImageReconstruction.h
|
ImageReconstruction.h
|
||||||
CollimatorPhysics.h
|
CollimatorPhysics.h
|
||||||
FFT.h
|
FFT.h
|
||||||
GreensFunction.h
|
|
||||||
)
|
)
|
||||||
|
|
||||||
ADD_SOURCES (${_SRCS})
|
ADD_SOURCES (${_SRCS})
|
||||||
|
@ -15,9 +15,6 @@
|
|||||||
|
|
||||||
class DKSBaseMuSR;
|
class DKSBaseMuSR;
|
||||||
|
|
||||||
/**
|
|
||||||
* Interface to implement ChiSquareRuntime class for musrfit.
|
|
||||||
*/
|
|
||||||
class ChiSquareRuntime {
|
class ChiSquareRuntime {
|
||||||
friend class DKSBaseMuSR;
|
friend class DKSBaseMuSR;
|
||||||
|
|
||||||
@ -66,54 +63,23 @@ public:
|
|||||||
/** Default constructor */
|
/** Default constructor */
|
||||||
//ChiSquareRuntime();
|
//ChiSquareRuntime();
|
||||||
|
|
||||||
/** Default destructor. */
|
/** Default destructor */
|
||||||
virtual ~ChiSquareRuntime() { };
|
virtual ~ChiSquareRuntime() { };
|
||||||
|
|
||||||
/**
|
|
||||||
* Compile GPU program generated at runtime.
|
|
||||||
*/
|
|
||||||
virtual int compileProgram(std::string function, bool mlh = false) = 0;
|
virtual int compileProgram(std::string function, bool mlh = false) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Launch the compiled chiSquare kernel.
|
|
||||||
*/
|
|
||||||
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||||
int numpar, int numfunc, int nummap,
|
int numpar, int numfunc, int nummap,
|
||||||
double timeStart, double timeStep,
|
double timeStart, double timeStep,
|
||||||
double &result) = 0;
|
double &result) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Write the parameter values to the GPU.
|
|
||||||
*/
|
|
||||||
virtual int writeParams(const double *params, int numparams) = 0;
|
virtual int writeParams(const double *params, int numparams) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Write the function values to the GPU.
|
|
||||||
*/
|
|
||||||
virtual int writeFunc(const double *func, int numfunc) = 0;
|
virtual int writeFunc(const double *func, int numfunc) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Write map values to the GPU.
|
|
||||||
*/
|
|
||||||
virtual int writeMap(const int *map, int nummap) = 0;
|
virtual int writeMap(const int *map, int nummap) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Allocate temporary memory needed for the chi square calculations on the device.
|
|
||||||
*/
|
|
||||||
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
|
virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Free device memory allocated for chi square calculations.
|
|
||||||
*/
|
|
||||||
virtual int freeChiSquare() = 0;
|
virtual int freeChiSquare() = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if available device can run the chi square GPU code.
|
|
||||||
*/
|
|
||||||
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
|
virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;
|
||||||
|
|
||||||
/**
|
/** Set N0, tau and bgk values to use for the kernel.
|
||||||
* Set N0, tau and bgk values to use for the kernel.
|
|
||||||
* If values changes between data sets this needs to be called before
|
* If values changes between data sets this needs to be called before
|
||||||
* every kernel call. Returns DKS_SUCCESS.
|
* every kernel call. Returns DKS_SUCCESS.
|
||||||
*/
|
*/
|
||||||
@ -125,8 +91,7 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Set alpha and beta values to use for the kernel.
|
||||||
* Set alpha and beta values to use for the kernel.
|
|
||||||
* If values changes between data sets this needs to be called before
|
* If values changes between data sets this needs to be called before
|
||||||
* every kernel call. Returns DKS_SUCCESS.
|
* every kernel call. Returns DKS_SUCCESS.
|
||||||
*/
|
*/
|
||||||
@ -136,9 +101,8 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Set number of blocks and threads.
|
||||||
* Set number of blocks and threads.
|
* Used to set parameters obtained from auto-tuning
|
||||||
* Used to set parameters obtained from auto-tuning
|
|
||||||
*/
|
*/
|
||||||
int setKernelParams(int numBlocks, int blockSize) {
|
int setKernelParams(int numBlocks, int blockSize) {
|
||||||
int ierr = DKS_ERROR;
|
int ierr = DKS_ERROR;
|
||||||
@ -154,9 +118,8 @@ public:
|
|||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Get the number of operations in compiled kernel.
|
||||||
* Get the number of operations in compiled kernel.
|
* Count the number of operation in the ptx file for the compiled program.
|
||||||
* Count the number of operation in the ptx file for the compiled program.
|
|
||||||
*/
|
*/
|
||||||
int getOperations(int &oper) {
|
int getOperations(int &oper) {
|
||||||
|
|
||||||
|
@ -5,10 +5,10 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
/**
|
class DKSBaseMuSR;
|
||||||
* Interface to implement particle matter interaction for OPAL.
|
|
||||||
*/
|
|
||||||
class DKSCollimatorPhysics {
|
class DKSCollimatorPhysics {
|
||||||
|
friend class DKSBaseMuSR;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
@ -18,61 +18,25 @@ protected:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
virtual ~DKSCollimatorPhysics() { }
|
virtual ~DKSCollimatorPhysics() { }
|
||||||
|
|
||||||
/**
|
|
||||||
* Execute collimator physics kernel.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
|
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
|
||||||
bool enableRutherforScattering = true) = 0;
|
bool enableRutherfordScattering = true) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Special case CollimatorPhysics kernel that uses SoA instead of AoS.
|
|
||||||
* Used only on the MIC side, was not implemented on the GPU.
|
|
||||||
*/
|
|
||||||
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
void *par_ptr, int numparticles) = 0;
|
void *par_ptr, int numparticles) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Sort particle array on GPU.
|
|
||||||
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
|
|
||||||
* array so these particles are at the end of array
|
|
||||||
*/
|
|
||||||
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
|
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
|
|
||||||
* Used only on the MIC side, was not implemented on the GPU.
|
|
||||||
*/
|
|
||||||
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
void *par_ptr, int numparticles, int &numaddback) = 0;
|
void *par_ptr, int numparticles, int &numaddback) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* BorisPusher push function for integration from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
|
||||||
* For more details see the ParallelTTracker documentation in OPAL.
|
|
||||||
*/
|
|
||||||
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false, int streamId = -1) = 0;
|
double dt, double c, bool usedt = false, int streamId = -1) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* BorisPusher kick function for integration from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
|
||||||
* For more details see the ParallelTTracker documentation in OPAL.
|
|
||||||
*/
|
|
||||||
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart, double c, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* BorisPusher push function with transformto function from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
|
||||||
* For more details see the ParallelTTracker documentation in OPAL.
|
|
||||||
*/
|
|
||||||
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
||||||
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false,
|
double dt, double c, bool usedt = false,
|
||||||
|
@ -6,21 +6,12 @@
|
|||||||
|
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
/**
|
class DKSFFT {
|
||||||
* Abstract class defining methods for DKS FFT class.
|
|
||||||
* Used by CudaFFT, OpenCLFFT and MICFFT to create device specific FFT classes.
|
|
||||||
*/
|
|
||||||
class BaseFFT {
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
int defaultN[3];
|
int defaultN[3];
|
||||||
int defaultNdim;
|
int defaultNdim;
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if FFT plan is created for the needed dimension and FFT size.
|
|
||||||
* Returns true if the plan has been created and false if no plan for specified dimension
|
|
||||||
* and size exists.
|
|
||||||
*/
|
|
||||||
bool useDefaultPlan(int ndim, int N[3]) {
|
bool useDefaultPlan(int ndim, int N[3]) {
|
||||||
if (ndim != defaultNdim)
|
if (ndim != defaultNdim)
|
||||||
return false;
|
return false;
|
||||||
@ -31,59 +22,20 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
virtual ~BaseFFT() { }
|
virtual ~DKSFFT() { }
|
||||||
|
|
||||||
/** Setup FFT - init FFT library used by chosen device. */
|
|
||||||
virtual int setupFFT(int ndim, int N[3]) = 0;
|
virtual int setupFFT(int ndim, int N[3]) = 0;
|
||||||
|
|
||||||
/** Setup real to complex FFT - init FFT library used by chosen device. */
|
|
||||||
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
|
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
|
||||||
|
|
||||||
/** Setup complex to real FFT - init FFT library used by chosen device. */
|
|
||||||
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
|
virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
|
||||||
|
|
||||||
/** Clean up. */
|
|
||||||
virtual int destroyFFT() = 0;
|
virtual int destroyFFT() = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Execute C2C FFT.
|
|
||||||
* mem_ptr - memory ptr on the device for complex data.
|
|
||||||
* Performs in place FFT.
|
|
||||||
*/
|
|
||||||
virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
|
virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
|
||||||
int streamId = -1, bool forward = true) = 0;
|
int streamId = -1, bool forward = true) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Execute inverse C2C FFT.
|
|
||||||
* mem_ptr - memory ptr on the device for complex data.
|
|
||||||
* Performs in place FFT.
|
|
||||||
*/
|
|
||||||
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize the FFT or IFFT.
|
|
||||||
* mem_ptr - memory to complex data.
|
|
||||||
*/
|
|
||||||
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Execute R2C FFT.
|
|
||||||
* real_ptr - real input data for FFT, comp_ptr - memory on the device where
|
|
||||||
* results for the FFT are stored as complex numbers.
|
|
||||||
*/
|
|
||||||
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1) = 0;
|
int streamId = -1) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Execute C2R FFT.
|
|
||||||
* real_ptr - real output data from the C2R FFT, comp_ptr - complex input data for the FFT.
|
|
||||||
*/
|
|
||||||
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1) = 0;
|
int streamId = -1) = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize CR FFT.
|
|
||||||
*/
|
|
||||||
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
#ifndef H_GREENSFUNCTION
|
|
||||||
#define H_GREENSFUNCTION
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Interface to implement Greens function calculations for OPAL.
|
|
||||||
*/
|
|
||||||
class GreensFunction {
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
virtual ~GreensFunction() { }
|
|
||||||
|
|
||||||
/** calc greens integral, as defined in OPAL. */
|
|
||||||
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** integration if rho2_m, see OPAL for more details. */
|
|
||||||
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
|
|
||||||
int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** mirror rho2_m field. */
|
|
||||||
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** multiply two complex fields from device memory. */
|
|
||||||
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -5,22 +5,17 @@
|
|||||||
|
|
||||||
#define BLOCK_SIZE 128
|
#define BLOCK_SIZE 128
|
||||||
|
|
||||||
/** Struct to hold voxel position for PET image. */
|
|
||||||
struct VoxelPosition {
|
struct VoxelPosition {
|
||||||
float x;
|
float x;
|
||||||
float y;
|
float y;
|
||||||
float z;
|
float z;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Struct that holds the pair of detectors that registered an event. */
|
|
||||||
struct ListEvent {
|
struct ListEvent {
|
||||||
unsigned detA : 16;
|
unsigned detA : 16;
|
||||||
unsigned detB : 16;
|
unsigned detB : 16;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* Interface to implement PET image reconstruction.
|
|
||||||
*/
|
|
||||||
class ImageReconstruction {
|
class ImageReconstruction {
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -30,8 +25,7 @@ public:
|
|||||||
|
|
||||||
virtual ~ImageReconstruction() { }
|
virtual ~ImageReconstruction() { }
|
||||||
|
|
||||||
/**
|
/** Caluclate source.
|
||||||
* Caluclate source.
|
|
||||||
* Places a sphere at each voxel position and calculate the avg value and std value of pixels
|
* Places a sphere at each voxel position and calculate the avg value and std value of pixels
|
||||||
* that are inside this sphere. All the sphere used have the same diameter.
|
* that are inside this sphere. All the sphere used have the same diameter.
|
||||||
*/
|
*/
|
||||||
@ -39,8 +33,7 @@ public:
|
|||||||
void *avg, void *std, float diameter, int total_voxels,
|
void *avg, void *std, float diameter, int total_voxels,
|
||||||
int total_sources, int start = 0) = 0;
|
int total_sources, int start = 0) = 0;
|
||||||
|
|
||||||
/**
|
/** Calculate background.
|
||||||
* Calculate background.
|
|
||||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||||
* smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
|
* smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter.
|
||||||
@ -49,8 +42,7 @@ public:
|
|||||||
void *avg, void *std, float diameter, int total_voxels,
|
void *avg, void *std, float diameter, int total_voxels,
|
||||||
int total_sources, int start = 0) = 0;
|
int total_sources, int start = 0) = 0;
|
||||||
|
|
||||||
/**
|
/** Caluclate source using differente sources.
|
||||||
* Caluclate source using differente sources.
|
|
||||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||||
* each sphere is given by *diameter array.
|
* each sphere is given by *diameter array.
|
||||||
@ -60,7 +52,7 @@ public:
|
|||||||
int total_sources, int start = 0) = 0;
|
int total_sources, int start = 0) = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Places two sphere at each voxel position, calculates the avg value and std value of pixels.
|
* Places two sphere at each voxel position, calculates the avg value and std value of pixels
|
||||||
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
|
||||||
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
|
* smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the
|
||||||
* smaller sphere.
|
* smaller sphere.
|
||||||
@ -69,8 +61,7 @@ public:
|
|||||||
void *avg, void *std, void *diameter, int total_voxels,
|
void *avg, void *std, void *diameter, int total_voxels,
|
||||||
int total_sources, int start = 0) = 0;
|
int total_sources, int start = 0) = 0;
|
||||||
|
|
||||||
/**
|
/** Generate normalization.
|
||||||
* Generate normalization.
|
|
||||||
* Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
|
* Goes trough detectors pairs and if detector pair crosses image launches seperate kernel
|
||||||
* that updates voxel values in the image on the slope between these two detectors.
|
* that updates voxel values in the image on the slope between these two detectors.
|
||||||
*/
|
*/
|
||||||
@ -78,16 +69,14 @@ public:
|
|||||||
void *det_position, int total_det) = 0;
|
void *det_position, int total_det) = 0;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/** Calculate forward projection.
|
||||||
* Calculate forward projection.
|
|
||||||
* For image reconstruction calculates forward projections.
|
* For image reconstruction calculates forward projections.
|
||||||
* see recon.cpp for details
|
* see recon.cpp for details
|
||||||
*/
|
*/
|
||||||
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
||||||
void *image_position, int num_events) = 0;
|
void *image_position, int num_events) = 0;
|
||||||
|
|
||||||
/**
|
/** Calculate backward projection.
|
||||||
* Calculate backward projection.
|
|
||||||
* For image reconstruction calculates backward projections.
|
* For image reconstruction calculates backward projections.
|
||||||
* see recon.cpp for details
|
* see recon.cpp for details
|
||||||
*/
|
*/
|
||||||
@ -95,29 +84,29 @@ public:
|
|||||||
void *det_position, void *image_position,
|
void *det_position, void *image_position,
|
||||||
int num_events, int num_voxels) = 0;
|
int num_events, int num_voxels) = 0;
|
||||||
|
|
||||||
/**
|
/** Set the voxel dimensins on device.
|
||||||
*Set the voxel dimensins on device.
|
*
|
||||||
*/
|
*/
|
||||||
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
|
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
|
||||||
|
|
||||||
/**
|
/** Set the image edge variables on the device.
|
||||||
* Set the image edge variables on the device.
|
*
|
||||||
*/
|
*/
|
||||||
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
|
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
|
||||||
|
|
||||||
/**
|
/** Set the image edge1 on the device.
|
||||||
* Set the image edge1 on the device.
|
*
|
||||||
*/
|
*/
|
||||||
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
|
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
|
||||||
|
|
||||||
/**
|
/** Set the minimum crystan in one ring values on the device.
|
||||||
* Set the minimum crystan in one ring values on the device.
|
*
|
||||||
*/
|
*/
|
||||||
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
|
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
|
||||||
float min_CrystalDist_InOneRing1) = 0;
|
float min_CrystalDist_InOneRing1) = 0;
|
||||||
|
|
||||||
/**
|
/** Set all other required parameters for reconstruction.
|
||||||
* Set all other required parameters for reconstruction.
|
*
|
||||||
*/
|
*/
|
||||||
virtual int setParams(float matrix_distance_factor, float phantom_diameter,
|
virtual int setParams(float matrix_distance_factor, float phantom_diameter,
|
||||||
float atten_per_mm, float ring_diameter) = 0;
|
float atten_per_mm, float ring_diameter) = 0;
|
||||||
|
@ -18,17 +18,6 @@
|
|||||||
typedef std::vector<Parameter> Parameters;
|
typedef std::vector<Parameter> Parameters;
|
||||||
typedef std::vector<State> States;
|
typedef std::vector<State> States;
|
||||||
|
|
||||||
/**
|
|
||||||
* DKS autotuning class, allows auto-tuning of the defined function.
|
|
||||||
* Executes the defined function for auto-tuning and searches for optimal parameters to improve
|
|
||||||
* the function execution time. The function that is auto-tuned, parameters and the ranges
|
|
||||||
* need to be set. Includes multiple search methods that search the parameter space to find
|
|
||||||
* the optimal solution.
|
|
||||||
* 1) exhaustive search
|
|
||||||
* 2) line search
|
|
||||||
* 3) hill climbing
|
|
||||||
* 4) simulated annealing
|
|
||||||
*/
|
|
||||||
class DKSAutoTuning {
|
class DKSAutoTuning {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -47,13 +36,12 @@ private:
|
|||||||
|
|
||||||
int loops_m;
|
int loops_m;
|
||||||
|
|
||||||
/** Update parameters from a state. */
|
/** Update parameters from a state */
|
||||||
int setParameterValues(States states);
|
int setParameterValues(States states);
|
||||||
|
|
||||||
/**
|
/** Evaluate the function and set execution time
|
||||||
* Evaluate the function and set execution time
|
* Returns DKS_ERROR if errors occured during function execution.
|
||||||
* Returns DKS_ERROR if errors occured during function execution.
|
* Returns DKS_SUCCESS if function executed as planned.
|
||||||
* Returns DKS_SUCCESS if function executed as planned.
|
|
||||||
*/
|
*/
|
||||||
int evaluateFunction(double &value);
|
int evaluateFunction(double &value);
|
||||||
|
|
||||||
@ -62,13 +50,12 @@ public:
|
|||||||
/** Constructor */
|
/** Constructor */
|
||||||
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
|
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
|
||||||
|
|
||||||
/** Destructor. */
|
/** Destructor */
|
||||||
~DKSAutoTuning();
|
~DKSAutoTuning();
|
||||||
|
|
||||||
/**
|
/** Set function to auto tune.
|
||||||
* Set function to auto tune.
|
* Caller of setFunction is responsible to bind the correct parameters
|
||||||
* Caller of setFunction is responsible to bind the correct parameters
|
* to the function with std::bind.
|
||||||
* to the function with std::bind.
|
|
||||||
*/
|
*/
|
||||||
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
|
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
|
||||||
f_m = f;
|
f_m = f;
|
||||||
@ -76,21 +63,15 @@ public:
|
|||||||
evaluate_time_m = evaluate_time;
|
evaluate_time_m = evaluate_time;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Set function to auto tune.
|
|
||||||
* Caller of setFunction is responsible to bind the correct parameters
|
|
||||||
* to the function with std::bind.
|
|
||||||
*/
|
|
||||||
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
|
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
|
||||||
fd_m = f;
|
fd_m = f;
|
||||||
function_name_m = name;
|
function_name_m = name;
|
||||||
evaluate_time_m = evaluate_time;
|
evaluate_time_m = evaluate_time;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Set parameter for auto tuning.
|
||||||
* Set parameter for auto tuning.
|
* Provide a pointer to a parameter that will be changed during auto-tuning
|
||||||
* Provide a pointer to a parameter that will be changed during auto-tuning
|
* and a min-max value for this element
|
||||||
* and a min-max value for this element
|
|
||||||
*/
|
*/
|
||||||
template <typename T1>
|
template <typename T1>
|
||||||
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
|
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
|
||||||
@ -104,9 +85,9 @@ public:
|
|||||||
/** Perform exaustive search evaluating all the parameter configurations */
|
/** Perform exaustive search evaluating all the parameter configurations */
|
||||||
void exaustiveSearch();
|
void exaustiveSearch();
|
||||||
|
|
||||||
/**
|
/** Perform auto-tuning.
|
||||||
* Perform line-search auto-tuning by variying parameters one at a time.
|
* Perform line-search auto-tuning by variying parameters one at a time and keeping other
|
||||||
* After one parameter is auto-tuned the next on is varied
|
* parameters constant.
|
||||||
*/
|
*/
|
||||||
void lineSearch();
|
void lineSearch();
|
||||||
|
|
||||||
|
@ -4,7 +4,6 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
/** Tester class for auto-tuning search algorithms. */
|
|
||||||
class DKSAutoTuningTester {
|
class DKSAutoTuningTester {
|
||||||
|
|
||||||
friend class DKSBaseMuSR;
|
friend class DKSBaseMuSR;
|
||||||
|
@ -1,3 +1,9 @@
|
|||||||
|
/** Class to save and load DKS autotunning configs.
|
||||||
|
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
|
||||||
|
* Uses boost xml_parser to read and write the xml file and boost property tree to store
|
||||||
|
* the xml content.
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef DKS_CONFIG
|
#ifndef DKS_CONFIG
|
||||||
#define DKS_CONFIG
|
#define DKS_CONFIG
|
||||||
|
|
||||||
@ -5,7 +11,7 @@
|
|||||||
#include <boost/optional/optional.hpp>
|
#include <boost/optional/optional.hpp>
|
||||||
#include <boost/property_tree/xml_parser.hpp>
|
#include <boost/property_tree/xml_parser.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
//#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -18,18 +24,11 @@
|
|||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
namespace pt = boost::property_tree;
|
namespace pt = boost::property_tree;
|
||||||
//namespace fs = boost::filesystem;
|
namespace fs = boost::filesystem;
|
||||||
|
|
||||||
const std::string config_dir = "/.config/DKS";
|
const std::string config_dir = "/.config/DKS";
|
||||||
const std::string config_file = "/autotuning.xml";
|
const std::string config_file = "/autotuning.xml";
|
||||||
|
|
||||||
/** Class to save and load DKS autotunning configs.
|
|
||||||
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
|
|
||||||
* Uses boost xml_parser to read and write the xml file and boost property tree to store
|
|
||||||
* the xml content.
|
|
||||||
* TODO: need an update boost::filesystem is disabled at the moment, no configuration file is saved
|
|
||||||
* so the auto-tuning has no effect.
|
|
||||||
*/
|
|
||||||
class DKSConfig {
|
class DKSConfig {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -9,9 +9,6 @@
|
|||||||
|
|
||||||
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
|
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };
|
||||||
|
|
||||||
/**
|
|
||||||
* Parameter class allows to change the searchable parameters during the auto-tuning.
|
|
||||||
*/
|
|
||||||
class Parameter {
|
class Parameter {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -67,10 +64,6 @@ public:
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* Struct to hold a auto-tuning state.
|
|
||||||
* Holds the current value, min, max and a step to witch a state can change.
|
|
||||||
*/
|
|
||||||
struct State {
|
struct State {
|
||||||
double value;
|
double value;
|
||||||
double min;
|
double min;
|
||||||
@ -81,12 +74,6 @@ struct State {
|
|||||||
typedef std::vector<Parameter> Parameters;
|
typedef std::vector<Parameter> Parameters;
|
||||||
typedef std::vector<State> States;
|
typedef std::vector<State> States;
|
||||||
|
|
||||||
/**
|
|
||||||
* Used by auto-tuning search algorithms to move between parameter configurations.
|
|
||||||
* Allows to move from one parameter stat to another, get neighboring states,
|
|
||||||
* move to neighboring states and save state information. Print functions are available
|
|
||||||
* for debugging purposes, to follow how algorithm muves between sates.
|
|
||||||
*/
|
|
||||||
class DKSSearchStates {
|
class DKSSearchStates {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -35,29 +35,13 @@ ENDMACRO ()
|
|||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
DKSBase.h
|
DKSBase.h
|
||||||
DKSDefinitions.h
|
DKSDefinitions.h
|
||||||
DKSFFT.h
|
|
||||||
)
|
)
|
||||||
|
|
||||||
SET (DKS_BASEDIR_SRCS
|
SET (DKS_BASEDIR_SRCS
|
||||||
DKSBase.cpp
|
DKSBase.cpp
|
||||||
DKSFFT.cpp
|
|
||||||
)
|
)
|
||||||
|
|
||||||
#add opal to DKS if enable_opal is set
|
IF (USE_CUDA OR USE_OPENCL)
|
||||||
IF (ENABLE_OPAL)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
|
||||||
${DKS_BASEDIR_HDRS}
|
|
||||||
DKSOPAL.h
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (DKS_BASEDIR_SRCS
|
|
||||||
${DKS_BASEDIR_SRCS}
|
|
||||||
DKSOPAL.cpp
|
|
||||||
)
|
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
#and musrt to DKS if cuda or opencl is used and enable_musr is set
|
|
||||||
IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
${DKS_BASEDIR_HDRS}
|
${DKS_BASEDIR_HDRS}
|
||||||
DKSBaseMuSR.h
|
DKSBaseMuSR.h
|
||||||
@ -67,10 +51,9 @@ IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
|||||||
${DKS_BASEDIR_SRCS}
|
${DKS_BASEDIR_SRCS}
|
||||||
DKSBaseMuSR.cpp
|
DKSBaseMuSR.cpp
|
||||||
)
|
)
|
||||||
ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
ENDIF (USE_CUDA OR USE_OPENCL)
|
||||||
|
|
||||||
#add image reconstruction to DKS if cuda is used and enable_pet is set
|
IF (USE_CUDA)
|
||||||
IF (USE_CUDA AND ENABLE_PET)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
${DKS_BASEDIR_HDRS}
|
${DKS_BASEDIR_HDRS}
|
||||||
DKSImageReconstruction.h
|
DKSImageReconstruction.h
|
||||||
@ -80,7 +63,7 @@ IF (USE_CUDA AND ENABLE_PET)
|
|||||||
${DKS_BASEDIR_SRCS}
|
${DKS_BASEDIR_SRCS}
|
||||||
DKSImageReconstruction.cpp
|
DKSImageReconstruction.cpp
|
||||||
)
|
)
|
||||||
ENDIF (USE_CUDA AND ENABLE_PET)
|
ENDIF (USE_CUDA)
|
||||||
|
|
||||||
ADD_HEADERS (${DKS_BASEDIR_HDRS})
|
ADD_HEADERS (${DKS_BASEDIR_HDRS})
|
||||||
ADD_SOURCES (${DKS_BASEDIR_SRCS})
|
ADD_SOURCES (${DKS_BASEDIR_SRCS})
|
||||||
@ -112,18 +95,26 @@ IF (USE_CUDA)
|
|||||||
CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
|
CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
|
||||||
CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
||||||
|
|
||||||
TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
|
IF (USE_UQTK)
|
||||||
TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
|
TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
#TARGET_LINK_LIBRARIES(dks)
|
TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
#TARGET_LINK_LIBRARIES(dksshared)
|
ELSE (USE_UQTK)
|
||||||
|
TARGET_LINK_LIBRARIES(dks cudadevrt)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared cudadevrt)
|
||||||
|
ENDIF (USE_UQTK)
|
||||||
|
|
||||||
ELSE (USE_CUDA)
|
ELSE (USE_CUDA)
|
||||||
MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
|
MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
|
||||||
ADD_LIBRARY(dks ${DKS_SRCS})
|
ADD_LIBRARY(dks ${DKS_SRCS})
|
||||||
ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
||||||
|
|
||||||
TARGET_LINK_LIBRARIES(dks)
|
IF (USE_UQTK)
|
||||||
TARGET_LINK_LIBRARIES(dksshared)
|
TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
ELSE (USE_UQTK)
|
||||||
|
TARGET_LINK_LIBRARIES(dks)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared)
|
||||||
|
ENDIF(USE_UQTK)
|
||||||
|
|
||||||
ENDIF (USE_CUDA)
|
ENDIF (USE_CUDA)
|
||||||
|
|
||||||
|
@ -1,27 +1,35 @@
|
|||||||
SET (_HDRS CudaBase.cuh CudaFFT.cuh)
|
SET (_HDRS
|
||||||
SET (_SRCS CudaBase.cu CudaFFT.cu)
|
CudaBase.cuh
|
||||||
|
CudaFFT.cuh
|
||||||
|
CudaGreensFunction.cuh
|
||||||
|
CudaChiSquare.cuh
|
||||||
|
CudaCollimatorPhysics.cuh
|
||||||
|
CudaImageReconstruction.cuh
|
||||||
|
CudaChiSquareRuntime.cuh
|
||||||
|
)
|
||||||
|
|
||||||
|
SET (_SRCS
|
||||||
|
CudaBase.cu
|
||||||
|
CudaFFT.cu
|
||||||
|
CudaGreensFunction.cu
|
||||||
|
CudaChiSquare.cu
|
||||||
|
CudaCollimatorPhysics.cu
|
||||||
|
CudaImageReconstruction.cu
|
||||||
|
CudaChiSquareRuntime.cu
|
||||||
|
)
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
#INCLUDE_DIRECTORIES (
|
||||||
SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
|
#)
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
IF (ENABLE_MUSR)
|
|
||||||
SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
|
|
||||||
SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
|
|
||||||
SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
IF (ENABLE_PET)
|
|
||||||
SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
|
|
||||||
SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
|
|
||||||
ENDIF (ENABLE_PET)
|
|
||||||
|
|
||||||
MESSAGE (STATUS "CUDA headers: ${_HDRS}")
|
|
||||||
|
|
||||||
ADD_SOURCES(${_SRCS})
|
ADD_SOURCES(${_SRCS})
|
||||||
ADD_HEADERS(${_HDRS})
|
ADD_HEADERS(${_HDRS})
|
||||||
|
|
||||||
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
|
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
|
||||||
|
|
||||||
|
SET (_KERNELS
|
||||||
|
NVRTCKernels/CudaChiSquareKernel.cu
|
||||||
|
)
|
||||||
|
|
||||||
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
|
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
|
||||||
|
|
||||||
|
@ -13,13 +13,6 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
|
|
||||||
|
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
if (idx < size)
|
|
||||||
data[idx] = curand_uniform_double(&state[idx]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//=====================================//
|
//=====================================//
|
||||||
//==========Private functions==========//
|
//==========Private functions==========//
|
||||||
@ -55,7 +48,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
|
|||||||
|
|
||||||
int threads = 128;
|
int threads = 128;
|
||||||
int blocks = size / threads + 1;
|
int blocks = size / threads + 1;
|
||||||
if (seed == -1)
|
if (seed == -1)
|
||||||
seed = time(NULL);
|
seed = time(NULL);
|
||||||
|
|
||||||
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
|
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
|
||||||
@ -76,15 +69,6 @@ int CudaBase::cuda_deleteCurandStates() {
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
|
|
||||||
int threads = BLOCK_SIZE;
|
|
||||||
int blocks = size / threads + 1;
|
|
||||||
|
|
||||||
kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
curandState* CudaBase::cuda_getCurandStates() {
|
curandState* CudaBase::cuda_getCurandStates() {
|
||||||
return defaultRndState;
|
return defaultRndState;
|
||||||
}
|
}
|
||||||
@ -342,3 +326,62 @@ int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
|
|||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: allcate memory and write data (push)
|
||||||
|
Return: pointer to memory object
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
|
||||||
|
|
||||||
|
void * mem_ptr;
|
||||||
|
mem_ptr = cuda_allocateMemory(size, ierr);
|
||||||
|
|
||||||
|
if (ierr == DKS_SUCCESS)
|
||||||
|
ierr = cuda_writeData(mem_ptr, in_data, size);
|
||||||
|
|
||||||
|
return mem_ptr;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: read data and free memory (pull)
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
|
||||||
|
|
||||||
|
ierr = cuda_readData(mem_ptr, out_data, size);
|
||||||
|
if (ierr == DKS_SUCCESS)
|
||||||
|
ierr = cuda_freeMemory(mem_ptr);
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
|
||||||
|
if (ierr == DKS_SUCCESS)
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute function
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
|
int CudaBase::cuda_executeFunction() {
|
||||||
|
|
||||||
|
std::cout << "Execute function" << std::endl;
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: clean up
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
|
int CudaBase::cuda_cleanUp() {
|
||||||
|
|
||||||
|
std::cout << "clean up" << std::endl;
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
|
}
|
||||||
|
@ -12,15 +12,9 @@
|
|||||||
#include <cufft.h>
|
#include <cufft.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <curand_kernel.h>
|
#include <curand_kernel.h>
|
||||||
|
#include <nvToolsExt.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
#define BLOCK_SIZE 128
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA base class handles device setup and basic communication with the device.
|
|
||||||
* Handles devicew setup, memory manegement, data transfers and stream setup for
|
|
||||||
* asynchronous data transfers and kernel executions.
|
|
||||||
*/
|
|
||||||
class CudaBase {
|
class CudaBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -45,7 +39,6 @@ public:
|
|||||||
* Init cuda random number (cuRand) states.
|
* Init cuda random number (cuRand) states.
|
||||||
* Create an array of type curandState with "size" elements on the GPU
|
* Create an array of type curandState with "size" elements on the GPU
|
||||||
* and create a curandState with different seed for each array entry.
|
* and create a curandState with different seed for each array entry.
|
||||||
* If no seed is given create a seed based on current time.
|
|
||||||
* Return success or error code
|
* Return success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_createCurandStates(int size, int seed = -1);
|
int cuda_createCurandStates(int size, int seed = -1);
|
||||||
@ -53,17 +46,12 @@ public:
|
|||||||
/**
|
/**
|
||||||
* Delete curandState.
|
* Delete curandState.
|
||||||
* Delete curandState array on the GPU and free memory.
|
* Delete curandState array on the GPU and free memory.
|
||||||
* Return success or error code
|
* Return success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_deleteCurandStates();
|
int cuda_deleteCurandStates();
|
||||||
|
|
||||||
/**
|
/** Get a pointer to curand states
|
||||||
* Create 'size' random numbers on the device and save in mem_ptr array.
|
*
|
||||||
*/
|
|
||||||
int cuda_createRandomNumbers(void *mem_ptr, int size);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a pointer to curand states.
|
|
||||||
*/
|
*/
|
||||||
curandState* cuda_getCurandStates();
|
curandState* cuda_getCurandStates();
|
||||||
|
|
||||||
@ -80,98 +68,93 @@ public:
|
|||||||
int cuda_addStream(cudaStream_t tmpStream, int &streamId);
|
int cuda_addStream(cudaStream_t tmpStream, int &streamId);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* delete cuda stream.
|
* delete cuda stream
|
||||||
* success or error code
|
* success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_deleteStream(int id);
|
int cuda_deleteStream(int id);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* delete all streams.
|
* delete all streams
|
||||||
* success or error code
|
* success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_deleteStreams();
|
int cuda_deleteStreams();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set stream to use.
|
* set stream to use
|
||||||
* success or error code
|
* success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_setStream(int id);
|
int cuda_setStream(int id);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get stream that is used.
|
* Info: get stream that is used
|
||||||
* Return: return id of curretn stream
|
* Return: return id of curretn stream
|
||||||
*/
|
*/
|
||||||
int cuda_getStreamId();
|
int cuda_getStreamId();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* reset to default stream.
|
* Info: reset to default stream
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_defaultStream();
|
int cuda_defaultStream();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get number of streams.
|
* Info: get number of streams
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_numberOfStreams();
|
int cuda_numberOfStreams();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get stream.
|
* Info: get stream
|
||||||
* Return: stream
|
* Return: stream
|
||||||
*/
|
*/
|
||||||
cudaStream_t cuda_getStream(int id);
|
cudaStream_t cuda_getStream(int id);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get default cublass handle.
|
* Get default cublass handle
|
||||||
*/
|
*/
|
||||||
cublasHandle_t cuda_getCublas();
|
cublasHandle_t cuda_getCublas();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get information on cuda devices.
|
* Info: get information on cuda devices
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_getDevices();
|
int cuda_getDevices();
|
||||||
|
|
||||||
/**
|
/** Get CUDA device count.
|
||||||
* Get CUDA device count.
|
* Sets the number of devices on the platform that can use CUDA.
|
||||||
* Sets the number of devices on the platform that can use CUDA.
|
* Returns DKS_SUCCESS
|
||||||
*/
|
*/
|
||||||
int cuda_getDeviceCount(int &ndev);
|
int cuda_getDeviceCount(int &ndev);
|
||||||
|
|
||||||
/**
|
/** Get the name of the device.
|
||||||
* Get the name of the device.
|
* QUery the device properties of the used device and set the string device_name
|
||||||
* QUery the device properties of the used device and set the string device_name
|
|
||||||
*/
|
*/
|
||||||
int cuda_getDeviceName(std::string &device_name);
|
int cuda_getDeviceName(std::string &device_name);
|
||||||
|
|
||||||
/**
|
/** Set CUDA device to use.
|
||||||
* Set CUDA device to use.
|
* If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR
|
||||||
* If device passed in is larger than the number of devices use
|
|
||||||
* the default:0 and return DKS_ERROR
|
|
||||||
*/
|
*/
|
||||||
int cuda_setDevice(int device);
|
int cuda_setDevice(int device);
|
||||||
|
|
||||||
/**
|
/** Get unique devices
|
||||||
* Get unique devices.
|
* Get array of indeces with the unique CUDA devices available on the paltform
|
||||||
* Get array of indeces with the unique CUDA devices available on the paltform
|
|
||||||
*/
|
*/
|
||||||
int cuda_getUniqueDevices(std::vector<int> &devices);
|
int cuda_getUniqueDevices(std::vector<int> &devices);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize connection to the device.
|
* Info: init device
|
||||||
* Only needed when runtime compilation is used.
|
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_setUp();
|
int cuda_setUp();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allocate memory on cuda device.
|
* Info: allocate memory on cuda device
|
||||||
* Return: pointer to memory object
|
* Return: pointer to memory object
|
||||||
*/
|
*/
|
||||||
void * cuda_allocateMemory(size_t size, int &ierr);
|
void * cuda_allocateMemory(size_t size, int &ierr);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allocate host memory in pinned memory
|
* Info: allocate host memory in pinned memory
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -185,43 +168,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Zero CUDA memory.
|
* Info: write data to memory
|
||||||
* Set all the elements of the array on the device to zero.
|
|
||||||
*/
|
|
||||||
template<typename T>
|
|
||||||
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
|
|
||||||
cudaError cerror;
|
|
||||||
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
|
|
||||||
if (cerror != cudaSuccess) {
|
|
||||||
DEBUG_MSG("Error zeroing cuda memory!\n");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Zero CUDA memory async.
|
|
||||||
* Set all the elements of the array on the device to zero.
|
|
||||||
*/
|
|
||||||
template<typename T>
|
|
||||||
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
|
|
||||||
int dkserror = DKS_SUCCESS;
|
|
||||||
cudaError cerror;
|
|
||||||
if (streamId < cuda_numberOfStreams()) {
|
|
||||||
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
|
|
||||||
cuda_getStream(streamId));
|
|
||||||
|
|
||||||
if (cerror != cudaSuccess)
|
|
||||||
dkserror = DKS_ERROR;
|
|
||||||
} else
|
|
||||||
dkserror = DKS_ERROR;
|
|
||||||
|
|
||||||
return dkserror;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write data to memory
|
|
||||||
* Retrun: success or error code
|
* Retrun: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -238,7 +185,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write data assynchonuously
|
* Info: write data assynchonuously
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -270,7 +217,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read data from memory
|
* Info: read data from memory
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -287,7 +234,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read data async from device memory
|
* Info: read data async from device memory
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -319,19 +266,19 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Free memory on device
|
* Info: free memory on device
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_freeMemory(void * mem_ptr);
|
int cuda_freeMemory(void * mem_ptr);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Free page locked memory on host
|
* Info: free page locked memory on host
|
||||||
* Return: success or erro code
|
* Return: success or erro code
|
||||||
*/
|
*/
|
||||||
int cuda_freeHostMemory(void * mem_ptr);
|
int cuda_freeHostMemory(void * mem_ptr);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allcate memory and write data (push)
|
* Info: allcate memory and write data (push)
|
||||||
* Return: pointer to memory object
|
* Return: pointer to memory object
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -347,7 +294,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read data and free memory (pull)
|
* Info: read data and free memory (pull)
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -365,10 +312,21 @@ public:
|
|||||||
else
|
else
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Info: execute function
|
||||||
|
* Return: success or error code
|
||||||
|
*/
|
||||||
|
int cuda_executeFunction();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Info: clean up
|
||||||
|
* Return: success or error code
|
||||||
|
*/
|
||||||
|
int cuda_cleanUp();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sync cuda device.
|
* Info: sync cuda device
|
||||||
* Waits till all the tasks on the GPU are finished.
|
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_syncDevice() {
|
int cuda_syncDevice() {
|
||||||
@ -377,7 +335,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Page-lock host memory.
|
* Page-lock host memory
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int cuda_hostRegister(T *ptr, int size) {
|
int cuda_hostRegister(T *ptr, int size) {
|
||||||
@ -391,7 +349,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Release page locked memory.
|
* Release page locked memory
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int cuda_hostUnregister(T *ptr) {
|
int cuda_hostUnregister(T *ptr) {
|
||||||
@ -404,7 +362,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print device memory info (total, used, avail)
|
* Info: print device memory info (total, used, avail)
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_memInfo() {
|
int cuda_memInfo() {
|
||||||
|
@ -8,7 +8,6 @@
|
|||||||
|
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
/** Deprecated, CUDA simpleFit implementation of ChiSquare. */
|
|
||||||
class CudaChiSquare {
|
class CudaChiSquare {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -86,19 +86,15 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
|
|||||||
|
|
||||||
//create program
|
//create program
|
||||||
nvrtcProgram prog;
|
nvrtcProgram prog;
|
||||||
// std::cout << cudaProg.c_str() << std::endl;
|
//std::cout << cudaProg.c_str() << std::endl;
|
||||||
nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
|
nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL);
|
||||||
if (createResult != NVRTC_SUCCESS) {
|
|
||||||
DEBUG_MSG("Program creation failed!");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
//compile program
|
//compile program
|
||||||
const char *opts[] = {"-arch=compute_35", "-fmad=false", ""};
|
const char *opts[] = {"-fmad=false", ""};
|
||||||
int numopts = 2;
|
int numopts = 1;
|
||||||
if (mlh) {
|
if (mlh) {
|
||||||
opts[2] = "-DMLH";
|
opts[1] = "-DMLH";
|
||||||
numopts = 3;
|
numopts = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
|
nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
|
||||||
@ -122,11 +118,7 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
|
|||||||
if (ptx_m != NULL)
|
if (ptx_m != NULL)
|
||||||
delete[] ptx_m;
|
delete[] ptx_m;
|
||||||
size_t ptxSize;
|
size_t ptxSize;
|
||||||
nvrtcResult ptxSizeResult = nvrtcGetPTXSize(prog, &ptxSize);
|
nvrtcGetPTXSize(prog, &ptxSize);
|
||||||
if (ptxSizeResult != NVRTC_SUCCESS) {
|
|
||||||
DEBUG_MSG("PTX get size error!");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
ptx_m = new char[ptxSize];
|
ptx_m = new char[ptxSize];
|
||||||
nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);
|
nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);
|
||||||
|
|
||||||
@ -135,26 +127,10 @@ int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
// add some additional diagnostics
|
|
||||||
const int buffer_size = 8192;
|
|
||||||
CUjit_option options[3];
|
|
||||||
void* values[3];
|
|
||||||
char error_log[buffer_size];
|
|
||||||
int err;
|
|
||||||
options[0] = CU_JIT_ERROR_LOG_BUFFER;
|
|
||||||
values[0] = (void*)error_log;
|
|
||||||
options[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
|
||||||
values[1] = (void*)buffer_size;
|
|
||||||
options[2] = CU_JIT_TARGET_FROM_CUCONTEXT;
|
|
||||||
values[2] = 0;
|
|
||||||
//load module from ptx
|
//load module from ptx
|
||||||
CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 3, options, values);
|
CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0);
|
||||||
if (loadResult != CUDA_SUCCESS) {
|
if (loadResult != CUDA_SUCCESS) {
|
||||||
const char *err_msg;
|
DEBUG_MSG("Load module from ptx failed!");
|
||||||
cuGetErrorString(loadResult, &err_msg);
|
|
||||||
std::string msg = "Load module from ptx failed! (" + std::to_string(loadResult) + ") : " + err_msg;
|
|
||||||
DEBUG_MSG(msg);
|
|
||||||
DEBUG_MSG(error_log);
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,10 +15,6 @@ const std::string cudaFunctHeader = "__device__ double fTheory(double t, double
|
|||||||
|
|
||||||
const std::string cudaFunctFooter = "}\n";
|
const std::string cudaFunctFooter = "}\n";
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA implementation of ChiSquareRuntime class.
|
|
||||||
* Implements ChiSquareRuntime interface to allow musrfit to use CUDA to target Nvidia GPU.
|
|
||||||
*/
|
|
||||||
class CudaChiSquareRuntime : public ChiSquareRuntime{
|
class CudaChiSquareRuntime : public ChiSquareRuntime{
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -33,72 +29,65 @@ private:
|
|||||||
|
|
||||||
cublasHandle_t defaultCublasRT;
|
cublasHandle_t defaultCublasRT;
|
||||||
|
|
||||||
/**
|
/** Setup to init device
|
||||||
* Setup to init device.
|
* Create context and init device for RT compilation
|
||||||
* Create context and init device for RT compilation
|
|
||||||
*/
|
*/
|
||||||
void setUpContext();
|
void setUpContext();
|
||||||
|
|
||||||
/**
|
/** Private function to add function to kernel string
|
||||||
* Private function to add function to kernel string.
|
*
|
||||||
*/
|
*/
|
||||||
std::string buildProgram(std::string function);
|
std::string buildProgram(std::string function);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/** Constructor with CudaBase argument
|
||||||
* Constructor with CudaBase argument
|
*
|
||||||
*/
|
*/
|
||||||
CudaChiSquareRuntime(CudaBase *base);
|
CudaChiSquareRuntime(CudaBase *base);
|
||||||
|
|
||||||
/**
|
/** Default constructor init cuda device
|
||||||
* Default constructor init cuda device
|
*
|
||||||
*/
|
*/
|
||||||
CudaChiSquareRuntime();
|
CudaChiSquareRuntime();
|
||||||
|
|
||||||
/**
|
/** Default destructor
|
||||||
* Default destructor.
|
*
|
||||||
*/
|
*/
|
||||||
~CudaChiSquareRuntime();
|
~CudaChiSquareRuntime();
|
||||||
|
|
||||||
/**
|
/** Compile program and save ptx.
|
||||||
* Compile program and save ptx.
|
|
||||||
* Add function string to the calcFunction kernel and compile the program
|
* Add function string to the calcFunction kernel and compile the program
|
||||||
* Function must be valid C math expression. Parameters can be addressed in
|
* Function must be valid C math expression. Parameters can be addressed in
|
||||||
* a form par[map[idx]]
|
* a form par[map[idx]]
|
||||||
*/
|
*/
|
||||||
int compileProgram(std::string function, bool mlh = false);
|
int compileProgram(std::string function, bool mlh = false);
|
||||||
|
|
||||||
/**
|
/** Launch selected kernel
|
||||||
* Launch selected kernel.
|
|
||||||
* Launched the selected kernel from the compiled code.
|
* Launched the selected kernel from the compiled code.
|
||||||
* Result is put in &result variable.
|
* Result is put in &result variable
|
||||||
*/
|
*/
|
||||||
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
|
||||||
int numpar, int numfunc, int nummap,
|
int numpar, int numfunc, int nummap,
|
||||||
double timeStart, double timeStep,
|
double timeStart, double timeStep,
|
||||||
double &result);
|
double &result);
|
||||||
|
|
||||||
/**
|
/** Write params to device.
|
||||||
* Write params to device.
|
|
||||||
* Write params from double array to mem_param_m memory on the device.
|
* Write params from double array to mem_param_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeParams(const double *params, int numparams);
|
int writeParams(const double *params, int numparams);
|
||||||
|
|
||||||
/**
|
/** Write functions to device.
|
||||||
* Write functions to device.
|
|
||||||
* Write function values from double array to mem_func_m memory on the device.
|
* Write function values from double array to mem_func_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeFunc(const double *func, int numfunc);
|
int writeFunc(const double *func, int numfunc);
|
||||||
|
|
||||||
/**
|
/** Write maps to device.
|
||||||
* Write maps to device.
|
|
||||||
* Write map values from int array to mem_map_m memory on the device.
|
* Write map values from int array to mem_map_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeMap(const int *map, int nummap);
|
int writeMap(const int *map, int nummap);
|
||||||
|
|
||||||
/**
|
/** Allocate temporary memory needed for chi square.
|
||||||
* Allocate temporary memory needed for chi square.
|
|
||||||
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
|
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
|
||||||
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
||||||
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
||||||
@ -107,16 +96,14 @@ public:
|
|||||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/** Free temporary memory allocated for chi square.
|
||||||
* Free temporary memory allocated for chi square.
|
|
||||||
* Frees the chisq temporary memory and memory for params, functions and maps
|
* Frees the chisq temporary memory and memory for params, functions and maps
|
||||||
*/
|
*/
|
||||||
int freeChiSquare();
|
int freeChiSquare();
|
||||||
|
|
||||||
/**
|
/** Check if CUDA device is able to run the chi square kernel.
|
||||||
* Check if CUDA device is able to run the chi square kernel.
|
* Redundant - all new CUDA devices that support RT compilation will also support
|
||||||
* Redundant - all new CUDA devices that support RT compilation will also support
|
* double precision, there are no other requirements to run chi square on GPU
|
||||||
* double precision, there are no other requirements to run chi square on GPU
|
|
||||||
*/
|
*/
|
||||||
int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
|
int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
#include "CudaCollimatorPhysics.cuh"
|
#include "CudaCollimatorPhysics.cuh"
|
||||||
|
|
||||||
//constants used in OPAL
|
//#define M_P 0.93827231e+00
|
||||||
#define M_P 0.93827204e+00
|
#define M_P 0.93827204e+00
|
||||||
#define C 299792458.0
|
#define C 299792458.0
|
||||||
#define PI 3.14159265358979323846
|
#define PI 3.14159265358979323846
|
||||||
#define AVO 6.022e23
|
#define AVO 6.022e23
|
||||||
#define R_E 2.81794092e-15
|
#define R_E 2.81794092e-15
|
||||||
|
//#define eM_E 0.51099906e-03
|
||||||
#define eM_E 0.51099892e-03
|
#define eM_E 0.51099892e-03
|
||||||
#define Z_P 1
|
#define Z_P 1
|
||||||
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
|
#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7
|
||||||
|
|
||||||
//parameter array indexes
|
|
||||||
#define POSITION 0
|
#define POSITION 0
|
||||||
#define ZSIZE 1
|
#define ZSIZE 1
|
||||||
#define RHO_M 2
|
#define RHO_M 2
|
||||||
@ -26,53 +26,14 @@
|
|||||||
#define LOWENERGY_THR 12
|
#define LOWENERGY_THR 12
|
||||||
|
|
||||||
#define BLOCK_SIZE 128
|
#define BLOCK_SIZE 128
|
||||||
#define NUMPAR 13
|
#define NUMPAR 12
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA device function for calculating dot product.
|
|
||||||
*/
|
|
||||||
__device__ inline double dot(double3 &d1, double3 &d2) {
|
__device__ inline double dot(double3 &d1, double3 &d2) {
|
||||||
|
|
||||||
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA devce function to calculate cross product.
|
|
||||||
*/
|
|
||||||
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
|
|
||||||
double3 tmp;
|
|
||||||
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
|
|
||||||
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
|
|
||||||
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA device function to calculate arbitrary rotation.
|
|
||||||
*/
|
|
||||||
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
|
|
||||||
double c=cos(Theta);
|
|
||||||
double s=sin(Theta);
|
|
||||||
double dotW = sqrt(dot(W,W));
|
|
||||||
W.x = W.x / dotW;
|
|
||||||
W.y = W.y / dotW;
|
|
||||||
W.z = W.z / dotW;
|
|
||||||
|
|
||||||
double dotWR = dot(W, Rorg) * (1.0 - c);
|
|
||||||
double3 crossW = cross(W, Rorg);
|
|
||||||
double3 tmp;
|
|
||||||
tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
|
|
||||||
tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
|
|
||||||
tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA device function to check if particle is still in material.
|
|
||||||
* z - particle position, par - parameter array. Particle is considered inside the
|
|
||||||
* material if z is > material starting position and z < material starting position - mat size.
|
|
||||||
*/
|
|
||||||
__device__ inline bool checkHit(double &z, double *par) {
|
__device__ inline bool checkHit(double &z, double *par) {
|
||||||
|
|
||||||
/* check if particle is in the degrader material */
|
/* check if particle is in the degrader material */
|
||||||
@ -81,11 +42,6 @@ __device__ inline bool checkHit(double &z, double *par) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA device function to calculate energyLoss for one particle.
|
|
||||||
* Energy loss is calculated using Betha-Bloch equation. More details on EnergyLoss
|
|
||||||
* algorith are available in OPAL user guide.
|
|
||||||
*/
|
|
||||||
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
|
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -130,53 +86,49 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
|
||||||
* CUDA device function for rotation in 2 dimensions.
|
double &normP, double &thetacou, double &deltas, int coord,
|
||||||
* For details: see J. Beringer et al. (Particle Data Group), Phys. Rev. D 86, 010001 (2012),
|
|
||||||
* "Passage of particles through matter"
|
|
||||||
*/
|
|
||||||
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &plane,
|
|
||||||
double &betaGamma, double &thetacou, double &deltas, int coord,
|
|
||||||
double *par)
|
double *par)
|
||||||
{
|
{
|
||||||
// Calculate the angle between the px and pz momenta to change from beam coordinate to lab coordinate
|
double Psixz;
|
||||||
const double Psi = atan2(px, pz);
|
double pxz;
|
||||||
const double pxz = sqrt(px*px + pz*pz);
|
|
||||||
const double cosPsi = cos(Psi);
|
|
||||||
const double sinPsi = sin(Psi);
|
|
||||||
const double cosTheta = cos(thetacou);
|
|
||||||
const double sinTheta = sin(thetacou);
|
|
||||||
|
|
||||||
// Apply the rotation about the random angle thetacou & change from beam
|
if (px>=0 && pz>=0)
|
||||||
// coordinate system to the lab coordinate system using Psixz (2 dimensions)
|
Psixz = atan(px/pz);
|
||||||
x += deltas * px / betaGamma + plane * cosPsi;
|
else if (px>0 && pz<0)
|
||||||
z -= plane * sinPsi;
|
Psixz = atan(px/pz) + PI;
|
||||||
|
else if (px<0 && pz>0)
|
||||||
|
Psixz = atan(px/pz) + 2*PI;
|
||||||
|
else
|
||||||
|
Psixz = atan(px/pz) + PI;
|
||||||
|
|
||||||
if (coord == 1) {
|
pxz = sqrt(px*px + pz*pz);
|
||||||
z += deltas * pz / betaGamma;
|
|
||||||
|
if(coord==1) {
|
||||||
|
x = x + deltas * px/normP + xplane*cos(Psixz);
|
||||||
|
z = z - xplane * sin(Psixz);
|
||||||
}
|
}
|
||||||
|
|
||||||
px = pxz * (cosPsi * sinTheta + sinPsi * cosTheta);
|
if(coord==2) {
|
||||||
pz = pxz * (-sinPsi * sinTheta + cosPsi * cosTheta);
|
x = x + deltas * px/normP + xplane*cos(Psixz);
|
||||||
|
z = z - xplane * sin(Psixz) + deltas * pz / normP;
|
||||||
|
}
|
||||||
|
|
||||||
|
px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
|
||||||
|
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state,
|
||||||
* CUDA device function to calculate Coulomb scattering for one particle.
|
double* par, bool enableRutherfordScattering)
|
||||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
|
||||||
* For details on the algorithm see OPAL user guide.
|
|
||||||
*/
|
|
||||||
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
|
|
||||||
bool enableRutherfordScattering)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
||||||
double gamma = (Eng + M_P) / M_P;
|
double gamma = (Eng + M_P) / M_P;
|
||||||
|
double normP = sqrt(dot(P, P));
|
||||||
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
|
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
|
||||||
double betaGamma = sqrt(dot(P, P));
|
|
||||||
double deltas = par[DT_M] * beta * C;
|
double deltas = par[DT_M] * beta * C;
|
||||||
double mass = M_P * 1e9; // in eV
|
|
||||||
|
|
||||||
double theta0 = 13.6e6 / (beta * betaGamma * mass) *
|
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
|
||||||
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
|
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
|
||||||
|
|
||||||
// x-direction: See Physical Review, "Multiple Scattering"
|
// x-direction: See Physical Review, "Multiple Scattering"
|
||||||
@ -191,9 +143,19 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
|||||||
}
|
}
|
||||||
|
|
||||||
//__syncthreads();
|
//__syncthreads();
|
||||||
//double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
|
||||||
double xplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
|
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
||||||
Rot(P.x, P.z, R.x, R.z, xplane, betaGamma, thetacou, deltas, 0, par);
|
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
|
||||||
|
|
||||||
|
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
if( (P2 < 0.0047) && enableRutherfordScattering) {
|
||||||
|
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
||||||
|
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
if(P4 > 0.5)
|
||||||
|
thetaru = -thetaru;
|
||||||
|
Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
|
||||||
|
}
|
||||||
|
|
||||||
// y-direction: See Physical Review, "Multiple Scattering"
|
// y-direction: See Physical Review, "Multiple Scattering"
|
||||||
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
||||||
@ -206,43 +168,24 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
|||||||
thetacou = z2 * theta0;
|
thetacou = z2 * theta0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
//__syncthreads();
|
||||||
double yplane = 0.5 * deltas * theta0 * (z1 / sqrt(3.0) + z2);
|
|
||||||
Rot(P.y,P.z,R.y,R.z, yplane, betaGamma, thetacou, deltas, 1, par);
|
|
||||||
|
|
||||||
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
||||||
|
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
|
||||||
|
|
||||||
|
P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
if( (P2 < 0.0047) && enableRutherfordScattering) {
|
if( (P2 < 0.0047) && enableRutherfordScattering) {
|
||||||
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
//double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
||||||
double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
|
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
double phiru = 2.0 * M_PI * curand_uniform_double(&state);
|
if(P4 > 0.5)
|
||||||
double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
|
thetaru = -thetaru;
|
||||||
double3 W,X;
|
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
|
||||||
|
|
||||||
double dotP = sqrt(dot(P,P));
|
|
||||||
X.x = cos(phiru)*sin(thetaru) * dotP;
|
|
||||||
X.y = sin(phiru)*sin(thetaru) * dotP;
|
|
||||||
X.z = cos(thetaru) * dotP;
|
|
||||||
W.x = -P.y;
|
|
||||||
W.y = P.x;
|
|
||||||
W.z = 0.0;
|
|
||||||
P = ArbitraryRotation(W, X, th0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA kernel that performs one step in particle movement trough mater.
|
|
||||||
* One thread is launched for each particle in the simulation. The kernel checks if the particle
|
|
||||||
* is still in the material, performs energy loss caluclations and Coulomb scattering, and marks
|
|
||||||
* particles that are exiting the material.
|
|
||||||
* @param[in] *data array of particles of type CUDA_PART or CUDA_PART_SMALL
|
|
||||||
* @param[in] *par array of material properties, always constant size - 13
|
|
||||||
* @param[in] *state array holding cuRand states to preserve states between kernel launches
|
|
||||||
* @param[in] numparticles number of particles in the simulation
|
|
||||||
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
|
|
||||||
*/
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
|
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
|
||||||
int numparticles, bool enableRutherfordScattering)
|
int numparticles, bool enableRutherfordScattering)
|
||||||
@ -252,63 +195,51 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
|||||||
volatile int tid = threadIdx.x;
|
volatile int tid = threadIdx.x;
|
||||||
volatile int idx = blockIdx.x * blockDim.x + tid;
|
volatile int idx = blockIdx.x * blockDim.x + tid;
|
||||||
|
|
||||||
//transfer params and particle positions to shared memory
|
//transfer params to shared memory
|
||||||
//R is kept in shared memory in order to reduce register pressure for the kernel
|
|
||||||
extern __shared__ double smem[];
|
extern __shared__ double smem[];
|
||||||
double *p = (double*)smem;
|
double *p = (double*)smem;
|
||||||
double3 *R = (double3*)&smem[NUMPAR];
|
double3 *R = (double3*)&smem[NUMPAR];
|
||||||
|
|
||||||
curandState s; //each tread gets its own cuRand state for random number generation
|
curandState s;
|
||||||
double3 P;
|
double3 P;
|
||||||
|
|
||||||
//load parameters to shared memory
|
|
||||||
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
|
for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
|
||||||
p[tt] = par[tt];
|
p[tt] = par[tt];
|
||||||
|
|
||||||
//sync threads to ensure that parameters are finished loading
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
//there might be some empty threads that do no work
|
|
||||||
if (idx < numparticles) {
|
if (idx < numparticles) {
|
||||||
s = state[idx]; //load cuRand state to local memory
|
s = state[idx];
|
||||||
R[tid] = data[idx].Rincol; //load position to shared memory
|
R[tid] = data[idx].Rincol;
|
||||||
P = data[idx].Pincol; //load momentum to local memory
|
P = data[idx].Pincol;
|
||||||
|
|
||||||
bool pdead = false;
|
bool pdead = false;
|
||||||
volatile double sq = sqrt(1.0 + dot(P, P));
|
volatile double sq = sqrt(1.0 + dot(P, P));
|
||||||
|
|
||||||
double Eng;
|
double Eng;
|
||||||
|
|
||||||
//check if particle is still in the material
|
|
||||||
if (checkHit(R[tid].z, p)) {
|
if (checkHit(R[tid].z, p)) {
|
||||||
|
|
||||||
//calculate enery loss
|
|
||||||
Eng = (sq - 1) * M_P;
|
Eng = (sq - 1) * M_P;
|
||||||
energyLoss(Eng, pdead, s, p);
|
energyLoss(Eng, pdead, s, p);
|
||||||
|
|
||||||
//check if particle is not dead
|
|
||||||
if (!pdead) {
|
if (!pdead) {
|
||||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||||
sq = sqrt(dot(P, P));
|
sq = sqrt(dot(P, P));
|
||||||
|
|
||||||
//caluclate Coulomb scattering
|
|
||||||
P.x = P.x * ptot / sq;
|
P.x = P.x * ptot / sq;
|
||||||
P.y = P.y * ptot / sq;
|
P.y = P.y * ptot / sq;
|
||||||
P.z = P.z * ptot / sq;
|
P.z = P.z * ptot / sq;
|
||||||
|
|
||||||
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
|
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
|
||||||
|
|
||||||
//update particle momentum
|
|
||||||
data[idx].Pincol = P;
|
data[idx].Pincol = P;
|
||||||
} else {
|
} else {
|
||||||
//mark particle as dead (-1)
|
|
||||||
data[idx].label = -1;
|
data[idx].label = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
//update cuRand state
|
|
||||||
state[idx] = s;
|
state[idx] = s;
|
||||||
} else {
|
} else {
|
||||||
//particle exits material - drift and mark as exiting (-2)
|
|
||||||
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
|
R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
|
||||||
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
|
R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
|
||||||
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
|
R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
|
||||||
@ -316,25 +247,14 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//update particle position
|
|
||||||
data[idx].Rincol = R[tid];
|
data[idx].Rincol = R[tid];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
||||||
* CUDA kernel that performs one step in particle movement trough mater using SoA particles.
|
curandState *state, int numparticles,
|
||||||
* Identical to kernelCollimatorPhysics only uses particles stored as structure of arrays.
|
bool enableRutherfordScattering)
|
||||||
* Deprecated - GPU version does not use SoA.
|
|
||||||
* @param[in] data structure of arrays containing particle data
|
|
||||||
* @param[in] *par array of material properties, always constant size - 13
|
|
||||||
* @param[in] *state array holding cuRand states to preserve states between kernel launches
|
|
||||||
* @param[in] numparticles number of particles in the simulation
|
|
||||||
* @param[in] enableRutherfordScattering true/false whether to enable RutherfordScattering
|
|
||||||
*/
|
|
||||||
__global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
|
|
||||||
curandState *state, int numparticles,
|
|
||||||
bool enableRutherfordScattering)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
@ -393,32 +313,92 @@ __global__ void kernelCollimatorPhysicsSoA(CUDA_PART2_SMALL data, double *par,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Device function to swich off unitless positions.
|
|
||||||
*/
|
|
||||||
inline __device__ void unitlessOff(double3 &a, const double &c) {
|
inline __device__ void unitlessOff(double3 &a, const double &c) {
|
||||||
a.x *= c;
|
a.x *= c;
|
||||||
a.y *= c;
|
a.y *= c;
|
||||||
a.z *= c;
|
a.z *= c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Device function to swich on unitless positions.
|
|
||||||
*/
|
|
||||||
inline __device__ void unitlessOn(double3 &a, const double &c) {
|
inline __device__ void unitlessOn(double3 &a, const double &c) {
|
||||||
a.x /= c;
|
a.x /= c;
|
||||||
a.y /= c;
|
a.y /= c;
|
||||||
a.z /= c;
|
a.z /= c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//swithch to unitless positions with dtc
|
||||||
|
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
|
||||||
|
|
||||||
|
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx < npart) {
|
||||||
|
double3 R = gR[idx];
|
||||||
|
double3 X = gX[idx];
|
||||||
|
|
||||||
|
unitlessOn(R, dtc);
|
||||||
|
unitlessOn(X, dtc);
|
||||||
|
|
||||||
|
gR[idx] = R;
|
||||||
|
gX[idx] = X;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//swithc to unitless positions with dt*c
|
||||||
|
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
|
||||||
|
|
||||||
|
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx < npart) {
|
||||||
|
double3 R = gR[idx];
|
||||||
|
double3 X = gX[idx];
|
||||||
|
double dt = gdt[idx];
|
||||||
|
|
||||||
|
unitlessOff(R, dt*c);
|
||||||
|
unitlessOff(X, dt*c);
|
||||||
|
|
||||||
|
gR[idx] = R;
|
||||||
|
gX[idx] = X;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//swithc off unitless positions with dtc
|
||||||
|
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {
|
||||||
|
|
||||||
|
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx < npart) {
|
||||||
|
double3 R = gR[idx];
|
||||||
|
double3 X = gX[idx];
|
||||||
|
|
||||||
|
unitlessOff(R, dtc);
|
||||||
|
unitlessOff(X, dtc);
|
||||||
|
|
||||||
|
gR[idx] = R;
|
||||||
|
gX[idx] = X;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//switch off unitelss positions with dt*c
|
||||||
|
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {
|
||||||
|
|
||||||
|
volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx < npart) {
|
||||||
|
double3 R = gR[idx];
|
||||||
|
double3 X = gX[idx];
|
||||||
|
double dt = gdt[idx];
|
||||||
|
|
||||||
|
unitlessOff(R, dt*c);
|
||||||
|
unitlessOff(X, dt*c);
|
||||||
|
|
||||||
|
gR[idx] = R;
|
||||||
|
gX[idx] = X;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA kernel to perform particle push.
|
|
||||||
* @param[in] *gR array of particle positions
|
|
||||||
* @param[in] *gP array of particle momentums
|
|
||||||
* @param[in] npart number of particles
|
|
||||||
* @param[in] dtc dt*c
|
|
||||||
*/
|
|
||||||
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
@ -446,15 +426,8 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* CUDA kernel to perform particle push.
|
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
|
||||||
* @param[in] *gR array of particle positions
|
|
||||||
* @param[in] *gP array of particle momentums
|
|
||||||
* @param[in] *gdt array of time steps for each particle
|
|
||||||
* @param[in] npart number of particles
|
|
||||||
* @param[in] c speed of light
|
|
||||||
*/
|
|
||||||
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
|
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
volatile int tid = threadIdx.x;
|
volatile int tid = threadIdx.x;
|
||||||
@ -480,61 +453,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
//TODO: kernel for push with switch off unitless positions with dt[i]*c
|
||||||
* CUDA kernel to perform particle kick.
|
|
||||||
* @param[in] *gR array of particle positions
|
|
||||||
* @param[in] *gP array of particle momentums
|
|
||||||
* @param[in] *gEf
|
|
||||||
* @param[in] *gBf
|
|
||||||
* @param[in] *gdt array of time steps for each particle
|
|
||||||
* @param[in] npart number of particles
|
|
||||||
* @param[in] c speed of light
|
|
||||||
*/
|
|
||||||
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
|
|
||||||
double3 *gBf, double *gdt, double charge,
|
|
||||||
double mass, int npart, double c)
|
|
||||||
{
|
|
||||||
volatile int tid = threadIdx.x;
|
|
||||||
volatile int idx = blockIdx.x * blockDim.x + tid;
|
|
||||||
|
|
||||||
if (idx < npart) {
|
|
||||||
double3 R = gR[idx];
|
|
||||||
double3 P = gP[idx];
|
|
||||||
double3 Ef = gEf[idx];
|
|
||||||
double3 Bf = gBf[idx];
|
|
||||||
double dt = gdt[idx];
|
|
||||||
|
|
||||||
P.x += 0.5 * dt * charge * c / mass * Ef.x;
|
|
||||||
P.y += 0.5 * dt * charge * c / mass * Ef.y;
|
|
||||||
P.z += 0.5 * dt * charge * c / mass * Ef.z;
|
|
||||||
|
|
||||||
double gamma = sqrt(1.0 + dot(P, P));
|
|
||||||
double3 t, w, s;
|
|
||||||
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
|
|
||||||
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
|
|
||||||
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
|
|
||||||
|
|
||||||
double3 crossPt = cross(P, t);
|
|
||||||
w.x = P.x + crossPt.x;
|
|
||||||
w.y = P.y + crossPt.y;
|
|
||||||
w.z = P.z + crossPt.z;
|
|
||||||
|
|
||||||
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
|
|
||||||
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
|
|
||||||
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
|
|
||||||
|
|
||||||
double3 crossws = cross(w, s);
|
|
||||||
P.x += crossws.x;
|
|
||||||
P.y += crossws.y;
|
|
||||||
P.z += crossws.z;
|
|
||||||
|
|
||||||
P.x += 0.5 * dt * charge * c / mass * Ef.x;
|
|
||||||
P.y += 0.5 * dt * charge * c / mass * Ef.y;
|
|
||||||
P.z += 0.5 * dt * charge * c / mass * Ef.z;
|
|
||||||
|
|
||||||
gP[idx] = P;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
|
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
|
||||||
|
|
||||||
@ -639,7 +558,64 @@ __global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
struct compare_particle
|
||||||
|
{
|
||||||
|
int threshold;
|
||||||
|
|
||||||
|
compare_particle() {
|
||||||
|
threshold = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_threshold(int t) {
|
||||||
|
threshold = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ __device__
|
||||||
|
bool operator()(CUDA_PART p1, CUDA_PART p2) {
|
||||||
|
return p1.label > p2.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ __device__
|
||||||
|
bool operator()(CUDA_PART p1) {
|
||||||
|
return p1.label < threshold;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct compare_particle_small
|
||||||
|
{
|
||||||
|
int threshold;
|
||||||
|
|
||||||
|
compare_particle_small() {
|
||||||
|
threshold = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_threshold(int t) {
|
||||||
|
threshold = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ __device__
|
||||||
|
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
|
||||||
|
return p1.label > p2.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ __device__
|
||||||
|
bool operator()(CUDA_PART_SMALL p1) {
|
||||||
|
return p1.label < threshold;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct less_then
|
||||||
|
{
|
||||||
|
__host__ __device__
|
||||||
|
bool operator()(int x)
|
||||||
|
{
|
||||||
|
return x < 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
||||||
bool enableRutherfordScattering)
|
bool enableRutherfordScattering)
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -701,12 +677,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
|
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart,
|
||||||
(double*)dt_ptr, npart, c);
|
(double*)dt_ptr, c);
|
||||||
} else {
|
} else {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
|
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart,
|
||||||
(double*)dt_ptr, npart, c);
|
(double*)dt_ptr, c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -714,29 +690,6 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart,
|
|
||||||
double c, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
int threads = BLOCK_SIZE;
|
|
||||||
int blocks = npart / threads + 1;
|
|
||||||
|
|
||||||
//call kernel
|
|
||||||
if (streamId == -1) {
|
|
||||||
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
|
|
||||||
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
|
|
||||||
} else {
|
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
|
||||||
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
|
|
||||||
(double3*)ef_ptr, (double3*)bf_ptr,
|
|
||||||
(double*)dt_ptr, charge, mass, npart, c);
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
int npart, int nsec,
|
int npart, int nsec,
|
||||||
|
@ -20,8 +20,7 @@
|
|||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure for storing particle on GPU or MIC as AoS.
|
* Structure for storing particle on GPU
|
||||||
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures.
|
|
||||||
*/
|
*/
|
||||||
typedef struct __align__(16) {
|
typedef struct __align__(16) {
|
||||||
int label;
|
int label;
|
||||||
@ -38,10 +37,7 @@ typedef struct __align__(16) {
|
|||||||
} CUDA_PART;
|
} CUDA_PART;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure for storing particle on GPU as AoS
|
* Structure for storing particle on GPU
|
||||||
* Structure for OPAL particle, can be used to store particles on the GPU in array of structures,
|
|
||||||
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
|
|
||||||
* on the host side.
|
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int label;
|
int label;
|
||||||
@ -51,8 +47,7 @@ typedef struct {
|
|||||||
} CUDA_PART_SMALL;
|
} CUDA_PART_SMALL;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure for storing particle on GPU as SoA.
|
* Structure for storing particle on GPU
|
||||||
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays.
|
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int *label;
|
int *label;
|
||||||
@ -70,9 +65,6 @@ typedef struct {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure for storing particle on GPU
|
* Structure for storing particle on GPU
|
||||||
* Structure for OPAL particle, can be used to store particles on the GPU in structure of arrays,
|
|
||||||
* contains only data that are used by the GPU kernels, the rest of the particle data must be kept
|
|
||||||
* on the host side.
|
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int *label;
|
int *label;
|
||||||
@ -81,39 +73,11 @@ typedef struct {
|
|||||||
double3 *Pincol;
|
double3 *Pincol;
|
||||||
} CUDA_PART2_SMALL;
|
} CUDA_PART2_SMALL;
|
||||||
|
|
||||||
/**
|
/** CudaCollimatorPhysics class.
|
||||||
* Operator used in thrust sort to compare particles by label.
|
|
||||||
* Used to move dead particles to the end of array, since they have label -1 or -2.
|
|
||||||
*/
|
|
||||||
struct compare_particle_small
|
|
||||||
{
|
|
||||||
int threshold;
|
|
||||||
|
|
||||||
compare_particle_small() {
|
|
||||||
threshold = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_threshold(int t) {
|
|
||||||
threshold = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
__host__ __device__
|
|
||||||
bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
|
|
||||||
return p1.label > p2.label;
|
|
||||||
}
|
|
||||||
|
|
||||||
__host__ __device__
|
|
||||||
bool operator()(CUDA_PART_SMALL p1) {
|
|
||||||
return p1.label < threshold;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CudaCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
|
||||||
* Contains kernels that execute CollimatorPhysics functions from OPAL.
|
* Contains kernels that execute CollimatorPhysics functions from OPAL.
|
||||||
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
|
* For detailed documentation on CollimatorPhysics functions see OPAL documentation
|
||||||
*/
|
*/
|
||||||
class CudaCollimatorPhysics : public DKSCollimatorPhysics {
|
class CudaCollimatorPhysics : public DKSCollimatorPhysics{
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -122,44 +86,32 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/** Constructor with CudaBase argument
|
||||||
* Constructor with CudaBase as argument.
|
*
|
||||||
* Create a new instance of the CudaCollimatorPhysics using existing CudaBase object.
|
|
||||||
*/
|
*/
|
||||||
CudaCollimatorPhysics(CudaBase *base) {
|
CudaCollimatorPhysics(CudaBase *base) {
|
||||||
m_base = base;
|
m_base = base;
|
||||||
base_create = false;
|
base_create = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Constructor - empty. */
|
||||||
* Empty constructor.
|
|
||||||
* Create a new instance of CudaCollimatorPhysics with its own CudaBase.
|
|
||||||
*/
|
|
||||||
CudaCollimatorPhysics() {
|
CudaCollimatorPhysics() {
|
||||||
m_base = new CudaBase();
|
m_base = new CudaBase();
|
||||||
base_create = true;
|
base_create = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Destructor - empty */
|
||||||
* Destructor.
|
|
||||||
* Destroy CudaBase object if it was created by CudaCollimatorPhysics constructor.
|
|
||||||
*/
|
|
||||||
~CudaCollimatorPhysics() {
|
~CudaCollimatorPhysics() {
|
||||||
if (base_create)
|
if (base_create)
|
||||||
delete m_base;
|
delete m_base;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/** Execute collimator physics kernel.
|
||||||
* Execute collimator physics kernel.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
int numpartices, bool enableRutherforScattering = true);
|
int numpartices, bool enableRutherfordScattering = true);
|
||||||
|
|
||||||
/**
|
|
||||||
* Special case CollimatorPhysics kernel that uses SoA instead of AoS.
|
|
||||||
* Used only on the MIC side, was not implemented on the GPU.
|
|
||||||
*/
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
@ -168,17 +120,12 @@ public:
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Sort particle array on GPU.
|
||||||
* Sort particle array on GPU.
|
|
||||||
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
|
* Count particles that are dead (label -1) or leaving material (label -2) and sort particle
|
||||||
* array so these particles are at the end of array
|
* array so these particles are at the end of array
|
||||||
*/
|
*/
|
||||||
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
||||||
|
|
||||||
/**
|
|
||||||
* Special case CollimatorPhysicsSort kernel that uses SoA instead of AoS.
|
|
||||||
* Used only on the MIC side, was not implemented on the GPU.
|
|
||||||
*/
|
|
||||||
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
@ -187,25 +134,14 @@ public:
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** BorisPusher push function for integration from OPAL.
|
||||||
* BorisPusher push function for integration from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||||
* For more details see ParallelTTracker documentation in OPAL
|
* For more details see ParallelTTracker documentation in OPAL
|
||||||
*/
|
*/
|
||||||
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false, int streamId = -1);
|
double dt, double c, bool usedt = false, int streamId = -1);
|
||||||
|
|
||||||
/**
|
/** BorisPusher push function with transformto function from OPAL
|
||||||
* BorisPusher kick function for integration from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
|
||||||
* For more details see ParallelTTracker documentation in OPAL
|
|
||||||
*/
|
|
||||||
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge, double mass,
|
|
||||||
int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* BorisPusher push function with transformto function from OPAL.
|
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||||
* For more details see ParallelTTracker documentation in OPAL
|
* For more details see ParallelTTracker documentation in OPAL
|
||||||
*/
|
*/
|
||||||
|
@ -10,11 +10,7 @@
|
|||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithms/FFT.h"
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
/**
|
class CudaFFT : public DKSFFT{
|
||||||
* Cuda FFT class based on BaseFFT interface.
|
|
||||||
* Uses cuFFT library to perform FFTs on nvidias GPUs.
|
|
||||||
*/
|
|
||||||
class CudaFFT : public BaseFFT {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -38,7 +34,7 @@ public:
|
|||||||
~CudaFFT();
|
~CudaFFT();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Init cufftPlans which can be reused for all FFTs of the same size and type
|
* Info: init cufftPlans which can be reused for all FFTs of the same size and type
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int setupFFT(int ndim, int N[3]);
|
int setupFFT(int ndim, int N[3]);
|
||||||
@ -46,21 +42,45 @@ public:
|
|||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Destroy default FFT plans
|
* Info: destroy default FFT plans
|
||||||
* Return: success or error code
|
* Return: success or error code
|
||||||
*/
|
*/
|
||||||
int destroyFFT();
|
int destroyFFT();
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute complex to complex double precision fft using cufft library
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
|
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute ifft
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute normalize using cuda kernel for complex to complex iFFT
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute real to complex double precision FFT
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute complex to real double precision FFT
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Info: execute normalize for complex to real iFFT
|
||||||
|
Return: success or error code
|
||||||
|
*/
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
|
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -189,11 +189,12 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
|
|||||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -272,20 +273,28 @@ __global__ void mirroredRhoField(double *rho2_m,
|
|||||||
id7 = rk * NI * NJ + rj * NI + i;
|
id7 = rk * NI * NJ + rj * NI + i;
|
||||||
id8 = rk * NI * NJ + rj * NI + ri;
|
id8 = rk * NI * NJ + rj * NI + ri;
|
||||||
|
|
||||||
|
|
||||||
double data = rho2_m[id1];
|
double data = rho2_m[id1];
|
||||||
if (i != 0) rho2_m[id2] = data;
|
if (i != 0)
|
||||||
|
rho2_m[id2] = data;
|
||||||
|
|
||||||
if (j != 0) rho2_m[id3] = data;
|
if (j != 0)
|
||||||
|
rho2_m[id3] = data;
|
||||||
|
|
||||||
if (i != 0 && j != 0) rho2_m[id4] = data;
|
if (i != 0 && j != 0)
|
||||||
|
rho2_m[id4] = data;
|
||||||
|
|
||||||
if (k != 0) rho2_m[id5] = data;
|
if (k != 0)
|
||||||
|
rho2_m[id5] = data;
|
||||||
|
|
||||||
if (k != 0 && i != 0) rho2_m[id6] = data;
|
if (k != 0 && i != 0)
|
||||||
|
rho2_m[id6] = data;
|
||||||
|
|
||||||
if (k!= 0 && j != 0) rho2_m[id7] = data;
|
if (k!= 0 && j != 0)
|
||||||
|
rho2_m[id7] = data;
|
||||||
|
|
||||||
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
|
if (k != 0 && j != 0 & i != 0)
|
||||||
|
rho2_m[id8] = data;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,9 +363,9 @@ CudaGreensFunction::~CudaGreensFunction() {
|
|||||||
delete m_base;
|
delete m_base;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
double hr_m0, double hr_m1, double hr_m2,
|
||||||
int streamId)
|
int streamId)
|
||||||
{
|
{
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
@ -364,7 +373,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
//if no stream specified use default stream
|
//if no stream specified use default stream
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
|
kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -372,7 +381,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
|
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -380,17 +389,15 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
|
||||||
int I, int J, int K,
|
int I, int J, int K,
|
||||||
int streamId)
|
int streamId)
|
||||||
{
|
{
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
int block = (I * J * K / thread) + 1;
|
int block = (I * J * K / thread) + 1;
|
||||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
|
|
||||||
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
@ -399,7 +406,6 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
|
|
||||||
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
@ -409,22 +415,22 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
|
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
|
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I, 2*J);
|
mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J);
|
||||||
mirroredRhoField<<< block, thread >>>( (double *) rho2_m, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
|
mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I, 2*J);
|
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J);
|
||||||
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
|
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -434,13 +440,13 @@ int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int st
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2,
|
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
|
||||||
int size, int streamId) {
|
int size, int streamId) {
|
||||||
|
|
||||||
int threads = 128;
|
int threads = 128;
|
||||||
int blocks = size / threads + 1;
|
int blocks = size / threads + 1;
|
||||||
int datasize = 2 * threads * sizeof(cuDoubleComplex);
|
int datasize = 2 * threads * sizeof(cuDoubleComplex);
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
|
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
|
||||||
(cuDoubleComplex*)ptr2,
|
(cuDoubleComplex*)ptr2,
|
||||||
|
@ -2,18 +2,17 @@
|
|||||||
#define H_CUDA_GREENSFUNCTION
|
#define H_CUDA_GREENSFUNCTION
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cmath>
|
#include <math.h>
|
||||||
|
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <cuComplex.h>
|
#include <cuComplex.h>
|
||||||
#include "cublas_v2.h"
|
#include "cublas_v2.h"
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
/** CUDA implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
class CudaGreensFunction {
|
||||||
class CudaGreensFunction : public GreensFunction{
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -31,32 +30,32 @@ public:
|
|||||||
/* destructor */
|
/* destructor */
|
||||||
~CudaGreensFunction();
|
~CudaGreensFunction();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: calc integral on device memory (taken from OPAL src code)
|
Info: calc integral on device memory (taken from OPAL src code)
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
double hr_m0, double hr_m1, double hr_m2,
|
||||||
int streamId = -1);
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: integration of rho2_m field (taken from OPAL src code)
|
Info: integration of rho2_m field (taken from OPAL src code)
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
||||||
int streamId = -1);
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: mirror rho field (taken from OPAL src code)
|
Info: mirror rho field (taken from OPAL src code)
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
|
int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -10,7 +10,6 @@
|
|||||||
#include "../Algorithms/ImageReconstruction.h"
|
#include "../Algorithms/ImageReconstruction.h"
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
/** CUDA implementation of ImageReconstruction interface. */
|
|
||||||
class CudaImageReconstruction : public ImageReconstruction {
|
class CudaImageReconstruction : public ImageReconstruction {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -83,56 +83,6 @@ __device__ double ifld(double t, double alpha, double phi, double nu, double lam
|
|||||||
return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
|
|
||||||
double wt = TWO_PI*nu*t;
|
|
||||||
double rate2 = sigma*sigma*t*t;
|
|
||||||
double rateL = 0.0;
|
|
||||||
double result = 0.0;
|
|
||||||
|
|
||||||
// make sure lambda > 0
|
|
||||||
if (lambda < 0.0)
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
if (beta < 0.001) {
|
|
||||||
rateL = 1.0;
|
|
||||||
} else {
|
|
||||||
rateL = pow(lambda*t, beta);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nu < 0.01) {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
|
|
||||||
} else {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
|
|
||||||
double wt = TWO_PI*nu*t;
|
|
||||||
double at = a*t;
|
|
||||||
double rateL = 0.0;
|
|
||||||
double result = 0.0;
|
|
||||||
|
|
||||||
// make sure lambda > 0
|
|
||||||
if (lambda < 0.0)
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
if (beta < 0.001) {
|
|
||||||
rateL = 1.0;
|
|
||||||
} else {
|
|
||||||
rateL = pow(lambda*t, beta);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nu < 0.01) {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
|
|
||||||
} else {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ double b(double t, double phi, double nu) {
|
__device__ double b(double t, double phi, double nu) {
|
||||||
return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
|
return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
|
||||||
}
|
}
|
||||||
|
455
src/DKSBase.cpp
455
src/DKSBase.cpp
@ -103,14 +103,25 @@ DKSBase::DKSBase() {
|
|||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
cbase = new CudaBase();
|
cbase = new CudaBase();
|
||||||
|
cfft = new CudaFFT(cbase);
|
||||||
|
cgreens = new CudaGreensFunction(cbase);
|
||||||
|
cchi = new CudaChiSquare(cbase);
|
||||||
|
ccol = new CudaCollimatorPhysics(cbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
oclbase = new OpenCLBase();
|
oclbase = new OpenCLBase();
|
||||||
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
micbase = new MICBase();
|
micbase = new MICBase();
|
||||||
|
micfft = new MICFFT(micbase);
|
||||||
|
miccol = new MICCollimatorPhysics(micbase);
|
||||||
|
micgreens = new MICGreensFunction(micbase);
|
||||||
|
micchi = new MICChiSquare(micbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -127,14 +138,25 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
|
|||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
cbase = new CudaBase();
|
cbase = new CudaBase();
|
||||||
|
cfft = new CudaFFT(cbase);
|
||||||
|
cgreens = new CudaGreensFunction(cbase);
|
||||||
|
cchi = new CudaChiSquare(cbase);
|
||||||
|
ccol = new CudaCollimatorPhysics(cbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
oclbase = new OpenCLBase();
|
oclbase = new OpenCLBase();
|
||||||
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
micbase = new MICBase();
|
micbase = new MICBase();
|
||||||
|
micfft = new MICFFT(micbase);
|
||||||
|
miccol = new MICCollimatorPhysics(micbase);
|
||||||
|
micgreens = new MICGreensFunction(micbase);
|
||||||
|
micchi = new MICChiSquare(micbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -151,16 +173,27 @@ DKSBase::~DKSBase() {
|
|||||||
if (m_function_name != NULL)
|
if (m_function_name != NULL)
|
||||||
delete[] m_function_name;
|
delete[] m_function_name;
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
|
delete cfft;
|
||||||
|
delete cgreens;
|
||||||
|
delete cchi;
|
||||||
|
delete ccol;
|
||||||
delete cbase;
|
delete cbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
|
delete oclfft;
|
||||||
|
delete oclchi;
|
||||||
|
delete oclcol;
|
||||||
delete oclbase;
|
delete oclbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
|
delete micfft;
|
||||||
|
delete miccol;
|
||||||
|
delete micgreens;
|
||||||
|
delete micchi;
|
||||||
delete micbase;
|
delete micbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -274,45 +307,38 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int DKSBase::setupDevice() {
|
/*
|
||||||
|
init device
|
||||||
int ierr = DKS_ERROR;
|
*/
|
||||||
|
int DKSBase::initDevice() {
|
||||||
|
|
||||||
//if api is not set default is OpenCL
|
//if api is not set default is OpenCL
|
||||||
if (!m_api_set) {
|
if (!m_api_set) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||||
} else {
|
} else {
|
||||||
if (apiOpenCL()) {
|
if (apiOpenCL()) {
|
||||||
if (!m_device_set) {
|
if (!m_device_set) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||||
} else {
|
} else {
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
|
||||||
}
|
}
|
||||||
} else if (apiCuda()) {
|
} else if (apiCuda()) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_CUDA, 4);
|
setAPI(API_CUDA, 4);
|
||||||
ierr = CUDA_SAFECALL(DKS_SUCCESS);
|
return CUDA_SAFECALL(DKS_SUCCESS);
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
setDevice("-mic", 4);
|
setDevice("-mic", 4);
|
||||||
setAPI(API_OPENMP, 6);
|
setAPI(API_OPENMP, 6);
|
||||||
ierr = MIC_SAFECALL(DKS_SUCCESS);
|
return MIC_SAFECALL(DKS_SUCCESS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ierr;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
init device
|
|
||||||
*/
|
|
||||||
int DKSBase::initDevice() {
|
|
||||||
return setupDevice();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -430,15 +456,358 @@ int DKSBase::syncDevice() {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup fft plans to reuse if multiple ffts of same size are needed */
|
||||||
|
int DKSBase::setupFFT(int ndim, int N[3]) {
|
||||||
|
|
||||||
int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
|
if (apiCuda()) {
|
||||||
if (apiCuda())
|
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
|
||||||
return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
|
} else if (apiOpenMP()) {
|
||||||
if (apiOpenCL())
|
//micbase.mic_setupFFT(ndim, N);
|
||||||
return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
|
//BENI: setting up RC and CR transformations on MIC
|
||||||
|
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
|
||||||
|
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
|
||||||
|
if (ierr1 != DKS_SUCCESS)
|
||||||
|
return ierr1;
|
||||||
|
if (ierr2 != DKS_SUCCESS)
|
||||||
|
return ierr2;
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
//BENI:
|
||||||
|
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//BENI:
|
||||||
|
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call FFT function for selected platform (OpenCL, CUDA or MIC) */
|
||||||
|
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
//load kernel and execute
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call IFFT function for selected platform (OpenCL, CUDA or MIC) */
|
||||||
|
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call normalize FFT function for selected platform */
|
||||||
|
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call real to complex FFT */
|
||||||
|
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call complex to real FFT */
|
||||||
|
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* normalize complex to real iFFT */
|
||||||
|
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* transpose 2D and 3D arrays (OpenCL implementation only) */
|
||||||
|
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||||
|
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||||
|
hz_m0, hz_m1, hz_m2, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
//BENI:
|
||||||
|
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||||
|
int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
|
||||||
|
double fTimeResolution, double fRebin,
|
||||||
|
int sensors, int length, int numpar, double &result)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||||
|
fTimeResolution, fRebin,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||||
|
fTimeResolution, fRebin,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result)
|
||||||
|
{
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result)
|
||||||
|
{
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
|
int numparticles, int numparams,
|
||||||
|
int &numaddback, int &numdead)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
}
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
||||||
|
bool enableRutherfordScattering)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles,
|
||||||
|
enableRutherfordScattering) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
|
||||||
|
rx_ptr, ry_ptr, rz_ptr,
|
||||||
|
px_ptr, py_ptr, pz_ptr,
|
||||||
|
par_ptr, numparticles) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles, int &numaddback)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
|
||||||
|
rx_ptr, ry_ptr, rz_ptr,
|
||||||
|
px_ptr, py_ptr, pz_ptr,
|
||||||
|
par_ptr, numparticles, numaddback));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int DKSBase::callInitRandoms(int size, int seed) {
|
int DKSBase::callInitRandoms(int size, int seed) {
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
@ -452,3 +821,43 @@ int DKSBase::callInitRandoms(int size, int seed) {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
||||||
|
void *dt_ptr, double dt, double c,
|
||||||
|
bool usedt, int streamId)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
|
||||||
|
usedt, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
|
int npart, int nsec, void *dt_ptr, double dt,
|
||||||
|
double c, bool usedt, int streamId)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||||
|
lastSec_ptr, orient_ptr,
|
||||||
|
npart, nsec, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||||
|
lastSec_ptr, orient_ptr,
|
||||||
|
npart, nsec, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
286
src/DKSBase.h
286
src/DKSBase.h
@ -1,3 +1,11 @@
|
|||||||
|
/** DKSBase class.
|
||||||
|
* DKSBase.h
|
||||||
|
* Author: Uldis Locans
|
||||||
|
* Date: 15.09.2014
|
||||||
|
* Base class of Dynamic Kernel Scheduler that handles the function calls
|
||||||
|
* from host application to DKS
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef H_DKS_BASE
|
#ifndef H_DKS_BASE
|
||||||
#define H_DKS_BASE
|
#define H_DKS_BASE
|
||||||
|
|
||||||
@ -21,24 +29,34 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "OpenCL/OpenCLBase.h"
|
#include "OpenCL/OpenCLBase.h"
|
||||||
|
#include "OpenCL/OpenCLFFT.h"
|
||||||
|
#include "OpenCL/OpenCLChiSquare.h"
|
||||||
|
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
#include "CUDA/CudaBase.cuh"
|
#include "CUDA/CudaBase.cuh"
|
||||||
|
#include "CUDA/CudaFFT.cuh"
|
||||||
|
#include "CUDA/CudaGreensFunction.cuh"
|
||||||
|
#include "CUDA/CudaChiSquare.cuh"
|
||||||
|
#include "CUDA/CudaCollimatorPhysics.cuh"
|
||||||
|
#include "nvToolsExt.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
#include "MIC/MICBase.h"
|
#include "MIC/MICBase.h"
|
||||||
|
#include "MIC/MICChiSquare.h"
|
||||||
|
#include "MIC/MICFFT.h"
|
||||||
|
#include "MIC/MICCollimatorPhysics.h"
|
||||||
|
#include "MIC/MICGreensFunction.hpp"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "Algorithms/CollimatorPhysics.h"
|
||||||
|
#include "Algorithms/FFT.h"
|
||||||
|
|
||||||
#include "AutoTuning/DKSConfig.h"
|
#include "AutoTuning/DKSConfig.h"
|
||||||
|
|
||||||
/**
|
/** DKSBase class for handling function calls to DKS library */
|
||||||
* API for handling communication function calls to DKS library.
|
|
||||||
* DKSBase class uses CudaBase, OpenCLBase and MICBase to handle setup of device,
|
|
||||||
* memory management, data transfer and other basic communication functions between
|
|
||||||
* the host and device.
|
|
||||||
*/
|
|
||||||
class DKSBase {
|
class DKSBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -55,14 +73,25 @@ private:
|
|||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
OpenCLBase *oclbase;
|
OpenCLBase *oclbase;
|
||||||
|
OpenCLFFT *oclfft;
|
||||||
|
OpenCLChiSquare *oclchi;
|
||||||
|
OpenCLCollimatorPhysics *oclcol;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
CudaBase *cbase;
|
CudaBase *cbase;
|
||||||
|
CudaFFT *cfft;
|
||||||
|
CudaGreensFunction *cgreens;
|
||||||
|
CudaChiSquare *cchi;
|
||||||
|
CudaCollimatorPhysics *ccol;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
MICBase *micbase;
|
MICBase *micbase;
|
||||||
|
MICFFT *micfft;
|
||||||
|
MICCollimatorPhysics *miccol;
|
||||||
|
MICGreensFunction *micgreens;
|
||||||
|
MICChiSquare *micchi;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -71,7 +100,7 @@ protected:
|
|||||||
DKSConfig dksconfig;
|
DKSConfig dksconfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if current API is set to OpenCL.
|
* Check if current API is set to OpenCL
|
||||||
* Return true/false whether current api is opencl
|
* Return true/false whether current api is opencl
|
||||||
*/
|
*/
|
||||||
bool apiOpenCL();
|
bool apiOpenCL();
|
||||||
@ -88,11 +117,11 @@ protected:
|
|||||||
*/
|
*/
|
||||||
bool apiOpenMP();
|
bool apiOpenMP();
|
||||||
|
|
||||||
/** Check if device is GPU. */
|
/** Check if device is GPU */
|
||||||
bool deviceGPU();
|
bool deviceGPU();
|
||||||
/** Check if device is CPU. */
|
/** Check if device is CPU */
|
||||||
bool deviceCPU();
|
bool deviceCPU();
|
||||||
/** Check if device is MIC. */
|
/** Check if device is MIC */
|
||||||
bool deviceMIC();
|
bool deviceMIC();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -110,12 +139,6 @@ protected:
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
MICBase *getMICBase() {
|
|
||||||
return micbase;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/** Call OpenCL base to load specified kernel file.
|
/** Call OpenCL base to load specified kernel file.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ -131,7 +154,6 @@ protected:
|
|||||||
return device_name;
|
return device_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -151,11 +173,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
~DKSBase();
|
~DKSBase();
|
||||||
|
|
||||||
/** Function to initialize objects based on the device used.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
int setupDevice();
|
|
||||||
|
|
||||||
/** Turn on auto tuning */
|
/** Turn on auto tuning */
|
||||||
void setAutoTuningOn() { m_auto_tuning = true; }
|
void setAutoTuningOn() { m_auto_tuning = true; }
|
||||||
|
|
||||||
@ -388,7 +405,7 @@ public:
|
|||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
void * mem_ptr = NULL;
|
void * mem_ptr = NULL;
|
||||||
mem_ptr = micbase->mic_allocateMemory<T>(elements);
|
mem_ptr = micbase.mic_allocateMemory<T>(elements);
|
||||||
return mem_ptr;
|
return mem_ptr;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -481,7 +498,7 @@ public:
|
|||||||
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
||||||
|
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
|
return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -515,7 +532,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -815,7 +832,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
|
return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -843,7 +860,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
||||||
streamId, offset));
|
streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -863,32 +880,229 @@ public:
|
|||||||
else if (apiCuda())
|
else if (apiCuda())
|
||||||
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
|
return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Create random numbers on the device and fill mem_data array
|
///////////////////////////////////////////////
|
||||||
|
///////Function library part of dksbase////////
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Setup FFT function.
|
||||||
|
* Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
|
||||||
|
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
|
||||||
|
* each fft will do its own setup according to fft size and dimensions.
|
||||||
|
* TODO: opencl and mic implementations
|
||||||
*/
|
*/
|
||||||
int callCreateRandomNumbers(void *mem_ptr, int size);
|
int setupFFT(int ndim, int N[3]);
|
||||||
|
//BENI:
|
||||||
|
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
||||||
|
//BENI:
|
||||||
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex-to-complex fft.
|
||||||
|
* Executes in place complex to complex fft on the device on data pointed to by data_ptr.
|
||||||
|
* stream id can be specified to use other streams than default.
|
||||||
|
* TODO: mic implementation
|
||||||
|
*/
|
||||||
|
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex-to-complex ifft.
|
||||||
|
* Executes in place complex to complex ifft on the device on data pointed to by data_ptr.
|
||||||
|
* stream id can be specified to use other streams than default.
|
||||||
|
* TODO: mic implementation.
|
||||||
|
*/
|
||||||
|
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize complex to complex ifft.
|
||||||
|
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
||||||
|
* fft size
|
||||||
|
* TODO: mic implementation.
|
||||||
|
*/
|
||||||
|
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call real to complex FFT.
|
||||||
|
* Executes out of place real to complex fft, real_ptr points to real data, comp_ptr - points
|
||||||
|
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
||||||
|
* should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
|
||||||
|
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
||||||
|
* TODO: opencl and mic implementations
|
||||||
|
*/
|
||||||
|
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex to real iFFT.
|
||||||
|
* Executes out of place complex to real ifft, real_ptr points to real data, comp_ptr - points
|
||||||
|
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
||||||
|
* should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
|
||||||
|
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize complex to real ifft.
|
||||||
|
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
||||||
|
* fft size.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transpose 2D and 3D arrays, OpenCL implementation
|
||||||
|
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
|
||||||
|
*/
|
||||||
|
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||||
|
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||||
|
int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Element by element multiplication.
|
||||||
|
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
|
||||||
|
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chi square for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results
|
||||||
|
*/
|
||||||
|
int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
|
||||||
|
double fTimeResolution, double fRebin,
|
||||||
|
int sensors, int length, int numpar, double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* max-log-likelihood for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
|
||||||
|
* mem_par - pointer to parameter set, mem_results - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffser,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* max-log-likelihood for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
|
||||||
|
* mem_par - pointer to parameter set, mem_results - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffser,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
|
int numparticles, int numparams,
|
||||||
|
int &numaddback, int &numdead);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
||||||
|
bool enableRutherfordScattering = true);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles, int &numaddback);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Init random number states and save for reuse on device.
|
* Init random number states and save for reuse on device.
|
||||||
* If seed is -1, a random seed based on current time is taken.
|
|
||||||
* TODO: opencl and mic implementations.
|
* TODO: opencl and mic implementations.
|
||||||
*/
|
*/
|
||||||
int callInitRandoms(int size, int seed = -1);
|
int callInitRandoms(int size, int seed = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integration code from ParallelTTracker from OPAL.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
||||||
|
*/
|
||||||
|
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
||||||
|
void *dt_ptr, double dt, double c,
|
||||||
|
bool usedt = false, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integration code from ParallelTTracker from OPAL.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
||||||
|
*/
|
||||||
|
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
|
int npart, int nsec, void *dt_ptr,
|
||||||
|
double dt, double c, bool usedt = false,
|
||||||
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print memory information on device (total, used, available)
|
* Print memory information on device (total, used, available)
|
||||||
* TODO: opencl and mic implementation
|
* TODO: opencl and mic implementation
|
||||||
*/
|
*/
|
||||||
int callMemInfo() {
|
int callMemInfo() {
|
||||||
#ifdef DKS_CUDA
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cbase->cuda_memInfo());
|
return CUDA_SAFECALL(cbase->cuda_memInfo());
|
||||||
#endif
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -897,12 +1111,10 @@ public:
|
|||||||
* Used for debugging and timing purposes only.
|
* Used for debugging and timing purposes only.
|
||||||
*/
|
*/
|
||||||
void oclEventInfo() {
|
void oclEventInfo() {
|
||||||
#ifdef DKS_OPENCL
|
|
||||||
if (apiOpenCL())
|
if (apiOpenCL())
|
||||||
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
|
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test function to profile opencl kernel calls.
|
* Test function to profile opencl kernel calls.
|
||||||
|
@ -24,7 +24,6 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
|
|||||||
//if we are not auto tuning and the size of the problem has changed find the new parameters
|
//if we are not auto tuning and the size of the problem has changed find the new parameters
|
||||||
//from autotuning config file
|
//from autotuning config file
|
||||||
if (!isAutoTuningOn() && length != chiSquareSize_m) {
|
if (!isAutoTuningOn() && length != chiSquareSize_m) {
|
||||||
/*
|
|
||||||
int numBlocks, blockSize;
|
int numBlocks, blockSize;
|
||||||
std::string device_name;
|
std::string device_name;
|
||||||
getDeviceName(device_name);
|
getDeviceName(device_name);
|
||||||
@ -34,8 +33,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
|
|||||||
length, "BlockSize", blockSize);
|
length, "BlockSize", blockSize);
|
||||||
chiSq->setKernelParams(numBlocks, blockSize);
|
chiSq->setKernelParams(numBlocks, blockSize);
|
||||||
|
|
||||||
std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
|
//std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
|
||||||
*/
|
|
||||||
chiSquareSize_m = length;
|
chiSquareSize_m = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8,7 +8,6 @@
|
|||||||
#include "AutoTuning/DKSAutoTuningTester.h"
|
#include "AutoTuning/DKSAutoTuningTester.h"
|
||||||
|
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
#include "Algorithms/ChiSquareRuntime.h"
|
#include "Algorithms/ChiSquareRuntime.h"
|
||||||
|
|
||||||
@ -20,12 +19,7 @@
|
|||||||
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
class DKSBaseMuSR : public DKSBase {
|
||||||
* API to handle musrfit calls to DKS library.
|
|
||||||
* Using ChiSquareRuntime interface allows to call chi square functions on the
|
|
||||||
* GPU or CPU using CUDA or OpenCL.
|
|
||||||
*/
|
|
||||||
class DKSBaseMuSR : public DKSFFT {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
@ -62,12 +62,6 @@
|
|||||||
#define OPENCL_SAFEINIT(x) ( NULL )
|
#define OPENCL_SAFEINIT(x) ( NULL )
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#define OPENCL_SAFEINIT_AMD(x) ( x )
|
|
||||||
#else
|
|
||||||
#define OPENCL_SAFEINIT_AMD(x) ( NULL )
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
#define MIC_SAFEINIT(x) ( x )
|
#define MIC_SAFEINIT(x) ( x )
|
||||||
#else
|
#else
|
||||||
|
147
src/DKSFFT.cpp
147
src/DKSFFT.cpp
@ -1,147 +0,0 @@
|
|||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
DKSFFT::DKSFFT() {
|
|
||||||
dksfft = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSFFT::~DKSFFT() {
|
|
||||||
delete dksfft;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* setup fft plans to reuse if multiple ffts of same size are needed */
|
|
||||||
int DKSFFT::setupFFT(int ndim, int N[3]) {
|
|
||||||
|
|
||||||
if (apiCuda()) {
|
|
||||||
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
} else if (apiOpenCL()) {
|
|
||||||
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
|
|
||||||
int ierr1 = dksfft->setupFFT(ndim, N);
|
|
||||||
int ierr2 = dksfft->setupFFTRC(ndim, N);
|
|
||||||
int ierr3 = dksfft->setupFFTCR(ndim, N);
|
|
||||||
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
//micbase.mic_setupFFT(ndim, N);
|
|
||||||
//BENI: setting up RC and CR transformations on MIC
|
|
||||||
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
|
|
||||||
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
|
|
||||||
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
|
|
||||||
if (ierr1 != DKS_SUCCESS)
|
|
||||||
return ierr1;
|
|
||||||
if (ierr2 != DKS_SUCCESS)
|
|
||||||
return ierr2;
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
//BENI:
|
|
||||||
int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
if (apiOpenCL())
|
|
||||||
return dksfft->setupFFTRC(ndim, N);
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return dksfft->setupFFTRC(ndim, N, scale);
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//BENI:
|
|
||||||
int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
if (apiOpenCL())
|
|
||||||
return dksfft->setupFFTCR(ndim, N);
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return dksfft->setupFFTCR(ndim, N, scale);
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call OpenCL FFT function for selected platform */
|
|
||||||
int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeFFT(data_ptr, ndim, dimsize);
|
|
||||||
else if (apiCuda())
|
|
||||||
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call OpenCL IFFT function for selected platform */
|
|
||||||
int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
|
|
||||||
else if (apiCuda())
|
|
||||||
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call normalize FFT function for selected platform */
|
|
||||||
int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiOpenCL()) {
|
|
||||||
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
|
|
||||||
else
|
|
||||||
return DKS_ERROR;
|
|
||||||
} else if (apiCuda()) {
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call real to complex FFT */
|
|
||||||
int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call complex to real FFT */
|
|
||||||
int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* normalize complex to real iFFT */
|
|
||||||
int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL())
|
|
||||||
return DKS_ERROR;
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
112
src/DKSFFT.h
112
src/DKSFFT.h
@ -1,112 +0,0 @@
|
|||||||
#ifndef H_DKSBASE_FFT
|
|
||||||
#define H_DKSBASE_FFT
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include "AutoTuning/DKSAutoTuning.h"
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
|
|
||||||
#include "DKSDefinitions.h"
|
|
||||||
|
|
||||||
#include "Algorithms/GreensFunction.h"
|
|
||||||
#include "Algorithms/CollimatorPhysics.h"
|
|
||||||
#include "Algorithms/FFT.h"
|
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#include "OpenCL/OpenCLFFT.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
|
||||||
#include "CUDA/CudaFFT.cuh"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
#include "MIC/MICFFT.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
|
||||||
* API to handle calls to DKSFFT.
|
|
||||||
* Using DKSFFT interface executes FFT on GPUs, CPUs and MICs using cuFFT, clFFT or MKL libraries.
|
|
||||||
*/
|
|
||||||
class DKSFFT : public DKSBase {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
BaseFFT *dksfft;
|
|
||||||
|
|
||||||
int initFFT();
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
DKSFFT();
|
|
||||||
~DKSFFT();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Setup FFT function.
|
|
||||||
* Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
|
|
||||||
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
|
|
||||||
* each fft will do its own setup according to fft size and dimensions.
|
|
||||||
* TODO: opencl and mic implementations
|
|
||||||
*/
|
|
||||||
int setupFFT(int ndim, int N[3]);
|
|
||||||
//BENI:
|
|
||||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
|
||||||
//BENI:
|
|
||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex-to-complex fft.
|
|
||||||
* Executes in place complex to complex fft on the device on data pointed to by data_ptr.
|
|
||||||
* stream id can be specified to use other streams than default.
|
|
||||||
* TODO: mic implementation
|
|
||||||
*/
|
|
||||||
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex-to-complex ifft.
|
|
||||||
* Executes in place complex to complex ifft on the device on data pointed to by data_ptr.
|
|
||||||
* stream id can be specified to use other streams than default.
|
|
||||||
* TODO: mic implementation.
|
|
||||||
*/
|
|
||||||
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize complex to complex ifft.
|
|
||||||
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
|
||||||
* fft size
|
|
||||||
* TODO: mic implementation.
|
|
||||||
*/
|
|
||||||
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call real to complex FFT.
|
|
||||||
* Executes out of place real to complex fft, real_ptr points to real data, comp_ptr points
|
|
||||||
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
|
||||||
* should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
|
|
||||||
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
|
||||||
* TODO: opencl and mic implementations
|
|
||||||
*/
|
|
||||||
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex to real iFFT.
|
|
||||||
* Executes out of place complex to real ifft, real_ptr points to real data, comp_ptr points
|
|
||||||
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
|
||||||
* should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
|
|
||||||
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize complex to real ifft.
|
|
||||||
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
|
||||||
* fft size.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -10,9 +10,6 @@
|
|||||||
#include "CUDA/CudaImageReconstruction.cuh"
|
#include "CUDA/CudaImageReconstruction.cuh"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
|
||||||
* API to handle PET image reconstruction calls.
|
|
||||||
*/
|
|
||||||
class DKSImageRecon : public DKSBase {
|
class DKSImageRecon : public DKSBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -25,88 +22,87 @@ public:
|
|||||||
|
|
||||||
~DKSImageRecon();
|
~DKSImageRecon();
|
||||||
|
|
||||||
/**
|
/** Image reconstruction analysis calculate source.
|
||||||
* Image reconstruction analysis calculate source.
|
*
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
int callCalculateSource(void *image_space, void *image_position, void *source_position,
|
int callCalculateSource(void *image_space, void *image_position, void *source_position,
|
||||||
void *avg, void *std, float diameter, int total_voxels,
|
void *avg, void *std, float diameter, int total_voxels,
|
||||||
int total_sources, int start = 0);
|
int total_sources, int start = 0);
|
||||||
|
|
||||||
/**
|
/** Image reconstruction analysis calculate source.
|
||||||
* Image reconstruction analysis calculate source.
|
*
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
int callCalculateBackground(void *image_space, void *image_position, void *source_position,
|
int callCalculateBackground(void *image_space, void *image_position, void *source_position,
|
||||||
void *avg, void *std, float diameter, int total_voxels,
|
void *avg, void *std, float diameter, int total_voxels,
|
||||||
int total_sources, int start = 0);
|
int total_sources, int start = 0);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/** Image reconstruction analysis calculate source.
|
||||||
* Image reconstruction analysis calculate source.
|
*
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
int callCalculateSources(void *image_space, void *image_position, void *source_position,
|
int callCalculateSources(void *image_space, void *image_position, void *source_position,
|
||||||
void *avg, void *std, void *diameter, int total_voxels,
|
void *avg, void *std, void *diameter, int total_voxels,
|
||||||
int total_sources, int start = 0);
|
int total_sources, int start = 0);
|
||||||
|
|
||||||
/**
|
/** Image reconstruction analysis calculate source.
|
||||||
* Image reconstruction analysis calculate source.
|
*
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
|
int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
|
||||||
void *avg, void *std, void *diameter, int total_voxels,
|
void *avg, void *std, void *diameter, int total_voxels,
|
||||||
int total_sources, int start = 0);
|
int total_sources, int start = 0);
|
||||||
|
|
||||||
/**
|
/** Image reconstruction - generate normalization.
|
||||||
* Image reconstruction - generate normalization.
|
*
|
||||||
*/
|
*/
|
||||||
int callGenerateNormalization(void *recon, void *image_position,
|
int callGenerateNormalization(void *recon, void *image_position,
|
||||||
void *det_position, int total_det);
|
void *det_position, int total_det);
|
||||||
|
|
||||||
/**
|
/** Image reconstruction - forward correction.
|
||||||
* Image reconstruction - forward correction.
|
*
|
||||||
*/
|
*/
|
||||||
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
|
||||||
void *image_position, int num_events);
|
void *image_position, int num_events);
|
||||||
|
|
||||||
/**
|
/** Image reconstruction - backward projection.
|
||||||
* Image reconstruction - backward projection.
|
*
|
||||||
*/
|
*/
|
||||||
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
|
int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
|
||||||
void *det_position, void *image_position,
|
void *det_position, void *image_position,
|
||||||
int num_events, int num_voxels);
|
int num_events, int num_voxels);
|
||||||
|
|
||||||
/**
|
/** Set the voxel dimensions on device.
|
||||||
* Set the voxel dimensions on device.
|
|
||||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||||
* Call set function once to transfer the values from host side to GPU.
|
* Call set function once to transfer the values from host side to GPU.
|
||||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||||
*/
|
*/
|
||||||
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
|
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
|
||||||
|
|
||||||
/**
|
/** Set the image edge.
|
||||||
* Set the image edge.
|
|
||||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||||
* Call set function once to transfer the values from host side to GPU.
|
* Call set function once to transfer the values from host side to GPU.
|
||||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||||
*/
|
*/
|
||||||
int setEdge(float x_edge, float y_edge, float z_edge);
|
int setEdge(float x_edge, float y_edge, float z_edge);
|
||||||
|
|
||||||
/**
|
/** Set the image edge1.
|
||||||
* Set the image edge1.
|
|
||||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||||
* Call set function once to transfer the values from host side to GPU.
|
* Call set function once to transfer the values from host side to GPU.
|
||||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||||
*/
|
*/
|
||||||
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
|
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
|
||||||
|
|
||||||
/**
|
/** Set the minimum crystal in one ring values.
|
||||||
* Set the minimum crystal in one ring values.
|
|
||||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||||
* Call set function once to transfer the values from host side to GPU.
|
* Call set function once to transfer the values from host side to GPU.
|
||||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||||
*/
|
*/
|
||||||
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
|
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
|
||||||
|
|
||||||
/**
|
/** Set all other required parameters for reconstruction.
|
||||||
* Set all other required parameters for reconstruction.
|
|
||||||
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
* Values are stored in GPU memory and used in forward and backward projection calculations.
|
||||||
* Call set function once to transfer the values from host side to GPU.
|
* Call set function once to transfer the values from host side to GPU.
|
||||||
* If value changes on the host side set functions needs to be called again to update GPU values.
|
* If value changes on the host side set functions needs to be called again to update GPU values.
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
/**
|
|
||||||
\mainpage
|
|
||||||
|
|
||||||
<P>
|
|
||||||
<B>
|
|
||||||
The aim of DKS is to allow the creation of fast fine tuned kernels using device specific frameworks such as CUDA, OpenCL, OpenACC and OpenMP and accelerator libraries such as Thrust, Nvidia CUDA libraries, Intel MKL or others. On top of that, DKS allows the easy use of these kernels in host applications without providing any device or framework specific details. This approach facilitates the integration of different types of devices in the existing applications with minimal code changes and makes the device and the host code a lot more manageable.
|
|
||||||
</B>
|
|
||||||
<P>
|
|
||||||
|
|
||||||
The main parts of DKS are:
|
|
||||||
<ul>
|
|
||||||
<li>DKSBase - provides the basic communication functions between host application and hardware accelerators including memory management, data transfer and synchronization.</li>
|
|
||||||
<li>DKSOPAL - provides functions for Object Oriented Particle Accelerator library to offload FFTPoisson calculations and particle matter interaction using Monte Carlo simulations to GPU and Intel MIC</li>
|
|
||||||
<li>DKSBaseMuSR - provides functions to perform parameter fitting for musrfit on the GPU</li>
|
|
||||||
<li>DKSImageRecon - provides functions to perform PET image reconstruction on the GPU</li>
|
|
||||||
<li>DKSFFT - provides functions to perform FFT on the GPU and Intel MIC</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<P>
|
|
||||||
<B>
|
|
||||||
Developed by
|
|
||||||
Uldis Locans
|
|
||||||
</B>
|
|
||||||
|
|
||||||
<P>
|
|
||||||
For further information contact: locans.uldis@psi.ch - Uldis Locans
|
|
||||||
<P>
|
|
||||||
|
|
||||||
<P>
|
|
||||||
<a href="https://gitlab.psi.ch/uldis_l/DKS">DKS on gitlab</a><br>
|
|
||||||
|
|
||||||
*/
|
|
162
src/DKSOPAL.cpp
162
src/DKSOPAL.cpp
@ -1,162 +0,0 @@
|
|||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
DKSOPAL::DKSOPAL() {
|
|
||||||
dkscol = nullptr;
|
|
||||||
dksgreens = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
|
|
||||||
setAPI(api_name, strlen(api_name));
|
|
||||||
setDevice(device_name, strlen(device_name));
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSOPAL::~DKSOPAL() {
|
|
||||||
delete dkscol;
|
|
||||||
delete dksgreens;
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::setupOPAL() {
|
|
||||||
int ierr = DKS_ERROR;
|
|
||||||
if (apiOpenCL()) {
|
|
||||||
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
|
|
||||||
//TODO: only enable if AMD libraries are available
|
|
||||||
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
|
|
||||||
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
|
|
||||||
} else if (apiCuda()) {
|
|
||||||
ierr = CUDA_SAFECALL( DKS_SUCCESS );
|
|
||||||
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
|
|
||||||
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
ierr = MIC_SAFECALL( DKS_SUCCESS );
|
|
||||||
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
|
|
||||||
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
|
|
||||||
} else {
|
|
||||||
ierr = DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::initDevice() {
|
|
||||||
int ierr = setupDevice();
|
|
||||||
if (ierr == DKS_SUCCESS)
|
|
||||||
ierr = setupOPAL();
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
|
||||||
hz_m0, hz_m1, hz_m2, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
|
||||||
int I, int J, int K, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
|
||||||
int numparticles, int numparams,
|
|
||||||
int &numaddback, int &numdead,
|
|
||||||
bool enableRutherforScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
|
||||||
bool enableRutherforScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles)
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
|
|
||||||
rx_ptr, ry_ptr, rz_ptr,
|
|
||||||
px_ptr, py_ptr, pz_ptr,
|
|
||||||
par_ptr, numparticles);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles, int &numaddback)
|
|
||||||
{
|
|
||||||
|
|
||||||
return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
|
|
||||||
rx_ptr, ry_ptr, rz_ptr,
|
|
||||||
px_ptr, py_ptr, pz_ptr,
|
|
||||||
par_ptr, numparticles, numaddback));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
|
||||||
void *dt_ptr, double dt, double c,
|
|
||||||
bool usedt, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
|
|
||||||
int npart, double c, int streamId) {
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge, double mass,
|
|
||||||
int npart, double c, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
|
|
||||||
charge, mass, npart, c, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
|
||||||
int npart, int nsec, void *dt_ptr, double dt,
|
|
||||||
double c, bool usedt, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
|
|
||||||
npart, nsec, dt_ptr, dt, c, usedt, streamId);
|
|
||||||
|
|
||||||
}
|
|
175
src/DKSOPAL.h
175
src/DKSOPAL.h
@ -1,175 +0,0 @@
|
|||||||
#ifndef H_DKS_OPAL
|
|
||||||
#define H_DKS_OPAL
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include "AutoTuning/DKSAutoTuning.h"
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
#include "DKSDefinitions.h"
|
|
||||||
|
|
||||||
#include "Algorithms/GreensFunction.h"
|
|
||||||
#include "Algorithms/CollimatorPhysics.h"
|
|
||||||
#include "Algorithms/FFT.h"
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#include "OpenCL/OpenCLFFT.h"
|
|
||||||
#include "OpenCL/OpenCLGreensFunction.h"
|
|
||||||
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
|
||||||
#include "CUDA/CudaFFT.cuh"
|
|
||||||
#include "CUDA/CudaGreensFunction.cuh"
|
|
||||||
#include "CUDA/CudaCollimatorPhysics.cuh"
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
#include "MIC/MICFFT.h"
|
|
||||||
#include "MIC/MICGreensFunction.hpp"
|
|
||||||
#include "MIC/MICCollimatorPhysics.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
|
||||||
* API to handle OPAL calls to DKS library.
|
|
||||||
* Gives access to DKSCollimatorPhysics, GreensFunction and DKSFFT, as well as all the DKSBase
|
|
||||||
* functions.
|
|
||||||
*/
|
|
||||||
class DKSOPAL : public DKSFFT {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
DKSCollimatorPhysics *dkscol;
|
|
||||||
GreensFunction *dksgreens;
|
|
||||||
|
|
||||||
int setupOPAL();
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
DKSOPAL();
|
|
||||||
|
|
||||||
DKSOPAL(const char* api_name, const char* device_name);
|
|
||||||
|
|
||||||
~DKSOPAL();
|
|
||||||
|
|
||||||
int initDevice();
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
///////Function library part of dksbase////////
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
|
||||||
int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Element by element multiplication.
|
|
||||||
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
|
|
||||||
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
|
||||||
int numparticles, int numparams,
|
|
||||||
int &numaddback, int &numdead,
|
|
||||||
bool enableRutherfordScattering = true);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
|
||||||
bool enableRutherfordScattering = true);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles, int &numaddback);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
|
||||||
void *dt_ptr, double dt, double c,
|
|
||||||
bool usedt = false, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
|
||||||
int npart, int nsec, void *dt_ptr,
|
|
||||||
double dt, double c, bool usedt = false,
|
|
||||||
int streamId = -1);
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
|
|
||||||
int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,22 +1,19 @@
|
|||||||
SET (_SRCS MICBase.cpp MICFFT.cpp)
|
SET (_SRCS
|
||||||
SET (_HDRS MICBase.h MICFFT.h)
|
MICBase.cpp
|
||||||
|
MICChiSquare.cpp
|
||||||
|
MICFFT.cpp
|
||||||
|
MICGreensFunction.cpp
|
||||||
|
MICCollimatorPhysics.cpp
|
||||||
|
)
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
SET (_HDRS
|
||||||
SET (_SRCS
|
MICBase.h
|
||||||
${_SRCS}
|
MICChiSquare.h
|
||||||
MICChiSquare.cpp
|
MICFFT.h
|
||||||
MICGreensFunction.cpp
|
MICCollimatorPhysics.h
|
||||||
MICCollimatorPhysics.cpp
|
MICGreensFunction.hpp
|
||||||
)
|
MICMergeSort.h
|
||||||
|
)
|
||||||
SET (_HDRS
|
|
||||||
${_HDRS}
|
|
||||||
MICChiSquare.h
|
|
||||||
MICCollimatorPhysics.h
|
|
||||||
MICGreensFunction.hpp
|
|
||||||
MICMergeSort.h
|
|
||||||
)
|
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
#INCLUDE_DIRECTORIES (
|
#INCLUDE_DIRECTORIES (
|
||||||
# ${CMAKE_CURRENT_SOURCE_DIR}
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
|
@ -18,28 +18,30 @@ int MICBase::mic_createRandStreams(int size) {
|
|||||||
|
|
||||||
int seed = time(NULL);
|
int seed = time(NULL);
|
||||||
|
|
||||||
int numThreads = 0;
|
#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
|
||||||
#pragma offload target(mic:m_device_id) inout(numThreads)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//get the number of threads
|
||||||
|
int numThreads;
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
numThreads = omp_get_num_threads();
|
numThreads = omp_get_num_threads();
|
||||||
}
|
|
||||||
|
|
||||||
defaultRndStream = mic_allocateMemory<VSLStreamStatePtr>(numThreads);
|
//if default rnd stream already allocated delete the array
|
||||||
VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
|
if (defaultRndSet == 1)
|
||||||
maxThreads = numThreads;
|
delete[] defaultRndStream;
|
||||||
|
|
||||||
#pragma offload target(mic:m_device_id) \
|
//allocate defaultRndStream array
|
||||||
in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN) \
|
defaultRndStream = new VSLStreamStatePtr[numThreads];
|
||||||
in(seed)
|
|
||||||
{
|
|
||||||
//create stream states for each thread
|
//create stream states for each thread
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < omp_get_num_threads(); i++)
|
for (int i = 0; i < omp_get_num_threads(); i++)
|
||||||
vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
|
vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
|
||||||
}
|
|
||||||
|
|
||||||
defaultRndSet = 1;
|
defaultRndSet = 1;
|
||||||
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -47,8 +49,15 @@ int MICBase::mic_createRandStreams(int size) {
|
|||||||
//delete default rand streams
|
//delete default rand streams
|
||||||
int MICBase::mic_deleteRandStreams() {
|
int MICBase::mic_deleteRandStreams() {
|
||||||
|
|
||||||
//mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
|
#pragma offload target(mic:m_device_id) inout(defaultRndSet)
|
||||||
return DKS_SUCCESS;
|
{
|
||||||
|
if (defaultRndSet == 1) {
|
||||||
|
delete[] defaultRndStream;
|
||||||
|
defaultRndSet = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
//create a new signal for the mic
|
//create a new signal for the mic
|
||||||
|
@ -26,82 +26,72 @@
|
|||||||
|
|
||||||
#define MIC_WIDTH 128
|
#define MIC_WIDTH 128
|
||||||
|
|
||||||
/** MIC Base class handles device setup and basic communication with the device.
|
|
||||||
* Handles device setup, memory management and data transfers.
|
|
||||||
*/
|
|
||||||
class MICBase {
|
class MICBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<int> micStreams;
|
std::vector<int> micStreams;
|
||||||
int maxThreads;
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
|
|
||||||
int defaultRndSet;
|
int defaultRndSet;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
VSLStreamStatePtr *defaultRndStream;
|
||||||
//#pragma offload_attribute(push,target(mic))
|
|
||||||
void *defaultRndStream; //VSLSStreamStatePtr
|
|
||||||
void *testPtr;
|
|
||||||
|
|
||||||
//#pragma offload_attribute(pop)
|
|
||||||
|
|
||||||
int m_device_id;
|
int m_device_id;
|
||||||
|
|
||||||
/** constructor */
|
/* constructor */
|
||||||
MICBase();
|
MICBase();
|
||||||
|
|
||||||
/** destructor */
|
/* destructor */
|
||||||
~MICBase();
|
~MICBase();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Create MKL rand streams for each thread
|
Info: create MKL rand streams for each thread
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int mic_createRandStreams(int size);
|
int mic_createRandStreams(int size);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Delete MKL rand streams
|
Info: delete MKL rand streams
|
||||||
* Return: succes or error code
|
Return: succes or error code
|
||||||
*/
|
*/
|
||||||
int mic_deleteRandStreams();
|
int mic_deleteRandStreams();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Create a new signal for the mic.
|
Info: create a new signal for the mic
|
||||||
* Signals can be used for asynchronous data transfers.
|
Return: success or error code
|
||||||
* Return: success or error code
|
*/
|
||||||
*/
|
|
||||||
int mic_createStream(int & streamId);
|
int mic_createStream(int & streamId);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Info: get the signal from the vector.
|
Info: get the signal from the vector
|
||||||
* Return: mic signal
|
Return: mic signal
|
||||||
*/
|
*/
|
||||||
int& mic_getStream(int id);
|
int& mic_getStream(int id);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Info: delete streams.
|
Info: delete streams
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int mic_deleteStreams();
|
int mic_deleteStreams();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Info: set device id.
|
Info: set device id
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int mic_setDeviceId(int id);
|
int mic_setDeviceId(int id);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Info: get mic devices.
|
Info: get mic devices
|
||||||
* Prints information about mic devices.
|
Return: success or error code
|
||||||
* Return: success or error code
|
*/
|
||||||
*/
|
|
||||||
int mic_getDevices();
|
int mic_getDevices();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Allocate memory on MIC device.
|
Info: allocate memory on MIC device
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void * mic_allocateMemory(int size) {
|
void * mic_allocateMemory(int size) {
|
||||||
|
|
||||||
@ -114,10 +104,10 @@ public:
|
|||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Transfer data to device.
|
Info: transfer data to device
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
|
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
|
||||||
T* tmp_ptr = (T*)data_ptr;
|
T* tmp_ptr = (T*)data_ptr;
|
||||||
@ -128,10 +118,10 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Write data to device, non-blocking.
|
Info: write data to device, non-blocking
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
|
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
|
||||||
{
|
{
|
||||||
@ -144,10 +134,10 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Read data from device
|
Info: read data from device
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
|
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
|
||||||
T* tmp_ptr = (T*)data_ptr;
|
T* tmp_ptr = (T*)data_ptr;
|
||||||
@ -159,10 +149,10 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Read data from device waiting for signal
|
Info: read data from device waiting for signal
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_readDataAsync(const void * data_ptr, void * result, int size,
|
int mic_readDataAsync(const void * data_ptr, void * result, int size,
|
||||||
int streamId = -1, int offset = 0) {
|
int streamId = -1, int offset = 0) {
|
||||||
@ -177,10 +167,10 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Wait till all the signals are complete
|
Info: wait till all the signals are complete
|
||||||
* Return siccess or error code
|
Return siccess or error code
|
||||||
*/
|
*/
|
||||||
int mic_syncDevice() {
|
int mic_syncDevice() {
|
||||||
|
|
||||||
//empty offload to wait for all the signals to finish and launch a new empy signal
|
//empty offload to wait for all the signals to finish and launch a new empy signal
|
||||||
@ -198,10 +188,10 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Free memory on device
|
Info: free memory on device
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_freeMemory(void * data_ptr, int size) {
|
int mic_freeMemory(void * data_ptr, int size) {
|
||||||
|
|
||||||
@ -212,13 +202,14 @@ public:
|
|||||||
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
|
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Allocate memory and write data to device
|
Info: allocate memory and write data to device
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void * mic_pushData(const void * data, int size) {
|
void * mic_pushData(const void * data, int size) {
|
||||||
T* tmp_ptr = new T[size];
|
T* tmp_ptr = new T[size];
|
||||||
@ -232,10 +223,10 @@ public:
|
|||||||
return tmp_ptr;
|
return tmp_ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Read data and free memory on device
|
Info: read data and free memory on device
|
||||||
* Return: success or erro code
|
Return: success or erro code
|
||||||
*/
|
*/
|
||||||
template<typename T>
|
template<typename T>
|
||||||
int mic_pullData(void * data_ptr, void * result, int size) {
|
int mic_pullData(void * data_ptr, void * result, int size) {
|
||||||
T* tmp_ptr = (T*)data_ptr;
|
T* tmp_ptr = (T*)data_ptr;
|
||||||
|
@ -14,9 +14,6 @@
|
|||||||
#include <offload.h>
|
#include <offload.h>
|
||||||
#include "MICBase.h"
|
#include "MICBase.h"
|
||||||
|
|
||||||
/** Deprecated, OpenMP + offload to Xeon Phi implementation of ChiSquare for MIC devices.
|
|
||||||
* Not complete and untested because of the poor performance of first MIC devices.
|
|
||||||
*/
|
|
||||||
class MICChiSquare {
|
class MICChiSquare {
|
||||||
|
|
||||||
MICBase *m_micbase;
|
MICBase *m_micbase;
|
||||||
|
@ -22,34 +22,22 @@
|
|||||||
#define I_M 10
|
#define I_M 10
|
||||||
#define DT_M 11
|
#define DT_M 11
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function for calculating dot product.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
double dot(mic_double3 d1, mic_double3 d2) {
|
double dot(mic_double3 d1, mic_double3 d2) {
|
||||||
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function for calculating dot product.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
double dot(double dx, double dy, double dz) {
|
double dot(double dx, double dy, double dz) {
|
||||||
return (dx * dx + dy * dy + dz * dz);
|
return (dx * dx + dy * dy + dz * dz);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to check if particle is still in material.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
bool checkHit(double &z, double *par) {
|
bool checkHit(double &z, double *par) {
|
||||||
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
|
return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to calculate arbitrary rotation.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
void Rot(double &px, double &pz, double &x, double &z, double xplane,
|
void Rot(double &px, double &pz, double &x, double &z, double xplane,
|
||||||
double normP, double thetacou, double deltas, int coord)
|
double normP, double thetacou, double deltas, int coord)
|
||||||
@ -82,14 +70,6 @@ void Rot(double &px, double &pz, double &x, double &z, double xplane,
|
|||||||
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to calculate Coulomb scattering for one particle.
|
|
||||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
|
||||||
* Uses AoS to store particle positions and momentum, parallelized using OpenMP.
|
|
||||||
* For details on the algorithm see OPAL user guide.
|
|
||||||
* Deprecated in favor of SoA data layout.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
|
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
|
||||||
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
||||||
@ -156,19 +136,11 @@ void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to calculate Coulomb scattering for one particle.
|
|
||||||
* Including Multiple Coulomb Scattering and large angle Rutherford Scattering.
|
|
||||||
* Uses SoA to store particle positions and momentum, parallelized using OpenMP.
|
|
||||||
* For details on the algorithm see OPAL user guide.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
void coulombScat(double *rx, double *ry, double *rz,
|
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
|
||||||
double *px, double *py, double *pz, int *label,
|
|
||||||
double *par, VSLStreamStatePtr &stream, int ii, int size)
|
double *par, VSLStreamStatePtr &stream, int ii, int size)
|
||||||
{
|
{
|
||||||
|
|
||||||
//arrays for temporary storage, each core processes MIC_WIDTH particles
|
|
||||||
double normP[MIC_WIDTH] __attribute__((aligned(64)));
|
double normP[MIC_WIDTH] __attribute__((aligned(64)));
|
||||||
double deltas[MIC_WIDTH] __attribute__((aligned(64)));
|
double deltas[MIC_WIDTH] __attribute__((aligned(64)));
|
||||||
double theta0[MIC_WIDTH] __attribute__((aligned(64)));
|
double theta0[MIC_WIDTH] __attribute__((aligned(64)));
|
||||||
@ -180,7 +152,6 @@ void coulombScat(double *rx, double *ry, double *rz,
|
|||||||
double z2[MIC_WIDTH] __attribute__((aligned(64)));
|
double z2[MIC_WIDTH] __attribute__((aligned(64)));
|
||||||
double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
|
double thetacou[MIC_WIDTH] __attribute__((aligned(64)));
|
||||||
|
|
||||||
//simd instruction tells the compiler it's safe to vectorize the loop
|
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (int i = ii; i < ii + MIC_WIDTH; i++) {
|
for (int i = ii; i < ii + MIC_WIDTH; i++) {
|
||||||
@ -220,7 +191,6 @@ void coulombScat(double *rx, double *ry, double *rz,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//vectorize the loop
|
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (int i = ii; i < ii + size; i++) {
|
for (int i = ii; i < ii + size; i++) {
|
||||||
@ -232,6 +202,7 @@ void coulombScat(double *rx, double *ry, double *rz,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//generate array of random numbers
|
//generate array of random numbers
|
||||||
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
|
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
|
||||||
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
|
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
|
||||||
@ -310,11 +281,6 @@ void coulombScat(double *rx, double *ry, double *rz,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to calculate energyLoss for one particle.
|
|
||||||
* Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
|
|
||||||
* algorithm are available in the OPAL user guide.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
|
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
|
||||||
|
|
||||||
@ -326,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
|
|||||||
|
|
||||||
const double deltas = par[DT_M] * beta * C;
|
const double deltas = par[DT_M] * beta * C;
|
||||||
const double deltasrho = deltas * 100 * par[RHO_M];
|
const double deltasrho = deltas * 100 * par[RHO_M];
|
||||||
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
|
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
|
||||||
|
|
||||||
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
||||||
const double Ts = (Eng * 1E6) / 1.0073;
|
const double Ts = (Eng * 1E6) / 1.0073;
|
||||||
@ -362,11 +328,6 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
|
|||||||
pdead = 1;
|
pdead = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* MIC device function to calculate energyLoss for one particle.
|
|
||||||
* Energy loss is calculated using the Bethe-Bloch equation. More details on the EnergyLoss
|
|
||||||
* algorithm are available in the OPAL user guide.
|
|
||||||
*/
|
|
||||||
__declspec(target(mic))
|
__declspec(target(mic))
|
||||||
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
||||||
|
|
||||||
@ -377,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
|||||||
|
|
||||||
const double deltas = par[DT_M] * beta * C;
|
const double deltas = par[DT_M] * beta * C;
|
||||||
const double deltasrho = deltas * 100 * par[RHO_M];
|
const double deltasrho = deltas * 100 * par[RHO_M];
|
||||||
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
|
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
|
||||||
|
|
||||||
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
||||||
const double Ts = (Eng * 1E6) / 1.0073;
|
const double Ts = (Eng * 1E6) / 1.0073;
|
||||||
@ -407,29 +368,26 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
bool enableRutherforScattering)
|
int numparticles, boll enableRutherfordScattering)
|
||||||
{
|
{
|
||||||
|
|
||||||
//cast device memory pointers to appropriate types
|
//cast device memory pointers to appropriate types
|
||||||
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
||||||
double *par = (double*) par_ptr;
|
double *par = (double*) par_ptr;
|
||||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
|
||||||
|
|
||||||
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
|
|
||||||
the memory allocation and data transfer need to be handled before */
|
|
||||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||||
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
|
|
||||||
in(numparticles)
|
in(numparticles)
|
||||||
{
|
{
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
|
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||||
|
|
||||||
//for loop trough particles if not checkhit set label to -2 and update R.x
|
//for loop trough particles if not checkhit set label to -2 and update R.x
|
||||||
|
|
||||||
#pragma omp for simd
|
#pragma omp for simd
|
||||||
for (int i = 0; i < numparticles; i++) {
|
for (int i = 0; i < numparticles; i++) {
|
||||||
if ( !checkHit(data[i].Rincol.z, par) ) {
|
if ( !checkHit(data[i].Rincol.z, par) ) {
|
||||||
@ -489,7 +447,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
//cast device memory pointers to appropriate types
|
|
||||||
int *label = (int*)label_ptr;
|
int *label = (int*)label_ptr;
|
||||||
unsigned *localID = (unsigned*)localID_ptr;
|
unsigned *localID = (unsigned*)localID_ptr;
|
||||||
double *rx = (double*)rx_ptr;
|
double *rx = (double*)rx_ptr;
|
||||||
@ -503,10 +461,6 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
int padding = numparticles % MIC_WIDTH;
|
int padding = numparticles % MIC_WIDTH;
|
||||||
int totalpart = numparticles + padding;
|
int totalpart = numparticles + padding;
|
||||||
|
|
||||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
|
||||||
|
|
||||||
/* offload the computation to the MIC, reuses the memory already allocated on the mic.
|
|
||||||
the memory allocation and data transfer need to be handled before */
|
|
||||||
#pragma offload target (mic:0) \
|
#pragma offload target (mic:0) \
|
||||||
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
@ -517,16 +471,14 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
|
|
||||||
in(totalpart)
|
in(totalpart)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
//every thread gets its own rnd stream state
|
//every thread gets its own rnd stream state
|
||||||
//VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||||
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
|
|
||||||
|
|
||||||
#pragma omp for nowait
|
#pragma omp for nowait
|
||||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||||
@ -562,11 +514,9 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
double Eng = (sq - 1) * M_P;
|
double Eng = (sq - 1) * M_P;
|
||||||
double dEdx = 0;
|
double dEdx = 0;
|
||||||
|
|
||||||
|
|
||||||
if (label[i] == 0) {
|
if (label[i] == 0) {
|
||||||
energyLoss(Eng, dEdx, par, randv, i - ii);
|
energyLoss(Eng, dEdx, par, randv, i - ii);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (Eng > 1e-4 && dEdx < 0) {
|
if (Eng > 1e-4 && dEdx < 0) {
|
||||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||||
@ -578,12 +528,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
|
|
||||||
if (Eng < 1e-4 || dEdx > 0)
|
if (Eng < 1e-4 || dEdx > 0)
|
||||||
label[i] = -1;
|
label[i] = -1;
|
||||||
|
|
||||||
} //end inner energy loss loop
|
} //end inner energy loss loop
|
||||||
|
|
||||||
} //end outer energy loss loop
|
|
||||||
|
|
||||||
|
} //end outer energy loss loop
|
||||||
|
|
||||||
//vectorize coulomb scattering as much as possible
|
//vectorize coulomb scattering as much as possible
|
||||||
#pragma omp for nowait
|
#pragma omp for nowait
|
||||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||||
@ -593,7 +542,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
} //end omp parallel
|
} //end omp parallel
|
||||||
|
|
||||||
} //end offload
|
} //end offload
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,13 +26,7 @@ typedef struct {
|
|||||||
} MIC_PART_SMALL;
|
} MIC_PART_SMALL;
|
||||||
|
|
||||||
|
|
||||||
/**
|
class MICCollimatorPhysics : DKSAlogorithms{
|
||||||
* MICCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
|
||||||
* Implementes OPALs collimator physics class for particle matter interactions using OpenMP
|
|
||||||
* and offload mode targetomg Intel Xeon Phi processors.
|
|
||||||
* For detailed documentation on CollimatorPhysics functions see OPAL documentation.
|
|
||||||
*/
|
|
||||||
class MICCollimatorPhysics : public DKSCollimatorPhysics {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -44,10 +38,10 @@ public:
|
|||||||
m_micbase = base;
|
m_micbase = base;
|
||||||
};
|
};
|
||||||
|
|
||||||
~MICCollimatorPhysics() { };
|
~MICCollimatorPhysics() { };
|
||||||
|
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
||||||
bool enableRutherforScattering = true);
|
bool enableRutherfordScattering = true);
|
||||||
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
@ -6,16 +6,13 @@
|
|||||||
|
|
||||||
MICFFT::MICFFT(MICBase *base) {
|
MICFFT::MICFFT(MICBase *base) {
|
||||||
m_micbase = base;
|
m_micbase = base;
|
||||||
m_fftsetup = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
MICFFT::~MICFFT() {
|
MICFFT::~MICFFT() {
|
||||||
if (m_fftsetup) {
|
|
||||||
#pragma offload target(mic:0)
|
#pragma offload target(mic:0)
|
||||||
{
|
{
|
||||||
DftiFreeDescriptor(&FFTHandle_m);
|
DftiFreeDescriptor(&FFTHandle_m);
|
||||||
DftiFreeDescriptor(&handle);
|
DftiFreeDescriptor(&handle);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -38,7 +35,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m_fftsetup = true;
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
//BENI:
|
//BENI:
|
||||||
@ -125,8 +122,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
|
|||||||
}
|
}
|
||||||
|
|
||||||
//execute iFFT
|
//execute iFFT
|
||||||
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
|
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
|
||||||
return executeFFT(mem_ptr, ndim, N, -1, false);
|
return mic_executeFFT(mem_ptr, ndim, N, -1, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
//execute REAL->COMPLEX FFT
|
//execute REAL->COMPLEX FFT
|
||||||
|
@ -7,18 +7,13 @@
|
|||||||
#include <offload.h>
|
#include <offload.h>
|
||||||
#include <mkl_dfti.h>
|
#include <mkl_dfti.h>
|
||||||
|
|
||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithm/DKSFFT.h"
|
||||||
#include "MICBase.h"
|
#include "MICBase.h"
|
||||||
|
|
||||||
/**
|
class MICFFT : public DKSFFT {
|
||||||
* MIC FFT based on BaseFFT interface.
|
|
||||||
* uses MKL library to offload FFT on Intel Xeon Phi devices.
|
|
||||||
*/
|
|
||||||
class MICFFT : public BaseFFT {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
bool m_fftsetup;
|
|
||||||
MICBase *m_micbase;
|
MICBase *m_micbase;
|
||||||
|
|
||||||
/// Internal FFT object for performing serial FFTs.
|
/// Internal FFT object for performing serial FFTs.
|
||||||
@ -79,18 +74,6 @@ public:
|
|||||||
/* normalize IFFT on MIC */
|
/* normalize IFFT on MIC */
|
||||||
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
/**
|
|
||||||
* Info: destroy default FFT plans
|
|
||||||
* Return: success or error code
|
|
||||||
*/
|
|
||||||
int destroyFFT() { return DKS_SUCCESS; }
|
|
||||||
|
|
||||||
/*
|
|
||||||
Info: execute normalize for complex to real iFFT
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId)
|
double hr_m1, double hr_m2)
|
||||||
{
|
{
|
||||||
|
|
||||||
double *tmp_ptr = (double*) tmpgreen;
|
double *tmp_ptr = (double*) tmp_ptr_;
|
||||||
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
|
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
|
||||||
{
|
{
|
||||||
std::memset(tmp_ptr,0,I*J*K);
|
std::memset(tmp_ptr,0,I*J*K);
|
||||||
@ -173,14 +173,12 @@ return 0;
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
//CUDA similar version:
|
//CUDA similar version:
|
||||||
int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
|
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
|
||||||
int streamId)
|
double *tmpgreen = (double*) tmp_ptr_;
|
||||||
{
|
double *mem_ptr = (double*) mem_ptr_;
|
||||||
double *tmpgreen_ptr = (double*) tmpgreen;
|
|
||||||
double *mem_ptr = (double*) rho2_m;
|
|
||||||
|
|
||||||
// the actual integration
|
// the actual integration
|
||||||
#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||||
{
|
{
|
||||||
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
||||||
std::memset(mem_ptr,0,II*JJ*KK);
|
std::memset(mem_ptr,0,II*JJ*KK);
|
||||||
@ -199,27 +197,27 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
|
|||||||
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||||
tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp)
|
if (i+1 < NI_tmp)
|
||||||
tmp1 = tmpgreen_ptr[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (j+1 < NJ_tmp)
|
if (j+1 < NJ_tmp)
|
||||||
tmp2 = tmpgreen_ptr[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (k+1 < NK_tmp)
|
if (k+1 < NK_tmp)
|
||||||
tmp3 = tmpgreen_ptr[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
||||||
tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
||||||
tmp5 = tmpgreen_ptr[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||||
tmp6 = tmpgreen_ptr[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen_ptr[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
@ -236,8 +234,8 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
|
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
|
||||||
double *mem_ptr = (double*) rho2_m;
|
double *mem_ptr = (double*) mem_ptr_;
|
||||||
|
|
||||||
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||||
{
|
{
|
||||||
@ -283,11 +281,11 @@ int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int str
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*multiply complex fields*/
|
/*multiply complex fields*/
|
||||||
int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
|
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
|
||||||
// double *mem_ptr1 = (double*) mem_ptr1_;
|
// double *mem_ptr1 = (double*) mem_ptr1_;
|
||||||
// double *mem_ptr2 = (double*) mem_ptr2_;
|
// double *mem_ptr2 = (double*) mem_ptr2_;
|
||||||
_Complex double *mem_ptr1 = (_Complex double *) ptr1;
|
_Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
|
||||||
_Complex double *mem_ptr2 = (_Complex double *) ptr2;
|
_Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
|
||||||
|
|
||||||
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
||||||
{
|
{
|
||||||
|
@ -9,14 +9,12 @@
|
|||||||
#include <offload.h>
|
#include <offload.h>
|
||||||
#include <mkl_dfti.h>
|
#include <mkl_dfti.h>
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "MICBase.h"
|
#include "MICBase.h"
|
||||||
|
|
||||||
#define DKS_SUCCESS 0
|
#define DKS_SUCCESS 0
|
||||||
#define DKS_ERROR 1
|
#define DKS_ERROR 1
|
||||||
|
|
||||||
/** OpenMP offload implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
class MICGreensFunction {
|
||||||
class MICGreensFunction : public GreensFunction {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
MICBase *m_micbase;
|
MICBase *m_micbase;
|
||||||
@ -30,18 +28,16 @@ public:
|
|||||||
~MICGreensFunction();
|
~MICGreensFunction();
|
||||||
|
|
||||||
/* compute greens integral analytically */
|
/* compute greens integral analytically */
|
||||||
int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
|
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
|
|
||||||
|
|
||||||
/* perform the actual integration */
|
/* perform the actual integration */
|
||||||
int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K,
|
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
|
||||||
int stremaId = -1);
|
|
||||||
|
|
||||||
/* Mirror rho-Field */
|
/* Mirror rho-Field */
|
||||||
int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
|
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
|
||||||
|
|
||||||
/*multiply complex fields*/
|
/*multiply complex fields*/
|
||||||
int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
|
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -71,10 +71,6 @@ int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
|
|||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Merge sort implementation for intel MIC.
|
|
||||||
* Paralellized over all the MIC cores using OpenMP tasks.
|
|
||||||
*/
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
||||||
|
|
||||||
@ -88,9 +84,6 @@ void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Quicksort algorithm, developed for use on Intel MIC devices.
|
|
||||||
*/
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
||||||
|
|
||||||
@ -107,10 +100,6 @@ void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Insertion sort of @p list, developed for use on Intel MIC.
|
|
||||||
* Used by quick_sort to sort small lists.
|
|
||||||
*/
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {
|
||||||
|
|
||||||
|
@ -1,53 +1,31 @@
|
|||||||
#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
|
SET (_SRCS
|
||||||
|
OpenCLBase.cpp
|
||||||
SET (_HDRS OpenCLBase.h)
|
OpenCLFFT.cpp
|
||||||
SET (_SRCS OpenCLBase.cpp)
|
OpenCLChiSquare.cpp
|
||||||
SET (_KERNELS "")
|
OpenCLCollimatorPhysics.cpp
|
||||||
|
OpenCLChiSquareRuntime.cpp
|
||||||
IF (ENABLE_AMD)
|
|
||||||
SET (_SRCS
|
|
||||||
${_SRCS}
|
|
||||||
OpenCLFFT.cpp
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (_HDRS
|
|
||||||
${_HDRS}
|
|
||||||
OpenCLFFT.h
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (_KERNELS
|
|
||||||
${_KERNELS}
|
|
||||||
OpenCLKernels/OpenCLFFT.cl
|
|
||||||
OpenCLKernels/OpenCLFFTStockham.cl
|
|
||||||
OpenCLKernels/OpenCLTranspose.cl
|
|
||||||
)
|
)
|
||||||
ENDIF (ENABLE_AMD)
|
|
||||||
|
|
||||||
IF (ENABLE_MUSR)
|
SET (_HDRS
|
||||||
SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
|
OpenCLBase.h
|
||||||
SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
|
OpenCLFFT.h
|
||||||
SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
|
OpenCLChiSquare.h
|
||||||
ENDIF (ENABLE_MUSR)
|
OpenCLCollimatorPhysics.h
|
||||||
|
OpenCLChiSquareRuntime.h
|
||||||
IF (ENABLE_AMD AND ENABLE_OPAL)
|
)
|
||||||
SET (_SRCS
|
|
||||||
${_SRCS}
|
#INCLUDE_DIRECTORIES (
|
||||||
OpenCLCollimatorPhysics.cpp
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
OpenCLGreensFunction.cpp
|
#)
|
||||||
)
|
|
||||||
|
SET (_KERNELS
|
||||||
SET (_HDRS
|
OpenCLKernels/OpenCLChiSquare.cl
|
||||||
${_HDRS}
|
OpenCLKernels/OpenCLFFT.cl
|
||||||
OpenCLCollimatorPhysics.h
|
OpenCLKernels/OpenCLFFTStockham.cl
|
||||||
OpenCLGreensFunction.h
|
OpenCLKernels/OpenCLTranspose.cl
|
||||||
)
|
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
||||||
|
OpenCLKernels/OpenCLChiSquareRuntime.cl
|
||||||
SET (_KERNELS
|
|
||||||
${_KERNELS}
|
|
||||||
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
|
||||||
OpenCLKernels/OpenCLGreensFunction.cl
|
|
||||||
)
|
)
|
||||||
ENDIF (ENABLE_AMD AND ENABLE_OPAL)
|
|
||||||
|
|
||||||
ADD_SOURCES (${_SRCS})
|
ADD_SOURCES (${_SRCS})
|
||||||
ADD_HEADERS (${_HDRS})
|
ADD_HEADERS (${_HDRS})
|
||||||
|
@ -7,13 +7,21 @@ cl_device_id OpenCLBase::m_device_id = NULL;
|
|||||||
cl_event OpenCLBase::m_last_event = NULL;
|
cl_event OpenCLBase::m_last_event = NULL;
|
||||||
|
|
||||||
OpenCLBase::OpenCLBase() {
|
OpenCLBase::OpenCLBase() {
|
||||||
|
//m_context = NULL;
|
||||||
|
//m_command_queue = NULL;
|
||||||
m_program = NULL;
|
m_program = NULL;
|
||||||
m_kernel = NULL;
|
m_kernel = NULL;
|
||||||
|
//m_device_id = NULL;
|
||||||
|
//m_platform_id = NULL;
|
||||||
m_kernel_file = NULL;
|
m_kernel_file = NULL;
|
||||||
|
|
||||||
m_last_event = NULL;
|
m_last_event = NULL;
|
||||||
|
|
||||||
|
//m_events = new cl_event[500];
|
||||||
|
//m_num_events = 0;
|
||||||
|
|
||||||
defaultRndSet = 0;
|
defaultRndSet = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
OpenCLBase::~OpenCLBase() {
|
OpenCLBase::~OpenCLBase() {
|
||||||
@ -33,11 +41,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
|
|||||||
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
||||||
ocl_loadKernel(kernel_file);
|
ocl_loadKernel(kernel_file);
|
||||||
delete[] kernel_file;
|
delete[] kernel_file;
|
||||||
|
|
||||||
//allocate memory for rand states
|
//allocate memory for rand states
|
||||||
int ierr;
|
int ierr;
|
||||||
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
|
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
|
||||||
|
|
||||||
//exec kernel
|
//exec kernel
|
||||||
int seed = 0;
|
int seed = 0;
|
||||||
ocl_createKernel("initRand");
|
ocl_createKernel("initRand");
|
||||||
@ -47,34 +55,13 @@ int OpenCLBase::ocl_createRndStates(int size) {
|
|||||||
|
|
||||||
size_t work_items = size;
|
size_t work_items = size;
|
||||||
size_t work_group_size = 1;
|
size_t work_group_size = 1;
|
||||||
|
|
||||||
ocl_executeKernel(1, &work_items, &work_group_size);
|
ocl_executeKernel(1, &work_items, &work_group_size);
|
||||||
|
|
||||||
defaultRndSet = 1;
|
defaultRndSet = 1;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
|
return OCL_SUCCESS;
|
||||||
//load kernel
|
|
||||||
char * kernel_file = new char[500];
|
|
||||||
kernel_file[0] = '\0';
|
|
||||||
strcat(kernel_file, OPENCL_KERNELS);
|
|
||||||
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
|
||||||
ocl_loadKernel(kernel_file);
|
|
||||||
delete[] kernel_file;
|
|
||||||
|
|
||||||
//set kernel variables
|
|
||||||
cl_mem tmp_data = (cl_mem) mem_ptr;
|
|
||||||
|
|
||||||
ocl_createKernel("createRandoms");
|
|
||||||
ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
|
|
||||||
ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
|
|
||||||
ocl_setKernelArg(2, sizeof(int), &size);
|
|
||||||
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = (size % work_size + 1) * work_size;
|
|
||||||
ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* destroy rnd states */
|
/* destroy rnd states */
|
||||||
@ -83,7 +70,7 @@ int OpenCLBase::ocl_deleteRndStates() {
|
|||||||
ocl_freeMemory(defaultRndState);
|
ocl_freeMemory(defaultRndState);
|
||||||
defaultRndSet = 0;
|
defaultRndSet = 0;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -441,8 +428,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
//create program from kernel
|
//create program from kernel
|
||||||
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
|
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
|
||||||
NULL, &ierr);
|
|
||||||
if (ierr != CL_SUCCESS) {
|
if (ierr != CL_SUCCESS) {
|
||||||
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -452,7 +438,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
check if compiling kernel source succeded, if failed return error code
|
check if compileng kernel source succeded, if failed return error code
|
||||||
if in debug mode get compilation info and print program build log witch
|
if in debug mode get compilation info and print program build log witch
|
||||||
will give indication what made the compilation fail
|
will give indication what made the compilation fail
|
||||||
*/
|
*/
|
||||||
@ -461,8 +447,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
|
|
||||||
//get build status
|
//get build status
|
||||||
cl_build_status status;
|
cl_build_status status;
|
||||||
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
|
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
|
||||||
sizeof(cl_build_status), &status, NULL);
|
|
||||||
|
|
||||||
//get log size
|
//get log size
|
||||||
size_t log_size;
|
size_t log_size;
|
||||||
@ -628,12 +613,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ierr != DKS_SUCCESS) {
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Failed to build kernel file " << kernel_file);
|
DEBUG_MSG("Failed to build kernel file " << kernel_file);
|
||||||
return DKS_ERROR;
|
return OCL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
//compile kernel form source code provided
|
//compile kernel form source code provided
|
||||||
@ -675,14 +660,17 @@ cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) {
|
|||||||
/*
|
/*
|
||||||
write data specified by in_data to device memory, device memory space defined by cl_mem
|
write data specified by in_data to device memory, device memory space defined by cl_mem
|
||||||
*/
|
*/
|
||||||
int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size,
|
int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) {
|
||||||
size_t offset, int blocking)
|
|
||||||
{
|
|
||||||
|
|
||||||
cl_int ierr;
|
cl_int ierr;
|
||||||
|
|
||||||
|
|
||||||
|
//std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl;
|
||||||
|
ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event);
|
||||||
|
|
||||||
|
//m_events[m_num_events] = m_last_event;
|
||||||
|
m_events.push_back(m_last_event);
|
||||||
|
|
||||||
ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size,
|
|
||||||
in_data, 0, NULL, NULL);
|
|
||||||
|
|
||||||
if (ierr != CL_SUCCESS) {
|
if (ierr != CL_SUCCESS) {
|
||||||
DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
|
DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr);
|
||||||
@ -713,11 +701,6 @@ int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) {
|
|||||||
*/
|
*/
|
||||||
int OpenCLBase::ocl_createKernel(const char* kernel_name) {
|
int OpenCLBase::ocl_createKernel(const char* kernel_name) {
|
||||||
cl_int ierr;
|
cl_int ierr;
|
||||||
|
|
||||||
//release the old kernel
|
|
||||||
if (m_kernel != NULL)
|
|
||||||
clReleaseKernel(m_kernel);
|
|
||||||
//create a new kernel
|
|
||||||
m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
|
m_kernel = clCreateKernel(m_program, kernel_name, &ierr);
|
||||||
if (ierr != CL_SUCCESS) {
|
if (ierr != CL_SUCCESS) {
|
||||||
DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
|
DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr);
|
||||||
@ -745,20 +728,24 @@ int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) {
|
|||||||
optional: work_group_size - can specify how work items are divided in work groups,
|
optional: work_group_size - can specify how work items are divided in work groups,
|
||||||
if left NULL OpenCL implementation handles this part.
|
if left NULL OpenCL implementation handles this part.
|
||||||
*/
|
*/
|
||||||
int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items,
|
int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) {
|
||||||
const size_t *work_group_size)
|
cl_int ierr;
|
||||||
{
|
|
||||||
cl_int ierr;
|
cl_event tmp_event;
|
||||||
|
if (m_last_event == NULL) {
|
||||||
ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL,
|
ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size,
|
||||||
work_items, work_group_size,
|
0, NULL, &tmp_event);
|
||||||
0, NULL, NULL);
|
} else {
|
||||||
|
ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size,
|
||||||
|
1, &m_last_event, &tmp_event);
|
||||||
|
}
|
||||||
|
|
||||||
if (ierr != CL_SUCCESS)
|
if (ierr != CL_SUCCESS)
|
||||||
DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr
|
DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
|
||||||
<< " work items: " << *work_items << ", "
|
|
||||||
<< " work group: " << *work_group_size);
|
m_last_event = tmp_event;
|
||||||
|
m_events.push_back(m_last_event);
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -766,13 +753,12 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items,
|
|||||||
read data from device, mem_ptr points to data on device out_data points to memory in host
|
read data from device, mem_ptr points to data on device out_data points to memory in host
|
||||||
blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
|
blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE)
|
||||||
*/
|
*/
|
||||||
int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size,
|
int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) {
|
||||||
size_t offset, int blocking)
|
|
||||||
{
|
|
||||||
cl_int ierr;
|
cl_int ierr;
|
||||||
|
|
||||||
|
ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event);
|
||||||
|
|
||||||
ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size,
|
m_events.push_back(m_last_event);
|
||||||
out_data, 0, NULL, NULL);
|
|
||||||
|
|
||||||
if (ierr != CL_SUCCESS)
|
if (ierr != CL_SUCCESS)
|
||||||
DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
|
DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr);
|
||||||
@ -936,27 +922,22 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
|
|||||||
if (ierr != DKS_SUCCESS)
|
if (ierr != DKS_SUCCESS)
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
/* get device properties */
|
//get device properties
|
||||||
//maximum number of work-items in a work group supported by device
|
|
||||||
size_t max_group_size;
|
size_t max_group_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
|
||||||
//maxumum local memory size per work group
|
|
||||||
cl_ulong local_mem_size;
|
cl_ulong local_mem_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
|
||||||
//get the supported extensions
|
|
||||||
size_t ext_size;
|
size_t ext_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
|
||||||
char *ext = new char[ext_size];
|
char *ext = new char[ext_size];
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
|
||||||
|
|
||||||
/* get kernel properties */
|
//get kernel properties
|
||||||
//get max work group size that can be used for this kernel
|
|
||||||
size_t kernel_group_size;
|
size_t kernel_group_size;
|
||||||
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
|
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
|
||||||
sizeof(size_t), &kernel_group_size, 0);
|
sizeof(size_t), &kernel_group_size, 0);
|
||||||
threadsPerBlock = kernel_group_size;
|
threadsPerBlock = kernel_group_size;
|
||||||
|
|
||||||
//get max local memory size that can be used for this kernel
|
|
||||||
cl_ulong kernel_local_mem;
|
cl_ulong kernel_local_mem;
|
||||||
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
|
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
|
||||||
sizeof(cl_ulong), &kernel_local_mem, 0);
|
sizeof(cl_ulong), &kernel_local_mem, 0);
|
||||||
@ -965,18 +946,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
|
|||||||
std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
|
std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Work group size: max for device " << max_group_size << " > "
|
std::cout << "Work groups: device limit " << max_group_size << ", "
|
||||||
<< "max for kernel " << kernel_group_size << " > "
|
<< "kernel limit " << kernel_group_size << ", "
|
||||||
<< "required " << work_group_size << std::endl;
|
<< "required " << work_group_size << std::endl;
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Local memory: device limit " << local_mem_size << std::endl;
|
std::cout << "Local memory: device limit " << local_mem_size << std::endl;
|
||||||
std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
std::cout << std::endl << "Available extensions: " << ext << std::endl;
|
std::cout << "Available extensions: " << ext << std::endl;
|
||||||
|
|
||||||
std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
|
std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,16 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
Name: OpenCLBase
|
||||||
|
|
||||||
|
Author: Uldis Locans
|
||||||
|
|
||||||
|
Info: OpenCL base class to handle all the common details associated
|
||||||
|
with kernel launch on OpenCL device
|
||||||
|
|
||||||
|
Date: 2014.09.18
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef H_OPENCL_BASE
|
#ifndef H_OPENCL_BASE
|
||||||
#define H_OPENCL_BASE
|
#define H_OPENCL_BASE
|
||||||
|
|
||||||
@ -17,10 +30,13 @@
|
|||||||
#include <CL/cl_ext.h>
|
#include <CL/cl_ext.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
/** struct for random number state. */
|
/* struct for random number state */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
||||||
double s10;
|
double s10;
|
||||||
double s11;
|
double s11;
|
||||||
double s12;
|
double s12;
|
||||||
@ -29,292 +45,250 @@ typedef struct {
|
|||||||
double s22;
|
double s22;
|
||||||
double z;
|
double z;
|
||||||
bool gen;
|
bool gen;
|
||||||
|
|
||||||
} RNDState;
|
} RNDState;
|
||||||
|
|
||||||
/**
|
|
||||||
* OpenCL base class to handle device setup and basic communication with the device.
|
|
||||||
* Handles initialization of OpenCL device, memory management, data transfer and kernel launch.
|
|
||||||
* The OpenCL kernels are located in separate files in OpenCLKernels folder, the OpenCLBase
|
|
||||||
* class contains methods to read the kernel files, compile the kernel codes and launch kernels
|
|
||||||
* from the compiled codes. Which kernel file needs to be loaded for the specific function is
|
|
||||||
* handled by the base class that is launching the kernel.
|
|
||||||
*/
|
|
||||||
class OpenCLBase {
|
class OpenCLBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
static cl_context m_context;
|
||||||
|
static cl_command_queue m_command_queue;
|
||||||
|
|
||||||
//variables containing OpenCL device and platform ids
|
|
||||||
static cl_platform_id m_platform_id;
|
static cl_platform_id m_platform_id;
|
||||||
static cl_device_id m_device_id;
|
static cl_device_id m_device_id;
|
||||||
|
|
||||||
//variables containing compiled OpenCL program and kernel
|
|
||||||
cl_context_properties m_context_properties[3];
|
cl_context_properties m_context_properties[3];
|
||||||
cl_program m_program;
|
cl_program m_program;
|
||||||
cl_kernel m_kernel;
|
cl_kernel m_kernel;
|
||||||
|
|
||||||
//variables for tracking OpenCL events
|
|
||||||
static cl_event m_last_event;
|
static cl_event m_last_event;
|
||||||
cl_int m_num_events;
|
cl_int m_num_events;
|
||||||
std::vector<cl_event> m_events;
|
std::vector<cl_event> m_events;
|
||||||
|
|
||||||
//currently loaded kernel file
|
|
||||||
char * m_kernel_file;
|
char * m_kernel_file;
|
||||||
|
|
||||||
//type of device used by OpenCL
|
|
||||||
cl_device_type m_device_type;
|
cl_device_type m_device_type;
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Get all available OpenCL platforms.
|
Name: getPlatforms
|
||||||
* Get all available platforms and save in m_platform_ids, save number of platforms
|
Info: get all available platforms and save in m_platform_ids, save number of platforms
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_getPlatforms();
|
int ocl_getPlatforms();
|
||||||
|
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Get first available OpenCL device of specified type.
|
Name: getDevice
|
||||||
* Get first available device and save device id and platform id for this device,
|
Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
|
||||||
* device name: (-gpu, -mic, -cpu)
|
Return: success or error code
|
||||||
* Return: success or error code
|
*/
|
||||||
*/
|
|
||||||
int ocl_getDevice(const char* device_name);
|
int ocl_getDevice(const char* device_name);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Get cl_device_type from the specified device name.
|
Name getDeviceType
|
||||||
* get device type from device name (-gpu, -cpu, -mic)
|
Info: get device type from device name (-gpu, -cpu, -mic)
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
|
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Create OpenCL context with specified device.
|
Name: createContext
|
||||||
* Return: success or error code
|
Info: create context with specified device
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_createContext();
|
int ocl_createContext();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Build program from specified kernel file.
|
Name: buildProgram
|
||||||
* Return: success or error code.
|
Info: build program from specified kernel file
|
||||||
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_buildProgram(const char* kernel_file);
|
int ocl_buildProgram(const char* kernel_file);
|
||||||
|
|
||||||
/**
|
/** Compile program from kernel source string
|
||||||
* Compile program from kernel source string.
|
*
|
||||||
* Takes a string read from OpenCL kernel file saved in kernel_source and compiles the
|
|
||||||
* OpenCL program, that can be then executed on the device.
|
|
||||||
* opts is a string specifying additional compiler flags.
|
|
||||||
*/
|
*/
|
||||||
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
|
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
//memory for random number states
|
|
||||||
int defaultRndSet;
|
int defaultRndSet;
|
||||||
cl_mem defaultRndState;
|
cl_mem defaultRndState;
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
//OpenCL context and command queue
|
|
||||||
static cl_context m_context;
|
|
||||||
static cl_command_queue m_command_queue;
|
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* constructor
|
constructor
|
||||||
*/
|
*/
|
||||||
OpenCLBase();
|
OpenCLBase();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* destructor
|
destructor
|
||||||
*/
|
*/
|
||||||
~OpenCLBase();
|
~OpenCLBase();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Allocate memory for size random number states and init the rnd states.
|
Create RND states
|
||||||
* Uses AMD clRng library for random numbers.
|
Return: success or error code
|
||||||
* This library is only compatible with AMD devices.
|
*/
|
||||||
*/
|
|
||||||
int ocl_createRndStates(int size);
|
int ocl_createRndStates(int size);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Create an array of random numbers on the device.
|
Destroy rnd states
|
||||||
* Fills the mem_ptr with random numbers.
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_createRandomNumbers(void *mem_ptr, int size);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Destroy rnd states and free device memory.
|
|
||||||
* Return: success or error code
|
|
||||||
*/
|
|
||||||
int ocl_deleteRndStates();
|
int ocl_deleteRndStates();
|
||||||
|
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Prints info about all the available platforms and devices.
|
Name: getAllDevices
|
||||||
* Can be used for information purposes to see what devices are available on the system.
|
Info: get all available devices
|
||||||
* Return: success or error code.
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_getAllDevices();
|
int ocl_getAllDevices();
|
||||||
|
|
||||||
/**
|
/** Get the OpenCL device count for the set type of device
|
||||||
* Get the OpenCL device count for the set type of device.
|
*
|
||||||
* Device count is set in ndev parameter, returns success or error code.
|
|
||||||
*/
|
*/
|
||||||
int ocl_getDeviceCount(int &ndev);
|
int ocl_getDeviceCount(int &ndev);
|
||||||
|
|
||||||
/**
|
/** Get the name of the device used
|
||||||
* Get the name of the device currently in use.
|
|
||||||
*/
|
*/
|
||||||
int ocl_getDeviceName(std::string &device_name);
|
int ocl_getDeviceName(std::string &device_name);
|
||||||
|
|
||||||
/**
|
/** Set the device to use for OpenCL kernels.
|
||||||
* Set the device to use for OpenCL kernels.
|
* device id to use is passed as integer.
|
||||||
* Device id to use is passed as integer.
|
|
||||||
*/
|
*/
|
||||||
int ocl_setDevice(int device);
|
int ocl_setDevice(int device);
|
||||||
|
|
||||||
/**
|
/** Get a list of all the unique devices of the same type that can run OpenCL kernels
|
||||||
* Get a list of all the unique devices of the same type that can run OpenCL kernels.
|
* Used when GPUs of different types might be present on the system.
|
||||||
* Used when GPUs of different types might be present on the system.
|
|
||||||
*/
|
*/
|
||||||
int ocl_getUniqueDevices(std::vector<int> &devices);
|
int ocl_getUniqueDevices(std::vector<int> &devices);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Initialize OpenCL connection with a device of specified type.
|
Name: setUp
|
||||||
* Find if specified device is available, creates a context and command queue.
|
Info: set up opencl resources
|
||||||
* Returns success or error code.
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_setUp(const char* device_name);
|
int ocl_setUp(const char* device_name);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Given a OpenCL kernel file name loads the content and compile the OpenCL code.
|
Name: loadKernel
|
||||||
* Load and compile opencl kernel file if it has changed.
|
Info: load and compile opencl kernel file if it has changed
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_loadKernel(const char* kernel_file);
|
int ocl_loadKernel(const char* kernel_file);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/** Build program from kernel source.
|
||||||
* Build program from kernel source.
|
|
||||||
* Builds a program from source code provided in kernel_source.
|
* Builds a program from source code provided in kernel_source.
|
||||||
* If compilation fails will return DKS_ERROR
|
* If compilation fails will return DKS_ERROR
|
||||||
*/
|
*/
|
||||||
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
|
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Allocate memory on the device.
|
Name: allocateMemory
|
||||||
* Return: return pointer to memory
|
Info: allocate memory on device
|
||||||
|
Return: return pointer to memory
|
||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Allocate memory of specific type on device.
|
Name: allocateMemory
|
||||||
* The available types are cl_mem_flags type listed in OpenCL documentation:
|
Info: allocate memory on device
|
||||||
* CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY, CL_MEM_USE_HOST_PTR,
|
Return: return pointer to memory
|
||||||
* CL_MEM_ALLOC_HOST_PTR and CL_MEM_COPY_HOST_PTR.
|
|
||||||
* Return: return pointer to memory
|
|
||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Zero OpenCL memory buffer.
|
Name: writeData
|
||||||
* Set size elements of the device array to value (pass 0 to zero the buffer).
|
Info: write data to device memory (needs ptr to mem object)
|
||||||
*/
|
Return: success or error code
|
||||||
template <typename T>
|
*/
|
||||||
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
|
|
||||||
|
|
||||||
cl_int ierr;
|
|
||||||
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
|
|
||||||
sizeof(T)*size, 0, nullptr, nullptr);
|
|
||||||
if (ierr != CL_SUCCESS)
|
|
||||||
return DKS_ERROR;
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write data to device memory (needs ptr to mem object)
|
|
||||||
* Return: success or error code
|
|
||||||
*/
|
|
||||||
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Copy data from one buffer on the device to another
|
Name: copyData
|
||||||
* Return: success or error code
|
Info: copy data from one buffer on the device to another
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
|
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Create kernel from compiled OpenCL program.
|
Name: createKernel
|
||||||
* Return: success or error code
|
Info: create kernel from program
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_createKernel(const char* kernel_name);
|
int ocl_createKernel(const char* kernel_name);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Set arguments for the kernel that will be launched.
|
Name: setKernelArgs
|
||||||
* Return: success or error code
|
Info: set opencl kernel arguments
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
|
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Execute selected kernel.
|
Name: executeKernel
|
||||||
* Before the kernel can be executed buildProgram must be executed, create kernel must be executed
|
Info: execute selected kernel (needs kernel parameters)
|
||||||
* and the kernel specified for execution must be in the compiled source, and the necessary
|
Return: success or error code
|
||||||
* kernel arguments must be set.
|
|
||||||
* Return: success or error code
|
|
||||||
*/
|
*/
|
||||||
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
|
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Read data from device (needs pointer to mem object).
|
Name: readData
|
||||||
* Return: success or error code
|
Info: read data from device (needs pointer to mem object)
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Free device memory (needs ptr to mem object).
|
Name: freeMemory
|
||||||
* Return: success or error code
|
Info: free device memory (needs ptr to mem object)
|
||||||
*/
|
Return: success or error code
|
||||||
|
*/
|
||||||
int ocl_freeMemory(cl_mem mem_ptr);
|
int ocl_freeMemory(cl_mem mem_ptr);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Free opencl resources.
|
Name: cleanUp
|
||||||
* Deletes the kernel, compiled program, command queue and closes the connection
|
Info: free opencl resources
|
||||||
* to device by releasing the context.
|
Return: success or error code
|
||||||
* Return: success or error code
|
*/
|
||||||
*/
|
|
||||||
int ocl_cleanUp();
|
int ocl_cleanUp();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Print info of currently selected device.
|
Name: deviceInfo
|
||||||
* Mostly for debugging purposes, but in verbose mode can be used to see device properties.
|
Info: print device info (mostly for debugging purposes)
|
||||||
* Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int ocl_deviceInfo(bool verbose = true);
|
int ocl_deviceInfo(bool verbose = true);
|
||||||
|
|
||||||
/*
|
/* Check OpenCL kernel.
|
||||||
* Check OpenCL kernel.
|
* Query device and check if it can run the kernel with required parameters
|
||||||
* Query device and check if it can run the kernel with required parameters.
|
|
||||||
* Also check the available OpenCL extensions - useful for checking the supported device
|
|
||||||
* features, like double precision.
|
|
||||||
*/
|
*/
|
||||||
int ocl_checkKernel(const char* kernel_name, int work_group_size,
|
int ocl_checkKernel(const char* kernel_name, int work_group_size,
|
||||||
bool double_precision, int &threadsPerBlock);
|
bool double_precision, int &threadsPerBlock);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Clear the event list.
|
Name: clearEvents
|
||||||
* Events can be used for timing and synchronization purposes.
|
Info: clear saved events (for debugging purposes)
|
||||||
*/
|
Return: nothing
|
||||||
|
*/
|
||||||
void ocl_clearEvents();
|
void ocl_clearEvents();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* print information about kernel timings from event list.
|
Name: eventInfo
|
||||||
* for debugging purposes
|
Info: print information about kernel timings (for debugging purposes)
|
||||||
*/
|
Return: nothing
|
||||||
|
*/
|
||||||
void ocl_eventInfo();
|
void ocl_eventInfo();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* Return current command queue.
|
Return current command queue
|
||||||
*/
|
*/
|
||||||
cl_command_queue ocl_getQueue() { return m_command_queue; }
|
cl_command_queue ocl_getQueue() { return m_command_queue; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
#define DKS_SUCCESS 0
|
#define DKS_SUCCESS 0
|
||||||
#define DKS_ERROR 1
|
#define DKS_ERROR 1
|
||||||
|
|
||||||
/** Deprecated, SimpleFit implementation of ChiSquare. */
|
|
||||||
class OpenCLChiSquare {
|
class OpenCLChiSquare {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -42,7 +42,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
|
|||||||
if (!fp)
|
if (!fp)
|
||||||
DEBUG_MSG("Can't open kernel file" << kernel_file);
|
DEBUG_MSG("Can't open kernel file" << kernel_file);
|
||||||
|
|
||||||
//get file size and allocate memory
|
//get file size and allocate memory
|
||||||
fseek(fp, 0, SEEK_END);
|
fseek(fp, 0, SEEK_END);
|
||||||
fsize = ftell(fp);
|
fsize = ftell(fp);
|
||||||
kernel_source = new char[fsize+1];
|
kernel_source = new char[fsize+1];
|
||||||
@ -52,7 +52,7 @@ std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
|
|||||||
fread(kernel_source, 1, sizeof(char)*fsize, fp);
|
fread(kernel_source, 1, sizeof(char)*fsize, fp);
|
||||||
kernel_source[fsize] = '\0';
|
kernel_source[fsize] = '\0';
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
|
|
||||||
std::string kernel_string (kernel_source);
|
std::string kernel_string (kernel_source);
|
||||||
return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
|
return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
|
||||||
|
|
||||||
@ -76,9 +76,10 @@ int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {
|
|||||||
|
|
||||||
double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
|
double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
|
||||||
|
|
||||||
|
|
||||||
int ierr;
|
int ierr;
|
||||||
//calc number of threads per workgroup and nr of work groups
|
//calc number of thread sper workgroup and nr of work groups
|
||||||
size_t work_size_sum = (size_t)blockSize_m;
|
size_t work_size_sum = 128;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
size_t work_items = (size_t)length;
|
size_t work_items = (size_t)length;
|
||||||
@ -86,7 +87,7 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
|
|||||||
work_items = (length / work_size_sum + 1) * work_size_sum;
|
work_items = (length / work_size_sum + 1) * work_size_sum;
|
||||||
int work_groups = length / work_size_sum + 1;
|
int work_groups = length / work_size_sum + 1;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
size_t work_items = 80 * work_size_sum;
|
size_t work_items = 80 * work_size_sum;
|
||||||
int work_groups = 80;
|
int work_groups = 80;
|
||||||
|
|
||||||
@ -95,19 +96,20 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
|
|||||||
|
|
||||||
double *partial_sums = new double[work_groups];
|
double *partial_sums = new double[work_groups];
|
||||||
tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
|
tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);
|
||||||
|
|
||||||
//execute sum kernel
|
//execute sum kernel
|
||||||
|
//ocl_createKernel("parallelReductionSum");
|
||||||
m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
|
m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
|
||||||
m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
|
m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
|
||||||
m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
|
||||||
m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
|
m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);
|
||||||
|
|
||||||
//read partial sums and free temp memory
|
//read partial sums and free temp mempry
|
||||||
m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
|
m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups);
|
||||||
m_oclbase->ocl_freeMemory(tmp_ptr);
|
m_oclbase->ocl_freeMemory(tmp_ptr);
|
||||||
|
|
||||||
//sumup partial sums on the host
|
//sumup partial sums on the host
|
||||||
double result = 0;
|
double result = 0;
|
||||||
for (int i = 0; i < work_groups; i++)
|
for (int i = 0; i < work_groups; i++)
|
||||||
@ -139,7 +141,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
//set work item size
|
//set work item size
|
||||||
size_t work_items;
|
size_t work_items;
|
||||||
size_t work_size = (size_t)blockSize_m;
|
size_t work_size = (size_t)blockSize_m;
|
||||||
|
|
||||||
if (numBlocks_m < 0)
|
if (numBlocks_m < 0)
|
||||||
work_items = (size_t)length;
|
work_items = (size_t)length;
|
||||||
else
|
else
|
||||||
@ -156,7 +157,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
//set kernel args
|
//set kernel args
|
||||||
size_t num=1;
|
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
||||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
||||||
@ -172,23 +172,20 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
|
m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
|
||||||
m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
|
m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
|
||||||
m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
|
m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
|
||||||
num = numpar; if (num == 0) num = 1;
|
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
|
||||||
m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
|
m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
|
||||||
num = numfunc; if (num == 0) num = 1;
|
m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
|
||||||
m_oclbase->ocl_setKernelArg(16, sizeof(double)*num, NULL);
|
|
||||||
num = nummap; if (num == 0) num = 1;
|
|
||||||
m_oclbase->ocl_setKernelArg(17, sizeof(int)*num, NULL);
|
|
||||||
|
|
||||||
if (ierr != DKS_SUCCESS)
|
if (ierr != DKS_SUCCESS)
|
||||||
return ierr;
|
return ierr;
|
||||||
} else if (fitType == FITTYPE_ASYMMETRY) {
|
} else if (fitType == FITTYPE_ASYMMETRY) {
|
||||||
//create kernel
|
//create kernel
|
||||||
ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
|
ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
|
||||||
|
|
||||||
if (ierr != DKS_SUCCESS)
|
if (ierr != DKS_SUCCESS)
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
//set kernel args
|
//set kernel args
|
||||||
size_t num=1;
|
|
||||||
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
|
||||||
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
|
||||||
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
|
||||||
@ -203,12 +200,9 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
|
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
|
||||||
m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
|
m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
|
||||||
m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
|
m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
|
||||||
num = numpar; if (num == 0) num = 1;
|
m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
|
||||||
m_oclbase->ocl_setKernelArg(14, sizeof(double)*num, NULL);
|
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
|
||||||
num = numfunc; if (num == 0) num = 1;
|
m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
|
||||||
m_oclbase->ocl_setKernelArg(15, sizeof(double)*num, NULL);
|
|
||||||
num = nummap; if (num == 0) num = 1;
|
|
||||||
m_oclbase->ocl_setKernelArg(16, sizeof(int)*num, NULL);
|
|
||||||
|
|
||||||
if (ierr != DKS_SUCCESS)
|
if (ierr != DKS_SUCCESS)
|
||||||
return ierr;
|
return ierr;
|
||||||
@ -232,7 +226,6 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
|
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
|
||||||
//write params to gpu
|
|
||||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
|
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
@ -242,7 +235,6 @@ int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
|
|||||||
if (numfunc == 0)
|
if (numfunc == 0)
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
//write function values to the GPU
|
|
||||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
|
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
@ -251,12 +243,11 @@ int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
|
|||||||
if (nummap == 0)
|
if (nummap == 0)
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
//write map values to the GPU
|
|
||||||
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
|
int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
|
||||||
return ierr;
|
return ierr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
|
int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
|
||||||
int size_func, int size_map)
|
int size_func, int size_map)
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -266,7 +257,7 @@ int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
|
|||||||
freeChiSquare();
|
freeChiSquare();
|
||||||
}
|
}
|
||||||
|
|
||||||
//allocate temporary memory, memory is allocated for the data set, parameters, functions and maps
|
//allocate temporary memory
|
||||||
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
|
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
|
||||||
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
|
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
|
||||||
if (size_func == 0)
|
if (size_func == 0)
|
||||||
@ -286,12 +277,12 @@ int OpenCLChiSquareRuntime::freeChiSquare() {
|
|||||||
int ierr = DKS_ERROR;
|
int ierr = DKS_ERROR;
|
||||||
if (initDone_m) {
|
if (initDone_m) {
|
||||||
|
|
||||||
//free GPU memory
|
//free memory
|
||||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
|
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
|
||||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
|
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
|
||||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
|
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
|
||||||
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
|
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
|
||||||
|
|
||||||
initDone_m = false;
|
initDone_m = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -317,13 +308,9 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
//check the GPU kernel
|
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
|
||||||
ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
|
|
||||||
if (threadsPerBlock < blockSize_m) {
|
|
||||||
std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
|
|
||||||
blockSize_m = threadsPerBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,54 +17,44 @@ const std::string openclFunctHeader = "double fTheory(double t, __local double *
|
|||||||
|
|
||||||
const std::string openclFunctFooter = "}\n";
|
const std::string openclFunctFooter = "}\n";
|
||||||
|
|
||||||
/**
|
|
||||||
* OpenCL implementation of ChiSquareRuntime class.
|
|
||||||
* Implements ChiSquareRuntime interface to allow musrfit to target devices that
|
|
||||||
* support OpenCL - Nvidia and AMD GPUs, Intel and AMD CPUs, Intel Xeon Phi.
|
|
||||||
*/
|
|
||||||
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
|
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
OpenCLBase *m_oclbase;
|
OpenCLBase *m_oclbase;
|
||||||
|
|
||||||
/**
|
/** Private function to add user defined function to kernel string
|
||||||
* Private function to add user defined function to kernel string.
|
*
|
||||||
*/
|
*/
|
||||||
std::string buildProgram(std::string function);
|
std::string buildProgram(std::string function);
|
||||||
|
|
||||||
/**
|
|
||||||
* Launch parallel reduction kernel to calculate the sum of data array
|
|
||||||
*/
|
|
||||||
double calculateSum(cl_mem data, int length);
|
double calculateSum(cl_mem data, int length);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/** Constructor with openclbase argument
|
||||||
* Constructor with openclbase argument.
|
*
|
||||||
*/
|
*/
|
||||||
OpenCLChiSquareRuntime(OpenCLBase *base);
|
OpenCLChiSquareRuntime(OpenCLBase *base);
|
||||||
|
|
||||||
/**
|
/** Default constructor
|
||||||
* Default constructor
|
*
|
||||||
*/
|
*/
|
||||||
OpenCLChiSquareRuntime();
|
OpenCLChiSquareRuntime();
|
||||||
|
|
||||||
/**
|
/** Default destructor
|
||||||
* Default destructor
|
*
|
||||||
*/
|
*/
|
||||||
~OpenCLChiSquareRuntime();
|
~OpenCLChiSquareRuntime();
|
||||||
|
|
||||||
/**
|
/** Compile program and save ptx.
|
||||||
* Compile program and save ptx.
|
|
||||||
* Add function string to the calcFunction kernel and compile the program
|
* Add function string to the calcFunction kernel and compile the program
|
||||||
* Function must be valid C math expression. Parameters can be addressed in
|
* Function must be valid C math expression. Parameters can be addressed in
|
||||||
* a form par[map[idx]]
|
* a form par[map[idx]]
|
||||||
*/
|
*/
|
||||||
int compileProgram(std::string function, bool mlh = false);
|
int compileProgram(std::string function, bool mlh = false);
|
||||||
|
|
||||||
/**
|
/** Launch selected kernel
|
||||||
* Launch selected kernel.
|
|
||||||
* Launched the selected kernel from the compiled code.
|
* Launched the selected kernel from the compiled code.
|
||||||
* Result is put in &result variable
|
* Result is put in &result variable
|
||||||
*/
|
*/
|
||||||
@ -74,26 +64,22 @@ public:
|
|||||||
double timeStart, double timeStep,
|
double timeStart, double timeStep,
|
||||||
double &result);
|
double &result);
|
||||||
|
|
||||||
/**
|
/** Write params to device.
|
||||||
* Write params to device.
|
|
||||||
* Write params from double array to mem_param_m memory on the device.
|
* Write params from double array to mem_param_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeParams(const double *params, int numparams);
|
int writeParams(const double *params, int numparams);
|
||||||
|
|
||||||
/**
|
/** Write functions to device.
|
||||||
* Write functions to device.
|
|
||||||
* Write function values from double array to mem_func_m memory on the device.
|
* Write function values from double array to mem_func_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeFunc(const double *func, int numfunc);
|
int writeFunc(const double *func, int numfunc);
|
||||||
|
|
||||||
/**
|
/** Write maps to device.
|
||||||
* Write maps to device.
|
|
||||||
* Write map values from int array to mem_map_m memory on the device.
|
* Write map values from int array to mem_map_m memory on the device.
|
||||||
*/
|
*/
|
||||||
int writeMap(const int *map, int nummap);
|
int writeMap(const int *map, int nummap);
|
||||||
|
|
||||||
/**
|
/** Allocate temporary memory needed for chi square.
|
||||||
* Allocate temporary memory needed for chi square.
|
|
||||||
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to be
|
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to be
|
||||||
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
* the maximum number of elements in any datasets that will be used for calculations. Size_param,
|
||||||
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
* size_func and size_map are the maximum number of parameters, functions and maps used in
|
||||||
@ -101,16 +87,14 @@ public:
|
|||||||
*/
|
*/
|
||||||
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
|
||||||
|
|
||||||
/**
|
/** Free temporary memory allocated for chi square.
|
||||||
* Free temporary memory allocated for chi square.
|
|
||||||
* Frees the chisq temporary memory and memory for params, functions and maps
|
* Frees the chisq temporary memory and memory for params, functions and maps
|
||||||
*/
|
*/
|
||||||
int freeChiSquare();
|
int freeChiSquare();
|
||||||
|
|
||||||
/**
|
/** Check MuSR kernels for necessary resources.
|
||||||
* Check MuSR kernels for necessary resources.
|
|
||||||
* Query device properties to get if sufficient resources are
|
* Query device properties to get if sufficient resources are
|
||||||
* available to run the kernels. Also checks if double precision is enabled on the device.
|
* available to run the kernels
|
||||||
*/
|
*/
|
||||||
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
|
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ TODO:
|
|||||||
2. boost.compute sort for user defined structure crashes
|
2. boost.compute sort for user defined structure crashes
|
||||||
*/
|
*/
|
||||||
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
int numparticles, bool enableRutherforScattering)
|
int numparticles, bool enableRutherfordScattering)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
//set number of total threads, and number threads per block
|
//set number of total threads, and number threads per block
|
||||||
|
@ -17,16 +17,12 @@
|
|||||||
#include "boost/compute/core.hpp"
|
#include "boost/compute/core.hpp"
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/** Double3 structure for use in OpenCL code. */
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
double x;
|
double x;
|
||||||
double y;
|
double y;
|
||||||
double z;
|
double z;
|
||||||
} Double3;
|
} Double3;
|
||||||
|
|
||||||
/**
|
|
||||||
* Structure for storing particles in OpenCL code.
|
|
||||||
*/
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int label;
|
int label;
|
||||||
unsigned localID;
|
unsigned localID;
|
||||||
@ -39,10 +35,6 @@ typedef struct {
|
|||||||
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
|
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
|
||||||
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
|
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
|
||||||
|
|
||||||
/**
|
|
||||||
* OpenCLCollimatorPhysics class based on DKSCollimatorPhysics interface.
|
|
||||||
* Implements CollimatorPhysics for OPAL using OpenCL for execution on AMD GPUs.
|
|
||||||
*/
|
|
||||||
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
|
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -50,22 +42,18 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/* constructor */
|
||||||
* Constructor with OpenCLBase as argument.
|
|
||||||
* Create a new instance of the OpenCLCollimatorPhysics using existing OpenCLBase object.
|
|
||||||
*/
|
|
||||||
OpenCLCollimatorPhysics(OpenCLBase *base) {
|
OpenCLCollimatorPhysics(OpenCLBase *base) {
|
||||||
m_oclbase = base;
|
m_oclbase = base;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/* destructor */
|
||||||
* Destructor.
|
|
||||||
*/
|
|
||||||
~OpenCLCollimatorPhysics() {
|
~OpenCLCollimatorPhysics() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* execute degrader code on device */
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
||||||
bool enableRutherforScattering = true);
|
bool enableRutherfordScattering = true);
|
||||||
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
@ -31,6 +31,7 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
|
|||||||
|
|
||||||
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
||||||
return OCL_ERROR;
|
return OCL_ERROR;
|
||||||
|
|
||||||
|
|
||||||
//execute kernel
|
//execute kernel
|
||||||
for (int step = 1; step < N; step <<= 1) {
|
for (int step = 1; step < N; step <<= 1) {
|
||||||
@ -88,78 +89,26 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
|
|||||||
call fft execution on device for every dimension
|
call fft execution on device for every dimension
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
||||||
|
int ierr;
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
|
int n = N[0];
|
||||||
|
|
||||||
if (forward)
|
for (int dim = 0; dim < ndim; dim++) {
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
|
||||||
0, NULL, NULL, &inout, NULL, NULL);
|
if (ierr != OCL_SUCCESS) {
|
||||||
else
|
DEBUG_MSG("Error executing bit reverse");
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
return OCL_ERROR;
|
||||||
0, NULL, NULL, &inout, NULL, NULL);
|
}
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
|
||||||
dkserr = DKS_ERROR;
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
DEBUG_MSG("Error executing fft reverse");
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
return OCL_ERROR;
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
}
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return dkserr;
|
return OCL_SUCCESS;
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
call rcfft execution on device for every dimension
|
|
||||||
*/
|
|
||||||
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
|
||||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
|
||||||
|
|
||||||
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
|
||||||
0, NULL, NULL, &real_in, &comp_out, NULL);
|
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
|
||||||
dkserr = DKS_ERROR;
|
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return dkserr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
call crfft execution on device for every dimension
|
|
||||||
*/
|
|
||||||
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
|
||||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
|
||||||
|
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
|
||||||
0, NULL, NULL, &comp_out, &real_in, NULL);
|
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
|
||||||
dkserr = DKS_ERROR;
|
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return dkserr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -171,11 +120,10 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
|
call kernel to normalize fft
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
/*
|
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
|
|
||||||
int n = N[0];
|
int n = N[0];
|
||||||
@ -202,175 +150,132 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
DEBUG_MSG("Error executing kernel");
|
DEBUG_MSG("Error executing kernel");
|
||||||
return OCL_ERROR;
|
return OCL_ERROR;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
return OCL_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
|
||||||
|
|
||||||
|
int ierr;
|
||||||
|
int size = sizeof(cl_double2)*pow(N,ndim);
|
||||||
|
|
||||||
|
cl_mem mem_tmp;
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
|
||||||
|
|
||||||
cl_int err;
|
//set the number of work items in each dimension
|
||||||
|
size_t work_items[3];
|
||||||
|
int p = 1;
|
||||||
|
int threads = N / 2;
|
||||||
|
int f = (forward) ? -1 : 1;
|
||||||
|
|
||||||
|
//execute kernel
|
||||||
|
int n = (int)log2(N);
|
||||||
|
for (int i = 0; i < ndim; i++) {
|
||||||
|
|
||||||
clfftDim dim;
|
int dim = i+1;
|
||||||
if (ndim == 1)
|
p = 1;
|
||||||
dim = CLFFT_1D;
|
work_items[0] = (dim == 1) ? N/2 : N;
|
||||||
else if (ndim == 2)
|
work_items[1] = (dim == 2) ? N/2 : N;
|
||||||
dim = CLFFT_2D;
|
work_items[2] = (dim == 3) ? N/2 : N;
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
//transpose array if calculating dimension larger than 1
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
//if (dim > 1)
|
||||||
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||||
|
|
||||||
|
//create kernel and set kernel arguments
|
||||||
|
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
|
||||||
|
return OCL_ERROR;
|
||||||
|
|
||||||
|
for (int t = 1; t <= log2(N); t++) {
|
||||||
|
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
|
||||||
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
|
||||||
|
|
||||||
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
||||||
|
return OCL_ERROR;
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
mem_tmp = mem_src;
|
||||||
err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
|
mem_src = mem_dst;
|
||||||
|
mem_dst = mem_tmp;
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
|
p = 2*p;
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting precision" << std::endl;
|
|
||||||
err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting layout" << std::endl;
|
|
||||||
err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting result location" << std::endl;
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Complex-to-complex plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|
||||||
cl_int err;
|
|
||||||
|
|
||||||
clfftDim dim;
|
|
||||||
if (ndim == 1)
|
|
||||||
dim = CLFFT_1D;
|
|
||||||
else if (ndim == 2)
|
|
||||||
dim = CLFFT_2D;
|
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
|
||||||
|
|
||||||
size_t half = (size_t)N[0] / 2 + 1;
|
|
||||||
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
|
||||||
size_t clOutStride[3] = {1, half, half * N[1]};
|
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
|
||||||
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
|
||||||
|
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
|
||||||
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
|
||||||
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
|
||||||
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
|
|
||||||
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
|
|
||||||
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Real-to-complex plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
|
||||||
cl_int err;
|
|
||||||
|
|
||||||
clfftDim dim;
|
|
||||||
if (ndim == 1)
|
|
||||||
dim = CLFFT_1D;
|
|
||||||
else if (ndim == 2)
|
|
||||||
dim = CLFFT_2D;
|
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
|
||||||
|
|
||||||
size_t half = (size_t)N[0] / 2 + 1;
|
|
||||||
size_t clInStride[3] = {1, half, half * N[1]};
|
|
||||||
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
|
||||||
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
|
||||||
|
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
|
||||||
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
|
||||||
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
|
||||||
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
|
|
||||||
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
|
|
||||||
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Complex-to-real plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::destroyFFT() {
|
|
||||||
clfftDestroyPlan(&planHandleZ2Z);
|
|
||||||
clfftDestroyPlan(&planHandleD2Z);
|
|
||||||
clfftDestroyPlan(&planHandleZ2D);
|
|
||||||
|
|
||||||
clfftTeardown();
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void OpenCLFFT::printError(clfftStatus err) {
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
std::cout << "Error creating default plan " << err << std::endl;
|
|
||||||
switch(err) {
|
|
||||||
case CLFFT_BUGCHECK:
|
|
||||||
std::cout << "bugcheck" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_NOTIMPLEMENTED:
|
|
||||||
std::cout << "not implemented" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
|
|
||||||
std::cout << "transposed not implemented" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_FILE_NOT_FOUND:
|
|
||||||
std::cout << "file not found" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_FILE_CREATE_FAILURE:
|
|
||||||
std::cout << "file create failure" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_VERSION_MISMATCH:
|
|
||||||
std::cout << "version missmatch" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_INVALID_PLAN:
|
|
||||||
std::cout << "invalid plan" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_DEVICE_NO_DOUBLE:
|
|
||||||
std::cout << "no double" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_DEVICE_MISMATCH:
|
|
||||||
std::cout << "device missmatch" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_ENDSTATUS:
|
|
||||||
std::cout << "end status" << std::endl;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout << "other: " << err << std::endl;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//transpose array back if calculating dimension larger than 1
|
||||||
|
//if (dim > 1)
|
||||||
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ndim*n % 2 == 1) {
|
||||||
|
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
|
||||||
|
mem_tmp = mem_src;
|
||||||
|
mem_src = mem_dst;
|
||||||
|
mem_dst = mem_tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_oclbase->ocl_freeMemory(mem_dst);
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
|
||||||
|
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
|
||||||
|
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
|
||||||
|
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
|
||||||
|
|
||||||
|
m_oclbase->ocl_createKernel("fft_batch3D");
|
||||||
|
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
|
||||||
|
|
||||||
|
|
||||||
|
for (int dim = 1; dim < ndim+1; dim++) {
|
||||||
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
|
||||||
|
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
|
||||||
|
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
|
||||||
|
if (ndim == 1)
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
|
size_t work_items[3];
|
||||||
|
work_items[0] = N[0];
|
||||||
|
work_items[1] = N[1];
|
||||||
|
work_items[2] = 1;
|
||||||
|
|
||||||
|
size_t work_group_size[3];
|
||||||
|
work_group_size[0] = N[0];
|
||||||
|
work_group_size[1] = N[1];
|
||||||
|
work_group_size[2] = 1;
|
||||||
|
|
||||||
|
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
|
||||||
|
|
||||||
|
m_oclbase->ocl_createKernel("transpose");
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
|
||||||
|
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1,3 +1,14 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
Name: OpenCLFFT
|
||||||
|
|
||||||
|
Author: Uldis Locans
|
||||||
|
|
||||||
|
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
|
||||||
|
|
||||||
|
Date: 19.09.2014
|
||||||
|
|
||||||
|
*/
|
||||||
#ifndef H_OPENCL_FFT
|
#ifndef H_OPENCL_FFT
|
||||||
#define H_OPENCL_FFT
|
#define H_OPENCL_FFT
|
||||||
|
|
||||||
@ -9,25 +20,12 @@
|
|||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithms/FFT.h"
|
||||||
#include "OpenCLBase.h"
|
#include "OpenCLBase.h"
|
||||||
|
|
||||||
#include "clFFT.h"
|
class OpenCLFFT : public DKSFFT {
|
||||||
|
|
||||||
/**
|
|
||||||
* OpenCL FFT class based on BaseFFT interface.
|
|
||||||
* Uses clFFT library to perform FFTs on AMD gpus.
|
|
||||||
* clFFT library works also on Nvidia GPUs and other devices that
|
|
||||||
* support OpenCL.
|
|
||||||
*/
|
|
||||||
class OpenCLFFT : public BaseFFT {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
OpenCLBase *m_oclbase;
|
OpenCLBase *m_oclbase;
|
||||||
|
|
||||||
clfftSetupData fftSetup;
|
|
||||||
clfftPlanHandle planHandleZ2Z;
|
|
||||||
clfftPlanHandle planHandleD2Z;
|
|
||||||
clfftPlanHandle planHandleZ2D;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: call fft kernels to execute FFT of the given domain,
|
Info: call fft kernels to execute FFT of the given domain,
|
||||||
data - device memory ptr, cdim - current dim to transform,
|
data - device memory ptr, cdim - current dim to transform,
|
||||||
@ -44,31 +42,15 @@ private:
|
|||||||
*/
|
*/
|
||||||
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
||||||
|
|
||||||
/** Get clfftStatus and print the corresponding error message.
|
|
||||||
* clfftStatus is returned from all clFFT library functions, print error displays the
|
|
||||||
* corresponding error message. If "other" is printed then error code corresponds to
|
|
||||||
* OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
|
|
||||||
* be checked to determine the reason for the error.
|
|
||||||
*/
|
|
||||||
void printError(clfftStatus err);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/* constructor - currently does nothing*/
|
/* constructor - currently does nothing*/
|
||||||
OpenCLFFT(OpenCLBase *base) {
|
OpenCLFFT(OpenCLBase *base) {
|
||||||
m_oclbase = base;
|
m_oclbase = base;
|
||||||
|
|
||||||
/* Set up fft */
|
|
||||||
cl_int err;
|
|
||||||
err = clfftInitSetupData(&fftSetup);
|
|
||||||
err = clfftSetup(&fftSetup);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
DEBUG_MSG("Error seting up clFFT");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* destructor - currently does nothing*/
|
/* destructor - currently does nothing*/
|
||||||
~OpenCLFFT() { destroyFFT(); }
|
~OpenCLFFT() { }
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: execute forward fft function with data set on device
|
Info: execute forward fft function with data set on device
|
||||||
@ -95,22 +77,35 @@ public:
|
|||||||
Info: set FFT size
|
Info: set FFT size
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int setupFFT(int ndim, int N[3]);
|
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int destroyFFT();
|
int destroyFFT() { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1);
|
int streamId = -1)
|
||||||
|
{
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1);
|
int streamId = -1)
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
|
{
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
|
||||||
|
{
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
|
||||||
|
|
||||||
|
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
|
||||||
|
|
||||||
|
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
|
||||||
|
|
||||||
//void printData3DN4(cl_double2* &data, int N);
|
//void printData3DN4(cl_double2* &data, int N);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -1,181 +0,0 @@
|
|||||||
#include "OpenCLGreensFunction.h"
|
|
||||||
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
|
|
||||||
|
|
||||||
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
|
|
||||||
m_base = base;
|
|
||||||
base_create = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
OpenCLGreensFunction::OpenCLGreensFunction() {
|
|
||||||
m_base = new OpenCLBase();
|
|
||||||
base_create = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
OpenCLGreensFunction::~OpenCLGreensFunction() {
|
|
||||||
if (base_create)
|
|
||||||
delete m_base;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::buildProgram() {
|
|
||||||
char *kernel_file = new char[500];
|
|
||||||
kernel_file[0] = '\0';
|
|
||||||
strcat(kernel_file, OPENCL_KERNELS);
|
|
||||||
strcat(kernel_file, GREENS_KERNEL);
|
|
||||||
|
|
||||||
return m_base->ocl_loadKernel(kernel_file);
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
|
||||||
int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = I * J * K;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelTmpgreen");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
|
|
||||||
int K, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem rho2_ptr = (cl_mem)rho2_m;
|
|
||||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
|
||||||
int NI = 2*(I - 1);
|
|
||||||
int NJ = 2*(J - 1);
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = I * J * K;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelIntegration");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
double zero = 0.0;
|
|
||||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
|
||||||
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem rho2_ptr = (cl_mem)rho2_m;
|
|
||||||
int NI = I + 1;
|
|
||||||
int NJ = J + 1;
|
|
||||||
int NK = K + 1;
|
|
||||||
int I2 = 2*I;
|
|
||||||
int J2 = 2*J;
|
|
||||||
int K2 = 2*K;
|
|
||||||
|
|
||||||
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = NI * NJ * NK;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
|
|
||||||
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem mem_ptr1 = (cl_mem) ptr1;
|
|
||||||
cl_mem mem_ptr2 = (cl_mem) ptr2;
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = size;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("multiplyComplexFields");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &size);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
@ -1,64 +0,0 @@
|
|||||||
#ifndef H_OPENCL_GREENSFUNCTION
|
|
||||||
#define H_OPENCL_GREENSFUNCTION
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "OpenCLBase.h"
|
|
||||||
|
|
||||||
/** OpenCL implementation of GreensFunction calculation for OPALs Poisson Solver. */
|
|
||||||
class OpenCLGreensFunction : public GreensFunction {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
bool base_create;
|
|
||||||
OpenCLBase *m_base;
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
/** Constructor with OpenCLBase argument */
|
|
||||||
OpenCLGreensFunction(OpenCLBase *base);
|
|
||||||
|
|
||||||
/** Default constructor */
|
|
||||||
OpenCLGreensFunction();
|
|
||||||
|
|
||||||
/** Destructor */
|
|
||||||
~OpenCLGreensFunction();
|
|
||||||
|
|
||||||
/** Load OpenCL kernel file containing greens function kernels.
|
|
||||||
* m_base takes the kernel file and compiles the OpenCL programm.
|
|
||||||
*/
|
|
||||||
int buildProgram();
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: calc itegral on device memory (taken from OPAL src code).
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
|
||||||
int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: integration of rho2_m field (taken from OPAL src code).
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
|
||||||
int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: mirror rho field (taken from OPAL src code).
|
|
||||||
Return: succes or error code
|
|
||||||
*/
|
|
||||||
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1.
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
@ -106,56 +106,6 @@ double ifld(double t, double alpha, double phi, double nu, double lambdaT, doubl
|
|||||||
return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
|
||||||
}
|
}
|
||||||
|
|
||||||
double ifgk(double t, double alpha, double nu, double sigma, double lambda, double beta) {
|
|
||||||
double wt = TWO_PI*nu*t;
|
|
||||||
double rate2 = sigma*sigma*t*t;
|
|
||||||
double rateL = 0.0;
|
|
||||||
double result = 0.0;
|
|
||||||
|
|
||||||
// make sure lambda > 0
|
|
||||||
if (lambda < 0.0)
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
if (beta < 0.001) {
|
|
||||||
rateL = 1.0;
|
|
||||||
} else {
|
|
||||||
rateL = pow(lambda*t, beta);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nu < 0.01) {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-rate2)*exp(-0.5*rate2);
|
|
||||||
} else {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-sigma*sigma*t*t/(wt)*sin(wt))*exp(-0.5*rate2);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
double ifll(double t, double alpha, double nu, double a, double lambda, double beta) {
|
|
||||||
double wt = TWO_PI*nu*t;
|
|
||||||
double at = a*t;
|
|
||||||
double rateL = 0.0;
|
|
||||||
double result = 0.0;
|
|
||||||
|
|
||||||
// make sure lambda > 0
|
|
||||||
if (lambda < 0.0)
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
if (beta < 0.001) {
|
|
||||||
rateL = 1.0;
|
|
||||||
} else {
|
|
||||||
rateL = pow(lambda*t, beta);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nu < 0.01) {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(1.0-at)*exp(-at);
|
|
||||||
} else {
|
|
||||||
result = (1.0-alpha)*exp(-rateL) + alpha*(cos(wt)-a/(TWO_PI*nu)*sin(wt))*exp(-at);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
double b(double t, double phi, double nu) {
|
double b(double t, double phi, double nu) {
|
||||||
return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
|
return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
#pragma OPENCL EXTENSION
|
||||||
|
|
||||||
|
|
||||||
/******Random numbers********/
|
/******Random numbers********/
|
||||||
|
|
||||||
@ -87,14 +89,13 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
|
|||||||
|
|
||||||
if (id < N) {
|
if (id < N) {
|
||||||
RNDState tmp;
|
RNDState tmp;
|
||||||
int tmp_seed = 2*id;// * 0x100000000ULL;
|
int tmp_seed = id;// * 0x100000000ULL;
|
||||||
tmp.s10 = 12345 + tmp_seed;
|
tmp.s10 = 12345 + tmp_seed;
|
||||||
tmp.s11 = 12345 + tmp_seed;
|
tmp.s11 = 12345 + tmp_seed;
|
||||||
tmp.s12 = 12345 + tmp_seed;
|
tmp.s12 = 123 + tmp_seed;
|
||||||
tmp.s20 = 12345 + tmp_seed;
|
tmp.s20 = 12345 + tmp_seed;
|
||||||
tmp.s21 = 12345 + tmp_seed;
|
tmp.s21 = 12345 + tmp_seed;
|
||||||
tmp.s22 = 12345 + tmp_seed;
|
tmp.s22 = 123 + tmp_seed;
|
||||||
|
|
||||||
|
|
||||||
tmp.z = 0;
|
tmp.z = 0;
|
||||||
tmp.gen = true;
|
tmp.gen = true;
|
||||||
@ -104,19 +105,6 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* create random numbers and fill an array */
|
|
||||||
__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {

  const int gid = get_global_id(0);

  // work items beyond the array length have nothing to do
  if (gid >= size)
    return;

  // work on a private copy of the generator state and store it back afterwards
  RNDState local_state = states[gid];
  data[gid] = rand_uniform(&local_state);
  states[gid] = local_state;

}
|
|
||||||
|
|
||||||
|
|
||||||
/**********Degrader**********/
|
/**********Degrader**********/
|
||||||
enum PARAMS { POSITION,
|
enum PARAMS { POSITION,
|
||||||
|
@ -1,170 +0,0 @@
|
|||||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
|
||||||
|
|
||||||
/** compute the greens integral analytically */
|
|
||||||
/*
 * One work item per element of an NI x NJ x NK grid (id = i + j*NI + k*NI*NJ).
 * Each work item evaluates the expression below at the point
 * ((i - 1/2)*hr_m0, (j - 1/2)*hr_m1, (k - 1/2)*hr_m2) and stores the value,
 * divided by the cell volume hr_m0*hr_m1*hr_m2, in tmpgreen[id].
 */
__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
                             int NI, int NJ, int NK)
{

  int id = get_global_id(0);

  // the global size is padded to a multiple of the work-group size; skip the padding
  if (id < NI * NJ * NK) {
    // recover the 3D index from the linear id
    int i = id % NI;
    int k = id / (NI * NJ);
    int j = (id - k * NI * NJ) / NI;

    double cellVolume = hr_m0 * hr_m1 * hr_m2;

    // coordinates shifted by half a cell
    double vv0 = i * hr_m0 - hr_m0 / 2;
    double vv1 = j * hr_m1 - hr_m1 / 2;
    double vv2 = k * hr_m2 - hr_m2 / 2;

    double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

    double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
    tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
    tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );

    tmpgrn = tmpgrn / 2;

    tmpgrn += vv1 * vv2 * log(vv0 + r);
    tmpgrn += vv0 * vv2 * log(vv1 + r);
    tmpgrn += vv0 * vv1 * log(vv2 + r);

    tmpgreen[id] = tmpgrn / cellVolume;

  }

}
|
|
||||||
|
|
||||||
/** perform the actual integration */
|
|
||||||
/*
 * One work item per element (i, j, k) of the NI_tmp x NJ_tmp x NK_tmp tmpgreen grid.
 * Combines the eight tmpgreen values at (i..i+1, j..j+1, k..k+1) with alternating
 * signs, treating neighbours outside the grid as 0, and stores the result in
 * rho2_m at (i, j, k) using row length NI and slab size NI*NJ.
 */
__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
                                int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
{

  const int gid = get_global_id(0);

  // the global size is padded; padding work items do nothing
  if (gid >= NI_tmp * NJ_tmp * NK_tmp)
    return;

  // strides of the tmpgreen grid
  const int row = NI_tmp;
  const int slab = NI_tmp * NJ_tmp;

  // recover the 3D index from the linear id
  const int i = gid % row;
  const int k = gid / slab;
  const int j = (gid - k * slab) / row;

  // which +1 neighbours are still inside the grid
  const bool in_i = (i + 1 < NI_tmp);
  const bool in_j = (j + 1 < NJ_tmp);
  const bool in_k = (k + 1 < NK_tmp);

  const int base = i + j * row + k * slab;

  const double g111 = (in_i && in_j && in_k) ? tmpgreen[base + 1 + row + slab] : 0.0;
  const double g100 = in_i                   ? tmpgreen[base + 1]              : 0.0;
  const double g010 = in_j                   ? tmpgreen[base + row]            : 0.0;
  const double g001 = in_k                   ? tmpgreen[base + slab]           : 0.0;
  const double g110 = (in_i && in_j)         ? tmpgreen[base + 1 + row]        : 0.0;
  const double g101 = (in_i && in_k)         ? tmpgreen[base + 1 + slab]       : 0.0;
  const double g011 = (in_j && in_k)         ? tmpgreen[base + row + slab]     : 0.0;
  const double g000 = tmpgreen[base];

  const double tmp_rho = g111 + g100 + g010 + g001 - g110 - g101 - g011 - g000;

  rho2_m[i + j*NI + k*NI*NJ] = tmp_rho;
}
|
|
||||||
|
|
||||||
/** mirror rho-field */
|
|
||||||
/* Copy the element at linear index NI*NJ into element 0 of rho2_m. */
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
  const int src = NI*NJ;
  rho2_m[0] = rho2_m[src];
}
|
|
||||||
|
|
||||||
/*
 * One work item per element (i, j, k) of an NI_tmp x NJ_tmp x NK_tmp grid.
 * Each work item reads rho2_m at (i, j, k) and writes that value to the
 * positions obtained by replacing i, j and/or k with NI - i, NJ - j, NK - k
 * (skipping a replacement when the corresponding index is 0). Indices are
 * linearised with row length NI and slab size NI*NJ; every access is guarded
 * by index < size.
 *
 * Before that, work item 0 copies rho2_m[NI*NJ] into rho2_m[0].
 */
__kernel void kernelMirroredRhoField(__global double *rho2_m,
                                     int NI, int NJ, int NK,
                                     int NI_tmp, int NJ_tmp, int NK_tmp,
                                     int size)
{

  int id = get_global_id(0);

  if (id == 0)
    rho2_m[0] = rho2_m[NI * NJ];

  // reached by every work item (not inside divergent control flow)
  barrier(CLK_GLOBAL_MEM_FENCE);

  int id1, id2, id3, id4, id5, id6, id7, id8;

  // the global size is padded to a multiple of the work-group size; skip the padding
  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    // recover the 3D index from the linear id
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;

    // mirrored indices
    int ri = NI - i;
    int rj = NJ - j;
    int rk = NK - k;

    id1 = k * NI * NJ + j * NI + i;
    id2 = k * NI * NJ + j * NI + ri;
    id3 = k * NI * NJ + rj * NI + i;
    id4 = k * NI * NJ + rj * NI + ri;

    id5 = rk * NI * NJ + j * NI + i;
    id6 = rk * NI * NJ + j * NI + ri;
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;

    double data = 0.0;
    if (id1 < size)
      data = rho2_m[id1];

    if (i != 0 && id2 < size) rho2_m[id2] = data;

    if (j != 0 && id3 < size) rho2_m[id3] = data;

    if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;

    if (k != 0 && id5 < size) rho2_m[id5] = data;

    if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;

    if (k != 0 && j != 0 && id7 < size) rho2_m[id7] = data;

    // logical && throughout (the original mixed in a bitwise & here)
    if (k != 0 && j != 0 && i != 0 && id8 < size) rho2_m[id8] = data;
  }

}
|
|
||||||
|
|
||||||
/** multiply complex fields */
|
|
||||||
/* Complex product of a and b, with .x holding the real and .y the imaginary part. */
double2 ComplexMul(double2 a, double2 b) {
  const double re = a.x * b.x - a.y * b.y;
  const double im = a.x * b.y + a.y * b.x;

  return (double2)(re, im);
}
|
|
||||||
|
|
||||||
/* Element-wise complex multiplication: ptr1[n] = ptr1[n] * ptr2[n] for n < size. */
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
                                    int size)
{

  const int gid = get_global_id(0);

  // the global size may be padded beyond size; padding work items do nothing
  if (gid >= size)
    return;

  ptr1[gid] = ComplexMul(ptr1[gid], ptr2[gid]);

}
|
|
@ -5,10 +5,6 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
/**
|
|
||||||
* Custom timer class.
|
|
||||||
* Allows to insert timers in the code to get function exectution times.
|
|
||||||
*/
|
|
||||||
class DKSTimer {
|
class DKSTimer {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -21,45 +17,39 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/** Init DKSTimer by seting timer to zero. */
|
/** Init DKSTimer by seting timer to zero */
|
||||||
DKSTimer();
|
DKSTimer();
|
||||||
|
|
||||||
~DKSTimer();
|
~DKSTimer();
|
||||||
|
|
||||||
/**
|
/** Init the timer
|
||||||
* Init the timer.
|
* Set the name for timer and clear all values
|
||||||
* Set the name for timer and clear all values
|
|
||||||
*/
|
*/
|
||||||
void init(std::string n);
|
void init(std::string n);
|
||||||
|
|
||||||
/**
|
/** Start the timer.
|
||||||
* Start the timer.
|
* Get the curret time with gettimeofday and save in timeStart
|
||||||
* Get the curret time with gettimeofday and save in timeStart
|
|
||||||
*/
|
*/
|
||||||
void start();
|
void start();
|
||||||
|
|
||||||
/**
|
/** Stop the timer
|
||||||
* Stop the timer.
|
* Get the curretn time with gettimeofday and save in timeEnd
|
||||||
* Get the curretn time with gettimeofday and save in timeEnd
|
* Calculate elapsed time by timeEnd - timeStart and add to timervalue
|
||||||
* Calculate elapsed time by timeEnd - timeStart and add to timervalue
|
|
||||||
*/
|
*/
|
||||||
void stop();
|
void stop();
|
||||||
|
|
||||||
/**
|
/** Reset timervalue to zero.
|
||||||
* Reset timervalue to zero.
|
* Set timervalue, timeStart and timeEnd to zero
|
||||||
* Set timervalue, timeStart and timeEnd to zero
|
|
||||||
*/
|
*/
|
||||||
void reset();
|
void reset();
|
||||||
|
|
||||||
/**
|
/** Return elapsed time in seconds.
|
||||||
* Return elapsed time in seconds.
|
* Return the value of timervalue
|
||||||
* Return the value of timervalue
|
|
||||||
*/
|
*/
|
||||||
double gettime();
|
double gettime();
|
||||||
|
|
||||||
/**
|
/** Print timer.
|
||||||
* Print timer.
|
* Print the elapsed time of the timer
|
||||||
* Print the elapsed time of the timer
|
|
||||||
*/
|
*/
|
||||||
void print();
|
void print();
|
||||||
|
|
||||||
|
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
||||||
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
||||||
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
||||||
ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
||||||
ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
||||||
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
||||||
@ -22,11 +22,10 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
|||||||
#ADD_EXECUTABLE(testGather testGather.cpp)
|
#ADD_EXECUTABLE(testGather testGather.cpp)
|
||||||
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
||||||
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
||||||
ADD_EXECUTABLE(testRandom testRandom.cpp)
|
|
||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
||||||
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
||||||
#ADD_EXECUTABLE(testPush testPush.cpp)
|
#ADD_EXECUTABLE(testPush testPush.cpp)
|
||||||
ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
||||||
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
||||||
|
|
||||||
@ -39,8 +38,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
|||||||
#TARGET_LINK_LIBRARIES(testFFT dks)
|
#TARGET_LINK_LIBRARIES(testFFT dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMIC dks)
|
#TARGET_LINK_LIBRARIES(testMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFT3D dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
||||||
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
||||||
@ -54,11 +53,10 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
|
|||||||
#TARGET_LINK_LIBRARIES(testGather dks)
|
#TARGET_LINK_LIBRARIES(testGather dks)
|
||||||
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
||||||
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
||||||
TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
|
|
||||||
#TARGET_LINK_LIBRARIES(testPush dks)
|
#TARGET_LINK_LIBRARIES(testPush dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
||||||
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
||||||
|
|
||||||
@ -83,4 +81,4 @@ TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
|
|||||||
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
||||||
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
|
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
|
||||||
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
@ -129,9 +129,7 @@ int main(int argc, char *argv[]) {
|
|||||||
//init random
|
//init random
|
||||||
base.callInitRandoms(numpart);
|
base.callInitRandoms(numpart);
|
||||||
|
|
||||||
|
|
||||||
//**test collimator physics and sort***//
|
//**test collimator physics and sort***//
|
||||||
|
|
||||||
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
||||||
|
|
||||||
//allocate memory for particles
|
//allocate memory for particles
|
||||||
@ -212,8 +210,8 @@ int main(int argc, char *argv[]) {
|
|||||||
base.freeMemory<double>(pz_ptr, numpart);
|
base.freeMemory<double>(pz_ptr, numpart);
|
||||||
|
|
||||||
base.freeMemory<double>(param_ptr, 12);
|
base.freeMemory<double>(param_ptr, 12);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
std::cout << std::fixed << std::setprecision(4);
|
||||||
for (int i = 0; i < 10; i++) {
|
for (int i = 0; i < 10; i++) {
|
||||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -19,30 +18,22 @@ int main(int argc, char *argv[]) {
|
|||||||
int N = 16;
|
int N = 16;
|
||||||
char *api_name = new char[10];
|
char *api_name = new char[10];
|
||||||
char *device_name = new char[10];
|
char *device_name = new char[10];
|
||||||
|
if (argc == 2) {
|
||||||
for (int i = 1; i < argc; i++) {
|
N = atoi(argv[1]);
|
||||||
if (argv[i] == string("-cuda")) {
|
strcpy(api_name, "Cuda");
|
||||||
strcpy(api_name, "Cuda");
|
strcpy(device_name, "-gpu");
|
||||||
strcpy(device_name, "-gpu");
|
} else if (argc == 3) {
|
||||||
}
|
N = atoi(argv[1]);
|
||||||
|
strcpy(api_name, argv[2]);
|
||||||
if (argv[i] == string("-opencl")) {
|
strcpy(device_name, "-gpu");
|
||||||
strcpy(api_name, "OpenCL");
|
} else if (argc == 4) {
|
||||||
strcpy(device_name, "-gpu");
|
N = atoi(argv[1]);
|
||||||
}
|
strcpy(api_name, argv[2]);
|
||||||
|
strcpy(device_name, argv[3]);
|
||||||
if (argv[i] == string("-mic")) {
|
} else {
|
||||||
strcpy(api_name, "OpenMP");
|
N = 16;
|
||||||
strcpy(device_name, "-mic");
|
strcpy(api_name, "OpenCL");
|
||||||
}
|
strcpy(device_name, "-gpu");
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-cpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-N"))
|
|
||||||
N = atoi(argv[i+1]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
@ -83,16 +74,9 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* write data to device */
|
/* write data to device */
|
||||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||||
if (N < 5)
|
|
||||||
printData3DN4(cdata, N, 3);
|
|
||||||
|
|
||||||
|
|
||||||
/* execute fft */
|
/* execute fft */
|
||||||
base.callFFT(mem_ptr, 3, dimsize);
|
base.callFFT(mem_ptr, 3, dimsize);
|
||||||
if (N < 5) {
|
|
||||||
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
|
|
||||||
printData3DN4(cfft, N, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* execute ifft */
|
/* execute ifft */
|
||||||
base.callIFFT(mem_ptr, 3, dimsize);
|
base.callIFFT(mem_ptr, 3, dimsize);
|
||||||
@ -102,9 +86,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* read data from device */
|
/* read data from device */
|
||||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||||
if (N < 5)
|
|
||||||
printData3DN4(cifft, N, 3);
|
|
||||||
|
|
||||||
/* free device memory */
|
/* free device memory */
|
||||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||||
|
|
||||||
@ -148,7 +130,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
|
|||||||
if (a < 10e-5 && a > -10e-5)
|
if (a < 10e-5 && a > -10e-5)
|
||||||
a = 0;
|
a = 0;
|
||||||
|
|
||||||
cout << "(" << d << "," << a << ") ";
|
cout << d << "; " << a << "\t";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
@ -175,5 +157,3 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
|
|||||||
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <fstream>
|
|
||||||
#include <iomanip>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -10,53 +8,54 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||||
void initData(double *data, int dimsize[3], int dim);
|
void initData(double *data, int dimsize[3]);
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
|
||||||
char *api_name, char *device_name, char *file_name);
|
|
||||||
void printHelp();
|
void printHelp();
|
||||||
|
|
||||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
|
||||||
void printData3DN4(double* &data, int N, int dim);
|
|
||||||
|
|
||||||
/**
 * Pass-through used when writing values to the output file.
 * Clamping of small values (a < 1e-10 -> 0.0) is currently disabled,
 * so the argument is returned unchanged.
 */
double precision(double a) {
  const double value = a;
  return value;
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
int N1 = 8;
|
int N1 = 8;
|
||||||
int N2 = 8;
|
int N2 = 8;
|
||||||
int N3 = 8;
|
int N3 = 8;
|
||||||
int dim = 3;
|
int dim = 3;
|
||||||
int loop = 0;
|
int loop = 10;
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
char *file_name = new char[50];
|
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
|
if ( readParams(argc, argv, N1, N2, N3, loop) )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
int dimsize[3] = {N3, N2, N1};
|
||||||
|
|
||||||
int dimsize[3] = {N1, N2, N3};
|
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||||
|
|
||||||
double *rdata = new double[sizereal];
|
double *rdata = new double[sizereal];
|
||||||
double *outdata = new double[sizereal];
|
double *outdata = new double[sizereal];
|
||||||
complex<double> *cfft = new complex<double>[sizecomp];
|
complex<double> *cfft = new complex<double>[sizecomp];
|
||||||
initData(rdata, dimsize, dim);
|
|
||||||
|
for (int i=0; i<sizecomp; ++i) {
|
||||||
|
cfft[i].real() = 7.;
|
||||||
|
cfft[i].imag() = 3.33;
|
||||||
|
}
|
||||||
|
initData(rdata, dimsize);
|
||||||
|
|
||||||
/* init DKSBase */
|
/* init DKSBase */
|
||||||
cout << "Init device and set function" << endl;
|
cout << "Init device and set function" << endl;
|
||||||
|
#ifdef DKS_MIC
|
||||||
DKSBase base;
|
DKSBase base;
|
||||||
base.setAPI(api_name, strlen(api_name));
|
base.setAPI("OpenMP", 6);
|
||||||
base.setDevice(device_name, strlen(device_name));
|
base.setDevice("-mic", 4);
|
||||||
|
base.initDevice();
|
||||||
|
base.setupFFTRC(dim, dimsize);
|
||||||
|
/* setup backward fft (COMPLEX->REAL) */
|
||||||
|
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef DKS_CUDA
|
||||||
|
DKSBase base;
|
||||||
|
base.setAPI("Cuda", 4);
|
||||||
|
base.setDevice("-gpu", 4);
|
||||||
base.initDevice();
|
base.initDevice();
|
||||||
base.setupFFT(dim, dimsize);
|
base.setupFFT(dim, dimsize);
|
||||||
|
#endif
|
||||||
|
|
||||||
// allocate memory on device
|
// allocate memory on device
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -68,59 +67,69 @@ int main(int argc, char *argv[]) {
|
|||||||
// execute one run before starting the timers
|
// execute one run before starting the timers
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
|
||||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
|
//timer for total loop time, FFT and IFFT calls
|
||||||
|
struct timeval timeStart, timeEnd;
|
||||||
|
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
|
||||||
|
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
|
||||||
|
|
||||||
|
gettimeofday(&timeStart, NULL);
|
||||||
|
for (int i=0; i<loop; ++i){
|
||||||
|
|
||||||
|
// write data to device
|
||||||
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
|
|
||||||
|
// execute rcfft
|
||||||
|
gettimeofday(&timeFFTStart[i], NULL);
|
||||||
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
|
gettimeofday(&timeFFTEnd[i], NULL);
|
||||||
|
|
||||||
|
// execute crfft
|
||||||
|
gettimeofday(&timeIFFTStart[i], NULL);
|
||||||
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
|
gettimeofday(&timeIFFTEnd[i], NULL);
|
||||||
|
|
||||||
|
//normalize
|
||||||
|
#ifdef DKS_CUDA
|
||||||
|
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// read IFFT data from device
|
||||||
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
|
|
||||||
ofstream myfile;
|
|
||||||
myfile.open(file_name);
|
|
||||||
myfile<< "in\tout\treal\timag\n";
|
|
||||||
for (int i = 0; i < sizereal; i++) {
|
|
||||||
//myfile << precision(rdata[i]) << "\t";
|
|
||||||
//myfile << precision(outdata[i]) << "\t";
|
|
||||||
if (i < sizecomp) {
|
|
||||||
myfile << precision(cfft[i].real()) << "\t";
|
|
||||||
myfile << precision(cfft[i].imag());
|
|
||||||
}
|
|
||||||
myfile << "\n";
|
|
||||||
}
|
}
|
||||||
myfile.close();
|
gettimeofday(&timeEnd, NULL);
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
if (dim == 2) {
|
|
||||||
for (int i = 0; i < N2; i++) {
|
|
||||||
for (int j = 0; j < N1; j++) {
|
|
||||||
cout << rdata[i*N1 + j] << " ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (dim == 2) {
|
|
||||||
for (int i = 0; i < N2; i++) {
|
|
||||||
for (int j = 0; j < N1 / 2 + 1; j++) {
|
|
||||||
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
// free device memory
|
// free device memory
|
||||||
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
||||||
base.freeMemory<double>(real_ptr, sizereal);
|
base.freeMemory<double>(real_ptr, sizereal);
|
||||||
base.freeMemory<double>(real_res_ptr, sizereal);
|
base.freeMemory<double>(real_res_ptr, sizereal);
|
||||||
|
|
||||||
// compare in and out data to see if we get back the same results
|
// compare in and out data to see if we get back the same results
|
||||||
cout << "comp" << endl;
|
|
||||||
compareData(rdata, outdata, N1, N2, N3, dim);
|
compareData(rdata, outdata, N1, N2, N3, dim);
|
||||||
cout << "done" << endl;
|
|
||||||
|
//calculate seconds for total time and fft times
|
||||||
|
double tfft = 0;
|
||||||
|
double tifft = 0;
|
||||||
|
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
|
||||||
|
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
|
||||||
|
|
||||||
|
for (int i = 0; i < loop; i++) {
|
||||||
|
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
|
||||||
|
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
|
||||||
|
|
||||||
|
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
|
||||||
|
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
|
||||||
|
}
|
||||||
|
|
||||||
|
//print timing results
|
||||||
|
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
|
||||||
|
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
|
||||||
|
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
|
||||||
|
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
|
||||||
|
<< "\n\n";
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -128,10 +137,10 @@ int main(int argc, char *argv[]) {
|
|||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
||||||
int id;
|
int id;
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < NK; i++) {
|
for (int i = 0; i < NI; i++) {
|
||||||
for (int j = 0; j < NJ; j++) {
|
for (int j = 0; j < NJ; j++) {
|
||||||
for (int k = 0; k < NI; k++) {
|
for (int k = 0; k < NK; k++) {
|
||||||
id = i*NI*NJ + j*NI + k;
|
id = k*NI*NJ + j*NI + i;
|
||||||
sum += fabs(data1[id] - data2[id]);
|
sum += fabs(data1[id] - data2[id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -139,21 +148,13 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
|
|||||||
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int dimsize[3], int dim) {
|
void initData(double *data, int dimsize[3]) {
|
||||||
if (dim == 3) {
|
for (int i = 0; i < dimsize[2]; i++) {
|
||||||
for (int i = 0; i < dimsize[2]; i++)
|
|
||||||
for (int j = 0; j < dimsize[1]; j++)
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
|
|
||||||
} else if (dim == 2) {
|
|
||||||
for (int j = 0; j < dimsize[1]; j++) {
|
for (int j = 0; j < dimsize[1]; j++) {
|
||||||
for (int k = 0; k < dimsize[0]; k++) {
|
for (int k = 0; k < dimsize[0]; k++) {
|
||||||
data[j*dimsize[0] + k] = sin(k);
|
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[k] = sin(k);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,17 +173,10 @@ void printHelp() {
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
|
||||||
char *api_name, char *device_name, char *file_name)
|
|
||||||
{
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
|
|
||||||
if ( argv[i] == std::string("-dim")) {
|
|
||||||
dim = atoi(argv[i + 1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( argv[i] == std::string("-grid") ) {
|
if ( argv[i] == std::string("-grid") ) {
|
||||||
N1 = atoi(argv[i + 1]);
|
N1 = atoi(argv[i + 1]);
|
||||||
N2 = atoi(argv[i + 2]);
|
N2 = atoi(argv[i + 2]);
|
||||||
@ -199,72 +193,7 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, in
|
|||||||
printHelp();
|
printHelp();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv[i] == string("-cuda")) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
strcpy(file_name, "cuda_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
strcpy(file_name, "opencl_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
strcpy(file_name, "openmp_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-cpu");
|
|
||||||
strcpy(file_name, "opencl_cpu_fft.dat");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void printData3DN4(complex<double>* &data, int N, int dim) {
|
|
||||||
|
|
||||||
for (int j = 0; j < N; j++) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
for (int k = 0; k < N/2 + 1; k++) {
|
|
||||||
double d = data[i*N*N + j*N + k].real();
|
|
||||||
double a = data[i*N*N + j*N + k].imag();
|
|
||||||
|
|
||||||
if (d < 10e-5 && d > -10e-5)
|
|
||||||
d = 0;
|
|
||||||
if (a < 10e-5 && a > -10e-5)
|
|
||||||
a = 0;
|
|
||||||
|
|
||||||
cout << "(" << d << "," << a << ") ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void printData3DN4(double* &data, int N, int dim) {
|
|
||||||
|
|
||||||
for (int j = 0; j < N; j++) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
for (int k = 0; k < N; k++) {
|
|
||||||
double d = data[i*N*N + j*N + k];
|
|
||||||
|
|
||||||
if (d < 10e-5 && d > -10e-5)
|
|
||||||
d = 0;
|
|
||||||
|
|
||||||
cout << d << " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
//#include <mpi.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -10,265 +11,309 @@ using namespace std;
|
|||||||
|
|
||||||
|
|
||||||
void printData3D(double* data, int N, int NI, const char *message = "") {
|
void printData3D(double* data, int N, int NI, const char *message = "") {
|
||||||
if (strcmp(message, "") != 0)
|
if (strcmp(message, "") != 0)
|
||||||
cout << message;
|
cout << message;
|
||||||
|
|
||||||
for (int i = 0; i < NI; i++) {
|
for (int i = 0; i < NI; i++) {
|
||||||
for (int j = 0; j < N; j++) {
|
for (int j = 0; j < N; j++) {
|
||||||
for (int k = 0; k < N; k++) {
|
for (int k = 0; k < N; k++) {
|
||||||
cout << data[i*N*N + j*N + k] << "\t";
|
cout << data[i*N*N + j*N + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int N) {
|
void initData(double *data, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N/4 + 1; i++) {
|
for (int i = 0; i < N/4 + 1; i++) {
|
||||||
for (int j = 0; j < N/2 + 1; j++) {
|
for (int j = 0; j < N/2 + 1; j++) {
|
||||||
for (int k = 0; k < N/2 + 1; k++) {
|
for (int k = 0; k < N/2 + 1; k++) {
|
||||||
data[i*N*N + j*N + k] = k+1;
|
data[i*N*N + j*N + k] = k+1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData2(double *data, int N) {
|
void initData2(double *data, int N) {
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
data[i] = i;
|
data[i] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initComplex( complex<double> *d, int N) {
|
void initComplex( complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
d[i] = complex<double>(2, 0);
|
d[i] = complex<double>(2, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printComplex(complex<double> *d, int N) {
|
void printComplex(complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
cout << d[i] << "\t";
|
cout << d[i] << "\t";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void printDouble(double *d, int N) {
|
|
||||||
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
cout << d[i] << ", ";
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initMirror(double *data, int n1, int n2, int n3) {
|
void initMirror(double *data, int n1, int n2, int n3) {
|
||||||
int d = 1;
|
int d = 1;
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
||||||
data[i * n2 * n1 + j * n1 + k] = d++;
|
data[i * n2 * n1 + j * n1 + k] = d++;
|
||||||
else
|
else
|
||||||
data[i * n2 * n1 + j * n1 + k] = 0;
|
data[i * n2 * n1 + j * n1 + k] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void printDiv(int c) {
|
void printDiv(int c) {
|
||||||
for (int i = 0; i < c; i++)
|
for (int i = 0; i < c; i++)
|
||||||
cout << "-";
|
cout << "-";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printMirror(double *data, int n1, int n2, int n3) {
|
void printMirror(double *data, int n1, int n2, int n3) {
|
||||||
|
|
||||||
printDiv(75);
|
printDiv(75);
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
double sumData(double *data, int datasize) {
|
double sumData(double *data, int datasize) {
|
||||||
|
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < datasize; i++)
|
for (int i = 0; i < datasize; i++)
|
||||||
sum += data[i];
|
sum += data[i];
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
char *api_name = new char[10];
|
/* mpi init */
|
||||||
char *device_name = new char[10];
|
//int rank, nprocs;
|
||||||
|
//MPI_Init(&argc, &argv);
|
||||||
|
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
/*
|
||||||
if (argv[i] == string("-cuda")) {
|
if (nprocs != 8) {
|
||||||
strcpy(api_name, "Cuda");
|
cout << "example was set to run with 8 processes" << endl;
|
||||||
strcpy(device_name, "-gpu");
|
cout << "exit..." << endl;
|
||||||
}
|
return 0;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
/* set domain size */
|
||||||
strcpy(api_name, "OpenCL");
|
int NG[3] = {64, 64, 32};
|
||||||
strcpy(device_name, "-gpu");
|
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
||||||
}
|
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
||||||
|
int sizerho = NG[0] * NG[1] * NG[2];
|
||||||
|
int sizegreen = ng[0] * ng[1] * ng[2];
|
||||||
|
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
||||||
|
int id[3];
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
//id[0] = 0;
|
||||||
strcpy(api_name, "OpenMP");
|
//id[1] = NL[1] * (rank % 4);
|
||||||
strcpy(device_name, "-mic");
|
//id[2] = NL[2] * (rank / 4);
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
/* print some messages bout the example in the begginig */
|
||||||
strcpy(api_name, "OpenCL");
|
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
||||||
strcpy(device_name, "-cpu");
|
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
|
||||||
}
|
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
||||||
}
|
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
|
||||||
|
int tmp[3];
|
||||||
|
/* for (int p = 1; p < nprocs; p++) {
|
||||||
|
MPI_Status mpistatus;
|
||||||
|
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
|
||||||
|
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
|
||||||
|
}*/
|
||||||
|
// } else {
|
||||||
|
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
|
||||||
|
// }
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
/* dks init and create 2 streams */
|
||||||
|
int dkserr;
|
||||||
|
//int streamGreens, streamFFT;
|
||||||
|
#ifdef DKS_MIC
|
||||||
|
DKSBase base;
|
||||||
|
base.setAPI("OpenMP", 6);
|
||||||
|
base.setDevice("-mic", 4);
|
||||||
|
base.initDevice();
|
||||||
|
#endif
|
||||||
|
|
||||||
/* set domain size */
|
#ifdef DKS_CUDA
|
||||||
int NG[3] = {64, 64, 32};
|
DKSBase base;
|
||||||
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
base.setAPI("Cuda", 4);
|
||||||
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
base.setDevice("-gpu", 4);
|
||||||
int sizerho = NG[0] * NG[1] * NG[2];
|
base.initDevice();
|
||||||
int sizegreen = ng[0] * ng[1] * ng[2];
|
#endif
|
||||||
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
|
||||||
|
|
||||||
/* print some messages bout the example in the begginig */
|
//base.createStream(streamFFT);
|
||||||
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
//if (rank == 0) {
|
||||||
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
// base.createStream(streamGreens);
|
||||||
|
base.setupFFT(3, NG);
|
||||||
|
//}
|
||||||
|
|
||||||
/* dks init and create 2 streams */
|
/* allocate memory and init rho field */
|
||||||
int dkserr;
|
double *rho = new double[sizerho];
|
||||||
DKSBase base;
|
double *rho_out = new double[sizerho];
|
||||||
base.setAPI(api_name, strlen(api_name));
|
//double *green_out = new double[sizegreen];
|
||||||
base.setDevice(device_name, strlen(device_name));
|
initMirror(rho, NL[0], NL[1], NL[2]);
|
||||||
base.initDevice();
|
|
||||||
base.setupFFT(3, NG);
|
|
||||||
|
|
||||||
/* allocate memory and init rho field */
|
/*
|
||||||
double *rho = new double[sizerho];
|
allocate memory on device for
|
||||||
double *rho_out = new double[sizerho];
|
- rho field
|
||||||
//double *green_out = new double[sizegreen];
|
- rho FFT
|
||||||
double *mirror_out = new double[sizerho];
|
- tmpgreen
|
||||||
//initMirror(rho, NL[0], NL[1], NL[2]);
|
- greens integral
|
||||||
initMirror(rho, NG[0], NG[1], NG[2]);
|
- greens integral FFT
|
||||||
|
*/
|
||||||
|
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
||||||
|
// if (rank == 0) {
|
||||||
|
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
||||||
|
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
/* } else {
|
||||||
|
grntr_ptr = NULL;
|
||||||
|
rho2_ptr = NULL;
|
||||||
|
grn_ptr = NULL;
|
||||||
|
rho2tr_ptr = NULL;
|
||||||
|
tmpgreen_ptr = NULL;
|
||||||
|
}*/
|
||||||
|
|
||||||
/*
|
|
||||||
allocate memory on device for
|
|
||||||
- rho field
|
|
||||||
- rho FFT
|
|
||||||
- tmpgreen
|
|
||||||
- greens integral
|
|
||||||
- greens integral FFT
|
|
||||||
*/
|
|
||||||
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
|
||||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
|
||||||
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
|
|
||||||
/* =================================================*/
|
/* send and receive pointer to allocated memory on device */
|
||||||
/* =================================================*/
|
/*
|
||||||
/* =====loop trough fftpoison solver iterations=====*/
|
if (rank == 0) {
|
||||||
/* =================================================*/
|
for (int p = 1; p < nprocs; p++)
|
||||||
/* =================================================*/
|
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
|
||||||
|
} else {
|
||||||
|
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
|
||||||
|
}
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
*/
|
||||||
|
|
||||||
double old_sum = 0;
|
|
||||||
|
|
||||||
int hr_m[3] = {1, 1, 1};
|
/* =================================================*/
|
||||||
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
|
/* =================================================*/
|
||||||
|
/* =====loop trough fftpoison solver iterations=====*/
|
||||||
|
/* =================================================*/
|
||||||
|
/* =================================================*/
|
||||||
|
|
||||||
/* calculate greens integral on gpu */
|
double old_sum = 0;
|
||||||
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
double tmp_sum = 0;
|
||||||
|
for (int l = 0; l < 100; l++) {
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
/* on node 0, calculate tmpgreen on gpu */
|
||||||
|
int hr_m[3] = {1, 1, 1};
|
||||||
|
//if (rank == 0)
|
||||||
|
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
|
||||||
|
hr_m[0], hr_m[1], hr_m[2]);
|
||||||
|
|
||||||
/* mirror the field */
|
/* calculate greens integral on gpu */
|
||||||
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
//if (rank == 0)
|
||||||
/*
|
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
||||||
base.readData<double>(grn_ptr, mirror_out, sizerho);
|
|
||||||
for (int i = 0; i < sizerho; i++)
|
|
||||||
cout << mirror_out[i] << " ";
|
|
||||||
cout << endl << endl;
|
|
||||||
|
|
||||||
for (int i = 0; i < sizerho; i++)
|
/* mirror the field */
|
||||||
cout << rho[i] << " ";
|
//if (rank == 0)
|
||||||
cout << endl << endl;
|
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
||||||
*/
|
|
||||||
/* transfer rho field to device */
|
|
||||||
base.writeData<double>(rho2_ptr, rho, sizerho);
|
|
||||||
|
|
||||||
/* get FFT of rho field */
|
|
||||||
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
|
||||||
|
|
||||||
/* get FFT of mirrored greens integral */
|
/* get FFT of mirrored greens integral */
|
||||||
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
//if (rank == 0)
|
||||||
|
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
||||||
|
|
||||||
/* multiply both FFTs */
|
/* transfer rho field to device */
|
||||||
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
|
||||||
|
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
/*
|
/* get FFT of rho field */
|
||||||
complex<double> *crho = new complex<double>[sizecomp];
|
//if (rank == 0) {
|
||||||
complex<double> *cgre = new complex<double>[sizecomp];
|
//base.syncDevice();
|
||||||
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
|
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
||||||
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
|
//}
|
||||||
|
|
||||||
for (int i = 0; i < sizecomp; i++)
|
/* multiply both FFTs */
|
||||||
cout << cgre[i].real() << " ";
|
//if (rank == 0)
|
||||||
cout << endl << endl;
|
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
for (int i = 0; i < sizecomp; i++)
|
/* inverse fft and transfer data back */
|
||||||
cout << crho[i].real() << " ";
|
/*
|
||||||
cout << endl << endl;
|
multiple device syncs and mpi barriers are used to make sure data
|
||||||
|
transfer is started when results are ready and progam moves on
|
||||||
delete[] crho;
|
only when data transfer is finished
|
||||||
delete[] cgre;
|
*/
|
||||||
*/
|
//if (rank == 0) {
|
||||||
|
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
||||||
|
//base.syncDevice();
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||||
|
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//base.syncDevice();
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//cout << "result: " << sumData(rho_out, sizerho) << endl;
|
||||||
|
if (l == 0) {
|
||||||
|
old_sum = sumData(rho_out, sizerho);
|
||||||
|
} else {
|
||||||
|
tmp_sum = sumData(rho_out, sizerho);
|
||||||
|
if (old_sum != tmp_sum) {
|
||||||
|
cout << "diff in iteration: " << l << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*} else {
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
/* inverse fft and transfer data back */
|
|
||||||
/*
|
|
||||||
multiple device syncs and mpi barriers are used to make sure data
|
|
||||||
transfer is started when results are ready and progam moves on
|
|
||||||
only when data transfer is finished
|
|
||||||
*/
|
|
||||||
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
|
||||||
|
|
||||||
base.readData<double> (rho2_ptr, rho_out, sizerho);
|
|
||||||
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
cout << rho_out[i] << " ";
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
old_sum = sumData(rho_out, sizerho);
|
|
||||||
|
|
||||||
|
}
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* ==========end fftpoison solver test run==========*/
|
/* ==========end fftpoison solver test run==========*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
|
|
||||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
|
||||||
base.freeMemory<double>(grn_ptr, sizerho);
|
|
||||||
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
|
||||||
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
|
||||||
base.freeMemory<double>(rho2_ptr, sizerho);
|
|
||||||
|
|
||||||
delete[] rho_out;
|
|
||||||
delete[] rho;
|
/* free memory on device */
|
||||||
delete[] mirror_out;
|
//if (rank == 0) {
|
||||||
cout << "Final sum: " << old_sum << endl;
|
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||||
|
base.freeMemory<double>(grn_ptr, sizerho);
|
||||||
|
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
||||||
|
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
base.freeMemory<double>(rho2_ptr, sizerho);
|
||||||
|
cout << "Final sum: " << old_sum << endl;
|
||||||
|
/*} else {
|
||||||
|
base.closeHandle(rho2_ptr);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
}*/
|
||||||
|
|
||||||
|
//MPI_Finalize();
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,81 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sys/time.h>
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int size = 10;
|
|
||||||
bool apiSet = false;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == string("-cuda")) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
apiSet = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
apiSet = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-N")) {
|
|
||||||
size = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!apiSet) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
cout << "=========================BEGIN TEST=========================" << endl;
|
|
||||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
|
||||||
cout << "Number of randoms: " << size << endl;
|
|
||||||
|
|
||||||
//init dks
|
|
||||||
int ierr;
|
|
||||||
DKSBase base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(api_name));
|
|
||||||
base.initDevice();
|
|
||||||
base.callInitRandoms(size);
|
|
||||||
|
|
||||||
//create host vector to store results
|
|
||||||
double *host_data = new double[size];
|
|
||||||
|
|
||||||
//create device vector
|
|
||||||
void *device_data = base.allocateMemory<double>(size, ierr);
|
|
||||||
|
|
||||||
for (int i = 0; i < 5; i++) {
|
|
||||||
//fill device vector with random values
|
|
||||||
base.callCreateRandomNumbers(device_data, size);
|
|
||||||
|
|
||||||
//read device vector
|
|
||||||
base.readData<double>(device_data, host_data, size);
|
|
||||||
|
|
||||||
//print host data
|
|
||||||
for (int i = 0; i < size; i++)
|
|
||||||
cout << host_data[i] << " ";
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
//free device vector
|
|
||||||
base.freeMemory<double>(device_data, size);
|
|
||||||
|
|
||||||
//free host data
|
|
||||||
delete[] host_data;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
Reference in New Issue
Block a user