Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
8f00d2a593 | |||
1606b641d4 |
@ -1,8 +1,10 @@
|
|||||||
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
|
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
|
||||||
PROJECT (DKS)
|
PROJECT (DKS)
|
||||||
SET (DKS_VERSION_MAJOR 1)
|
SET (DKS_VERSION_MAJOR 1)
|
||||||
SET (DKS_VERSION_MINOR 1)
|
SET (DKS_VERSION_MINOR 0)
|
||||||
SET (DKS_VERSION_PATCH 2)
|
SET (DKS_VERSION_PATCH 2)
|
||||||
|
SET (PACKAGE \"dks\")
|
||||||
|
|
||||||
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
|
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
|
||||||
SET (PACKAGE \"dks\")
|
SET (PACKAGE \"dks\")
|
||||||
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
|
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
|
||||||
@ -28,60 +30,24 @@ MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
|
|||||||
set (BOOSTROOT $ENV{BOOST_DIR})
|
set (BOOSTROOT $ENV{BOOST_DIR})
|
||||||
SET (Boost_USE_STATIC_LIBS OFF)
|
SET (Boost_USE_STATIC_LIBS OFF)
|
||||||
SET (Boost_USE_STATIC_RUNTIME OFF)
|
SET (Boost_USE_STATIC_RUNTIME OFF)
|
||||||
#FIND_PACKAGE(Boost 1.55 REQUIRED COMPONENTS filesystem system)
|
FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)
|
||||||
FIND_PACKAGE(Boost 1.41 REQUIRED)
|
|
||||||
IF (Boost_FOUND)
|
IF (Boost_FOUND)
|
||||||
MESSAGE (STATUS "Boost version: ${Boost_VERSION}")
|
|
||||||
MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
|
MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
|
||||||
MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
|
MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
|
||||||
#MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
|
MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")
|
||||||
INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
|
INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
|
||||||
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
||||||
ENDIF (Boost_FOUND)
|
ENDIF (Boost_FOUND)
|
||||||
|
|
||||||
#include OPAL, musrfit or pet kernels
|
|
||||||
OPTION(DKS_FULL "Compile DKS with full library" OFF)
|
|
||||||
OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
|
|
||||||
OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
|
|
||||||
OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
|
|
||||||
|
|
||||||
IF (DKS_FULL)
|
|
||||||
SET(ENABLE_OPAL ON)
|
|
||||||
SET(ENABLE_MUSR ON)
|
|
||||||
SET(ENABLE_PET ON)
|
|
||||||
ENDIF(DKS_FULL)
|
|
||||||
|
|
||||||
#find clFFT
|
|
||||||
OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
|
|
||||||
IF (ENABLE_AMD)
|
|
||||||
SET (clFFT_USE_STATIC_LIBS OFF)
|
|
||||||
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
|
|
||||||
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
|
|
||||||
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
|
|
||||||
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
|
|
||||||
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
#find clRNG
|
|
||||||
#SET (clRNG_USE_STATIC_LIBS OFF)
|
|
||||||
#FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
|
|
||||||
#MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
|
|
||||||
#MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
|
|
||||||
#INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
|
|
||||||
#LINK_DIRECTORIES (${CLRNG_LIBRARIES})
|
|
||||||
#find_package(PkgConfig)
|
|
||||||
#pkg_check_modules(clRng REQUIRED)
|
|
||||||
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
|
|
||||||
ENDIF (ENABLE_AMD)
|
|
||||||
|
|
||||||
#enable UQTK
|
#enable UQTK
|
||||||
OPTION (USE_UQTK "Use UQTK" OFF)
|
OPTION (USE_UQTK "Use UQTK" OFF)
|
||||||
|
|
||||||
|
|
||||||
#intel icpc compiler specific flags
|
#intel icpc compiler specific flags
|
||||||
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
||||||
|
|
||||||
#for intel compiler turn on openmp and opencl
|
#for intel compiler turn on openmp and opencl
|
||||||
OPTION (USE_OPENCL "Use OpenCL" OFF)
|
OPTION (USE_OPENCL "Use OpenCL" ON)
|
||||||
OPTION (USE_CUDA "Use CUDA" OFF)
|
OPTION (USE_CUDA "Use CUDA" OFF)
|
||||||
OPTION (USE_MIC "Use intel MIC" ON)
|
OPTION (USE_MIC "Use intel MIC" ON)
|
||||||
|
|
||||||
@ -115,21 +81,15 @@ ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
|
|||||||
IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
|
IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL)
|
||||||
|
|
||||||
|
|
||||||
OPTION (USE_OPENCL "Use OpenCL" OFF)
|
OPTION (USE_OPENCL "Use OpenCL" ON)
|
||||||
OPTION (USE_CUDA "Use CUDA" OFF)
|
OPTION (USE_CUDA "Use CUDA" OFF)
|
||||||
OPTION (USE_MIC "Use intel MIC" OFF)
|
OPTION (USE_MIC "Use intel MIC" OFF)
|
||||||
OPTION (STATIC_CUDA "Link static cuda libraries" OFF)
|
|
||||||
|
|
||||||
IF (ENABLE_MUSR)
|
|
||||||
SET (USE_OPENCL ON)
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")
|
||||||
|
|
||||||
FIND_PACKAGE(CUDA)
|
FIND_PACKAGE(CUDA)
|
||||||
IF (CUDA_FOUND)
|
IF (CUDA_FOUND)
|
||||||
SET (USE_CUDA ON)
|
SET (USE_CUDA ON)
|
||||||
OPTION(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cuda libraries" OFF)
|
|
||||||
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
|
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
|
||||||
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
|
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
|
||||||
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
|
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
|
||||||
@ -139,27 +99,20 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
|
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
|
||||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||||
|
|
||||||
SET (CUDA_NVCC_FLAGS "-arch=sm_35;-DDEBUG;-std=c++11;-D__wsu;-fmad=false")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
|
||||||
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${OPENCL_KERNELS}")
|
SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
|
||||||
|
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -DDEBUG -std=c++11 -D__wsu")
|
||||||
IF (NOT STATIC_CUDA)
|
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA")
|
|
||||||
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart -lcufft -lcublas")
|
|
||||||
ELSE (NOT STATIC_CUDA)
|
|
||||||
SET (CUDA_SEPARABLE_COMPILATION ON)
|
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_CUDA -fPIC")
|
|
||||||
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-rdc=true;-lcufft_static;-lcublas_static;-lcurand_static")
|
|
||||||
SET (DKS_CUDA_LIBS "-lcudadevrt -lcudart_static -lcufft_static -lcublas_static -lculibos")
|
|
||||||
ENDIF (NOT STATIC_CUDA)
|
|
||||||
|
|
||||||
#if cuda version >= 7.0 add runtime commpilation flags
|
#if cuda version >= 7.0 add runtime commpilation flags
|
||||||
IF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
|
IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
|
||||||
ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0" AND ENABLE_MUSR)
|
ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
|
|
||||||
MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
|
MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")
|
||||||
|
|
||||||
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
|
||||||
|
#set(CUDA_SEPARABLE_COMPILATION ON)
|
||||||
SET(BUILD_SHARED_LIBS OFF)
|
SET(BUILD_SHARED_LIBS OFF)
|
||||||
|
|
||||||
ENDIF (CUDA_FOUND)
|
ENDIF (CUDA_FOUND)
|
||||||
@ -186,9 +139,9 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
|
|||||||
ENDIF(APPLE AND NOT CUDA_FOUND)
|
ENDIF(APPLE AND NOT CUDA_FOUND)
|
||||||
|
|
||||||
#if cuda found set cuda opencl flags
|
#if cuda found set cuda opencl flags
|
||||||
IF (CUDA_FOUND AND USE_OPENCL)
|
IF (CUDA_FOUND)
|
||||||
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
|
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
|
||||||
ENDIF (CUDA_FOUND AND USE_OPENCL)
|
ENDIF (CUDA_FOUND)
|
||||||
|
|
||||||
#if cuda not found but amd opencl found set opencl flags
|
#if cuda not found but amd opencl found set opencl flags
|
||||||
IF (NOT CUDA_FOUND AND OpenCL_FOUND)
|
IF (NOT CUDA_FOUND AND OpenCL_FOUND)
|
||||||
@ -232,3 +185,4 @@ INSTALL (
|
|||||||
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
|
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
|
||||||
RENAME ${PROJECT_NAME}ConfigVersion.cmake
|
RENAME ${PROJECT_NAME}ConfigVersion.cmake
|
||||||
)
|
)
|
||||||
|
|
||||||
|
34
ReadMe.first
34
ReadMe.first
@ -29,30 +29,30 @@ Intel MIC compilers (optional)
|
|||||||
######Source######
|
######Source######
|
||||||
https://gitlab.psi.ch/uldis_l/DKS
|
https://gitlab.psi.ch/uldis_l/DKS
|
||||||
|
|
||||||
######Changes from DKS-1.0.x version######
|
|
||||||
DKS is split into three modules that can be enabled/disabled at compile time depending on which software it is used for.
|
|
||||||
By default only DKSBase and DKSFFT modules are enabled. In order to install other modules the necessary otion needs to be enabled.
|
|
||||||
Supported options are:
|
|
||||||
-DENABLE_OPAL option should be enabled if DKS will be used for OPAL
|
|
||||||
-DENABLE_MUSR option should be enable if DKS will be used for musrfit
|
|
||||||
-DENABLE_PET option should be enabled if DKS will be used for PET image reconstruction
|
|
||||||
|
|
||||||
See install instructions for more details on how to enable the necessary options in DKS
|
|
||||||
|
|
||||||
######Install######
|
######Install######
|
||||||
#consult the https://gitlab.psi.ch/uldis_l/DKS/wikis/home for full install isntructions
|
|
||||||
|
|
||||||
#clone DKS
|
#clone DKS
|
||||||
git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
|
git clone git@gitlab.psi.ch:uldis_l/DKS.git DKS
|
||||||
|
|
||||||
#switch to the desired version (OPTIONAL)
|
#set compilers to use
|
||||||
git checkout DKS-1.1.0
|
#supported c++ compilers: g++, icpc, mpicxx whith g++
|
||||||
|
#supported c compilers: gcc, icc, mpicc whith gcc
|
||||||
|
export CXX_COMPILER=cpp_compiler_name
|
||||||
|
export CC_COMPILER=c_compiler_name
|
||||||
|
|
||||||
#configure installation in build directory
|
#set dks root directory directory
|
||||||
#enable DKS modules to compile -DENABLE_OPAL, -DENABLE_MUSR, -DENABLE_PET
|
cd DKS
|
||||||
CXX=<c++ compiler> CC=<c compiler> -DCMAKE_INSTALL_PREFIX=<install dir> <path to DKS source> [-DENABLE_OPAL=1 -DENABLE_MUSR=1 -DENABLE_PET=1]
|
export DKS_ROOT = $PWD
|
||||||
|
|
||||||
|
#set build directory
|
||||||
|
mkdir $DKS_BUILD_DIR
|
||||||
|
cd $DKS_BUILD_DIR
|
||||||
|
|
||||||
|
#set install directory
|
||||||
|
export DKS_INSTALL_DIR = $DKS_BUILD_DIR #default is /usr/local/
|
||||||
|
|
||||||
|
CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT
|
||||||
|
|
||||||
#install DKS
|
|
||||||
make
|
make
|
||||||
make install
|
make install
|
||||||
|
|
||||||
|
@ -2,32 +2,18 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
||||||
|
|
||||||
#chi square kernel tests
|
#chi square kernel tests
|
||||||
IF (ENABLE_MUSR)
|
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
||||||
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
|
||||||
|
|
||||||
IF (USE_UQTK)
|
|
||||||
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
|
||||||
ENDIF (USE_UQTK)
|
|
||||||
|
|
||||||
#test to verify search functions
|
|
||||||
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testSearch dks ${CLFFT_LIBRARIES})
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
|
||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testPushKick testPushKick.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testPushKick dks ${CLFFT_LIBRARIES})
|
|
||||||
ENDIF(ENABLE_OPAL)
|
|
||||||
|
|
||||||
ADD_EXECUTABLE(testFFT testFFT.cpp)
|
|
||||||
TARGET_LINK_LIBRARIES(testFFT dks ${CLFFT_LIBRARIES})
|
|
||||||
|
|
||||||
|
IF (USE_UQTK)
|
||||||
|
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
|
||||||
|
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
ENDIF (USE_UQTK)
|
||||||
|
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
|
||||||
|
|
||||||
|
#test to verify search functions
|
||||||
|
ADD_EXECUTABLE(testSearch testSearch.cpp)
|
||||||
|
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})
|
||||||
|
@ -1,161 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int label;
|
|
||||||
unsigned localID;
|
|
||||||
double Rincol[3];
|
|
||||||
double Pincol[3];
|
|
||||||
} PART;
|
|
||||||
|
|
||||||
PART initPartSmall(int d) {
|
|
||||||
|
|
||||||
PART p;
|
|
||||||
p.label = 0;
|
|
||||||
p.localID = d;
|
|
||||||
|
|
||||||
p.Rincol[0] = 0.0;
|
|
||||||
p.Rincol[1] = 0.0;
|
|
||||||
p.Rincol[2] = 0.02;
|
|
||||||
|
|
||||||
p.Pincol[0] = 0.0;
|
|
||||||
p.Pincol[1] = 0.0;
|
|
||||||
p.Pincol[2] = 3.9920183237269791e-01;
|
|
||||||
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
void printPart(PART p) {
|
|
||||||
std::cout << "label: " << p.label << ", ";
|
|
||||||
std::cout << "localid: " << p.localID << ",";
|
|
||||||
std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
|
|
||||||
std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void initParts(PART *p, int N) {
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
p[i] = initPartSmall(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
void printParts(PART *p, int N) {
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
printPart(p[i]);
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void initParams(double *data) {
|
|
||||||
data[0] = 0.0;//2.0000000000000000e-02;
|
|
||||||
data[1] = 1.0;//1.0000000000000000e-02;
|
|
||||||
data[2] = 2.2100000000000000e+00;
|
|
||||||
data[3] = 6.0000000000000000e+00;
|
|
||||||
data[4] = 1.2010700000000000e+01;
|
|
||||||
data[5] = 2.6010000000000000e+00;
|
|
||||||
data[6] = 1.7010000000000000e+03;
|
|
||||||
data[7] = 1.2790000000000000e+03;
|
|
||||||
data[8] = 1.6379999999999999e-02;
|
|
||||||
data[9] = 1.9321266968325795e-01;
|
|
||||||
data[10] = 7.9000000000000000e+01;
|
|
||||||
data[11] = 1.0000000000000002e-12;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int loop = 10;
|
|
||||||
int numpart = 1e5;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-npart")) {
|
|
||||||
numpart = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-loop")) {
|
|
||||||
loop = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "=========================BEGIN TEST=========================" << std::endl;
|
|
||||||
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
|
|
||||||
std::cout << "Number of particles: " << numpart << std::endl;
|
|
||||||
std::cout << "Number of loops: " << loop << std::endl;
|
|
||||||
std::cout << "------------------------------------------------------------" << std::endl;
|
|
||||||
|
|
||||||
//init part vector to test mc
|
|
||||||
PART *parts = new PART[numpart];
|
|
||||||
initParts(parts, numpart);
|
|
||||||
|
|
||||||
double *params = new double[12];
|
|
||||||
initParams(params);
|
|
||||||
|
|
||||||
//init dks
|
|
||||||
int ierr;
|
|
||||||
DKSOPAL base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(api_name));
|
|
||||||
ierr = base.initDevice();
|
|
||||||
if (ierr != DKS_SUCCESS)
|
|
||||||
std::cout << "Error with init device!" << std::endl;
|
|
||||||
|
|
||||||
//init random
|
|
||||||
base.callInitRandoms(numpart);
|
|
||||||
|
|
||||||
//**test collimator physics and sort***//
|
|
||||||
void *part_ptr, *param_ptr;
|
|
||||||
|
|
||||||
//allocate memory for particles
|
|
||||||
part_ptr = base.allocateMemory<PART>(numpart, ierr);
|
|
||||||
param_ptr = base.allocateMemory<double>(12, ierr);
|
|
||||||
|
|
||||||
//transfer data to device
|
|
||||||
base.writeData<PART>(part_ptr, parts, numpart);
|
|
||||||
base.writeData<double>(param_ptr, params, 12);
|
|
||||||
|
|
||||||
int numaddback;
|
|
||||||
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
|
|
||||||
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
|
|
||||||
base.syncDevice();
|
|
||||||
|
|
||||||
//read data from device
|
|
||||||
base.readData<PART>(part_ptr, parts, numpart);
|
|
||||||
|
|
||||||
//free memory
|
|
||||||
base.freeMemory<PART>(part_ptr, numpart);
|
|
||||||
base.freeMemory<double>(param_ptr, 12);
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
std::cout << parts[i].label << "\t"
|
|
||||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
|
||||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
|
||||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
|
||||||
<< std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++) {
|
|
||||||
std::cout << parts[i].label << "\t"
|
|
||||||
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
|
|
||||||
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
|
|
||||||
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
|
|
||||||
<< std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,214 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <complex>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim);
|
|
||||||
void compareData(double* data1, double *data2, int N, int dim);
|
|
||||||
|
|
||||||
void initData(complex<double> *data, int dimsize[3], int dim);
|
|
||||||
void initData(double *data, int dimsize[3], int dim);
|
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
|
|
||||||
char *api_name, char *device_name);
|
|
||||||
|
|
||||||
void printHelp();
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int ierr;
|
|
||||||
int N1 = 8;
|
|
||||||
int N2 = 8;
|
|
||||||
int N3 = 8;
|
|
||||||
int dim = 3;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, dim, api_name, device_name) )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
|
||||||
|
|
||||||
int dimsize[3] = {N1, N2, N3};
|
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
|
||||||
|
|
||||||
double *rdata = new double[sizereal];
|
|
||||||
double *ordata = new double[sizereal];
|
|
||||||
complex<double> *cdata = new complex<double>[sizereal];
|
|
||||||
complex<double> *codata = new complex<double>[sizereal];
|
|
||||||
|
|
||||||
initData(rdata, dimsize, 3);
|
|
||||||
initData(cdata, dimsize, 3);
|
|
||||||
|
|
||||||
/* init DKSBase */
|
|
||||||
cout << "Init device and set function" << endl;
|
|
||||||
DKSFFT base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(device_name));
|
|
||||||
cout << "init device" << endl;
|
|
||||||
base.initDevice();
|
|
||||||
cout << "setup fft" << endl;
|
|
||||||
base.setupFFT(dim, dimsize);
|
|
||||||
|
|
||||||
//Test RC FFT -> CR FFT
|
|
||||||
void *real_ptr, *comp_ptr, *res_ptr;
|
|
||||||
cout << "allocate memory" << endl;
|
|
||||||
real_ptr = base.allocateMemory<double>(sizereal, ierr);
|
|
||||||
res_ptr = base.allocateMemory<double>(sizereal, ierr);
|
|
||||||
comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);
|
|
||||||
|
|
||||||
cout << "write data" << endl;
|
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
|
||||||
|
|
||||||
cout << "perform fft" << endl;
|
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
|
||||||
base.callC2RFFT(res_ptr, comp_ptr, dim, dimsize);
|
|
||||||
base.callNormalizeC2RFFT(res_ptr, dim, dimsize);
|
|
||||||
|
|
||||||
cout << "read data" << endl;
|
|
||||||
base.readData<double>(res_ptr, ordata, sizereal);
|
|
||||||
|
|
||||||
compareData(rdata, ordata, N1, 3);
|
|
||||||
|
|
||||||
base.freeMemory<double>(real_ptr, sizereal);
|
|
||||||
base.freeMemory<double>(res_ptr, sizereal);
|
|
||||||
base.freeMemory< complex<double> >(comp_ptr, sizecomp);
|
|
||||||
|
|
||||||
//Test CC FFT
|
|
||||||
void *mem_ptr;
|
|
||||||
mem_ptr = base.allocateMemory< complex<double> >(sizereal, ierr);
|
|
||||||
base.writeData< complex<double> >(mem_ptr, cdata, sizereal);
|
|
||||||
base.callFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.callIFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.callNormalizeFFT(mem_ptr, 3, dimsize);
|
|
||||||
base.readData< complex<double> >(mem_ptr, codata, sizereal);
|
|
||||||
|
|
||||||
compareData(cdata, codata, N1, 3);
|
|
||||||
|
|
||||||
base.freeMemory< complex<double> > (mem_ptr, sizereal);
|
|
||||||
|
|
||||||
delete[] rdata;
|
|
||||||
delete[] ordata;
|
|
||||||
delete[] cdata;
|
|
||||||
delete[] codata;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void compareData(complex<double>* data1, complex<double>* data2, int N, int dim) {
|
|
||||||
int ni, nj, nk, id;
|
|
||||||
ni = (dim > 2) ? N : 1;
|
|
||||||
nj = (dim > 1) ? N : 1;
|
|
||||||
nk = N;
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < ni; i++) {
|
|
||||||
for (int j = 0; j < nj; j++) {
|
|
||||||
for (int k = 0; k < nk; k++) {
|
|
||||||
id = i*ni*ni + j*nj + k;
|
|
||||||
sum += fabs(data1[id].real() - data2[id].real());
|
|
||||||
sum += fabs(data1[id].imag() - data2[id].imag());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void compareData(double* data1, double* data2, int N, int dim) {
|
|
||||||
int ni, nj, nk, id;
|
|
||||||
ni = (dim > 2) ? N : 1;
|
|
||||||
nj = (dim > 1) ? N : 1;
|
|
||||||
nk = N;
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < ni; i++) {
|
|
||||||
for (int j = 0; j < nj; j++) {
|
|
||||||
for (int k = 0; k < nk; k++) {
|
|
||||||
id = i*ni*ni + j*nj + k;
|
|
||||||
sum += fabs(data1[id] - data2[id]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << "Size " << N << " RC <--> CR diff: " << sum << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void initData(complex<double> *data, int dimsize[3], int dim) {
|
|
||||||
if (dim == 3) {
|
|
||||||
for (int i = 0; i < dimsize[2]; i++)
|
|
||||||
for (int j = 0; j < dimsize[1]; j++)
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
|
|
||||||
} else if (dim == 2) {
|
|
||||||
for (int j = 0; j < dimsize[1]; j++) {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++) {
|
|
||||||
data[j*dimsize[0] + k] = complex<double>(sin(k), 0.0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[k] = complex<double>(sin(k), 0.0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void initData(double *data, int dimsize[3], int dim) {
|
|
||||||
if (dim == 3) {
|
|
||||||
for (int i = 0; i < dimsize[2]; i++)
|
|
||||||
for (int j = 0; j < dimsize[1]; j++)
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
|
|
||||||
} else if (dim == 2) {
|
|
||||||
for (int j = 0; j < dimsize[1]; j++) {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++) {
|
|
||||||
data[j*dimsize[0] + k] = sin(k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[k] = sin(k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &dim,
|
|
||||||
char *api_name, char *device_name)
|
|
||||||
{
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if ( argv[i] == std::string("-dim")) {
|
|
||||||
dim = atoi(argv[i + 1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( argv[i] == std::string("-grid") ) {
|
|
||||||
N1 = atoi(argv[i + 1]);
|
|
||||||
N2 = atoi(argv[i + 2]);
|
|
||||||
N3 = atoi(argv[i + 3]);
|
|
||||||
i += 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cuda")) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-cpu");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
@ -1,132 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
#include <vector_types.h>
|
|
||||||
#include "cuda_runtime.h"
|
|
||||||
|
|
||||||
void initData(double3 *data, int N) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
data[i].x = (double)rand() / RAND_MAX;
|
|
||||||
data[i].y = (double)rand() / RAND_MAX;
|
|
||||||
data[i].z = (double)rand() / RAND_MAX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void initDt(double *data, int N) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
data[i] = 0.00001;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int loop = 10;
|
|
||||||
int numpart = 1e5;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-npart")) {
|
|
||||||
numpart = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == std::string("-loop")) {
|
|
||||||
loop = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "=========================BEGIN TEST=========================" << std::endl;
|
|
||||||
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
|
|
||||||
std::cout << "Number of particles: " << numpart << std::endl;
|
|
||||||
std::cout << "Number of loops: " << loop << std::endl;
|
|
||||||
std::cout << "------------------------------------------------------------" << std::endl;
|
|
||||||
|
|
||||||
int ierr;
|
|
||||||
DKSOPAL dksbase;
|
|
||||||
dksbase.setAPI(api_name, strlen(api_name));
|
|
||||||
dksbase.setDevice(device_name, strlen(api_name));
|
|
||||||
ierr = dksbase.initDevice();
|
|
||||||
if (ierr != DKS_SUCCESS)
|
|
||||||
std::cout << "Error with init device!" << std::endl;
|
|
||||||
|
|
||||||
double3 *R = new double3[numpart];
|
|
||||||
double3 *P = new double3[numpart];
|
|
||||||
double3 *Ef = new double3[numpart];
|
|
||||||
double3 *Bf = new double3[numpart];
|
|
||||||
double *dt = new double[numpart];
|
|
||||||
|
|
||||||
initData(R, numpart);
|
|
||||||
initData(P, numpart);
|
|
||||||
initData(Ef, numpart);
|
|
||||||
initData(Bf, numpart);
|
|
||||||
initDt(dt, numpart);
|
|
||||||
|
|
||||||
void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
|
|
||||||
|
|
||||||
r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
|
|
||||||
dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
|
|
||||||
|
|
||||||
|
|
||||||
dksbase.writeData<double3>(r_ptr, R, numpart);
|
|
||||||
dksbase.writeData<double3>(p_ptr, P, numpart);
|
|
||||||
dksbase.writeData<double3>(ef_ptr, Ef, numpart);
|
|
||||||
dksbase.writeData<double3>(bf_ptr, Bf, numpart);
|
|
||||||
dksbase.writeData<double>(dt_ptr, dt, numpart);
|
|
||||||
|
|
||||||
for (int i = 0; i < loop; ++i)
|
|
||||||
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
|
|
||||||
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std::cout << "============" << std::endl;
|
|
||||||
|
|
||||||
dksbase.readData<double3>(r_ptr, R, numpart);
|
|
||||||
|
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
std:: cout << "..." << std::endl;
|
|
||||||
|
|
||||||
for (int i = numpart - 10; i < numpart; i++)
|
|
||||||
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
|
|
||||||
|
|
||||||
dksbase.freeMemory<double3>(r_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(p_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(ef_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double3>(bf_ptr, numpart);
|
|
||||||
dksbase.freeMemory<double>(dt_ptr, numpart);
|
|
||||||
|
|
||||||
delete[] R;
|
|
||||||
delete[] P;
|
|
||||||
delete[] Ef;
|
|
||||||
delete[] Bf;
|
|
||||||
delete[] dt;
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -3,7 +3,5 @@ SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
|
|||||||
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
|
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
|
||||||
SET(${PROJECT_NAME}_LIBRARY "dks")
|
SET(${PROJECT_NAME}_LIBRARY "dks")
|
||||||
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
|
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
|
||||||
SET(DKS_CUDA_STATIC ${STATIC_CUDA})
|
|
||||||
SET(DKS_CUDA_LIBS "${DKS_CUDA_LIBS}")
|
|
||||||
SET(DKS_VERSION ${DKS_VERSION})
|
SET(DKS_VERSION ${DKS_VERSION})
|
||||||
SET(DKS_VERSION_STR ${DKS_VERSION_STR})
|
SET(DKS_VERSION_STR ${DKS_VERSION_STR})
|
BIN
doc/refman.pdf
BIN
doc/refman.pdf
Binary file not shown.
@ -6,7 +6,6 @@ SET (_HDRS
|
|||||||
ImageReconstruction.h
|
ImageReconstruction.h
|
||||||
CollimatorPhysics.h
|
CollimatorPhysics.h
|
||||||
FFT.h
|
FFT.h
|
||||||
GreensFunction.h
|
|
||||||
)
|
)
|
||||||
|
|
||||||
ADD_SOURCES (${_SRCS})
|
ADD_SOURCES (${_SRCS})
|
||||||
|
@ -5,7 +5,10 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
|
class DKSBaseMuSR;
|
||||||
|
|
||||||
class DKSCollimatorPhysics {
|
class DKSCollimatorPhysics {
|
||||||
|
friend class DKSBaseMuSR;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
@ -16,8 +19,7 @@ public:
|
|||||||
|
|
||||||
virtual ~DKSCollimatorPhysics() { }
|
virtual ~DKSCollimatorPhysics() { }
|
||||||
|
|
||||||
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
|
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;
|
||||||
bool enableRutherforScattering = true) = 0;
|
|
||||||
|
|
||||||
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
@ -34,10 +36,6 @@ public:
|
|||||||
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false, int streamId = -1) = 0;
|
double dt, double c, bool usedt = false, int streamId = -1) = 0;
|
||||||
|
|
||||||
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart, double c, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
|
||||||
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
void *orient_ptr, int npart, int nsec, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false,
|
double dt, double c, bool usedt = false,
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
class BaseFFT {
|
class DKSFFT {
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
int defaultN[3];
|
int defaultN[3];
|
||||||
@ -22,7 +22,7 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
virtual ~BaseFFT() { }
|
virtual ~DKSFFT() { }
|
||||||
|
|
||||||
virtual int setupFFT(int ndim, int N[3]) = 0;
|
virtual int setupFFT(int ndim, int N[3]) = 0;
|
||||||
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
|
virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
|
||||||
|
@ -1,29 +0,0 @@
|
|||||||
#ifndef H_GREENSFUNCTION
|
|
||||||
#define H_GREENSFUNCTION
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
class GreensFunction {
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
virtual ~GreensFunction() { }
|
|
||||||
|
|
||||||
/** calc greens integral, as defined in OPAL */
|
|
||||||
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** integration if rho2_m, see OPAL for more details */
|
|
||||||
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
|
|
||||||
int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** mirror rho2_m field */
|
|
||||||
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
/** multiply two complex fields from device memory */
|
|
||||||
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -11,7 +11,7 @@
|
|||||||
#include <boost/optional/optional.hpp>
|
#include <boost/optional/optional.hpp>
|
||||||
#include <boost/property_tree/xml_parser.hpp>
|
#include <boost/property_tree/xml_parser.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
//#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -24,7 +24,7 @@
|
|||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
namespace pt = boost::property_tree;
|
namespace pt = boost::property_tree;
|
||||||
//namespace fs = boost::filesystem;
|
namespace fs = boost::filesystem;
|
||||||
|
|
||||||
const std::string config_dir = "/.config/DKS";
|
const std::string config_dir = "/.config/DKS";
|
||||||
const std::string config_file = "/autotuning.xml";
|
const std::string config_file = "/autotuning.xml";
|
||||||
|
@ -35,29 +35,13 @@ ENDMACRO ()
|
|||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
DKSBase.h
|
DKSBase.h
|
||||||
DKSDefinitions.h
|
DKSDefinitions.h
|
||||||
DKSFFT.h
|
|
||||||
)
|
)
|
||||||
|
|
||||||
SET (DKS_BASEDIR_SRCS
|
SET (DKS_BASEDIR_SRCS
|
||||||
DKSBase.cpp
|
DKSBase.cpp
|
||||||
DKSFFT.cpp
|
|
||||||
)
|
)
|
||||||
|
|
||||||
#add opal to DKS if enable_opal is set
|
IF (USE_CUDA OR USE_OPENCL)
|
||||||
IF (ENABLE_OPAL)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
|
||||||
${DKS_BASEDIR_HDRS}
|
|
||||||
DKSOPAL.h
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (DKS_BASEDIR_SRCS
|
|
||||||
${DKS_BASEDIR_SRCS}
|
|
||||||
DKSOPAL.cpp
|
|
||||||
)
|
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
#and musrt to DKS if cuda or opencl is used and enable_musr is set
|
|
||||||
IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
${DKS_BASEDIR_HDRS}
|
${DKS_BASEDIR_HDRS}
|
||||||
DKSBaseMuSR.h
|
DKSBaseMuSR.h
|
||||||
@ -67,10 +51,9 @@ IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
|||||||
${DKS_BASEDIR_SRCS}
|
${DKS_BASEDIR_SRCS}
|
||||||
DKSBaseMuSR.cpp
|
DKSBaseMuSR.cpp
|
||||||
)
|
)
|
||||||
ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
|
ENDIF (USE_CUDA OR USE_OPENCL)
|
||||||
|
|
||||||
#add image reconstruction to DKS if cuda is used and enable_pet is set
|
IF (USE_CUDA)
|
||||||
IF (USE_CUDA AND ENABLE_PET)
|
|
||||||
SET (DKS_BASEDIR_HDRS
|
SET (DKS_BASEDIR_HDRS
|
||||||
${DKS_BASEDIR_HDRS}
|
${DKS_BASEDIR_HDRS}
|
||||||
DKSImageReconstruction.h
|
DKSImageReconstruction.h
|
||||||
@ -80,7 +63,7 @@ IF (USE_CUDA AND ENABLE_PET)
|
|||||||
${DKS_BASEDIR_SRCS}
|
${DKS_BASEDIR_SRCS}
|
||||||
DKSImageReconstruction.cpp
|
DKSImageReconstruction.cpp
|
||||||
)
|
)
|
||||||
ENDIF (USE_CUDA AND ENABLE_PET)
|
ENDIF (USE_CUDA)
|
||||||
|
|
||||||
ADD_HEADERS (${DKS_BASEDIR_HDRS})
|
ADD_HEADERS (${DKS_BASEDIR_HDRS})
|
||||||
ADD_SOURCES (${DKS_BASEDIR_SRCS})
|
ADD_SOURCES (${DKS_BASEDIR_SRCS})
|
||||||
@ -112,18 +95,26 @@ IF (USE_CUDA)
|
|||||||
CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
|
CUDA_ADD_LIBRARY(dks ${DKS_SRCS})
|
||||||
CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
||||||
|
|
||||||
TARGET_LINK_LIBRARIES(dks ${DKS_CUDA_LIBS})
|
IF (USE_UQTK)
|
||||||
TARGET_LINK_LIBRARIES(dksshared ${DKS_CUDA_LIBS})
|
TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
#TARGET_LINK_LIBRARIES(dks)
|
TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
#TARGET_LINK_LIBRARIES(dksshared)
|
ELSE (USE_UQTK)
|
||||||
|
TARGET_LINK_LIBRARIES(dks cudadevrt)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared cudadevrt)
|
||||||
|
ENDIF (USE_UQTK)
|
||||||
|
|
||||||
ELSE (USE_CUDA)
|
ELSE (USE_CUDA)
|
||||||
MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
|
MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}")
|
||||||
ADD_LIBRARY(dks ${DKS_SRCS})
|
ADD_LIBRARY(dks ${DKS_SRCS})
|
||||||
ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
ADD_LIBRARY(dksshared SHARED ${DKS_SRCS})
|
||||||
|
|
||||||
TARGET_LINK_LIBRARIES(dks)
|
IF (USE_UQTK)
|
||||||
TARGET_LINK_LIBRARIES(dksshared)
|
TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
|
||||||
|
ELSE (USE_UQTK)
|
||||||
|
TARGET_LINK_LIBRARIES(dks)
|
||||||
|
TARGET_LINK_LIBRARIES(dksshared)
|
||||||
|
ENDIF(USE_UQTK)
|
||||||
|
|
||||||
ENDIF (USE_CUDA)
|
ENDIF (USE_CUDA)
|
||||||
|
|
||||||
|
@ -1,27 +1,35 @@
|
|||||||
SET (_HDRS CudaBase.cuh CudaFFT.cuh)
|
SET (_HDRS
|
||||||
SET (_SRCS CudaBase.cu CudaFFT.cu)
|
CudaBase.cuh
|
||||||
|
CudaFFT.cuh
|
||||||
|
CudaGreensFunction.cuh
|
||||||
|
CudaChiSquare.cuh
|
||||||
|
CudaCollimatorPhysics.cuh
|
||||||
|
CudaImageReconstruction.cuh
|
||||||
|
CudaChiSquareRuntime.cuh
|
||||||
|
)
|
||||||
|
|
||||||
|
SET (_SRCS
|
||||||
|
CudaBase.cu
|
||||||
|
CudaFFT.cu
|
||||||
|
CudaGreensFunction.cu
|
||||||
|
CudaChiSquare.cu
|
||||||
|
CudaCollimatorPhysics.cu
|
||||||
|
CudaImageReconstruction.cu
|
||||||
|
CudaChiSquareRuntime.cu
|
||||||
|
)
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
#INCLUDE_DIRECTORIES (
|
||||||
SET (_HDRS ${_HDRS} CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
SET (_SRCS ${_SRCS} CudaGreensFunction.cu CudaCollimatorPhysics.cu)
|
#)
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
IF (ENABLE_MUSR)
|
|
||||||
SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
|
|
||||||
SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
|
|
||||||
SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
|
|
||||||
ENDIF (ENABLE_MUSR)
|
|
||||||
|
|
||||||
IF (ENABLE_PET)
|
|
||||||
SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
|
|
||||||
SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
|
|
||||||
ENDIF (ENABLE_PET)
|
|
||||||
|
|
||||||
MESSAGE (STATUS "CUDA headers: ${_HDRS}")
|
|
||||||
|
|
||||||
ADD_SOURCES(${_SRCS})
|
ADD_SOURCES(${_SRCS})
|
||||||
ADD_HEADERS(${_HDRS})
|
ADD_HEADERS(${_HDRS})
|
||||||
|
|
||||||
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
|
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
|
||||||
|
|
||||||
|
SET (_KERNELS
|
||||||
|
NVRTCKernels/CudaChiSquareKernel.cu
|
||||||
|
)
|
||||||
|
|
||||||
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
|
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)
|
||||||
|
|
||||||
|
@ -13,13 +13,6 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
|
|
||||||
|
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
if (idx < size)
|
|
||||||
data[idx] = curand_uniform_double(&state[idx]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//=====================================//
|
//=====================================//
|
||||||
//==========Private functions==========//
|
//==========Private functions==========//
|
||||||
@ -55,7 +48,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
|
|||||||
|
|
||||||
int threads = 128;
|
int threads = 128;
|
||||||
int blocks = size / threads + 1;
|
int blocks = size / threads + 1;
|
||||||
if (seed == -1)
|
if (seed == -1)
|
||||||
seed = time(NULL);
|
seed = time(NULL);
|
||||||
|
|
||||||
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
|
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
|
||||||
@ -76,15 +69,6 @@ int CudaBase::cuda_deleteCurandStates() {
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
|
|
||||||
int threads = BLOCK_SIZE;
|
|
||||||
int blocks = size / threads + 1;
|
|
||||||
|
|
||||||
kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
curandState* CudaBase::cuda_getCurandStates() {
|
curandState* CudaBase::cuda_getCurandStates() {
|
||||||
return defaultRndState;
|
return defaultRndState;
|
||||||
}
|
}
|
||||||
|
@ -12,10 +12,9 @@
|
|||||||
#include <cufft.h>
|
#include <cufft.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <curand_kernel.h>
|
#include <curand_kernel.h>
|
||||||
|
#include <nvToolsExt.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
#define BLOCK_SIZE 128
|
|
||||||
|
|
||||||
class CudaBase {
|
class CudaBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -40,7 +39,6 @@ public:
|
|||||||
* Init cuda random number (cuRand) states.
|
* Init cuda random number (cuRand) states.
|
||||||
* Create an array of type curandState with "size" elements on the GPU
|
* Create an array of type curandState with "size" elements on the GPU
|
||||||
* and create a curandState with different seed for each array entry.
|
* and create a curandState with different seed for each array entry.
|
||||||
* If no seed is given create a seed based on current time.
|
|
||||||
* Return success or error code
|
* Return success or error code
|
||||||
*/
|
*/
|
||||||
int cuda_createCurandStates(int size, int seed = -1);
|
int cuda_createCurandStates(int size, int seed = -1);
|
||||||
@ -52,11 +50,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
int cuda_deleteCurandStates();
|
int cuda_deleteCurandStates();
|
||||||
|
|
||||||
/** Create 'size' random numbers on the device and save in mem_ptr array
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
int cuda_createRandomNumbers(void *mem_ptr, int size);
|
|
||||||
|
|
||||||
/** Get a pointer to curand states
|
/** Get a pointer to curand states
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ -174,40 +167,6 @@ public:
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Zero CUDA memory.
|
|
||||||
* Set all the elements of the array on the device to zero.
|
|
||||||
*/
|
|
||||||
template<typename T>
|
|
||||||
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
|
|
||||||
cudaError cerror;
|
|
||||||
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
|
|
||||||
if (cerror != cudaSuccess) {
|
|
||||||
DEBUG_MSG("Error zeroing cuda memory!\n");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Zero CUDA memory.
|
|
||||||
* Set all the elements of the array on the device to zero.
|
|
||||||
*/
|
|
||||||
template<typename T>
|
|
||||||
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
|
|
||||||
int dkserror = DKS_SUCCESS;
|
|
||||||
cudaError cerror;
|
|
||||||
if (streamId < cuda_numberOfStreams()) {
|
|
||||||
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
|
|
||||||
cuda_getStream(streamId));
|
|
||||||
|
|
||||||
if (cerror != cudaSuccess)
|
|
||||||
dkserror = DKS_ERROR;
|
|
||||||
} else
|
|
||||||
dkserror = DKS_ERROR;
|
|
||||||
|
|
||||||
return dkserror;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Info: write data to memory
|
* Info: write data to memory
|
||||||
* Retrun: success or error code
|
* Retrun: success or error code
|
||||||
|
@ -23,10 +23,9 @@
|
|||||||
#define X0_M 9
|
#define X0_M 9
|
||||||
#define I_M 10
|
#define I_M 10
|
||||||
#define DT_M 11
|
#define DT_M 11
|
||||||
#define LOWENERGY_THR 12
|
|
||||||
|
|
||||||
#define BLOCK_SIZE 128
|
#define BLOCK_SIZE 128
|
||||||
#define NUMPAR 13
|
#define NUMPAR 12
|
||||||
|
|
||||||
__device__ inline double dot(double3 &d1, double3 &d2) {
|
__device__ inline double dot(double3 &d1, double3 &d2) {
|
||||||
|
|
||||||
@ -34,31 +33,6 @@ __device__ inline double dot(double3 &d1, double3 &d2) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
|
|
||||||
double3 tmp;
|
|
||||||
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
|
|
||||||
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
|
|
||||||
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline double3 ArbitraryRotation(double3 &W, double3 &Rorg, double Theta) {
|
|
||||||
double c=cos(Theta);
|
|
||||||
double s=sin(Theta);
|
|
||||||
double dotW = sqrt(dot(W,W));
|
|
||||||
W.x = W.x / dotW;
|
|
||||||
W.y = W.y / dotW;
|
|
||||||
W.z = W.z / dotW;
|
|
||||||
|
|
||||||
double dotWR = dot(W, Rorg) * (1.0 - c);
|
|
||||||
double3 crossW = cross(W, Rorg);
|
|
||||||
double3 tmp;
|
|
||||||
tmp.x = Rorg.x * c + crossW.x * s + W.x * dotWR;
|
|
||||||
tmp.y = Rorg.y * c + crossW.y * s + W.y * dotWR;
|
|
||||||
tmp.z = Rorg.z * c + crossW.z * s + W.z * dotWR;
|
|
||||||
return tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline bool checkHit(double &z, double *par) {
|
__device__ inline bool checkHit(double &z, double *par) {
|
||||||
|
|
||||||
/* check if particle is in the degrader material */
|
/* check if particle is in the degrader material */
|
||||||
@ -107,7 +81,7 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
|
|||||||
Eng = Eng + delta_E / 1E3;
|
Eng = Eng + delta_E / 1E3;
|
||||||
}
|
}
|
||||||
|
|
||||||
pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );
|
pdead = ((Eng<1E-4) || (dEdx>0));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,7 +92,6 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
|
|||||||
double Psixz;
|
double Psixz;
|
||||||
double pxz;
|
double pxz;
|
||||||
|
|
||||||
/*
|
|
||||||
if (px>=0 && pz>=0)
|
if (px>=0 && pz>=0)
|
||||||
Psixz = atan(px/pz);
|
Psixz = atan(px/pz);
|
||||||
else if (px>0 && pz<0)
|
else if (px>0 && pz<0)
|
||||||
@ -127,8 +100,7 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
|
|||||||
Psixz = atan(px/pz) + 2*PI;
|
Psixz = atan(px/pz) + 2*PI;
|
||||||
else
|
else
|
||||||
Psixz = atan(px/pz) + PI;
|
Psixz = atan(px/pz) + PI;
|
||||||
*/
|
|
||||||
Psixz = atan2(px, pz);
|
|
||||||
pxz = sqrt(px*px + pz*pz);
|
pxz = sqrt(px*px + pz*pz);
|
||||||
|
|
||||||
if(coord==1) {
|
if(coord==1) {
|
||||||
@ -145,9 +117,7 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
|
|||||||
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par,
|
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
|
||||||
bool enableRutherfordScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
|
||||||
double gamma = (Eng + M_P) / M_P;
|
double gamma = (Eng + M_P) / M_P;
|
||||||
@ -170,9 +140,20 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
|||||||
}
|
}
|
||||||
|
|
||||||
//__syncthreads();
|
//__syncthreads();
|
||||||
|
|
||||||
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
||||||
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
|
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
|
||||||
|
|
||||||
|
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
if(P2 < 0.0047) {
|
||||||
|
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
||||||
|
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
|
if(P4 > 0.5)
|
||||||
|
thetaru = -thetaru;
|
||||||
|
Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
|
||||||
|
}
|
||||||
|
|
||||||
// y-direction: See Physical Review, "Multiple Scattering"
|
// y-direction: See Physical Review, "Multiple Scattering"
|
||||||
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
||||||
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
|
||||||
@ -189,23 +170,14 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
|||||||
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
|
||||||
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
|
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
|
||||||
|
|
||||||
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
if( (P2 < 0.0047) && enableRutherfordScattering) {
|
if(P2 < 0.0047) {
|
||||||
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
//double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
|
||||||
double thetaru = 2.5 * sqrt(1 / P3) * 2.0 * theta0;
|
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
|
||||||
double phiru = 2.0 * M_PI * curand_uniform_double(&state);
|
if(P4 > 0.5)
|
||||||
double th0=atan2(sqrt(P.x*P.x+P.y*P.y),fabs(P.z));
|
thetaru = -thetaru;
|
||||||
double3 W,X;
|
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
|
||||||
|
|
||||||
double dotP = sqrt(dot(P,P));
|
|
||||||
X.x = cos(phiru)*sin(thetaru) * dotP;
|
|
||||||
X.y = sin(phiru)*sin(thetaru) * dotP;
|
|
||||||
X.z = cos(thetaru) * dotP;
|
|
||||||
W.x = -P.y;
|
|
||||||
W.y = P.x;
|
|
||||||
W.z = 0.0;
|
|
||||||
P = ArbitraryRotation(W, X, th0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -213,7 +185,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
|
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
|
||||||
int numparticles, bool enableRutherfordScattering)
|
int numparticles)
|
||||||
{
|
{
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
@ -255,7 +227,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
|||||||
P.x = P.x * ptot / sq;
|
P.x = P.x * ptot / sq;
|
||||||
P.y = P.y * ptot / sq;
|
P.y = P.y * ptot / sq;
|
||||||
P.z = P.z * ptot / sq;
|
P.z = P.z * ptot / sq;
|
||||||
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
|
coulombScat(R[tid], P, s, p);
|
||||||
|
|
||||||
data[idx].Pincol = P;
|
data[idx].Pincol = P;
|
||||||
} else {
|
} else {
|
||||||
@ -278,8 +250,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
|
|||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
||||||
curandState *state, int numparticles,
|
curandState *state, int numparticles)
|
||||||
bool enableRutherfordScattering)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
@ -317,7 +288,7 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
|
|||||||
P.x = P.x * ptot / sq;
|
P.x = P.x * ptot / sq;
|
||||||
P.y = P.y * ptot / sq;
|
P.y = P.y * ptot / sq;
|
||||||
P.z = P.z * ptot / sq;
|
P.z = P.z * ptot / sq;
|
||||||
coulombScat(R[tid], P, s, p, enableRutherfordScattering);
|
coulombScat(R[tid], P, s, p);
|
||||||
|
|
||||||
data.Pincol[idx] = P;
|
data.Pincol[idx] = P;
|
||||||
} else {
|
} else {
|
||||||
@ -452,7 +423,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
|
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
|
||||||
|
|
||||||
//get global id and thread id
|
//get global id and thread id
|
||||||
volatile int tid = threadIdx.x;
|
volatile int tid = threadIdx.x;
|
||||||
@ -478,51 +449,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
|
//TODO: kernel for push with switch off unitless positions with dt[i]*c
|
||||||
double3 *gBf, double *gdt, double charge,
|
|
||||||
double mass, int npart, double c)
|
|
||||||
{
|
|
||||||
volatile int tid = threadIdx.x;
|
|
||||||
volatile int idx = blockIdx.x * blockDim.x + tid;
|
|
||||||
|
|
||||||
if (idx < npart) {
|
|
||||||
double3 R = gR[idx];
|
|
||||||
double3 P = gP[idx];
|
|
||||||
double3 Ef = gEf[idx];
|
|
||||||
double3 Bf = gBf[idx];
|
|
||||||
double dt = gdt[idx];
|
|
||||||
|
|
||||||
P.x += 0.5 * dt * charge * c / mass * Ef.x;
|
|
||||||
P.y += 0.5 * dt * charge * c / mass * Ef.y;
|
|
||||||
P.z += 0.5 * dt * charge * c / mass * Ef.z;
|
|
||||||
|
|
||||||
double gamma = sqrt(1.0 + dot(P, P));
|
|
||||||
double3 t, w, s;
|
|
||||||
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
|
|
||||||
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
|
|
||||||
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
|
|
||||||
|
|
||||||
double3 crossPt = cross(P, t);
|
|
||||||
w.x = P.x + crossPt.x;
|
|
||||||
w.y = P.y + crossPt.y;
|
|
||||||
w.z = P.z + crossPt.z;
|
|
||||||
|
|
||||||
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
|
|
||||||
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
|
|
||||||
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
|
|
||||||
|
|
||||||
double3 crossws = cross(w, s);
|
|
||||||
P.x += crossws.x;
|
|
||||||
P.y += crossws.y;
|
|
||||||
P.z += crossws.z;
|
|
||||||
|
|
||||||
P.x += 0.5 * dt * charge * c / mass * Ef.x;
|
|
||||||
P.y += 0.5 * dt * charge * c / mass * Ef.y;
|
|
||||||
P.z += 0.5 * dt * charge * c / mass * Ef.z;
|
|
||||||
|
|
||||||
gP[idx] = P;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
|
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
|
||||||
|
|
||||||
@ -684,8 +611,7 @@ struct less_then
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
|
||||||
bool enableRutherfordScattering)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
int threads = BLOCK_SIZE;
|
int threads = BLOCK_SIZE;
|
||||||
@ -698,8 +624,7 @@ int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int n
|
|||||||
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
|
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
|
||||||
(double*)par_ptr,
|
(double*)par_ptr,
|
||||||
m_base->cuda_getCurandStates(),
|
m_base->cuda_getCurandStates(),
|
||||||
numparticles,
|
numparticles);
|
||||||
enableRutherfordScattering);
|
|
||||||
|
|
||||||
cudaError_t err = cudaGetLastError();
|
cudaError_t err = cudaGetLastError();
|
||||||
if (err != cudaSuccess)
|
if (err != cudaSuccess)
|
||||||
@ -746,12 +671,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
|
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart,
|
||||||
(double*)dt_ptr, npart, c);
|
(double*)dt_ptr, c);
|
||||||
} else {
|
} else {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
|
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart,
|
||||||
(double*)dt_ptr, npart, c);
|
(double*)dt_ptr, c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -759,29 +684,6 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
|
|||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart,
|
|
||||||
double c, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
int threads = BLOCK_SIZE;
|
|
||||||
int blocks = npart / threads + 1;
|
|
||||||
|
|
||||||
//call kernel
|
|
||||||
if (streamId == -1) {
|
|
||||||
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
|
|
||||||
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
|
|
||||||
} else {
|
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
|
||||||
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
|
|
||||||
(double3*)ef_ptr, (double3*)bf_ptr,
|
|
||||||
(double*)dt_ptr, charge, mass, npart, c);
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
int npart, int nsec,
|
int npart, int nsec,
|
||||||
|
@ -110,7 +110,7 @@ public:
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
int numpartices, bool enableRutherforScattering = true);
|
int numpartices);
|
||||||
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
@ -141,10 +141,6 @@ public:
|
|||||||
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
|
||||||
double dt, double c, bool usedt = false, int streamId = -1);
|
double dt, double c, bool usedt = false, int streamId = -1);
|
||||||
|
|
||||||
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge, double mass,
|
|
||||||
int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
/** BorisPusher push function with transformto function form OPAL
|
/** BorisPusher push function with transformto function form OPAL
|
||||||
* ParallelTTracker integration from OPAL implemented in cuda.
|
* ParallelTTracker integration from OPAL implemented in cuda.
|
||||||
* For more details see ParallelTTracler docomentation in opal
|
* For more details see ParallelTTracler docomentation in opal
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithms/FFT.h"
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
class CudaFFT : public BaseFFT {
|
class CudaFFT : public DKSFFT{
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
@ -189,11 +189,12 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
|
|||||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -272,20 +273,28 @@ __global__ void mirroredRhoField(double *rho2_m,
|
|||||||
id7 = rk * NI * NJ + rj * NI + i;
|
id7 = rk * NI * NJ + rj * NI + i;
|
||||||
id8 = rk * NI * NJ + rj * NI + ri;
|
id8 = rk * NI * NJ + rj * NI + ri;
|
||||||
|
|
||||||
|
|
||||||
double data = rho2_m[id1];
|
double data = rho2_m[id1];
|
||||||
if (i != 0) rho2_m[id2] = data;
|
if (i != 0)
|
||||||
|
rho2_m[id2] = data;
|
||||||
|
|
||||||
if (j != 0) rho2_m[id3] = data;
|
if (j != 0)
|
||||||
|
rho2_m[id3] = data;
|
||||||
|
|
||||||
if (i != 0 && j != 0) rho2_m[id4] = data;
|
if (i != 0 && j != 0)
|
||||||
|
rho2_m[id4] = data;
|
||||||
|
|
||||||
if (k != 0) rho2_m[id5] = data;
|
if (k != 0)
|
||||||
|
rho2_m[id5] = data;
|
||||||
|
|
||||||
if (k != 0 && i != 0) rho2_m[id6] = data;
|
if (k != 0 && i != 0)
|
||||||
|
rho2_m[id6] = data;
|
||||||
|
|
||||||
if (k!= 0 && j != 0) rho2_m[id7] = data;
|
if (k!= 0 && j != 0)
|
||||||
|
rho2_m[id7] = data;
|
||||||
|
|
||||||
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
|
if (k != 0 && j != 0 & i != 0)
|
||||||
|
rho2_m[id8] = data;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,9 +363,9 @@ CudaGreensFunction::~CudaGreensFunction() {
|
|||||||
delete m_base;
|
delete m_base;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
double hr_m0, double hr_m1, double hr_m2,
|
||||||
int streamId)
|
int streamId)
|
||||||
{
|
{
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
@ -364,7 +373,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
//if no stream specified use default stream
|
//if no stream specified use default stream
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
|
kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -372,7 +381,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
|
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -380,17 +389,15 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
|
||||||
int I, int J, int K,
|
int I, int J, int K,
|
||||||
int streamId)
|
int streamId)
|
||||||
{
|
{
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
int block = (I * J * K / thread) + 1;
|
int block = (I * J * K / thread) + 1;
|
||||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
|
|
||||||
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
@ -399,7 +406,6 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
|
|
||||||
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
|
||||||
2*(I - 1), 2*(J - 1), I, J, K);
|
2*(I - 1), 2*(J - 1), I, J, K);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
@ -409,22 +415,22 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
|
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
int thread = 128;
|
int thread = 128;
|
||||||
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
|
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I, 2*J);
|
mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J);
|
||||||
mirroredRhoField<<< block, thread >>>( (double *) rho2_m, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
|
mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (streamId < m_base->cuda_numberOfStreams()) {
|
if (streamId < m_base->cuda_numberOfStreams()) {
|
||||||
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
cudaStream_t cs = m_base->cuda_getStream(streamId);
|
||||||
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I, 2*J);
|
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J);
|
||||||
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
|
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -434,13 +440,13 @@ int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int st
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2,
|
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
|
||||||
int size, int streamId) {
|
int size, int streamId) {
|
||||||
|
|
||||||
int threads = 128;
|
int threads = 128;
|
||||||
int blocks = size / threads + 1;
|
int blocks = size / threads + 1;
|
||||||
int datasize = 2 * threads * sizeof(cuDoubleComplex);
|
int datasize = 2 * threads * sizeof(cuDoubleComplex);
|
||||||
|
|
||||||
if (streamId == -1) {
|
if (streamId == -1) {
|
||||||
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
|
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
|
||||||
(cuDoubleComplex*)ptr2,
|
(cuDoubleComplex*)ptr2,
|
||||||
|
@ -2,17 +2,17 @@
|
|||||||
#define H_CUDA_GREENSFUNCTION
|
#define H_CUDA_GREENSFUNCTION
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cmath>
|
#include <math.h>
|
||||||
|
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <cuComplex.h>
|
#include <cuComplex.h>
|
||||||
#include "cublas_v2.h"
|
#include "cublas_v2.h"
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "CudaBase.cuh"
|
#include "CudaBase.cuh"
|
||||||
|
|
||||||
class CudaGreensFunction : public GreensFunction{
|
class CudaGreensFunction {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -30,32 +30,32 @@ public:
|
|||||||
/* destructor */
|
/* destructor */
|
||||||
~CudaGreensFunction();
|
~CudaGreensFunction();
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: calc itegral on device memory (taken from OPAL src code)
|
Info: calc itegral on device memory (taken from OPAL src code)
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
double hr_m0, double hr_m1, double hr_m2,
|
||||||
int streamId = -1);
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: integration of rho2_m field (taken from OPAL src code)
|
Info: integration of rho2_m field (taken from OPAL src code)
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
||||||
int streamId = -1);
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: mirror rho field (taken from OPAL src code)
|
Info: mirror rho field (taken from OPAL src code)
|
||||||
Return: succes or error code
|
Return: succes or error code
|
||||||
*/
|
*/
|
||||||
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
|
int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
/**
|
/*
|
||||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
453
src/DKSBase.cpp
453
src/DKSBase.cpp
@ -103,14 +103,25 @@ DKSBase::DKSBase() {
|
|||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
cbase = new CudaBase();
|
cbase = new CudaBase();
|
||||||
|
cfft = new CudaFFT(cbase);
|
||||||
|
cgreens = new CudaGreensFunction(cbase);
|
||||||
|
cchi = new CudaChiSquare(cbase);
|
||||||
|
ccol = new CudaCollimatorPhysics(cbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
oclbase = new OpenCLBase();
|
oclbase = new OpenCLBase();
|
||||||
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
micbase = new MICBase();
|
micbase = new MICBase();
|
||||||
|
micfft = new MICFFT(micbase);
|
||||||
|
miccol = new MICCollimatorPhysics(micbase);
|
||||||
|
micgreens = new MICGreensFunction(micbase);
|
||||||
|
micchi = new MICChiSquare(micbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -127,14 +138,25 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
|
|||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
cbase = new CudaBase();
|
cbase = new CudaBase();
|
||||||
|
cfft = new CudaFFT(cbase);
|
||||||
|
cgreens = new CudaGreensFunction(cbase);
|
||||||
|
cchi = new CudaChiSquare(cbase);
|
||||||
|
ccol = new CudaCollimatorPhysics(cbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
oclbase = new OpenCLBase();
|
oclbase = new OpenCLBase();
|
||||||
|
oclfft = new OpenCLFFT(oclbase);
|
||||||
|
oclchi = new OpenCLChiSquare(oclbase);
|
||||||
|
oclcol = new OpenCLCollimatorPhysics(oclbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
micbase = new MICBase();
|
micbase = new MICBase();
|
||||||
|
micfft = new MICFFT(micbase);
|
||||||
|
miccol = new MICCollimatorPhysics(micbase);
|
||||||
|
micgreens = new MICGreensFunction(micbase);
|
||||||
|
micchi = new MICChiSquare(micbase);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -151,16 +173,27 @@ DKSBase::~DKSBase() {
|
|||||||
if (m_function_name != NULL)
|
if (m_function_name != NULL)
|
||||||
delete[] m_function_name;
|
delete[] m_function_name;
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
|
delete cfft;
|
||||||
|
delete cgreens;
|
||||||
|
delete cchi;
|
||||||
|
delete ccol;
|
||||||
delete cbase;
|
delete cbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
|
delete oclfft;
|
||||||
|
delete oclchi;
|
||||||
|
delete oclcol;
|
||||||
delete oclbase;
|
delete oclbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
|
delete micfft;
|
||||||
|
delete miccol;
|
||||||
|
delete micgreens;
|
||||||
|
delete micchi;
|
||||||
delete micbase;
|
delete micbase;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -274,45 +307,38 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
int DKSBase::setupDevice() {
|
/*
|
||||||
|
init device
|
||||||
int ierr = DKS_ERROR;
|
*/
|
||||||
|
int DKSBase::initDevice() {
|
||||||
|
|
||||||
//if api is not set default is OpenCL
|
//if api is not set default is OpenCL
|
||||||
if (!m_api_set) {
|
if (!m_api_set) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||||
} else {
|
} else {
|
||||||
if (apiOpenCL()) {
|
if (apiOpenCL()) {
|
||||||
if (!m_device_set) {
|
if (!m_device_set) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
|
||||||
} else {
|
} else {
|
||||||
setAPI(API_OPENCL, 6);
|
setAPI(API_OPENCL, 6);
|
||||||
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
|
return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
|
||||||
}
|
}
|
||||||
} else if (apiCuda()) {
|
} else if (apiCuda()) {
|
||||||
setDevice("-gpu", 4);
|
setDevice("-gpu", 4);
|
||||||
setAPI(API_CUDA, 4);
|
setAPI(API_CUDA, 4);
|
||||||
ierr = CUDA_SAFECALL(DKS_SUCCESS);
|
return CUDA_SAFECALL(DKS_SUCCESS);
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
setDevice("-mic", 4);
|
setDevice("-mic", 4);
|
||||||
setAPI(API_OPENMP, 6);
|
setAPI(API_OPENMP, 6);
|
||||||
ierr = MIC_SAFECALL(DKS_SUCCESS);
|
return MIC_SAFECALL(DKS_SUCCESS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ierr;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
init device
|
|
||||||
*/
|
|
||||||
int DKSBase::initDevice() {
|
|
||||||
return setupDevice();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -430,15 +456,356 @@ int DKSBase::syncDevice() {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup fft plans to reuse if multiple ffts of same size are needed */
|
||||||
|
int DKSBase::setupFFT(int ndim, int N[3]) {
|
||||||
|
|
||||||
int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
|
if (apiCuda()) {
|
||||||
if (apiCuda())
|
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
|
||||||
return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
|
} else if (apiOpenMP()) {
|
||||||
if (apiOpenCL())
|
//micbase.mic_setupFFT(ndim, N);
|
||||||
return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
|
//BENI: setting up RC and CR transformations on MIC
|
||||||
|
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
|
||||||
|
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
|
||||||
|
if (ierr1 != DKS_SUCCESS)
|
||||||
|
return ierr1;
|
||||||
|
if (ierr2 != DKS_SUCCESS)
|
||||||
|
return ierr2;
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
//BENI:
|
||||||
|
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//BENI:
|
||||||
|
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call OpenCL FFT function for selected platform */
|
||||||
|
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
//load kernel and execute
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call OpenCL IFFT function for selected platform */
|
||||||
|
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call normalize FFT function for selected platform */
|
||||||
|
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
||||||
|
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
} else if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call real to complex FFT */
|
||||||
|
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* call complex to real FFT */
|
||||||
|
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* normalize complex to real iFFT */
|
||||||
|
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* normalize complex to real iFFT */
|
||||||
|
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
|
||||||
|
if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selected platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||||
|
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
||||||
|
hz_m0, hz_m1, hz_m2, streamId) );
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
//BENI:
|
||||||
|
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||||
|
int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
|
||||||
|
double fTimeResolution, double fRebin,
|
||||||
|
int sensors, int length, int numpar, double &result)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||||
|
fTimeResolution, fRebin,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
|
||||||
|
fTimeResolution, fRebin,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result)
|
||||||
|
{
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffset,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result)
|
||||||
|
{
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar,
|
||||||
|
result));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
|
||||||
|
fTimeResolution, fRebin, fGoodBinOffset,
|
||||||
|
sensors, length, numpar, result));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
|
int numparticles, int numparams,
|
||||||
|
int &numaddback, int &numdead)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
} else if (apiOpenCL()) {
|
||||||
|
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
|
||||||
|
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
else
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
|
||||||
|
}
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
|
||||||
|
rx_ptr, ry_ptr, rz_ptr,
|
||||||
|
px_ptr, py_ptr, pz_ptr,
|
||||||
|
par_ptr, numparticles) );
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles, int &numaddback)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
|
||||||
|
rx_ptr, ry_ptr, rz_ptr,
|
||||||
|
px_ptr, py_ptr, pz_ptr,
|
||||||
|
par_ptr, numparticles, numaddback));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int DKSBase::callInitRandoms(int size, int seed) {
|
int DKSBase::callInitRandoms(int size, int seed) {
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
@ -452,3 +819,43 @@ int DKSBase::callInitRandoms(int size, int seed) {
|
|||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
||||||
|
void *dt_ptr, double dt, double c,
|
||||||
|
bool usedt, int streamId)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda())
|
||||||
|
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
|
||||||
|
usedt, streamId));
|
||||||
|
else if (apiOpenMP())
|
||||||
|
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
|
int npart, int nsec, void *dt_ptr, double dt,
|
||||||
|
double c, bool usedt, int streamId)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (apiCuda()) {
|
||||||
|
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||||
|
lastSec_ptr, orient_ptr,
|
||||||
|
npart, nsec, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
} else if (apiOpenMP()) {
|
||||||
|
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
|
||||||
|
lastSec_ptr, orient_ptr,
|
||||||
|
npart, nsec, dt_ptr, dt,
|
||||||
|
c, usedt, streamId));
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_MSG("No implementation for selceted platform");
|
||||||
|
return DKS_ERROR;
|
||||||
|
|
||||||
|
}
|
||||||
|
262
src/DKSBase.h
262
src/DKSBase.h
@ -29,16 +29,31 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "OpenCL/OpenCLBase.h"
|
#include "OpenCL/OpenCLBase.h"
|
||||||
|
#include "OpenCL/OpenCLFFT.h"
|
||||||
|
#include "OpenCL/OpenCLChiSquare.h"
|
||||||
|
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
#include "CUDA/CudaBase.cuh"
|
#include "CUDA/CudaBase.cuh"
|
||||||
|
#include "CUDA/CudaFFT.cuh"
|
||||||
|
#include "CUDA/CudaGreensFunction.cuh"
|
||||||
|
#include "CUDA/CudaChiSquare.cuh"
|
||||||
|
#include "CUDA/CudaCollimatorPhysics.cuh"
|
||||||
|
#include "nvToolsExt.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
#include "MIC/MICBase.h"
|
#include "MIC/MICBase.h"
|
||||||
|
#include "MIC/MICChiSquare.h"
|
||||||
|
#include "MIC/MICFFT.h"
|
||||||
|
#include "MIC/MICCollimatorPhysics.h"
|
||||||
|
#include "MIC/MICGreensFunction.hpp"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "Algorithms/CollimatorPhysics.h"
|
||||||
|
#include "Algorithms/FFT.h"
|
||||||
|
|
||||||
#include "AutoTuning/DKSConfig.h"
|
#include "AutoTuning/DKSConfig.h"
|
||||||
|
|
||||||
/** DKSBase class for handling function calls to DKS library */
|
/** DKSBase class for handling function calls to DKS library */
|
||||||
@ -58,14 +73,25 @@ private:
|
|||||||
|
|
||||||
#ifdef DKS_OPENCL
|
#ifdef DKS_OPENCL
|
||||||
OpenCLBase *oclbase;
|
OpenCLBase *oclbase;
|
||||||
|
OpenCLFFT *oclfft;
|
||||||
|
OpenCLChiSquare *oclchi;
|
||||||
|
OpenCLCollimatorPhysics *oclcol;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
#ifdef DKS_CUDA
|
||||||
CudaBase *cbase;
|
CudaBase *cbase;
|
||||||
|
CudaFFT *cfft;
|
||||||
|
CudaGreensFunction *cgreens;
|
||||||
|
CudaChiSquare *cchi;
|
||||||
|
CudaCollimatorPhysics *ccol;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
MICBase *micbase;
|
MICBase *micbase;
|
||||||
|
MICFFT *micfft;
|
||||||
|
MICCollimatorPhysics *miccol;
|
||||||
|
MICGreensFunction *micgreens;
|
||||||
|
MICChiSquare *micchi;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -113,12 +139,6 @@ protected:
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
MICBase *getMICBase() {
|
|
||||||
return micbase;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/** Call OpenCL base to load specified kenrel file.
|
/** Call OpenCL base to load specified kenrel file.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ -134,7 +154,6 @@ protected:
|
|||||||
return device_name;
|
return device_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -154,11 +173,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
~DKSBase();
|
~DKSBase();
|
||||||
|
|
||||||
/** Function to initialize objects based on the device used.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
int setupDevice();
|
|
||||||
|
|
||||||
/** Turn on auto tuning */
|
/** Turn on auto tuning */
|
||||||
void setAutoTuningOn() { m_auto_tuning = true; }
|
void setAutoTuningOn() { m_auto_tuning = true; }
|
||||||
|
|
||||||
@ -391,7 +405,7 @@ public:
|
|||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
void * mem_ptr = NULL;
|
void * mem_ptr = NULL;
|
||||||
mem_ptr = micbase->mic_allocateMemory<T>(elements);
|
mem_ptr = micbase.mic_allocateMemory<T>(elements);
|
||||||
return mem_ptr;
|
return mem_ptr;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -484,7 +498,7 @@ public:
|
|||||||
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
||||||
|
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
|
return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -518,7 +532,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -818,7 +832,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
|
return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -846,7 +860,7 @@ public:
|
|||||||
size_t size = sizeof(T)*elements;
|
size_t size = sizeof(T)*elements;
|
||||||
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
||||||
} else if (apiOpenMP()) {
|
} else if (apiOpenMP()) {
|
||||||
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
||||||
streamId, offset));
|
streamId, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -866,32 +880,228 @@ public:
|
|||||||
else if (apiCuda())
|
else if (apiCuda())
|
||||||
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
||||||
else if (apiOpenMP())
|
else if (apiOpenMP())
|
||||||
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
|
return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
|
||||||
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Create random numbers on the device and fille mem_data array
|
///////////////////////////////////////////////
|
||||||
|
///////Function library part of dksbase////////
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Setup FFT function.
|
||||||
|
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
|
||||||
|
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
|
||||||
|
* each fft will do its own setup according to fft size and dimensions.
|
||||||
|
* TODO: opencl and mic implementations
|
||||||
*/
|
*/
|
||||||
int callCreateRandomNumbers(void *mem_ptr, int size);
|
int setupFFT(int ndim, int N[3]);
|
||||||
|
//BENI:
|
||||||
|
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
||||||
|
//BENI:
|
||||||
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex-to-complex fft.
|
||||||
|
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
|
||||||
|
* stream id can be specified to use other streams than default.
|
||||||
|
* TODO: mic implementation
|
||||||
|
*/
|
||||||
|
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex-to-complex ifft.
|
||||||
|
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
|
||||||
|
* stream id can be specified to use other streams than default.
|
||||||
|
* TODO: mic implementation.
|
||||||
|
*/
|
||||||
|
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize complex to complex ifft.
|
||||||
|
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
||||||
|
* fft size
|
||||||
|
* TODO: mic implementation.
|
||||||
|
*/
|
||||||
|
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call real to complex FFT.
|
||||||
|
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
|
||||||
|
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
||||||
|
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
|
||||||
|
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
||||||
|
* TODO: opencl and mic implementations
|
||||||
|
*/
|
||||||
|
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call complex to real iFFT.
|
||||||
|
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
|
||||||
|
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
||||||
|
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
|
||||||
|
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize compelx to real ifft.
|
||||||
|
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
||||||
|
* fft size.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transpose 2D and 3D arrays, OpenCL implementation
|
||||||
|
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
|
||||||
|
*/
|
||||||
|
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
||||||
|
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
||||||
|
int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
||||||
|
* For specifics check OPAL docs.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Element by element multiplication.
|
||||||
|
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
|
||||||
|
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chi square for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results
|
||||||
|
*/
|
||||||
|
int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
|
||||||
|
double fTimeResolution, double fRebin,
|
||||||
|
int sensors, int length, int numpar, double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* max-log-likelihood for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
|
||||||
|
* mem_par - pointer to parameter set, mem_results - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffser,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* max-log-likelihood for parameter fitting on device.
|
||||||
|
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
|
||||||
|
* mem_par - pointer to parameter set, mem_results - pointer for
|
||||||
|
* intermediate results. Chi square results are put in &results.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
|
||||||
|
double fTimeResolution, double fRebin, double fGoodBinOffser,
|
||||||
|
int sensors, int length, int numpar,
|
||||||
|
double &result);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
|
int numparticles, int numparams,
|
||||||
|
int &numaddback, int &numdead);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
||||||
|
* TODO: opencl and mic implementations.
|
||||||
|
*/
|
||||||
|
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
||||||
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
void *px_ptr, void *py_ptr, void *pz_ptr,
|
||||||
|
void *par_ptr, int numparticles, int &numaddback);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Init random number states and save for reuse on device.
|
* Init random number states and save for reuse on device.
|
||||||
* If seed is -1, a random seed based on current time is taken.
|
|
||||||
* TODO: opencl and mic implementations.
|
* TODO: opencl and mic implementations.
|
||||||
*/
|
*/
|
||||||
int callInitRandoms(int size, int seed = -1);
|
int callInitRandoms(int size, int seed = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integration code from ParallelTTracker from OPAL.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
||||||
|
*/
|
||||||
|
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
||||||
|
void *dt_ptr, double dt, double c,
|
||||||
|
bool usedt = false, int streamId = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integration code from ParallelTTracker from OPAL.
|
||||||
|
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
||||||
|
*/
|
||||||
|
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
||||||
|
void *lastSec_ptr, void *orient_ptr,
|
||||||
|
int npart, int nsec, void *dt_ptr,
|
||||||
|
double dt, double c, bool usedt = false,
|
||||||
|
int streamId = -1);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print memory information on device (total, used, available)
|
* Print memory information on device (total, used, available)
|
||||||
* TODO: opencl and mic imlementation
|
* TODO: opencl and mic imlementation
|
||||||
*/
|
*/
|
||||||
int callMemInfo() {
|
int callMemInfo() {
|
||||||
#ifdef DKS_CUDA
|
|
||||||
if (apiCuda())
|
if (apiCuda())
|
||||||
return CUDA_SAFECALL(cbase->cuda_memInfo());
|
return CUDA_SAFECALL(cbase->cuda_memInfo());
|
||||||
#endif
|
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -900,12 +1110,10 @@ public:
|
|||||||
* Used for debuging and timing purposes only.
|
* Used for debuging and timing purposes only.
|
||||||
*/
|
*/
|
||||||
void oclEventInfo() {
|
void oclEventInfo() {
|
||||||
#ifdef DKS_OPENCL
|
|
||||||
if (apiOpenCL())
|
if (apiOpenCL())
|
||||||
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
|
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test function to profile opencl kernel calls.
|
* Test function to profile opencl kernel calls.
|
||||||
|
@ -8,7 +8,6 @@
|
|||||||
#include "AutoTuning/DKSAutoTuningTester.h"
|
#include "AutoTuning/DKSAutoTuningTester.h"
|
||||||
|
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
#include "Algorithms/ChiSquareRuntime.h"
|
#include "Algorithms/ChiSquareRuntime.h"
|
||||||
|
|
||||||
@ -20,7 +19,7 @@
|
|||||||
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
#include "OpenCL/OpenCLChiSquareRuntime.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
class DKSBaseMuSR : public DKSFFT {
|
class DKSBaseMuSR : public DKSBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
@ -62,12 +62,6 @@
|
|||||||
#define OPENCL_SAFEINIT(x) ( NULL )
|
#define OPENCL_SAFEINIT(x) ( NULL )
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#define OPENCL_SAFEINIT_AMD(x) ( x )
|
|
||||||
#else
|
|
||||||
#define OPENCL_SAFEINIT_AMD(x) ( NULL )
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
#ifdef DKS_MIC
|
||||||
#define MIC_SAFEINIT(x) ( x )
|
#define MIC_SAFEINIT(x) ( x )
|
||||||
#else
|
#else
|
||||||
|
147
src/DKSFFT.cpp
147
src/DKSFFT.cpp
@ -1,147 +0,0 @@
|
|||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
DKSFFT::DKSFFT() {
|
|
||||||
dksfft = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSFFT::~DKSFFT() {
|
|
||||||
delete dksfft;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* setup fft plans to reuse if multiple ffts of same size are needed */
|
|
||||||
int DKSFFT::setupFFT(int ndim, int N[3]) {
|
|
||||||
|
|
||||||
if (apiCuda()) {
|
|
||||||
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
} else if (apiOpenCL()) {
|
|
||||||
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
|
|
||||||
int ierr1 = dksfft->setupFFT(ndim, N);
|
|
||||||
int ierr2 = dksfft->setupFFTRC(ndim, N);
|
|
||||||
int ierr3 = dksfft->setupFFTCR(ndim, N);
|
|
||||||
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
//micbase.mic_setupFFT(ndim, N);
|
|
||||||
//BENI: setting up RC and CR transformations on MIC
|
|
||||||
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
|
|
||||||
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
|
|
||||||
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
|
|
||||||
if (ierr1 != DKS_SUCCESS)
|
|
||||||
return ierr1;
|
|
||||||
if (ierr2 != DKS_SUCCESS)
|
|
||||||
return ierr2;
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
//BENI:
|
|
||||||
int DKSFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
if (apiOpenCL())
|
|
||||||
return dksfft->setupFFTRC(ndim, N);
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return dksfft->setupFFTRC(ndim, N, scale);
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//BENI:
|
|
||||||
int DKSFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->setupFFT(ndim, N);
|
|
||||||
if (apiOpenCL())
|
|
||||||
return dksfft->setupFFTCR(ndim, N);
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return dksfft->setupFFTCR(ndim, N, scale);
|
|
||||||
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call OpenCL FFT function for selected platform */
|
|
||||||
int DKSFFT::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeFFT(data_ptr, ndim, dimsize);
|
|
||||||
else if (apiCuda())
|
|
||||||
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call OpenCL IFFT function for selected platform */
|
|
||||||
int DKSFFT::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
|
|
||||||
else if (apiCuda())
|
|
||||||
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call normalize FFT function for selected platform */
|
|
||||||
int DKSFFT::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiOpenCL()) {
|
|
||||||
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
|
|
||||||
else
|
|
||||||
return DKS_ERROR;
|
|
||||||
} else if (apiCuda()) {
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call real to complex FFT */
|
|
||||||
int DKSFFT::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* call complex to real FFT */
|
|
||||||
int DKSFFT::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL() || apiOpenMP())
|
|
||||||
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* normalize complex to real iFFT */
|
|
||||||
int DKSFFT::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
|
|
||||||
if (apiCuda())
|
|
||||||
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
|
|
||||||
else if (apiOpenCL())
|
|
||||||
return DKS_ERROR;
|
|
||||||
else if (apiOpenMP())
|
|
||||||
return DKS_ERROR;
|
|
||||||
|
|
||||||
DEBUG_MSG("No implementation for selected platform");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
108
src/DKSFFT.h
108
src/DKSFFT.h
@ -1,108 +0,0 @@
|
|||||||
#ifndef H_DKSBASE_FFT
|
|
||||||
#define H_DKSBASE_FFT
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include "AutoTuning/DKSAutoTuning.h"
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
|
|
||||||
#include "DKSDefinitions.h"
|
|
||||||
|
|
||||||
#include "Algorithms/GreensFunction.h"
|
|
||||||
#include "Algorithms/CollimatorPhysics.h"
|
|
||||||
#include "Algorithms/FFT.h"
|
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#include "OpenCL/OpenCLFFT.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
|
||||||
#include "CUDA/CudaFFT.cuh"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
#include "MIC/MICFFT.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class DKSFFT : public DKSBase {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
BaseFFT *dksfft;
|
|
||||||
|
|
||||||
int initFFT();
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
DKSFFT();
|
|
||||||
~DKSFFT();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Setup FFT function.
|
|
||||||
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
|
|
||||||
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
|
|
||||||
* each fft will do its own setup according to fft size and dimensions.
|
|
||||||
* TODO: opencl and mic implementations
|
|
||||||
*/
|
|
||||||
int setupFFT(int ndim, int N[3]);
|
|
||||||
//BENI:
|
|
||||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
|
||||||
//BENI:
|
|
||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex-to-complex fft.
|
|
||||||
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
|
|
||||||
* stream id can be specified to use other streams than default.
|
|
||||||
* TODO: mic implementation
|
|
||||||
*/
|
|
||||||
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex-to-complex ifft.
|
|
||||||
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
|
|
||||||
* stream id can be specified to use other streams than default.
|
|
||||||
* TODO: mic implementation.
|
|
||||||
*/
|
|
||||||
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize complex to complex ifft.
|
|
||||||
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
|
||||||
* fft size
|
|
||||||
* TODO: mic implementation.
|
|
||||||
*/
|
|
||||||
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call real to complex FFT.
|
|
||||||
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
|
|
||||||
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
|
||||||
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
|
|
||||||
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
|
||||||
* TODO: opencl and mic implementations
|
|
||||||
*/
|
|
||||||
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Call complex to real iFFT.
|
|
||||||
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
|
|
||||||
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
|
|
||||||
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
|
|
||||||
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize compelx to real ifft.
|
|
||||||
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
|
|
||||||
* fft size.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
|
|
||||||
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
162
src/DKSOPAL.cpp
162
src/DKSOPAL.cpp
@ -1,162 +0,0 @@
|
|||||||
#include "DKSOPAL.h"
|
|
||||||
|
|
||||||
DKSOPAL::DKSOPAL() {
|
|
||||||
dkscol = nullptr;
|
|
||||||
dksgreens = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
|
|
||||||
setAPI(api_name, strlen(api_name));
|
|
||||||
setDevice(device_name, strlen(device_name));
|
|
||||||
}
|
|
||||||
|
|
||||||
DKSOPAL::~DKSOPAL() {
|
|
||||||
delete dkscol;
|
|
||||||
delete dksgreens;
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::setupOPAL() {
|
|
||||||
int ierr = DKS_ERROR;
|
|
||||||
if (apiOpenCL()) {
|
|
||||||
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
|
|
||||||
//TODO: only enable if AMD libraries are available
|
|
||||||
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
|
|
||||||
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
|
|
||||||
} else if (apiCuda()) {
|
|
||||||
ierr = CUDA_SAFECALL( DKS_SUCCESS );
|
|
||||||
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
|
|
||||||
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
|
|
||||||
} else if (apiOpenMP()) {
|
|
||||||
ierr = MIC_SAFECALL( DKS_SUCCESS );
|
|
||||||
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
|
|
||||||
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
|
|
||||||
} else {
|
|
||||||
ierr = DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::initDevice() {
|
|
||||||
int ierr = setupDevice();
|
|
||||||
if (ierr == DKS_SUCCESS)
|
|
||||||
ierr = setupOPAL();
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hz_m0, double hz_m1, double hz_m2, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
|
|
||||||
hz_m0, hz_m1, hz_m2, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
|
||||||
int I, int J, int K, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
|
|
||||||
|
|
||||||
return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
|
||||||
int numparticles, int numparams,
|
|
||||||
int &numaddback, int &numdead,
|
|
||||||
bool enableRutherforScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
|
||||||
bool enableRutherforScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles, enableRutherforScattering);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles)
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
|
|
||||||
rx_ptr, ry_ptr, rz_ptr,
|
|
||||||
px_ptr, py_ptr, pz_ptr,
|
|
||||||
par_ptr, numparticles);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles, int &numaddback)
|
|
||||||
{
|
|
||||||
|
|
||||||
return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
|
|
||||||
rx_ptr, ry_ptr, rz_ptr,
|
|
||||||
px_ptr, py_ptr, pz_ptr,
|
|
||||||
par_ptr, numparticles, numaddback));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
|
||||||
void *dt_ptr, double dt, double c,
|
|
||||||
bool usedt, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
|
|
||||||
int npart, double c, int streamId) {
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge, double mass,
|
|
||||||
int npart, double c, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
|
|
||||||
charge, mass, npart, c, streamId);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
|
||||||
int npart, int nsec, void *dt_ptr, double dt,
|
|
||||||
double c, bool usedt, int streamId)
|
|
||||||
{
|
|
||||||
|
|
||||||
return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
|
|
||||||
npart, nsec, dt_ptr, dt, c, usedt, streamId);
|
|
||||||
|
|
||||||
}
|
|
170
src/DKSOPAL.h
170
src/DKSOPAL.h
@ -1,170 +0,0 @@
|
|||||||
#ifndef H_DKS_OPAL
|
|
||||||
#define H_DKS_OPAL
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include "AutoTuning/DKSAutoTuning.h"
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
#include "DKSFFT.h"
|
|
||||||
|
|
||||||
#include "DKSDefinitions.h"
|
|
||||||
|
|
||||||
#include "Algorithms/GreensFunction.h"
|
|
||||||
#include "Algorithms/CollimatorPhysics.h"
|
|
||||||
#include "Algorithms/FFT.h"
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef DKS_AMD
|
|
||||||
#include "OpenCL/OpenCLFFT.h"
|
|
||||||
#include "OpenCL/OpenCLGreensFunction.h"
|
|
||||||
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_CUDA
|
|
||||||
#include "CUDA/CudaFFT.cuh"
|
|
||||||
#include "CUDA/CudaGreensFunction.cuh"
|
|
||||||
#include "CUDA/CudaCollimatorPhysics.cuh"
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DKS_MIC
|
|
||||||
#include "MIC/MICFFT.h"
|
|
||||||
#include "MIC/MICGreensFunction.hpp"
|
|
||||||
#include "MIC/MICCollimatorPhysics.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class DKSOPAL : public DKSFFT {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
DKSCollimatorPhysics *dkscol;
|
|
||||||
GreensFunction *dksgreens;
|
|
||||||
|
|
||||||
int setupOPAL();
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
DKSOPAL();
|
|
||||||
|
|
||||||
DKSOPAL(const char* api_name, const char* device_name);
|
|
||||||
|
|
||||||
~DKSOPAL();
|
|
||||||
|
|
||||||
int initDevice();
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
///////Function library part of dksbase////////
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
|
|
||||||
int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
|
|
||||||
* For specifics check OPAL docs.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Element by element multiplication.
|
|
||||||
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
|
|
||||||
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
|
|
||||||
int numparticles, int numparams,
|
|
||||||
int &numaddback, int &numdead,
|
|
||||||
bool enableRutherfordScattering = true);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
|
|
||||||
bool enableRutherfordScattering = true);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
|
|
||||||
* TODO: opencl and mic implementations.
|
|
||||||
*/
|
|
||||||
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
|
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
|
||||||
void *px_ptr, void *py_ptr, void *pz_ptr,
|
|
||||||
void *par_ptr, int numparticles, int &numaddback);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
|
|
||||||
void *dt_ptr, double dt, double c,
|
|
||||||
bool usedt = false, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
|
|
||||||
void *lastSec_ptr, void *orient_ptr,
|
|
||||||
int npart, int nsec, void *dt_ptr,
|
|
||||||
double dt, double c, bool usedt = false,
|
|
||||||
int streamId = -1);
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
|
|
||||||
int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Integration code from ParallelTTracker from OPAL.
|
|
||||||
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
|
|
||||||
*/
|
|
||||||
int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
|
|
||||||
void *bf_ptr, void *dt_ptr, double charge,
|
|
||||||
double mass, int npart, double c, int streamId = -1);
|
|
||||||
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,22 +1,19 @@
|
|||||||
SET (_SRCS MICBase.cpp MICFFT.cpp)
|
SET (_SRCS
|
||||||
SET (_HDRS MICBase.h MICFFT.h)
|
MICBase.cpp
|
||||||
|
MICChiSquare.cpp
|
||||||
|
MICFFT.cpp
|
||||||
|
MICGreensFunction.cpp
|
||||||
|
MICCollimatorPhysics.cpp
|
||||||
|
)
|
||||||
|
|
||||||
IF (ENABLE_OPAL)
|
SET (_HDRS
|
||||||
SET (_SRCS
|
MICBase.h
|
||||||
${_SRCS}
|
MICChiSquare.h
|
||||||
MICChiSquare.cpp
|
MICFFT.h
|
||||||
MICGreensFunction.cpp
|
MICCollimatorPhysics.h
|
||||||
MICCollimatorPhysics.cpp
|
MICGreensFunction.hpp
|
||||||
)
|
MICMergeSort.h
|
||||||
|
)
|
||||||
SET (_HDRS
|
|
||||||
${_HDRS}
|
|
||||||
MICChiSquare.h
|
|
||||||
MICCollimatorPhysics.h
|
|
||||||
MICGreensFunction.hpp
|
|
||||||
MICMergeSort.h
|
|
||||||
)
|
|
||||||
ENDIF (ENABLE_OPAL)
|
|
||||||
|
|
||||||
#INCLUDE_DIRECTORIES (
|
#INCLUDE_DIRECTORIES (
|
||||||
# ${CMAKE_CURRENT_SOURCE_DIR}
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
|
@ -18,28 +18,30 @@ int MICBase::mic_createRandStreams(int size) {
|
|||||||
|
|
||||||
int seed = time(NULL);
|
int seed = time(NULL);
|
||||||
|
|
||||||
int numThreads = 0;
|
#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
|
||||||
#pragma offload target(mic:m_device_id) inout(numThreads)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//get the number of threads
|
||||||
|
int numThreads;
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
numThreads = omp_get_num_threads();
|
numThreads = omp_get_num_threads();
|
||||||
}
|
|
||||||
|
|
||||||
defaultRndStream = mic_allocateMemory<VSLStreamStatePtr>(numThreads);
|
//if default rnd stream already allocated delete the array
|
||||||
VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
|
if (defaultRndSet == 1)
|
||||||
maxThreads = numThreads;
|
delete[] defaultRndStream;
|
||||||
|
|
||||||
#pragma offload target(mic:m_device_id) \
|
//allocate defaultRndStream array
|
||||||
in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN) \
|
defaultRndStream = new VSLStreamStatePtr[numThreads];
|
||||||
in(seed)
|
|
||||||
{
|
|
||||||
//create stream states for each thread
|
//create stream states for each thread
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < omp_get_num_threads(); i++)
|
for (int i = 0; i < omp_get_num_threads(); i++)
|
||||||
vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
|
vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
|
||||||
}
|
|
||||||
|
|
||||||
defaultRndSet = 1;
|
defaultRndSet = 1;
|
||||||
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -47,8 +49,15 @@ int MICBase::mic_createRandStreams(int size) {
|
|||||||
//delete default rand streams
|
//delete default rand streams
|
||||||
int MICBase::mic_deleteRandStreams() {
|
int MICBase::mic_deleteRandStreams() {
|
||||||
|
|
||||||
//mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
|
#pragma offload target(mic:m_device_id) inout(defaultRndSet)
|
||||||
return DKS_SUCCESS;
|
{
|
||||||
|
if (defaultRndSet == 1) {
|
||||||
|
delete[] defaultRndStream;
|
||||||
|
defaultRndSet = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
//create a new signal for the mic
|
//create a new signal for the mic
|
||||||
|
@ -30,19 +30,14 @@ class MICBase {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<int> micStreams;
|
std::vector<int> micStreams;
|
||||||
int maxThreads;
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
|
|
||||||
int defaultRndSet;
|
int defaultRndSet;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
VSLStreamStatePtr *defaultRndStream;
|
||||||
//#pragma offload_attribute(push,target(mic))
|
|
||||||
void *defaultRndStream; //VSLSStreamStatePtr
|
|
||||||
void *testPtr;
|
|
||||||
|
|
||||||
//#pragma offload_attribute(pop)
|
|
||||||
|
|
||||||
int m_device_id;
|
int m_device_id;
|
||||||
|
|
||||||
/* constructor */
|
/* constructor */
|
||||||
@ -207,6 +202,7 @@ public:
|
|||||||
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
|
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -292,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
|
|||||||
|
|
||||||
const double deltas = par[DT_M] * beta * C;
|
const double deltas = par[DT_M] * beta * C;
|
||||||
const double deltasrho = deltas * 100 * par[RHO_M];
|
const double deltasrho = deltas * 100 * par[RHO_M];
|
||||||
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
|
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
|
||||||
|
|
||||||
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
||||||
const double Ts = (Eng * 1E6) / 1.0073;
|
const double Ts = (Eng * 1E6) / 1.0073;
|
||||||
@ -338,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
|||||||
|
|
||||||
const double deltas = par[DT_M] * beta * C;
|
const double deltas = par[DT_M] * beta * C;
|
||||||
const double deltasrho = deltas * 100 * par[RHO_M];
|
const double deltasrho = deltas * 100 * par[RHO_M];
|
||||||
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
|
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
|
||||||
|
|
||||||
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
|
||||||
const double Ts = (Eng * 1E6) / 1.0073;
|
const double Ts = (Eng * 1E6) / 1.0073;
|
||||||
@ -368,25 +368,21 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
|
||||||
bool enableRutherforScattering)
|
|
||||||
{
|
|
||||||
|
|
||||||
//cast device memory pointers to appropriate types
|
//cast device memory pointers to appropriate types
|
||||||
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
|
||||||
double *par = (double*) par_ptr;
|
double *par = (double*) par_ptr;
|
||||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
|
||||||
|
|
||||||
#pragma offload target(mic:m_micbase->m_device_id) \
|
#pragma offload target(mic:m_micbase->m_device_id) \
|
||||||
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
|
|
||||||
in(numparticles)
|
in(numparticles)
|
||||||
{
|
{
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
|
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||||
|
|
||||||
//for loop trough particles if not checkhit set label to -2 and update R.x
|
//for loop trough particles if not checkhit set label to -2 and update R.x
|
||||||
|
|
||||||
@ -463,8 +459,6 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
int padding = numparticles % MIC_WIDTH;
|
int padding = numparticles % MIC_WIDTH;
|
||||||
int totalpart = numparticles + padding;
|
int totalpart = numparticles + padding;
|
||||||
|
|
||||||
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
|
|
||||||
|
|
||||||
#pragma offload target (mic:0) \
|
#pragma offload target (mic:0) \
|
||||||
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
in(label:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
@ -475,16 +469,14 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
in(py:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
|
||||||
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
in(par:length(0) DKS_RETAIN DKS_REUSE) \
|
||||||
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
|
|
||||||
in(totalpart)
|
in(totalpart)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
//every thread gets its own rnd stream state
|
//every thread gets its own rnd stream state
|
||||||
//VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
|
||||||
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
|
|
||||||
|
|
||||||
#pragma omp for nowait
|
#pragma omp for nowait
|
||||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||||
@ -520,11 +512,9 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
double Eng = (sq - 1) * M_P;
|
double Eng = (sq - 1) * M_P;
|
||||||
double dEdx = 0;
|
double dEdx = 0;
|
||||||
|
|
||||||
|
|
||||||
if (label[i] == 0) {
|
if (label[i] == 0) {
|
||||||
energyLoss(Eng, dEdx, par, randv, i - ii);
|
energyLoss(Eng, dEdx, par, randv, i - ii);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (Eng > 1e-4 && dEdx < 0) {
|
if (Eng > 1e-4 && dEdx < 0) {
|
||||||
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
|
||||||
@ -536,12 +526,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
|
|
||||||
if (Eng < 1e-4 || dEdx > 0)
|
if (Eng < 1e-4 || dEdx > 0)
|
||||||
label[i] = -1;
|
label[i] = -1;
|
||||||
|
|
||||||
} //end inner energy loss loop
|
} //end inner energy loss loop
|
||||||
|
|
||||||
} //end outer energy loss loop
|
|
||||||
|
|
||||||
|
} //end outer energy loss loop
|
||||||
|
|
||||||
//vectorize coulomb scattering as much as possible
|
//vectorize coulomb scattering as much as possible
|
||||||
#pragma omp for nowait
|
#pragma omp for nowait
|
||||||
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
|
||||||
@ -551,7 +540,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
|
|||||||
} //end omp parallel
|
} //end omp parallel
|
||||||
|
|
||||||
} //end offload
|
} //end offload
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ typedef struct {
|
|||||||
} MIC_PART_SMALL;
|
} MIC_PART_SMALL;
|
||||||
|
|
||||||
|
|
||||||
class MICCollimatorPhysics : public DKSCollimatorPhysics {
|
class MICCollimatorPhysics : DKSAlogorithms{
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -38,10 +38,9 @@ public:
|
|||||||
m_micbase = base;
|
m_micbase = base;
|
||||||
};
|
};
|
||||||
|
|
||||||
~MICCollimatorPhysics() { };
|
~MICCollimatorPhysics() { };
|
||||||
|
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
|
||||||
bool enableRutherforScattering = true);
|
|
||||||
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
@ -6,16 +6,13 @@
|
|||||||
|
|
||||||
MICFFT::MICFFT(MICBase *base) {
|
MICFFT::MICFFT(MICBase *base) {
|
||||||
m_micbase = base;
|
m_micbase = base;
|
||||||
m_fftsetup = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
MICFFT::~MICFFT() {
|
MICFFT::~MICFFT() {
|
||||||
if (m_fftsetup) {
|
|
||||||
#pragma offload target(mic:0)
|
#pragma offload target(mic:0)
|
||||||
{
|
{
|
||||||
DftiFreeDescriptor(&FFTHandle_m);
|
DftiFreeDescriptor(&FFTHandle_m);
|
||||||
DftiFreeDescriptor(&handle);
|
DftiFreeDescriptor(&handle);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -38,7 +35,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m_fftsetup = true;
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
//BENI:
|
//BENI:
|
||||||
@ -125,8 +122,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
|
|||||||
}
|
}
|
||||||
|
|
||||||
//execute iFFT
|
//execute iFFT
|
||||||
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
|
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
|
||||||
return executeFFT(mem_ptr, ndim, N, -1, false);
|
return mic_executeFFT(mem_ptr, ndim, N, -1, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
//execute REAL->COMPLEX FFT
|
//execute REAL->COMPLEX FFT
|
||||||
|
@ -7,14 +7,13 @@
|
|||||||
#include <offload.h>
|
#include <offload.h>
|
||||||
#include <mkl_dfti.h>
|
#include <mkl_dfti.h>
|
||||||
|
|
||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithm/DKSFFT.h"
|
||||||
#include "MICBase.h"
|
#include "MICBase.h"
|
||||||
|
|
||||||
class MICFFT : public BaseFFT {
|
class MICFFT : public DKSFFT {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
bool m_fftsetup;
|
|
||||||
MICBase *m_micbase;
|
MICBase *m_micbase;
|
||||||
|
|
||||||
/// Internal FFT object for performing serial FFTs.
|
/// Internal FFT object for performing serial FFTs.
|
||||||
@ -75,18 +74,6 @@ public:
|
|||||||
/* normalize IFFT on MIC */
|
/* normalize IFFT on MIC */
|
||||||
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
|
||||||
|
|
||||||
/**
|
|
||||||
* Info: destroy default FFT plans
|
|
||||||
* Return: success or error code
|
|
||||||
*/
|
|
||||||
int destroyFFT() { return DKS_SUCCESS; }
|
|
||||||
|
|
||||||
/*
|
|
||||||
Info: execute normalize for complex to real iFFT
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId)
|
double hr_m1, double hr_m2)
|
||||||
{
|
{
|
||||||
|
|
||||||
double *tmp_ptr = (double*) tmpgreen;
|
double *tmp_ptr = (double*) tmp_ptr_;
|
||||||
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
|
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
|
||||||
{
|
{
|
||||||
std::memset(tmp_ptr,0,I*J*K);
|
std::memset(tmp_ptr,0,I*J*K);
|
||||||
@ -173,14 +173,12 @@ return 0;
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
//CUDA similar version:
|
//CUDA similar version:
|
||||||
int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
|
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
|
||||||
int streamId)
|
double *tmpgreen = (double*) tmp_ptr_;
|
||||||
{
|
double *mem_ptr = (double*) mem_ptr_;
|
||||||
double *tmpgreen_ptr = (double*) tmpgreen;
|
|
||||||
double *mem_ptr = (double*) rho2_m;
|
|
||||||
|
|
||||||
// the actual integration
|
// the actual integration
|
||||||
#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||||
{
|
{
|
||||||
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
|
||||||
std::memset(mem_ptr,0,II*JJ*KK);
|
std::memset(mem_ptr,0,II*JJ*KK);
|
||||||
@ -199,27 +197,27 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
|
|||||||
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||||
tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp)
|
if (i+1 < NI_tmp)
|
||||||
tmp1 = tmpgreen_ptr[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (j+1 < NJ_tmp)
|
if (j+1 < NJ_tmp)
|
||||||
tmp2 = tmpgreen_ptr[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (k+1 < NK_tmp)
|
if (k+1 < NK_tmp)
|
||||||
tmp3 = tmpgreen_ptr[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
||||||
tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
||||||
tmp5 = tmpgreen_ptr[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
||||||
tmp6 = tmpgreen_ptr[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
tmp7 = tmpgreen_ptr[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
||||||
|
|
||||||
@ -236,8 +234,8 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
|
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
|
||||||
double *mem_ptr = (double*) rho2_m;
|
double *mem_ptr = (double*) mem_ptr_;
|
||||||
|
|
||||||
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
|
||||||
{
|
{
|
||||||
@ -283,11 +281,11 @@ int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int str
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*multiply complex fields*/
|
/*multiply complex fields*/
|
||||||
int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
|
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
|
||||||
// double *mem_ptr1 = (double*) mem_ptr1_;
|
// double *mem_ptr1 = (double*) mem_ptr1_;
|
||||||
// double *mem_ptr2 = (double*) mem_ptr2_;
|
// double *mem_ptr2 = (double*) mem_ptr2_;
|
||||||
_Complex double *mem_ptr1 = (_Complex double *) ptr1;
|
_Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
|
||||||
_Complex double *mem_ptr2 = (_Complex double *) ptr2;
|
_Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
|
||||||
|
|
||||||
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
|
||||||
{
|
{
|
||||||
|
@ -9,13 +9,12 @@
|
|||||||
#include <offload.h>
|
#include <offload.h>
|
||||||
#include <mkl_dfti.h>
|
#include <mkl_dfti.h>
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "MICBase.h"
|
#include "MICBase.h"
|
||||||
|
|
||||||
#define DKS_SUCCESS 0
|
#define DKS_SUCCESS 0
|
||||||
#define DKS_ERROR 1
|
#define DKS_ERROR 1
|
||||||
|
|
||||||
class MICGreensFunction : public GreensFunction {
|
class MICGreensFunction {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
MICBase *m_micbase;
|
MICBase *m_micbase;
|
||||||
@ -29,18 +28,16 @@ public:
|
|||||||
~MICGreensFunction();
|
~MICGreensFunction();
|
||||||
|
|
||||||
/* compute greens integral analytically */
|
/* compute greens integral analytically */
|
||||||
int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
|
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
|
||||||
double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
|
|
||||||
|
|
||||||
/* perform the actual integration */
|
/* perform the actual integration */
|
||||||
int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K,
|
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
|
||||||
int stremaId = -1);
|
|
||||||
|
|
||||||
/* Mirror rho-Field */
|
/* Mirror rho-Field */
|
||||||
int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
|
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
|
||||||
|
|
||||||
/*multiply complex fields*/
|
/*multiply complex fields*/
|
||||||
int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
|
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,53 +1,31 @@
|
|||||||
#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
|
SET (_SRCS
|
||||||
|
OpenCLBase.cpp
|
||||||
SET (_HDRS OpenCLBase.h)
|
OpenCLFFT.cpp
|
||||||
SET (_SRCS OpenCLBase.cpp)
|
OpenCLChiSquare.cpp
|
||||||
SET (_KERNELS "")
|
OpenCLCollimatorPhysics.cpp
|
||||||
|
OpenCLChiSquareRuntime.cpp
|
||||||
IF (ENABLE_AMD)
|
|
||||||
SET (_SRCS
|
|
||||||
${_SRCS}
|
|
||||||
OpenCLFFT.cpp
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (_HDRS
|
|
||||||
${_HDRS}
|
|
||||||
OpenCLFFT.h
|
|
||||||
)
|
|
||||||
|
|
||||||
SET (_KERNELS
|
|
||||||
${_KERNELS}
|
|
||||||
OpenCLKernels/OpenCLFFT.cl
|
|
||||||
OpenCLKernels/OpenCLFFTStockham.cl
|
|
||||||
OpenCLKernels/OpenCLTranspose.cl
|
|
||||||
)
|
)
|
||||||
ENDIF (ENABLE_AMD)
|
|
||||||
|
|
||||||
IF (ENABLE_MUSR)
|
SET (_HDRS
|
||||||
SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
|
OpenCLBase.h
|
||||||
SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
|
OpenCLFFT.h
|
||||||
SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
|
OpenCLChiSquare.h
|
||||||
ENDIF (ENABLE_MUSR)
|
OpenCLCollimatorPhysics.h
|
||||||
|
OpenCLChiSquareRuntime.h
|
||||||
IF (ENABLE_AMD AND ENABLE_OPAL)
|
)
|
||||||
SET (_SRCS
|
|
||||||
${_SRCS}
|
#INCLUDE_DIRECTORIES (
|
||||||
OpenCLCollimatorPhysics.cpp
|
# ${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
OpenCLGreensFunction.cpp
|
#)
|
||||||
)
|
|
||||||
|
SET (_KERNELS
|
||||||
SET (_HDRS
|
OpenCLKernels/OpenCLChiSquare.cl
|
||||||
${_HDRS}
|
OpenCLKernels/OpenCLFFT.cl
|
||||||
OpenCLCollimatorPhysics.h
|
OpenCLKernels/OpenCLFFTStockham.cl
|
||||||
OpenCLGreensFunction.h
|
OpenCLKernels/OpenCLTranspose.cl
|
||||||
)
|
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
||||||
|
OpenCLKernels/OpenCLChiSquareRuntime.cl
|
||||||
SET (_KERNELS
|
|
||||||
${_KERNELS}
|
|
||||||
OpenCLKernels/OpenCLCollimatorPhysics.cl
|
|
||||||
OpenCLKernels/OpenCLGreensFunction.cl
|
|
||||||
)
|
)
|
||||||
ENDIF (ENABLE_AMD AND ENABLE_OPAL)
|
|
||||||
|
|
||||||
ADD_SOURCES (${_SRCS})
|
ADD_SOURCES (${_SRCS})
|
||||||
ADD_HEADERS (${_HDRS})
|
ADD_HEADERS (${_HDRS})
|
||||||
|
@ -7,13 +7,21 @@ cl_device_id OpenCLBase::m_device_id = NULL;
|
|||||||
cl_event OpenCLBase::m_last_event = NULL;
|
cl_event OpenCLBase::m_last_event = NULL;
|
||||||
|
|
||||||
OpenCLBase::OpenCLBase() {
|
OpenCLBase::OpenCLBase() {
|
||||||
|
//m_context = NULL;
|
||||||
|
//m_command_queue = NULL;
|
||||||
m_program = NULL;
|
m_program = NULL;
|
||||||
m_kernel = NULL;
|
m_kernel = NULL;
|
||||||
|
//m_device_id = NULL;
|
||||||
|
//m_platform_id = NULL;
|
||||||
m_kernel_file = NULL;
|
m_kernel_file = NULL;
|
||||||
|
|
||||||
m_last_event = NULL;
|
m_last_event = NULL;
|
||||||
|
|
||||||
|
//m_events = new cl_event[500];
|
||||||
|
//m_num_events = 0;
|
||||||
|
|
||||||
defaultRndSet = 0;
|
defaultRndSet = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
OpenCLBase::~OpenCLBase() {
|
OpenCLBase::~OpenCLBase() {
|
||||||
@ -33,11 +41,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
|
|||||||
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
||||||
ocl_loadKernel(kernel_file);
|
ocl_loadKernel(kernel_file);
|
||||||
delete[] kernel_file;
|
delete[] kernel_file;
|
||||||
|
|
||||||
//allocate memory for rand states
|
//allocate memory for rand states
|
||||||
int ierr;
|
int ierr;
|
||||||
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
|
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
|
||||||
|
|
||||||
//exec kernel
|
//exec kernel
|
||||||
int seed = 0;
|
int seed = 0;
|
||||||
ocl_createKernel("initRand");
|
ocl_createKernel("initRand");
|
||||||
@ -47,34 +55,13 @@ int OpenCLBase::ocl_createRndStates(int size) {
|
|||||||
|
|
||||||
size_t work_items = size;
|
size_t work_items = size;
|
||||||
size_t work_group_size = 1;
|
size_t work_group_size = 1;
|
||||||
|
|
||||||
ocl_executeKernel(1, &work_items, &work_group_size);
|
ocl_executeKernel(1, &work_items, &work_group_size);
|
||||||
|
|
||||||
defaultRndSet = 1;
|
defaultRndSet = 1;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
|
return OCL_SUCCESS;
|
||||||
//load kernel
|
|
||||||
char * kernel_file = new char[500];
|
|
||||||
kernel_file[0] = '\0';
|
|
||||||
strcat(kernel_file, OPENCL_KERNELS);
|
|
||||||
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
|
|
||||||
ocl_loadKernel(kernel_file);
|
|
||||||
delete[] kernel_file;
|
|
||||||
|
|
||||||
//set kernel variables
|
|
||||||
cl_mem tmp_data = (cl_mem) mem_ptr;
|
|
||||||
|
|
||||||
ocl_createKernel("createRandoms");
|
|
||||||
ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
|
|
||||||
ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
|
|
||||||
ocl_setKernelArg(2, sizeof(int), &size);
|
|
||||||
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = (size % work_size + 1) * work_size;
|
|
||||||
ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* destroy rnd states */
|
/* destroy rnd states */
|
||||||
@ -83,7 +70,7 @@ int OpenCLBase::ocl_deleteRndStates() {
|
|||||||
ocl_freeMemory(defaultRndState);
|
ocl_freeMemory(defaultRndState);
|
||||||
defaultRndSet = 0;
|
defaultRndSet = 0;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -441,8 +428,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
//create program from kernel
|
//create program from kernel
|
||||||
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
|
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
|
||||||
NULL, &ierr);
|
|
||||||
if (ierr != CL_SUCCESS) {
|
if (ierr != CL_SUCCESS) {
|
||||||
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
@ -452,7 +438,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
check if compiling kernel source succeded, if failed return error code
|
check if compileng kernel source succeded, if failed return error code
|
||||||
if in debug mode get compilation info and print program build log witch
|
if in debug mode get compilation info and print program build log witch
|
||||||
will give indication what made the compilation fail
|
will give indication what made the compilation fail
|
||||||
*/
|
*/
|
||||||
@ -461,8 +447,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
|
|||||||
|
|
||||||
//get build status
|
//get build status
|
||||||
cl_build_status status;
|
cl_build_status status;
|
||||||
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
|
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
|
||||||
sizeof(cl_build_status), &status, NULL);
|
|
||||||
|
|
||||||
//get log size
|
//get log size
|
||||||
size_t log_size;
|
size_t log_size;
|
||||||
@ -628,12 +613,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ierr != DKS_SUCCESS) {
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Failed to build kernel file " << kernel_file);
|
DEBUG_MSG("Failed to build kernel file " << kernel_file);
|
||||||
return DKS_ERROR;
|
return OCL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
//compile kernel form source code provided
|
//compile kernel form source code provided
|
||||||
|
@ -30,10 +30,13 @@
|
|||||||
#include <CL/cl_ext.h>
|
#include <CL/cl_ext.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include "../DKSDefinitions.h"
|
#include "../DKSDefinitions.h"
|
||||||
|
|
||||||
/* struct for random number state */
|
/* struct for random number state */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
||||||
double s10;
|
double s10;
|
||||||
double s11;
|
double s11;
|
||||||
double s12;
|
double s12;
|
||||||
@ -42,12 +45,16 @@ typedef struct {
|
|||||||
double s22;
|
double s22;
|
||||||
double z;
|
double z;
|
||||||
bool gen;
|
bool gen;
|
||||||
|
|
||||||
} RNDState;
|
} RNDState;
|
||||||
|
|
||||||
class OpenCLBase {
|
class OpenCLBase {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
static cl_context m_context;
|
||||||
|
static cl_command_queue m_command_queue;
|
||||||
|
|
||||||
static cl_platform_id m_platform_id;
|
static cl_platform_id m_platform_id;
|
||||||
static cl_device_id m_device_id;
|
static cl_device_id m_device_id;
|
||||||
|
|
||||||
@ -111,9 +118,6 @@ protected:
|
|||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
static cl_context m_context;
|
|
||||||
static cl_command_queue m_command_queue;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
constructor
|
constructor
|
||||||
@ -131,11 +135,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
int ocl_createRndStates(int size);
|
int ocl_createRndStates(int size);
|
||||||
|
|
||||||
/* Create an array of random numbers on the device
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
int ocl_createRandomNumbers(void *mem_ptr, int size);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Destroy rnd states
|
Destroy rnd states
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
@ -196,7 +195,7 @@ public:
|
|||||||
Return: return pointer to memory
|
Return: return pointer to memory
|
||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int &ierr);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Name: allocateMemory
|
Name: allocateMemory
|
||||||
Info: allocate memory on device
|
Info: allocate memory on device
|
||||||
@ -204,20 +203,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
|
||||||
|
|
||||||
/** Zero OpenCL memory buffer
|
|
||||||
* Set all the elemetns in the device array to zero
|
|
||||||
*/
|
|
||||||
template <typename T>
|
|
||||||
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
|
|
||||||
|
|
||||||
cl_int ierr;
|
|
||||||
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
|
|
||||||
sizeof(T)*size, 0, nullptr, nullptr);
|
|
||||||
if (ierr != CL_SUCCESS)
|
|
||||||
return DKS_ERROR;
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Name: writeData
|
Name: writeData
|
||||||
Info: write data to device memory (needs ptr to mem object)
|
Info: write data to device memory (needs ptr to mem object)
|
||||||
|
@ -34,7 +34,7 @@ TODO:
|
|||||||
2. boost.compute sort for user defined structure crashes
|
2. boost.compute sort for user defined structure crashes
|
||||||
*/
|
*/
|
||||||
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
|
||||||
int numparticles, bool enableRutherforScattering)
|
int numparticles)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
//set number of total threads, and number threads per block
|
//set number of total threads, and number threads per block
|
||||||
|
@ -52,8 +52,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* execute degrader code on device */
|
/* execute degrader code on device */
|
||||||
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
|
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
|
||||||
bool enableRutherforScattering = true);
|
|
||||||
|
|
||||||
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
|
||||||
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
void *rx_ptr, void *ry_ptr, void *rz_ptr,
|
||||||
|
@ -31,6 +31,7 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
|
|||||||
|
|
||||||
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
|
||||||
return OCL_ERROR;
|
return OCL_ERROR;
|
||||||
|
|
||||||
|
|
||||||
//execute kernel
|
//execute kernel
|
||||||
for (int step = 1; step < N; step <<= 1) {
|
for (int step = 1; step < N; step <<= 1) {
|
||||||
@ -88,78 +89,26 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
|
|||||||
call fft execution on device for every dimension
|
call fft execution on device for every dimension
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
|
||||||
|
int ierr;
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
|
int n = N[0];
|
||||||
|
|
||||||
if (forward)
|
for (int dim = 0; dim < ndim; dim++) {
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
|
||||||
0, NULL, NULL, &inout, NULL, NULL);
|
if (ierr != OCL_SUCCESS) {
|
||||||
else
|
DEBUG_MSG("Error executing bit reverse");
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
return OCL_ERROR;
|
||||||
0, NULL, NULL, &inout, NULL, NULL);
|
}
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
|
||||||
dkserr = DKS_ERROR;
|
if (ierr != OCL_SUCCESS) {
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
DEBUG_MSG("Error executing fft reverse");
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
return OCL_ERROR;
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
}
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return dkserr;
|
return OCL_SUCCESS;
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
call rcfft execution on device for every dimension
|
|
||||||
*/
|
|
||||||
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
|
||||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
|
||||||
|
|
||||||
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
|
|
||||||
0, NULL, NULL, &real_in, &comp_out, NULL);
|
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
|
||||||
dkserr = DKS_ERROR;
|
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return dkserr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
call rcfft execution on device for every dimension
|
|
||||||
*/
|
|
||||||
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
|
|
||||||
|
|
||||||
int dkserr = DKS_SUCCESS;
|
|
||||||
cl_int ierr;
|
|
||||||
cl_mem real_in = (cl_mem)real_ptr;
|
|
||||||
cl_mem comp_out = (cl_mem)comp_ptr;
|
|
||||||
|
|
||||||
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
|
|
||||||
0, NULL, NULL, &comp_out, &real_in, NULL);
|
|
||||||
|
|
||||||
if (ierr != OCL_SUCCESS) {
|
|
||||||
dkserr = DKS_ERROR;
|
|
||||||
DEBUG_MSG("Error executing cfFFT\n");
|
|
||||||
if (ierr == CLFFT_INVALID_PLAN)
|
|
||||||
std::cout << "Invlalid plan" << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << "CLFFT error" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return dkserr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -171,11 +120,10 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
|
call kernel to normalize fft
|
||||||
*/
|
*/
|
||||||
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
||||||
|
|
||||||
/*
|
|
||||||
cl_mem inout = (cl_mem)data;
|
cl_mem inout = (cl_mem)data;
|
||||||
|
|
||||||
int n = N[0];
|
int n = N[0];
|
||||||
@ -202,175 +150,132 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
|
|||||||
DEBUG_MSG("Error executing kernel");
|
DEBUG_MSG("Error executing kernel");
|
||||||
return OCL_ERROR;
|
return OCL_ERROR;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
return OCL_SUCCESS;
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int OpenCLFFT::setupFFT(int ndim, int N[3]) {
|
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
|
||||||
|
|
||||||
|
int ierr;
|
||||||
|
int size = sizeof(cl_double2)*pow(N,ndim);
|
||||||
|
|
||||||
|
cl_mem mem_tmp;
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
|
||||||
|
|
||||||
cl_int err;
|
//set the number of work items in each dimension
|
||||||
|
size_t work_items[3];
|
||||||
|
int p = 1;
|
||||||
|
int threads = N / 2;
|
||||||
|
int f = (forward) ? -1 : 1;
|
||||||
|
|
||||||
|
//execute kernel
|
||||||
|
int n = (int)log2(N);
|
||||||
|
for (int i = 0; i < ndim; i++) {
|
||||||
|
|
||||||
clfftDim dim;
|
int dim = i+1;
|
||||||
if (ndim == 1)
|
p = 1;
|
||||||
dim = CLFFT_1D;
|
work_items[0] = (dim == 1) ? N/2 : N;
|
||||||
else if (ndim == 2)
|
work_items[1] = (dim == 2) ? N/2 : N;
|
||||||
dim = CLFFT_2D;
|
work_items[2] = (dim == 3) ? N/2 : N;
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
//transpose array if calculating dimension larger than 1
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
//if (dim > 1)
|
||||||
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||||
|
|
||||||
|
//create kernel and set kernel arguments
|
||||||
|
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
|
||||||
|
return OCL_ERROR;
|
||||||
|
|
||||||
|
for (int t = 1; t <= log2(N); t++) {
|
||||||
|
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
|
||||||
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
|
||||||
|
|
||||||
|
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
|
||||||
|
return OCL_ERROR;
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
mem_tmp = mem_src;
|
||||||
err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
|
mem_src = mem_dst;
|
||||||
|
mem_dst = mem_tmp;
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
|
p = 2*p;
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting precision" << std::endl;
|
|
||||||
err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting layout" << std::endl;
|
|
||||||
err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
std::cout << "Error setting result location" << std::endl;
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Complex-to-complex plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
|
|
||||||
cl_int err;
|
|
||||||
|
|
||||||
clfftDim dim;
|
|
||||||
if (ndim == 1)
|
|
||||||
dim = CLFFT_1D;
|
|
||||||
else if (ndim == 2)
|
|
||||||
dim = CLFFT_2D;
|
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
|
||||||
|
|
||||||
size_t half = (size_t)N[0] / 2 + 1;
|
|
||||||
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
|
||||||
size_t clOutStride[3] = {1, half, half * N[1]};
|
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
|
||||||
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
|
|
||||||
|
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
|
|
||||||
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
|
|
||||||
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
|
|
||||||
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
|
|
||||||
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
|
|
||||||
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Real-to-complex plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
|
|
||||||
cl_int err;
|
|
||||||
|
|
||||||
clfftDim dim;
|
|
||||||
if (ndim == 1)
|
|
||||||
dim = CLFFT_1D;
|
|
||||||
else if (ndim == 2)
|
|
||||||
dim = CLFFT_2D;
|
|
||||||
else
|
|
||||||
dim = CLFFT_3D;
|
|
||||||
|
|
||||||
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
|
|
||||||
|
|
||||||
size_t half = (size_t)N[0] / 2 + 1;
|
|
||||||
size_t clInStride[3] = {1, half, half * N[1]};
|
|
||||||
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
|
|
||||||
|
|
||||||
/* Create 3D fft plan*/
|
|
||||||
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
|
|
||||||
|
|
||||||
/* Set plan parameters */
|
|
||||||
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
|
|
||||||
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
|
|
||||||
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
|
|
||||||
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
|
|
||||||
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
|
|
||||||
|
|
||||||
/* Bake the plan */
|
|
||||||
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
DEBUG_MSG("Error creating Complex-to-real plan");
|
|
||||||
return DKS_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLFFT::destroyFFT() {
|
|
||||||
clfftDestroyPlan(&planHandleZ2Z);
|
|
||||||
clfftDestroyPlan(&planHandleD2Z);
|
|
||||||
clfftDestroyPlan(&planHandleZ2D);
|
|
||||||
|
|
||||||
clfftTeardown();
|
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void OpenCLFFT::printError(clfftStatus err) {
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
std::cout << "Error creating default plan " << err << std::endl;
|
|
||||||
switch(err) {
|
|
||||||
case CLFFT_BUGCHECK:
|
|
||||||
std::cout << "bugcheck" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_NOTIMPLEMENTED:
|
|
||||||
std::cout << "not implemented" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
|
|
||||||
std::cout << "transposed not implemented" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_FILE_NOT_FOUND:
|
|
||||||
std::cout << "file not found" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_FILE_CREATE_FAILURE:
|
|
||||||
std::cout << "file create failure" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_VERSION_MISMATCH:
|
|
||||||
std::cout << "version missmatch" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_INVALID_PLAN:
|
|
||||||
std::cout << "invalid plan" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_DEVICE_NO_DOUBLE:
|
|
||||||
std::cout << "no double" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_DEVICE_MISMATCH:
|
|
||||||
std::cout << "device missmatch" << std::endl;
|
|
||||||
break;
|
|
||||||
case CLFFT_ENDSTATUS:
|
|
||||||
std::cout << "end status" << std::endl;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout << "other: " << err << std::endl;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//transpose array back if calculating dimension larger than 1
|
||||||
|
//if (dim > 1)
|
||||||
|
// ocl_executeTranspose(mem_src, N, ndim, dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ndim*n % 2 == 1) {
|
||||||
|
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
|
||||||
|
mem_tmp = mem_src;
|
||||||
|
mem_src = mem_dst;
|
||||||
|
mem_dst = mem_tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_oclbase->ocl_freeMemory(mem_dst);
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
|
||||||
|
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
|
||||||
|
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
|
||||||
|
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
|
||||||
|
|
||||||
|
m_oclbase->ocl_createKernel("fft_batch3D");
|
||||||
|
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
|
||||||
|
|
||||||
|
|
||||||
|
for (int dim = 1; dim < ndim+1; dim++) {
|
||||||
|
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
|
||||||
|
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
|
||||||
|
|
||||||
|
cl_mem mem_src = (cl_mem)src;
|
||||||
|
|
||||||
|
if (ndim == 1)
|
||||||
|
return OCL_SUCCESS;
|
||||||
|
|
||||||
|
size_t work_items[3];
|
||||||
|
work_items[0] = N[0];
|
||||||
|
work_items[1] = N[1];
|
||||||
|
work_items[2] = 1;
|
||||||
|
|
||||||
|
size_t work_group_size[3];
|
||||||
|
work_group_size[0] = N[0];
|
||||||
|
work_group_size[1] = N[1];
|
||||||
|
work_group_size[2] = 1;
|
||||||
|
|
||||||
|
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
|
||||||
|
|
||||||
|
m_oclbase->ocl_createKernel("transpose");
|
||||||
|
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
|
||||||
|
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
|
||||||
|
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
|
||||||
|
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
|
||||||
|
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
|
||||||
|
|
||||||
|
return OCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -20,19 +20,12 @@
|
|||||||
#include "../Algorithms/FFT.h"
|
#include "../Algorithms/FFT.h"
|
||||||
#include "OpenCLBase.h"
|
#include "OpenCLBase.h"
|
||||||
|
|
||||||
#include "clFFT.h"
|
class OpenCLFFT : public DKSFFT {
|
||||||
|
|
||||||
class OpenCLFFT : public BaseFFT {
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
OpenCLBase *m_oclbase;
|
OpenCLBase *m_oclbase;
|
||||||
|
|
||||||
clfftSetupData fftSetup;
|
|
||||||
clfftPlanHandle planHandleZ2Z;
|
|
||||||
clfftPlanHandle planHandleD2Z;
|
|
||||||
clfftPlanHandle planHandleZ2D;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: call fft kernels to execute FFT of the given domain,
|
Info: call fft kernels to execute FFT of the given domain,
|
||||||
data - devevice memory ptr, cdim - current dim to transform,
|
data - devevice memory ptr, cdim - current dim to transform,
|
||||||
@ -49,31 +42,15 @@ private:
|
|||||||
*/
|
*/
|
||||||
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
|
||||||
|
|
||||||
/** Get clfftStatus and print the corresponding error message.
|
|
||||||
* clfftStatus is returned from all clFFT library functions, print error displays the
|
|
||||||
* corresponding error message. If "other" is printed then error code corresponds to
|
|
||||||
* OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
|
|
||||||
* be checked to determine the reason for the error.
|
|
||||||
*/
|
|
||||||
void printError(clfftStatus err);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/* constructor - currently does nothing*/
|
/* constructor - currently does nothing*/
|
||||||
OpenCLFFT(OpenCLBase *base) {
|
OpenCLFFT(OpenCLBase *base) {
|
||||||
m_oclbase = base;
|
m_oclbase = base;
|
||||||
|
|
||||||
/* Set up fft */
|
|
||||||
cl_int err;
|
|
||||||
err = clfftInitSetupData(&fftSetup);
|
|
||||||
err = clfftSetup(&fftSetup);
|
|
||||||
|
|
||||||
if (err != CL_SUCCESS)
|
|
||||||
DEBUG_MSG("Error seting up clFFT");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* destructor - currently does nothing*/
|
/* destructor - currently does nothing*/
|
||||||
~OpenCLFFT() { destroyFFT(); }
|
~OpenCLFFT() { }
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Info: execute forward fft function with data set on device
|
Info: execute forward fft function with data set on device
|
||||||
@ -100,22 +77,35 @@ public:
|
|||||||
Info: set FFT size
|
Info: set FFT size
|
||||||
Return: success or error code
|
Return: success or error code
|
||||||
*/
|
*/
|
||||||
int setupFFT(int ndim, int N[3]);
|
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
|
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
|
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int destroyFFT();
|
int destroyFFT() { return DKS_SUCCESS; }
|
||||||
|
|
||||||
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1);
|
int streamId = -1)
|
||||||
|
{
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
|
||||||
int streamId = -1);
|
int streamId = -1)
|
||||||
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) {
|
{
|
||||||
return DKS_ERROR;
|
return DKS_ERROR;
|
||||||
}
|
}
|
||||||
|
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
|
||||||
|
{
|
||||||
|
return DKS_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
|
||||||
|
|
||||||
|
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
|
||||||
|
|
||||||
|
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
|
||||||
|
|
||||||
//void printData3DN4(cl_double2* &data, int N);
|
//void printData3DN4(cl_double2* &data, int N);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -1,181 +0,0 @@
|
|||||||
#include "OpenCLGreensFunction.h"
|
|
||||||
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
|
|
||||||
|
|
||||||
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
|
|
||||||
m_base = base;
|
|
||||||
base_create = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
OpenCLGreensFunction::OpenCLGreensFunction() {
|
|
||||||
m_base = new OpenCLBase();
|
|
||||||
base_create = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
OpenCLGreensFunction::~OpenCLGreensFunction() {
|
|
||||||
if (base_create)
|
|
||||||
delete m_base;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::buildProgram() {
|
|
||||||
char *kernel_file = new char[500];
|
|
||||||
kernel_file[0] = '\0';
|
|
||||||
strcat(kernel_file, OPENCL_KERNELS);
|
|
||||||
strcat(kernel_file, GREENS_KERNEL);
|
|
||||||
|
|
||||||
return m_base->ocl_loadKernel(kernel_file);
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
|
||||||
int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = I * J * K;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelTmpgreen");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
|
|
||||||
int K, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem rho2_ptr = (cl_mem)rho2_m;
|
|
||||||
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
|
|
||||||
int NI = 2*(I - 1);
|
|
||||||
int NJ = 2*(J - 1);
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = I * J * K;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelIntegration");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &I);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &J);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &K);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
double zero = 0.0;
|
|
||||||
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
|
|
||||||
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem rho2_ptr = (cl_mem)rho2_m;
|
|
||||||
int NI = I + 1;
|
|
||||||
int NJ = J + 1;
|
|
||||||
int NK = K + 1;
|
|
||||||
int I2 = 2*I;
|
|
||||||
int J2 = 2*J;
|
|
||||||
int K2 = 2*K;
|
|
||||||
|
|
||||||
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = NI * NJ * NK;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
|
|
||||||
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
|
|
||||||
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
|
|
||||||
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
|
|
||||||
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
|
|
||||||
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
|
|
||||||
{
|
|
||||||
int ierr = DKS_SUCCESS;
|
|
||||||
|
|
||||||
//compile opencl program from source
|
|
||||||
buildProgram();
|
|
||||||
|
|
||||||
//cast the input data ptr to cl_mem
|
|
||||||
cl_mem mem_ptr1 = (cl_mem) ptr1;
|
|
||||||
cl_mem mem_ptr2 = (cl_mem) ptr2;
|
|
||||||
|
|
||||||
//set the work item size
|
|
||||||
size_t work_size = 128;
|
|
||||||
size_t work_items = size;
|
|
||||||
if (work_items % work_size > 0)
|
|
||||||
work_items = (work_items / work_size + 1) * work_size;
|
|
||||||
|
|
||||||
//create kernel
|
|
||||||
ierr = m_base->ocl_createKernel("multiplyComplexFields");
|
|
||||||
|
|
||||||
//set kernel parameters
|
|
||||||
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
|
|
||||||
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
|
|
||||||
m_base->ocl_setKernelArg(2, sizeof(int), &size);
|
|
||||||
|
|
||||||
//execute kernel
|
|
||||||
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
|
|
||||||
|
|
||||||
return ierr;
|
|
||||||
|
|
||||||
}
|
|
@ -1,63 +0,0 @@
|
|||||||
#ifndef H_OPENCL_GREENSFUNCTION
|
|
||||||
#define H_OPENCL_GREENSFUNCTION
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include "../Algorithms/GreensFunction.h"
|
|
||||||
#include "OpenCLBase.h"
|
|
||||||
|
|
||||||
class OpenCLGreensFunction : public GreensFunction {
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
bool base_create;
|
|
||||||
OpenCLBase *m_base;
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
/** Constructor with OpenCLBase argument */
|
|
||||||
OpenCLGreensFunction(OpenCLBase *base);
|
|
||||||
|
|
||||||
/** Default constructor */
|
|
||||||
OpenCLGreensFunction();
|
|
||||||
|
|
||||||
/** Destructor */
|
|
||||||
~OpenCLGreensFunction();
|
|
||||||
|
|
||||||
/** Load OpenCL kernel file containing greens function kernels.
|
|
||||||
* m_base takes the kernel file and compiles the OpenCL programm.
|
|
||||||
*/
|
|
||||||
int buildProgram();
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: calc itegral on device memory (taken from OPAL src code)
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
|
|
||||||
double hr_m0, double hr_m1, double hr_m2,
|
|
||||||
int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: integration of rho2_m field (taken from OPAL src code)
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
|
|
||||||
int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: mirror rho field (taken from OPAL src code)
|
|
||||||
Return: succes or error code
|
|
||||||
*/
|
|
||||||
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
|
|
||||||
Return: success or error code
|
|
||||||
*/
|
|
||||||
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,4 +1,6 @@
|
|||||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
#pragma OPENCL EXTENSION
|
||||||
|
|
||||||
|
|
||||||
/******Random numbers********/
|
/******Random numbers********/
|
||||||
|
|
||||||
@ -87,14 +89,13 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
|
|||||||
|
|
||||||
if (id < N) {
|
if (id < N) {
|
||||||
RNDState tmp;
|
RNDState tmp;
|
||||||
int tmp_seed = 2*id;// * 0x100000000ULL;
|
int tmp_seed = id;// * 0x100000000ULL;
|
||||||
tmp.s10 = 12345 + tmp_seed;
|
tmp.s10 = 12345 + tmp_seed;
|
||||||
tmp.s11 = 12345 + tmp_seed;
|
tmp.s11 = 12345 + tmp_seed;
|
||||||
tmp.s12 = 12345 + tmp_seed;
|
tmp.s12 = 123 + tmp_seed;
|
||||||
tmp.s20 = 12345 + tmp_seed;
|
tmp.s20 = 12345 + tmp_seed;
|
||||||
tmp.s21 = 12345 + tmp_seed;
|
tmp.s21 = 12345 + tmp_seed;
|
||||||
tmp.s22 = 12345 + tmp_seed;
|
tmp.s22 = 123 + tmp_seed;
|
||||||
|
|
||||||
|
|
||||||
tmp.z = 0;
|
tmp.z = 0;
|
||||||
tmp.gen = true;
|
tmp.gen = true;
|
||||||
@ -104,19 +105,6 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* create random numbers and fill an array */
|
|
||||||
__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
|
|
||||||
|
|
||||||
int idx = get_global_id(0);
|
|
||||||
|
|
||||||
if (idx < size) {
|
|
||||||
RNDState s = states[idx];
|
|
||||||
data[idx] = rand_uniform(&s);
|
|
||||||
states[idx] = s;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**********Degrader**********/
|
/**********Degrader**********/
|
||||||
enum PARAMS { POSITION,
|
enum PARAMS { POSITION,
|
||||||
|
@ -1,170 +0,0 @@
|
|||||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
|
||||||
|
|
||||||
/** compute the greens integral analytically */
|
|
||||||
__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
|
|
||||||
int NI, int NJ, int NK)
|
|
||||||
{
|
|
||||||
|
|
||||||
int tid = get_local_size(0);
|
|
||||||
int id = get_global_id(0);
|
|
||||||
|
|
||||||
if (id < NI * NJ * NK) {
|
|
||||||
int i = id % NI;
|
|
||||||
int k = id / (NI * NJ);
|
|
||||||
int j = (id - k * NI * NJ) / NI;
|
|
||||||
|
|
||||||
|
|
||||||
double cellVolume = hr_m0 * hr_m1 * hr_m2;
|
|
||||||
|
|
||||||
double vv0 = i * hr_m0 - hr_m0 / 2;
|
|
||||||
double vv1 = j * hr_m1 - hr_m1 / 2;
|
|
||||||
double vv2 = k * hr_m2 - hr_m2 / 2;
|
|
||||||
|
|
||||||
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
|
|
||||||
|
|
||||||
double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
|
|
||||||
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
|
|
||||||
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
|
|
||||||
|
|
||||||
tmpgrn = tmpgrn / 2;
|
|
||||||
|
|
||||||
tmpgrn += vv1 * vv2 * log(vv0 + r);
|
|
||||||
tmpgrn += vv0 * vv2 * log(vv1 + r);
|
|
||||||
tmpgrn += vv0 * vv1 * log(vv2 + r);
|
|
||||||
|
|
||||||
tmpgreen[id] = tmpgrn / cellVolume;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/** perform the actual integration */
|
|
||||||
__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
|
|
||||||
int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
|
|
||||||
{
|
|
||||||
|
|
||||||
int tid = get_local_id(0);
|
|
||||||
int id = get_global_id(0);
|
|
||||||
|
|
||||||
int ni = NI;
|
|
||||||
int nj = NJ;
|
|
||||||
|
|
||||||
double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
|
||||||
|
|
||||||
if (id < NI_tmp * NJ_tmp * NK_tmp) {
|
|
||||||
int i = id % NI_tmp;
|
|
||||||
int k = id / (NI_tmp * NJ_tmp);
|
|
||||||
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
|
|
||||||
|
|
||||||
tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
|
|
||||||
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
|
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
|
|
||||||
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (i+1 < NI_tmp)
|
|
||||||
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (j+1 < NJ_tmp)
|
|
||||||
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (k+1 < NK_tmp)
|
|
||||||
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (i+1 < NI_tmp && j+1 < NJ_tmp)
|
|
||||||
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (i+1 < NI_tmp && k+1 < NK_tmp)
|
|
||||||
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
if (j+1 < NJ_tmp && k+1 < NK_tmp)
|
|
||||||
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
|
|
||||||
|
|
||||||
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
|
|
||||||
|
|
||||||
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** miror rho-field */
|
|
||||||
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
|
|
||||||
rho2_m[0] = rho2_m[NI*NJ];
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void kernelMirroredRhoField(__global double *rho2_m,
|
|
||||||
int NI, int NJ, int NK,
|
|
||||||
int NI_tmp, int NJ_tmp, int NK_tmp,
|
|
||||||
int size)
|
|
||||||
{
|
|
||||||
|
|
||||||
int tid = get_local_id(0);
|
|
||||||
int id = get_global_id(0);
|
|
||||||
|
|
||||||
if (id == 0)
|
|
||||||
rho2_m[0] = rho2_m[NI * NJ];
|
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
|
||||||
|
|
||||||
int id1, id2, id3, id4, id5, id6, id7, id8;
|
|
||||||
|
|
||||||
if (id < NI_tmp * NJ_tmp * NK_tmp) {
|
|
||||||
int i = id % NI_tmp;
|
|
||||||
int k = id / (NI_tmp * NJ_tmp);
|
|
||||||
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
|
|
||||||
|
|
||||||
int ri = NI - i;
|
|
||||||
int rj = NJ - j;
|
|
||||||
int rk = NK - k;
|
|
||||||
|
|
||||||
id1 = k * NI * NJ + j * NI + i;
|
|
||||||
id2 = k * NI * NJ + j * NI + ri;
|
|
||||||
id3 = k * NI * NJ + rj * NI + i;
|
|
||||||
id4 = k * NI * NJ + rj * NI + ri;
|
|
||||||
|
|
||||||
id5 = rk * NI * NJ + j * NI + i;
|
|
||||||
id6 = rk * NI * NJ + j * NI + ri;
|
|
||||||
id7 = rk * NI * NJ + rj * NI + i;
|
|
||||||
id8 = rk * NI * NJ + rj * NI + ri;
|
|
||||||
|
|
||||||
double data = 0.0;
|
|
||||||
if (id1 < size)
|
|
||||||
data = rho2_m[id1];
|
|
||||||
|
|
||||||
if (i != 0 && id2 < size) rho2_m[id2] = data;
|
|
||||||
|
|
||||||
if (j != 0 && id3 < size) rho2_m[id3] = data;
|
|
||||||
|
|
||||||
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
|
|
||||||
|
|
||||||
if (k != 0 && id5 < size) rho2_m[id5] = data;
|
|
||||||
|
|
||||||
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
|
|
||||||
|
|
||||||
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
|
|
||||||
|
|
||||||
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/** multiply complex fields */
|
|
||||||
double2 ComplexMul(double2 a, double2 b) {
|
|
||||||
double2 c;
|
|
||||||
c.x = a.x * b.x - a.y * b.y;
|
|
||||||
c.y = a.x * b.y + a.y * b.x;
|
|
||||||
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
|
|
||||||
int size)
|
|
||||||
{
|
|
||||||
|
|
||||||
int idx = get_global_id(0);
|
|
||||||
|
|
||||||
if (idx < size)
|
|
||||||
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
|
|
||||||
|
|
||||||
}
|
|
@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
|
|||||||
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
#ADD_EXECUTABLE(testFFT testFFT.cpp)
|
||||||
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
#ADD_EXECUTABLE(testMIC testMIC.cpp)
|
||||||
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
|
||||||
ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
|
||||||
ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
|
||||||
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
|
||||||
@ -22,11 +22,10 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
|
|||||||
#ADD_EXECUTABLE(testGather testGather.cpp)
|
#ADD_EXECUTABLE(testGather testGather.cpp)
|
||||||
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
|
||||||
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
|
||||||
ADD_EXECUTABLE(testRandom testRandom.cpp)
|
|
||||||
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
|
||||||
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
|
||||||
#ADD_EXECUTABLE(testPush testPush.cpp)
|
#ADD_EXECUTABLE(testPush testPush.cpp)
|
||||||
ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
||||||
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
|
||||||
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
|
||||||
|
|
||||||
@ -39,8 +38,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
|
|||||||
#TARGET_LINK_LIBRARIES(testFFT dks)
|
#TARGET_LINK_LIBRARIES(testFFT dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMIC dks)
|
#TARGET_LINK_LIBRARIES(testMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFT3D dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFT3D dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
|
||||||
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
|
||||||
@ -54,11 +53,10 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${CLFFT_LIBRARIES})
|
|||||||
#TARGET_LINK_LIBRARIES(testGather dks)
|
#TARGET_LINK_LIBRARIES(testGather dks)
|
||||||
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
|
||||||
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
#TARGET_LINK_LIBRARIES(testTranspose dks)
|
||||||
TARGET_LINK_LIBRARIES(testRandom dks ${CLFFT_LIBRARIES})
|
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
|
||||||
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${CLFFT_LIBRARIES})
|
|
||||||
#TARGET_LINK_LIBRARIES(testPush dks)
|
#TARGET_LINK_LIBRARIES(testPush dks)
|
||||||
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
|
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
|
||||||
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
#TARGET_LINK_LIBRARIES(testIntegration dks)
|
||||||
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
|
||||||
|
|
||||||
@ -83,4 +81,4 @@ TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${CLFFT_LIBRARIES})
|
|||||||
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
||||||
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
|
||||||
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
|
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
|
||||||
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")
|
@ -129,9 +129,7 @@ int main(int argc, char *argv[]) {
|
|||||||
//init random
|
//init random
|
||||||
base.callInitRandoms(numpart);
|
base.callInitRandoms(numpart);
|
||||||
|
|
||||||
|
|
||||||
//**test collimator physics and sort***//
|
//**test collimator physics and sort***//
|
||||||
|
|
||||||
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
|
||||||
|
|
||||||
//allocate memory for particles
|
//allocate memory for particles
|
||||||
@ -212,8 +210,8 @@ int main(int argc, char *argv[]) {
|
|||||||
base.freeMemory<double>(pz_ptr, numpart);
|
base.freeMemory<double>(pz_ptr, numpart);
|
||||||
|
|
||||||
base.freeMemory<double>(param_ptr, 12);
|
base.freeMemory<double>(param_ptr, 12);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
std::cout << std::fixed << std::setprecision(4);
|
std::cout << std::fixed << std::setprecision(4);
|
||||||
for (int i = 0; i < 10; i++) {
|
for (int i = 0; i < 10; i++) {
|
||||||
std::cout << p.label[i] << "\t" << p.rx[i]
|
std::cout << p.label[i] << "\t" << p.rx[i]
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -19,30 +18,22 @@ int main(int argc, char *argv[]) {
|
|||||||
int N = 16;
|
int N = 16;
|
||||||
char *api_name = new char[10];
|
char *api_name = new char[10];
|
||||||
char *device_name = new char[10];
|
char *device_name = new char[10];
|
||||||
|
if (argc == 2) {
|
||||||
for (int i = 1; i < argc; i++) {
|
N = atoi(argv[1]);
|
||||||
if (argv[i] == string("-cuda")) {
|
strcpy(api_name, "Cuda");
|
||||||
strcpy(api_name, "Cuda");
|
strcpy(device_name, "-gpu");
|
||||||
strcpy(device_name, "-gpu");
|
} else if (argc == 3) {
|
||||||
}
|
N = atoi(argv[1]);
|
||||||
|
strcpy(api_name, argv[2]);
|
||||||
if (argv[i] == string("-opencl")) {
|
strcpy(device_name, "-gpu");
|
||||||
strcpy(api_name, "OpenCL");
|
} else if (argc == 4) {
|
||||||
strcpy(device_name, "-gpu");
|
N = atoi(argv[1]);
|
||||||
}
|
strcpy(api_name, argv[2]);
|
||||||
|
strcpy(device_name, argv[3]);
|
||||||
if (argv[i] == string("-mic")) {
|
} else {
|
||||||
strcpy(api_name, "OpenMP");
|
N = 16;
|
||||||
strcpy(device_name, "-mic");
|
strcpy(api_name, "OpenCL");
|
||||||
}
|
strcpy(device_name, "-gpu");
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-cpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-N"))
|
|
||||||
N = atoi(argv[i+1]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
cout << "Use api: " << api_name << ", " << device_name << endl;
|
||||||
@ -83,16 +74,9 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* write data to device */
|
/* write data to device */
|
||||||
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
|
||||||
if (N < 5)
|
|
||||||
printData3DN4(cdata, N, 3);
|
|
||||||
|
|
||||||
|
|
||||||
/* execute fft */
|
/* execute fft */
|
||||||
base.callFFT(mem_ptr, 3, dimsize);
|
base.callFFT(mem_ptr, 3, dimsize);
|
||||||
if (N < 5) {
|
|
||||||
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
|
|
||||||
printData3DN4(cfft, N, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* execute ifft */
|
/* execute ifft */
|
||||||
base.callIFFT(mem_ptr, 3, dimsize);
|
base.callIFFT(mem_ptr, 3, dimsize);
|
||||||
@ -102,9 +86,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
/* read data from device */
|
/* read data from device */
|
||||||
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
|
||||||
if (N < 5)
|
|
||||||
printData3DN4(cifft, N, 3);
|
|
||||||
|
|
||||||
/* free device memory */
|
/* free device memory */
|
||||||
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
|
||||||
|
|
||||||
@ -148,7 +130,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
|
|||||||
if (a < 10e-5 && a > -10e-5)
|
if (a < 10e-5 && a > -10e-5)
|
||||||
a = 0;
|
a = 0;
|
||||||
|
|
||||||
cout << "(" << d << "," << a << ") ";
|
cout << d << "; " << a << "\t";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
@ -175,5 +157,3 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
|
|||||||
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
#include <fstream>
|
|
||||||
#include <iomanip>
|
|
||||||
|
|
||||||
#include "Utility/TimeStamp.h"
|
#include "Utility/TimeStamp.h"
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -10,53 +8,54 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
|
||||||
void initData(double *data, int dimsize[3], int dim);
|
void initData(double *data, int dimsize[3]);
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
|
||||||
char *api_name, char *device_name, char *file_name);
|
|
||||||
void printHelp();
|
void printHelp();
|
||||||
|
|
||||||
void printData3DN4(complex<double>* &data, int N, int dim);
|
|
||||||
void printData3DN4(double* &data, int N, int dim);
|
|
||||||
|
|
||||||
double precision(double a) {
|
|
||||||
//if (a < 1e-10)
|
|
||||||
// return 0.0;
|
|
||||||
//else
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
int N1 = 8;
|
int N1 = 8;
|
||||||
int N2 = 8;
|
int N2 = 8;
|
||||||
int N3 = 8;
|
int N3 = 8;
|
||||||
int dim = 3;
|
int dim = 3;
|
||||||
int loop = 0;
|
int loop = 10;
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
char *file_name = new char[50];
|
|
||||||
|
|
||||||
if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
|
if ( readParams(argc, argv, N1, N2, N3, loop) )
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
int dimsize[3] = {N3, N2, N1};
|
||||||
|
|
||||||
int dimsize[3] = {N1, N2, N3};
|
|
||||||
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
|
||||||
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
|
||||||
|
|
||||||
double *rdata = new double[sizereal];
|
double *rdata = new double[sizereal];
|
||||||
double *outdata = new double[sizereal];
|
double *outdata = new double[sizereal];
|
||||||
complex<double> *cfft = new complex<double>[sizecomp];
|
complex<double> *cfft = new complex<double>[sizecomp];
|
||||||
initData(rdata, dimsize, dim);
|
|
||||||
|
for (int i=0; i<sizecomp; ++i) {
|
||||||
|
cfft[i].real() = 7.;
|
||||||
|
cfft[i].imag() = 3.33;
|
||||||
|
}
|
||||||
|
initData(rdata, dimsize);
|
||||||
|
|
||||||
/* init DKSBase */
|
/* init DKSBase */
|
||||||
cout << "Init device and set function" << endl;
|
cout << "Init device and set function" << endl;
|
||||||
|
#ifdef DKS_MIC
|
||||||
DKSBase base;
|
DKSBase base;
|
||||||
base.setAPI(api_name, strlen(api_name));
|
base.setAPI("OpenMP", 6);
|
||||||
base.setDevice(device_name, strlen(device_name));
|
base.setDevice("-mic", 4);
|
||||||
|
base.initDevice();
|
||||||
|
base.setupFFTRC(dim, dimsize);
|
||||||
|
/* setup backward fft (COMPLEX->REAL) */
|
||||||
|
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef DKS_CUDA
|
||||||
|
DKSBase base;
|
||||||
|
base.setAPI("Cuda", 4);
|
||||||
|
base.setDevice("-gpu", 4);
|
||||||
base.initDevice();
|
base.initDevice();
|
||||||
base.setupFFT(dim, dimsize);
|
base.setupFFT(dim, dimsize);
|
||||||
|
#endif
|
||||||
|
|
||||||
// allocate memory on device
|
// allocate memory on device
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -68,59 +67,69 @@ int main(int argc, char *argv[]) {
|
|||||||
// execute one run before starting the timers
|
// execute one run before starting the timers
|
||||||
base.writeData<double>(real_ptr, rdata, sizereal);
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
|
|
||||||
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
|
||||||
base.readData<double>(real_res_ptr, outdata, sizereal);
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
|
//timer for total loop time, FFT and IFFT calls
|
||||||
|
struct timeval timeStart, timeEnd;
|
||||||
|
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
|
||||||
|
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
|
||||||
|
|
||||||
|
gettimeofday(&timeStart, NULL);
|
||||||
|
for (int i=0; i<loop; ++i){
|
||||||
|
|
||||||
|
// write data to device
|
||||||
|
base.writeData<double>(real_ptr, rdata, sizereal);
|
||||||
|
|
||||||
|
// execute rcfft
|
||||||
|
gettimeofday(&timeFFTStart[i], NULL);
|
||||||
|
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
|
||||||
|
gettimeofday(&timeFFTEnd[i], NULL);
|
||||||
|
|
||||||
|
// execute crfft
|
||||||
|
gettimeofday(&timeIFFTStart[i], NULL);
|
||||||
|
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
|
||||||
|
gettimeofday(&timeIFFTEnd[i], NULL);
|
||||||
|
|
||||||
|
//normalize
|
||||||
|
#ifdef DKS_CUDA
|
||||||
|
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// read IFFT data from device
|
||||||
|
base.readData<double>(real_res_ptr, outdata, sizereal);
|
||||||
|
|
||||||
|
|
||||||
ofstream myfile;
|
|
||||||
myfile.open(file_name);
|
|
||||||
myfile<< "in\tout\treal\timag\n";
|
|
||||||
for (int i = 0; i < sizereal; i++) {
|
|
||||||
//myfile << precision(rdata[i]) << "\t";
|
|
||||||
//myfile << precision(outdata[i]) << "\t";
|
|
||||||
if (i < sizecomp) {
|
|
||||||
myfile << precision(cfft[i].real()) << "\t";
|
|
||||||
myfile << precision(cfft[i].imag());
|
|
||||||
}
|
|
||||||
myfile << "\n";
|
|
||||||
}
|
}
|
||||||
myfile.close();
|
gettimeofday(&timeEnd, NULL);
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
if (dim == 2) {
|
|
||||||
for (int i = 0; i < N2; i++) {
|
|
||||||
for (int j = 0; j < N1; j++) {
|
|
||||||
cout << rdata[i*N1 + j] << " ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (dim == 2) {
|
|
||||||
for (int i = 0; i < N2; i++) {
|
|
||||||
for (int j = 0; j < N1 / 2 + 1; j++) {
|
|
||||||
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
// free device memory
|
// free device memory
|
||||||
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
|
||||||
base.freeMemory<double>(real_ptr, sizereal);
|
base.freeMemory<double>(real_ptr, sizereal);
|
||||||
base.freeMemory<double>(real_res_ptr, sizereal);
|
base.freeMemory<double>(real_res_ptr, sizereal);
|
||||||
|
|
||||||
// compare in and out data to see if we get back the same results
|
// compare in and out data to see if we get back the same results
|
||||||
cout << "comp" << endl;
|
|
||||||
compareData(rdata, outdata, N1, N2, N3, dim);
|
compareData(rdata, outdata, N1, N2, N3, dim);
|
||||||
cout << "done" << endl;
|
|
||||||
|
//calculate seconds for total time and fft times
|
||||||
|
double tfft = 0;
|
||||||
|
double tifft = 0;
|
||||||
|
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
|
||||||
|
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
|
||||||
|
|
||||||
|
for (int i = 0; i < loop; i++) {
|
||||||
|
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
|
||||||
|
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
|
||||||
|
|
||||||
|
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
|
||||||
|
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
|
||||||
|
}
|
||||||
|
|
||||||
|
//print timing results
|
||||||
|
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
|
||||||
|
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
|
||||||
|
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
|
||||||
|
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
|
||||||
|
<< "\n\n";
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -128,10 +137,10 @@ int main(int argc, char *argv[]) {
|
|||||||
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
|
||||||
int id;
|
int id;
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < NK; i++) {
|
for (int i = 0; i < NI; i++) {
|
||||||
for (int j = 0; j < NJ; j++) {
|
for (int j = 0; j < NJ; j++) {
|
||||||
for (int k = 0; k < NI; k++) {
|
for (int k = 0; k < NK; k++) {
|
||||||
id = i*NI*NJ + j*NI + k;
|
id = k*NI*NJ + j*NI + i;
|
||||||
sum += fabs(data1[id] - data2[id]);
|
sum += fabs(data1[id] - data2[id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -139,21 +148,13 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
|
|||||||
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
std::cout << "RC <--> CR diff: " << sum << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int dimsize[3], int dim) {
|
void initData(double *data, int dimsize[3]) {
|
||||||
if (dim == 3) {
|
for (int i = 0; i < dimsize[2]; i++) {
|
||||||
for (int i = 0; i < dimsize[2]; i++)
|
|
||||||
for (int j = 0; j < dimsize[1]; j++)
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
|
|
||||||
} else if (dim == 2) {
|
|
||||||
for (int j = 0; j < dimsize[1]; j++) {
|
for (int j = 0; j < dimsize[1]; j++) {
|
||||||
for (int k = 0; k < dimsize[0]; k++) {
|
for (int k = 0; k < dimsize[0]; k++) {
|
||||||
data[j*dimsize[0] + k] = sin(k);
|
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
for (int k = 0; k < dimsize[0]; k++)
|
|
||||||
data[k] = sin(k);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,17 +173,10 @@ void printHelp() {
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
|
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
|
||||||
char *api_name, char *device_name, char *file_name)
|
|
||||||
{
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
|
|
||||||
if ( argv[i] == std::string("-dim")) {
|
|
||||||
dim = atoi(argv[i + 1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( argv[i] == std::string("-grid") ) {
|
if ( argv[i] == std::string("-grid") ) {
|
||||||
N1 = atoi(argv[i + 1]);
|
N1 = atoi(argv[i + 1]);
|
||||||
N2 = atoi(argv[i + 2]);
|
N2 = atoi(argv[i + 2]);
|
||||||
@ -199,72 +193,7 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, in
|
|||||||
printHelp();
|
printHelp();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv[i] == string("-cuda")) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
strcpy(file_name, "cuda_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
strcpy(file_name, "opencl_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
|
||||||
strcpy(api_name, "OpenMP");
|
|
||||||
strcpy(device_name, "-mic");
|
|
||||||
strcpy(file_name, "openmp_fft.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-cpu");
|
|
||||||
strcpy(file_name, "opencl_cpu_fft.dat");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void printData3DN4(complex<double>* &data, int N, int dim) {
|
|
||||||
|
|
||||||
for (int j = 0; j < N; j++) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
for (int k = 0; k < N/2 + 1; k++) {
|
|
||||||
double d = data[i*N*N + j*N + k].real();
|
|
||||||
double a = data[i*N*N + j*N + k].imag();
|
|
||||||
|
|
||||||
if (d < 10e-5 && d > -10e-5)
|
|
||||||
d = 0;
|
|
||||||
if (a < 10e-5 && a > -10e-5)
|
|
||||||
a = 0;
|
|
||||||
|
|
||||||
cout << "(" << d << "," << a << ") ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void printData3DN4(double* &data, int N, int dim) {
|
|
||||||
|
|
||||||
for (int j = 0; j < N; j++) {
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
for (int k = 0; k < N; k++) {
|
|
||||||
double d = data[i*N*N + j*N + k];
|
|
||||||
|
|
||||||
if (d < 10e-5 && d > -10e-5)
|
|
||||||
d = 0;
|
|
||||||
|
|
||||||
cout << d << " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
//#include <mpi.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "DKSBase.h"
|
#include "DKSBase.h"
|
||||||
@ -10,265 +11,309 @@ using namespace std;
|
|||||||
|
|
||||||
|
|
||||||
void printData3D(double* data, int N, int NI, const char *message = "") {
|
void printData3D(double* data, int N, int NI, const char *message = "") {
|
||||||
if (strcmp(message, "") != 0)
|
if (strcmp(message, "") != 0)
|
||||||
cout << message;
|
cout << message;
|
||||||
|
|
||||||
for (int i = 0; i < NI; i++) {
|
for (int i = 0; i < NI; i++) {
|
||||||
for (int j = 0; j < N; j++) {
|
for (int j = 0; j < N; j++) {
|
||||||
for (int k = 0; k < N; k++) {
|
for (int k = 0; k < N; k++) {
|
||||||
cout << data[i*N*N + j*N + k] << "\t";
|
cout << data[i*N*N + j*N + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData(double *data, int N) {
|
void initData(double *data, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N/4 + 1; i++) {
|
for (int i = 0; i < N/4 + 1; i++) {
|
||||||
for (int j = 0; j < N/2 + 1; j++) {
|
for (int j = 0; j < N/2 + 1; j++) {
|
||||||
for (int k = 0; k < N/2 + 1; k++) {
|
for (int k = 0; k < N/2 + 1; k++) {
|
||||||
data[i*N*N + j*N + k] = k+1;
|
data[i*N*N + j*N + k] = k+1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void initData2(double *data, int N) {
|
void initData2(double *data, int N) {
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
data[i] = i;
|
data[i] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initComplex( complex<double> *d, int N) {
|
void initComplex( complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
d[i] = complex<double>(2, 0);
|
d[i] = complex<double>(2, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printComplex(complex<double> *d, int N) {
|
void printComplex(complex<double> *d, int N) {
|
||||||
|
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
cout << d[i] << "\t";
|
cout << d[i] << "\t";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void printDouble(double *d, int N) {
|
|
||||||
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
cout << d[i] << ", ";
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void initMirror(double *data, int n1, int n2, int n3) {
|
void initMirror(double *data, int n1, int n2, int n3) {
|
||||||
int d = 1;
|
int d = 1;
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
|
||||||
data[i * n2 * n1 + j * n1 + k] = d++;
|
data[i * n2 * n1 + j * n1 + k] = d++;
|
||||||
else
|
else
|
||||||
data[i * n2 * n1 + j * n1 + k] = 0;
|
data[i * n2 * n1 + j * n1 + k] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void printDiv(int c) {
|
void printDiv(int c) {
|
||||||
for (int i = 0; i < c; i++)
|
for (int i = 0; i < c; i++)
|
||||||
cout << "-";
|
cout << "-";
|
||||||
cout << endl;
|
cout << endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void printMirror(double *data, int n1, int n2, int n3) {
|
void printMirror(double *data, int n1, int n2, int n3) {
|
||||||
|
|
||||||
printDiv(75);
|
printDiv(75);
|
||||||
for (int i = 0; i < n3; i++) {
|
for (int i = 0; i < n3; i++) {
|
||||||
for (int j = 0; j < n2; j++) {
|
for (int j = 0; j < n2; j++) {
|
||||||
for (int k = 0; k < n1; k++) {
|
for (int k = 0; k < n1; k++) {
|
||||||
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
double sumData(double *data, int datasize) {
|
double sumData(double *data, int datasize) {
|
||||||
|
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < datasize; i++)
|
for (int i = 0; i < datasize; i++)
|
||||||
sum += data[i];
|
sum += data[i];
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
char *api_name = new char[10];
|
/* mpi init */
|
||||||
char *device_name = new char[10];
|
//int rank, nprocs;
|
||||||
|
//MPI_Init(&argc, &argv);
|
||||||
|
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
/*
|
||||||
if (argv[i] == string("-cuda")) {
|
if (nprocs != 8) {
|
||||||
strcpy(api_name, "Cuda");
|
cout << "example was set to run with 8 processes" << endl;
|
||||||
strcpy(device_name, "-gpu");
|
cout << "exit..." << endl;
|
||||||
}
|
return 0;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
/* set domain size */
|
||||||
strcpy(api_name, "OpenCL");
|
int NG[3] = {64, 64, 32};
|
||||||
strcpy(device_name, "-gpu");
|
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
||||||
}
|
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
||||||
|
int sizerho = NG[0] * NG[1] * NG[2];
|
||||||
|
int sizegreen = ng[0] * ng[1] * ng[2];
|
||||||
|
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
||||||
|
int id[3];
|
||||||
|
|
||||||
if (argv[i] == string("-mic")) {
|
//id[0] = 0;
|
||||||
strcpy(api_name, "OpenMP");
|
//id[1] = NL[1] * (rank % 4);
|
||||||
strcpy(device_name, "-mic");
|
//id[2] = NL[2] * (rank / 4);
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-cpu")) {
|
/* print some messages bout the example in the begginig */
|
||||||
strcpy(api_name, "OpenCL");
|
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
||||||
strcpy(device_name, "-cpu");
|
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
|
||||||
}
|
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
||||||
}
|
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
|
||||||
|
int tmp[3];
|
||||||
|
/* for (int p = 1; p < nprocs; p++) {
|
||||||
|
MPI_Status mpistatus;
|
||||||
|
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
|
||||||
|
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
|
||||||
|
}*/
|
||||||
|
// } else {
|
||||||
|
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
|
||||||
|
// }
|
||||||
|
|
||||||
cout << "Use api: " << api_name << ", " << device_name << endl;
|
/* dks init and create 2 streams */
|
||||||
|
int dkserr;
|
||||||
|
//int streamGreens, streamFFT;
|
||||||
|
#ifdef DKS_MIC
|
||||||
|
DKSBase base;
|
||||||
|
base.setAPI("OpenMP", 6);
|
||||||
|
base.setDevice("-mic", 4);
|
||||||
|
base.initDevice();
|
||||||
|
#endif
|
||||||
|
|
||||||
/* set domain size */
|
#ifdef DKS_CUDA
|
||||||
int NG[3] = {64, 64, 32};
|
DKSBase base;
|
||||||
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
|
base.setAPI("Cuda", 4);
|
||||||
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
|
base.setDevice("-gpu", 4);
|
||||||
int sizerho = NG[0] * NG[1] * NG[2];
|
base.initDevice();
|
||||||
int sizegreen = ng[0] * ng[1] * ng[2];
|
#endif
|
||||||
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
|
|
||||||
|
|
||||||
/* print some messages bout the example in the begginig */
|
//base.createStream(streamFFT);
|
||||||
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
|
//if (rank == 0) {
|
||||||
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
|
// base.createStream(streamGreens);
|
||||||
|
base.setupFFT(3, NG);
|
||||||
|
//}
|
||||||
|
|
||||||
/* dks init and create 2 streams */
|
/* allocate memory and init rho field */
|
||||||
int dkserr;
|
double *rho = new double[sizerho];
|
||||||
DKSBase base;
|
double *rho_out = new double[sizerho];
|
||||||
base.setAPI(api_name, strlen(api_name));
|
//double *green_out = new double[sizegreen];
|
||||||
base.setDevice(device_name, strlen(device_name));
|
initMirror(rho, NL[0], NL[1], NL[2]);
|
||||||
base.initDevice();
|
|
||||||
base.setupFFT(3, NG);
|
|
||||||
|
|
||||||
/* allocate memory and init rho field */
|
/*
|
||||||
double *rho = new double[sizerho];
|
allocate memory on device for
|
||||||
double *rho_out = new double[sizerho];
|
- rho field
|
||||||
//double *green_out = new double[sizegreen];
|
- rho FFT
|
||||||
double *mirror_out = new double[sizerho];
|
- tmpgreen
|
||||||
//initMirror(rho, NL[0], NL[1], NL[2]);
|
- greens integral
|
||||||
initMirror(rho, NG[0], NG[1], NG[2]);
|
- greens integral FFT
|
||||||
|
*/
|
||||||
|
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
||||||
|
// if (rank == 0) {
|
||||||
|
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
||||||
|
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
||||||
|
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
||||||
|
/* } else {
|
||||||
|
grntr_ptr = NULL;
|
||||||
|
rho2_ptr = NULL;
|
||||||
|
grn_ptr = NULL;
|
||||||
|
rho2tr_ptr = NULL;
|
||||||
|
tmpgreen_ptr = NULL;
|
||||||
|
}*/
|
||||||
|
|
||||||
/*
|
|
||||||
allocate memory on device for
|
|
||||||
- rho field
|
|
||||||
- rho FFT
|
|
||||||
- tmpgreen
|
|
||||||
- greens integral
|
|
||||||
- greens integral FFT
|
|
||||||
*/
|
|
||||||
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
|
|
||||||
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
|
|
||||||
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
|
|
||||||
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
|
|
||||||
|
|
||||||
/* =================================================*/
|
/* send and receive pointer to allocated memory on device */
|
||||||
/* =================================================*/
|
/*
|
||||||
/* =====loop trough fftpoison solver iterations=====*/
|
if (rank == 0) {
|
||||||
/* =================================================*/
|
for (int p = 1; p < nprocs; p++)
|
||||||
/* =================================================*/
|
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
|
||||||
|
} else {
|
||||||
|
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
|
||||||
|
}
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
*/
|
||||||
|
|
||||||
double old_sum = 0;
|
|
||||||
|
|
||||||
int hr_m[3] = {1, 1, 1};
|
/* =================================================*/
|
||||||
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
|
/* =================================================*/
|
||||||
|
/* =====loop trough fftpoison solver iterations=====*/
|
||||||
|
/* =================================================*/
|
||||||
|
/* =================================================*/
|
||||||
|
|
||||||
/* calculate greens integral on gpu */
|
double old_sum = 0;
|
||||||
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
double tmp_sum = 0;
|
||||||
|
for (int l = 0; l < 100; l++) {
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
/* on node 0, calculate tmpgreen on gpu */
|
||||||
|
int hr_m[3] = {1, 1, 1};
|
||||||
|
//if (rank == 0)
|
||||||
|
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
|
||||||
|
hr_m[0], hr_m[1], hr_m[2]);
|
||||||
|
|
||||||
/* mirror the field */
|
/* calculate greens integral on gpu */
|
||||||
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
//if (rank == 0)
|
||||||
/*
|
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
|
||||||
base.readData<double>(grn_ptr, mirror_out, sizerho);
|
|
||||||
for (int i = 0; i < sizerho; i++)
|
|
||||||
cout << mirror_out[i] << " ";
|
|
||||||
cout << endl << endl;
|
|
||||||
|
|
||||||
for (int i = 0; i < sizerho; i++)
|
/* mirror the field */
|
||||||
cout << rho[i] << " ";
|
//if (rank == 0)
|
||||||
cout << endl << endl;
|
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
|
||||||
*/
|
|
||||||
/* transfer rho field to device */
|
|
||||||
base.writeData<double>(rho2_ptr, rho, sizerho);
|
|
||||||
|
|
||||||
/* get FFT of rho field */
|
|
||||||
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
|
||||||
|
|
||||||
/* get FFT of mirrored greens integral */
|
/* get FFT of mirrored greens integral */
|
||||||
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
//if (rank == 0)
|
||||||
|
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
|
||||||
|
|
||||||
/* multiply both FFTs */
|
/* transfer rho field to device */
|
||||||
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
|
||||||
|
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
/*
|
/* get FFT of rho field */
|
||||||
complex<double> *crho = new complex<double>[sizecomp];
|
//if (rank == 0) {
|
||||||
complex<double> *cgre = new complex<double>[sizecomp];
|
//base.syncDevice();
|
||||||
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
|
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
|
||||||
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
|
//}
|
||||||
|
|
||||||
for (int i = 0; i < sizecomp; i++)
|
/* multiply both FFTs */
|
||||||
cout << cgre[i].real() << " ";
|
//if (rank == 0)
|
||||||
cout << endl << endl;
|
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
for (int i = 0; i < sizecomp; i++)
|
/* inverse fft and transfer data back */
|
||||||
cout << crho[i].real() << " ";
|
/*
|
||||||
cout << endl << endl;
|
multiple device syncs and mpi barriers are used to make sure data
|
||||||
|
transfer is started when results are ready and progam moves on
|
||||||
delete[] crho;
|
only when data transfer is finished
|
||||||
delete[] cgre;
|
*/
|
||||||
*/
|
//if (rank == 0) {
|
||||||
|
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
||||||
|
//base.syncDevice();
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||||
|
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//base.syncDevice();
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
//cout << "result: " << sumData(rho_out, sizerho) << endl;
|
||||||
|
if (l == 0) {
|
||||||
|
old_sum = sumData(rho_out, sizerho);
|
||||||
|
} else {
|
||||||
|
tmp_sum = sumData(rho_out, sizerho);
|
||||||
|
if (old_sum != tmp_sum) {
|
||||||
|
cout << "diff in iteration: " << l << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*} else {
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
/* inverse fft and transfer data back */
|
|
||||||
/*
|
|
||||||
multiple device syncs and mpi barriers are used to make sure data
|
|
||||||
transfer is started when results are ready and progam moves on
|
|
||||||
only when data transfer is finished
|
|
||||||
*/
|
|
||||||
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
|
|
||||||
|
|
||||||
base.readData<double> (rho2_ptr, rho_out, sizerho);
|
|
||||||
|
|
||||||
for (int i = 0; i < 10; i++)
|
|
||||||
cout << rho_out[i] << " ";
|
|
||||||
cout << endl;
|
|
||||||
|
|
||||||
old_sum = sumData(rho_out, sizerho);
|
|
||||||
|
|
||||||
|
}
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* ==========end fftpoison solver test run==========*/
|
/* ==========end fftpoison solver test run==========*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
/* =================================================*/
|
/* =================================================*/
|
||||||
|
|
||||||
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
|
||||||
base.freeMemory<double>(grn_ptr, sizerho);
|
|
||||||
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
|
||||||
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
|
||||||
base.freeMemory<double>(rho2_ptr, sizerho);
|
|
||||||
|
|
||||||
delete[] rho_out;
|
|
||||||
delete[] rho;
|
/* free memory on device */
|
||||||
delete[] mirror_out;
|
//if (rank == 0) {
|
||||||
cout << "Final sum: " << old_sum << endl;
|
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
|
||||||
|
base.freeMemory<double>(grn_ptr, sizerho);
|
||||||
|
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
|
||||||
|
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
|
||||||
|
//MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
base.freeMemory<double>(rho2_ptr, sizerho);
|
||||||
|
cout << "Final sum: " << old_sum << endl;
|
||||||
|
/*} else {
|
||||||
|
base.closeHandle(rho2_ptr);
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
}*/
|
||||||
|
|
||||||
|
//MPI_Finalize();
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,81 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sys/time.h>
|
|
||||||
|
|
||||||
#include "DKSBase.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
|
|
||||||
int size = 10;
|
|
||||||
bool apiSet = false;
|
|
||||||
char *api_name = new char[10];
|
|
||||||
char *device_name = new char[10];
|
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
|
||||||
|
|
||||||
if (argv[i] == string("-cuda")) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
apiSet = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-opencl")) {
|
|
||||||
strcpy(api_name, "OpenCL");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
apiSet = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argv[i] == string("-N")) {
|
|
||||||
size = atoi(argv[i+1]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!apiSet) {
|
|
||||||
strcpy(api_name, "Cuda");
|
|
||||||
strcpy(device_name, "-gpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
cout << "=========================BEGIN TEST=========================" << endl;
|
|
||||||
cout << "Use api: " << api_name << "\t" << device_name << endl;
|
|
||||||
cout << "Number of randoms: " << size << endl;
|
|
||||||
|
|
||||||
//init dks
|
|
||||||
int ierr;
|
|
||||||
DKSBase base;
|
|
||||||
base.setAPI(api_name, strlen(api_name));
|
|
||||||
base.setDevice(device_name, strlen(api_name));
|
|
||||||
base.initDevice();
|
|
||||||
base.callInitRandoms(size);
|
|
||||||
|
|
||||||
//create host vector to store results
|
|
||||||
double *host_data = new double[size];
|
|
||||||
|
|
||||||
//create device vector
|
|
||||||
void *device_data = base.allocateMemory<double>(size, ierr);
|
|
||||||
|
|
||||||
for (int i = 0; i < 5; i++) {
|
|
||||||
//fill device vector with random values
|
|
||||||
base.callCreateRandomNumbers(device_data, size);
|
|
||||||
|
|
||||||
//read device vector
|
|
||||||
base.readData<double>(device_data, host_data, size);
|
|
||||||
|
|
||||||
//print host data
|
|
||||||
for (int i = 0; i < size; i++)
|
|
||||||
cout << host_data[i] << " ";
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
//free device vector
|
|
||||||
base.freeMemory<double>(device_data, size);
|
|
||||||
|
|
||||||
//free host data
|
|
||||||
delete[] host_data;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
Reference in New Issue
Block a user