22 Commits

Author SHA1 Message Date
432d2f5e4e Correct patch and minor versions set when installing dks 2017-04-05 17:28:13 +02:00
6e57cf5580 DKSConfigVersion file provided to allow cmake find_package to determine if compatible DKS version is installed 2017-03-15 15:18:58 +01:00
9e9a20c4af add tests for collimator physics and kick/push 2017-03-15 10:02:00 +01:00
9b9910e9f9 add seed to random number initialization 2017-03-15 10:01:28 +01:00
244ad0b230 OPALs greens function added to algorithms 2017-03-15 10:00:40 +01:00
9734157ad8 push and kick for OPALs BorisPusher 2017-03-15 09:59:33 +01:00
24f9c9dbf4 allow to enalbe wich parts of DKS to compile trough cmake flags 2017-02-28 15:11:59 +01:00
47fe8e8e52 allow to enalbe wich parts of DKS to compile trough cmake flags 2017-02-28 15:07:22 +01:00
eee9dfd89e seperate OPAL DKS functions from base 2017-02-28 15:06:45 +01:00
7c7c2e240b dksbase use Algoruthms base class for fft, colimator physics and greens function 2017-02-15 09:00:55 +01:00
e9d411235c merge dks-1.0.0-branch (fix conflicts) 2017-02-07 09:57:04 +01:00
c4e7029b5d libcuda loaded from stubs 2017-01-16 15:25:31 +01:00
8750ea06e5 remove duplicate flags for c++ and cuda compilers 2017-01-16 11:10:22 +01:00
69a9f7b9fb remove rpath in cmake when AMD OpenCL is used - fixes segfaults in OpenCL. Write compiler flags and cmake options in DKSConfig.cmake.in file to be used by projects using DKS 2017-01-10 16:06:00 +01:00
b5c5da29b2 add function to generate list of random numbers with cuda and opencl on the device 2016-12-09 13:43:09 +01:00
3a74d6cdee init randoms - each thread gets same seed but different sequence 2016-11-28 16:23:29 +01:00
7abc9fdbd3 cuda random number initialization - same seed different sequence for each state 2016-11-23 21:15:20 +01:00
b3a12e02a8 OpenCL FFT using clfft and tests 2016-11-21 16:14:11 +01:00
4432d32480 OpenCL greens function calculation for OPAL 2016-11-17 18:03:28 +01:00
63a008d111 Greens function calculation for OPAL rewriten with abstract base class 2016-11-17 18:02:48 +01:00
87cdf52f07 Collimator physics for MIC fix 2016-11-16 18:12:17 +01:00
027bdc01f5 FFT for OpenCL using clFFT library 2016-11-16 18:12:00 +01:00
46 changed files with 2610 additions and 1421 deletions

View File

@ -1,15 +1,14 @@
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS)
SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 0.1)
SET (DKS_VERSION_MINOR 1)
SET (DKS_VERSION_PATCH 0)
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locagoons.uldis@psi.ch\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
SET (PACKAGE_NAME \"DKS\")
SET (PACKAGE_STRING \"DKS\ 1.0.1\")
SET (PACKAGE_TARNAME \"dks\")
SET (PACKAGE_VERSION \"1.0.1\")
SET (VERSION \"1.0.1\")
SET (DKS_VERSION_STR "\"${DKS_VERSION}\"")
SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
#get compiler name
@ -38,10 +37,44 @@ IF (Boost_FOUND)
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND)
#include OPAL, musrfit or pet kernels
OPTION(DKS_FULL "Compile DKS with full library" OFF)
OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
IF (DKS_FULL)
SET(ENABLE_OPAL ON)
SET(ENABLE_MUSR ON)
SET(ENABLE_PET ON)
ENDIF(DKS_FULL)
#find clFFT
OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
IF (ENABLE_AMD)
SET (clFFT_USE_STATIC_LIBS OFF)
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
#find clRNG
#SET (clRNG_USE_STATIC_LIBS OFF)
#FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
#MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
#MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
#INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
#LINK_DIRECTORIES (${CLRNG_LIBRARIES})
#find_package(PkgConfig)
#pkg_check_modules(clRng REQUIRED)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
ENDIF (ENABLE_AMD)
#enable UQTK
OPTION (USE_UQTK "Use UQTK" OFF)
#intel icpc compiler specific flags
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
@ -91,14 +124,16 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
SET (USE_CUDA ON)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}")
MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64")
MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -DDEBUG -std=c++11 -D__wsu")
SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")
#if cuda version >= 7.0 add runtime commpilation flags
@ -123,6 +158,7 @@ IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "
MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
MESSAGE(STATUS "OpenCL library dir: ${OpenCL_LIBRARY}")
SET(CMAKE_SKIP_RPATH TRUE)
INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR})
LINK_DIRECTORIES(${OpenCL_LIBRARY})
ENDIF (OpenCL_FOUND)
@ -166,9 +202,18 @@ ADD_SUBDIRECTORY (auto-tuning)
CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake )
CONFIGURE_FILE (${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion_install.cmake @ONLY)
### install files ###
INSTALL (
FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
RENAME ${PROJECT_NAME}Config.cmake
)
INSTALL (
FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion_install.cmake
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
RENAME ${PROJECT_NAME}ConfigVersion.cmake
)

View File

@ -2,18 +2,30 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#chi square kernel tests
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
IF (ENABLE_MUSR)
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF (ENABLE_MUSR)
IF (ENABLE_OPAL)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testPushKick testPushKick.cpp)
TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF(ENABLE_OPAL)
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})

View File

@ -0,0 +1,161 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
typedef struct {
int label;
unsigned localID;
double Rincol[3];
double Pincol[3];
} PART;
PART initPartSmall(int d) {
PART p;
p.label = 0;
p.localID = d;
p.Rincol[0] = 0.0;
p.Rincol[1] = 0.0;
p.Rincol[2] = 0.02;
p.Pincol[0] = 0.0;
p.Pincol[1] = 0.0;
p.Pincol[2] = 3.9920183237269791e-01;
return p;
}
void printPart(PART p) {
std::cout << "label: " << p.label << ", ";
std::cout << "localid: " << p.localID << ",";
std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
std::cout << std::endl;
}
void initParts(PART *p, int N) {
for (int i = 0; i < N; i++)
p[i] = initPartSmall(i);
}
void printParts(PART *p, int N) {
for (int i = 0; i < N; i++)
printPart(p[i]);
std::cout << std::endl;
}
void initParams(double *data) {
data[0] = 0.0;//2.0000000000000000e-02;
data[1] = 1.0;//1.0000000000000000e-02;
data[2] = 2.2100000000000000e+00;
data[3] = 6.0000000000000000e+00;
data[4] = 1.2010700000000000e+01;
data[5] = 2.6010000000000000e+00;
data[6] = 1.7010000000000000e+03;
data[7] = 1.2790000000000000e+03;
data[8] = 1.6379999999999999e-02;
data[9] = 1.9321266968325795e-01;
data[10] = 7.9000000000000000e+01;
data[11] = 1.0000000000000002e-12;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
//init part vector to test mc
PART *parts = new PART[numpart];
initParts(parts, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSOPAL base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
ierr = base.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *part_ptr, *param_ptr;
//allocate memory for particles
part_ptr = base.allocateMemory<PART>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<PART>(part_ptr, parts, numpart);
base.writeData<double>(param_ptr, params, 12);
int numaddback;
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
//read data from device
base.readData<PART>(part_ptr, parts, numpart);
//free memory
base.freeMemory<PART>(part_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
return 0;
}

View File

@ -0,0 +1,132 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
#include <vector_types.h>
#include "cuda_runtime.h"
void initData(double3 *data, int N) {
for (int i = 0; i < N; i++) {
data[i].x = (double)rand() / RAND_MAX;
data[i].y = (double)rand() / RAND_MAX;
data[i].z = (double)rand() / RAND_MAX;
}
}
void initDt(double *data, int N) {
for (int i = 0; i < N; i++) {
data[i] = 0.00001;
}
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
int ierr;
DKSOPAL dksbase;
dksbase.setAPI(api_name, strlen(api_name));
dksbase.setDevice(device_name, strlen(api_name));
ierr = dksbase.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
double3 *R = new double3[numpart];
double3 *P = new double3[numpart];
double3 *Ef = new double3[numpart];
double3 *Bf = new double3[numpart];
double *dt = new double[numpart];
initData(R, numpart);
initData(P, numpart);
initData(Ef, numpart);
initData(Bf, numpart);
initDt(dt, numpart);
void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
dksbase.writeData<double3>(r_ptr, R, numpart);
dksbase.writeData<double3>(p_ptr, P, numpart);
dksbase.writeData<double3>(ef_ptr, Ef, numpart);
dksbase.writeData<double3>(bf_ptr, Bf, numpart);
dksbase.writeData<double>(dt_ptr, dt, numpart);
for (int i = 0; i < loop; ++i)
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std::cout << "============" << std::endl;
dksbase.readData<double3>(r_ptr, R, numpart);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
dksbase.freeMemory<double3>(r_ptr, numpart);
dksbase.freeMemory<double3>(p_ptr, numpart);
dksbase.freeMemory<double3>(ef_ptr, numpart);
dksbase.freeMemory<double3>(bf_ptr, numpart);
dksbase.freeMemory<double>(dt_ptr, numpart);
delete[] R;
delete[] P;
delete[] Ef;
delete[] Bf;
delete[] dt;
}

View File

@ -1,4 +1,7 @@
SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}")
SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
SET(${PROJECT_NAME}_LIBRARY "dks")
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
SET(DKS_VERSION ${DKS_VERSION})
SET(DKS_VERSION_STR ${DKS_VERSION_STR})

View File

@ -0,0 +1,13 @@
set(PACKAGE_VERSION @DKS_VERSION@)
if("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "@DKS_VERSION_MAJOR@" AND "${PACKAGE_FIND_VERSION_MINOR}" EQUAL "@DKS_VERSION_MINOR@")
if ("${PACKAGE_FIND_VERSION_PATCH}" EQUAL "@DKS_VERSION_PATCH@")
set(PACKAGE_VERSION_EXACT TRUE)
elseif("${PACKAGE_FIND_VERSION_PATCH}" LESS "@DKS_VERSION_PATCH@")
set(PACKAGE_VERSION_COMPATIBLE TRUE)
else()
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()
else()
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()

View File

@ -6,6 +6,7 @@ SET (_HDRS
ImageReconstruction.h
CollimatorPhysics.h
FFT.h
GreensFunction.h
)
ADD_SOURCES (${_SRCS})

View File

@ -5,10 +5,7 @@
#include <string>
#include "../DKSDefinitions.h"
class DKSBaseMuSR;
class DKSCollimatorPhysics {
friend class DKSBaseMuSR;
protected:
@ -36,6 +33,10 @@ public:
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0;
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0;
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,

View File

@ -0,0 +1,29 @@
#ifndef H_GREENSFUNCTION
#define H_GREENSFUNCTION
#include <iostream>
#include <cmath>
class GreensFunction {
public:
virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0;
/** mirror rho2_m field */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
};
#endif

View File

@ -35,13 +35,29 @@ ENDMACRO ()
SET (DKS_BASEDIR_HDRS
DKSBase.h
DKSDefinitions.h
DKSOPAL.h
)
SET (DKS_BASEDIR_SRCS
DKSBase.cpp
DKSOPAL.cpp
)
IF (USE_CUDA OR USE_OPENCL)
#add opal to DKS if enable_opal is set
IF (ENABLE_OPAL)
SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS}
DKSOPAL.h
)
SET (DKS_BASEDIR_SRCS
${DKS_BASEDIR_SRCS}
DKSOPAL.cpp
)
ENDIF (ENABLE_OPAL)
#and musrt to DKS if cuda or opencl is used and enable_musr is set
IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS}
DKSBaseMuSR.h
@ -51,9 +67,10 @@ IF (USE_CUDA OR USE_OPENCL)
${DKS_BASEDIR_SRCS}
DKSBaseMuSR.cpp
)
ENDIF (USE_CUDA OR USE_OPENCL)
ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
IF (USE_CUDA)
#add image reconstruction to DKS if cuda is used and enable_pet is set
IF (USE_CUDA AND ENABLE_PET)
SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS}
DKSImageReconstruction.h
@ -63,7 +80,7 @@ IF (USE_CUDA)
${DKS_BASEDIR_SRCS}
DKSImageReconstruction.cpp
)
ENDIF (USE_CUDA)
ENDIF (USE_CUDA AND ENABLE_PET)
ADD_HEADERS (${DKS_BASEDIR_HDRS})
ADD_SOURCES (${DKS_BASEDIR_SRCS})

View File

@ -1,35 +1,27 @@
SET (_HDRS
CudaBase.cuh
CudaFFT.cuh
CudaGreensFunction.cuh
CudaChiSquare.cuh
CudaCollimatorPhysics.cuh
CudaImageReconstruction.cuh
CudaChiSquareRuntime.cuh
)
SET (_HDRS CudaBase.cuh)
SET (_SRCS CudaBase.cu)
SET (_SRCS
CudaBase.cu
CudaFFT.cu
CudaGreensFunction.cu
CudaChiSquare.cu
CudaCollimatorPhysics.cu
CudaImageReconstruction.cu
CudaChiSquareRuntime.cu
)
IF (ENABLE_OPAL)
SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu)
ENDIF (ENABLE_OPAL)
#INCLUDE_DIRECTORIES (
# ${CMAKE_CURRENT_SOURCE_DIR}
#)
IF (ENABLE_MUSR)
SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
ENDIF (ENABLE_MUSR)
IF (ENABLE_PET)
SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
ENDIF (ENABLE_PET)
MESSAGE (STATUS "CUDA headers: ${_HDRS}")
ADD_SOURCES(${_SRCS})
ADD_HEADERS(${_HDRS})
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
SET (_KERNELS
NVRTCKernels/CudaChiSquareKernel.cu
)
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)

View File

@ -8,11 +8,18 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
curand_init(seed + idx, 0, 0, &state[idx]);
curand_init(seed, idx, 0, &state[idx]);
}
}
__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
data[idx] = curand_uniform_double(&state[idx]);
}
//=====================================//
//==========Private functions==========//
@ -41,14 +48,15 @@ CudaBase::~CudaBase() {
/*
create curandStates
*/
int CudaBase::cuda_createCurandStates(int size) {
int CudaBase::cuda_createCurandStates(int size, int seed) {
if (defaultRndSet == 1)
cuda_deleteCurandStates();
int threads = 128;
int blocks = size / threads + 1;
int seed = time(NULL);
if (seed == -1)
seed = time(NULL);
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
cudaMalloc(&defaultRndState, sizeof(curandState)*size);
@ -68,6 +76,15 @@ int CudaBase::cuda_deleteCurandStates() {
return DKS_SUCCESS;
}
int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
int threads = BLOCK_SIZE;
int blocks = size / threads + 1;
kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
return DKS_SUCCESS;
}
curandState* CudaBase::cuda_getCurandStates() {
return defaultRndState;
}

View File

@ -15,6 +15,8 @@
#include <nvToolsExt.h>
#include <time.h>
#define BLOCK_SIZE 128
class CudaBase {
private:
@ -39,9 +41,10 @@ public:
* Init cuda random number (cuRand) states.
* Create an array of type curandState with "size" elements on the GPU
* and create a curandState with different seed for each array entry.
* If no seed is given create a seed based on current time.
* Return success or error code
*/
int cuda_createCurandStates(int size);
int cuda_createCurandStates(int size, int seed = -1);
/**
* Delete curandState.
@ -50,6 +53,11 @@ public:
*/
int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array
*
*/
int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states
*
*/
@ -167,6 +175,40 @@ public:
return DKS_SUCCESS;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
cudaError cerror;
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error zeroing cuda memory!\n");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
int dkserror = DKS_SUCCESS;
cudaError cerror;
if (streamId < cuda_numberOfStreams()) {
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
cuda_getStream(streamId));
if (cerror != cudaSuccess)
dkserror = DKS_ERROR;
} else
dkserror = DKS_ERROR;
return dkserror;
}
/**
* Info: write data to memory
* Retrun: success or error code

View File

@ -33,6 +33,14 @@ __device__ inline double dot(double3 &d1, double3 &d2) {
}
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
return tmp;
}
__device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */
@ -423,7 +431,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
}
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id
volatile int tid = threadIdx.x;
@ -449,7 +457,51 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, dou
}
}
//TODO: kernel for push with switch off unitless positions with dt[i]*c
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
{
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double3 Ef = gEf[idx];
double3 Bf = gBf[idx];
double dt = gdt[idx];
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
double gamma = sqrt(1.0 + dot(P, P));
double3 t, w, s;
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
double3 crossPt = cross(P, t);
w.x = P.x + crossPt.x;
w.y = P.y + crossPt.y;
w.z = P.z + crossPt.z;
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
double3 crossws = cross(w, s);
P.x += crossws.x;
P.y += crossws.y;
P.z += crossws.z;
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
gP[idx] = P;
}
}
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
@ -671,12 +723,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
}
} else {
if (streamId == -1) {
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart,
(double*)dt_ptr, c);
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart,
(double*)dt_ptr, c);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, npart, c);
}
}
@ -684,6 +736,29 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
return DKS_SUCCESS;
}
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart,
double c, int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (streamId == -1) {
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double3*)ef_ptr, (double3*)bf_ptr,
(double*)dt_ptr, charge, mass, npart, c);
}
return DKS_SUCCESS;
}
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec,

View File

@ -141,6 +141,10 @@ public:
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1);
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL
* ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal

View File

@ -194,7 +194,6 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
@ -273,28 +272,20 @@ __global__ void mirroredRhoField(double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = rho2_m[id1];
if (i != 0)
rho2_m[id2] = data;
if (i != 0) rho2_m[id2] = data;
if (j != 0)
rho2_m[id3] = data;
if (j != 0) rho2_m[id3] = data;
if (i != 0 && j != 0)
rho2_m[id4] = data;
if (i != 0 && j != 0) rho2_m[id4] = data;
if (k != 0)
rho2_m[id5] = data;
if (k != 0) rho2_m[id5] = data;
if (k != 0 && i != 0)
rho2_m[id6] = data;
if (k != 0 && i != 0) rho2_m[id6] = data;
if (k!= 0 && j != 0)
rho2_m[id7] = data;
if (k!= 0 && j != 0) rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0)
rho2_m[id8] = data;
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
}
@ -363,9 +354,9 @@ CudaGreensFunction::~CudaGreensFunction() {
delete m_base;
}
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId)
int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId)
{
int thread = 128;
@ -373,7 +364,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
//if no stream specified use default stream
if (streamId == -1) {
kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS;
}
@ -381,7 +372,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS;
}
@ -389,15 +380,17 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
}
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
int I, int J, int K,
int streamId)
int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
int I, int J, int K,
int streamId)
{
int thread = 128;
int block = (I * J * K / thread) + 1;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
if (streamId == -1) {
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS;
@ -406,6 +399,7 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId);
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS;
@ -415,22 +409,22 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
return DKS_ERROR;
}
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
int thread = 128;
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
if (streamId == -1) {
mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J);
mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I, 2*J);
mirroredRhoField<<< block, thread >>>( (double *) rho2_m, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
return DKS_SUCCESS;
}
if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId);
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J);
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I, 2*J);
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
return DKS_SUCCESS;
}
@ -440,8 +434,8 @@ int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K,
return DKS_ERROR;
}
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
int size, int streamId) {
int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2,
int size, int streamId) {
int threads = 128;
int blocks = size / threads + 1;

View File

@ -2,17 +2,17 @@
#define H_CUDA_GREENSFUNCTION
#include <iostream>
#include <math.h>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include "cublas_v2.h"
#include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh"
class CudaGreensFunction {
class CudaGreensFunction : public GreensFunction{
private:
@ -30,32 +30,32 @@ public:
/* destructor */
~CudaGreensFunction();
/*
/**
Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code
*/
int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
/*
/**
Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code
*/
int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/*
/**
Info: mirror rho field (taken from OPAL src code)
Return: succes or error code
*/
int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/*
/**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code
*/
int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
};

View File

@ -103,25 +103,14 @@ DKSBase::DKSBase() {
#ifdef DKS_CUDA
cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif
#ifdef DKS_OPENCL
oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif
#ifdef DKS_MIC
micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif
}
@ -138,25 +127,14 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
#ifdef DKS_CUDA
cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif
#ifdef DKS_OPENCL
oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif
#ifdef DKS_MIC
micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif
}
@ -173,27 +151,16 @@ DKSBase::~DKSBase() {
if (m_function_name != NULL)
delete[] m_function_name;
#ifdef DKS_CUDA
delete cfft;
delete cgreens;
delete cchi;
delete ccol;
delete cbase;
#endif
#ifdef DKS_OPENCL
delete oclfft;
delete oclchi;
delete oclcol;
delete oclbase;
#endif
#ifdef DKS_MIC
delete micfft;
delete miccol;
delete micgreens;
delete micchi;
delete micbase;
#endif
@ -307,38 +274,45 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
return DKS_ERROR;
}
/*
init device
*/
int DKSBase::initDevice() {
int DKSBase::setupDevice() {
int ierr = DKS_ERROR;
//if api is not set default is OpenCL
if (!m_api_set) {
setDevice("-gpu", 4);
setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else {
if (apiOpenCL()) {
if (!m_device_set) {
setDevice("-gpu", 4);
setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else {
setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
}
} else if (apiCuda()) {
setDevice("-gpu", 4);
setAPI(API_CUDA, 4);
return CUDA_SAFECALL(DKS_SUCCESS);
ierr = CUDA_SAFECALL(DKS_SUCCESS);
} else if (apiOpenMP()) {
setDevice("-mic", 4);
setAPI(API_OPENMP, 6);
return MIC_SAFECALL(DKS_SUCCESS);
ierr = MIC_SAFECALL(DKS_SUCCESS);
}
}
return DKS_ERROR;
return ierr;
}
/*
init device
*/
int DKSBase::initDevice() {
return setupDevice();
}
/*
@ -456,360 +430,19 @@ int DKSBase::syncDevice() {
return DKS_ERROR;
}
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSBase::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
else if (apiOpenMP())
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
if (apiOpenCL())
return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
return DKS_ERROR;
}
//BENI:
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
int DKSBase::callInitRandoms(int size, int seed) {
if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
else if (apiOpenMP())
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
//load kernel and execute
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
DEBUG_MSG("No implementation for selected platform");
return DKS_SUCCESS;
}
/* normalize complex to real iFFT */
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
if (apiCuda()) {
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId) );
} else if (apiOpenMP()) {
//BENI:
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
else
return DKS_ERROR;
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
{
if (apiCuda())
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
else if (apiOpenMP())
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
if (apiOpenMP()) {
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles) );
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callInitRandoms(int size) {
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_createCurandStates(size));
return CUDA_SAFECALL(cbase->cuda_createCurandStates(size, seed));
else if (apiOpenCL())
return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size));
else if (apiOpenMP())
@ -819,43 +452,3 @@ int DKSBase::callInitRandoms(int size) {
return DKS_ERROR;
}
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
usedt, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
c, usedt, streamId));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}

View File

@ -29,31 +29,17 @@
#endif
#include "OpenCL/OpenCLBase.h"
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLChiSquare.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh"
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaChiSquare.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#include "nvToolsExt.h"
#endif
#ifdef DKS_MIC
#include "MIC/MICBase.h"
#include "MIC/MICChiSquare.h"
#include "MIC/MICFFT.h"
#include "MIC/MICCollimatorPhysics.h"
#include "MIC/MICGreensFunction.hpp"
#endif
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */
@ -73,25 +59,14 @@ private:
#ifdef DKS_OPENCL
OpenCLBase *oclbase;
OpenCLFFT *oclfft;
OpenCLChiSquare *oclchi;
OpenCLCollimatorPhysics *oclcol;
#endif
#ifdef DKS_CUDA
CudaBase *cbase;
CudaFFT *cfft;
CudaGreensFunction *cgreens;
CudaChiSquare *cchi;
CudaCollimatorPhysics *ccol;
#endif
#ifdef DKS_MIC
MICBase *micbase;
MICFFT *micfft;
MICCollimatorPhysics *miccol;
MICGreensFunction *micgreens;
MICChiSquare *micchi;
#endif
protected:
@ -139,6 +114,12 @@ protected:
}
#endif
#ifdef DKS_MIC
MICBase *getMICBase() {
return micbase;
}
#endif
/** Call OpenCL base to load specified kenrel file.
*
*/
@ -154,6 +135,7 @@ protected:
return device_name;
}
public:
/**
@ -173,6 +155,11 @@ public:
*/
~DKSBase();
/** Function to initialize objects based on the device used.
*
*/
int setupDevice();
/** Turn on auto tuning */
void setAutoTuningOn() { m_auto_tuning = true; }
@ -405,7 +392,7 @@ public:
} else if (apiOpenMP()) {
#ifdef DKS_MIC
void * mem_ptr = NULL;
mem_ptr = micbase.mic_allocateMemory<T>(elements);
mem_ptr = micbase->mic_allocateMemory<T>(elements);
return mem_ptr;
#endif
}
@ -498,7 +485,7 @@ public:
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
}
@ -532,7 +519,7 @@ public:
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
}
return DKS_ERROR;
@ -832,7 +819,7 @@ public:
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
}
return DKS_ERROR;
@ -860,7 +847,7 @@ public:
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
streamId, offset));
}
@ -880,219 +867,22 @@ public:
else if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
else if (apiOpenMP())
return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
return DKS_ERROR;
}
///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
* Create random numbers on the device and fille mem_data array
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Transpose 2D and 3D arrays, OpenCL implementation
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
*/
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Chi square for parameter fitting on device.
* mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
* intermediate results. Chi square results are put in &results
*/
int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
int callCreateRandomNumbers(void *mem_ptr, int size);
/**
* Init random number states and save for reuse on device.
* If seed is -1, a random seed based on current time is taken.
* TODO: opencl and mic implementations.
*/
int callInitRandoms(int size);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
int callInitRandoms(int size, int seed = -1);
/**
* Print memory information on device (total, used, available)

View File

@ -62,6 +62,12 @@
#define OPENCL_SAFEINIT(x) ( NULL )
#endif
#ifdef DKS_AMD
#define OPENCL_SAFEINIT_AMD(x) ( x )
#else
#define OPENCL_SAFEINIT_AMD(x) ( NULL )
#endif
#ifdef DKS_MIC
#define MIC_SAFEINIT(x) ( x )
#else

298
src/DKSOPAL.cpp Normal file
View File

@ -0,0 +1,298 @@
#include "DKSOPAL.h"
DKSOPAL::DKSOPAL() {
dksfft = nullptr;
dkscol = nullptr;
dksgreens = nullptr;
}
DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
setAPI(api_name, strlen(api_name));
setDevice(device_name, strlen(device_name));
}
DKSOPAL::~DKSOPAL() {
delete dksfft;
delete dkscol;
delete dksgreens;
}
int DKSOPAL::setupOPAL() {
int ierr = DKS_ERROR;
if (apiOpenCL()) {
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
//TODO: only enable if AMD libraries are available
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
} else if (apiCuda()) {
ierr = CUDA_SAFECALL( DKS_SUCCESS );
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
} else if (apiOpenMP()) {
ierr = MIC_SAFECALL( DKS_SUCCESS );
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
} else {
ierr = DKS_ERROR;
}
return ierr;
}
int DKSOPAL::initDevice() {
int ierr = setupDevice();
if (ierr == DKS_SUCCESS)
ierr = setupOPAL();
return ierr;
}
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSOPAL::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return dksfft->setupFFT(ndim, N);
} else if (apiOpenCL()) {
int ierr1 = dksfft->setupFFT(ndim, N);
int ierr2 = dksfft->setupFFTRC(ndim, N);
int ierr3 = dksfft->setupFFTCR(ndim, N);
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTRC(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTRC(ndim, N, scale);
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTCR(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTCR(ndim, N, scale);
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
else
return DKS_ERROR;
} else if (apiCuda()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
} else if (apiOpenMP()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
else if (apiOpenCL())
return DKS_ERROR;
else if (apiOpenMP())
return DKS_ERROR;
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId);
}
int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
}
int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
}
int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
}
int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
}
int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId) {
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
}
int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId)
{
return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
charge, mass, npart, c, streamId);
}
int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt, c, usedt, streamId);
}

233
src/DKSOPAL.h Normal file
View File

@ -0,0 +1,233 @@
#ifndef H_DKS_OPAL
#define H_DKS_OPAL
#include <iostream>
#include "AutoTuning/DKSAutoTuning.h"
#include "DKSBase.h"
#include "DKSDefinitions.h"
#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLGreensFunction.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#endif
#ifdef DKS_MIC
#include "MIC/MICFFT.h"
#include "MIC/MICGreensFunction.hpp"
#include "MIC/MICCollimatorPhysics.h"
#endif
class DKSOPAL : public DKSBase {
private:
DKSFFT *dksfft;
DKSCollimatorPhysics *dkscol;
GreensFunction *dksgreens;
int setupOPAL();
public:
DKSOPAL();
DKSOPAL(const char* api_name, const char* device_name);
~DKSOPAL();
int initDevice();
///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1);
};
#endif

View File

@ -1,19 +1,24 @@
SET (_SRCS
MICBase.cpp
MICChiSquare.cpp
MICFFT.cpp
MICGreensFunction.cpp
MICCollimatorPhysics.cpp
)
SET (_SRCS MICBase.cpp)
SET (_HDRS MICBase.h)
SET (_HDRS
MICBase.h
MICChiSquare.h
MICFFT.h
MICCollimatorPhysics.h
MICGreensFunction.hpp
MICMergeSort.h
)
IF (ENABLE_OPAL)
SET (_SRCS
${_SRCS}
MICChiSquare.cpp
MICFFT.cpp
MICGreensFunction.cpp
MICCollimatorPhysics.cpp
)
SET (_HDRS
${_HDRS}
MICChiSquare.h
MICFFT.h
MICCollimatorPhysics.h
MICGreensFunction.hpp
MICMergeSort.h
)
ENDIF (ENABLE_OPAL)
#INCLUDE_DIRECTORIES (
# ${CMAKE_CURRENT_SOURCE_DIR}

View File

@ -18,30 +18,28 @@ int MICBase::mic_createRandStreams(int size) {
int seed = time(NULL);
#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
int numThreads = 0;
#pragma offload target(mic:m_device_id) inout(numThreads)
{
//get the number of threads
int numThreads;
#pragma omp parallel
numThreads = omp_get_num_threads();
}
//if default rnd stream already allocated delete the array
if (defaultRndSet == 1)
delete[] defaultRndStream;
//allocate defaultRndStream array
defaultRndStream = new VSLStreamStatePtr[numThreads];
defaultRndStream = mic_allocateMemory<VSLStreamStatePtr>(numThreads);
VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
maxThreads = numThreads;
#pragma offload target(mic:m_device_id) \
in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN) \
in(seed)
{
//create stream states for each thread
#pragma omp parallel for
for (int i = 0; i < omp_get_num_threads(); i++)
vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
defaultRndSet = 1;
vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
}
defaultRndSet = 1;
return DKS_SUCCESS;
}
@ -49,15 +47,8 @@ int MICBase::mic_createRandStreams(int size) {
//delete default rand streams
int MICBase::mic_deleteRandStreams() {
#pragma offload target(mic:m_device_id) inout(defaultRndSet)
{
if (defaultRndSet == 1) {
delete[] defaultRndStream;
defaultRndSet = -1;
}
}
return DKS_ERROR;
//mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
return DKS_SUCCESS;
}
//create a new signal for the mic

View File

@ -30,14 +30,19 @@ class MICBase {
private:
std::vector<int> micStreams;
int maxThreads;
protected:
int defaultRndSet;
public:
VSLStreamStatePtr *defaultRndStream;
//#pragma offload_attribute(push,target(mic))
void *defaultRndStream; //VSLSStreamStatePtr
void *testPtr;
//#pragma offload_attribute(pop)
int m_device_id;
/* constructor */
@ -202,7 +207,6 @@ public:
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
{
}
return DKS_SUCCESS;
}

View File

@ -292,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073;
@ -338,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073;
@ -373,16 +373,18 @@ int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int nu
//cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles)
{
#pragma omp parallel
{
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x
@ -459,6 +461,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
int padding = numparticles % MIC_WIDTH;
int totalpart = numparticles + padding;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \
@ -469,14 +473,16 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
in(py:length(0) DKS_REUSE DKS_RETAIN) \
in(pz:length(0) DKS_REUSE DKS_RETAIN) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(totalpart)
{
#pragma omp parallel
{
//every thread gets its own rnd stream state
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
//VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
#pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -512,10 +518,12 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
double Eng = (sq - 1) * M_P;
double dEdx = 0;
if (label[i] == 0) {
energyLoss(Eng, dEdx, par, randv, i - ii);
}
if (Eng > 1e-4 && dEdx < 0) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(px[i], py[i], pz[i]));
@ -531,6 +539,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
} //end outer energy loss loop
//vectorize coulomb scattering as much as possible
#pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {

View File

@ -26,7 +26,7 @@ typedef struct {
} MIC_PART_SMALL;
class MICCollimatorPhysics : DKSAlogorithms{
class MICCollimatorPhysics : public DKSCollimatorPhysics {
private:
@ -38,7 +38,7 @@ public:
m_micbase = base;
};
~MICCollimatorPhysics() { };
~MICCollimatorPhysics() { };
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);

View File

@ -6,13 +6,16 @@
MICFFT::MICFFT(MICBase *base) {
m_micbase = base;
m_fftsetup = false;
}
MICFFT::~MICFFT() {
if (m_fftsetup) {
#pragma offload target(mic:0)
{
DftiFreeDescriptor(&FFTHandle_m);
DftiFreeDescriptor(&handle);
{
DftiFreeDescriptor(&FFTHandle_m);
DftiFreeDescriptor(&handle);
}
}
}
@ -35,7 +38,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
}
m_fftsetup = true;
return DKS_SUCCESS;
}
//BENI:
@ -122,8 +125,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
}
//execute iFFT
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
return mic_executeFFT(mem_ptr, ndim, N, -1, false);
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
return executeFFT(mem_ptr, ndim, N, -1, false);
}
//execute REAL->COMPLEX FFT

View File

@ -7,13 +7,14 @@
#include <offload.h>
#include <mkl_dfti.h>
#include "../Algorithm/DKSFFT.h"
#include "../Algorithms/FFT.h"
#include "MICBase.h"
class MICFFT : public DKSFFT {
private:
bool m_fftsetup;
MICBase *m_micbase;
/// Internal FFT object for performing serial FFTs.
@ -74,6 +75,18 @@ public:
/* normalize IFFT on MIC */
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
/**
* Info: destroy default FFT plans
* Return: success or error code
*/
int destroyFFT() { return DKS_SUCCESS; }
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
};
#endif

View File

@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
}
*/
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
double hr_m1, double hr_m2)
int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId)
{
double *tmp_ptr = (double*) tmp_ptr_;
double *tmp_ptr = (double*) tmpgreen;
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
{
std::memset(tmp_ptr,0,I*J*K);
@ -173,12 +173,14 @@ return 0;
*/
//CUDA similar version:
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
double *tmpgreen = (double*) tmp_ptr_;
double *mem_ptr = (double*) mem_ptr_;
int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId)
{
double *tmpgreen_ptr = (double*) tmpgreen;
double *mem_ptr = (double*) rho2_m;
// the actual integration
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
std::memset(mem_ptr,0,II*JJ*KK);
@ -197,27 +199,27 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp)
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
tmp1 = tmpgreen_ptr[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp)
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
tmp2 = tmpgreen_ptr[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp)
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp3 = tmpgreen_ptr[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp5 = tmpgreen_ptr[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp6 = tmpgreen_ptr[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
tmp7 = tmpgreen_ptr[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
@ -234,8 +236,8 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
double *mem_ptr = (double*) mem_ptr_;
int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
double *mem_ptr = (double*) rho2_m;
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{
@ -281,11 +283,11 @@ int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K)
}
/*multiply complex fields*/
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
// double *mem_ptr1 = (double*) mem_ptr1_;
// double *mem_ptr2 = (double*) mem_ptr2_;
_Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
_Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
_Complex double *mem_ptr1 = (_Complex double *) ptr1;
_Complex double *mem_ptr2 = (_Complex double *) ptr2;
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
{

View File

@ -9,12 +9,13 @@
#include <offload.h>
#include <mkl_dfti.h>
#include "../Algorithms/GreensFunction.h"
#include "MICBase.h"
#define DKS_SUCCESS 0
#define DKS_ERROR 1
class MICGreensFunction {
class MICGreensFunction : public GreensFunction {
private:
MICBase *m_micbase;
@ -28,16 +29,18 @@ public:
~MICGreensFunction();
/* compute greens integral analytically */
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
/* perform the actual integration */
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K,
int stremaId = -1);
/* Mirror rho-Field */
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
/*multiply complex fields*/
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
};

View File

@ -1,31 +1,39 @@
SET (_SRCS
OpenCLBase.cpp
OpenCLFFT.cpp
OpenCLChiSquare.cpp
OpenCLCollimatorPhysics.cpp
OpenCLChiSquareRuntime.cpp
)
#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
SET (_HDRS
OpenCLBase.h
OpenCLFFT.h
OpenCLChiSquare.h
OpenCLCollimatorPhysics.h
OpenCLChiSquareRuntime.h
)
SET (_HDRS OpenCLBase.h)
SET (_SRCS OpenCLBase.cpp)
SET (_KERNELS "")
#INCLUDE_DIRECTORIES (
# ${CMAKE_CURRENT_SOURCE_DIR}
#)
IF (ENABLE_MUSR)
SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
ENDIF (ENABLE_MUSR)
SET (_KERNELS
OpenCLKernels/OpenCLChiSquare.cl
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLChiSquareRuntime.cl
IF (ENABLE_AMD AND ENABLE_OPAL)
SET (_SRCS
${_SRCS}
OpenCLFFT.cpp
OpenCLCollimatorPhysics.cpp
OpenCLGreensFunction.cpp
)
SET (_HDRS
${_HDRS}
OpenCLFFT.h
OpenCLCollimatorPhysics.h
OpenCLGreensFunction.h
)
SET (_KERNELS
${_KERNELS}
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLGreensFunction.cl
)
ENDIF (ENABLE_AMD AND ENABLE_OPAL)
ADD_SOURCES (${_SRCS})
ADD_HEADERS (${_HDRS})

View File

@ -7,21 +7,13 @@ cl_device_id OpenCLBase::m_device_id = NULL;
cl_event OpenCLBase::m_last_event = NULL;
OpenCLBase::OpenCLBase() {
//m_context = NULL;
//m_command_queue = NULL;
m_program = NULL;
m_kernel = NULL;
//m_device_id = NULL;
//m_platform_id = NULL;
m_kernel_file = NULL;
m_last_event = NULL;
//m_events = new cl_event[500];
//m_num_events = 0;
defaultRndSet = 0;
}
OpenCLBase::~OpenCLBase() {
@ -55,13 +47,34 @@ int OpenCLBase::ocl_createRndStates(int size) {
size_t work_items = size;
size_t work_group_size = 1;
ocl_executeKernel(1, &work_items, &work_group_size);
defaultRndSet = 1;
return OCL_SUCCESS;
return DKS_SUCCESS;
}
int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
//load kernel
char * kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
ocl_loadKernel(kernel_file);
delete[] kernel_file;
//set kernel variables
cl_mem tmp_data = (cl_mem) mem_ptr;
ocl_createKernel("createRandoms");
ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
ocl_setKernelArg(2, sizeof(int), &size);
size_t work_size = 128;
size_t work_items = (size % work_size + 1) * work_size;
ocl_executeKernel(1, &work_items, &work_size);
return DKS_SUCCESS;
}
/* destroy rnd states */
@ -70,7 +83,7 @@ int OpenCLBase::ocl_deleteRndStates() {
ocl_freeMemory(defaultRndState);
defaultRndSet = 0;
return OCL_SUCCESS;
return DKS_SUCCESS;
}
@ -428,7 +441,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
int ierr;
//create program from kernel
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
NULL, &ierr);
if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
return DKS_ERROR;
@ -438,7 +452,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
/*
check if compileng kernel source succeded, if failed return error code
check if compiling kernel source succeded, if failed return error code
if in debug mode get compilation info and print program build log witch
will give indication what made the compilation fail
*/
@ -447,7 +461,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
//get build status
cl_build_status status;
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &status, NULL);
//get log size
size_t log_size;
@ -613,12 +628,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
}
}
if (ierr != OCL_SUCCESS) {
if (ierr != DKS_SUCCESS) {
DEBUG_MSG("Failed to build kernel file " << kernel_file);
return OCL_ERROR;
return DKS_ERROR;
}
return OCL_SUCCESS;
return DKS_SUCCESS;
}
//compile kernel form source code provided

View File

@ -30,13 +30,10 @@
#include <CL/cl_ext.h>
#endif
#include "../DKSDefinitions.h"
/* struct for random number state */
typedef struct {
double s10;
double s11;
double s12;
@ -45,16 +42,12 @@ typedef struct {
double s22;
double z;
bool gen;
} RNDState;
class OpenCLBase {
private:
static cl_context m_context;
static cl_command_queue m_command_queue;
static cl_platform_id m_platform_id;
static cl_device_id m_device_id;
@ -119,6 +112,9 @@ protected:
public:
static cl_context m_context;
static cl_command_queue m_command_queue;
/*
constructor
*/
@ -135,6 +131,11 @@ public:
*/
int ocl_createRndStates(int size);
/* Create an array of random numbers on the device
*
*/
int ocl_createRandomNumbers(void *mem_ptr, int size);
/*
Destroy rnd states
Return: success or error code
@ -203,6 +204,20 @@ public:
*/
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer
* Set all the elemetns in the device array to zero
*/
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
cl_int ierr;
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
sizeof(T)*size, 0, nullptr, nullptr);
if (ierr != CL_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
}
/*
Name: writeData
Info: write data to device memory (needs ptr to mem object)

View File

@ -32,7 +32,6 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
return OCL_ERROR;
//execute kernel
for (int step = 1; step < N; step <<= 1) {
if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
@ -89,26 +88,78 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
call fft execution on device for every dimension
*/
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
int ierr;
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem inout = (cl_mem)data;
int n = N[0];
for (int dim = 0; dim < ndim; dim++) {
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Error executing bit reverse");
return OCL_ERROR;
}
if (forward)
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &inout, NULL, NULL);
else
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &inout, NULL, NULL);
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Error executing fft reverse");
return OCL_ERROR;
}
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return OCL_SUCCESS;
return dkserr;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &real_in, &comp_out, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &comp_out, &real_in, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
}
/*
@ -120,10 +171,11 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
}
/*
call kernel to normalize fft
call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
*/
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
/*
cl_mem inout = (cl_mem)data;
int n = N[0];
@ -150,132 +202,175 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
DEBUG_MSG("Error executing kernel");
return OCL_ERROR;
}
*/
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
int OpenCLFFT::setupFFT(int ndim, int N[3]) {
int ierr;
int size = sizeof(cl_double2)*pow(N,ndim);
cl_mem mem_tmp;
cl_mem mem_src = (cl_mem)src;
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
//set the number of work items in each dimension
size_t work_items[3];
int p = 1;
int threads = N / 2;
int f = (forward) ? -1 : 1;
//execute kernel
int n = (int)log2(N);
for (int i = 0; i < ndim; i++) {
int dim = i+1;
p = 1;
work_items[0] = (dim == 1) ? N/2 : N;
work_items[1] = (dim == 2) ? N/2 : N;
work_items[2] = (dim == 3) ? N/2 : N;
//transpose array if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
//create kernel and set kernel arguments
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
return OCL_ERROR;
for (int t = 1; t <= log2(N); t++) {
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
return OCL_ERROR;
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
p = 2*p;
}
//transpose array back if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
}
if (ndim*n % 2 == 1) {
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
}
m_oclbase->ocl_freeMemory(mem_dst);
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
cl_mem mem_src = (cl_mem)src;
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
m_oclbase->ocl_createKernel("fft_batch3D");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
for (int dim = 1; dim < ndim+1; dim++) {
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
cl_mem mem_src = (cl_mem)src;
cl_int err;
clfftDim dim;
if (ndim == 1)
return OCL_SUCCESS;
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t work_items[3];
work_items[0] = N[0];
work_items[1] = N[1];
work_items[2] = 1;
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
size_t work_group_size[3];
work_group_size[0] = N[0];
work_group_size[1] = N[1];
work_group_size[2] = 1;
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
if (err != CL_SUCCESS)
std::cout << "Error setting precision" << std::endl;
err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
if (err != CL_SUCCESS)
std::cout << "Error setting layout" << std::endl;
err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
if (err != CL_SUCCESS)
std::cout << "Error setting result location" << std::endl;
/* Bake the plan */
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-complex plan");
return DKS_ERROR;
}
m_oclbase->ocl_createKernel("transpose");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
size_t clOutStride[3] = {1, half, half * N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Real-to-complex plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, half, half * N[1]};
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-real plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::destroyFFT() {
clfftDestroyPlan(&planHandleZ2Z);
clfftDestroyPlan(&planHandleD2Z);
clfftDestroyPlan(&planHandleZ2D);
clfftTeardown();
return DKS_SUCCESS;
}
void OpenCLFFT::printError(clfftStatus err) {
if (err != CL_SUCCESS) {
std::cout << "Error creating default plan " << err << std::endl;
switch(err) {
case CLFFT_BUGCHECK:
std::cout << "bugcheck" << std::endl;
break;
case CLFFT_NOTIMPLEMENTED:
std::cout << "not implemented" << std::endl;
break;
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
std::cout << "transposed not implemented" << std::endl;
break;
case CLFFT_FILE_NOT_FOUND:
std::cout << "file not found" << std::endl;
break;
case CLFFT_FILE_CREATE_FAILURE:
std::cout << "file create failure" << std::endl;
break;
case CLFFT_VERSION_MISMATCH:
std::cout << "version missmatch" << std::endl;
break;
case CLFFT_INVALID_PLAN:
std::cout << "invalid plan" << std::endl;
break;
case CLFFT_DEVICE_NO_DOUBLE:
std::cout << "no double" << std::endl;
break;
case CLFFT_DEVICE_MISMATCH:
std::cout << "device missmatch" << std::endl;
break;
case CLFFT_ENDSTATUS:
std::cout << "end status" << std::endl;
break;
default:
std::cout << "other: " << err << std::endl;
break;
}
}
return OCL_SUCCESS;
}
/*

View File

@ -20,12 +20,19 @@
#include "../Algorithms/FFT.h"
#include "OpenCLBase.h"
#include "clFFT.h"
class OpenCLFFT : public DKSFFT {
private:
OpenCLBase *m_oclbase;
clfftSetupData fftSetup;
clfftPlanHandle planHandleZ2Z;
clfftPlanHandle planHandleD2Z;
clfftPlanHandle planHandleZ2D;
/*
Info: call fft kernels to execute FFT of the given domain,
data - devevice memory ptr, cdim - current dim to transform,
@ -42,15 +49,31 @@ private:
*/
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
/** Get clfftStatus and print the corresponding error message.
* clfftStatus is returned from all clFFT library functions, print error displays the
* corresponding error message. If "other" is printed then error code corresponds to
* OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
* be checked to determine the reason for the error.
*/
void printError(clfftStatus err);
public:
/* constructor - currently does nothing*/
OpenCLFFT(OpenCLBase *base) {
m_oclbase = base;
/* Set up fft */
cl_int err;
err = clfftInitSetupData(&fftSetup);
err = clfftSetup(&fftSetup);
if (err != CL_SUCCESS)
DEBUG_MSG("Error seting up clFFT");
}
/* destructor - currently does nothing*/
~OpenCLFFT() { }
~OpenCLFFT() { destroyFFT(); }
/*
Info: execute forward fft function with data set on device
@ -77,35 +100,23 @@ public:
Info: set FFT size
Return: success or error code
*/
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
int setupFFT(int ndim, int N[3]);
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
int destroyFFT() { return DKS_SUCCESS; }
int destroyFFT();
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1)
{
return DKS_ERROR;
}
int streamId = -1);
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1)
{
return DKS_ERROR;
}
int streamId = -1);
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
{
return DKS_ERROR;
}
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
//void printData3DN4(cl_double2* &data, int N);
};

View File

@ -0,0 +1,181 @@
#include "OpenCLGreensFunction.h"
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
m_base = base;
base_create = false;
}
OpenCLGreensFunction::OpenCLGreensFunction() {
m_base = new OpenCLBase();
base_create = true;
}
OpenCLGreensFunction::~OpenCLGreensFunction() {
if (base_create)
delete m_base;
}
int OpenCLGreensFunction::buildProgram() {
char *kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, GREENS_KERNEL);
return m_base->ocl_loadKernel(kernel_file);
}
int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelTmpgreen");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
int NI = 2*(I - 1);
int NJ = 2*(J - 1);
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelIntegration");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
double zero = 0.0;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
int NI = I + 1;
int NJ = J + 1;
int NK = K + 1;
int I2 = 2*I;
int J2 = 2*J;
int K2 = 2*K;
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
//set the work item size
size_t work_size = 128;
size_t work_items = NI * NJ * NK;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem mem_ptr1 = (cl_mem) ptr1;
cl_mem mem_ptr2 = (cl_mem) ptr2;
//set the work item size
size_t work_size = 128;
size_t work_items = size;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("multiplyComplexFields");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
m_base->ocl_setKernelArg(2, sizeof(int), &size);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}

View File

@ -0,0 +1,63 @@
#ifndef H_OPENCL_GREENSFUNCTION
#define H_OPENCL_GREENSFUNCTION
#include <iostream>
#include <cmath>
#include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h"
class OpenCLGreensFunction : public GreensFunction {
private:
bool base_create;
OpenCLBase *m_base;
public:
/** Constructor with OpenCLBase argument */
OpenCLGreensFunction(OpenCLBase *base);
/** Default constructor */
OpenCLGreensFunction();
/** Destructor */
~OpenCLGreensFunction();
/** Load OpenCL kernel file containing greens function kernels.
* m_base takes the kernel file and compiles the OpenCL programm.
*/
int buildProgram();
/**
Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code
*/
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
/**
Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code
*/
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/**
Info: mirror rho field (taken from OPAL src code)
Return: succes or error code
*/
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code
*/
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
};
#endif

View File

@ -1,6 +1,4 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION
/******Random numbers********/
@ -89,13 +87,14 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
if (id < N) {
RNDState tmp;
int tmp_seed = id;// * 0x100000000ULL;
int tmp_seed = 2*id;// * 0x100000000ULL;
tmp.s10 = 12345 + tmp_seed;
tmp.s11 = 12345 + tmp_seed;
tmp.s12 = 123 + tmp_seed;
tmp.s12 = 12345 + tmp_seed;
tmp.s20 = 12345 + tmp_seed;
tmp.s21 = 12345 + tmp_seed;
tmp.s22 = 123 + tmp_seed;
tmp.s22 = 12345 + tmp_seed;
tmp.z = 0;
tmp.gen = true;
@ -105,6 +104,19 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
}
/* create random numbers and fill an array */
__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
int idx = get_global_id(0);
if (idx < size) {
RNDState s = states[idx];
data[idx] = rand_uniform(&s);
states[idx] = s;
}
}
/**********Degrader**********/
enum PARAMS { POSITION,

View File

@ -0,0 +1,170 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/** compute the greens integral analytically */
__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
int NI, int NJ, int NK)
{
int tid = get_local_size(0);
int id = get_global_id(0);
if (id < NI * NJ * NK) {
int i = id % NI;
int k = id / (NI * NJ);
int j = (id - k * NI * NJ) / NI;
double cellVolume = hr_m0 * hr_m1 * hr_m2;
double vv0 = i * hr_m0 - hr_m0 / 2;
double vv1 = j * hr_m1 - hr_m1 / 2;
double vv2 = k * hr_m2 - hr_m2 / 2;
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
tmpgrn = tmpgrn / 2;
tmpgrn += vv1 * vv2 * log(vv0 + r);
tmpgrn += vv0 * vv2 * log(vv1 + r);
tmpgrn += vv0 * vv1 * log(vv2 + r);
tmpgreen[id] = tmpgrn / cellVolume;
}
}
/** perform the actual integration */
__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
{
int tid = get_local_id(0);
int id = get_global_id(0);
int ni = NI;
int nj = NJ;
double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp)
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp)
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp)
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
/** miror rho-field */
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
rho2_m[0] = rho2_m[NI*NJ];
}
__kernel void kernelMirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp,
int size)
{
int tid = get_local_id(0);
int id = get_global_id(0);
if (id == 0)
rho2_m[0] = rho2_m[NI * NJ];
barrier(CLK_GLOBAL_MEM_FENCE);
int id1, id2, id3, id4, id5, id6, id7, id8;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
int ri = NI - i;
int rj = NJ - j;
int rk = NK - k;
id1 = k * NI * NJ + j * NI + i;
id2 = k * NI * NJ + j * NI + ri;
id3 = k * NI * NJ + rj * NI + i;
id4 = k * NI * NJ + rj * NI + ri;
id5 = rk * NI * NJ + j * NI + i;
id6 = rk * NI * NJ + j * NI + ri;
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = 0.0;
if (id1 < size)
data = rho2_m[id1];
if (i != 0 && id2 < size) rho2_m[id2] = data;
if (j != 0 && id3 < size) rho2_m[id3] = data;
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
if (k != 0 && id5 < size) rho2_m[id5] = data;
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
}
}
/** multiply complex fields */
double2 ComplexMul(double2 a, double2 b) {
double2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
int size)
{
int idx = get_global_id(0);
if (idx < size)
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
}

View File

@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,10 +22,11 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
ADD_EXECUTABLE(testRandom testRandom.cpp)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -38,8 +39,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
#TARGET_LINK_LIBRARIES(testFFT3D dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -53,10 +54,11 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)

View File

@ -129,7 +129,9 @@ int main(int argc, char *argv[]) {
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
//allocate memory for particles
@ -211,7 +213,7 @@ int main(int argc, char *argv[]) {
base.freeMemory<double>(param_ptr, 12);
/*
/*
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << p.label[i] << "\t" << p.rx[i]

View File

@ -1,6 +1,7 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include <string>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
@ -18,22 +19,30 @@ int main(int argc, char *argv[]) {
int N = 16;
char *api_name = new char[10];
char *device_name = new char[10];
if (argc == 2) {
N = atoi(argv[1]);
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
} else if (argc == 3) {
N = atoi(argv[1]);
strcpy(api_name, argv[2]);
strcpy(device_name, "-gpu");
} else if (argc == 4) {
N = atoi(argv[1]);
strcpy(api_name, argv[2]);
strcpy(device_name, argv[3]);
} else {
N = 16;
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
if (argv[i] == string("-N"))
N = atoi(argv[i+1]);
}
cout << "Use api: " << api_name << ", " << device_name << endl;
@ -74,9 +83,16 @@ int main(int argc, char *argv[]) {
/* write data to device */
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
if (N < 5)
printData3DN4(cdata, N, 3);
/* execute fft */
base.callFFT(mem_ptr, 3, dimsize);
if (N < 5) {
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
printData3DN4(cfft, N, 3);
}
/* execute ifft */
base.callIFFT(mem_ptr, 3, dimsize);
@ -86,6 +102,8 @@ int main(int argc, char *argv[]) {
/* read data from device */
base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
if (N < 5)
printData3DN4(cifft, N, 3);
/* free device memory */
base.freeMemory< complex<double> >(mem_ptr, N*N*N);
@ -130,7 +148,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
if (a < 10e-5 && a > -10e-5)
a = 0;
cout << d << "; " << a << "\t";
cout << "(" << d << "," << a << ") ";
}
}
cout << endl;
@ -157,3 +175,5 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
}

View File

@ -1,6 +1,8 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include <fstream>
#include <iomanip>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
@ -8,54 +10,53 @@
using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3]);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
void initData(double *data, int dimsize[3], int dim);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name, char *file_name);
void printHelp();
void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* &data, int N, int dim);
double precision(double a) {
//if (a < 1e-10)
// return 0.0;
//else
return a;
}
int main(int argc, char *argv[]) {
int N1 = 8;
int N2 = 8;
int N3 = 8;
int dim = 3;
int loop = 10;
int loop = 0;
char *api_name = new char[10];
char *device_name = new char[10];
char *file_name = new char[50];
if ( readParams(argc, argv, N1, N2, N3, loop) )
if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
return 0;
int dimsize[3] = {N3, N2, N1};
cout << "Use api: " << api_name << ", " << device_name << endl;
int dimsize[3] = {N1, N2, N3};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal];
double *outdata = new double[sizereal];
complex<double> *cfft = new complex<double>[sizecomp];
for (int i=0; i<sizecomp; ++i) {
cfft[i].real() = 7.;
cfft[i].imag() = 3.33;
}
initData(rdata, dimsize);
initData(rdata, dimsize, dim);
/* init DKSBase */
cout << "Init device and set function" << endl;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
base.setupFFTRC(dim, dimsize);
/* setup backward fft (COMPLEX->REAL) */
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
#endif
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFT(dim, dimsize);
#endif
// allocate memory on device
int ierr;
@ -67,69 +68,59 @@ int main(int argc, char *argv[]) {
// execute one run before starting the timers
base.writeData<double>(real_ptr, rdata, sizereal);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
base.readData<double>(real_res_ptr, outdata, sizereal);
//timer for total loop time, FFT and IFFT calls
struct timeval timeStart, timeEnd;
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
gettimeofday(&timeStart, NULL);
for (int i=0; i<loop; ++i){
// write data to device
base.writeData<double>(real_ptr, rdata, sizereal);
// execute rcfft
gettimeofday(&timeFFTStart[i], NULL);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeFFTEnd[i], NULL);
// execute crfft
gettimeofday(&timeIFFTStart[i], NULL);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeIFFTEnd[i], NULL);
//normalize
#ifdef DKS_CUDA
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
#endif
// read IFFT data from device
base.readData<double>(real_res_ptr, outdata, sizereal);
ofstream myfile;
myfile.open(file_name);
myfile<< "in\tout\treal\timag\n";
for (int i = 0; i < sizereal; i++) {
//myfile << precision(rdata[i]) << "\t";
//myfile << precision(outdata[i]) << "\t";
if (i < sizecomp) {
myfile << precision(cfft[i].real()) << "\t";
myfile << precision(cfft[i].imag());
}
myfile << "\n";
}
gettimeofday(&timeEnd, NULL);
myfile.close();
/*
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1; j++) {
cout << rdata[i*N1 + j] << " ";
}
cout << endl;
}
cout << endl;
}
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1 / 2 + 1; j++) {
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
}
cout << endl;
}
cout << endl;
}
*/
// free device memory
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(real_res_ptr, sizereal);
// compare in and out data to see if we get back the same results
cout << "comp" << endl;
compareData(rdata, outdata, N1, N2, N3, dim);
//calculate seconds for total time and fft times
double tfft = 0;
double tifft = 0;
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
for (int i = 0; i < loop; i++) {
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
}
//print timing results
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
<< "\n\n";
cout << "done" << endl;
return 0;
}
@ -137,10 +128,10 @@ int main(int argc, char *argv[]) {
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
int id;
double sum = 0;
for (int i = 0; i < NI; i++) {
for (int i = 0; i < NK; i++) {
for (int j = 0; j < NJ; j++) {
for (int k = 0; k < NK; k++) {
id = k*NI*NJ + j*NI + i;
for (int k = 0; k < NI; k++) {
id = i*NI*NJ + j*NI + k;
sum += fabs(data1[id] - data2[id]);
}
}
@ -148,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
std::cout << "RC <--> CR diff: " << sum << std::endl;
}
void initData(double *data, int dimsize[3]) {
for (int i = 0; i < dimsize[2]; i++) {
void initData(double *data, int dimsize[3], int dim) {
if (dim == 3) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) {
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
data[j*dimsize[0] + k] = sin(k);
}
}
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = sin(k);
}
}
@ -173,10 +172,17 @@ void printHelp() {
std::cout << std::endl;
}
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name, char *file_name)
{
for (int i = 1; i < argc; i++) {
if ( argv[i] == std::string("-dim")) {
dim = atoi(argv[i + 1]);
i++;
}
if ( argv[i] == std::string("-grid") ) {
N1 = atoi(argv[i + 1]);
N2 = atoi(argv[i + 2]);
@ -193,7 +199,72 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
printHelp();
return true;
}
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
strcpy(file_name, "cuda_fft.dat");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
strcpy(file_name, "opencl_fft.dat");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
strcpy(file_name, "openmp_fft.dat");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
strcpy(file_name, "opencl_cpu_fft.dat");
}
}
return false;
}
void printData3DN4(complex<double>* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N/2 + 1; k++) {
double d = data[i*N*N + j*N + k].real();
double a = data[i*N*N + j*N + k].imag();
if (d < 10e-5 && d > -10e-5)
d = 0;
if (a < 10e-5 && a > -10e-5)
a = 0;
cout << "(" << d << "," << a << ") ";
}
}
cout << endl;
}
cout << endl;
}
void printData3DN4(double* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N; k++) {
double d = data[i*N*N + j*N + k];
if (d < 10e-5 && d > -10e-5)
d = 0;
cout << d << " ";
}
}
cout << endl;
}
cout << endl;
}

View File

@ -1,5 +1,4 @@
#include <iostream>
//#include <mpi.h>
#include <string.h>
#include "DKSBase.h"
@ -11,309 +10,265 @@ using namespace std;
void printData3D(double* data, int N, int NI, const char *message = "") {
if (strcmp(message, "") != 0)
cout << message;
if (strcmp(message, "") != 0)
cout << message;
for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t";
}
cout << endl;
}
cout << endl;
}
for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t";
}
cout << endl;
}
cout << endl;
}
}
void initData(double *data, int N) {
for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1;
}
}
}
for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1;
}
}
}
}
void initData2(double *data, int N) {
for (int i = 0; i < N; i++)
data[i] = i;
for (int i = 0; i < N; i++)
data[i] = i;
}
void initComplex( complex<double> *d, int N) {
for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0);
}
for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0);
}
}
void printComplex(complex<double> *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << "\t";
cout << endl;
for (int i = 0; i < N; i++)
cout << d[i] << "\t";
cout << endl;
}
void printDouble(double *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << ", ";
cout << endl;
}
void initMirror(double *data, int n1, int n2, int n3) {
int d = 1;
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++;
else
data[i * n2 * n1 + j * n1 + k] = 0;
}
}
}
int d = 1;
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++;
else
data[i * n2 * n1 + j * n1 + k] = 0;
}
}
}
}
void printDiv(int c) {
for (int i = 0; i < c; i++)
cout << "-";
cout << endl;
for (int i = 0; i < c; i++)
cout << "-";
cout << endl;
}
void printMirror(double *data, int n1, int n2, int n3) {
printDiv(75);
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
}
cout << endl;
}
cout << endl;
}
cout << endl;
printDiv(75);
for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t";
}
cout << endl;
}
cout << endl;
}
cout << endl;
}
double sumData(double *data, int datasize) {
double sum = 0;
for (int i = 0; i < datasize; i++)
sum += data[i];
double sum = 0;
for (int i = 0; i < datasize; i++)
sum += data[i];
return sum;
return sum;
}
int main(int argc, char *argv[]) {
/* mpi init */
//int rank, nprocs;
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
char *api_name = new char[10];
char *device_name = new char[10];
/*
if (nprocs != 8) {
cout << "example was set to run with 8 processes" << endl;
cout << "exit..." << endl;
return 0;
}
*/
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
/* set domain size */
int NG[3] = {64, 64, 32};
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
}
//id[0] = 0;
//id[1] = NL[1] * (rank % 4);
//id[2] = NL[2] * (rank / 4);
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
/* print some messages bout the example in the begginig */
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
}
/* dks init and create 2 streams */
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
cout << "Use api: " << api_name << ", " << device_name << endl;
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
#endif
/* set domain size */
int NG[3] = {64, 64, 32};
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
//base.createStream(streamFFT);
//if (rank == 0) {
// base.createStream(streamGreens);
base.setupFFT(3, NG);
//}
/* print some messages bout the example in the begginig */
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
/* allocate memory and init rho field */
double *rho = new double[sizerho];
double *rho_out = new double[sizerho];
//double *green_out = new double[sizegreen];
initMirror(rho, NL[0], NL[1], NL[2]);
/* dks init and create 2 streams */
int dkserr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFT(3, NG);
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/* allocate memory and init rho field */
double *rho = new double[sizerho];
double *rho_out = new double[sizerho];
//double *green_out = new double[sizegreen];
double *mirror_out = new double[sizerho];
//initMirror(rho, NL[0], NL[1], NL[2]);
initMirror(rho, NG[0], NG[1], NG[2]);
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* send and receive pointer to allocated memory on device */
/*
if (rank == 0) {
for (int p = 1; p < nprocs; p++)
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
/* =================================================*/
/* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
double old_sum = 0;
/* =================================================*/
/* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
int hr_m[3] = {1, 1, 1};
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
double old_sum = 0;
double tmp_sum = 0;
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* calculate greens integral on gpu */
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
/* calculate greens integral on gpu */
//if (rank == 0)
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
/* mirror the field */
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
/*
base.readData<double>(grn_ptr, mirror_out, sizerho);
for (int i = 0; i < sizerho; i++)
cout << mirror_out[i] << " ";
cout << endl << endl;
/* mirror the field */
//if (rank == 0)
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
for (int i = 0; i < sizerho; i++)
cout << rho[i] << " ";
cout << endl << endl;
*/
/* transfer rho field to device */
base.writeData<double>(rho2_ptr, rho, sizerho);
/* get FFT of rho field */
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
/* get FFT of mirrored greens integral */
//if (rank == 0)
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* get FFT of mirrored greens integral */
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* transfer rho field to device */
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* multiply both FFTs */
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
/* get FFT of rho field */
//if (rank == 0) {
//base.syncDevice();
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
//}
/*
complex<double> *crho = new complex<double>[sizecomp];
complex<double> *cgre = new complex<double>[sizecomp];
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
/* multiply both FFTs */
//if (rank == 0)
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < sizecomp; i++)
cout << cgre[i].real() << " ";
cout << endl << endl;
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
//if (rank == 0) {
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
for (int i = 0; i < sizecomp; i++)
cout << crho[i].real() << " ";
cout << endl << endl;
delete[] crho;
delete[] cgre;
*/
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
base.readData<double> (rho2_ptr, rho_out, sizerho);
for (int i = 0; i < 10; i++)
cout << rho_out[i] << " ";
cout << endl;
old_sum = sumData(rho_out, sizerho);
}
/* =================================================*/
/* =================================================*/
/* ==========end fftpoison solver test run==========*/
/* =================================================*/
/* =================================================*/
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
base.freeMemory<double>(rho2_ptr, sizerho);
/* free memory on device */
//if (rank == 0) {
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
delete[] rho_out;
delete[] rho;
delete[] mirror_out;
cout << "Final sum: " << old_sum << endl;
}

81
test/testRandom.cpp Normal file
View File

@ -0,0 +1,81 @@
#include <iostream>
#include <string>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
using namespace std;
int main(int argc, char *argv[]) {
int size = 10;
bool apiSet = false;
char *api_name = new char[10];
char *device_name = new char[10];
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-N")) {
size = atoi(argv[i+1]);
i++;
}
}
if (!apiSet) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of randoms: " << size << endl;
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
base.callInitRandoms(size);
//create host vector to store results
double *host_data = new double[size];
//create device vector
void *device_data = base.allocateMemory<double>(size, ierr);
for (int i = 0; i < 5; i++) {
//fill device vector with random values
base.callCreateRandomNumbers(device_data, size);
//read device vector
base.readData<double>(device_data, host_data, size);
//print host data
for (int i = 0; i < size; i++)
cout << host_data[i] << " ";
cout << endl;
}
//free device vector
base.freeMemory<double>(device_data, size);
//free host data
delete[] host_data;
return 0;
}