18 Commits

Author SHA1 Message Date
432d2f5e4e Correct patch and minor versions set when installing dks 2017-04-05 17:28:13 +02:00
6e57cf5580 DKSConfigVersion file provided to allow cmake find_package to determine if compatible DKS version is installed 2017-03-15 15:18:58 +01:00
9e9a20c4af add tests for collimator physics and kick/push 2017-03-15 10:02:00 +01:00
9b9910e9f9 add seed to random number initialization 2017-03-15 10:01:28 +01:00
244ad0b230 OPALs greens function added to algorithms 2017-03-15 10:00:40 +01:00
9734157ad8 push and kick for OPALs BorisPusher 2017-03-15 09:59:33 +01:00
24f9c9dbf4 allow to enalbe wich parts of DKS to compile trough cmake flags 2017-02-28 15:11:59 +01:00
47fe8e8e52 allow to enalbe wich parts of DKS to compile trough cmake flags 2017-02-28 15:07:22 +01:00
eee9dfd89e seperate OPAL DKS functions from base 2017-02-28 15:06:45 +01:00
7c7c2e240b dksbase use Algoruthms base class for fft, colimator physics and greens function 2017-02-15 09:00:55 +01:00
e9d411235c merge dks-1.0.0-branch (fix conflicts) 2017-02-07 09:57:04 +01:00
b5c5da29b2 add function to generate list of random numbers with cuda and opencl on the device 2016-12-09 13:43:09 +01:00
3a74d6cdee init randoms - each thread gets same seed but different sequence 2016-11-28 16:23:29 +01:00
b3a12e02a8 OpenCL FFT using clfft and tests 2016-11-21 16:14:11 +01:00
4432d32480 OpenCL greens function calculation for OPAL 2016-11-17 18:03:28 +01:00
63a008d111 Greens function calculation for OPAL rewriten with abstract base class 2016-11-17 18:02:48 +01:00
87cdf52f07 Collimator physics for MIC fix 2016-11-16 18:12:17 +01:00
027bdc01f5 FFT for OpenCL using clFFT library 2016-11-16 18:12:00 +01:00
47 changed files with 2586 additions and 1442 deletions

View File

@ -1,10 +1,8 @@
CMAKE_MINIMUM_REQUIRED (VERSION 3.2) CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS) PROJECT (DKS)
SET (DKS_VERSION_MAJOR 1) SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 0) SET (DKS_VERSION_MINOR 1)
SET (DKS_VERSION_PATCH 2) SET (DKS_VERSION_PATCH 0)
SET (PACKAGE \"dks\")
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH}) set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
SET (PACKAGE \"dks\") SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\") SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -39,10 +37,44 @@ IF (Boost_FOUND)
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND) ENDIF (Boost_FOUND)
#include OPAL, musrfit or pet kernels
OPTION(DKS_FULL "Compile DKS with full library" OFF)
OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
IF (DKS_FULL)
SET(ENABLE_OPAL ON)
SET(ENABLE_MUSR ON)
SET(ENABLE_PET ON)
ENDIF(DKS_FULL)
#find clFFT
OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
IF (ENABLE_AMD)
SET (clFFT_USE_STATIC_LIBS OFF)
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
#find clRNG
#SET (clRNG_USE_STATIC_LIBS OFF)
#FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
#MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
#MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
#INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
#LINK_DIRECTORIES (${CLRNG_LIBRARIES})
#find_package(PkgConfig)
#pkg_check_modules(clRng REQUIRED)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
ENDIF (ENABLE_AMD)
#enable UQTK #enable UQTK
OPTION (USE_UQTK "Use UQTK" OFF) OPTION (USE_UQTK "Use UQTK" OFF)
#intel icpc compiler specific flags #intel icpc compiler specific flags
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
@ -185,4 +217,3 @@ INSTALL (
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}" DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
RENAME ${PROJECT_NAME}ConfigVersion.cmake RENAME ${PROJECT_NAME}ConfigVersion.cmake
) )

View File

@ -2,18 +2,30 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#chi square kernel tests #chi square kernel tests
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) IF (ENABLE_MUSR)
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES}) ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES}) TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF (ENABLE_MUSR)
IF (ENABLE_OPAL)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testPushKick testPushKick.cpp)
TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF(ENABLE_OPAL)
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})

View File

@ -0,0 +1,161 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
typedef struct {
int label;
unsigned localID;
double Rincol[3];
double Pincol[3];
} PART;
PART initPartSmall(int d) {
PART p;
p.label = 0;
p.localID = d;
p.Rincol[0] = 0.0;
p.Rincol[1] = 0.0;
p.Rincol[2] = 0.02;
p.Pincol[0] = 0.0;
p.Pincol[1] = 0.0;
p.Pincol[2] = 3.9920183237269791e-01;
return p;
}
void printPart(PART p) {
std::cout << "label: " << p.label << ", ";
std::cout << "localid: " << p.localID << ",";
std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
std::cout << std::endl;
}
void initParts(PART *p, int N) {
for (int i = 0; i < N; i++)
p[i] = initPartSmall(i);
}
void printParts(PART *p, int N) {
for (int i = 0; i < N; i++)
printPart(p[i]);
std::cout << std::endl;
}
void initParams(double *data) {
data[0] = 0.0;//2.0000000000000000e-02;
data[1] = 1.0;//1.0000000000000000e-02;
data[2] = 2.2100000000000000e+00;
data[3] = 6.0000000000000000e+00;
data[4] = 1.2010700000000000e+01;
data[5] = 2.6010000000000000e+00;
data[6] = 1.7010000000000000e+03;
data[7] = 1.2790000000000000e+03;
data[8] = 1.6379999999999999e-02;
data[9] = 1.9321266968325795e-01;
data[10] = 7.9000000000000000e+01;
data[11] = 1.0000000000000002e-12;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
//init part vector to test mc
PART *parts = new PART[numpart];
initParts(parts, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSOPAL base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
ierr = base.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *part_ptr, *param_ptr;
//allocate memory for particles
part_ptr = base.allocateMemory<PART>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<PART>(part_ptr, parts, numpart);
base.writeData<double>(param_ptr, params, 12);
int numaddback;
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
//read data from device
base.readData<PART>(part_ptr, parts, numpart);
//free memory
base.freeMemory<PART>(part_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
return 0;
}

View File

@ -0,0 +1,132 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
#include <vector_types.h>
#include "cuda_runtime.h"
void initData(double3 *data, int N) {
for (int i = 0; i < N; i++) {
data[i].x = (double)rand() / RAND_MAX;
data[i].y = (double)rand() / RAND_MAX;
data[i].z = (double)rand() / RAND_MAX;
}
}
void initDt(double *data, int N) {
for (int i = 0; i < N; i++) {
data[i] = 0.00001;
}
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
int ierr;
DKSOPAL dksbase;
dksbase.setAPI(api_name, strlen(api_name));
dksbase.setDevice(device_name, strlen(api_name));
ierr = dksbase.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
double3 *R = new double3[numpart];
double3 *P = new double3[numpart];
double3 *Ef = new double3[numpart];
double3 *Bf = new double3[numpart];
double *dt = new double[numpart];
initData(R, numpart);
initData(P, numpart);
initData(Ef, numpart);
initData(Bf, numpart);
initDt(dt, numpart);
void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
dksbase.writeData<double3>(r_ptr, R, numpart);
dksbase.writeData<double3>(p_ptr, P, numpart);
dksbase.writeData<double3>(ef_ptr, Ef, numpart);
dksbase.writeData<double3>(bf_ptr, Bf, numpart);
dksbase.writeData<double>(dt_ptr, dt, numpart);
for (int i = 0; i < loop; ++i)
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std::cout << "============" << std::endl;
dksbase.readData<double3>(r_ptr, R, numpart);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
dksbase.freeMemory<double3>(r_ptr, numpart);
dksbase.freeMemory<double3>(p_ptr, numpart);
dksbase.freeMemory<double3>(ef_ptr, numpart);
dksbase.freeMemory<double3>(bf_ptr, numpart);
dksbase.freeMemory<double>(dt_ptr, numpart);
delete[] R;
delete[] P;
delete[] Ef;
delete[] Bf;
delete[] dt;
}

View File

@ -4,4 +4,4 @@ SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
SET(${PROJECT_NAME}_LIBRARY "dks") SET(${PROJECT_NAME}_LIBRARY "dks")
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH}) SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
SET(DKS_VERSION ${DKS_VERSION}) SET(DKS_VERSION ${DKS_VERSION})
SET(DKS_VERSION_STR ${DKS_VERSION_STR}) SET(DKS_VERSION_STR ${DKS_VERSION_STR})

View File

@ -6,6 +6,7 @@ SET (_HDRS
ImageReconstruction.h ImageReconstruction.h
CollimatorPhysics.h CollimatorPhysics.h
FFT.h FFT.h
GreensFunction.h
) )
ADD_SOURCES (${_SRCS}) ADD_SOURCES (${_SRCS})

View File

@ -5,10 +5,7 @@
#include <string> #include <string>
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
class DKSBaseMuSR;
class DKSCollimatorPhysics { class DKSCollimatorPhysics {
friend class DKSBaseMuSR;
protected: protected:
@ -19,8 +16,7 @@ public:
virtual ~DKSCollimatorPhysics() { } virtual ~DKSCollimatorPhysics() { }
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices, virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;
bool enableRutherfordScattering = true) = 0;
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
@ -37,6 +33,10 @@ public:
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0; double dt, double c, bool usedt = false, int streamId = -1) = 0;
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0;
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false, double dt, double c, bool usedt = false,

View File

@ -0,0 +1,29 @@
#ifndef H_GREENSFUNCTION
#define H_GREENSFUNCTION
#include <iostream>
#include <cmath>
class GreensFunction {
public:
virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0;
/** mirror rho2_m field */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
};
#endif

View File

@ -35,13 +35,29 @@ ENDMACRO ()
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
DKSBase.h DKSBase.h
DKSDefinitions.h DKSDefinitions.h
DKSOPAL.h
) )
SET (DKS_BASEDIR_SRCS SET (DKS_BASEDIR_SRCS
DKSBase.cpp DKSBase.cpp
DKSOPAL.cpp
) )
IF (USE_CUDA OR USE_OPENCL) #add opal to DKS if enable_opal is set
IF (ENABLE_OPAL)
SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS}
DKSOPAL.h
)
SET (DKS_BASEDIR_SRCS
${DKS_BASEDIR_SRCS}
DKSOPAL.cpp
)
ENDIF (ENABLE_OPAL)
#and musrt to DKS if cuda or opencl is used and enable_musr is set
IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS} ${DKS_BASEDIR_HDRS}
DKSBaseMuSR.h DKSBaseMuSR.h
@ -51,9 +67,10 @@ IF (USE_CUDA OR USE_OPENCL)
${DKS_BASEDIR_SRCS} ${DKS_BASEDIR_SRCS}
DKSBaseMuSR.cpp DKSBaseMuSR.cpp
) )
ENDIF (USE_CUDA OR USE_OPENCL) ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
IF (USE_CUDA) #add image reconstruction to DKS if cuda is used and enable_pet is set
IF (USE_CUDA AND ENABLE_PET)
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS} ${DKS_BASEDIR_HDRS}
DKSImageReconstruction.h DKSImageReconstruction.h
@ -63,7 +80,7 @@ IF (USE_CUDA)
${DKS_BASEDIR_SRCS} ${DKS_BASEDIR_SRCS}
DKSImageReconstruction.cpp DKSImageReconstruction.cpp
) )
ENDIF (USE_CUDA) ENDIF (USE_CUDA AND ENABLE_PET)
ADD_HEADERS (${DKS_BASEDIR_HDRS}) ADD_HEADERS (${DKS_BASEDIR_HDRS})
ADD_SOURCES (${DKS_BASEDIR_SRCS}) ADD_SOURCES (${DKS_BASEDIR_SRCS})

View File

@ -1,35 +1,27 @@
SET (_HDRS SET (_HDRS CudaBase.cuh)
CudaBase.cuh SET (_SRCS CudaBase.cu)
CudaFFT.cuh
CudaGreensFunction.cuh
CudaChiSquare.cuh
CudaCollimatorPhysics.cuh
CudaImageReconstruction.cuh
CudaChiSquareRuntime.cuh
)
SET (_SRCS
CudaBase.cu
CudaFFT.cu
CudaGreensFunction.cu
CudaChiSquare.cu
CudaCollimatorPhysics.cu
CudaImageReconstruction.cu
CudaChiSquareRuntime.cu
)
#INCLUDE_DIRECTORIES ( IF (ENABLE_OPAL)
# ${CMAKE_CURRENT_SOURCE_DIR} SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh)
#) SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu)
ENDIF (ENABLE_OPAL)
IF (ENABLE_MUSR)
SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
ENDIF (ENABLE_MUSR)
IF (ENABLE_PET)
SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
ENDIF (ENABLE_PET)
MESSAGE (STATUS "CUDA headers: ${_HDRS}")
ADD_SOURCES(${_SRCS}) ADD_SOURCES(${_SRCS})
ADD_HEADERS(${_HDRS}) ADD_HEADERS(${_HDRS})
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA) INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
SET (_KERNELS
NVRTCKernels/CudaChiSquareKernel.cu
)
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels) INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)

View File

@ -13,6 +13,13 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
} }
__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
data[idx] = curand_uniform_double(&state[idx]);
}
//=====================================// //=====================================//
//==========Private functions==========// //==========Private functions==========//
@ -48,7 +55,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
int threads = 128; int threads = 128;
int blocks = size / threads + 1; int blocks = size / threads + 1;
if (seed == -1) if (seed == -1)
seed = time(NULL); seed = time(NULL);
//std::cout << "sizeof: " << sizeof(curandState) << std::endl; //std::cout << "sizeof: " << sizeof(curandState) << std::endl;
@ -69,6 +76,15 @@ int CudaBase::cuda_deleteCurandStates() {
return DKS_SUCCESS; return DKS_SUCCESS;
} }
int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
int threads = BLOCK_SIZE;
int blocks = size / threads + 1;
kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
return DKS_SUCCESS;
}
curandState* CudaBase::cuda_getCurandStates() { curandState* CudaBase::cuda_getCurandStates() {
return defaultRndState; return defaultRndState;
} }

View File

@ -15,6 +15,8 @@
#include <nvToolsExt.h> #include <nvToolsExt.h>
#include <time.h> #include <time.h>
#define BLOCK_SIZE 128
class CudaBase { class CudaBase {
private: private:
@ -39,6 +41,7 @@ public:
* Init cuda random number (cuRand) states. * Init cuda random number (cuRand) states.
* Create an array of type curandState with "size" elements on the GPU * Create an array of type curandState with "size" elements on the GPU
* and create a curandState with different seed for each array entry. * and create a curandState with different seed for each array entry.
* If no seed is given create a seed based on current time.
* Return success or error code * Return success or error code
*/ */
int cuda_createCurandStates(int size, int seed = -1); int cuda_createCurandStates(int size, int seed = -1);
@ -50,6 +53,11 @@ public:
*/ */
int cuda_deleteCurandStates(); int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array
*
*/
int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states /** Get a pointer to curand states
* *
*/ */
@ -167,6 +175,40 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
cudaError cerror;
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error zeroing cuda memory!\n");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
int dkserror = DKS_SUCCESS;
cudaError cerror;
if (streamId < cuda_numberOfStreams()) {
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
cuda_getStream(streamId));
if (cerror != cudaSuccess)
dkserror = DKS_ERROR;
} else
dkserror = DKS_ERROR;
return dkserror;
}
/** /**
* Info: write data to memory * Info: write data to memory
* Retrun: success or error code * Retrun: success or error code

View File

@ -23,7 +23,6 @@
#define X0_M 9 #define X0_M 9
#define I_M 10 #define I_M 10
#define DT_M 11 #define DT_M 11
#define LOWENERGY_THR 12
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
#define NUMPAR 12 #define NUMPAR 12
@ -34,6 +33,14 @@ __device__ inline double dot(double3 &d1, double3 &d2) {
} }
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
return tmp;
}
__device__ inline bool checkHit(double &z, double *par) { __device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */ /* check if particle is in the degrader material */
@ -82,7 +89,7 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
Eng = Eng + delta_E / 1E3; Eng = Eng + delta_E / 1E3;
} }
pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) ); pdead = ((Eng<1E-4) || (dEdx>0));
} }
@ -118,9 +125,7 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
double* par, bool enableRutherfordScattering)
{
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P; double gamma = (Eng + M_P) / M_P;
@ -148,7 +153,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state,
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par); Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if( (P2 < 0.0047) && enableRutherfordScattering) { if(P2 < 0.0047) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -174,7 +179,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state,
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par); Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if( (P2 < 0.0047) && enableRutherfordScattering) { if(P2 < 0.0047) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -188,7 +193,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state,
template <typename T> template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles, bool enableRutherfordScattering) int numparticles)
{ {
//get global id and thread id //get global id and thread id
@ -230,7 +235,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering); coulombScat(R[tid], P, s, p);
data[idx].Pincol = P; data[idx].Pincol = P;
} else { } else {
@ -253,8 +258,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
} }
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles, curandState *state, int numparticles)
bool enableRutherfordScattering)
{ {
//get global id and thread id //get global id and thread id
@ -292,7 +296,7 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p, enableRutherfordScattering); coulombScat(R[tid], P, s, p);
data.Pincol[idx] = P; data.Pincol[idx] = P;
} else { } else {
@ -427,7 +431,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
} }
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) { __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) {
//get global id and thread id //get global id and thread id
volatile int tid = threadIdx.x; volatile int tid = threadIdx.x;
@ -453,7 +457,51 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, dou
} }
} }
//TODO: kernel for push with switch off unitless positions with dt[i]*c __global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf,
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
{
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double3 Ef = gEf[idx];
double3 Bf = gBf[idx];
double dt = gdt[idx];
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
double gamma = sqrt(1.0 + dot(P, P));
double3 t, w, s;
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
double3 crossPt = cross(P, t);
w.x = P.x + crossPt.x;
w.y = P.y + crossPt.y;
w.z = P.z + crossPt.z;
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
double3 crossws = cross(w, s);
P.x += crossws.x;
P.y += crossws.y;
P.z += crossws.z;
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
gP[idx] = P;
}
}
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) { __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
@ -615,8 +663,7 @@ struct less_then
} }
}; };
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
bool enableRutherfordScattering)
{ {
int threads = BLOCK_SIZE; int threads = BLOCK_SIZE;
@ -629,8 +676,7 @@ int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int n
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr, kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
(double*)par_ptr, (double*)par_ptr,
m_base->cuda_getCurandStates(), m_base->cuda_getCurandStates(),
numparticles, numparticles);
enableRutherfordScattering);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) if (err != cudaSuccess)
@ -677,12 +723,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
} }
} else { } else {
if (streamId == -1) { if (streamId == -1) {
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart, kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, c); (double*)dt_ptr, npart, c);
} else { } else {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart, kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double*)dt_ptr, c); (double*)dt_ptr, npart, c);
} }
} }
@ -690,6 +736,29 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
return DKS_SUCCESS; return DKS_SUCCESS;
} }
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart,
double c, int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (streamId == -1) {
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double3*)ef_ptr, (double3*)bf_ptr,
(double*)dt_ptr, charge, mass, npart, c);
}
return DKS_SUCCESS;
}
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr, void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, int npart, int nsec,

View File

@ -110,7 +110,7 @@ public:
* *
*/ */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numpartices, bool enableRutherfordScattering = true); int numpartices);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
@ -141,6 +141,10 @@ public:
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1); double dt, double c, bool usedt = false, int streamId = -1);
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL /** BorisPusher push function with transformto function form OPAL
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal

View File

@ -189,12 +189,11 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho; rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
} }
} }
@ -273,28 +272,20 @@ __global__ void mirroredRhoField(double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i; id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri; id8 = rk * NI * NJ + rj * NI + ri;
double data = rho2_m[id1]; double data = rho2_m[id1];
if (i != 0) if (i != 0) rho2_m[id2] = data;
rho2_m[id2] = data;
if (j != 0) if (j != 0) rho2_m[id3] = data;
rho2_m[id3] = data;
if (i != 0 && j != 0) if (i != 0 && j != 0) rho2_m[id4] = data;
rho2_m[id4] = data;
if (k != 0) if (k != 0) rho2_m[id5] = data;
rho2_m[id5] = data;
if (k != 0 && i != 0) if (k != 0 && i != 0) rho2_m[id6] = data;
rho2_m[id6] = data;
if (k!= 0 && j != 0) if (k!= 0 && j != 0) rho2_m[id7] = data;
rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0) if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data;
rho2_m[id8] = data;
} }
@ -363,9 +354,9 @@ CudaGreensFunction::~CudaGreensFunction() {
delete m_base; delete m_base;
} }
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, double hr_m0, double hr_m1, double hr_m2,
int streamId) int streamId)
{ {
int thread = 128; int thread = 128;
@ -373,7 +364,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
//if no stream specified use default stream //if no stream specified use default stream
if (streamId == -1) { if (streamId == -1) {
kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -381,7 +372,7 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -389,15 +380,17 @@ int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, i
} }
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
int I, int J, int K, int I, int J, int K,
int streamId) int streamId)
{ {
int thread = 128; int thread = 128;
int block = (I * J * K / thread) + 1; int block = (I * J * K / thread) + 1;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
if (streamId == -1) { if (streamId == -1) {
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
@ -406,6 +399,7 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
@ -415,22 +409,22 @@ int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgr
return DKS_ERROR; return DKS_ERROR;
} }
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
int thread = 128; int thread = 128;
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1; int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
if (streamId == -1) { if (streamId == -1) {
mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J); mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I, 2*J);
mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1); mirroredRhoField<<< block, thread >>>( (double *) rho2_m, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J); mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I, 2*J);
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1); mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -440,13 +434,13 @@ int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K,
return DKS_ERROR; return DKS_ERROR;
} }
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2,
int size, int streamId) { int size, int streamId) {
int threads = 128; int threads = 128;
int blocks = size / threads + 1; int blocks = size / threads + 1;
int datasize = 2 * threads * sizeof(cuDoubleComplex); int datasize = 2 * threads * sizeof(cuDoubleComplex);
if (streamId == -1) { if (streamId == -1) {
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1, multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
(cuDoubleComplex*)ptr2, (cuDoubleComplex*)ptr2,

View File

@ -2,17 +2,17 @@
#define H_CUDA_GREENSFUNCTION #define H_CUDA_GREENSFUNCTION
#include <iostream> #include <iostream>
#include <math.h> #include <cmath>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuComplex.h> #include <cuComplex.h>
#include "cublas_v2.h" #include "cublas_v2.h"
#include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
class CudaGreensFunction { class CudaGreensFunction : public GreensFunction{
private: private:
@ -30,32 +30,32 @@ public:
/* destructor */ /* destructor */
~CudaGreensFunction(); ~CudaGreensFunction();
/* /**
Info: calc itegral on device memory (taken from OPAL src code) Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code Return: success or error code
*/ */
int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, double hr_m0, double hr_m1, double hr_m2,
int streamId = -1); int streamId = -1);
/* /**
Info: integration of rho2_m field (taken from OPAL src code) Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code Return: success or error code
*/ */
int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1); int streamId = -1);
/* /**
Info: mirror rho field (taken from OPAL src code) Info: mirror rho field (taken from OPAL src code)
Return: succes or error code Return: succes or error code
*/ */
int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/* /**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1 Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code Return: success or error code
*/ */
int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
}; };

View File

@ -103,25 +103,14 @@ DKSBase::DKSBase() {
#ifdef DKS_CUDA #ifdef DKS_CUDA
cbase = new CudaBase(); cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
oclbase = new OpenCLBase(); oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
micbase = new MICBase(); micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif #endif
} }
@ -138,25 +127,14 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
#ifdef DKS_CUDA #ifdef DKS_CUDA
cbase = new CudaBase(); cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
oclbase = new OpenCLBase(); oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
micbase = new MICBase(); micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif #endif
} }
@ -173,27 +151,16 @@ DKSBase::~DKSBase() {
if (m_function_name != NULL) if (m_function_name != NULL)
delete[] m_function_name; delete[] m_function_name;
#ifdef DKS_CUDA #ifdef DKS_CUDA
delete cfft;
delete cgreens;
delete cchi;
delete ccol;
delete cbase; delete cbase;
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
delete oclfft;
delete oclchi;
delete oclcol;
delete oclbase; delete oclbase;
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
delete micfft;
delete miccol;
delete micgreens;
delete micchi;
delete micbase; delete micbase;
#endif #endif
@ -307,38 +274,45 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
return DKS_ERROR; return DKS_ERROR;
} }
/* int DKSBase::setupDevice() {
init device
*/ int ierr = DKS_ERROR;
int DKSBase::initDevice() {
//if api is not set default is OpenCL //if api is not set default is OpenCL
if (!m_api_set) { if (!m_api_set) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else { } else {
if (apiOpenCL()) { if (apiOpenCL()) {
if (!m_device_set) { if (!m_device_set) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else { } else {
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) ); ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
} }
} else if (apiCuda()) { } else if (apiCuda()) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_CUDA, 4); setAPI(API_CUDA, 4);
return CUDA_SAFECALL(DKS_SUCCESS); ierr = CUDA_SAFECALL(DKS_SUCCESS);
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
setDevice("-mic", 4); setDevice("-mic", 4);
setAPI(API_OPENMP, 6); setAPI(API_OPENMP, 6);
return MIC_SAFECALL(DKS_SUCCESS); ierr = MIC_SAFECALL(DKS_SUCCESS);
} }
} }
return DKS_ERROR; return ierr;
}
/*
init device
*/
int DKSBase::initDevice() {
return setupDevice();
} }
/* /*
@ -456,359 +430,16 @@ int DKSBase::syncDevice() {
return DKS_ERROR; return DKS_ERROR;
} }
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSBase::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) {
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size));
else if (apiOpenMP()) if (apiOpenCL())
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale)); return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size));
return DKS_ERROR; return DKS_ERROR;
} }
//BENI:
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
else if (apiOpenMP())
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
//load kernel and execute
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
DEBUG_MSG("No implementation for selected platform");
return DKS_SUCCESS;
}
/* normalize complex to real iFFT */
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
if (apiCuda()) {
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId) );
} else if (apiOpenMP()) {
//BENI:
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
else
return DKS_ERROR;
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{
if (apiCuda())
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles,
enableRutherfordScattering) );
else if (apiOpenMP())
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
if (apiOpenMP()) {
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles) );
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callInitRandoms(int size, int seed) { int DKSBase::callInitRandoms(int size, int seed) {
if (apiCuda()) if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_createCurandStates(size, seed)); return CUDA_SAFECALL(cbase->cuda_createCurandStates(size, seed));
@ -821,43 +452,3 @@ int DKSBase::callInitRandoms(int size, int seed) {
return DKS_ERROR; return DKS_ERROR;
} }
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
usedt, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
c, usedt, streamId));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}

View File

@ -29,31 +29,17 @@
#endif #endif
#include "OpenCL/OpenCLBase.h" #include "OpenCL/OpenCLBase.h"
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLChiSquare.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh" #include "CUDA/CudaBase.cuh"
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaChiSquare.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#include "nvToolsExt.h" #include "nvToolsExt.h"
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
#include "MIC/MICBase.h" #include "MIC/MICBase.h"
#include "MIC/MICChiSquare.h"
#include "MIC/MICFFT.h"
#include "MIC/MICCollimatorPhysics.h"
#include "MIC/MICGreensFunction.hpp"
#endif #endif
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#include "AutoTuning/DKSConfig.h" #include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */ /** DKSBase class for handling function calls to DKS library */
@ -73,25 +59,14 @@ private:
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
OpenCLBase *oclbase; OpenCLBase *oclbase;
OpenCLFFT *oclfft;
OpenCLChiSquare *oclchi;
OpenCLCollimatorPhysics *oclcol;
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA
CudaBase *cbase; CudaBase *cbase;
CudaFFT *cfft;
CudaGreensFunction *cgreens;
CudaChiSquare *cchi;
CudaCollimatorPhysics *ccol;
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
MICBase *micbase; MICBase *micbase;
MICFFT *micfft;
MICCollimatorPhysics *miccol;
MICGreensFunction *micgreens;
MICChiSquare *micchi;
#endif #endif
protected: protected:
@ -139,6 +114,12 @@ protected:
} }
#endif #endif
#ifdef DKS_MIC
MICBase *getMICBase() {
return micbase;
}
#endif
/** Call OpenCL base to load specified kenrel file. /** Call OpenCL base to load specified kenrel file.
* *
*/ */
@ -154,6 +135,7 @@ protected:
return device_name; return device_name;
} }
public: public:
/** /**
@ -173,6 +155,11 @@ public:
*/ */
~DKSBase(); ~DKSBase();
/** Function to initialize objects based on the device used.
*
*/
int setupDevice();
/** Turn on auto tuning */ /** Turn on auto tuning */
void setAutoTuningOn() { m_auto_tuning = true; } void setAutoTuningOn() { m_auto_tuning = true; }
@ -405,7 +392,7 @@ public:
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
#ifdef DKS_MIC #ifdef DKS_MIC
void * mem_ptr = NULL; void * mem_ptr = NULL;
mem_ptr = micbase.mic_allocateMemory<T>(elements); mem_ptr = micbase->mic_allocateMemory<T>(elements);
return mem_ptr; return mem_ptr;
#endif #endif
} }
@ -498,7 +485,7 @@ public:
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset)); return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset)); return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
} }
@ -532,7 +519,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset)); return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset)); return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
} }
return DKS_ERROR; return DKS_ERROR;
@ -832,7 +819,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset)); return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset)); return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
} }
return DKS_ERROR; return DKS_ERROR;
@ -860,7 +847,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset)); return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements, return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
streamId, offset)); streamId, offset));
} }
@ -880,221 +867,23 @@ public:
else if (apiCuda()) else if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr)); return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
else if (apiOpenMP()) else if (apiOpenMP())
return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements)); return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
return DKS_ERROR; return DKS_ERROR;
} }
///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/** /**
* Transpose 2D and 3D arrays, OpenCL implementation * Create random numbers on the device and fille mem_data array
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
*/ */
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim); int callCreateRandomNumbers(void *mem_ptr, int size);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Chi square for parameter fitting on device.
* mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
* intermediate results. Chi square results are put in &results
*/
int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering = true);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
/** /**
* Init random number states and save for reuse on device. * Init random number states and save for reuse on device.
* If seed is -1, a random seed based on current time is taken.
* TODO: opencl and mic implementations. * TODO: opencl and mic implementations.
*/ */
int callInitRandoms(int size, int seed = -1); int callInitRandoms(int size, int seed = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
/** /**
* Print memory information on device (total, used, available) * Print memory information on device (total, used, available)
* TODO: opencl and mic imlementation * TODO: opencl and mic imlementation

View File

@ -62,6 +62,12 @@
#define OPENCL_SAFEINIT(x) ( NULL ) #define OPENCL_SAFEINIT(x) ( NULL )
#endif #endif
#ifdef DKS_AMD
#define OPENCL_SAFEINIT_AMD(x) ( x )
#else
#define OPENCL_SAFEINIT_AMD(x) ( NULL )
#endif
#ifdef DKS_MIC #ifdef DKS_MIC
#define MIC_SAFEINIT(x) ( x ) #define MIC_SAFEINIT(x) ( x )
#else #else

298
src/DKSOPAL.cpp Normal file
View File

@ -0,0 +1,298 @@
#include "DKSOPAL.h"
DKSOPAL::DKSOPAL() {
dksfft = nullptr;
dkscol = nullptr;
dksgreens = nullptr;
}
DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
setAPI(api_name, strlen(api_name));
setDevice(device_name, strlen(device_name));
}
DKSOPAL::~DKSOPAL() {
delete dksfft;
delete dkscol;
delete dksgreens;
}
int DKSOPAL::setupOPAL() {
int ierr = DKS_ERROR;
if (apiOpenCL()) {
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
//TODO: only enable if AMD libraries are available
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
} else if (apiCuda()) {
ierr = CUDA_SAFECALL( DKS_SUCCESS );
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
} else if (apiOpenMP()) {
ierr = MIC_SAFECALL( DKS_SUCCESS );
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
} else {
ierr = DKS_ERROR;
}
return ierr;
}
int DKSOPAL::initDevice() {
int ierr = setupDevice();
if (ierr == DKS_SUCCESS)
ierr = setupOPAL();
return ierr;
}
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSOPAL::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return dksfft->setupFFT(ndim, N);
} else if (apiOpenCL()) {
int ierr1 = dksfft->setupFFT(ndim, N);
int ierr2 = dksfft->setupFFTRC(ndim, N);
int ierr3 = dksfft->setupFFTCR(ndim, N);
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTRC(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTRC(ndim, N, scale);
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTCR(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTCR(ndim, N, scale);
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
else
return DKS_ERROR;
} else if (apiCuda()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
} else if (apiOpenMP()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
else if (apiOpenCL())
return DKS_ERROR;
else if (apiOpenMP())
return DKS_ERROR;
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId);
}
int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
}
int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
}
int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
}
int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
}
int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId) {
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
}
int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId)
{
return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
charge, mass, npart, c, streamId);
}
int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt, c, usedt, streamId);
}

233
src/DKSOPAL.h Normal file
View File

@ -0,0 +1,233 @@
#ifndef H_DKS_OPAL
#define H_DKS_OPAL
#include <iostream>
#include "AutoTuning/DKSAutoTuning.h"
#include "DKSBase.h"
#include "DKSDefinitions.h"
#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLGreensFunction.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#endif
#ifdef DKS_MIC
#include "MIC/MICFFT.h"
#include "MIC/MICGreensFunction.hpp"
#include "MIC/MICCollimatorPhysics.h"
#endif
class DKSOPAL : public DKSBase {
private:
DKSFFT *dksfft;
DKSCollimatorPhysics *dkscol;
GreensFunction *dksgreens;
int setupOPAL();
public:
DKSOPAL();
DKSOPAL(const char* api_name, const char* device_name);
~DKSOPAL();
int initDevice();
///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1);
};
#endif

View File

@ -1,19 +1,24 @@
SET (_SRCS SET (_SRCS MICBase.cpp)
MICBase.cpp SET (_HDRS MICBase.h)
MICChiSquare.cpp
MICFFT.cpp
MICGreensFunction.cpp
MICCollimatorPhysics.cpp
)
SET (_HDRS IF (ENABLE_OPAL)
MICBase.h SET (_SRCS
MICChiSquare.h ${_SRCS}
MICFFT.h MICChiSquare.cpp
MICCollimatorPhysics.h MICFFT.cpp
MICGreensFunction.hpp MICGreensFunction.cpp
MICMergeSort.h MICCollimatorPhysics.cpp
) )
SET (_HDRS
${_HDRS}
MICChiSquare.h
MICFFT.h
MICCollimatorPhysics.h
MICGreensFunction.hpp
MICMergeSort.h
)
ENDIF (ENABLE_OPAL)
#INCLUDE_DIRECTORIES ( #INCLUDE_DIRECTORIES (
# ${CMAKE_CURRENT_SOURCE_DIR} # ${CMAKE_CURRENT_SOURCE_DIR}

View File

@ -18,30 +18,28 @@ int MICBase::mic_createRandStreams(int size) {
int seed = time(NULL); int seed = time(NULL);
#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed) int numThreads = 0;
#pragma offload target(mic:m_device_id) inout(numThreads)
{ {
//get the number of threads
int numThreads;
#pragma omp parallel #pragma omp parallel
numThreads = omp_get_num_threads(); numThreads = omp_get_num_threads();
}
//if default rnd stream already allocated delete the array defaultRndStream = mic_allocateMemory<VSLStreamStatePtr>(numThreads);
if (defaultRndSet == 1) VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream;
delete[] defaultRndStream; maxThreads = numThreads;
//allocate defaultRndStream array #pragma offload target(mic:m_device_id) \
defaultRndStream = new VSLStreamStatePtr[numThreads]; in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN) \
in(seed)
{
//create stream states for each thread //create stream states for each thread
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < omp_get_num_threads(); i++) for (int i = 0; i < omp_get_num_threads(); i++)
vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i); vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i);
defaultRndSet = 1;
} }
defaultRndSet = 1;
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -49,15 +47,8 @@ int MICBase::mic_createRandStreams(int size) {
//delete default rand streams //delete default rand streams
int MICBase::mic_deleteRandStreams() { int MICBase::mic_deleteRandStreams() {
#pragma offload target(mic:m_device_id) inout(defaultRndSet) //mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236);
{ return DKS_SUCCESS;
if (defaultRndSet == 1) {
delete[] defaultRndStream;
defaultRndSet = -1;
}
}
return DKS_ERROR;
} }
//create a new signal for the mic //create a new signal for the mic

View File

@ -30,14 +30,19 @@ class MICBase {
private: private:
std::vector<int> micStreams; std::vector<int> micStreams;
int maxThreads;
protected: protected:
int defaultRndSet; int defaultRndSet;
public: public:
VSLStreamStatePtr *defaultRndStream;
//#pragma offload_attribute(push,target(mic))
void *defaultRndStream; //VSLSStreamStatePtr
void *testPtr;
//#pragma offload_attribute(pop)
int m_device_id; int m_device_id;
/* constructor */ /* constructor */
@ -202,7 +207,6 @@ public:
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE) #pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
{ {
} }
return DKS_SUCCESS; return DKS_SUCCESS;
} }

View File

@ -292,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
const double deltas = par[DT_M] * beta * C; const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M]; const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) { if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073; const double Ts = (Eng * 1E6) / 1.0073;
@ -338,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
const double deltas = par[DT_M] * beta * C; const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M]; const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) { if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073; const double Ts = (Eng * 1E6) / 1.0073;
@ -368,23 +368,23 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
} }
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
int numparticles, boll enableRutherfordScattering)
{
//cast device memory pointers to appropriate types //cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
double *par = (double*) par_ptr; double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target(mic:m_micbase->m_device_id) \ #pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \ inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles) in(numparticles)
{ {
#pragma omp parallel #pragma omp parallel
{ {
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()]; VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x //for loop trough particles if not checkhit set label to -2 and update R.x
@ -461,6 +461,8 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
int padding = numparticles % MIC_WIDTH; int padding = numparticles % MIC_WIDTH;
int totalpart = numparticles + padding; int totalpart = numparticles + padding;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target (mic:0) \ #pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \ in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \ in(localID:length(0) DKS_REUSE DKS_RETAIN) \
@ -471,14 +473,16 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
in(py:length(0) DKS_REUSE DKS_RETAIN) \ in(py:length(0) DKS_REUSE DKS_RETAIN) \
in(pz:length(0) DKS_REUSE DKS_RETAIN) \ in(pz:length(0) DKS_REUSE DKS_RETAIN) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(totalpart) in(totalpart)
{ {
#pragma omp parallel #pragma omp parallel
{ {
//every thread gets its own rnd stream state //every thread gets its own rnd stream state
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()]; //VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
#pragma omp for nowait #pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -514,9 +518,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
double Eng = (sq - 1) * M_P; double Eng = (sq - 1) * M_P;
double dEdx = 0; double dEdx = 0;
if (label[i] == 0) { if (label[i] == 0) {
energyLoss(Eng, dEdx, par, randv, i - ii); energyLoss(Eng, dEdx, par, randv, i - ii);
} }
if (Eng > 1e-4 && dEdx < 0) { if (Eng > 1e-4 && dEdx < 0) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
@ -528,11 +534,12 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
if (Eng < 1e-4 || dEdx > 0) if (Eng < 1e-4 || dEdx > 0)
label[i] = -1; label[i] = -1;
} //end inner energy loss loop } //end inner energy loss loop
} //end outer energy loss loop
} //end outer energy loss loop
//vectorize coulomb scattering as much as possible //vectorize coulomb scattering as much as possible
#pragma omp for nowait #pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -542,7 +549,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
} //end omp parallel } //end omp parallel
} //end offload } //end offload
return DKS_SUCCESS; return DKS_SUCCESS;
} }

View File

@ -26,7 +26,7 @@ typedef struct {
} MIC_PART_SMALL; } MIC_PART_SMALL;
class MICCollimatorPhysics : DKSAlogorithms{ class MICCollimatorPhysics : public DKSCollimatorPhysics {
private: private:
@ -38,10 +38,9 @@ public:
m_micbase = base; m_micbase = base;
}; };
~MICCollimatorPhysics() { }; ~MICCollimatorPhysics() { };
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
bool enableRutherfordScattering = true);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,

View File

@ -6,13 +6,16 @@
MICFFT::MICFFT(MICBase *base) { MICFFT::MICFFT(MICBase *base) {
m_micbase = base; m_micbase = base;
m_fftsetup = false;
} }
MICFFT::~MICFFT() { MICFFT::~MICFFT() {
if (m_fftsetup) {
#pragma offload target(mic:0) #pragma offload target(mic:0)
{ {
DftiFreeDescriptor(&FFTHandle_m); DftiFreeDescriptor(&FFTHandle_m);
DftiFreeDescriptor(&handle); DftiFreeDescriptor(&handle);
}
} }
} }
@ -35,7 +38,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
} }
m_fftsetup = true;
return DKS_SUCCESS; return DKS_SUCCESS;
} }
//BENI: //BENI:
@ -122,8 +125,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
} }
//execute iFFT //execute iFFT
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) { int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) {
return mic_executeFFT(mem_ptr, ndim, N, -1, false); return executeFFT(mem_ptr, ndim, N, -1, false);
} }
//execute REAL->COMPLEX FFT //execute REAL->COMPLEX FFT

View File

@ -7,13 +7,14 @@
#include <offload.h> #include <offload.h>
#include <mkl_dfti.h> #include <mkl_dfti.h>
#include "../Algorithm/DKSFFT.h" #include "../Algorithms/FFT.h"
#include "MICBase.h" #include "MICBase.h"
class MICFFT : public DKSFFT { class MICFFT : public DKSFFT {
private: private:
bool m_fftsetup;
MICBase *m_micbase; MICBase *m_micbase;
/// Internal FFT object for performing serial FFTs. /// Internal FFT object for performing serial FFTs.
@ -74,6 +75,18 @@ public:
/* normalize IFFT on MIC */ /* normalize IFFT on MIC */
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
/**
* Info: destroy default FFT plans
* Return: success or error code
*/
int destroyFFT() { return DKS_SUCCESS; }
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
}; };
#endif #endif

View File

@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
} }
*/ */
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0, int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m1, double hr_m2) double hr_m0, double hr_m1, double hr_m2, int streamId)
{ {
double *tmp_ptr = (double*) tmp_ptr_; double *tmp_ptr = (double*) tmpgreen;
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
{ {
std::memset(tmp_ptr,0,I*J*K); std::memset(tmp_ptr,0,I*J*K);
@ -173,12 +173,14 @@ return 0;
*/ */
//CUDA similar version: //CUDA similar version:
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) { int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
double *tmpgreen = (double*) tmp_ptr_; int streamId)
double *mem_ptr = (double*) mem_ptr_; {
double *tmpgreen_ptr = (double*) tmpgreen;
double *mem_ptr = (double*) rho2_m;
// the actual integration // the actual integration
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) #pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{ {
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
std::memset(mem_ptr,0,II*JJ*KK); std::memset(mem_ptr,0,II*JJ*KK);
@ -197,27 +199,27 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0; tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp) if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp) if (i+1 < NI_tmp)
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp1 = tmpgreen_ptr[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp) if (j+1 < NJ_tmp)
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; tmp2 = tmpgreen_ptr[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp) if (k+1 < NK_tmp)
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp3 = tmpgreen_ptr[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp) if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp) if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp5 = tmpgreen_ptr[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp) if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp6 = tmpgreen_ptr[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp7 = tmpgreen_ptr[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
@ -234,8 +236,8 @@ int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) { int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) {
double *mem_ptr = (double*) mem_ptr_; double *mem_ptr = (double*) rho2_m;
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) #pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{ {
@ -281,11 +283,11 @@ int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K)
} }
/*multiply complex fields*/ /*multiply complex fields*/
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) { int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) {
// double *mem_ptr1 = (double*) mem_ptr1_; // double *mem_ptr1 = (double*) mem_ptr1_;
// double *mem_ptr2 = (double*) mem_ptr2_; // double *mem_ptr2 = (double*) mem_ptr2_;
_Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_; _Complex double *mem_ptr1 = (_Complex double *) ptr1;
_Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_; _Complex double *mem_ptr2 = (_Complex double *) ptr2;
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size) #pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
{ {

View File

@ -9,12 +9,13 @@
#include <offload.h> #include <offload.h>
#include <mkl_dfti.h> #include <mkl_dfti.h>
#include "../Algorithms/GreensFunction.h"
#include "MICBase.h" #include "MICBase.h"
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
class MICGreensFunction { class MICGreensFunction : public GreensFunction {
private: private:
MICBase *m_micbase; MICBase *m_micbase;
@ -28,16 +29,18 @@ public:
~MICGreensFunction(); ~MICGreensFunction();
/* compute greens integral analytically */ /* compute greens integral analytically */
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2); int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
/* perform the actual integration */ /* perform the actual integration */
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K); int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K,
int stremaId = -1);
/* Mirror rho-Field */ /* Mirror rho-Field */
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K); int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1);
/*multiply complex fields*/ /*multiply complex fields*/
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size); int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1);
}; };

View File

@ -1,31 +1,39 @@
SET (_SRCS #dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found
OpenCLBase.cpp
OpenCLFFT.cpp
OpenCLChiSquare.cpp
OpenCLCollimatorPhysics.cpp
OpenCLChiSquareRuntime.cpp
)
SET (_HDRS SET (_HDRS OpenCLBase.h)
OpenCLBase.h SET (_SRCS OpenCLBase.cpp)
OpenCLFFT.h SET (_KERNELS "")
OpenCLChiSquare.h
OpenCLCollimatorPhysics.h
OpenCLChiSquareRuntime.h
)
#INCLUDE_DIRECTORIES ( IF (ENABLE_MUSR)
# ${CMAKE_CURRENT_SOURCE_DIR} SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
#) SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp)
SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl)
ENDIF (ENABLE_MUSR)
SET (_KERNELS IF (ENABLE_AMD AND ENABLE_OPAL)
OpenCLKernels/OpenCLChiSquare.cl SET (_SRCS
OpenCLKernels/OpenCLFFT.cl ${_SRCS}
OpenCLKernels/OpenCLFFTStockham.cl OpenCLFFT.cpp
OpenCLKernels/OpenCLTranspose.cl OpenCLCollimatorPhysics.cpp
OpenCLKernels/OpenCLCollimatorPhysics.cl OpenCLGreensFunction.cpp
OpenCLKernels/OpenCLChiSquareRuntime.cl )
SET (_HDRS
${_HDRS}
OpenCLFFT.h
OpenCLCollimatorPhysics.h
OpenCLGreensFunction.h
)
SET (_KERNELS
${_KERNELS}
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLGreensFunction.cl
) )
ENDIF (ENABLE_AMD AND ENABLE_OPAL)
ADD_SOURCES (${_SRCS}) ADD_SOURCES (${_SRCS})
ADD_HEADERS (${_HDRS}) ADD_HEADERS (${_HDRS})

View File

@ -7,21 +7,13 @@ cl_device_id OpenCLBase::m_device_id = NULL;
cl_event OpenCLBase::m_last_event = NULL; cl_event OpenCLBase::m_last_event = NULL;
OpenCLBase::OpenCLBase() { OpenCLBase::OpenCLBase() {
//m_context = NULL;
//m_command_queue = NULL;
m_program = NULL; m_program = NULL;
m_kernel = NULL; m_kernel = NULL;
//m_device_id = NULL;
//m_platform_id = NULL;
m_kernel_file = NULL; m_kernel_file = NULL;
m_last_event = NULL; m_last_event = NULL;
//m_events = new cl_event[500];
//m_num_events = 0;
defaultRndSet = 0; defaultRndSet = 0;
} }
OpenCLBase::~OpenCLBase() { OpenCLBase::~OpenCLBase() {
@ -41,11 +33,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl"); strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
ocl_loadKernel(kernel_file); ocl_loadKernel(kernel_file);
delete[] kernel_file; delete[] kernel_file;
//allocate memory for rand states //allocate memory for rand states
int ierr; int ierr;
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr); defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
//exec kernel //exec kernel
int seed = 0; int seed = 0;
ocl_createKernel("initRand"); ocl_createKernel("initRand");
@ -55,13 +47,34 @@ int OpenCLBase::ocl_createRndStates(int size) {
size_t work_items = size; size_t work_items = size;
size_t work_group_size = 1; size_t work_group_size = 1;
ocl_executeKernel(1, &work_items, &work_group_size); ocl_executeKernel(1, &work_items, &work_group_size);
defaultRndSet = 1; defaultRndSet = 1;
return DKS_SUCCESS;
}
return OCL_SUCCESS; int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) {
//load kernel
char * kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
ocl_loadKernel(kernel_file);
delete[] kernel_file;
//set kernel variables
cl_mem tmp_data = (cl_mem) mem_ptr;
ocl_createKernel("createRandoms");
ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
ocl_setKernelArg(2, sizeof(int), &size);
size_t work_size = 128;
size_t work_items = (size % work_size + 1) * work_size;
ocl_executeKernel(1, &work_items, &work_size);
return DKS_SUCCESS;
} }
/* destroy rnd states */ /* destroy rnd states */
@ -70,7 +83,7 @@ int OpenCLBase::ocl_deleteRndStates() {
ocl_freeMemory(defaultRndState); ocl_freeMemory(defaultRndState);
defaultRndSet = 0; defaultRndSet = 0;
return OCL_SUCCESS; return DKS_SUCCESS;
} }
@ -428,7 +441,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
int ierr; int ierr;
//create program from kernel //create program from kernel
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr); m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source,
NULL, &ierr);
if (ierr != CL_SUCCESS) { if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr); DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
return DKS_ERROR; return DKS_ERROR;
@ -438,7 +452,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL); ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
/* /*
check if compileng kernel source succeded, if failed return error code check if compiling kernel source succeded, if failed return error code
if in debug mode get compilation info and print program build log witch if in debug mode get compilation info and print program build log witch
will give indication what made the compilation fail will give indication what made the compilation fail
*/ */
@ -447,7 +461,8 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
//get build status //get build status
cl_build_status status; cl_build_status status;
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &status, NULL);
//get log size //get log size
size_t log_size; size_t log_size;
@ -613,12 +628,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
} }
} }
if (ierr != OCL_SUCCESS) { if (ierr != DKS_SUCCESS) {
DEBUG_MSG("Failed to build kernel file " << kernel_file); DEBUG_MSG("Failed to build kernel file " << kernel_file);
return OCL_ERROR; return DKS_ERROR;
} }
return OCL_SUCCESS; return DKS_SUCCESS;
} }
//compile kernel form source code provided //compile kernel form source code provided

View File

@ -30,13 +30,10 @@
#include <CL/cl_ext.h> #include <CL/cl_ext.h>
#endif #endif
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/* struct for random number state */ /* struct for random number state */
typedef struct { typedef struct {
double s10; double s10;
double s11; double s11;
double s12; double s12;
@ -45,16 +42,12 @@ typedef struct {
double s22; double s22;
double z; double z;
bool gen; bool gen;
} RNDState; } RNDState;
class OpenCLBase { class OpenCLBase {
private: private:
static cl_context m_context;
static cl_command_queue m_command_queue;
static cl_platform_id m_platform_id; static cl_platform_id m_platform_id;
static cl_device_id m_device_id; static cl_device_id m_device_id;
@ -118,6 +111,9 @@ protected:
public: public:
static cl_context m_context;
static cl_command_queue m_command_queue;
/* /*
constructor constructor
@ -135,6 +131,11 @@ public:
*/ */
int ocl_createRndStates(int size); int ocl_createRndStates(int size);
/* Create an array of random numbers on the device
*
*/
int ocl_createRandomNumbers(void *mem_ptr, int size);
/* /*
Destroy rnd states Destroy rnd states
Return: success or error code Return: success or error code
@ -195,7 +196,7 @@ public:
Return: return pointer to memory Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int &ierr); cl_mem ocl_allocateMemory(size_t size, int &ierr);
/* /*
Name: allocateMemory Name: allocateMemory
Info: allocate memory on device Info: allocate memory on device
@ -203,6 +204,20 @@ public:
*/ */
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer
* Set all the elemetns in the device array to zero
*/
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
cl_int ierr;
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
sizeof(T)*size, 0, nullptr, nullptr);
if (ierr != CL_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
}
/* /*
Name: writeData Name: writeData
Info: write data to device memory (needs ptr to mem object) Info: write data to device memory (needs ptr to mem object)

View File

@ -34,7 +34,7 @@ TODO:
2. boost.compute sort for user defined structure crashes 2. boost.compute sort for user defined structure crashes
*/ */
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, bool enableRutherfordScattering) int numparticles)
{ {
/* /*
//set number of total threads, and number threads per block //set number of total threads, and number threads per block

View File

@ -52,8 +52,7 @@ public:
} }
/* execute degrader code on device */ /* execute degrader code on device */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles, int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
bool enableRutherfordScattering = true);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,

View File

@ -31,7 +31,6 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS) if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
return OCL_ERROR; return OCL_ERROR;
//execute kernel //execute kernel
for (int step = 1; step < N; step <<= 1) { for (int step = 1; step < N; step <<= 1) {
@ -89,26 +88,78 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
call fft execution on device for every dimension call fft execution on device for every dimension
*/ */
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
int ierr;
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem inout = (cl_mem)data; cl_mem inout = (cl_mem)data;
int n = N[0];
for (int dim = 0; dim < ndim; dim++) { if (forward)
ierr = ocl_callBitReverseKernel(inout, dim, ndim, n); ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
if (ierr != OCL_SUCCESS) { 0, NULL, NULL, &inout, NULL, NULL);
DEBUG_MSG("Error executing bit reverse"); else
return OCL_ERROR; ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
} 0, NULL, NULL, &inout, NULL, NULL);
ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward); if (ierr != OCL_SUCCESS) {
if (ierr != OCL_SUCCESS) { dkserr = DKS_ERROR;
DEBUG_MSG("Error executing fft reverse"); DEBUG_MSG("Error executing cfFFT\n");
return OCL_ERROR; if (ierr == CLFFT_INVALID_PLAN)
} std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
} }
return OCL_SUCCESS; return dkserr;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &real_in, &comp_out, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &comp_out, &real_in, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
} }
/* /*
@ -120,10 +171,11 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
} }
/* /*
call kernel to normalize fft call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled.
*/ */
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
/*
cl_mem inout = (cl_mem)data; cl_mem inout = (cl_mem)data;
int n = N[0]; int n = N[0];
@ -150,132 +202,175 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
DEBUG_MSG("Error executing kernel"); DEBUG_MSG("Error executing kernel");
return OCL_ERROR; return OCL_ERROR;
} }
*/
return OCL_SUCCESS; return OCL_SUCCESS;
} }
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) { int OpenCLFFT::setupFFT(int ndim, int N[3]) {
int ierr;
int size = sizeof(cl_double2)*pow(N,ndim);
cl_mem mem_tmp;
cl_mem mem_src = (cl_mem)src;
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
//set the number of work items in each dimension cl_int err;
size_t work_items[3];
int p = 1;
int threads = N / 2;
int f = (forward) ? -1 : 1;
//execute kernel
int n = (int)log2(N);
for (int i = 0; i < ndim; i++) {
int dim = i+1; clfftDim dim;
p = 1;
work_items[0] = (dim == 1) ? N/2 : N;
work_items[1] = (dim == 2) ? N/2 : N;
work_items[2] = (dim == 3) ? N/2 : N;
//transpose array if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
//create kernel and set kernel arguments
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
return OCL_ERROR;
for (int t = 1; t <= log2(N); t++) {
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
return OCL_ERROR;
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
p = 2*p;
}
//transpose array back if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
}
if (ndim*n % 2 == 1) {
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
}
m_oclbase->ocl_freeMemory(mem_dst);
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
cl_mem mem_src = (cl_mem)src;
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
m_oclbase->ocl_createKernel("fft_batch3D");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
for (int dim = 1; dim < ndim+1; dim++) {
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
cl_mem mem_src = (cl_mem)src;
if (ndim == 1) if (ndim == 1)
return OCL_SUCCESS; dim = CLFFT_1D;
else if (ndim == 2)
size_t work_items[3]; dim = CLFFT_2D;
work_items[0] = N[0]; else
work_items[1] = N[1]; dim = CLFFT_3D;
work_items[2] = 1; size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t work_group_size[3]; /* Create 3D fft plan*/
work_group_size[0] = N[0]; err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength);
work_group_size[1] = N[1];
work_group_size[2] = 1;
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2]; /* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE);
m_oclbase->ocl_createKernel("transpose"); if (err != CL_SUCCESS)
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); std::cout << "Error setting precision" << std::endl;
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src); err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]); if (err != CL_SUCCESS)
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]); std::cout << "Error setting layout" << std::endl;
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL); err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size); if (err != CL_SUCCESS)
std::cout << "Error setting result location" << std::endl;
/* Bake the plan */
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-complex plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
size_t clOutStride[3] = {1, half, half * N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Real-to-complex plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, half, half * N[1]};
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-real plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::destroyFFT() {
clfftDestroyPlan(&planHandleZ2Z);
clfftDestroyPlan(&planHandleD2Z);
clfftDestroyPlan(&planHandleZ2D);
clfftTeardown();
return DKS_SUCCESS;
}
void OpenCLFFT::printError(clfftStatus err) {
if (err != CL_SUCCESS) {
std::cout << "Error creating default plan " << err << std::endl;
switch(err) {
case CLFFT_BUGCHECK:
std::cout << "bugcheck" << std::endl;
break;
case CLFFT_NOTIMPLEMENTED:
std::cout << "not implemented" << std::endl;
break;
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
std::cout << "transposed not implemented" << std::endl;
break;
case CLFFT_FILE_NOT_FOUND:
std::cout << "file not found" << std::endl;
break;
case CLFFT_FILE_CREATE_FAILURE:
std::cout << "file create failure" << std::endl;
break;
case CLFFT_VERSION_MISMATCH:
std::cout << "version missmatch" << std::endl;
break;
case CLFFT_INVALID_PLAN:
std::cout << "invalid plan" << std::endl;
break;
case CLFFT_DEVICE_NO_DOUBLE:
std::cout << "no double" << std::endl;
break;
case CLFFT_DEVICE_MISMATCH:
std::cout << "device missmatch" << std::endl;
break;
case CLFFT_ENDSTATUS:
std::cout << "end status" << std::endl;
break;
default:
std::cout << "other: " << err << std::endl;
break;
}
}
return OCL_SUCCESS;
} }
/* /*

View File

@ -20,12 +20,19 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "OpenCLBase.h" #include "OpenCLBase.h"
#include "clFFT.h"
class OpenCLFFT : public DKSFFT { class OpenCLFFT : public DKSFFT {
private: private:
OpenCLBase *m_oclbase; OpenCLBase *m_oclbase;
clfftSetupData fftSetup;
clfftPlanHandle planHandleZ2Z;
clfftPlanHandle planHandleD2Z;
clfftPlanHandle planHandleZ2D;
/* /*
Info: call fft kernels to execute FFT of the given domain, Info: call fft kernels to execute FFT of the given domain,
data - devevice memory ptr, cdim - current dim to transform, data - devevice memory ptr, cdim - current dim to transform,
@ -42,15 +49,31 @@ private:
*/ */
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N); int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
/** Get clfftStatus and print the corresponding error message.
* clfftStatus is returned from all clFFT library functions, print error displays the
* corresponding error message. If "other" is printed then error code corresponds to
* OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
* be checked to determine the reason for the error.
*/
void printError(clfftStatus err);
public: public:
/* constructor - currently does nothing*/ /* constructor - currently does nothing*/
OpenCLFFT(OpenCLBase *base) { OpenCLFFT(OpenCLBase *base) {
m_oclbase = base; m_oclbase = base;
/* Set up fft */
cl_int err;
err = clfftInitSetupData(&fftSetup);
err = clfftSetup(&fftSetup);
if (err != CL_SUCCESS)
DEBUG_MSG("Error seting up clFFT");
} }
/* destructor - currently does nothing*/ /* destructor - currently does nothing*/
~OpenCLFFT() { } ~OpenCLFFT() { destroyFFT(); }
/* /*
Info: execute forward fft function with data set on device Info: execute forward fft function with data set on device
@ -77,35 +100,23 @@ public:
Info: set FFT size Info: set FFT size
Return: success or error code Return: success or error code
*/ */
int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; } int setupFFT(int ndim, int N[3]);
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } int setupFFTRC(int ndim, int N[3], double scale = 1.0);
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } int setupFFTCR(int ndim, int N[3], double scale = 1.0);
int destroyFFT() { return DKS_SUCCESS; } int destroyFFT();
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) int streamId = -1);
{
return DKS_ERROR;
}
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1) int streamId = -1);
{
return DKS_ERROR;
}
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
{ {
return DKS_ERROR; return DKS_ERROR;
} }
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
//void printData3DN4(cl_double2* &data, int N); //void printData3DN4(cl_double2* &data, int N);
}; };

View File

@ -0,0 +1,181 @@
#include "OpenCLGreensFunction.h"
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
m_base = base;
base_create = false;
}
OpenCLGreensFunction::OpenCLGreensFunction() {
m_base = new OpenCLBase();
base_create = true;
}
OpenCLGreensFunction::~OpenCLGreensFunction() {
if (base_create)
delete m_base;
}
int OpenCLGreensFunction::buildProgram() {
char *kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, GREENS_KERNEL);
return m_base->ocl_loadKernel(kernel_file);
}
int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelTmpgreen");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
int NI = 2*(I - 1);
int NJ = 2*(J - 1);
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelIntegration");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
double zero = 0.0;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
int NI = I + 1;
int NJ = J + 1;
int NK = K + 1;
int I2 = 2*I;
int J2 = 2*J;
int K2 = 2*K;
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
//set the work item size
size_t work_size = 128;
size_t work_items = NI * NJ * NK;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem mem_ptr1 = (cl_mem) ptr1;
cl_mem mem_ptr2 = (cl_mem) ptr2;
//set the work item size
size_t work_size = 128;
size_t work_items = size;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("multiplyComplexFields");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
m_base->ocl_setKernelArg(2, sizeof(int), &size);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}

View File

@ -0,0 +1,63 @@
#ifndef H_OPENCL_GREENSFUNCTION
#define H_OPENCL_GREENSFUNCTION
#include <iostream>
#include <cmath>
#include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h"
class OpenCLGreensFunction : public GreensFunction {
private:
bool base_create;
OpenCLBase *m_base;
public:
/** Constructor with OpenCLBase argument */
OpenCLGreensFunction(OpenCLBase *base);
/** Default constructor */
OpenCLGreensFunction();
/** Destructor */
~OpenCLGreensFunction();
/** Load OpenCL kernel file containing greens function kernels.
* m_base takes the kernel file and compiles the OpenCL programm.
*/
int buildProgram();
/**
Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code
*/
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
/**
Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code
*/
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/**
Info: mirror rho field (taken from OPAL src code)
Return: succes or error code
*/
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code
*/
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
};
#endif

View File

@ -1,6 +1,4 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION
/******Random numbers********/ /******Random numbers********/
@ -89,13 +87,14 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
if (id < N) { if (id < N) {
RNDState tmp; RNDState tmp;
int tmp_seed = id;// * 0x100000000ULL; int tmp_seed = 2*id;// * 0x100000000ULL;
tmp.s10 = 12345 + tmp_seed; tmp.s10 = 12345 + tmp_seed;
tmp.s11 = 12345 + tmp_seed; tmp.s11 = 12345 + tmp_seed;
tmp.s12 = 123 + tmp_seed; tmp.s12 = 12345 + tmp_seed;
tmp.s20 = 12345 + tmp_seed; tmp.s20 = 12345 + tmp_seed;
tmp.s21 = 12345 + tmp_seed; tmp.s21 = 12345 + tmp_seed;
tmp.s22 = 123 + tmp_seed; tmp.s22 = 12345 + tmp_seed;
tmp.z = 0; tmp.z = 0;
tmp.gen = true; tmp.gen = true;
@ -105,6 +104,19 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
} }
/* create random numbers and fill an array */
__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
int idx = get_global_id(0);
if (idx < size) {
RNDState s = states[idx];
data[idx] = rand_uniform(&s);
states[idx] = s;
}
}
/**********Degrader**********/ /**********Degrader**********/
enum PARAMS { POSITION, enum PARAMS { POSITION,

View File

@ -0,0 +1,170 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/** compute the greens integral analytically */
__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
int NI, int NJ, int NK)
{
int tid = get_local_size(0);
int id = get_global_id(0);
if (id < NI * NJ * NK) {
int i = id % NI;
int k = id / (NI * NJ);
int j = (id - k * NI * NJ) / NI;
double cellVolume = hr_m0 * hr_m1 * hr_m2;
double vv0 = i * hr_m0 - hr_m0 / 2;
double vv1 = j * hr_m1 - hr_m1 / 2;
double vv2 = k * hr_m2 - hr_m2 / 2;
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
tmpgrn = tmpgrn / 2;
tmpgrn += vv1 * vv2 * log(vv0 + r);
tmpgrn += vv0 * vv2 * log(vv1 + r);
tmpgrn += vv0 * vv1 * log(vv2 + r);
tmpgreen[id] = tmpgrn / cellVolume;
}
}
/** perform the actual integration */
__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
{
int tid = get_local_id(0);
int id = get_global_id(0);
int ni = NI;
int nj = NJ;
double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp)
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp)
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp)
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
/** miror rho-field */
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
rho2_m[0] = rho2_m[NI*NJ];
}
__kernel void kernelMirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp,
int size)
{
int tid = get_local_id(0);
int id = get_global_id(0);
if (id == 0)
rho2_m[0] = rho2_m[NI * NJ];
barrier(CLK_GLOBAL_MEM_FENCE);
int id1, id2, id3, id4, id5, id6, id7, id8;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
int ri = NI - i;
int rj = NJ - j;
int rk = NK - k;
id1 = k * NI * NJ + j * NI + i;
id2 = k * NI * NJ + j * NI + ri;
id3 = k * NI * NJ + rj * NI + i;
id4 = k * NI * NJ + rj * NI + ri;
id5 = rk * NI * NJ + j * NI + i;
id6 = rk * NI * NJ + j * NI + ri;
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = 0.0;
if (id1 < size)
data = rho2_m[id1];
if (i != 0 && id2 < size) rho2_m[id2] = data;
if (j != 0 && id3 < size) rho2_m[id3] = data;
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
if (k != 0 && id5 < size) rho2_m[id5] = data;
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
}
}
/** multiply complex fields */
double2 ComplexMul(double2 a, double2 b) {
double2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
int size)
{
int idx = get_global_id(0);
if (idx < size)
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
}

View File

@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testFFT testFFT.cpp) #ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp) #ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp) #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp) #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp) #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp) #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,10 +22,11 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testGather testGather.cpp) #ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp) #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp) #ADD_EXECUTABLE(testTranspose testTranspose.cpp)
ADD_EXECUTABLE(testRandom testRandom.cpp)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp) #ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp) #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -38,8 +39,8 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#TARGET_LINK_LIBRARIES(testFFT dks) #TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks) #TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks) #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
#TARGET_LINK_LIBRARIES(testFFT3D dks) TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testFFT3DRC dks) TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks) #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks) #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -53,10 +54,11 @@ ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
#TARGET_LINK_LIBRARIES(testGather dks) #TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks) #TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks) #TARGET_LINK_LIBRARIES(testTranspose dks)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES}) TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks) TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testPush dks) #TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks) TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testIntegration dks) #TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks) #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
@ -81,4 +83,4 @@ TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
#IF (NOT CUDA_VERSION VERSION_LESS "7.0") #IF (NOT CUDA_VERSION VERSION_LESS "7.0")
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
#TARGET_LINK_LIBRARIES(testChiSquareRT dks) #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") #ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")

View File

@ -129,7 +129,9 @@ int main(int argc, char *argv[]) {
//init random //init random
base.callInitRandoms(numpart); base.callInitRandoms(numpart);
//**test collimator physics and sort***// //**test collimator physics and sort***//
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr; void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
//allocate memory for particles //allocate memory for particles
@ -210,8 +212,8 @@ int main(int argc, char *argv[]) {
base.freeMemory<double>(pz_ptr, numpart); base.freeMemory<double>(pz_ptr, numpart);
base.freeMemory<double>(param_ptr, 12); base.freeMemory<double>(param_ptr, 12);
/* /*
std::cout << std::fixed << std::setprecision(4); std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
std::cout << p.label[i] << "\t" << p.rx[i] std::cout << p.label[i] << "\t" << p.rx[i]

View File

@ -1,6 +1,7 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include <complex> #include <complex>
#include <string>
#include "Utility/TimeStamp.h" #include "Utility/TimeStamp.h"
#include "DKSBase.h" #include "DKSBase.h"
@ -18,22 +19,30 @@ int main(int argc, char *argv[]) {
int N = 16; int N = 16;
char *api_name = new char[10]; char *api_name = new char[10];
char *device_name = new char[10]; char *device_name = new char[10];
if (argc == 2) {
N = atoi(argv[1]); for (int i = 1; i < argc; i++) {
strcpy(api_name, "Cuda"); if (argv[i] == string("-cuda")) {
strcpy(device_name, "-gpu"); strcpy(api_name, "Cuda");
} else if (argc == 3) { strcpy(device_name, "-gpu");
N = atoi(argv[1]); }
strcpy(api_name, argv[2]);
strcpy(device_name, "-gpu"); if (argv[i] == string("-opencl")) {
} else if (argc == 4) { strcpy(api_name, "OpenCL");
N = atoi(argv[1]); strcpy(device_name, "-gpu");
strcpy(api_name, argv[2]); }
strcpy(device_name, argv[3]);
} else { if (argv[i] == string("-mic")) {
N = 16; strcpy(api_name, "OpenMP");
strcpy(api_name, "OpenCL"); strcpy(device_name, "-mic");
strcpy(device_name, "-gpu"); }
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
if (argv[i] == string("-N"))
N = atoi(argv[i+1]);
} }
cout << "Use api: " << api_name << ", " << device_name << endl; cout << "Use api: " << api_name << ", " << device_name << endl;
@ -74,9 +83,16 @@ int main(int argc, char *argv[]) {
/* write data to device */ /* write data to device */
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N); ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
if (N < 5)
printData3DN4(cdata, N, 3);
/* execute fft */ /* execute fft */
base.callFFT(mem_ptr, 3, dimsize); base.callFFT(mem_ptr, 3, dimsize);
if (N < 5) {
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
printData3DN4(cfft, N, 3);
}
/* execute ifft */ /* execute ifft */
base.callIFFT(mem_ptr, 3, dimsize); base.callIFFT(mem_ptr, 3, dimsize);
@ -86,7 +102,9 @@ int main(int argc, char *argv[]) {
/* read data from device */ /* read data from device */
base.readData< complex<double> >(mem_ptr, cifft, N*N*N); base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
if (N < 5)
printData3DN4(cifft, N, 3);
/* free device memory */ /* free device memory */
base.freeMemory< complex<double> >(mem_ptr, N*N*N); base.freeMemory< complex<double> >(mem_ptr, N*N*N);
@ -130,7 +148,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
if (a < 10e-5 && a > -10e-5) if (a < 10e-5 && a > -10e-5)
a = 0; a = 0;
cout << d << "; " << a << "\t"; cout << "(" << d << "," << a << ") ";
} }
} }
cout << endl; cout << endl;
@ -157,3 +175,5 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
cout << "Size " << N << " CC <--> CC diff: " << sum << endl; cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
} }

View File

@ -1,6 +1,8 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include <complex> #include <complex>
#include <fstream>
#include <iomanip>
#include "Utility/TimeStamp.h" #include "Utility/TimeStamp.h"
#include "DKSBase.h" #include "DKSBase.h"
@ -8,54 +10,53 @@
using namespace std; using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3]); void initData(double *data, int dimsize[3], int dim);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop); bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name, char *file_name);
void printHelp(); void printHelp();
void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* &data, int N, int dim);
double precision(double a) {
//if (a < 1e-10)
// return 0.0;
//else
return a;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
int N1 = 8; int N1 = 8;
int N2 = 8; int N2 = 8;
int N3 = 8; int N3 = 8;
int dim = 3; int dim = 3;
int loop = 10; int loop = 0;
char *api_name = new char[10];
char *device_name = new char[10];
char *file_name = new char[50];
if ( readParams(argc, argv, N1, N2, N3, loop) ) if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) )
return 0; return 0;
int dimsize[3] = {N3, N2, N1}; cout << "Use api: " << api_name << ", " << device_name << endl;
int dimsize[3] = {N1, N2, N3};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal]; double *rdata = new double[sizereal];
double *outdata = new double[sizereal]; double *outdata = new double[sizereal];
complex<double> *cfft = new complex<double>[sizecomp]; complex<double> *cfft = new complex<double>[sizecomp];
initData(rdata, dimsize, dim);
for (int i=0; i<sizecomp; ++i) {
cfft[i].real() = 7.;
cfft[i].imag() = 3.33;
}
initData(rdata, dimsize);
/* init DKSBase */ /* init DKSBase */
cout << "Init device and set function" << endl; cout << "Init device and set function" << endl;
#ifdef DKS_MIC
DKSBase base; DKSBase base;
base.setAPI("OpenMP", 6); base.setAPI(api_name, strlen(api_name));
base.setDevice("-mic", 4); base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFTRC(dim, dimsize);
/* setup backward fft (COMPLEX->REAL) */
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
#endif
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice(); base.initDevice();
base.setupFFT(dim, dimsize); base.setupFFT(dim, dimsize);
#endif
// allocate memory on device // allocate memory on device
int ierr; int ierr;
@ -67,69 +68,59 @@ int main(int argc, char *argv[]) {
// execute one run before starting the timers // execute one run before starting the timers
base.writeData<double>(real_ptr, rdata, sizereal); base.writeData<double>(real_ptr, rdata, sizereal);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
base.readData<double>(real_res_ptr, outdata, sizereal); base.readData<double>(real_res_ptr, outdata, sizereal);
//timer for total loop time, FFT and IFFT calls
struct timeval timeStart, timeEnd;
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
gettimeofday(&timeStart, NULL);
for (int i=0; i<loop; ++i){
// write data to device
base.writeData<double>(real_ptr, rdata, sizereal);
// execute rcfft
gettimeofday(&timeFFTStart[i], NULL);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeFFTEnd[i], NULL);
// execute crfft
gettimeofday(&timeIFFTStart[i], NULL);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeIFFTEnd[i], NULL);
//normalize
#ifdef DKS_CUDA
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
#endif
// read IFFT data from device
base.readData<double>(real_res_ptr, outdata, sizereal);
ofstream myfile;
myfile.open(file_name);
myfile<< "in\tout\treal\timag\n";
for (int i = 0; i < sizereal; i++) {
//myfile << precision(rdata[i]) << "\t";
//myfile << precision(outdata[i]) << "\t";
if (i < sizecomp) {
myfile << precision(cfft[i].real()) << "\t";
myfile << precision(cfft[i].imag());
}
myfile << "\n";
} }
gettimeofday(&timeEnd, NULL); myfile.close();
/*
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1; j++) {
cout << rdata[i*N1 + j] << " ";
}
cout << endl;
}
cout << endl;
}
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1 / 2 + 1; j++) {
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
}
cout << endl;
}
cout << endl;
}
*/
// free device memory // free device memory
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp); base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
base.freeMemory<double>(real_ptr, sizereal); base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(real_res_ptr, sizereal); base.freeMemory<double>(real_res_ptr, sizereal);
// compare in and out data to see if we get back the same results // compare in and out data to see if we get back the same results
cout << "comp" << endl;
compareData(rdata, outdata, N1, N2, N3, dim); compareData(rdata, outdata, N1, N2, N3, dim);
cout << "done" << endl;
//calculate seconds for total time and fft times
double tfft = 0;
double tifft = 0;
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
for (int i = 0; i < loop; i++) {
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
}
//print timing results
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
<< "\n\n";
return 0; return 0;
} }
@ -137,10 +128,10 @@ int main(int argc, char *argv[]) {
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) { void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
int id; int id;
double sum = 0; double sum = 0;
for (int i = 0; i < NI; i++) { for (int i = 0; i < NK; i++) {
for (int j = 0; j < NJ; j++) { for (int j = 0; j < NJ; j++) {
for (int k = 0; k < NK; k++) { for (int k = 0; k < NI; k++) {
id = k*NI*NJ + j*NI + i; id = i*NI*NJ + j*NI + k;
sum += fabs(data1[id] - data2[id]); sum += fabs(data1[id] - data2[id]);
} }
} }
@ -148,13 +139,21 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
std::cout << "RC <--> CR diff: " << sum << std::endl; std::cout << "RC <--> CR diff: " << sum << std::endl;
} }
void initData(double *data, int dimsize[3]) { void initData(double *data, int dimsize[3], int dim) {
for (int i = 0; i < dimsize[2]; i++) { if (dim == 3) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) { for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) { for (int k = 0; k < dimsize[0]; k++) {
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; data[j*dimsize[0] + k] = sin(k);
} }
} }
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = sin(k);
} }
} }
@ -173,10 +172,17 @@ void printHelp() {
std::cout << std::endl; std::cout << std::endl;
} }
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) { bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim,
char *api_name, char *device_name, char *file_name)
{
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
if ( argv[i] == std::string("-dim")) {
dim = atoi(argv[i + 1]);
i++;
}
if ( argv[i] == std::string("-grid") ) { if ( argv[i] == std::string("-grid") ) {
N1 = atoi(argv[i + 1]); N1 = atoi(argv[i + 1]);
N2 = atoi(argv[i + 2]); N2 = atoi(argv[i + 2]);
@ -193,7 +199,72 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
printHelp(); printHelp();
return true; return true;
} }
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
strcpy(file_name, "cuda_fft.dat");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
strcpy(file_name, "opencl_fft.dat");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
strcpy(file_name, "openmp_fft.dat");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
strcpy(file_name, "opencl_cpu_fft.dat");
}
} }
return false; return false;
} }
void printData3DN4(complex<double>* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N/2 + 1; k++) {
double d = data[i*N*N + j*N + k].real();
double a = data[i*N*N + j*N + k].imag();
if (d < 10e-5 && d > -10e-5)
d = 0;
if (a < 10e-5 && a > -10e-5)
a = 0;
cout << "(" << d << "," << a << ") ";
}
}
cout << endl;
}
cout << endl;
}
void printData3DN4(double* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N; k++) {
double d = data[i*N*N + j*N + k];
if (d < 10e-5 && d > -10e-5)
d = 0;
cout << d << " ";
}
}
cout << endl;
}
cout << endl;
}

View File

@ -1,5 +1,4 @@
#include <iostream> #include <iostream>
//#include <mpi.h>
#include <string.h> #include <string.h>
#include "DKSBase.h" #include "DKSBase.h"
@ -11,309 +10,265 @@ using namespace std;
void printData3D(double* data, int N, int NI, const char *message = "") { void printData3D(double* data, int N, int NI, const char *message = "") {
if (strcmp(message, "") != 0) if (strcmp(message, "") != 0)
cout << message; cout << message;
for (int i = 0; i < NI; i++) { for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) { for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t"; cout << data[i*N*N + j*N + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
} }
void initData(double *data, int N) { void initData(double *data, int N) {
for (int i = 0; i < N/4 + 1; i++) { for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) { for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) { for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1; data[i*N*N + j*N + k] = k+1;
} }
} }
} }
} }
void initData2(double *data, int N) { void initData2(double *data, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
data[i] = i; data[i] = i;
} }
void initComplex( complex<double> *d, int N) { void initComplex( complex<double> *d, int N) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0); d[i] = complex<double>(2, 0);
} }
} }
void printComplex(complex<double> *d, int N) { void printComplex(complex<double> *d, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
cout << d[i] << "\t"; cout << d[i] << "\t";
cout << endl; cout << endl;
}
void printDouble(double *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << ", ";
cout << endl;
} }
void initMirror(double *data, int n1, int n2, int n3) { void initMirror(double *data, int n1, int n2, int n3) {
int d = 1; int d = 1;
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++; data[i * n2 * n1 + j * n1 + k] = d++;
else else
data[i * n2 * n1 + j * n1 + k] = 0; data[i * n2 * n1 + j * n1 + k] = 0;
} }
} }
} }
} }
void printDiv(int c) { void printDiv(int c) {
for (int i = 0; i < c; i++) for (int i = 0; i < c; i++)
cout << "-"; cout << "-";
cout << endl; cout << endl;
} }
void printMirror(double *data, int n1, int n2, int n3) { void printMirror(double *data, int n1, int n2, int n3) {
printDiv(75); printDiv(75);
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t"; cout << data[i * n2 * n1 + j * n1 + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
double sumData(double *data, int datasize) { double sumData(double *data, int datasize) {
double sum = 0; double sum = 0;
for (int i = 0; i < datasize; i++) for (int i = 0; i < datasize; i++)
sum += data[i]; sum += data[i];
return sum; return sum;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
/* mpi init */ char *api_name = new char[10];
//int rank, nprocs; char *device_name = new char[10];
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* for (int i = 1; i < argc; i++) {
if (nprocs != 8) { if (argv[i] == string("-cuda")) {
cout << "example was set to run with 8 processes" << endl; strcpy(api_name, "Cuda");
cout << "exit..." << endl; strcpy(device_name, "-gpu");
return 0; }
}
*/
/* set domain size */ if (argv[i] == string("-opencl")) {
int NG[3] = {64, 64, 32}; strcpy(api_name, "OpenCL");
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; strcpy(device_name, "-gpu");
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; }
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
//id[0] = 0; if (argv[i] == string("-mic")) {
//id[1] = NL[1] * (rank % 4); strcpy(api_name, "OpenMP");
//id[2] = NL[2] * (rank / 4); strcpy(device_name, "-mic");
}
/* print some messages bout the example in the begginig */ if (argv[i] == string("-cpu")) {
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; strcpy(api_name, "OpenCL");
//cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; strcpy(device_name, "-cpu");
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; }
//cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; }
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
/* dks init and create 2 streams */ cout << "Use api: " << api_name << ", " << device_name << endl;
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
#ifdef DKS_CUDA /* set domain size */
DKSBase base; int NG[3] = {64, 64, 32};
base.setAPI("Cuda", 4); int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
base.setDevice("-gpu", 4); int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
base.initDevice(); int sizerho = NG[0] * NG[1] * NG[2];
#endif int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
//base.createStream(streamFFT); /* print some messages bout the example in the begginig */
//if (rank == 0) { cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
// base.createStream(streamGreens); cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
base.setupFFT(3, NG);
//}
/* allocate memory and init rho field */ /* dks init and create 2 streams */
double *rho = new double[sizerho]; int dkserr;
double *rho_out = new double[sizerho]; DKSBase base;
//double *green_out = new double[sizegreen]; base.setAPI(api_name, strlen(api_name));
initMirror(rho, NL[0], NL[1], NL[2]); base.setDevice(device_name, strlen(device_name));
base.initDevice();
base.setupFFT(3, NG);
/* /* allocate memory and init rho field */
allocate memory on device for double *rho = new double[sizerho];
- rho field double *rho_out = new double[sizerho];
- rho FFT //double *green_out = new double[sizegreen];
- tmpgreen double *mirror_out = new double[sizerho];
- greens integral //initMirror(rho, NL[0], NL[1], NL[2]);
- greens integral FFT initMirror(rho, NG[0], NG[1], NG[2]);
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* send and receive pointer to allocated memory on device */ /* =================================================*/
/* /* =================================================*/
if (rank == 0) { /* =====loop trough fftpoison solver iterations=====*/
for (int p = 1; p < nprocs; p++) /* =================================================*/
base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); /* =================================================*/
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
double old_sum = 0;
/* =================================================*/ int hr_m[3] = {1, 1, 1};
/* =================================================*/ base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]);
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
double old_sum = 0; /* calculate greens integral on gpu */
double tmp_sum = 0; base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* calculate greens integral on gpu */ /* mirror the field */
//if (rank == 0) base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); /*
base.readData<double>(grn_ptr, mirror_out, sizerho);
for (int i = 0; i < sizerho; i++)
cout << mirror_out[i] << " ";
cout << endl << endl;
/* mirror the field */ for (int i = 0; i < sizerho; i++)
//if (rank == 0) cout << rho[i] << " ";
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); cout << endl << endl;
*/
/* transfer rho field to device */
base.writeData<double>(rho2_ptr, rho, sizerho);
/* get FFT of rho field */
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
/* get FFT of mirrored greens integral */ /* get FFT of mirrored greens integral */
//if (rank == 0) base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* transfer rho field to device */ /* multiply both FFTs */
//base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT); base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* get FFT of rho field */ /*
//if (rank == 0) { complex<double> *crho = new complex<double>[sizecomp];
//base.syncDevice(); complex<double> *cgre = new complex<double>[sizecomp];
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp);
//} base.readData< complex<double> >(grntr_ptr, cgre, sizecomp);
/* multiply both FFTs */ for (int i = 0; i < sizecomp; i++)
//if (rank == 0) cout << cgre[i].real() << " ";
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); cout << endl << endl;
//MPI_Barrier(MPI_COMM_WORLD);
/* inverse fft and transfer data back */ for (int i = 0; i < sizecomp; i++)
/* cout << crho[i].real() << " ";
multiple device syncs and mpi barriers are used to make sure data cout << endl << endl;
transfer is started when results are ready and progam moves on
only when data transfer is finished delete[] crho;
*/ delete[] cgre;
//if (rank == 0) { */
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
base.readData<double> (rho2_ptr, rho_out, sizerho);
for (int i = 0; i < 10; i++)
cout << rho_out[i] << " ";
cout << endl;
old_sum = sumData(rho_out, sizerho);
}
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
/* ==========end fftpoison solver test run==========*/ /* ==========end fftpoison solver test run==========*/
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
base.freeMemory<double>(rho2_ptr, sizerho);
delete[] rho_out;
/* free memory on device */ delete[] rho;
//if (rank == 0) { delete[] mirror_out;
base.freeMemory<double>(tmpgreen_ptr, sizegreen); cout << "Final sum: " << old_sum << endl;
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
} }

81
test/testRandom.cpp Normal file
View File

@ -0,0 +1,81 @@
#include <iostream>
#include <string>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
using namespace std;
int main(int argc, char *argv[]) {
int size = 10;
bool apiSet = false;
char *api_name = new char[10];
char *device_name = new char[10];
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-N")) {
size = atoi(argv[i+1]);
i++;
}
}
if (!apiSet) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of randoms: " << size << endl;
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
base.callInitRandoms(size);
//create host vector to store results
double *host_data = new double[size];
//create device vector
void *device_data = base.allocateMemory<double>(size, ierr);
for (int i = 0; i < 5; i++) {
//fill device vector with random values
base.callCreateRandomNumbers(device_data, size);
//read device vector
base.readData<double>(device_data, host_data, size);
//print host data
for (int i = 0; i < size; i++)
cout << host_data[i] << " ";
cout << endl;
}
//free device vector
base.freeMemory<double>(device_data, size);
//free host data
delete[] host_data;
return 0;
}