3 Commits

47 changed files with 1447 additions and 2591 deletions

View File

@ -1,8 +1,10 @@
CMAKE_MINIMUM_REQUIRED (VERSION 3.2) CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS) PROJECT (DKS)
SET (DKS_VERSION_MAJOR 1) SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 1) SET (DKS_VERSION_MINOR 0)
SET (DKS_VERSION_PATCH 0) SET (DKS_VERSION_PATCH 2)
SET (PACKAGE \"dks\")
set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH}) set (DKS_VERSION ${DKS_VERSION_MAJOR}.${DKS_VERSION_MINOR}.${DKS_VERSION_PATCH})
SET (PACKAGE \"dks\") SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\") SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
@ -37,44 +39,10 @@ IF (Boost_FOUND)
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND) ENDIF (Boost_FOUND)
#include OPAL, musrfit or pet kernels
OPTION(DKS_FULL "Compile DKS with full library" OFF)
OPTION(ENABLE_OPAL "Compile DKS with OPAL kernels" OFF)
OPTION(ENABLE_MUSR "Compile DKS with musrfit kernels" OFF)
OPTION(ENABLE_PET "Compile DKS with PET reconstruction kernels" OFF)
IF (DKS_FULL)
SET(ENABLE_OPAL ON)
SET(ENABLE_MUSR ON)
SET(ENABLE_PET ON)
ENDIF(DKS_FULL)
#find clFFT
OPTION (ENABLE_AMD "Enable AMD libraries" OFF)
IF (ENABLE_AMD)
SET (clFFT_USE_STATIC_LIBS OFF)
FIND_PACKAGE(clFFT REQUIRED HINTS $ENV{CLFFT_PREFIX} $ENV{CLFFT_DIR} $ENV{CLFFT})
MESSAGE (STATUS "Found clFFT library: ${CLFFT_LIBRARIES}")
MESSAGE (STATUS "Found clFFT include dir: ${CLFFT_INCLUDE_DIRS}")
INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
LINK_DIRECTORIES (${CLFFT_LIBRARIES})
#find clRNG
#SET (clRNG_USE_STATIC_LIBS OFF)
#FIND_PACKAGE(clRng REQUIRED HINTS &ENV{CLRNG_PREFIX} $ENV{CLRNG_DIR} $ENV{CLRNG})
#MESSAGE (STATUS "Found clRNG library: ${CLRNG_LIBRARIES}")
#MESSAGE (STATUS "Found clRNG include dir: ${CLRNG_INCLUDE_DIRS}")
#INCLUDE_DIRECTORIES (${CLFFT_INCLUDE_DIRS})
#LINK_DIRECTORIES (${CLRNG_LIBRARIES})
#find_package(PkgConfig)
#pkg_check_modules(clRng REQUIRED)
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_AMD")
ENDIF (ENABLE_AMD)
#enable UQTK #enable UQTK
OPTION (USE_UQTK "Use UQTK" OFF) OPTION (USE_UQTK "Use UQTK" OFF)
#intel icpc compiler specific flags #intel icpc compiler specific flags
IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL)
@ -217,3 +185,4 @@ INSTALL (
DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}" DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}"
RENAME ${PROJECT_NAME}ConfigVersion.cmake RENAME ${PROJECT_NAME}ConfigVersion.cmake
) )

View File

@ -2,30 +2,18 @@ INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#chi square kernel tests #chi square kernel tests
IF (ENABLE_MUSR) ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES})
TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES})
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF (ENABLE_MUSR)
IF (ENABLE_OPAL)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ADD_EXECUTABLE(testPushKick testPushKick.cpp)
TARGET_LINK_LIBRARIES(testPushKick dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
ENDIF(ENABLE_OPAL)
IF (USE_UQTK)
ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)
#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES})
#test to verify search functions
ADD_EXECUTABLE(testSearch testSearch.cpp)
TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES})

View File

@ -1,161 +0,0 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
typedef struct {
int label;
unsigned localID;
double Rincol[3];
double Pincol[3];
} PART;
PART initPartSmall(int d) {
PART p;
p.label = 0;
p.localID = d;
p.Rincol[0] = 0.0;
p.Rincol[1] = 0.0;
p.Rincol[2] = 0.02;
p.Pincol[0] = 0.0;
p.Pincol[1] = 0.0;
p.Pincol[2] = 3.9920183237269791e-01;
return p;
}
void printPart(PART p) {
std::cout << "label: " << p.label << ", ";
std::cout << "localid: " << p.localID << ",";
std::cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", ";
std::cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2];
std::cout << std::endl;
}
void initParts(PART *p, int N) {
for (int i = 0; i < N; i++)
p[i] = initPartSmall(i);
}
void printParts(PART *p, int N) {
for (int i = 0; i < N; i++)
printPart(p[i]);
std::cout << std::endl;
}
void initParams(double *data) {
data[0] = 0.0;//2.0000000000000000e-02;
data[1] = 1.0;//1.0000000000000000e-02;
data[2] = 2.2100000000000000e+00;
data[3] = 6.0000000000000000e+00;
data[4] = 1.2010700000000000e+01;
data[5] = 2.6010000000000000e+00;
data[6] = 1.7010000000000000e+03;
data[7] = 1.2790000000000000e+03;
data[8] = 1.6379999999999999e-02;
data[9] = 1.9321266968325795e-01;
data[10] = 7.9000000000000000e+01;
data[11] = 1.0000000000000002e-12;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
//init part vector to test mc
PART *parts = new PART[numpart];
initParts(parts, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSOPAL base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
ierr = base.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *part_ptr, *param_ptr;
//allocate memory for particles
part_ptr = base.allocateMemory<PART>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<PART>(part_ptr, parts, numpart);
base.writeData<double>(param_ptr, params, 12);
int numaddback;
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
//read data from device
base.readData<PART>(part_ptr, parts, numpart);
//free memory
base.freeMemory<PART>(part_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
return 0;
}

View File

@ -1,132 +0,0 @@
#include <iostream>
#include <vector>
#include <string>
#include "DKSOPAL.h"
#include <vector_types.h>
#include "cuda_runtime.h"
void initData(double3 *data, int N) {
for (int i = 0; i < N; i++) {
data[i].x = (double)rand() / RAND_MAX;
data[i].y = (double)rand() / RAND_MAX;
data[i].z = (double)rand() / RAND_MAX;
}
}
void initDt(double *data, int N) {
for (int i = 0; i < N; i++) {
data[i] = 0.00001;
}
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == std::string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == std::string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == std::string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
std::cout << "=========================BEGIN TEST=========================" << std::endl;
std::cout << "Use api: " << api_name << "\t" << device_name << std::endl;
std::cout << "Number of particles: " << numpart << std::endl;
std::cout << "Number of loops: " << loop << std::endl;
std::cout << "------------------------------------------------------------" << std::endl;
int ierr;
DKSOPAL dksbase;
dksbase.setAPI(api_name, strlen(api_name));
dksbase.setDevice(device_name, strlen(api_name));
ierr = dksbase.initDevice();
if (ierr != DKS_SUCCESS)
std::cout << "Error with init device!" << std::endl;
double3 *R = new double3[numpart];
double3 *P = new double3[numpart];
double3 *Ef = new double3[numpart];
double3 *Bf = new double3[numpart];
double *dt = new double[numpart];
initData(R, numpart);
initData(P, numpart);
initData(Ef, numpart);
initData(Bf, numpart);
initDt(dt, numpart);
void *r_ptr, *p_ptr, *ef_ptr, *bf_ptr, *dt_ptr;
r_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
p_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
ef_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
bf_ptr = dksbase.allocateMemory<double3>(numpart, ierr);
dt_ptr = dksbase.allocateMemory<double>(numpart, ierr);
dksbase.writeData<double3>(r_ptr, R, numpart);
dksbase.writeData<double3>(p_ptr, P, numpart);
dksbase.writeData<double3>(ef_ptr, Ef, numpart);
dksbase.writeData<double3>(bf_ptr, Bf, numpart);
dksbase.writeData<double>(dt_ptr, dt, numpart);
for (int i = 0; i < loop; ++i)
dksbase.callParallelTTrackerPush(r_ptr, p_ptr, dt_ptr, numpart, 1.0);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std::cout << "============" << std::endl;
dksbase.readData<double3>(r_ptr, R, numpart);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++)
std::cout << R[i].x << "\t" << R[i].y << "\t" << R[i].z << std::endl;
dksbase.freeMemory<double3>(r_ptr, numpart);
dksbase.freeMemory<double3>(p_ptr, numpart);
dksbase.freeMemory<double3>(ef_ptr, numpart);
dksbase.freeMemory<double3>(bf_ptr, numpart);
dksbase.freeMemory<double>(dt_ptr, numpart);
delete[] R;
delete[] P;
delete[] Ef;
delete[] Bf;
delete[] dt;
}

View File

@ -4,4 +4,4 @@ SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
SET(${PROJECT_NAME}_LIBRARY "dks") SET(${PROJECT_NAME}_LIBRARY "dks")
SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH}) SET(CMAKE_SKIP_RPATH ${CMAKE_SKIP_RPATH})
SET(DKS_VERSION ${DKS_VERSION}) SET(DKS_VERSION ${DKS_VERSION})
SET(DKS_VERSION_STR ${DKS_VERSION_STR}) SET(DKS_VERSION_STR ${DKS_VERSION_STR})

View File

@ -6,7 +6,6 @@ SET (_HDRS
ImageReconstruction.h ImageReconstruction.h
CollimatorPhysics.h CollimatorPhysics.h
FFT.h FFT.h
GreensFunction.h
) )
ADD_SOURCES (${_SRCS}) ADD_SOURCES (${_SRCS})

View File

@ -5,7 +5,10 @@
#include <string> #include <string>
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
class DKSBaseMuSR;
class DKSCollimatorPhysics { class DKSCollimatorPhysics {
friend class DKSBaseMuSR;
protected: protected:
@ -16,7 +19,8 @@ public:
virtual ~DKSCollimatorPhysics() { } virtual ~DKSCollimatorPhysics() { }
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0; virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices,
bool enableRutherfordScattering = true) = 0;
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
@ -33,10 +37,6 @@ public:
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0; double dt, double c, bool usedt = false, int streamId = -1) = 0;
virtual int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1) = 0;
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr, void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false, double dt, double c, bool usedt = false,

View File

@ -1,29 +0,0 @@
#ifndef H_GREENSFUNCTION
#define H_GREENSFUNCTION
#include <iostream>
#include <cmath>
class GreensFunction {
public:
virtual ~GreensFunction() { }
/** calc greens integral, as defined in OPAL */
virtual int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, int streamId = -1) = 0;
/** integration if rho2_m, see OPAL for more details */
virtual int integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1) = 0;
/** mirror rho2_m field */
virtual int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1) = 0;
/** multiply two complex fields from device memory */
virtual int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1) = 0;
};
#endif

View File

@ -35,29 +35,13 @@ ENDMACRO ()
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
DKSBase.h DKSBase.h
DKSDefinitions.h DKSDefinitions.h
DKSOPAL.h
) )
SET (DKS_BASEDIR_SRCS SET (DKS_BASEDIR_SRCS
DKSBase.cpp DKSBase.cpp
DKSOPAL.cpp
) )
#add opal to DKS if enable_opal is set IF (USE_CUDA OR USE_OPENCL)
IF (ENABLE_OPAL)
SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS}
DKSOPAL.h
)
SET (DKS_BASEDIR_SRCS
${DKS_BASEDIR_SRCS}
DKSOPAL.cpp
)
ENDIF (ENABLE_OPAL)
#and musrt to DKS if cuda or opencl is used and enable_musr is set
IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS} ${DKS_BASEDIR_HDRS}
DKSBaseMuSR.h DKSBaseMuSR.h
@ -67,10 +51,9 @@ IF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR)
${DKS_BASEDIR_SRCS} ${DKS_BASEDIR_SRCS}
DKSBaseMuSR.cpp DKSBaseMuSR.cpp
) )
ENDIF ( (USE_CUDA OR USE_OPENCL) AND ENABLE_MUSR) ENDIF (USE_CUDA OR USE_OPENCL)
#add image reconstruction to DKS if cuda is used and enable_pet is set IF (USE_CUDA)
IF (USE_CUDA AND ENABLE_PET)
SET (DKS_BASEDIR_HDRS SET (DKS_BASEDIR_HDRS
${DKS_BASEDIR_HDRS} ${DKS_BASEDIR_HDRS}
DKSImageReconstruction.h DKSImageReconstruction.h
@ -80,7 +63,7 @@ IF (USE_CUDA AND ENABLE_PET)
${DKS_BASEDIR_SRCS} ${DKS_BASEDIR_SRCS}
DKSImageReconstruction.cpp DKSImageReconstruction.cpp
) )
ENDIF (USE_CUDA AND ENABLE_PET) ENDIF (USE_CUDA)
ADD_HEADERS (${DKS_BASEDIR_HDRS}) ADD_HEADERS (${DKS_BASEDIR_HDRS})
ADD_SOURCES (${DKS_BASEDIR_SRCS}) ADD_SOURCES (${DKS_BASEDIR_SRCS})

View File

@ -1,27 +1,35 @@
SET (_HDRS CudaBase.cuh) SET (_HDRS
SET (_SRCS CudaBase.cu) CudaBase.cuh
CudaFFT.cuh
CudaGreensFunction.cuh
CudaChiSquare.cuh
CudaCollimatorPhysics.cuh
CudaImageReconstruction.cuh
CudaChiSquareRuntime.cuh
)
SET (_SRCS
CudaBase.cu
CudaFFT.cu
CudaGreensFunction.cu
CudaChiSquare.cu
CudaCollimatorPhysics.cu
CudaImageReconstruction.cu
CudaChiSquareRuntime.cu
)
IF (ENABLE_OPAL) #INCLUDE_DIRECTORIES (
SET (_HDRS ${_HDRS} CudaFFT.cuh CudaGreensFunction.cuh CudaCollimatorPhysics.cuh) # ${CMAKE_CURRENT_SOURCE_DIR}
SET (_SRCS ${_SRCS} CudaFFT.cu CudaGreensFunction.cu CudaCollimatorPhysics.cu) #)
ENDIF (ENABLE_OPAL)
IF (ENABLE_MUSR)
SET (_HDRS ${_HDRS} CudaChiSquareRuntime.cuh)
SET (_SRCS ${_SRCS} CudaChiSquareRuntime.cu)
SET (_KERNELS NVRTCKernels/CudaChiSquareKernel.cu)
ENDIF (ENABLE_MUSR)
IF (ENABLE_PET)
SET (_HDRS ${_HDRS} CudaImageReconstruction.cuh)
SET (_SRCS ${_SRCS} CudaImageReconstruction.cu)
ENDIF (ENABLE_PET)
MESSAGE (STATUS "CUDA headers: ${_HDRS}")
ADD_SOURCES(${_SRCS}) ADD_SOURCES(${_SRCS})
ADD_HEADERS(${_HDRS}) ADD_HEADERS(${_HDRS})
INSTALL(FILES ${_HDRS} DESTINATION include/CUDA) INSTALL(FILES ${_HDRS} DESTINATION include/CUDA)
SET (_KERNELS
NVRTCKernels/CudaChiSquareKernel.cu
)
INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels) INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)

View File

@ -13,13 +13,6 @@ __global__ void initcuRandState(curandState *state, int size, int seed = 0) {
} }
__global__ void kernelCreateRandNumbers(curandState *state, double *data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
data[idx] = curand_uniform_double(&state[idx]);
}
//=====================================// //=====================================//
//==========Private functions==========// //==========Private functions==========//
@ -55,7 +48,7 @@ int CudaBase::cuda_createCurandStates(int size, int seed) {
int threads = 128; int threads = 128;
int blocks = size / threads + 1; int blocks = size / threads + 1;
if (seed == -1) if (seed == -1)
seed = time(NULL); seed = time(NULL);
//std::cout << "sizeof: " << sizeof(curandState) << std::endl; //std::cout << "sizeof: " << sizeof(curandState) << std::endl;
@ -76,15 +69,6 @@ int CudaBase::cuda_deleteCurandStates() {
return DKS_SUCCESS; return DKS_SUCCESS;
} }
int CudaBase::cuda_createRandomNumbers(void *mem_ptr, int size) {
int threads = BLOCK_SIZE;
int blocks = size / threads + 1;
kernelCreateRandNumbers<<<blocks, threads>>>(defaultRndState, (double *)mem_ptr, size);
return DKS_SUCCESS;
}
curandState* CudaBase::cuda_getCurandStates() { curandState* CudaBase::cuda_getCurandStates() {
return defaultRndState; return defaultRndState;
} }

View File

@ -15,8 +15,6 @@
#include <nvToolsExt.h> #include <nvToolsExt.h>
#include <time.h> #include <time.h>
#define BLOCK_SIZE 128
class CudaBase { class CudaBase {
private: private:
@ -41,7 +39,6 @@ public:
* Init cuda random number (cuRand) states. * Init cuda random number (cuRand) states.
* Create an array of type curandState with "size" elements on the GPU * Create an array of type curandState with "size" elements on the GPU
* and create a curandState with different seed for each array entry. * and create a curandState with different seed for each array entry.
* If no seed is given create a seed based on current time.
* Return success or error code * Return success or error code
*/ */
int cuda_createCurandStates(int size, int seed = -1); int cuda_createCurandStates(int size, int seed = -1);
@ -53,11 +50,6 @@ public:
*/ */
int cuda_deleteCurandStates(); int cuda_deleteCurandStates();
/** Create 'size' random numbers on the device and save in mem_ptr array
*
*/
int cuda_createRandomNumbers(void *mem_ptr, int size);
/** Get a pointer to curand states /** Get a pointer to curand states
* *
*/ */
@ -175,40 +167,6 @@ public:
return DKS_SUCCESS; return DKS_SUCCESS;
} }
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemory(T *mem_ptr, size_t size, int offset = 0) {
cudaError cerror;
cerror = cudaMemset(mem_ptr + offset, 0, sizeof(T) * size);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error zeroing cuda memory!\n");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
/** Zero CUDA memory.
* Set all the elements of the array on the device to zero.
*/
template<typename T>
int cuda_zeroMemoryAsync(T *mem_ptr, size_t size, int offset = 0, int streamId = -1) {
int dkserror = DKS_SUCCESS;
cudaError cerror;
if (streamId < cuda_numberOfStreams()) {
cerror = cudaMemsetAsync(mem_ptr + offset, 0, sizeof(T) * size,
cuda_getStream(streamId));
if (cerror != cudaSuccess)
dkserror = DKS_ERROR;
} else
dkserror = DKS_ERROR;
return dkserror;
}
/** /**
* Info: write data to memory * Info: write data to memory
* Retrun: success or error code * Retrun: success or error code

View File

@ -23,6 +23,7 @@
#define X0_M 9 #define X0_M 9
#define I_M 10 #define I_M 10
#define DT_M 11 #define DT_M 11
#define LOWENERGY_THR 12
#define BLOCK_SIZE 128 #define BLOCK_SIZE 128
#define NUMPAR 12 #define NUMPAR 12
@ -33,14 +34,6 @@ __device__ inline double dot(double3 &d1, double3 &d2) {
} }
__device__ inline double3 cross(double3 &lhs, double3 &rhs) {
double3 tmp;
tmp.x = lhs.y * rhs.z - lhs.z * rhs.y;
tmp.y = lhs.z * rhs.x - lhs.x * rhs.z;
tmp.z = lhs.x * rhs.y - lhs.y * rhs.x;
return tmp;
}
__device__ inline bool checkHit(double &z, double *par) { __device__ inline bool checkHit(double &z, double *par) {
/* check if particle is in the degrader material */ /* check if particle is in the degrader material */
@ -89,7 +82,7 @@ __device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state,
Eng = Eng + delta_E / 1E3; Eng = Eng + delta_E / 1E3;
} }
pdead = ((Eng<1E-4) || (dEdx>0)); pdead = ( (Eng < par[LOWENERGY_THR]) || (dEdx > 0) );
} }
@ -125,7 +118,9 @@ __device__ inline void Rot(double &px, double &pz, double &x, double &z, double
pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
} }
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) { __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state,
double* par, bool enableRutherfordScattering)
{
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P; double gamma = (Eng + M_P) / M_P;
@ -153,7 +148,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par); Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P2 < 0.0047) { if( (P2 < 0.0047) && enableRutherfordScattering) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -179,7 +174,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par); Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P2 < 0.0047) { if( (P2 < 0.0047) && enableRutherfordScattering) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
@ -193,7 +188,7 @@ __device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, d
template <typename T> template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
int numparticles) int numparticles, bool enableRutherfordScattering)
{ {
//get global id and thread id //get global id and thread id
@ -235,7 +230,7 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p); coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data[idx].Pincol = P; data[idx].Pincol = P;
} else { } else {
@ -258,7 +253,8 @@ __global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state
} }
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
curandState *state, int numparticles) curandState *state, int numparticles,
bool enableRutherfordScattering)
{ {
//get global id and thread id //get global id and thread id
@ -296,7 +292,7 @@ __global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
P.x = P.x * ptot / sq; P.x = P.x * ptot / sq;
P.y = P.y * ptot / sq; P.y = P.y * ptot / sq;
P.z = P.z * ptot / sq; P.z = P.z * ptot / sq;
coulombScat(R[tid], P, s, p); coulombScat(R[tid], P, s, p, enableRutherfordScattering);
data.Pincol[idx] = P; data.Pincol[idx] = P;
} else { } else {
@ -431,7 +427,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {
} }
__global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, double c) { __global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {
//get global id and thread id //get global id and thread id
volatile int tid = threadIdx.x; volatile int tid = threadIdx.x;
@ -457,51 +453,7 @@ __global__ void kernelPush(double3 *gR, double3 *gP, double *gdt, int npart, dou
} }
} }
__global__ void kernelKick(double3 *gR, double3 *gP, double3 *gEf, //TODO: kernel for push with switch off unitless positions with dt[i]*c
double3 *gBf, double *gdt, double charge,
double mass, int npart, double c)
{
volatile int tid = threadIdx.x;
volatile int idx = blockIdx.x * blockDim.x + tid;
if (idx < npart) {
double3 R = gR[idx];
double3 P = gP[idx];
double3 Ef = gEf[idx];
double3 Bf = gBf[idx];
double dt = gdt[idx];
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
double gamma = sqrt(1.0 + dot(P, P));
double3 t, w, s;
t.x = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.x;
t.y = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.y;
t.z = 0.5 * dt * charge * c * c / (gamma * mass) * Bf.z;
double3 crossPt = cross(P, t);
w.x = P.x + crossPt.x;
w.y = P.y + crossPt.y;
w.z = P.z + crossPt.z;
s.x = 2.0 / (1.0 + dot(t, t)) * t.x;
s.y = 2.0 / (1.0 + dot(t, t)) * t.y;
s.z = 2.0 / (1.0 + dot(t, t)) * t.z;
double3 crossws = cross(w, s);
P.x += crossws.x;
P.y += crossws.y;
P.z += crossws.z;
P.x += 0.5 * dt * charge * c / mass * Ef.x;
P.y += 0.5 * dt * charge * c / mass * Ef.y;
P.z += 0.5 * dt * charge * c / mass * Ef.z;
gP[idx] = P;
}
}
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) { __device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {
@ -663,7 +615,8 @@ struct less_then
} }
}; };
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{ {
int threads = BLOCK_SIZE; int threads = BLOCK_SIZE;
@ -676,7 +629,8 @@ int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int n
kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr, kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
(double*)par_ptr, (double*)par_ptr,
m_base->cuda_getCurandStates(), m_base->cuda_getCurandStates(),
numparticles); numparticles,
enableRutherfordScattering);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) if (err != cudaSuccess)
@ -723,12 +677,12 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
} }
} else { } else {
if (streamId == -1) { if (streamId == -1) {
kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, kernelPush<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, npart,
(double*)dt_ptr, npart, c); (double*)dt_ptr, c);
} else { } else {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, kernelPush<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr, npart,
(double*)dt_ptr, npart, c); (double*)dt_ptr, c);
} }
} }
@ -736,29 +690,6 @@ int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int np
return DKS_SUCCESS; return DKS_SUCCESS;
} }
int CudaCollimatorPhysics::ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart,
double c, int streamId)
{
int threads = BLOCK_SIZE;
int blocks = npart / threads + 1;
//call kernel
if (streamId == -1) {
kernelKick<<<blocks, threads>>>((double3*)r_ptr, (double3*)p_ptr, (double3*)ef_ptr,
(double3*)bf_ptr, (double*)dt_ptr, charge, mass, npart, c);
} else {
cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelKick<<<blocks, threads, 0, cs >>>((double3*)r_ptr, (double3*)p_ptr,
(double3*)ef_ptr, (double3*)bf_ptr,
(double*)dt_ptr, charge, mass, npart, c);
}
return DKS_SUCCESS;
}
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr, void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, int npart, int nsec,

View File

@ -110,7 +110,7 @@ public:
* *
*/ */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numpartices); int numpartices, bool enableRutherfordScattering = true);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,
@ -141,10 +141,6 @@ public:
int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1); double dt, double c, bool usedt = false, int streamId = -1);
int ParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId = -1);
/** BorisPusher push function with transformto function form OPAL /** BorisPusher push function with transformto function form OPAL
* ParallelTTracker integration from OPAL implemented in cuda. * ParallelTTracker integration from OPAL implemented in cuda.
* For more details see ParallelTTracler docomentation in opal * For more details see ParallelTTracler docomentation in opal

View File

@ -189,11 +189,12 @@ __global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho; rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
} }
} }
@ -272,20 +273,28 @@ __global__ void mirroredRhoField(double *rho2_m,
id7 = rk * NI * NJ + rj * NI + i; id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri; id8 = rk * NI * NJ + rj * NI + ri;
double data = rho2_m[id1]; double data = rho2_m[id1];
if (i != 0) rho2_m[id2] = data; if (i != 0)
rho2_m[id2] = data;
if (j != 0) rho2_m[id3] = data; if (j != 0)
rho2_m[id3] = data;
if (i != 0 && j != 0) rho2_m[id4] = data; if (i != 0 && j != 0)
rho2_m[id4] = data;
if (k != 0) rho2_m[id5] = data; if (k != 0)
rho2_m[id5] = data;
if (k != 0 && i != 0) rho2_m[id6] = data; if (k != 0 && i != 0)
rho2_m[id6] = data;
if (k!= 0 && j != 0) rho2_m[id7] = data; if (k!= 0 && j != 0)
rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0) rho2_m[id8] = data; if (k != 0 && j != 0 & i != 0)
rho2_m[id8] = data;
} }
@ -354,9 +363,9 @@ CudaGreensFunction::~CudaGreensFunction() {
delete m_base; delete m_base;
} }
int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, double hr_m0, double hr_m1, double hr_m2,
int streamId) int streamId)
{ {
int thread = 128; int thread = 128;
@ -364,7 +373,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
//if no stream specified use default stream //if no stream specified use default stream
if (streamId == -1) { if (streamId == -1) {
kernelTmpgreen_2<<< block, thread >>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K); kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -372,7 +381,7 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpgreen, hr_m0, hr_m1, hr_m2, I, J, K); kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -380,17 +389,15 @@ int CudaGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int
} }
int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
int I, int J, int K, int I, int J, int K,
int streamId) int streamId)
{ {
int thread = 128; int thread = 128;
int block = (I * J * K / thread) + 1; int block = (I * J * K / thread) + 1;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
if (streamId == -1) { if (streamId == -1) {
m_base->cuda_zeroMemory( (double*)rho2_m, sizerho, 0 );
kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
@ -399,7 +406,6 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
m_base->cuda_zeroMemoryAsync( (double*)rho2_m, sizerho, 0, streamId);
kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
2*(I - 1), 2*(J - 1), I, J, K); 2*(I - 1), 2*(J - 1), I, J, K);
return DKS_SUCCESS; return DKS_SUCCESS;
@ -409,22 +415,22 @@ int CudaGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen,
return DKS_ERROR; return DKS_ERROR;
} }
int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) { int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
int thread = 128; int thread = 128;
int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1; int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1;
if (streamId == -1) { if (streamId == -1) {
mirroredRhoField0<<< 1, 1>>>( (double *)rho2_m, 2*I, 2*J); mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J);
mirroredRhoField<<< block, thread >>>( (double *) rho2_m, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1); mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
if (streamId < m_base->cuda_numberOfStreams()) { if (streamId < m_base->cuda_numberOfStreams()) {
cudaStream_t cs = m_base->cuda_getStream(streamId); cudaStream_t cs = m_base->cuda_getStream(streamId);
mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)rho2_m, 2*I, 2*J); mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J);
mirroredRhoField<<< block, thread, 0, cs>>>( (double *) rho2_m, 2*I, 2*J, 2*K, I+1, J+1, K+1); mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -434,13 +440,13 @@ int CudaGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int st
return DKS_ERROR; return DKS_ERROR;
} }
int CudaGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
int size, int streamId) { int size, int streamId) {
int threads = 128; int threads = 128;
int blocks = size / threads + 1; int blocks = size / threads + 1;
int datasize = 2 * threads * sizeof(cuDoubleComplex); int datasize = 2 * threads * sizeof(cuDoubleComplex);
if (streamId == -1) { if (streamId == -1) {
multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1, multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
(cuDoubleComplex*)ptr2, (cuDoubleComplex*)ptr2,

View File

@ -2,17 +2,17 @@
#define H_CUDA_GREENSFUNCTION #define H_CUDA_GREENSFUNCTION
#include <iostream> #include <iostream>
#include <cmath> #include <math.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuComplex.h> #include <cuComplex.h>
#include "cublas_v2.h" #include "cublas_v2.h"
#include "../Algorithms/GreensFunction.h"
#include "CudaBase.cuh" #include "CudaBase.cuh"
class CudaGreensFunction : public GreensFunction{ class CudaGreensFunction {
private: private:
@ -30,32 +30,32 @@ public:
/* destructor */ /* destructor */
~CudaGreensFunction(); ~CudaGreensFunction();
/** /*
Info: calc itegral on device memory (taken from OPAL src code) Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code Return: success or error code
*/ */
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2, double hr_m0, double hr_m1, double hr_m2,
int streamId = -1); int streamId = -1);
/** /*
Info: integration of rho2_m field (taken from OPAL src code) Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code Return: success or error code
*/ */
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1); int streamId = -1);
/** /*
Info: mirror rho field (taken from OPAL src code) Info: mirror rho field (taken from OPAL src code)
Return: succes or error code Return: succes or error code
*/ */
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1); int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/** /*
Info: multiply complex fields already on the GPU memory, result will be put in ptr1 Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code Return: success or error code
*/ */
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
}; };

View File

@ -103,14 +103,25 @@ DKSBase::DKSBase() {
#ifdef DKS_CUDA #ifdef DKS_CUDA
cbase = new CudaBase(); cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
oclbase = new OpenCLBase(); oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
micbase = new MICBase(); micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif #endif
} }
@ -127,14 +138,25 @@ DKSBase::DKSBase(const char* api_name, const char* device_name) {
#ifdef DKS_CUDA #ifdef DKS_CUDA
cbase = new CudaBase(); cbase = new CudaBase();
cfft = new CudaFFT(cbase);
cgreens = new CudaGreensFunction(cbase);
cchi = new CudaChiSquare(cbase);
ccol = new CudaCollimatorPhysics(cbase);
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
oclbase = new OpenCLBase(); oclbase = new OpenCLBase();
oclfft = new OpenCLFFT(oclbase);
oclchi = new OpenCLChiSquare(oclbase);
oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
micbase = new MICBase(); micbase = new MICBase();
micfft = new MICFFT(micbase);
miccol = new MICCollimatorPhysics(micbase);
micgreens = new MICGreensFunction(micbase);
micchi = new MICChiSquare(micbase);
#endif #endif
} }
@ -151,16 +173,27 @@ DKSBase::~DKSBase() {
if (m_function_name != NULL) if (m_function_name != NULL)
delete[] m_function_name; delete[] m_function_name;
#ifdef DKS_CUDA #ifdef DKS_CUDA
delete cfft;
delete cgreens;
delete cchi;
delete ccol;
delete cbase; delete cbase;
#endif #endif
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
delete oclfft;
delete oclchi;
delete oclcol;
delete oclbase; delete oclbase;
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
delete micfft;
delete miccol;
delete micgreens;
delete micchi;
delete micbase; delete micbase;
#endif #endif
@ -274,45 +307,38 @@ int DKSBase::getDeviceList(std::vector<int> &devices) {
return DKS_ERROR; return DKS_ERROR;
} }
int DKSBase::setupDevice() { /*
init device
int ierr = DKS_ERROR; */
int DKSBase::initDevice() {
//if api is not set default is OpenCL //if api is not set default is OpenCL
if (!m_api_set) { if (!m_api_set) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else { } else {
if (apiOpenCL()) { if (apiOpenCL()) {
if (!m_device_set) { if (!m_device_set) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
} else { } else {
setAPI(API_OPENCL, 6); setAPI(API_OPENCL, 6);
ierr = OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) ); return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
} }
} else if (apiCuda()) { } else if (apiCuda()) {
setDevice("-gpu", 4); setDevice("-gpu", 4);
setAPI(API_CUDA, 4); setAPI(API_CUDA, 4);
ierr = CUDA_SAFECALL(DKS_SUCCESS); return CUDA_SAFECALL(DKS_SUCCESS);
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
setDevice("-mic", 4); setDevice("-mic", 4);
setAPI(API_OPENMP, 6); setAPI(API_OPENMP, 6);
ierr = MIC_SAFECALL(DKS_SUCCESS); return MIC_SAFECALL(DKS_SUCCESS);
} }
} }
return ierr; return DKS_ERROR;
}
/*
init device
*/
int DKSBase::initDevice() {
return setupDevice();
} }
/* /*
@ -430,15 +456,358 @@ int DKSBase::syncDevice() {
return DKS_ERROR; return DKS_ERROR;
} }
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSBase::setupFFT(int ndim, int N[3]) {
int DKSBase::callCreateRandomNumbers(void *mem_ptr, int size) { if (apiCuda()) {
if (apiCuda()) return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );
return CUDA_SAFECALL(cbase->cuda_createRandomNumbers(mem_ptr, size)); } else if (apiOpenMP()) {
if (apiOpenCL()) //micbase.mic_setupFFT(ndim, N);
return OPENCL_SAFECALL(oclbase->ocl_createRandomNumbers(mem_ptr, size)); //BENI: setting up RC and CR transformations on MIC
int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR; return DKS_ERROR;
} }
//BENI:
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
else if (apiOpenMP())
return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));
return DKS_ERROR;
}
//BENI:
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return CUDA_SAFECALL(cfft->setupFFT(ndim, N));
else if (apiOpenMP())
return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
//load kernel and execute
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
else
return DKS_ERROR;
} else if (apiCuda()) {
return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );
} else if (apiOpenMP()) {
return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );
else if (apiOpenMP())
return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );
DEBUG_MSG("No implementation for selected platform");
return DKS_SUCCESS;
}
/* normalize complex to real iFFT */
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {
if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
if (apiCuda()) {
return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId) );
} else if (apiOpenMP()) {
//BENI:
return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
if (apiCuda())
return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
fTimeResolution, fRebin,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
if (apiCuda()) {
return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar,
result));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
fTimeResolution, fRebin, fGoodBinOffset,
sensors, length, numpar, result));
else
return DKS_ERROR;
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
} else if (apiOpenCL()) {
if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS)
return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
else
return DKS_ERROR;
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering)
{
if (apiCuda())
return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles,
enableRutherfordScattering) );
else if (apiOpenMP())
return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
if (apiOpenMP()) {
return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles) );
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
if (apiOpenMP()) {
return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callInitRandoms(int size, int seed) { int DKSBase::callInitRandoms(int size, int seed) {
if (apiCuda()) if (apiCuda())
@ -452,3 +821,43 @@ int DKSBase::callInitRandoms(int size, int seed) {
return DKS_ERROR; return DKS_ERROR;
} }
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
if (apiCuda())
return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
usedt, streamId));
else if (apiOpenMP())
return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
c, usedt, streamId));
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
if (apiCuda()) {
return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
} else if (apiOpenMP()) {
return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt,
c, usedt, streamId));
}
DEBUG_MSG("No implementation for selceted platform");
return DKS_ERROR;
}

View File

@ -29,17 +29,31 @@
#endif #endif
#include "OpenCL/OpenCLBase.h" #include "OpenCL/OpenCLBase.h"
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLChiSquare.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh" #include "CUDA/CudaBase.cuh"
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaChiSquare.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#include "nvToolsExt.h" #include "nvToolsExt.h"
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
#include "MIC/MICBase.h" #include "MIC/MICBase.h"
#include "MIC/MICChiSquare.h"
#include "MIC/MICFFT.h"
#include "MIC/MICCollimatorPhysics.h"
#include "MIC/MICGreensFunction.hpp"
#endif #endif
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#include "AutoTuning/DKSConfig.h" #include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */ /** DKSBase class for handling function calls to DKS library */
@ -59,14 +73,25 @@ private:
#ifdef DKS_OPENCL #ifdef DKS_OPENCL
OpenCLBase *oclbase; OpenCLBase *oclbase;
OpenCLFFT *oclfft;
OpenCLChiSquare *oclchi;
OpenCLCollimatorPhysics *oclcol;
#endif #endif
#ifdef DKS_CUDA #ifdef DKS_CUDA
CudaBase *cbase; CudaBase *cbase;
CudaFFT *cfft;
CudaGreensFunction *cgreens;
CudaChiSquare *cchi;
CudaCollimatorPhysics *ccol;
#endif #endif
#ifdef DKS_MIC #ifdef DKS_MIC
MICBase *micbase; MICBase *micbase;
MICFFT *micfft;
MICCollimatorPhysics *miccol;
MICGreensFunction *micgreens;
MICChiSquare *micchi;
#endif #endif
protected: protected:
@ -114,12 +139,6 @@ protected:
} }
#endif #endif
#ifdef DKS_MIC
MICBase *getMICBase() {
return micbase;
}
#endif
/** Call OpenCL base to load specified kenrel file. /** Call OpenCL base to load specified kenrel file.
* *
*/ */
@ -135,7 +154,6 @@ protected:
return device_name; return device_name;
} }
public: public:
/** /**
@ -155,11 +173,6 @@ public:
*/ */
~DKSBase(); ~DKSBase();
/** Function to initialize objects based on the device used.
*
*/
int setupDevice();
/** Turn on auto tuning */ /** Turn on auto tuning */
void setAutoTuningOn() { m_auto_tuning = true; } void setAutoTuningOn() { m_auto_tuning = true; }
@ -392,7 +405,7 @@ public:
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
#ifdef DKS_MIC #ifdef DKS_MIC
void * mem_ptr = NULL; void * mem_ptr = NULL;
mem_ptr = micbase->mic_allocateMemory<T>(elements); mem_ptr = micbase.mic_allocateMemory<T>(elements);
return mem_ptr; return mem_ptr;
#endif #endif
} }
@ -485,7 +498,7 @@ public:
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset)); return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset)); return MIC_SAFECALL(micbase.mic_writeData<T>(mem_ptr, data, elements, offset));
} }
@ -519,7 +532,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset)); return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset)); return MIC_SAFECALL(micbase.mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
} }
return DKS_ERROR; return DKS_ERROR;
@ -819,7 +832,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset)); return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset)); return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
} }
return DKS_ERROR; return DKS_ERROR;
@ -847,7 +860,7 @@ public:
size_t size = sizeof(T)*elements; size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset)); return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
} else if (apiOpenMP()) { } else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements, return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
streamId, offset)); streamId, offset));
} }
@ -867,23 +880,221 @@ public:
else if (apiCuda()) else if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr)); return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
else if (apiOpenMP()) else if (apiOpenMP())
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements)); return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
return DKS_ERROR; return DKS_ERROR;
} }
/**
* Create random numbers on the device and fille mem_data array ///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/ */
int callCreateRandomNumbers(void *mem_ptr, int size); int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Transpose 2D and 3D arrays, OpenCL implementation
* N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
*/
int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Chi square for parameter fitting on device.
* mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
* intermediate results. Chi square results are put in &results
*/
int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar, double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* max-log-likelihood for parameter fitting on device.
* mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
* mem_par - pointer to parameter set, mem_results - pointer for
* intermediate results. Chi square results are put in &results.
* TODO: opencl and mic implementations.
*/
int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffser,
int sensors, int length, int numpar,
double &result);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering = true);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
/** /**
* Init random number states and save for reuse on device. * Init random number states and save for reuse on device.
* If seed is -1, a random seed based on current time is taken.
* TODO: opencl and mic implementations. * TODO: opencl and mic implementations.
*/ */
int callInitRandoms(int size, int seed = -1); int callInitRandoms(int size, int seed = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
/** /**
* Print memory information on device (total, used, available) * Print memory information on device (total, used, available)
* TODO: opencl and mic imlementation * TODO: opencl and mic imlementation

View File

@ -62,12 +62,6 @@
#define OPENCL_SAFEINIT(x) ( NULL ) #define OPENCL_SAFEINIT(x) ( NULL )
#endif #endif
#ifdef DKS_AMD
#define OPENCL_SAFEINIT_AMD(x) ( x )
#else
#define OPENCL_SAFEINIT_AMD(x) ( NULL )
#endif
#ifdef DKS_MIC #ifdef DKS_MIC
#define MIC_SAFEINIT(x) ( x ) #define MIC_SAFEINIT(x) ( x )
#else #else

View File

@ -1,298 +0,0 @@
#include "DKSOPAL.h"
DKSOPAL::DKSOPAL() {
dksfft = nullptr;
dkscol = nullptr;
dksgreens = nullptr;
}
DKSOPAL::DKSOPAL(const char* api_name, const char* device_name) {
setAPI(api_name, strlen(api_name));
setDevice(device_name, strlen(device_name));
}
DKSOPAL::~DKSOPAL() {
delete dksfft;
delete dkscol;
delete dksgreens;
}
int DKSOPAL::setupOPAL() {
int ierr = DKS_ERROR;
if (apiOpenCL()) {
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
//TODO: only enable if AMD libraries are available
dksfft = OPENCL_SAFEINIT_AMD( new OpenCLFFT(getOpenCLBase()) );
dkscol = OPENCL_SAFEINIT_AMD( new OpenCLCollimatorPhysics(getOpenCLBase()) );
dksgreens = OPENCL_SAFEINIT_AMD( new OpenCLGreensFunction(getOpenCLBase()) );
} else if (apiCuda()) {
ierr = CUDA_SAFECALL( DKS_SUCCESS );
dksfft = CUDA_SAFEINIT( new CudaFFT(getCudaBase()) );
dkscol = CUDA_SAFEINIT( new CudaCollimatorPhysics(getCudaBase()) );
dksgreens = CUDA_SAFEINIT( new CudaGreensFunction(getCudaBase()) );
} else if (apiOpenMP()) {
ierr = MIC_SAFECALL( DKS_SUCCESS );
dksfft = MIC_SAFEINIT( new MICFFT(getMICBase()) );
dkscol = MIC_SAFEINIT( new MICCollimatorPhysics(getMICBase()) );
dksgreens = MIC_SAFEINIT( new MICGreensFunction(getMICBase()) );
} else {
ierr = DKS_ERROR;
}
return ierr;
}
int DKSOPAL::initDevice() {
int ierr = setupDevice();
if (ierr == DKS_SUCCESS)
ierr = setupOPAL();
return ierr;
}
/* setup fft plans to reuse if multiple ffts of same size are needed */
int DKSOPAL::setupFFT(int ndim, int N[3]) {
if (apiCuda()) {
return dksfft->setupFFT(ndim, N);
} else if (apiOpenCL()) {
int ierr1 = dksfft->setupFFT(ndim, N);
int ierr2 = dksfft->setupFFTRC(ndim, N);
int ierr3 = dksfft->setupFFTCR(ndim, N);
if (ierr1 != DKS_SUCCESS || ierr2 != DKS_SUCCESS || ierr3 != DKS_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
} else if (apiOpenMP()) {
//micbase.mic_setupFFT(ndim, N);
//BENI: setting up RC and CR transformations on MIC
int ierr1 = dksfft->setupFFTRC(ndim, N, 1.);
int ierr2 = dksfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2]));
if (ierr1 != DKS_SUCCESS)
return ierr1;
if (ierr2 != DKS_SUCCESS)
return ierr2;
return DKS_SUCCESS;
}
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTRC(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTRC(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTRC(ndim, N, scale);
return DKS_ERROR;
}
//BENI:
int DKSOPAL::setupFFTCR(int ndim, int N[3], double scale) {
if (apiCuda())
return dksfft->setupFFT(ndim, N);
if (apiOpenCL())
return dksfft->setupFFTCR(ndim, N);
else if (apiOpenMP())
return dksfft->setupFFTCR(ndim, N, scale);
return DKS_ERROR;
}
/* call OpenCL FFT function for selected platform */
int DKSOPAL::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call OpenCL IFFT function for selected platform */
int DKSOPAL::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL() || apiOpenMP())
return dksfft->executeIFFT(data_ptr, ndim, dimsize);
else if (apiCuda())
return dksfft->executeIFFT(data_ptr, ndim, dimsize, streamId);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call normalize FFT function for selected platform */
int DKSOPAL::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {
if (apiOpenCL()) {
if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS )
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
else
return DKS_ERROR;
} else if (apiCuda()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize, streamId);
} else if (apiOpenMP()) {
return dksfft->normalizeFFT(data_ptr, ndim, dimsize);
}
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call real to complex FFT */
int DKSOPAL::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* call complex to real FFT */
int DKSOPAL::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId);
else if (apiOpenCL() || apiOpenMP())
return dksfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize);
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
/* normalize complex to real iFFT */
int DKSOPAL::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {
if (apiCuda())
return dksfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId);
else if (apiOpenCL())
return DKS_ERROR;
else if (apiOpenMP())
return DKS_ERROR;
DEBUG_MSG("No implementation for selected platform");
return DKS_ERROR;
}
int DKSOPAL::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId) {
return dksgreens->greensIntegral(tmp_ptr, I, J, K, NI, NJ,
hz_m0, hz_m1, hz_m2, streamId);
}
int DKSOPAL::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId) {
return dksgreens->integrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId);
}
int DKSOPAL::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
return dksgreens->mirrorRhoField(mem_ptr, I, J, K, streamId);
}
int DKSOPAL::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {
return dksgreens->multiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId);
}
int DKSOPAL::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysics(mem_ptr, par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles)
{
return dkscol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles);
}
int DKSOPAL::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
return dkscol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback);
}
int DKSOPAL::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback)
{
return MIC_SAFECALL(dkscol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
rx_ptr, ry_ptr, rz_ptr,
px_ptr, py_ptr, pz_ptr,
par_ptr, numparticles, numaddback));
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, usedt, streamId);
}
int DKSOPAL::callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId) {
return dkscol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, 0, c, true, streamId);
}
int DKSOPAL::callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge, double mass,
int npart, double c, int streamId)
{
return dkscol->ParallelTTrackerKick(r_ptr, p_ptr, ef_ptr, bf_ptr, dt_ptr,
charge, mass, npart, c, streamId);
}
int DKSOPAL::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
return dkscol->ParallelTTrackerPushTransform(x_ptr, p_ptr, lastSec_ptr, orient_ptr,
npart, nsec, dt_ptr, dt, c, usedt, streamId);
}

View File

@ -1,233 +0,0 @@
#ifndef H_DKS_OPAL
#define H_DKS_OPAL
#include <iostream>
#include "AutoTuning/DKSAutoTuning.h"
#include "DKSBase.h"
#include "DKSDefinitions.h"
#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLGreensFunction.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#endif
#ifdef DKS_MIC
#include "MIC/MICFFT.h"
#include "MIC/MICGreensFunction.hpp"
#include "MIC/MICCollimatorPhysics.h"
#endif
class DKSOPAL : public DKSBase {
private:
DKSFFT *dksfft;
DKSCollimatorPhysics *dkscol;
GreensFunction *dksgreens;
int setupOPAL();
public:
DKSOPAL();
DKSOPAL(const char* api_name, const char* device_name);
~DKSOPAL();
int initDevice();
///////////////////////////////////////////////
///////Function library part of dksbase////////
///////////////////////////////////////////////
/**
* Setup FFT function.
* Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls.
* If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
* each fft will do its own setup according to fft size and dimensions.
* TODO: opencl and mic implementations
*/
int setupFFT(int ndim, int N[3]);
//BENI:
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/**
* Call complex-to-complex fft.
* Executes in place complex to compelx fft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation
*/
int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex-to-complex ifft.
* Executes in place complex to compelx ifft on the device on data pointed by data_ptr.
* stream id can be specified to use other streams than default.
* TODO: mic implementation.
*/
int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize complex to complex ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size
* TODO: mic implementation.
*/
int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call real to complex FFT.
* Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations
*/
int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Call complex to real iFFT.
* Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points
* to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size
* should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast
* (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
* TODO: opencl and mic implementations.
*/
int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Normalize compelx to real ifft.
* Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
* fft size.
* TODO: opencl and mic implementations.
*/
int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
int I, int J, int K, int streamId = -1);
/**
* Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
* For specifics check OPAL docs.
* TODO: opencl and mic implementations.
*/
int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/**
* Element by element multiplication.
* Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
* the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
* TODO: opencl and mic implementations.
*/
int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, int numparams,
int &numaddback, int &numdead);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* Test function for the MIC to test SoA layout vs AoS layout used in previous versions
*/
int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
/**
* Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
* For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
* TODO: opencl and mic implementations.
*/
int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
void *dt_ptr, double dt, double c,
bool usedt = false, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr, void *orient_ptr,
int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerPush(void *r_ptr, void *p_ptr, void *dt_ptr,
int npart, double c, int streamId = -1);
/**
* Integration code from ParallelTTracker from OPAL.
* For specifics check OPAL docs and CudaCollimatorPhysics class docs
*/
int callParallelTTrackerKick(void *r_ptr, void *p_ptr, void *ef_ptr,
void *bf_ptr, void *dt_ptr, double charge,
double mass, int npart, double c, int streamId = -1);
};
#endif

View File

@ -1,24 +1,19 @@
SET (_SRCS MICBase.cpp) SET (_SRCS
SET (_HDRS MICBase.h) MICBase.cpp
MICChiSquare.cpp
MICFFT.cpp
MICGreensFunction.cpp
MICCollimatorPhysics.cpp
)
IF (ENABLE_OPAL) SET (_HDRS
SET (_SRCS MICBase.h
${_SRCS} MICChiSquare.h
MICChiSquare.cpp MICFFT.h
MICFFT.cpp MICCollimatorPhysics.h
MICGreensFunction.cpp MICGreensFunction.hpp
MICCollimatorPhysics.cpp MICMergeSort.h
) )
SET (_HDRS
${_HDRS}
MICChiSquare.h
MICFFT.h
MICCollimatorPhysics.h
MICGreensFunction.hpp
MICMergeSort.h
)
ENDIF (ENABLE_OPAL)
#INCLUDE_DIRECTORIES ( #INCLUDE_DIRECTORIES (
# ${CMAKE_CURRENT_SOURCE_DIR} # ${CMAKE_CURRENT_SOURCE_DIR}

View File

@ -18,28 +18,30 @@ int MICBase::mic_createRandStreams(int size) {
int seed = time(NULL); int seed = time(NULL);
int numThreads = 0; #pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
#pragma offload target(mic:m_device_id) inout(numThreads)
{ {
//get the number of threads
int numThreads;
#pragma omp parallel #pragma omp parallel
numThreads = omp_get_num_threads(); numThreads = omp_get_num_threads();
}
defaultRndStream = mic_allocateMemory<VSLStreamStatePtr>(numThreads); //if default rnd stream already allocated delete the array
VSLStreamStatePtr *tmpRndStream = (VSLStreamStatePtr*) defaultRndStream; if (defaultRndSet == 1)
maxThreads = numThreads; delete[] defaultRndStream;
#pragma offload target(mic:m_device_id) \ //allocate defaultRndStream array
in(tmpRndStream:length(0) DKS_REUSE DKS_RETAIN) \ defaultRndStream = new VSLStreamStatePtr[numThreads];
in(seed)
{
//create stream states for each thread //create stream states for each thread
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < omp_get_num_threads(); i++) for (int i = 0; i < omp_get_num_threads(); i++)
vslNewStream(&tmpRndStream[i], VSL_BRNG_MT2203, seed + i); vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);
}
defaultRndSet = 1; defaultRndSet = 1;
}
return DKS_SUCCESS; return DKS_SUCCESS;
} }
@ -47,8 +49,15 @@ int MICBase::mic_createRandStreams(int size) {
//delete default rand streams //delete default rand streams
int MICBase::mic_deleteRandStreams() { int MICBase::mic_deleteRandStreams() {
//mic_freeMemory<VSLStreamStatePtr>(defaultRndStream, 236); #pragma offload target(mic:m_device_id) inout(defaultRndSet)
return DKS_SUCCESS; {
if (defaultRndSet == 1) {
delete[] defaultRndStream;
defaultRndSet = -1;
}
}
return DKS_ERROR;
} }
//create a new signal for the mic //create a new signal for the mic

View File

@ -30,19 +30,14 @@ class MICBase {
private: private:
std::vector<int> micStreams; std::vector<int> micStreams;
int maxThreads;
protected: protected:
int defaultRndSet; int defaultRndSet;
public: public:
VSLStreamStatePtr *defaultRndStream;
//#pragma offload_attribute(push,target(mic))
void *defaultRndStream; //VSLSStreamStatePtr
void *testPtr;
//#pragma offload_attribute(pop)
int m_device_id; int m_device_id;
/* constructor */ /* constructor */
@ -207,6 +202,7 @@ public:
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE) #pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE)
{ {
} }
return DKS_SUCCESS; return DKS_SUCCESS;
} }

View File

@ -292,7 +292,7 @@ void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream)
const double deltas = par[DT_M] * beta * C; const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M]; const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) { if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073; const double Ts = (Eng * 1E6) / 1.0073;
@ -338,7 +338,7 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
const double deltas = par[DT_M] * beta * C; const double deltas = par[DT_M] * beta * C;
const double deltasrho = deltas * 100 * par[RHO_M]; const double deltasrho = deltas * 100 * par[RHO_M];
const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5);
if ( (Eng > 0.00001) && (Eng < 0.0006) ) { if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
const double Ts = (Eng * 1E6) / 1.0073; const double Ts = (Eng * 1E6) / 1.0073;
@ -368,23 +368,23 @@ void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
} }
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) { int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles, boll enableRutherfordScattering)
{
//cast device memory pointers to appropriate types //cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
double *par = (double*) par_ptr; double *par = (double*) par_ptr;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target(mic:m_micbase->m_device_id) \ #pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \ inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles) in(numparticles)
{ {
#pragma omp parallel #pragma omp parallel
{ {
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()]; VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x //for loop trough particles if not checkhit set label to -2 and update R.x
@ -461,8 +461,6 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
int padding = numparticles % MIC_WIDTH; int padding = numparticles % MIC_WIDTH;
int totalpart = numparticles + padding; int totalpart = numparticles + padding;
VSLStreamStatePtr *streamArr = (VSLStreamStatePtr*) m_micbase->defaultRndStream;
#pragma offload target (mic:0) \ #pragma offload target (mic:0) \
in(label:length(0) DKS_REUSE DKS_RETAIN) \ in(label:length(0) DKS_REUSE DKS_RETAIN) \
in(localID:length(0) DKS_REUSE DKS_RETAIN) \ in(localID:length(0) DKS_REUSE DKS_RETAIN) \
@ -473,16 +471,14 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
in(py:length(0) DKS_REUSE DKS_RETAIN) \ in(py:length(0) DKS_REUSE DKS_RETAIN) \
in(pz:length(0) DKS_REUSE DKS_RETAIN) \ in(pz:length(0) DKS_REUSE DKS_RETAIN) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \ in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(streamArr:length(0) DKS_RETAIN DKS_REUSE) \
in(totalpart) in(totalpart)
{ {
#pragma omp parallel #pragma omp parallel
{ {
//every thread gets its own rnd stream state //every thread gets its own rnd stream state
//VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()]; VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
VSLStreamStatePtr stream = streamArr[omp_get_thread_num()];
#pragma omp for nowait #pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -518,11 +514,9 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
double Eng = (sq - 1) * M_P; double Eng = (sq - 1) * M_P;
double dEdx = 0; double dEdx = 0;
if (label[i] == 0) { if (label[i] == 0) {
energyLoss(Eng, dEdx, par, randv, i - ii); energyLoss(Eng, dEdx, par, randv, i - ii);
} }
if (Eng > 1e-4 && dEdx < 0) { if (Eng > 1e-4 && dEdx < 0) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
@ -534,12 +528,11 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
if (Eng < 1e-4 || dEdx > 0) if (Eng < 1e-4 || dEdx > 0)
label[i] = -1; label[i] = -1;
} //end inner energy loss loop } //end inner energy loss loop
} //end outer energy loss loop
} //end outer energy loss loop
//vectorize coulomb scattering as much as possible //vectorize coulomb scattering as much as possible
#pragma omp for nowait #pragma omp for nowait
for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
@ -549,7 +542,7 @@ int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_pt
} //end omp parallel } //end omp parallel
} //end offload } //end offload
return DKS_SUCCESS; return DKS_SUCCESS;
} }

View File

@ -26,7 +26,7 @@ typedef struct {
} MIC_PART_SMALL; } MIC_PART_SMALL;
class MICCollimatorPhysics : public DKSCollimatorPhysics { class MICCollimatorPhysics : DKSAlogorithms{
private: private:
@ -38,9 +38,10 @@ public:
m_micbase = base; m_micbase = base;
}; };
~MICCollimatorPhysics() { }; ~MICCollimatorPhysics() { };
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles); int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering = true);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,

View File

@ -6,16 +6,13 @@
MICFFT::MICFFT(MICBase *base) { MICFFT::MICFFT(MICBase *base) {
m_micbase = base; m_micbase = base;
m_fftsetup = false;
} }
MICFFT::~MICFFT() { MICFFT::~MICFFT() {
if (m_fftsetup) {
#pragma offload target(mic:0) #pragma offload target(mic:0)
{ {
DftiFreeDescriptor(&FFTHandle_m); DftiFreeDescriptor(&FFTHandle_m);
DftiFreeDescriptor(&handle); DftiFreeDescriptor(&handle);
}
} }
} }
@ -38,7 +35,7 @@ int MICFFT::setupFFT(int ndim, int N[3]) {
} }
m_fftsetup = true;
return DKS_SUCCESS; return DKS_SUCCESS;
} }
//BENI: //BENI:
@ -125,8 +122,8 @@ int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool for
} }
//execute iFFT //execute iFFT
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) { int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
return executeFFT(mem_ptr, ndim, N, -1, false); return mic_executeFFT(mem_ptr, ndim, N, -1, false);
} }
//execute REAL->COMPLEX FFT //execute REAL->COMPLEX FFT

View File

@ -7,14 +7,13 @@
#include <offload.h> #include <offload.h>
#include <mkl_dfti.h> #include <mkl_dfti.h>
#include "../Algorithms/FFT.h" #include "../Algorithm/DKSFFT.h"
#include "MICBase.h" #include "MICBase.h"
class MICFFT : public DKSFFT { class MICFFT : public DKSFFT {
private: private:
bool m_fftsetup;
MICBase *m_micbase; MICBase *m_micbase;
/// Internal FFT object for performing serial FFTs. /// Internal FFT object for performing serial FFTs.
@ -75,18 +74,6 @@ public:
/* normalize IFFT on MIC */ /* normalize IFFT on MIC */
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
/**
* Info: destroy default FFT plans
* Return: success or error code
*/
int destroyFFT() { return DKS_SUCCESS; }
/*
Info: execute normalize for complex to real iFFT
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) { return DKS_SUCCESS; }
}; };
#endif #endif

View File

@ -55,11 +55,11 @@ MICGreensFunction::~MICGreensFunction() {
} }
*/ */
int MICGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ, int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
double hr_m0, double hr_m1, double hr_m2, int streamId) double hr_m1, double hr_m2)
{ {
double *tmp_ptr = (double*) tmpgreen; double *tmp_ptr = (double*) tmp_ptr_;
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
{ {
std::memset(tmp_ptr,0,I*J*K); std::memset(tmp_ptr,0,I*J*K);
@ -173,14 +173,12 @@ return 0;
*/ */
//CUDA similar version: //CUDA similar version:
int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen, int I, int J, int K, int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
int streamId) double *tmpgreen = (double*) tmp_ptr_;
{ double *mem_ptr = (double*) mem_ptr_;
double *tmpgreen_ptr = (double*) tmpgreen;
double *mem_ptr = (double*) rho2_m;
// the actual integration // the actual integration
#pragma offload target(mic:0) in(tmpgreen_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) #pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{ {
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
std::memset(mem_ptr,0,II*JJ*KK); std::memset(mem_ptr,0,II*JJ*KK);
@ -199,27 +197,27 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0; tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp) if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp) if (i+1 < NI_tmp)
tmp1 = tmpgreen_ptr[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp) if (j+1 < NJ_tmp)
tmp2 = tmpgreen_ptr[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp) if (k+1 < NK_tmp)
tmp3 = tmpgreen_ptr[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp) if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen_ptr[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp) if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen_ptr[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp) if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen_ptr[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen_ptr[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
@ -236,8 +234,8 @@ int MICGreensFunction::integrationGreensFunction(void * rho2_m, void *tmpgreen,
int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId) { int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {
double *mem_ptr = (double*) rho2_m; double *mem_ptr = (double*) mem_ptr_;
#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) #pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{ {
@ -283,11 +281,11 @@ int MICGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int str
} }
/*multiply complex fields*/ /*multiply complex fields*/
int MICGreensFunction::multiplyCompelxFields(void * ptr1, void * ptr2, int size) { int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {
// double *mem_ptr1 = (double*) mem_ptr1_; // double *mem_ptr1 = (double*) mem_ptr1_;
// double *mem_ptr2 = (double*) mem_ptr2_; // double *mem_ptr2 = (double*) mem_ptr2_;
_Complex double *mem_ptr1 = (_Complex double *) ptr1; _Complex double *mem_ptr1 = (_Complex double *) mem_ptr1_;
_Complex double *mem_ptr2 = (_Complex double *) ptr2; _Complex double *mem_ptr2 = (_Complex double *) mem_ptr2_;
#pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size) #pragma offload target(mic:0) in(mem_ptr1:length(0) DKS_RETAIN DKS_REUSE) in (mem_ptr2:length(0) DKS_RETAIN DKS_REUSE) in(size)
{ {

View File

@ -9,13 +9,12 @@
#include <offload.h> #include <offload.h>
#include <mkl_dfti.h> #include <mkl_dfti.h>
#include "../Algorithms/GreensFunction.h"
#include "MICBase.h" #include "MICBase.h"
#define DKS_SUCCESS 0 #define DKS_SUCCESS 0
#define DKS_ERROR 1 #define DKS_ERROR 1
class MICGreensFunction : public GreensFunction { class MICGreensFunction {
private: private:
MICBase *m_micbase; MICBase *m_micbase;
@ -29,18 +28,16 @@ public:
~MICGreensFunction(); ~MICGreensFunction();
/* compute greens integral analytically */ /* compute greens integral analytically */
int greensIntegral(void * tmpgreen_, int I, int J, int K, int NI, int NJ, int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
double hr_m0, double hr_m1, double hr_m2, int streamId = -1);
/* perform the actual integration */ /* perform the actual integration */
int integrationGreensFunction(void * rho2_m, void * tmpgreen,int I,int J, int K, int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
int stremaId = -1);
/* Mirror rho-Field */ /* Mirror rho-Field */
int mirrorRhoField(void * rho2_m, int I, int J, int K, int streamId = -1); int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
/*multiply complex fields*/ /*multiply complex fields*/
int multiplyCompelxFields(void * ptr1, void * ptr2, int size, int streamId = -1); int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
}; };

View File

@ -1,39 +1,31 @@
#dont include FFT, GreensFunction and CollimatorPhysics if clFFT and clRNG not found SET (_SRCS
OpenCLBase.cpp
SET (_HDRS OpenCLBase.h) OpenCLFFT.cpp
SET (_SRCS OpenCLBase.cpp) OpenCLChiSquare.cpp
SET (_KERNELS "") OpenCLCollimatorPhysics.cpp
OpenCLChiSquareRuntime.cpp
IF (ENABLE_MUSR) )
SET (_HDRS ${_HDRS} OpenCLChiSquareRuntime.h)
SET (_SRCS ${_SRCS} OpenCLChiSquareRuntime.cpp) SET (_HDRS
SET (_KERNELS OpenCLKernels/OpenCLChiSquareRuntime.cl) OpenCLBase.h
ENDIF (ENABLE_MUSR) OpenCLFFT.h
OpenCLChiSquare.h
IF (ENABLE_AMD AND ENABLE_OPAL) OpenCLCollimatorPhysics.h
SET (_SRCS OpenCLChiSquareRuntime.h
${_SRCS} )
OpenCLFFT.cpp
OpenCLCollimatorPhysics.cpp #INCLUDE_DIRECTORIES (
OpenCLGreensFunction.cpp # ${CMAKE_CURRENT_SOURCE_DIR}
) #)
SET (_HDRS SET (_KERNELS
${_HDRS} OpenCLKernels/OpenCLChiSquare.cl
OpenCLFFT.h OpenCLKernels/OpenCLFFT.cl
OpenCLCollimatorPhysics.h OpenCLKernels/OpenCLFFTStockham.cl
OpenCLGreensFunction.h OpenCLKernels/OpenCLTranspose.cl
) OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLChiSquareRuntime.cl
SET (_KERNELS
${_KERNELS}
OpenCLKernels/OpenCLFFT.cl
OpenCLKernels/OpenCLFFTStockham.cl
OpenCLKernels/OpenCLTranspose.cl
OpenCLKernels/OpenCLCollimatorPhysics.cl
OpenCLKernels/OpenCLGreensFunction.cl
) )
ENDIF (ENABLE_AMD AND ENABLE_OPAL)
ADD_SOURCES (${_SRCS}) ADD_SOURCES (${_SRCS})
ADD_HEADERS (${_HDRS}) ADD_HEADERS (${_HDRS})

View File

@ -7,13 +7,21 @@ cl_device_id OpenCLBase::m_device_id = NULL;
cl_event OpenCLBase::m_last_event = NULL; cl_event OpenCLBase::m_last_event = NULL;
OpenCLBase::OpenCLBase() { OpenCLBase::OpenCLBase() {
//m_context = NULL;
//m_command_queue = NULL;
m_program = NULL; m_program = NULL;
m_kernel = NULL; m_kernel = NULL;
//m_device_id = NULL;
//m_platform_id = NULL;
m_kernel_file = NULL; m_kernel_file = NULL;
m_last_event = NULL; m_last_event = NULL;
//m_events = new cl_event[500];
//m_num_events = 0;
defaultRndSet = 0; defaultRndSet = 0;
} }
OpenCLBase::~OpenCLBase() { OpenCLBase::~OpenCLBase() {
@ -33,11 +41,11 @@ int OpenCLBase::ocl_createRndStates(int size) {
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl"); strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
ocl_loadKernel(kernel_file); ocl_loadKernel(kernel_file);
delete[] kernel_file; delete[] kernel_file;
//allocate memory for rand states //allocate memory for rand states
int ierr; int ierr;
defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr); defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr);
//exec kernel //exec kernel
int seed = 0; int seed = 0;
ocl_createKernel("initRand"); ocl_createKernel("initRand");
@ -47,34 +55,13 @@ int OpenCLBase::ocl_createRndStates(int size) {
size_t work_items = size; size_t work_items = size;
size_t work_group_size = 1; size_t work_group_size = 1;
ocl_executeKernel(1, &work_items, &work_group_size); ocl_executeKernel(1, &work_items, &work_group_size);
defaultRndSet = 1; defaultRndSet = 1;
return DKS_SUCCESS;
}
int OpenCLBase::ocl_createRandomNumbers(void *mem_ptr, int size) { return OCL_SUCCESS;
//load kernel
char * kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl");
ocl_loadKernel(kernel_file);
delete[] kernel_file;
//set kernel variables
cl_mem tmp_data = (cl_mem) mem_ptr;
ocl_createKernel("createRandoms");
ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState);
ocl_setKernelArg(1, sizeof(cl_mem), &tmp_data);
ocl_setKernelArg(2, sizeof(int), &size);
size_t work_size = 128;
size_t work_items = (size % work_size + 1) * work_size;
ocl_executeKernel(1, &work_items, &work_size);
return DKS_SUCCESS;
} }
/* destroy rnd states */ /* destroy rnd states */
@ -83,7 +70,7 @@ int OpenCLBase::ocl_deleteRndStates() {
ocl_freeMemory(defaultRndState); ocl_freeMemory(defaultRndState);
defaultRndSet = 0; defaultRndSet = 0;
return DKS_SUCCESS; return OCL_SUCCESS;
} }
@ -441,8 +428,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
int ierr; int ierr;
//create program from kernel //create program from kernel
m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr);
NULL, &ierr);
if (ierr != CL_SUCCESS) { if (ierr != CL_SUCCESS) {
DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr); DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr);
return DKS_ERROR; return DKS_ERROR;
@ -452,7 +438,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL); ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL);
/* /*
check if compiling kernel source succeded, if failed return error code check if compileng kernel source succeded, if failed return error code
if in debug mode get compilation info and print program build log witch if in debug mode get compilation info and print program build log witch
will give indication what made the compilation fail will give indication what made the compilation fail
*/ */
@ -461,8 +447,7 @@ int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts)
//get build status //get build status
cl_build_status status; cl_build_status status;
clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
sizeof(cl_build_status), &status, NULL);
//get log size //get log size
size_t log_size; size_t log_size;
@ -628,12 +613,12 @@ int OpenCLBase::ocl_loadKernel(const char * kernel_file) {
} }
} }
if (ierr != DKS_SUCCESS) { if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Failed to build kernel file " << kernel_file); DEBUG_MSG("Failed to build kernel file " << kernel_file);
return DKS_ERROR; return OCL_ERROR;
} }
return DKS_SUCCESS; return OCL_SUCCESS;
} }
//compile kernel form source code provided //compile kernel form source code provided

View File

@ -30,10 +30,13 @@
#include <CL/cl_ext.h> #include <CL/cl_ext.h>
#endif #endif
#include "../DKSDefinitions.h" #include "../DKSDefinitions.h"
/* struct for random number state */ /* struct for random number state */
typedef struct { typedef struct {
double s10; double s10;
double s11; double s11;
double s12; double s12;
@ -42,12 +45,16 @@ typedef struct {
double s22; double s22;
double z; double z;
bool gen; bool gen;
} RNDState; } RNDState;
class OpenCLBase { class OpenCLBase {
private: private:
static cl_context m_context;
static cl_command_queue m_command_queue;
static cl_platform_id m_platform_id; static cl_platform_id m_platform_id;
static cl_device_id m_device_id; static cl_device_id m_device_id;
@ -111,9 +118,6 @@ protected:
public: public:
static cl_context m_context;
static cl_command_queue m_command_queue;
/* /*
constructor constructor
@ -131,11 +135,6 @@ public:
*/ */
int ocl_createRndStates(int size); int ocl_createRndStates(int size);
/* Create an array of random numbers on the device
*
*/
int ocl_createRandomNumbers(void *mem_ptr, int size);
/* /*
Destroy rnd states Destroy rnd states
Return: success or error code Return: success or error code
@ -196,7 +195,7 @@ public:
Return: return pointer to memory Return: return pointer to memory
*/ */
cl_mem ocl_allocateMemory(size_t size, int &ierr); cl_mem ocl_allocateMemory(size_t size, int &ierr);
/* /*
Name: allocateMemory Name: allocateMemory
Info: allocate memory on device Info: allocate memory on device
@ -204,20 +203,6 @@ public:
*/ */
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/** Zero OpenCL memory buffer
* Set all the elemetns in the device array to zero
*/
template <typename T>
int ocl_fillMemory(cl_mem mem_ptr, size_t size, T value, int offset = 0) {
cl_int ierr;
ierr = clEnqueueFillBuffer(m_command_queue, mem_ptr, &value, sizeof(T), offset,
sizeof(T)*size, 0, nullptr, nullptr);
if (ierr != CL_SUCCESS)
return DKS_ERROR;
return DKS_SUCCESS;
}
/* /*
Name: writeData Name: writeData
Info: write data to device memory (needs ptr to mem object) Info: write data to device memory (needs ptr to mem object)

View File

@ -34,7 +34,7 @@ TODO:
2. boost.compute sort for user defined structure crashes 2. boost.compute sort for user defined structure crashes
*/ */
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles) int numparticles, bool enableRutherfordScattering)
{ {
/* /*
//set number of total threads, and number threads per block //set number of total threads, and number threads per block

View File

@ -52,7 +52,8 @@ public:
} }
/* execute degrader code on device */ /* execute degrader code on device */
int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles); int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles,
bool enableRutherfordScattering = true);
int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr, void *rx_ptr, void *ry_ptr, void *rz_ptr,

View File

@ -31,6 +31,7 @@ int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool f
if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS) if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
return OCL_ERROR; return OCL_ERROR;
//execute kernel //execute kernel
for (int step = 1; step < N; step <<= 1) { for (int step = 1; step < N; step <<= 1) {
@ -88,78 +89,26 @@ int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N)
call fft execution on device for every dimension call fft execution on device for every dimension
*/ */
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {
int ierr;
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem inout = (cl_mem)data; cl_mem inout = (cl_mem)data;
int n = N[0];
if (forward) for (int dim = 0; dim < ndim; dim++) {
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue, ierr = ocl_callBitReverseKernel(inout, dim, ndim, n);
0, NULL, NULL, &inout, NULL, NULL); if (ierr != OCL_SUCCESS) {
else DEBUG_MSG("Error executing bit reverse");
ierr = clfftEnqueueTransform(planHandleZ2Z, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue, return OCL_ERROR;
0, NULL, NULL, &inout, NULL, NULL); }
if (ierr != OCL_SUCCESS) { ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward);
dkserr = DKS_ERROR; if (ierr != OCL_SUCCESS) {
DEBUG_MSG("Error executing cfFFT\n"); DEBUG_MSG("Error executing fft reverse");
if (ierr == CLFFT_INVALID_PLAN) return OCL_ERROR;
std::cout << "Invlalid plan" << std::endl; }
else
std::cout << "CLFFT error" << std::endl;
} }
return dkserr; return OCL_SUCCESS;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeRCFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleD2Z, CLFFT_FORWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &real_in, &comp_out, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
}
/*
call rcfft execution on device for every dimension
*/
int OpenCLFFT::executeCRFFT(void *real_ptr, void *comp_ptr, int ndim, int N[3], int streamId) {
int dkserr = DKS_SUCCESS;
cl_int ierr;
cl_mem real_in = (cl_mem)real_ptr;
cl_mem comp_out = (cl_mem)comp_ptr;
ierr = clfftEnqueueTransform(planHandleZ2D, CLFFT_BACKWARD, 1, &m_oclbase->m_command_queue,
0, NULL, NULL, &comp_out, &real_in, NULL);
if (ierr != OCL_SUCCESS) {
dkserr = DKS_ERROR;
DEBUG_MSG("Error executing cfFFT\n");
if (ierr == CLFFT_INVALID_PLAN)
std::cout << "Invlalid plan" << std::endl;
else
std::cout << "CLFFT error" << std::endl;
}
return dkserr;
} }
/* /*
@ -171,11 +120,10 @@ int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
} }
/* /*
call kernel to normalize fft. clFFT inverse already includes the scaling so this is disabled. call kernel to normalize fft
*/ */
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
/*
cl_mem inout = (cl_mem)data; cl_mem inout = (cl_mem)data;
int n = N[0]; int n = N[0];
@ -202,175 +150,132 @@ int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {
DEBUG_MSG("Error executing kernel"); DEBUG_MSG("Error executing kernel");
return OCL_ERROR; return OCL_ERROR;
} }
*/
return OCL_SUCCESS; return OCL_SUCCESS;
} }
int OpenCLFFT::setupFFT(int ndim, int N[3]) { int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
int ierr;
int size = sizeof(cl_double2)*pow(N,ndim);
cl_mem mem_tmp;
cl_mem mem_src = (cl_mem)src;
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
cl_int err; //set the number of work items in each dimension
size_t work_items[3];
int p = 1;
int threads = N / 2;
int f = (forward) ? -1 : 1;
//execute kernel
int n = (int)log2(N);
for (int i = 0; i < ndim; i++) {
clfftDim dim; int dim = i+1;
if (ndim == 1) p = 1;
dim = CLFFT_1D; work_items[0] = (dim == 1) ? N/2 : N;
else if (ndim == 2) work_items[1] = (dim == 2) ? N/2 : N;
dim = CLFFT_2D; work_items[2] = (dim == 3) ? N/2 : N;
else
dim = CLFFT_3D; //transpose array if calculating dimension larger than 1
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]}; //if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
//create kernel and set kernel arguments
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
return OCL_ERROR;
for (int t = 1; t <= log2(N); t++) {
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
return OCL_ERROR;
/* Create 3D fft plan*/ mem_tmp = mem_src;
err = clfftCreateDefaultPlan(&planHandleZ2Z, m_oclbase->m_context, dim, clLength); mem_src = mem_dst;
mem_dst = mem_tmp;
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2Z, CLFFT_DOUBLE); p = 2*p;
if (err != CL_SUCCESS)
std::cout << "Error setting precision" << std::endl;
err = clfftSetLayout(planHandleZ2Z, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
if (err != CL_SUCCESS)
std::cout << "Error setting layout" << std::endl;
err = clfftSetResultLocation(planHandleZ2Z, CLFFT_INPLACE);
if (err != CL_SUCCESS)
std::cout << "Error setting result location" << std::endl;
/* Bake the plan */
err = clfftBakePlan(planHandleZ2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-complex plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTRC(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
size_t clOutStride[3] = {1, half, half * N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleD2Z, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleD2Z, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleD2Z, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
err = clfftSetResultLocation(planHandleD2Z, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleD2Z, dim, clInStride);
err = clfftSetPlanOutStride(planHandleD2Z, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleD2Z, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Real-to-complex plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::setupFFTCR(int ndim, int N[3], double scale) {
cl_int err;
clfftDim dim;
if (ndim == 1)
dim = CLFFT_1D;
else if (ndim == 2)
dim = CLFFT_2D;
else
dim = CLFFT_3D;
size_t clLength[3] = {(size_t)N[0], (size_t)N[1], (size_t)N[2]};
size_t half = (size_t)N[0] / 2 + 1;
size_t clInStride[3] = {1, half, half * N[1]};
size_t clOutStride[3] = {1, (size_t)N[0], (size_t)N[0]*N[1]};
/* Create 3D fft plan*/
err = clfftCreateDefaultPlan(&planHandleZ2D, m_oclbase->m_context, dim, clLength);
/* Set plan parameters */
err = clfftSetPlanPrecision(planHandleZ2D, CLFFT_DOUBLE);
err = clfftSetLayout(planHandleZ2D, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL);
err = clfftSetResultLocation(planHandleZ2D, CLFFT_OUTOFPLACE);
err = clfftSetPlanInStride(planHandleZ2D, dim, clInStride);
err = clfftSetPlanOutStride(planHandleZ2D, dim, clOutStride);
/* Bake the plan */
err = clfftBakePlan(planHandleZ2D, 1, &m_oclbase->m_command_queue, NULL, NULL);
if (err != CL_SUCCESS) {
DEBUG_MSG("Error creating Complex-to-real plan");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int OpenCLFFT::destroyFFT() {
clfftDestroyPlan(&planHandleZ2Z);
clfftDestroyPlan(&planHandleD2Z);
clfftDestroyPlan(&planHandleZ2D);
clfftTeardown();
return DKS_SUCCESS;
}
void OpenCLFFT::printError(clfftStatus err) {
if (err != CL_SUCCESS) {
std::cout << "Error creating default plan " << err << std::endl;
switch(err) {
case CLFFT_BUGCHECK:
std::cout << "bugcheck" << std::endl;
break;
case CLFFT_NOTIMPLEMENTED:
std::cout << "not implemented" << std::endl;
break;
case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
std::cout << "transposed not implemented" << std::endl;
break;
case CLFFT_FILE_NOT_FOUND:
std::cout << "file not found" << std::endl;
break;
case CLFFT_FILE_CREATE_FAILURE:
std::cout << "file create failure" << std::endl;
break;
case CLFFT_VERSION_MISMATCH:
std::cout << "version missmatch" << std::endl;
break;
case CLFFT_INVALID_PLAN:
std::cout << "invalid plan" << std::endl;
break;
case CLFFT_DEVICE_NO_DOUBLE:
std::cout << "no double" << std::endl;
break;
case CLFFT_DEVICE_MISMATCH:
std::cout << "device missmatch" << std::endl;
break;
case CLFFT_ENDSTATUS:
std::cout << "end status" << std::endl;
break;
default:
std::cout << "other: " << err << std::endl;
break;
} }
//transpose array back if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
}
if (ndim*n % 2 == 1) {
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
} }
m_oclbase->ocl_freeMemory(mem_dst);
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
cl_mem mem_src = (cl_mem)src;
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
m_oclbase->ocl_createKernel("fft_batch3D");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
for (int dim = 1; dim < ndim+1; dim++) {
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
cl_mem mem_src = (cl_mem)src;
if (ndim == 1)
return OCL_SUCCESS;
size_t work_items[3];
work_items[0] = N[0];
work_items[1] = N[1];
work_items[2] = 1;
size_t work_group_size[3];
work_group_size[0] = N[0];
work_group_size[1] = N[1];
work_group_size[2] = 1;
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
m_oclbase->ocl_createKernel("transpose");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
return OCL_SUCCESS;
} }
/* /*

View File

@ -20,19 +20,12 @@
#include "../Algorithms/FFT.h" #include "../Algorithms/FFT.h"
#include "OpenCLBase.h" #include "OpenCLBase.h"
#include "clFFT.h"
class OpenCLFFT : public DKSFFT { class OpenCLFFT : public DKSFFT {
private: private:
OpenCLBase *m_oclbase; OpenCLBase *m_oclbase;
clfftSetupData fftSetup;
clfftPlanHandle planHandleZ2Z;
clfftPlanHandle planHandleD2Z;
clfftPlanHandle planHandleZ2D;
/* /*
Info: call fft kernels to execute FFT of the given domain, Info: call fft kernels to execute FFT of the given domain,
data - devevice memory ptr, cdim - current dim to transform, data - devevice memory ptr, cdim - current dim to transform,
@ -49,31 +42,15 @@ private:
*/ */
int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N); int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
/** Get clfftStatus and print the corresponding error message.
* clfftStatus is returned from all clFFT library functions, print error displays the
* corresponding error message. If "other" is printed then error code corresponds to
* OpenCL error code and not specifically to clFFT library, then OpenCL error codes should
* be checked to determine the reason for the error.
*/
void printError(clfftStatus err);
public: public:
/* constructor - currently does nothing*/ /* constructor - currently does nothing*/
OpenCLFFT(OpenCLBase *base) { OpenCLFFT(OpenCLBase *base) {
m_oclbase = base; m_oclbase = base;
/* Set up fft */
cl_int err;
err = clfftInitSetupData(&fftSetup);
err = clfftSetup(&fftSetup);
if (err != CL_SUCCESS)
DEBUG_MSG("Error seting up clFFT");
} }
/* destructor - currently does nothing*/ /* destructor - currently does nothing*/
~OpenCLFFT() { destroyFFT(); } ~OpenCLFFT() { }
/* /*
Info: execute forward fft function with data set on device Info: execute forward fft function with data set on device
@ -100,23 +77,35 @@ public:
Info: set FFT size Info: set FFT size
Return: success or error code Return: success or error code
*/ */
int setupFFT(int ndim, int N[3]); int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
int setupFFTRC(int ndim, int N[3], double scale = 1.0); int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
int setupFFTCR(int ndim, int N[3], double scale = 1.0); int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
int destroyFFT(); int destroyFFT() { return DKS_SUCCESS; }
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1); int streamId = -1)
{
return DKS_ERROR;
}
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
int streamId = -1); int streamId = -1)
{
return DKS_ERROR;
}
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
{ {
return DKS_ERROR; return DKS_ERROR;
} }
int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);
int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);
int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);
//void printData3DN4(cl_double2* &data, int N); //void printData3DN4(cl_double2* &data, int N);
}; };

View File

@ -1,181 +0,0 @@
#include "OpenCLGreensFunction.h"
#define GREENS_KERNEL "OpenCL/OpenCLKernels/OpenCLGreensFunction.cl"
OpenCLGreensFunction::OpenCLGreensFunction(OpenCLBase *base) {
m_base = base;
base_create = false;
}
OpenCLGreensFunction::OpenCLGreensFunction() {
m_base = new OpenCLBase();
base_create = true;
}
OpenCLGreensFunction::~OpenCLGreensFunction() {
if (base_create)
delete m_base;
}
int OpenCLGreensFunction::buildProgram() {
char *kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, GREENS_KERNEL);
return m_base->ocl_loadKernel(kernel_file);
}
int OpenCLGreensFunction::greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelTmpgreen");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(1, sizeof(double), &hr_m0);
m_base->ocl_setKernelArg(2, sizeof(double), &hr_m1);
m_base->ocl_setKernelArg(3, sizeof(double), &hr_m2);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J,
int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
cl_mem tmpgreen_ptr = (cl_mem)tmpgreen;
int NI = 2*(I - 1);
int NJ = 2*(J - 1);
//set the work item size
size_t work_size = 128;
size_t work_items = I * J * K;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelIntegration");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &tmpgreen_ptr);
m_base->ocl_setKernelArg(2, sizeof(int), &NI);
m_base->ocl_setKernelArg(3, sizeof(int), &NJ);
m_base->ocl_setKernelArg(4, sizeof(int), &I);
m_base->ocl_setKernelArg(5, sizeof(int), &J);
m_base->ocl_setKernelArg(6, sizeof(int), &K);
//execute kernel
double zero = 0.0;
int sizerho = 2*(I - 1) * 2*(J - 1) * 2*(K - 1);
m_base->ocl_fillMemory(rho2_ptr, sizerho, zero, 0);
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem rho2_ptr = (cl_mem)rho2_m;
int NI = I + 1;
int NJ = J + 1;
int NK = K + 1;
int I2 = 2*I;
int J2 = 2*J;
int K2 = 2*K;
int rhosize = ( (I - 1) * 2 ) * ( (J - 1) * 2 ) * ( (K - 1) * 2 );
//set the work item size
size_t work_size = 128;
size_t work_items = NI * NJ * NK;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("kernelMirroredRhoField");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &rho2_ptr);
m_base->ocl_setKernelArg(1, sizeof(int), &I2);
m_base->ocl_setKernelArg(2, sizeof(int), &J2);
m_base->ocl_setKernelArg(3, sizeof(int), &K2);
m_base->ocl_setKernelArg(4, sizeof(int), &NI);
m_base->ocl_setKernelArg(5, sizeof(int), &NJ);
m_base->ocl_setKernelArg(6, sizeof(int), &NK);
m_base->ocl_setKernelArg(7, sizeof(int), &rhosize);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}
int OpenCLGreensFunction::multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId)
{
int ierr = DKS_SUCCESS;
//compile opencl program from source
buildProgram();
//cast the input data ptr to cl_mem
cl_mem mem_ptr1 = (cl_mem) ptr1;
cl_mem mem_ptr2 = (cl_mem) ptr2;
//set the work item size
size_t work_size = 128;
size_t work_items = size;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
//create kernel
ierr = m_base->ocl_createKernel("multiplyComplexFields");
//set kernel parameters
m_base->ocl_setKernelArg(0, sizeof(cl_mem), &mem_ptr1);
m_base->ocl_setKernelArg(1, sizeof(cl_mem), &mem_ptr2);
m_base->ocl_setKernelArg(2, sizeof(int), &size);
//execute kernel
ierr = m_base->ocl_executeKernel(1, &work_items, &work_size);
return ierr;
}

View File

@ -1,63 +0,0 @@
#ifndef H_OPENCL_GREENSFUNCTION
#define H_OPENCL_GREENSFUNCTION
#include <iostream>
#include <cmath>
#include "../Algorithms/GreensFunction.h"
#include "OpenCLBase.h"
class OpenCLGreensFunction : public GreensFunction {
private:
bool base_create;
OpenCLBase *m_base;
public:
/** Constructor with OpenCLBase argument */
OpenCLGreensFunction(OpenCLBase *base);
/** Default constructor */
OpenCLGreensFunction();
/** Destructor */
~OpenCLGreensFunction();
/** Load OpenCL kernel file containing greens function kernels.
* m_base takes the kernel file and compiles the OpenCL programm.
*/
int buildProgram();
/**
Info: calc itegral on device memory (taken from OPAL src code)
Return: success or error code
*/
int greensIntegral(void *tmpgreen, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
/**
Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code
*/
int integrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/**
Info: mirror rho field (taken from OPAL src code)
Return: succes or error code
*/
int mirrorRhoField(void *rho2_m, int I, int J, int K, int streamId = -1);
/**
Info: multiply complex fields already on the GPU memory, result will be put in ptr1
Return: success or error code
*/
int multiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
};
#endif

View File

@ -1,4 +1,6 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION
/******Random numbers********/ /******Random numbers********/
@ -87,14 +89,13 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
if (id < N) { if (id < N) {
RNDState tmp; RNDState tmp;
int tmp_seed = 2*id;// * 0x100000000ULL; int tmp_seed = id;// * 0x100000000ULL;
tmp.s10 = 12345 + tmp_seed; tmp.s10 = 12345 + tmp_seed;
tmp.s11 = 12345 + tmp_seed; tmp.s11 = 12345 + tmp_seed;
tmp.s12 = 12345 + tmp_seed; tmp.s12 = 123 + tmp_seed;
tmp.s20 = 12345 + tmp_seed; tmp.s20 = 12345 + tmp_seed;
tmp.s21 = 12345 + tmp_seed; tmp.s21 = 12345 + tmp_seed;
tmp.s22 = 12345 + tmp_seed; tmp.s22 = 123 + tmp_seed;
tmp.z = 0; tmp.z = 0;
tmp.gen = true; tmp.gen = true;
@ -104,19 +105,6 @@ __kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
} }
/* create random numbers and fill an array */
__kernel void createRandoms(__global RNDState *states, __global double *data, int size) {
int idx = get_global_id(0);
if (idx < size) {
RNDState s = states[idx];
data[idx] = rand_uniform(&s);
states[idx] = s;
}
}
/**********Degrader**********/ /**********Degrader**********/
enum PARAMS { POSITION, enum PARAMS { POSITION,

View File

@ -1,170 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/** compute the greens integral analytically */
__kernel void kernelTmpgreen(__global double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
int NI, int NJ, int NK)
{
int tid = get_local_size(0);
int id = get_global_id(0);
if (id < NI * NJ * NK) {
int i = id % NI;
int k = id / (NI * NJ);
int j = (id - k * NI * NJ) / NI;
double cellVolume = hr_m0 * hr_m1 * hr_m2;
double vv0 = i * hr_m0 - hr_m0 / 2;
double vv1 = j * hr_m1 - hr_m1 / 2;
double vv2 = k * hr_m2 - hr_m2 / 2;
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
tmpgrn = tmpgrn / 2;
tmpgrn += vv1 * vv2 * log(vv0 + r);
tmpgrn += vv0 * vv2 * log(vv1 + r);
tmpgrn += vv0 * vv1 * log(vv2 + r);
tmpgreen[id] = tmpgrn / cellVolume;
}
}
/** perform the actual integration */
__kernel void kernelIntegration(__global double *rho2_m, __global double *tmpgreen,
int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp)
{
int tid = get_local_id(0);
int id = get_global_id(0);
int ni = NI;
int nj = NJ;
double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0;
tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0;
if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp)
tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp)
tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (k+1 < NK_tmp)
tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && j+1 < NJ_tmp)
tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
if (i+1 < NI_tmp && k+1 < NK_tmp)
tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
if (j+1 < NJ_tmp && k+1 < NK_tmp)
tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
}
}
/** miror rho-field */
__kernel void kernelMirroredRhoField0(__global double *rho2_m, int NI, int NJ) {
rho2_m[0] = rho2_m[NI*NJ];
}
__kernel void kernelMirroredRhoField(__global double *rho2_m,
int NI, int NJ, int NK,
int NI_tmp, int NJ_tmp, int NK_tmp,
int size)
{
int tid = get_local_id(0);
int id = get_global_id(0);
if (id == 0)
rho2_m[0] = rho2_m[NI * NJ];
barrier(CLK_GLOBAL_MEM_FENCE);
int id1, id2, id3, id4, id5, id6, id7, id8;
if (id < NI_tmp * NJ_tmp * NK_tmp) {
int i = id % NI_tmp;
int k = id / (NI_tmp * NJ_tmp);
int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
int ri = NI - i;
int rj = NJ - j;
int rk = NK - k;
id1 = k * NI * NJ + j * NI + i;
id2 = k * NI * NJ + j * NI + ri;
id3 = k * NI * NJ + rj * NI + i;
id4 = k * NI * NJ + rj * NI + ri;
id5 = rk * NI * NJ + j * NI + i;
id6 = rk * NI * NJ + j * NI + ri;
id7 = rk * NI * NJ + rj * NI + i;
id8 = rk * NI * NJ + rj * NI + ri;
double data = 0.0;
if (id1 < size)
data = rho2_m[id1];
if (i != 0 && id2 < size) rho2_m[id2] = data;
if (j != 0 && id3 < size) rho2_m[id3] = data;
if (i != 0 && j != 0 && id4 < size) rho2_m[id4] = data;
if (k != 0 && id5 < size) rho2_m[id5] = data;
if (k != 0 && i != 0 && id6 < size) rho2_m[id6] = data;
if (k!= 0 && j != 0 && id7 < size) rho2_m[id7] = data;
if (k != 0 && j != 0 & i != 0 && id8 < size) rho2_m[id8] = data;
}
}
/** multiply complex fields */
double2 ComplexMul(double2 a, double2 b) {
double2 c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
__kernel void multiplyComplexFields(__global double2 *ptr1, __global double2 *ptr2,
int size)
{
int idx = get_global_id(0);
if (idx < size)
ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
}

View File

@ -7,8 +7,8 @@ LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
#ADD_EXECUTABLE(testFFT testFFT.cpp) #ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp) #ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp) #ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) #ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) #ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp) #ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp) #ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp) #ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
@ -22,11 +22,10 @@ ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testGather testGather.cpp) #ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp) #ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp) #ADD_EXECUTABLE(testTranspose testTranspose.cpp)
ADD_EXECUTABLE(testRandom testRandom.cpp)
ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp)
ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) #ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp) #ADD_EXECUTABLE(testPush testPush.cpp)
ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) #ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) #ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp) #ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
@ -39,8 +38,8 @@ ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#TARGET_LINK_LIBRARIES(testFFT dks) #TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks) #TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks) #TARGET_LINK_LIBRARIES(testMICOpenCL dks)
TARGET_LINK_LIBRARIES(testFFT3D dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testFFT3D dks)
TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testFFT3DRC dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) #TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks) #TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks) #TARGET_LINK_LIBRARIES(testStockhamFFT dks)
@ -54,11 +53,10 @@ TARGET_LINK_LIBRARIES(testFFT3DRC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testGather dks) #TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks) #TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks) #TARGET_LINK_LIBRARIES(testTranspose dks)
TARGET_LINK_LIBRARIES(testRandom dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES})
TARGET_LINK_LIBRARIES(testCollimatorPhysics dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES})
#TARGET_LINK_LIBRARIES(testPush dks) #TARGET_LINK_LIBRARIES(testPush dks)
TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}) #TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
#TARGET_LINK_LIBRARIES(testIntegration dks) #TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks) #TARGET_LINK_LIBRARIES(testImageReconstruction dks)
@ -83,4 +81,4 @@ TARGET_LINK_LIBRARIES(testFFTSolverMIC dks ${Boost_LIBRARIES} ${CLFFT_LIBRARIES}
#IF (NOT CUDA_VERSION VERSION_LESS "7.0") #IF (NOT CUDA_VERSION VERSION_LESS "7.0")
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
#TARGET_LINK_LIBRARIES(testChiSquareRT dks) #TARGET_LINK_LIBRARIES(testChiSquareRT dks)
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") #ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")

View File

@ -129,9 +129,7 @@ int main(int argc, char *argv[]) {
//init random //init random
base.callInitRandoms(numpart); base.callInitRandoms(numpart);
//**test collimator physics and sort***// //**test collimator physics and sort***//
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr; void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
//allocate memory for particles //allocate memory for particles
@ -212,8 +210,8 @@ int main(int argc, char *argv[]) {
base.freeMemory<double>(pz_ptr, numpart); base.freeMemory<double>(pz_ptr, numpart);
base.freeMemory<double>(param_ptr, 12); base.freeMemory<double>(param_ptr, 12);
/* /*
std::cout << std::fixed << std::setprecision(4); std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
std::cout << p.label[i] << "\t" << p.rx[i] std::cout << p.label[i] << "\t" << p.rx[i]

View File

@ -1,7 +1,6 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include <complex> #include <complex>
#include <string>
#include "Utility/TimeStamp.h" #include "Utility/TimeStamp.h"
#include "DKSBase.h" #include "DKSBase.h"
@ -19,30 +18,22 @@ int main(int argc, char *argv[]) {
int N = 16; int N = 16;
char *api_name = new char[10]; char *api_name = new char[10];
char *device_name = new char[10]; char *device_name = new char[10];
if (argc == 2) {
for (int i = 1; i < argc; i++) { N = atoi(argv[1]);
if (argv[i] == string("-cuda")) { strcpy(api_name, "Cuda");
strcpy(api_name, "Cuda"); strcpy(device_name, "-gpu");
strcpy(device_name, "-gpu"); } else if (argc == 3) {
} N = atoi(argv[1]);
strcpy(api_name, argv[2]);
if (argv[i] == string("-opencl")) { strcpy(device_name, "-gpu");
strcpy(api_name, "OpenCL"); } else if (argc == 4) {
strcpy(device_name, "-gpu"); N = atoi(argv[1]);
} strcpy(api_name, argv[2]);
strcpy(device_name, argv[3]);
if (argv[i] == string("-mic")) { } else {
strcpy(api_name, "OpenMP"); N = 16;
strcpy(device_name, "-mic"); strcpy(api_name, "OpenCL");
} strcpy(device_name, "-gpu");
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
if (argv[i] == string("-N"))
N = atoi(argv[i+1]);
} }
cout << "Use api: " << api_name << ", " << device_name << endl; cout << "Use api: " << api_name << ", " << device_name << endl;
@ -83,16 +74,9 @@ int main(int argc, char *argv[]) {
/* write data to device */ /* write data to device */
ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N); ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
if (N < 5)
printData3DN4(cdata, N, 3);
/* execute fft */ /* execute fft */
base.callFFT(mem_ptr, 3, dimsize); base.callFFT(mem_ptr, 3, dimsize);
if (N < 5) {
base.readData< complex<double> > (mem_ptr, cfft, N*N*N);
printData3DN4(cfft, N, 3);
}
/* execute ifft */ /* execute ifft */
base.callIFFT(mem_ptr, 3, dimsize); base.callIFFT(mem_ptr, 3, dimsize);
@ -102,9 +86,7 @@ int main(int argc, char *argv[]) {
/* read data from device */ /* read data from device */
base.readData< complex<double> >(mem_ptr, cifft, N*N*N); base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
if (N < 5)
printData3DN4(cifft, N, 3);
/* free device memory */ /* free device memory */
base.freeMemory< complex<double> >(mem_ptr, N*N*N); base.freeMemory< complex<double> >(mem_ptr, N*N*N);
@ -148,7 +130,7 @@ void printData3DN4(complex<double>* &data, int N, int dim) {
if (a < 10e-5 && a > -10e-5) if (a < 10e-5 && a > -10e-5)
a = 0; a = 0;
cout << "(" << d << "," << a << ") "; cout << d << "; " << a << "\t";
} }
} }
cout << endl; cout << endl;
@ -175,5 +157,3 @@ void compareData(complex<double>* &data1, complex<double>* &data2, int N, int di
cout << "Size " << N << " CC <--> CC diff: " << sum << endl; cout << "Size " << N << " CC <--> CC diff: " << sum << endl;
} }

View File

@ -1,8 +1,6 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include <complex> #include <complex>
#include <fstream>
#include <iomanip>
#include "Utility/TimeStamp.h" #include "Utility/TimeStamp.h"
#include "DKSBase.h" #include "DKSBase.h"
@ -10,53 +8,54 @@
using namespace std; using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3], int dim); void initData(double *data, int dimsize[3]);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
char *api_name, char *device_name, char *file_name);
void printHelp(); void printHelp();
void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* &data, int N, int dim);
double precision(double a) {
//if (a < 1e-10)
// return 0.0;
//else
return a;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
int N1 = 8; int N1 = 8;
int N2 = 8; int N2 = 8;
int N3 = 8; int N3 = 8;
int dim = 3; int dim = 3;
int loop = 0; int loop = 10;
char *api_name = new char[10];
char *device_name = new char[10];
char *file_name = new char[50];
if ( readParams(argc, argv, N1, N2, N3, loop, dim, api_name, device_name, file_name) ) if ( readParams(argc, argv, N1, N2, N3, loop) )
return 0; return 0;
cout << "Use api: " << api_name << ", " << device_name << endl; int dimsize[3] = {N3, N2, N1};
int dimsize[3] = {N1, N2, N3};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal]; double *rdata = new double[sizereal];
double *outdata = new double[sizereal]; double *outdata = new double[sizereal];
complex<double> *cfft = new complex<double>[sizecomp]; complex<double> *cfft = new complex<double>[sizecomp];
initData(rdata, dimsize, dim);
for (int i=0; i<sizecomp; ++i) {
cfft[i].real() = 7.;
cfft[i].imag() = 3.33;
}
initData(rdata, dimsize);
/* init DKSBase */ /* init DKSBase */
cout << "Init device and set function" << endl; cout << "Init device and set function" << endl;
#ifdef DKS_MIC
DKSBase base; DKSBase base;
base.setAPI(api_name, strlen(api_name)); base.setAPI("OpenMP", 6);
base.setDevice(device_name, strlen(device_name)); base.setDevice("-mic", 4);
base.initDevice();
base.setupFFTRC(dim, dimsize);
/* setup backward fft (COMPLEX->REAL) */
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
#endif
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice(); base.initDevice();
base.setupFFT(dim, dimsize); base.setupFFT(dim, dimsize);
#endif
// allocate memory on device // allocate memory on device
int ierr; int ierr;
@ -68,59 +67,69 @@ int main(int argc, char *argv[]) {
// execute one run before starting the timers // execute one run before starting the timers
base.writeData<double>(real_ptr, rdata, sizereal); base.writeData<double>(real_ptr, rdata, sizereal);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
base.readData<double>(real_res_ptr, outdata, sizereal); base.readData<double>(real_res_ptr, outdata, sizereal);
//timer for total loop time, FFT and IFFT calls
struct timeval timeStart, timeEnd;
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
gettimeofday(&timeStart, NULL);
for (int i=0; i<loop; ++i){
// write data to device
base.writeData<double>(real_ptr, rdata, sizereal);
// execute rcfft
gettimeofday(&timeFFTStart[i], NULL);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeFFTEnd[i], NULL);
// execute crfft
gettimeofday(&timeIFFTStart[i], NULL);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeIFFTEnd[i], NULL);
//normalize
#ifdef DKS_CUDA
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
#endif
// read IFFT data from device
base.readData<double>(real_res_ptr, outdata, sizereal);
ofstream myfile;
myfile.open(file_name);
myfile<< "in\tout\treal\timag\n";
for (int i = 0; i < sizereal; i++) {
//myfile << precision(rdata[i]) << "\t";
//myfile << precision(outdata[i]) << "\t";
if (i < sizecomp) {
myfile << precision(cfft[i].real()) << "\t";
myfile << precision(cfft[i].imag());
}
myfile << "\n";
} }
myfile.close(); gettimeofday(&timeEnd, NULL);
/*
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1; j++) {
cout << rdata[i*N1 + j] << " ";
}
cout << endl;
}
cout << endl;
}
if (dim == 2) {
for (int i = 0; i < N2; i++) {
for (int j = 0; j < N1 / 2 + 1; j++) {
cout << cfft[i*(N1 / 2 + 1) + j] << " ";
}
cout << endl;
}
cout << endl;
}
*/
// free device memory // free device memory
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp); base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
base.freeMemory<double>(real_ptr, sizereal); base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(real_res_ptr, sizereal); base.freeMemory<double>(real_res_ptr, sizereal);
// compare in and out data to see if we get back the same results // compare in and out data to see if we get back the same results
cout << "comp" << endl;
compareData(rdata, outdata, N1, N2, N3, dim); compareData(rdata, outdata, N1, N2, N3, dim);
cout << "done" << endl;
//calculate seconds for total time and fft times
double tfft = 0;
double tifft = 0;
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
for (int i = 0; i < loop; i++) {
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
}
//print timing results
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
<< "\n\n";
return 0; return 0;
} }
@ -128,10 +137,10 @@ int main(int argc, char *argv[]) {
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) { void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
int id; int id;
double sum = 0; double sum = 0;
for (int i = 0; i < NK; i++) { for (int i = 0; i < NI; i++) {
for (int j = 0; j < NJ; j++) { for (int j = 0; j < NJ; j++) {
for (int k = 0; k < NI; k++) { for (int k = 0; k < NK; k++) {
id = i*NI*NJ + j*NI + k; id = k*NI*NJ + j*NI + i;
sum += fabs(data1[id] - data2[id]); sum += fabs(data1[id] - data2[id]);
} }
} }
@ -139,21 +148,13 @@ void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim)
std::cout << "RC <--> CR diff: " << sum << std::endl; std::cout << "RC <--> CR diff: " << sum << std::endl;
} }
void initData(double *data, int dimsize[3], int dim) { void initData(double *data, int dimsize[3]) {
if (dim == 3) { for (int i = 0; i < dimsize[2]; i++) {
for (int i = 0; i < dimsize[2]; i++)
for (int j = 0; j < dimsize[1]; j++)
for (int k = 0; k < dimsize[0]; k++)
data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = sin(k);
} else if (dim == 2) {
for (int j = 0; j < dimsize[1]; j++) { for (int j = 0; j < dimsize[1]; j++) {
for (int k = 0; k < dimsize[0]; k++) { for (int k = 0; k < dimsize[0]; k++) {
data[j*dimsize[0] + k] = sin(k); data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k;
} }
} }
} else {
for (int k = 0; k < dimsize[0]; k++)
data[k] = sin(k);
} }
} }
@ -172,17 +173,10 @@ void printHelp() {
std::cout << std::endl; std::cout << std::endl;
} }
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, int &dim, bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {
char *api_name, char *device_name, char *file_name)
{
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
if ( argv[i] == std::string("-dim")) {
dim = atoi(argv[i + 1]);
i++;
}
if ( argv[i] == std::string("-grid") ) { if ( argv[i] == std::string("-grid") ) {
N1 = atoi(argv[i + 1]); N1 = atoi(argv[i + 1]);
N2 = atoi(argv[i + 2]); N2 = atoi(argv[i + 2]);
@ -199,72 +193,7 @@ bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop, in
printHelp(); printHelp();
return true; return true;
} }
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
strcpy(file_name, "cuda_fft.dat");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
strcpy(file_name, "opencl_fft.dat");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
strcpy(file_name, "openmp_fft.dat");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
strcpy(file_name, "opencl_cpu_fft.dat");
}
} }
return false; return false;
} }
void printData3DN4(complex<double>* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N/2 + 1; k++) {
double d = data[i*N*N + j*N + k].real();
double a = data[i*N*N + j*N + k].imag();
if (d < 10e-5 && d > -10e-5)
d = 0;
if (a < 10e-5 && a > -10e-5)
a = 0;
cout << "(" << d << "," << a << ") ";
}
}
cout << endl;
}
cout << endl;
}
void printData3DN4(double* &data, int N, int dim) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N; k++) {
double d = data[i*N*N + j*N + k];
if (d < 10e-5 && d > -10e-5)
d = 0;
cout << d << " ";
}
}
cout << endl;
}
cout << endl;
}

View File

@ -1,4 +1,5 @@
#include <iostream> #include <iostream>
//#include <mpi.h>
#include <string.h> #include <string.h>
#include "DKSBase.h" #include "DKSBase.h"
@ -10,265 +11,309 @@ using namespace std;
void printData3D(double* data, int N, int NI, const char *message = "") { void printData3D(double* data, int N, int NI, const char *message = "") {
if (strcmp(message, "") != 0) if (strcmp(message, "") != 0)
cout << message; cout << message;
for (int i = 0; i < NI; i++) { for (int i = 0; i < NI; i++) {
for (int j = 0; j < N; j++) { for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
cout << data[i*N*N + j*N + k] << "\t"; cout << data[i*N*N + j*N + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
} }
void initData(double *data, int N) { void initData(double *data, int N) {
for (int i = 0; i < N/4 + 1; i++) { for (int i = 0; i < N/4 + 1; i++) {
for (int j = 0; j < N/2 + 1; j++) { for (int j = 0; j < N/2 + 1; j++) {
for (int k = 0; k < N/2 + 1; k++) { for (int k = 0; k < N/2 + 1; k++) {
data[i*N*N + j*N + k] = k+1; data[i*N*N + j*N + k] = k+1;
} }
} }
} }
} }
void initData2(double *data, int N) { void initData2(double *data, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
data[i] = i; data[i] = i;
} }
void initComplex( complex<double> *d, int N) { void initComplex( complex<double> *d, int N) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
d[i] = complex<double>(2, 0); d[i] = complex<double>(2, 0);
} }
} }
void printComplex(complex<double> *d, int N) { void printComplex(complex<double> *d, int N) {
for (int i = 0; i < N; i++) for (int i = 0; i < N; i++)
cout << d[i] << "\t"; cout << d[i] << "\t";
cout << endl; cout << endl;
}
void printDouble(double *d, int N) {
for (int i = 0; i < N; i++)
cout << d[i] << ", ";
cout << endl;
} }
void initMirror(double *data, int n1, int n2, int n3) { void initMirror(double *data, int n1, int n2, int n3) {
int d = 1; int d = 1;
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1)
data[i * n2 * n1 + j * n1 + k] = d++; data[i * n2 * n1 + j * n1 + k] = d++;
else else
data[i * n2 * n1 + j * n1 + k] = 0; data[i * n2 * n1 + j * n1 + k] = 0;
} }
} }
} }
} }
void printDiv(int c) { void printDiv(int c) {
for (int i = 0; i < c; i++) for (int i = 0; i < c; i++)
cout << "-"; cout << "-";
cout << endl; cout << endl;
} }
void printMirror(double *data, int n1, int n2, int n3) { void printMirror(double *data, int n1, int n2, int n3) {
printDiv(75); printDiv(75);
for (int i = 0; i < n3; i++) { for (int i = 0; i < n3; i++) {
for (int j = 0; j < n2; j++) { for (int j = 0; j < n2; j++) {
for (int k = 0; k < n1; k++) { for (int k = 0; k < n1; k++) {
cout << data[i * n2 * n1 + j * n1 + k] << "\t"; cout << data[i * n2 * n1 + j * n1 + k] << "\t";
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
cout << endl; cout << endl;
} }
double sumData(double *data, int datasize) { double sumData(double *data, int datasize) {
double sum = 0; double sum = 0;
for (int i = 0; i < datasize; i++) for (int i = 0; i < datasize; i++)
sum += data[i]; sum += data[i];
return sum; return sum;
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
char *api_name = new char[10]; /* mpi init */
char *device_name = new char[10]; //int rank, nprocs;
//MPI_Init(&argc, &argv);
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
for (int i = 1; i < argc; i++) { /*
if (argv[i] == string("-cuda")) { if (nprocs != 8) {
strcpy(api_name, "Cuda"); cout << "example was set to run with 8 processes" << endl;
strcpy(device_name, "-gpu"); cout << "exit..." << endl;
} return 0;
}
*/
if (argv[i] == string("-opencl")) { /* set domain size */
strcpy(api_name, "OpenCL"); int NG[3] = {64, 64, 32};
strcpy(device_name, "-gpu"); int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2};
} int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1};
int sizerho = NG[0] * NG[1] * NG[2];
int sizegreen = ng[0] * ng[1] * ng[2];
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
int id[3];
if (argv[i] == string("-mic")) { //id[0] = 0;
strcpy(api_name, "OpenMP"); //id[1] = NL[1] * (rank % 4);
strcpy(device_name, "-mic"); //id[2] = NL[2] * (rank / 4);
}
if (argv[i] == string("-cpu")) { /* print some messages bout the example in the begginig */
strcpy(api_name, "OpenCL"); cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl;
strcpy(device_name, "-cpu"); //cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl;
} cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl;
} //cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl;
int tmp[3];
/* for (int p = 1; p < nprocs; p++) {
MPI_Status mpistatus;
MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus);
cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl;
}*/
// } else {
// MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD);
// }
cout << "Use api: " << api_name << ", " << device_name << endl; /* dks init and create 2 streams */
int dkserr;
//int streamGreens, streamFFT;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
#endif
/* set domain size */ #ifdef DKS_CUDA
int NG[3] = {64, 64, 32}; DKSBase base;
int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; base.setAPI("Cuda", 4);
int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; base.setDevice("-gpu", 4);
int sizerho = NG[0] * NG[1] * NG[2]; base.initDevice();
int sizegreen = ng[0] * ng[1] * ng[2]; #endif
int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1;
/* print some messages bout the example in the begginig */ //base.createStream(streamFFT);
cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; //if (rank == 0) {
cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; // base.createStream(streamGreens);
base.setupFFT(3, NG);
//}
/* dks init and create 2 streams */ /* allocate memory and init rho field */
int dkserr; double *rho = new double[sizerho];
DKSBase base; double *rho_out = new double[sizerho];
base.setAPI(api_name, strlen(api_name)); //double *green_out = new double[sizegreen];
base.setDevice(device_name, strlen(device_name)); initMirror(rho, NL[0], NL[1], NL[2]);
base.initDevice();
base.setupFFT(3, NG);
/* allocate memory and init rho field */ /*
double *rho = new double[sizerho]; allocate memory on device for
double *rho_out = new double[sizerho]; - rho field
//double *green_out = new double[sizegreen]; - rho FFT
double *mirror_out = new double[sizerho]; - tmpgreen
//initMirror(rho, NL[0], NL[1], NL[2]); - greens integral
initMirror(rho, NG[0], NG[1], NG[2]); - greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
// if (rank == 0) {
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* } else {
grntr_ptr = NULL;
rho2_ptr = NULL;
grn_ptr = NULL;
rho2tr_ptr = NULL;
tmpgreen_ptr = NULL;
}*/
/*
allocate memory on device for
- rho field
- rho FFT
- tmpgreen
- greens integral
- greens integral FFT
*/
void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr;
tmpgreen_ptr = base.allocateMemory<double>(sizegreen, dkserr);
rho2_ptr = base.allocateMemory<double>(sizerho, dkserr);
grn_ptr = base.allocateMemory<double>(sizerho, dkserr);
rho2tr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
grntr_ptr = base.allocateMemory< complex<double> >(sizecomp, dkserr);
/* =================================================*/ /* send and receive pointer to allocated memory on device */
/* =================================================*/ /*
/* =====loop trough fftpoison solver iterations=====*/ if (rank == 0) {
/* =================================================*/ for (int p = 1; p < nprocs; p++)
/* =================================================*/ base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD);
} else {
rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr);
}
MPI_Barrier(MPI_COMM_WORLD);
*/
double old_sum = 0;
int hr_m[3] = {1, 1, 1}; /* =================================================*/
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], hr_m[0], hr_m[1], hr_m[2]); /* =================================================*/
/* =====loop trough fftpoison solver iterations=====*/
/* =================================================*/
/* =================================================*/
/* calculate greens integral on gpu */ double old_sum = 0;
base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); double tmp_sum = 0;
for (int l = 0; l < 100; l++) {
//MPI_Barrier(MPI_COMM_WORLD);
/* on node 0, calculate tmpgreen on gpu */
int hr_m[3] = {1, 1, 1};
//if (rank == 0)
base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1],
hr_m[0], hr_m[1], hr_m[2]);
/* mirror the field */ /* calculate greens integral on gpu */
base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); //if (rank == 0)
/* base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]);
base.readData<double>(grn_ptr, mirror_out, sizerho);
for (int i = 0; i < sizerho; i++)
cout << mirror_out[i] << " ";
cout << endl << endl;
for (int i = 0; i < sizerho; i++) /* mirror the field */
cout << rho[i] << " "; //if (rank == 0)
cout << endl << endl; base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]);
*/
/* transfer rho field to device */
base.writeData<double>(rho2_ptr, rho, sizerho);
/* get FFT of rho field */
base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
/* get FFT of mirrored greens integral */ /* get FFT of mirrored greens integral */
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG); //if (rank == 0)
base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG);
/* multiply both FFTs */ /* transfer rho field to device */
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); //base.gather3DDataAsync<double> ( rho2_ptr, rho, NG, NL, id, streamFFT);
base.writeData<double>(rho2_ptr, rho,NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
/* /* get FFT of rho field */
complex<double> *crho = new complex<double>[sizecomp]; //if (rank == 0) {
complex<double> *cgre = new complex<double>[sizecomp]; //base.syncDevice();
base.readData< complex<double> >(rho2tr_ptr, crho, sizecomp); base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG);
base.readData< complex<double> >(grntr_ptr, cgre, sizecomp); //}
for (int i = 0; i < sizecomp; i++) /* multiply both FFTs */
cout << cgre[i].real() << " "; //if (rank == 0)
cout << endl << endl; base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < sizecomp; i++) /* inverse fft and transfer data back */
cout << crho[i].real() << " "; /*
cout << endl << endl; multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
delete[] crho; only when data transfer is finished
delete[] cgre; */
*/ //if (rank == 0) {
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
base.readData<double> (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]);
//MPI_Barrier(MPI_COMM_WORLD);
//base.syncDevice();
//MPI_Barrier(MPI_COMM_WORLD);
//cout << "result: " << sumData(rho_out, sizerho) << endl;
if (l == 0) {
old_sum = sumData(rho_out, sizerho);
} else {
tmp_sum = sumData(rho_out, sizerho);
if (old_sum != tmp_sum) {
cout << "diff in iteration: " << l << endl;
}
}
/*} else {
MPI_Barrier(MPI_COMM_WORLD);
base.scatter3DDataAsync<double> (rho2_ptr, rho_out, NG, NL, id);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
*/
/* inverse fft and transfer data back */
/*
multiple device syncs and mpi barriers are used to make sure data
transfer is started when results are ready and progam moves on
only when data transfer is finished
*/
base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG);
base.readData<double> (rho2_ptr, rho_out, sizerho);
for (int i = 0; i < 10; i++)
cout << rho_out[i] << " ";
cout << endl;
old_sum = sumData(rho_out, sizerho);
}
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
/* ==========end fftpoison solver test run==========*/ /* ==========end fftpoison solver test run==========*/
/* =================================================*/ /* =================================================*/
/* =================================================*/ /* =================================================*/
base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
base.freeMemory<double>(rho2_ptr, sizerho);
delete[] rho_out;
delete[] rho; /* free memory on device */
delete[] mirror_out; //if (rank == 0) {
cout << "Final sum: " << old_sum << endl; base.freeMemory<double>(tmpgreen_ptr, sizegreen);
base.freeMemory<double>(grn_ptr, sizerho);
base.freeMemory< complex<double> >(rho2tr_ptr, sizecomp);
base.freeMemory< complex<double> >(grntr_ptr, sizecomp);
//MPI_Barrier(MPI_COMM_WORLD);
base.freeMemory<double>(rho2_ptr, sizerho);
cout << "Final sum: " << old_sum << endl;
/*} else {
base.closeHandle(rho2_ptr);
MPI_Barrier(MPI_COMM_WORLD);
}*/
//MPI_Finalize();
} }

View File

@ -1,81 +0,0 @@
#include <iostream>
#include <string>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
using namespace std;
int main(int argc, char *argv[]) {
int size = 10;
bool apiSet = false;
char *api_name = new char[10];
char *device_name = new char[10];
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
apiSet = true;
}
if (argv[i] == string("-N")) {
size = atoi(argv[i+1]);
i++;
}
}
if (!apiSet) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of randoms: " << size << endl;
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
base.callInitRandoms(size);
//create host vector to store results
double *host_data = new double[size];
//create device vector
void *device_data = base.allocateMemory<double>(size, ierr);
for (int i = 0; i < 5; i++) {
//fill device vector with random values
base.callCreateRandomNumbers(device_data, size);
//read device vector
base.readData<double>(device_data, host_data, size);
//print host data
for (int i = 0; i < size; i++)
cout << host_data[i] << " ";
cout << endl;
}
//free device vector
base.freeMemory<double>(device_data, size);
//free host data
delete[] host_data;
return 0;
}