snapshot of svn

This commit is contained in:
Uldis Locans
2016-10-10 14:49:32 +02:00
commit 4fa529aaea
122 changed files with 23153 additions and 0 deletions

174
CMakeLists.txt Normal file
View File

@ -0,0 +1,174 @@
CMAKE_MINIMUM_REQUIRED (VERSION 3.2)
PROJECT (DKS)

#package metadata; the escaped quotes become part of the values
SET (DKS_VERSION_MAJOR 1)
SET (DKS_VERSION_MINOR 0.1)
SET (PACKAGE \"dks\")
SET (PACKAGE_BUGREPORT \"locans.uldis@psi.ch\")
SET (PACKAGE_NAME \"DKS\")
SET (PACKAGE_STRING \"DKS\ 1.0.1\")
SET (PACKAGE_TARNAME \"dks\")
SET (PACKAGE_VERSION \"1.0.1\")
SET (VERSION \"1.0.1\")

#make the modules shipped with the project visible to FIND_PACKAGE / INCLUDE
LIST (APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/")

#get compiler name: strip the directory part of the compiler path
#(the path is quoted so it always reaches STRING as a single argument)
STRING (REGEX REPLACE ".*/" "" COMPILER_NAME "${CMAKE_CXX_COMPILER}")
MESSAGE (STATUS "Your compiler is: ${COMPILER_NAME}")
MESSAGE (STATUS "Your compiler is: ${CMAKE_CXX_COMPILER}")
MESSAGE (STATUS "C compiler: ${CMAKE_C_COMPILER_ID}")
MESSAGE (STATUS "CXX compiler: ${CMAKE_CXX_COMPILER_ID}")

#opencl and cuda kernel files are in the builds include directory
SET (OPENCL_KERNELS -DOPENCL_KERNELS=\\"${CMAKE_INSTALL_PREFIX}/include/\\")
MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}")
#locate boost: the search prefix is taken from the BOOST_DIR environment variable,
#shared (non-static) boost libraries and runtime are requested
SET (BOOSTROOT $ENV{BOOST_DIR})
SET (Boost_USE_STATIC_LIBS OFF)
SET (Boost_USE_STATIC_RUNTIME OFF)
FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system)

#report what was found and expose it to every target defined below
IF (Boost_FOUND)
    MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}")
    MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}")
    MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}")

    INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS})
    LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
ENDIF (Boost_FOUND)

#UQTK support is optional and off by default
OPTION (USE_UQTK "Use UQTK" OFF)
#intel icpc compiler specific flags
#the compiler id is quoted so that an empty id still gives a valid IF expression
IF ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" OR USE_INTEL)

    #for intel compiler turn on openmp and opencl
    OPTION (USE_OPENCL "Use OpenCL" ON)
    OPTION (USE_CUDA "Use CUDA" OFF)
    OPTION (USE_MIC "Use intel MIC" ON)

    #find xiar and xild and set flags for offload build on mic
    FIND_PROGRAM(XIAR xiar)
    IF(XIAR)
        MESSAGE(STATUS "xiar found: ${XIAR}")
        SET(CMAKE_AR "${XIAR}")
    ENDIF(XIAR)
    MARK_AS_ADVANCED(XIAR)

    SET(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")
    SET(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs -qoffload-build <TARGET> <LINK_FLAGS> <OBJECTS>")

    FIND_PROGRAM(XILD xild)
    IF(XILD)
        SET(CMAKE_LINKER "${XILD}")
    ENDIF(XILD)
    MARK_AS_ADVANCED(XILD)

    #set flags for openmp and opencl
    #flags are appended (as in the GNU branch) so flags supplied by the user are kept
    #TODO: check which opencl to use: nvidia, amd, intel, apple
    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -offload -mkl -openmp -lOpenCL -lpthread -DDKS_MIC -DDKS_OPENCL -qopt-report=5 -qopt-report-phase=vec -std=c++11")

    #if an mpi compiler wrapper is used set the mpi flag
    IF ("${COMPILER_NAME}" STREQUAL "mpicxx" OR "${COMPILER_NAME}" STREQUAL "mpiicpc")
        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
    ENDIF ()

ENDIF ()
#gnu compiler specific flags
#the compiler id is quoted so that an empty id still gives a valid IF expression
IF ( ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") AND NOT USE_INTEL)

    OPTION (USE_OPENCL "Use OpenCL" ON)
    OPTION (USE_CUDA "Use CUDA" OFF)
    OPTION (USE_MIC "Use intel MIC" OFF)

    SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu")

    FIND_PACKAGE(CUDA)
    IF (CUDA_FOUND)
        #CUDA is switched on whenever the toolkit is found
        SET (USE_CUDA ON)
        INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
        LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
        MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}")
        MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64")
        MESSAGE (STATUS "cuda version: ${CUDA_VERSION}")

        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA")
        SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false")
        SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}")

        #if cuda version >= 7.0 add runtime compilation flags
        IF (NOT CUDA_VERSION VERSION_LESS "7.0")
            SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda")
        ENDIF ()

        MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}")

        SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
        SET(BUILD_SHARED_LIBS OFF)
    ELSE ()
        MESSAGE(STATUS "CUDA not found, looking for OpenCL")
        FIND_PACKAGE(OpenCL)
        IF (OpenCL_FOUND)
            #OpenCL_LIBRARY is the path of the library itself; the linker
            #search path needs the directory that contains it
            GET_FILENAME_COMPONENT (DKS_OPENCL_LIBRARY_DIR "${OpenCL_LIBRARY}" DIRECTORY)
            MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}")
            MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}")
            MESSAGE(STATUS "OpenCL library dir: ${DKS_OPENCL_LIBRARY_DIR}")
            INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR})
            LINK_DIRECTORIES(${DKS_OPENCL_LIBRARY_DIR})
        ENDIF ()
    ENDIF ()

    #select exactly one set of opencl flags
    IF (CUDA_FOUND)
        #if cuda found set cuda opencl flags
        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
    ELSEIF (APPLE)
        #if mac OS and no CUDA set apple opencl flags (framework only, no -lOpenCL)
        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -framework opencl -lpthread -DDKS_OPENCL")
    ELSEIF (OpenCL_FOUND)
        #if cuda not found but amd opencl found set opencl flags
        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL")
    ENDIF ()

    #if mpi compiler used set mpi flag
    IF ("${COMPILER_NAME}" STREQUAL "mpicxx")
        SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI")
    ENDIF ()

ENDIF ()
#add the kernel file location define to the final compiler flags
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}")
MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}")

ADD_SUBDIRECTORY (src)

#tests are opt-in; declaring the option makes ENABLE_TESTS visible in the cache
OPTION (ENABLE_TESTS "Build the DKS tests" OFF)
IF (ENABLE_TESTS)
    ADD_SUBDIRECTORY (test)
ENDIF (ENABLE_TESTS)

ADD_SUBDIRECTORY (auto-tuning)

### write configure files ###
CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake )

### install files ###
#a relative destination is resolved against CMAKE_INSTALL_PREFIX at install time
INSTALL (
    FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake
    DESTINATION "lib/cmake/${PROJECT_NAME}"
    RENAME ${PROJECT_NAME}Config.cmake
)

82
ReadMe.first Normal file
View File

@ -0,0 +1,82 @@
##################################################################
#
# Name: Dynamic Kernel Scheduler
# Version: 1.0
# Author: Uldis Locans
# Contacts: locans.uldis@psi.ch
#
##################################################################
Dynamic Kernel Scheduler is a library that provides a software layer between host application
and hardware accelerators. DKS handles communication between host and device and schedules task
execution using predefined algorithms written using CUDA and OpenCL for GPUs, and OpenMP with
offload pragmas for IntelMIC. See DKSBase class documentation for full list of functions provided
by DKS.
#####Requirements#####
OpenMPI (Cuda aware OpenMPI enabled for full compatibility)
g++ or icpc compiler
Cuda 7.0 or higher (optional)
Nvidia or Intel OpenCL SDK (optional)
Intel MIC compilers (optional)
######Install######
#check out DKS
svn co svn+ssh://YOULOGIN@savannah02.psi.ch/repos/amas/users/adelmann/Ph.D-students/Locans/work/DKS/trunk DKS
#set compilers to use
#supported c++ compilers: g++, icpc, mpicxx with g++
#supported c compilers: gcc, icc, mpicc with gcc
export CXX_COMPILER=cpp_compiler_name
export CC_COMPILER=c_compiler_name
#set dks root directory
cd DKS
export DKS_ROOT=$PWD
#set build directory
export DKS_BUILD_DIR=path_to_build_directory
mkdir $DKS_BUILD_DIR
cd $DKS_BUILD_DIR
#set install directory
export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/
CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_INSTALL_DIR $DKS_ROOT
make
make install
######DKS usage######
Make install copies the include files and library files to the $DKS_BUILD_DIR/build folder. The lib
folder in the build directory contains libdks.a and libdksshared.so; one of these libraries can be
used to link with DKS. All the necessary include files are located in $DKS_BUILD_DIR/build/include.
Additional flags needed for CUDA and OpenCL mode:
-lcudart -lcufft -lcublas -lnvToolsExt -lOpenCL -lnvrtc -lcuda -DDKS_CUDA -DDKS_OPENCL
Additional flags needed for IntelMIC and OpenCL mode:
-offload -mkl -openmp -lOpenCL -DDKS_MIC -DDKS_OPENCL
Note: always run make install, during runtime OpenCL and CUDA will search for kernel files in
$DKS_INSTALL_DIR/build/include directory for runtime compilation.
######Running DKS######
#running with cuda
#nvidia multi process service started for better CUDA and MPI execution
#to start mps service (if multiple users use DKS start MPS as root)
nvidia-cuda-mps-control -d
#to stop mps service
echo quit | nvidia-cuda-mps-control
#running dks with MIC
#Intel Manycore Platform Software Stack (mpss) service started
#to start mpss
service mpss start

View File

@ -0,0 +1,19 @@
#headers and libraries of DKS live in the src directory
INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )
LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src )

#tests that only need dks and boost: the two chi square kernel tests
#and the test that verifies the search functions
FOREACH (DKS_TEST_NAME testChiSquareRT testChiSquareRTRandom testSearch)
    ADD_EXECUTABLE(${DKS_TEST_NAME} ${DKS_TEST_NAME}.cpp)
    TARGET_LINK_LIBRARIES(${DKS_TEST_NAME} dks ${Boost_LIBRARIES})
ENDFOREACH (DKS_TEST_NAME)

#chi square test that additionally links against the UQTK libraries
IF (USE_UQTK)
    ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp)
    TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran)
ENDIF (USE_UQTK)

View File

@ -0,0 +1,385 @@
#include <iostream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <fstream>
#include "DKSBaseMuSR.h"
#include "Utility/DKSTimer.h"
#define PI 3.14159265358979323846
#define TWO_PI 6.283185307179586231996
#define DEG_TO_RAD 1.7453292519943295474371681e-2
#define N0 0.25
#define TAU 2.197019
#define BKG 1.0
#define ALPHA 1.0
#define BETA 1.0
using namespace std;
/** Fill data[0..N-1] with rand()/RAND_MAX multiplied by scale. */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    data[idx] = ((double)rand() / RAND_MAX ) * scale;
    ++idx;
  }
}
/** MusrFit predefined functions.
* Predefined functions from MusrFit that can be used to define the theory function.
* The first parameter of every function is always the time t; the remaining parameters
* depend on the function.
*/
/** Returns exp(-lamda * t). */
double se(double t, double lamda) {
  const double exponent = -lamda*t;
  return exp(exponent);
}
/** Returns exp(-(lamda * t)^beta). */
double ge(double t, double lamda, double beta) {
  const double stretched = pow(lamda*t, beta);
  return exp( -stretched );
}
/** Returns exp(-0.5 * (sigma * t)^2). */
double sg(double t, double sigma) {
  const double sigmatSq = pow(sigma*t, 2);
  return exp( -0.5 * sigmatSq );
}
/** Returns 1/3 + 2/3 * (1 - (sigma*t)^2) * exp(-0.5 * (sigma*t)^2). */
double stg(double t, double sigma) {
  const double x2 = pow(sigma*t,2);
  const double damping = exp(-0.5 * x2);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x2) * damping;
}
/** Returns 1/3 + 2/3 * (1 - lambda*t) * exp(-lambda*t). */
double sekt(double t, double lambda) {
  const double lt = lambda*t;
  const double damping = exp(-lt);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lt) * damping;
}
/** Returns 1/3 + 2/3 * (1 - lambda*t - (sigma*t)^2) * exp(-lambda*t - 0.5*(sigma*t)^2). */
double lgkt(double t, double lambda, double sigma) {
  const double lt = lambda*t;
  const double st2 = pow(sigma*t, 2.0);
  const double amplitude = (2.0/3.0)*(1.0 - lt - st2);
  return (1.0/3.0) + amplitude * exp(-lt - 0.5*st2);
}
/**
 * Returns 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta),
 * or 0 when beta is below 1e-3.
 */
double skt(double t, double sigma, double beta) {
  double value = 0.0;
  if (!(beta < 1.0e-3)) {
    const double stb = pow(sigma*t, beta);
    value = (1.0/3.0) + (2.0/3.0)*(1.0 - stb) * exp(-stb/beta);
  }
  return value;
}
/**
 * Returns 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q/rateT) * exp(-rateT) with
 * lamt2q = t^2 * lambda^2 * q, rateL = sqrt(|4*lambda^2*(1-q)*t/gamma|) and
 * rateT = sqrt(|4*lambda^2*(1-q)*t/gamma| + lamt2q).
 */
double spg(double t, double lambda, double gamma, double q) {
  const double lambdaSq = lambda*lambda;
  const double lamt2q = t*t*lambdaSq*q;
  const double absRate2 = fabs(4.0*lambdaSq*(1.0-q)*t/gamma);
  const double rateL = sqrt(absRate2);
  const double rateT = sqrt(absRate2+lamt2q);
  const double longitudinal = (1.0/3.0)*exp(-rateL);
  const double transverse = (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
  return longitudinal + transverse;
}
/**
 * Returns 1/6 * (1 - nu*t/2) * exp(-nu*t/2)
 *       + 1/3 * (1 - nu*t/4) * exp(-0.25 * (nu*t + 2.44949*lambda*t)).
 */
double rahf(double t, double nu, double lambda) {
  const double nut = nu*t;
  const double halfNut = nu*t/2.0;
  const double lamt = lambda*t;
  const double first = (1.0/6.0)*(1.0-halfNut)*exp(-halfNut);
  const double second = (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
  return first + second;
}
/** Returns cos(TWO_PI*nu*t + DEG_TO_RAD*phi). */
double tf(double t, double phi, double nu) {
  const double wt = TWO_PI*nu*t;
  const double ph = DEG_TO_RAD * phi;
  const double angle = wt + ph;
  return cos(angle);
}
/**
 * Returns alpha * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 *       + (1 - alpha) * exp(-lambdaL*t).
 */
double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double angle = TWO_PI*nu*t + DEG_TO_RAD*phi;
  const double oscillating = alpha*cos(angle)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
/** Returns j0(TWO_PI*nu*t + DEG_TO_RAD*phi). */
double b(double t, double phi, double nu) {
  const double argument = TWO_PI*nu*t + DEG_TO_RAD*phi;
  return j0(argument);
}
/**
 * Returns alpha * j0(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 *       + (1 - alpha) * exp(-lambdaL*t).
 */
double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double argument = TWO_PI * nu * t + DEG_TO_RAD * phi;
  const double oscillating = alpha*j0(argument)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
/** Returns exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)). */
double ab(double t, double sigma, double gamma) {
  const double gt = gamma*t;
  const double ratioSq = pow(sigma/gamma,2.0);
  return exp(-ratioSq*(exp(-gt) - 1.0 + gt));
}
/**
 * Returns 1/3 + 2/3 * aa^1.5 * (1 - D0t2*aa) * exp(-0.5*D0t2*aa) with
 * D0t2 = (Delta0*t)^2 and aa = 1 / (1 + Rb^2 * D0t2).
 */
double snkzf(double t, double Delta0, double Rb) {
  const double d0t2 = pow(Delta0*t, 2.0);
  const double aa = 1.0/(1.0+pow(Rb,2.0)*d0t2);
  const double shape = (2.0/3.0)*pow(aa,1.5)*(1.0-d0t2*aa);
  return (1.0/3.0) + shape*exp(-0.5*d0t2*aa);
}
/**
 * Returns sqrt(aa) * exp(-0.5*D0t2*aa) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) with
 * D0t2 = (Delta0*t)^2 and aa = 1 / (1 + Rb^2 * D0t2).
 */
double snktf(double t, double phi, double nu, double Delta0, double Rb) {
  const double angle = TWO_PI*nu*t + DEG_TO_RAD*phi;
  const double d0t2 = pow(Delta0*t, 2.0);
  const double aa = 1.0/(1.0+pow(Rb,2.0)*d0t2);
  const double envelope = sqrt(aa)*exp(-0.5*d0t2*aa);
  return envelope*cos(angle);
}
/**
 * Returns sqrt(aa) * exp(-2 * Delta0^2 * theta * aa) with
 * theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and aa = 1 / (1 + 4*(Rb*Delta0)^2 * theta).
 */
double dnkzf(double t, double Delta0, double Rb, double nuc) {
  const double nuct = nuc*t;
  // NOTE(review): ab() in this file uses "exp(-gt) - 1.0 + gt"; here the linear
  // term is subtracted - confirm the sign against the reference implementation.
  const double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
  const double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
  const double damping = exp(-2.0*Delta0*Delta0*theta*aa);
  return sqrt(aa)*damping;
}
/**
 * Returns sqrt(aa) * exp(-Delta0^2 * theta * aa) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) with
 * theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and aa = 1 / (1 + 2*(Rb*Delta0)^2 * theta).
 */
double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
  const double angle = TWO_PI*nu*t + DEG_TO_RAD*phi;
  const double nuct = nuc*t;
  // NOTE(review): ab() in this file uses "exp(-gt) - 1.0 + gt"; here the linear
  // term is subtracted - confirm the sign against the reference implementation.
  const double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
  const double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
  const double envelope = sqrt(aa)*exp(-Delta0*Delta0*theta*aa);
  return envelope*cos(angle);
}
double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
double timeStart, double timeStep, bool mlh = false)
{
double result = 0.0;
for (int i = 0; i < Ndata; i++) {
double t = timeStart + i*timeStep;
double d = data[i];
double e = data[i];
double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG;
if (mlh) {
if ((d > 1.0e-9) && (fabs(theo) > 1.0e-9))
result += 2.0 * ((theo - d) + d * log(d / theo));
else
result += 2.0 * (theo - d);
} else {
if (e != 0.0)
result += ( (theo - d) * (theo - d) ) / (e * e);
else
result += theo * theo;
}
}
return result;
}
double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc,
double timeStart, double timeStep, bool mlh = false)
{
double result = 0.0;
for (int i = 0; i < Ndata; i++) {
double t = timeStart + i*timeStep;
double d = data[i];
double e = data[i];
double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]);
double ab = ALPHA * BETA;
double theo = ((ab+1.0)*theoVal - (ALPHA-1.0))/((ALPHA+1.0) - (ab-1.0)*theoVal);
if (mlh) {
result += 0.0; //log max likelihood not defined here
} else {
if (e != 0.0)
result += ( (theo - d) * (theo - d) ) / (e * e);
else
result += theo * theo;
}
}
return result;
}
/**
 * Run the chi square comparison for one API / device pair.
 * Initializes the device, then for every device returned by getDeviceList and every
 * histogram size launches the DKS chi square kernel and the matching CPU reference
 * on the same random data and prints both results.
 * @param autotune sweep sizes 1e5..1e6 and switch autotuning on; otherwise one run at 1e5
 * @param mlh passed to the program compilation and to the CPU reference
 * @param asym use fit type 2 and cpuChiSqAsym instead of fit type 1 and cpuChiSq
 * @return DKS_ERROR when initDevice fails, DKS_SUCCESS otherwise
 */
int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) {

  int ierr;

  //histogram sizes: 1e5 .. 1e6 in steps of 1e5 with autotune, a single 1e5 run
  //otherwise (used for debugging the kernel)
  int firstSize = 1e5;
  int sizeStep = 1e5;
  int lastSize = (autotune) ? 1e6 : 1e5;

  //parameter, function and map sizes used in tests
  int Npar = 66;
  int Nfnc = 2;
  int Nmap = 5;

  //print test info
  cout << "=========================BEGIN TEST=========================" << endl;
  cout << "Use api: " << api_name << "\t" << device_name << endl;
  cout << "Max log likelihood: " << std::boolalpha << mlh << endl;
  cout << "Asymetry fit: " << std::boolalpha << asym << endl;

  DKSBaseMuSR dksbase;
  dksbase.setAPI(api_name);
  dksbase.setDevice(device_name);
  ierr = dksbase.initDevice();
  if (ierr != DKS_SUCCESS) {
    std::cout << "Device not supported!" << std::endl;
    return DKS_ERROR;
  }

  //get the list of different devices
  std::vector<int> devices;
  dksbase.getDeviceList(devices);
  std::cout << "Unique devices: " << devices.size() << std::endl;

  //function string compiled for the device and the map it indexes through
  string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])";
  int map[5] = {0, 0, 1, 2, 1};

  //fit type handed to the kernel launch: 1 = single histogram, 2 = asymmetry
  int fitType = asym ? 2 : 1;

  for (unsigned int device = 0; device < devices.size(); device++) {
    for (int Ndata = firstSize; Ndata <= lastSize; Ndata += sizeStep) {

      dksbase.setDefaultDevice(device);
      std::cout << "Ndata: " << Ndata << std::endl;

      //init the chi square calculations
      dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap);

      //random host arrays for data, parameters and functions
      double *data = new double[Ndata];
      double *par = new double[Npar];
      double *fnc = new double[Nfnc];
      randData(data, Ndata);
      randData(par, Npar);
      randData(fnc, Nfnc, 100);

      //allocate device memory and upload data, params, functions and maps
      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
      dksbase.writeData<double>(data_ptr, data, Ndata);
      dksbase.writeParams(par, Npar);
      dksbase.writeFunctions(fnc, Nfnc);
      dksbase.writeMaps(map, Nmap);

      //set musrfit constants
      dksbase.callSetConsts(N0, TAU, BKG);
      dksbase.callSetConsts(ALPHA, BETA);

      //compile the program created with the function string
      dksbase.callCompileProgram(sFnc, mlh);

      //set autotuning on/off
      if (autotune)
        dksbase.setAutoTuningOn();

      //results plus time step and start time
      double result_gpu = 0.0;
      double result_cpu = 0.0;
      double dt = 1e-12;
      double ts = 1e-7;

      //run the kernel first, then the same calculation on the cpu;
      //the data buffer is passed as both data and error
      dksbase.callLaunchChiSquare(fitType, data_ptr, data_ptr, Ndata, Npar, Nfnc,
                                  Nmap, ts, dt, result_gpu);
      if (asym)
        result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);
      else
        result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh);

      //check the results
      cout << "DKS: " << result_gpu << endl;
      cout << "CPU: " << result_cpu << endl;

      //free CPU and GPU memory
      dksbase.freeMemory<double>(data_ptr, Ndata);
      dksbase.freeChiSquare();
      delete[] data;
      delete[] par;
      delete[] fnc;
      cout << "------------------------------------------------------------" << endl;
    }
  }

  return DKS_SUCCESS;
}
/**
 * Parse the command line and run the chi square test on the first numPlatforms
 * entries of the built-in API / device table.
 * Recognized flags: -cuda, -opencl, -mic, -cpu, -mlh, -asym, -autotune.
 */
int main(int argc, char* argv[]) {

  bool asym = false;
  bool mlh = false;
  bool autotune = false;

  //std::string replaces the heap buffers that were filled with strcpy and never freed.
  //NOTE(review): the API / device chosen here is recorded but the platform loop
  //below does not read it - confirm whether the selection should drive the loop.
  std::string api_name = "Cuda";
  std::string device_name = "-gpu";

  for (int i = 1; i < argc; ++i) {
    const std::string arg(argv[i]);

    if (arg == "-cuda") {
      api_name = "Cuda";
      device_name = "-gpu";
    }

    if (arg == "-opencl") {
      api_name = "OpenCL";
      device_name = "-gpu";
    }

    if (arg == "-mic") {
      api_name = "OpenCL";
      device_name = "-mic";
    }

    if (arg == "-cpu") {
      api_name = "OpenCL";
      device_name = "-cpu";
    }

    if (arg == "-mlh")
      mlh = true;

    if (arg == "-asym")
      asym = true;

    if (arg == "-autotune")
      autotune = true;
  }

  //only the first numPlatforms entries of the tables are exercised
  int numPlatforms = 2;
  const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"};
  const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"};

  for (int i = 0; i < numPlatforms; i++) {
    runTest(api[i], device[i], autotune, mlh, asym);
  }

  return 0;
}

View File

@ -0,0 +1,450 @@
#include <iostream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <fstream>
#include "DKSBaseMuSR.h"
#include "Utility/DKSTimer.h"
#define PI 3.14159265358979323846
#define TWO_PI 6.283185307179586231996
#define DEG_TO_RAD 1.7453292519943295474371681e-2
//#define N0 0.25
#define N0 1e-10
#define TAU 2.197019
#define BKG 0.05
using namespace std;
typedef std::function<double()> doubleF;
/** Fill data[0..N-1] with rand()/RAND_MAX multiplied by scale. */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    data[idx] = ((double)rand() / RAND_MAX ) * scale;
    ++idx;
  }
}
/** MusrFit predefined functions.
* Predefined functions from MusrFit that can be used to define the theory function.
* The first parameter of every function is always the time t; the remaining parameters
* depend on the function.
*/
/** Returns exp(-(*lamda) * (*t)). */
double se(double *t, double *lamda) {
  const double exponent = -(*lamda) * (*t);
  return exp(exponent);
}
/** Returns exp(-((*lamda) * (*t))^(*beta)). */
double ge(double *t, double *lamda, double *beta) {
  const double stretched = pow( (*lamda)*(*t), *beta);
  return exp( -stretched );
}
/** Returns exp(-0.5 * ((*sigma) * (*t))^2). */
double sg(double *t, double *sigma) {
  const double sigmatSq = pow((*sigma)*(*t), 2);
  return exp( -0.5 * sigmatSq );
}
/** Returns 1/3 + 2/3 * (1 - x) * exp(-0.5 * x) with x = ((*sigma) * (*t))^2. */
double stg(double *t, double *sigma) {
  const double x2 = pow((*sigma)*(*t),2);
  const double damping = exp(-0.5 * x2);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x2) * damping;
}
/** Returns 1/3 + 2/3 * (1 - x) * exp(-x) with x = (*lambda) * (*t). */
double sekt(double *t, double *lambda) {
  const double lt = *lambda*(*t);
  const double damping = exp(-lt);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lt) * damping;
}
/**
 * Returns 1/3 + 2/3 * (1 - lt - st2) * exp(-lt - 0.5*st2) with
 * lt = (*lambda) * (*t) and st2 = ((*sigma) * (*t))^2.
 */
double lgkt(double *t, double *lambda, double *sigma) {
  const double lt = *lambda*(*t);
  const double st2 = pow(*sigma*(*t), 2.0);
  const double amplitude = (2.0/3.0)*(1.0 - lt - st2);
  return (1.0/3.0) + amplitude * exp(-lt - 0.5*st2);
}
/**
 * Returns 1/3 + 2/3 * (1 - x) * exp(-x / *beta) with x = ((*sigma) * (*t))^(*beta),
 * or 0 when *beta is below 1e-3.
 */
double skt(double *t, double *sigma, double *beta) {
  double value = 0.0;
  if (!(*beta < 1.0e-3)) {
    const double stb = pow(*sigma*(*t), (*beta));
    value = (1.0/3.0) + (2.0/3.0)*(1.0 - stb) * exp(-stb/(*beta));
  }
  return value;
}
/**
 * Returns 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q/rateT) * exp(-rateT) with
 * lamt2q = t^2 * lambda^2 * q, rateL = sqrt(|4*lambda^2*(1-q)*t/gamma|) and
 * rateT = sqrt(|4*lambda^2*(1-q)*t/gamma| + lamt2q); all arguments are read through pointers.
 */
double spg(double *t, double *lambda, double *gamma, double *q) {
  const double lambdaSq = (*lambda)*(*lambda);
  const double lamt2q = (*t)*(*t)*lambdaSq*(*q);
  const double absRate2 = fabs(4.0*lambdaSq*(1.0-*q)*(*t)/(*gamma));
  const double rateL = sqrt(absRate2);
  const double rateT = sqrt(absRate2+lamt2q);
  const double longitudinal = (1.0/3.0)*exp(-rateL);
  const double transverse = (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
  return longitudinal + transverse;
}
/**
 * Returns 1/6 * (1 - nut/2) * exp(-nut/2) + 1/3 * (1 - nut/4) * exp(-0.25 * (nut + 2.44949*lamt))
 * with nut = (*nu) * (*t) and lamt = (*lambda) * (*t).
 */
double rahf(double *t, double *nu, double *lambda) {
  const double nut = *nu*(*t);
  const double halfNut = *nu*(*t)/2.0;
  const double lamt = *lambda*(*t);
  const double first = (1.0/6.0)*(1.0-halfNut)*exp(-halfNut);
  const double second = (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
  return first + second;
}
/** Returns cos(TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi)). */
double tf(double *t, double *phi, double *nu) {
  const double wt = TWO_PI * (*nu) * (*t);
  const double ph = DEG_TO_RAD * (*phi);
  const double angle = wt + ph;
  return cos(angle);
}
/**
 * Returns alpha * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 *       + (1 - alpha) * exp(-lambdaL*t); all arguments are read through pointers.
 */
double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
  const double angle = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
  const double oscillating = (*alpha) * cos(angle) * exp(-(*lambdaT) * (*t));
  const double relaxing = (1.0 - *alpha) * exp(-(*lambdaL) * (*t));
  return oscillating + relaxing;
}
/** Returns j0(TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi)). */
double b(double *t, double *phi, double *nu) {
  const double argument = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
  return j0(argument);
}
/**
 * Returns alpha * j0(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 *       + (1 - alpha) * exp(-lambdaL*t); all arguments are read through pointers.
 */
double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
  const double argument = TWO_PI * *nu * *t + DEG_TO_RAD * *phi;
  const double oscillating = (*alpha) * j0(argument) * exp(-(*lambdaT) * (*t));
  const double relaxing = (1.0 - *alpha) * exp(-(*lambdaL) * (*t));
  return oscillating + relaxing;
}
/** Returns exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)); arguments are read through pointers. */
double ab(double *t, double *sigma, double *gamma) {
  const double gt = (*gamma) * (*t);
  const double ratioSq = pow(*sigma/(*gamma),2.0);
  return exp(-ratioSq*(exp(-gt) - 1.0 + gt));
}
/**
 * Returns 1/3 + 2/3 * aa^1.5 * (1 - D0t2*aa) * exp(-0.5*D0t2*aa) with
 * D0t2 = ((*Delta0) * (*t))^2 and aa = 1 / (1 + (*Rb)^2 * D0t2).
 */
double snkzf(double *t, double *Delta0, double *Rb) {
  const double d0t2 = pow((*Delta0) * (*t), 2.0);
  const double aa = 1.0/(1.0+pow(*Rb,2.0)*d0t2);
  const double shape = (2.0/3.0)*pow(aa,1.5)*(1.0-d0t2*aa);
  return (1.0/3.0) + shape*exp(-0.5*d0t2*aa);
}
/**
 * Returns sqrt(aa) * exp(-0.5*D0t2*aa) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) with
 * D0t2 = ((*Delta0) * (*t))^2 and aa = 1 / (1 + (*Rb)^2 * D0t2).
 */
double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
  const double angle = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
  const double d0t2 = pow((*Delta0) * (*t), 2.0);
  const double aa = 1.0/(1.0+pow(*Rb,2.0)*d0t2);
  const double envelope = sqrt(aa)*exp(-0.5*d0t2*aa);
  return envelope*cos(angle);
}
/**
 * Returns sqrt(aa) * exp(-2 * Delta0^2 * theta * aa) with
 * theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and aa = 1 / (1 + 4*(Rb*Delta0)^2 * theta);
 * all arguments are read through pointers.
 */
double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
  const double d0 = *Delta0;
  const double nuct = (*nuc) * (*t);
  // NOTE(review): ab() in this file uses "exp(-gt) - 1.0 + gt"; here the linear
  // term is subtracted - confirm the sign against the reference implementation.
  const double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
  const double aa = 1.0/(1.0+4.0*pow((*Rb) * d0,2.0)*theta);
  const double damping = exp(-2.0 * d0 * d0 * theta * aa);
  return sqrt(aa)*damping;
}
/**
 * Returns sqrt(aa) * exp(-Delta0^2 * theta * aa) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) with
 * theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and aa = 1 / (1 + 2*(Rb*Delta0)^2 * theta);
 * all arguments are read through pointers.
 */
double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
  const double d0 = *Delta0;
  const double angle = TWO_PI * (*nu) * (*t) + DEG_TO_RAD * (*phi);
  const double nuct = (*nuc) * (*t);
  // NOTE(review): ab() in this file uses "exp(-gt) - 1.0 + gt"; here the linear
  // term is subtracted - confirm the sign against the reference implementation.
  const double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0);
  const double aa = 1.0/(1.0+2.0*pow((*Rb) * d0,2.0)*theta);
  const double envelope = sqrt(aa)*exp(-d0 * d0 * theta * aa);
  return envelope*cos(angle);
}
/**
 * Evaluate the signed sum of a list of bound functions.
 * Each entry is (sign, function): sign 1 subtracts the function value, any
 * other sign adds it. The list is taken by const reference and iterated by
 * reference, so neither the vector nor the std::function objects are copied;
 * cpuChiSq calls this once per data point.
 * @param func list of (sign, function) pairs
 * @return the accumulated value, 0.0 for an empty list
 */
double evalf(const std::vector< std::pair<int, doubleF> > &func) {
  double result = 0.0;
  for (const auto &term : func) {
    if (term.first == 1)
      result -= term.second();
    else
      result += term.second();
  }
  return result;
}
/**
 * CPU reference chi square for a generated function list.
 * For every bin i, *t is set to (initial *t) + i*dt before the functions are
 * evaluated through evalf; the model is N0 * exp(-t/TAU) * (1 + evalf(func)) + BKG.
 * data[i] is used both as the measured value and as its error.
 * On return *t holds the time of the last evaluated bin.
 * @return the accumulated chi square over ndata bins
 */
double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
  const double tStart = *t;
  double chisq = 0.0;

  int bin = 0;
  while (bin < ndata) {
    //the bound functions read the time through this pointer
    *t = tStart + bin*dt;

    const double value = data[bin];
    const double error = data[bin];
    const double theory = evalf(func);
    const double model = N0 * exp(-(*t)/TAU) * (1.0 + theory) + BKG;

    //fall back to model^2 when the error is exactly zero
    chisq += (error != 0.0) ? ( (model - value) * (model - value) ) / (error*error)
                            : model * model;
    ++bin;
  }

  return chisq;
}
//fill the first np entries of p with rand()/RAND_MAX
void randomParams(double *p, int np) {
  double *const end = p + np;
  for (double *cur = p; cur < end; ++cur)
    *cur = (double)rand() / RAND_MAX;
}
//fill the first nm entries of m with rand() % max
void randomMaps(int *m, int nm, int max) {
  int idx = 0;
  while (idx < nm) {
    m[idx] = rand() % max;
    ++idx;
  }
}
/**
 * Build a random theory function both as a list of callables and as a string.
 * Between 1 and 25 of the 18 predefined functions are picked with rand(). Each is
 * bound to the time pointer t and to parameters addressed as p[m[id]], appended to
 * func together with its sign (0 = add, 1 = subtract), and the same call is written
 * in text form into sfunc using "p[m[id]]" arguments.
 * @param func receives one (sign, bound function) pair per generated function
 * @param sfunc receives the expression string matching func
 * @param t pointer to the time value the bound functions read when they are called
 * @param p parameter array, @param m map array of indexes into p
 * @param np not used in this function, @param nm number of entries in m
 * @return the number of functions generated
 */
int generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc,
double *t, double *p, int *m, int np, int nm)
{
//nf defines the number of functions to generate (from 1 to 25)
int nf = rand() % 25 + 1;
for (int n = 0; n < nf; n++) {
std::string sf = "";
doubleF f;
int r = rand() % 18; //choose random function to use
//five map indexes are always drawn, regardless of how many the chosen function uses
int id1 = rand() % nm;
int id2 = rand() % nm;
int id3 = rand() % nm;
int id4 = rand() % nm;
int id5 = rand() % nm;
//argument list tails for functions taking 1 to 5 parameters (closing bracket included)
std::string p1 = "p[m[" + to_string(id1) + "]])";
std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]])";
std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
//get a random index from maps and use it to get the parameter value, bind function and parameter
//values to f, and create string for gpu in sfunc
switch (r) {
case 0:
f = std::bind(se, t, &p[m[id1]]);
sf = "se(t," + p1;
break;
case 1:
f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
sf = "ge(t," + p2;
break;
case 2:
f = std::bind(sg, t, &p[m[id1]]);
sf = "sg(t, " + p1;
break;
case 3:
f = std::bind(stg, t, &p[m[id1]]);
sf = "stg(t, " + p1;
break;
case 4:
f = std::bind(sekt, t, &p[m[id1]]);
sf = "sekt(t, " + p1;
break;
case 5:
f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
sf = "lgkt(t, " + p2;
break;
case 6:
f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
sf = "skt(t, " + p2;
break;
case 7:
f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
sf = "spg(t, " + p3;
break;
case 8:
f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
sf = "rahf(t, " + p2;
break;
case 9:
f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
sf = "tf(t, " + p2;
break;
case 10:
f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "ifld(t, " + p5;
break;
case 11:
f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
sf = "b(t, " + p2;
break;
case 12:
f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "ib(t, " + p5;
break;
case 13:
f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
sf = "ab(t, " + p2;
break;
case 14:
f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
sf = "snkzf(t, " + p2;
break;
case 15:
f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
sf = "snktf(t, " + p4;
break;
case 16:
f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
sf = "dnkzf(t, " + p3;
break;
case 17:
f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "dnktf(t, " + p5;
break;
}
//random sign for every term except the first, which is always added
int sign = rand() % 2;
if (n == 0) sign = 0;
func.push_back( std::make_pair(sign, f) );
//the first term starts the string, later terms are joined with their sign
if (n == 0)
sfunc = sf;
else {
switch(sign) {
case 0: sfunc += " + " + sf; break;
case 1: sfunc += " - " + sf; break;
default: sfunc += " + " + sf; break;
}
}
}
return nf;
}
int main(int argc, char *argv[]) {
srand(time(NULL));
int ierr;
int Ndata = 1e6;
bool autotune = false;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; ++i) {
if (argv[i] == string("-cuda")) {
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-opencl")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-gpu");
}
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-cpu")) {
strcpy(api_name, "OpenCL");
strcpy(device_name, "-cpu");
}
if (argv[i] == string("-autotune")) {
autotune = true;
}
}
//create a random number of parameters
int np = ( rand() % (1000 - 50) ) + 50;
int nm = ( rand() % (50 - 5) ) + 5;
int nf = ( rand() % (50 - 5) ) + 5;
int *m = new int[nm];
double *p = new double[np];
double *f = new double[nf];
randomParams(p, np);
randomMaps(m, nm, np);
randomParams(f, nf);
double dt = 1e-10;
double t = 1e-10;
std::vector< std::pair<int, doubleF> > func;
std::string sfunc;
int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm);
//create DKS base object, set and init device / framework
DKSBaseMuSR dksbase;
dksbase.setAPI(api_name);
dksbase.setDevice(device_name);
dksbase.initDevice();
dksbase.initChiSquare(Ndata, np, nf, nm);
dksbase.writeParams(p, np);
dksbase.writeFunctions(f, nf);
dksbase.writeMaps(m, nm);
dksbase.callSetConsts(N0, TAU, BKG);
dksbase.callCompileProgram(sfunc);
if (autotune)
dksbase.setAutoTuningOn();
int oper = 0;
dksbase.getOperations(oper);
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of params: " << np << endl;
cout << "Number of maps: " << nm << endl;
cout << "Number of predefined functions: " << nfunc << endl;
cout << "Number of ptx instructions: " << oper << endl;
cout << "------------------------------------------------------------" << endl;
cout << sfunc << endl;
cout << "------------------------------------------------------------" << endl;
//allocate memory on host and device device
double *data = new double[Ndata];
randomParams(data, Ndata);
void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
dksbase.writeData<double>(data_ptr, data, Ndata);
for (int N = 1e5; N < Ndata + 1; N += 1e5) {
double result_dks, result_cpu;
t = 1e-10;
dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks);
result_cpu = cpuChiSq(data, func, N, &t, dt);
cout << "Npart: " << N << endl;
cout << "DKS: " << result_dks << endl;
cout << "CPU: " << result_cpu << endl;
}
dksbase.freeMemory<double>(data_ptr, Ndata);
dksbase.freeChiSquare();
delete[] data;
delete[] p;
delete[] f;
delete[] m;
return 0;
}

View File

@ -0,0 +1,618 @@
#include <iostream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <fstream>
#include <cstdio>
#include <stddef.h>
#include <fstream>
#include <math.h>
#include <time.h>
#include <getopt.h>
#include <unistd.h>
#include "DKSBaseMuSR.h"
#include "Utility/DKSTimer.h"
#include "Array1D.h"
#include "Array2D.h"
#include "Array3D.h"
#include "error_handlers.h"
#include "PCSet.h"
#include "fast_laplace.h"
#include "uqtktools.h"
#include "lreg.h"
#define PI 3.14159265358979323846
#define TWO_PI 6.283185307179586231996
#define DEG_TO_RAD 1.7453292519943295474371681e-2
//#define N0 0.25
#define N0 1e-10
#define TAU 2.197019
#define BKG 0.05
using namespace std;
typedef std::function<double()> doubleF;
/** Fill data[0..N-1] with rand()/RAND_MAX multiplied by scale. */
void randData(double *data, int N, int scale = 1) {
  int idx = 0;
  while (idx < N) {
    data[idx] = ((double)rand() / RAND_MAX ) * scale;
    ++idx;
  }
}
/** MusrFit predefined functions.
* Predefined functions from MusrFit that can be used to define the theory function.
* The first parameter of every function is always the time t; the remaining parameters
* depend on the function.
*/
/** Returns exp(-(*lamda) * (*t)). */
double se(double *t, double *lamda) {
  const double exponent = -(*lamda) * (*t);
  return exp(exponent);
}
//math func + math oper + memory loads
//1 + 1 + 2
/** Returns exp(-((*lamda) * (*t))^(*beta)). */
double ge(double *t, double *lamda, double *beta) {
  const double stretched = pow( (*lamda)*(*t), *beta);
  return exp( -stretched );
}
//2 + 1 + 3
/** Returns exp(-0.5 * ((*sigma) * (*t))^2). */
double sg(double *t, double *sigma) {
  const double sigmatSq = pow((*sigma)*(*t), 2);
  return exp( -0.5 * sigmatSq );
}
//2 + 2 + 2
/** Returns 1/3 + 2/3 * (1 - x) * exp(-0.5 * x) with x = ((*sigma) * (*t))^2. */
double stg(double *t, double *sigma) {
  const double x2 = pow((*sigma)*(*t),2);
  const double damping = exp(-0.5 * x2);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x2) * damping;
}
/** Returns 1/3 + 2/3 * (1 - x) * exp(-x) with x = (*lambda) * (*t). */
double sekt(double *t, double *lambda) {
  const double lt = *lambda*(*t);
  const double damping = exp(-lt);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - lt) * damping;
}
/** Evaluates 1/3 + 2/3 * (1 - lambda*t - (sigma*t)^2) * exp(-lambda*t - 0.5*(sigma*t)^2).
 * @param t      pointer to the time value
 * @param lambda pointer to the exponential rate
 * @param sigma  pointer to the Gaussian width
 */
double lgkt(double *t, double *lambda, double *sigma) {
  const double lt = *lambda * (*t);
  const double stsq = pow(*sigma * (*t), 2.0);
  const double weight = (2.0/3.0) * (1.0 - lt - stsq);
  const double damping = exp(-lt - 0.5 * stsq);
  return (1.0/3.0) + weight * damping;
}
/** Evaluates 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta).
 * Returns 0.0 when beta is below 1.0e-3, before the division by beta is reached.
 * @param t     pointer to the time value
 * @param sigma pointer to the width parameter
 * @param beta  pointer to the exponent
 */
double skt(double *t, double *sigma, double *beta) {
  const bool betaUsable = !(*beta < 1.0e-3);
  if (betaUsable) {
    const double stb = pow(*sigma * (*t), (*beta));
    const double weight = (2.0/3.0) * (1.0 - stb);
    return (1.0/3.0) + weight * exp(-stb / (*beta));
  }
  return 0.0;
}
/** Two-component relaxation built from a longitudinal and a transverse rate.
 * With rate2 = 4 * lambda^2 * (1 - q) * t / gamma and lamt2q = t^2 * lambda^2 * q:
 *   rateL = sqrt(|rate2|), rateT = sqrt(|rate2| + lamt2q)
 *   result = 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q / rateT) * exp(-rateT)
 * @param t      pointer to the time value
 * @param lambda pointer to the rate
 * @param gamma  pointer to the divisor of rate2 (a zero value divides by zero)
 * @param q      pointer to the mixing parameter
 */
double spg(double *t, double *lambda, double *gamma, double *q) {
  const double lambdaSq = (*lambda) * (*lambda);
  const double transverseTerm = (*t) * (*t) * lambdaSq * (*q);
  const double rateSq = 4.0 * lambdaSq * (1.0 - *q) * (*t) / (*gamma);
  const double absRateSq = fabs(rateSq);
  const double rateL = sqrt(absRateSq);
  const double rateT = sqrt(absRateSq + transverseTerm);
  const double longitudinal = (1.0/3.0) * exp(-rateL);
  const double transverse = (2.0/3.0) * (1.0 - transverseTerm / rateT) * exp(-rateT);
  return longitudinal + transverse;
}
/** Evaluates
 *   1/6 * (1 - nu*t/2) * exp(-nu*t/2)
 * + 1/3 * (1 - nu*t/4) * exp(-0.25 * (nu*t + 2.44949 * lambda*t)).
 * @param t      pointer to the time value
 * @param nu     pointer to the frequency-like parameter
 * @param lambda pointer to the rate
 */
double rahf(double *t, double *nu, double *lambda) {
  const double nut = *nu * (*t);
  const double halfNut = *nu * (*t) / 2.0;
  const double lamt = *lambda * (*t);
  const double first = (1.0/6.0) * (1.0 - halfNut) * exp(-halfNut);
  const double second = (1.0/3.0) * (1.0 - nut / 4.0) * exp(-0.25 * (nut + 2.44949 * lamt));
  return first + second;
}
/** Evaluates cos(TWO_PI * nu * t + DEG_TO_RAD * phi).
 * @param t   pointer to the time value
 * @param phi pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu  pointer to the frequency, scaled by TWO_PI before use
 */
double tf(double *t, double *phi, double *nu) {
  const double angle = TWO_PI * *nu * *t;
  const double phase = DEG_TO_RAD * *phi;
  return cos(angle + phase);
}
/** Evaluates
 *   alpha * cos(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 * + (1 - alpha) * exp(-lambdaL*t).
 * @param t       pointer to the time value
 * @param alpha   pointer to the weight of the oscillating term
 * @param phi     pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu      pointer to the frequency, scaled by TWO_PI before use
 * @param lambdaT pointer to the damping rate of the oscillating term
 * @param lambdaL pointer to the damping rate of the non-oscillating term
 */
double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
  const double angle = TWO_PI * *nu * *t;
  const double phase = DEG_TO_RAD * *phi;
  const double oscillating = *alpha * cos(angle + phase) * exp(-*lambdaT * *t);
  const double relaxing = (1.0 - *alpha) * exp(-*lambdaL * *t);
  return oscillating + relaxing;
}
/** Evaluates the Bessel function j0(TWO_PI * nu * t + DEG_TO_RAD * phi).
 * @param t   pointer to the time value
 * @param phi pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu  pointer to the frequency, scaled by TWO_PI before use
 */
double b(double *t, double *phi, double *nu) {
  const double argument = TWO_PI * *nu * *t + DEG_TO_RAD * *phi;
  return j0(argument);
}
/** Evaluates
 *   alpha * j0(TWO_PI*nu*t + DEG_TO_RAD*phi) * exp(-lambdaT*t)
 * + (1 - alpha) * exp(-lambdaL*t).
 * @param t       pointer to the time value
 * @param alpha   pointer to the weight of the Bessel term
 * @param phi     pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu      pointer to the frequency, scaled by TWO_PI before use
 * @param lambdaT pointer to the damping rate of the Bessel term
 * @param lambdaL pointer to the damping rate of the remaining term
 */
double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) {
  const double angle = TWO_PI * *nu * *t;
  const double phase = DEG_TO_RAD * *phi;
  const double besselPart = *alpha * j0(angle + phase) * exp(-*lambdaT * *t);
  const double relaxing = (1.0 - *alpha) * exp(-*lambdaL * *t);
  return besselPart + relaxing;
}
/** Evaluates exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)).
 * @param t     pointer to the time value
 * @param sigma pointer to the width parameter
 * @param gamma pointer to the rate; it is used as a divisor
 */
double ab(double *t, double *sigma, double *gamma) {
  const double gt = *gamma * *t;
  const double ratioSq = pow(*sigma / (*gamma), 2.0);
  const double shape = exp(-gt) - 1.0 + gt;
  return exp(-ratioSq * shape);
}
/** With D = (Delta0*t)^2 and a = 1 / (1 + Rb^2 * D), evaluates
 *   1/3 + 2/3 * a^1.5 * (1 - D*a) * exp(-0.5 * D * a).
 * @param t      pointer to the time value
 * @param Delta0 pointer to the width parameter
 * @param Rb     pointer to the ratio parameter
 */
double snkzf(double *t, double *Delta0, double *Rb) {
  const double dSq = pow(*Delta0 * *t, 2.0);
  const double scale = 1.0 / (1.0 + pow(*Rb, 2.0) * dSq);
  const double weight = (2.0/3.0) * pow(scale, 1.5) * (1.0 - dSq * scale);
  return (1.0/3.0) + weight * exp(-0.5 * dSq * scale);
}
/** With D = (Delta0*t)^2 and a = 1 / (1 + Rb^2 * D), evaluates
 *   sqrt(a) * exp(-0.5 * D * a) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi).
 * @param t      pointer to the time value
 * @param phi    pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu     pointer to the frequency, scaled by TWO_PI before use
 * @param Delta0 pointer to the width parameter
 * @param Rb     pointer to the ratio parameter
 */
double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) {
  const double angle = TWO_PI * *nu * *t;
  const double phase = DEG_TO_RAD * *phi;
  const double dSq = pow(*Delta0 * *t, 2.0);
  const double scale = 1.0 / (1.0 + pow(*Rb, 2.0) * dSq);
  const double envelope = sqrt(scale) * exp(-0.5 * dSq * scale);
  return envelope * cos(angle + phase);
}
/** With theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and
 * a = 1 / (1 + 4 * (Rb*Delta0)^2 * theta), evaluates
 *   sqrt(a) * exp(-2 * Delta0^2 * theta * a).
 * @param t      pointer to the time value
 * @param Delta0 pointer to the width parameter
 * @param Rb     pointer to the ratio parameter
 * @param nuc    pointer to the rate; its square is used as a divisor
 */
double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) {
  const double nuct = *nuc * *t;
  const double theta = (exp(-nuct) - 1.0 - nuct) / pow(*nuc, 2.0);
  const double scale = 1.0 / (1.0 + 4.0 * pow(*Rb * *Delta0, 2.0) * theta);
  const double exponent = -2.0 * *Delta0 * *Delta0 * theta * scale;
  return sqrt(scale) * exp(exponent);
}
/** With theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2 and
 * a = 1 / (1 + 2 * (Rb*Delta0)^2 * theta), evaluates
 *   sqrt(a) * exp(-Delta0^2 * theta * a) * cos(TWO_PI*nu*t + DEG_TO_RAD*phi).
 * @param t      pointer to the time value
 * @param phi    pointer to the phase, scaled by DEG_TO_RAD before use
 * @param nu     pointer to the frequency, scaled by TWO_PI before use
 * @param Delta0 pointer to the width parameter
 * @param Rb     pointer to the ratio parameter
 * @param nuc    pointer to the rate; its square is used as a divisor
 */
double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) {
  const double angle = TWO_PI * *nu * *t;
  const double phase = DEG_TO_RAD * *phi;
  const double nuct = *nuc * *t;
  const double theta = (exp(-nuct) - 1.0 - nuct) / pow(*nuc, 2.0);
  const double scale = 1.0 / (1.0 + 2.0 * pow(*Rb * *Delta0, 2.0) * theta);
  const double envelope = sqrt(scale) * exp(-*Delta0 * *Delta0 * theta * scale);
  return envelope * cos(angle + phase);
}
/** Evaluate a signed sum of bound theory functions.
 * Each entry of func is a (sign, callable) pair: sign 1 subtracts the value
 * returned by the callable, every other sign value adds it.
 *
 * The list is taken by const reference and iterated by reference, so neither
 * the vector nor the std::function objects are copied; this function is called
 * once per data point from cpuChiSq().
 *
 * @param func list of (sign, callable) terms
 * @return the accumulated sum, 0.0 for an empty list
 */
double evalf(const std::vector< std::pair<int, doubleF> > &func) {
  double result = 0.0;
  for (const auto &term : func) {
    if (term.first == 1)
      result -= term.second();
    else
      result += term.second();
  }
  return result;
}
/** Host reference implementation of the chi^2 sum.
 * For every sample i the time pointed to by t is set to ts + i*dt (ts being the
 * value of *t on entry), the theory value
 *   theo = N0 * exp(-t/TAU) * (1 + evalf(func)) + BKG
 * is computed and ((theo - d)^2) / e^2 is accumulated, or theo^2 when e is zero.
 * The data value is also used as its own error (e = d); main() launches the
 * device kernel with the same buffer passed for both data and error.
 *
 * The callables in func read the time through the pointer t, which is why the
 * time is written through the pointer instead of being kept in a local variable.
 * On return *t is restored to its entry value, so that the caller can reuse the
 * same start time for further calls.
 *
 * @param data  array of ndata measured values
 * @param func  signed list of bound theory functions, see evalf()
 * @param ndata number of samples
 * @param t     in: start time; used as scratch during the loop, restored on return
 * @param dt    time step between consecutive samples
 * @return the accumulated chi^2 value
 */
double cpuChiSq(double *data, std::vector< std::pair<int, doubleF> > &func, int ndata, double *t, double dt) {
  double result = 0.0;
  const double ts = *t;
  for (int i = 0; i < ndata; i++) {
    *t = ts + i*dt;
    double d = data[i];
    double e = data[i];
    double vf = evalf(func);
    double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG;
    if (e != 0.0)
      result += ( (theo - d) * (theo - d) ) / (e * e);
    else
      result += theo * theo;
  }
  //give the caller its start time back, *t was only used as scratch storage
  *t = ts;
  return result;
}
/** Fill p[0..np-1] with pseudo-random values rand() / RAND_MAX, i.e. in [0, 1].
 * The array is supplied by the caller; nothing is allocated here.
 */
void randomParams(double *p, int np) {
  double *const end = p + np;
  for (double *cur = p; cur < end; ++cur)
    *cur = (double)rand() / RAND_MAX;
}
/** Fill m[0..nm-1] with pseudo-random indexes rand() % max, i.e. in [0, max - 1].
 * The array is supplied by the caller; max must not be zero.
 */
void randomMaps(int *m, int nm, int max) {
  int slot = 0;
  while (slot < nm) {
    m[slot] = rand() % max;
    ++slot;
  }
}
/** Build a random theory function in two equivalent forms.
 * Appends nfunc terms to func, each a (sign, callable) pair as consumed by evalf(),
 * and writes the matching source expression into sfunc, which main() passes to
 * callCompileProgram(). Every term is one of the 18 predefined functions above,
 * bound to the time pointer t and to parameters selected through the map m.
 *
 * The callables hold pointers into p and to t, so they read the values stored
 * there at the moment they are invoked; p, m and t must outlive func.
 *
 * @param func  in/out: terms are appended, existing entries are kept
 * @param sfunc out: overwritten with the textual form of the generated terms
 * @param t     pointer to the time variable shared by all terms
 * @param p     parameter array
 * @param m     map array, each entry is an index into p
 * @param np    size of p (not used by this function)
 * @param nm    size of m, upper bound for the randomly chosen map indexes
 * @param nfunc number of terms to generate
 */
void generateRandomFunction(std::vector< std::pair<int, doubleF> > &func, std::string &sfunc,
double *t, double *p, int *m, int np, int nm, int nfunc)
{
for (int n = 0; n < nfunc; n++) {
std::string sf = "";
doubleF f;
int r = rand() % 18; //randomly choose one of the predefined functions to use
//five map indexes are always drawn, even when the chosen function needs fewer
int id1 = rand() % nm; //randomly select parameters to use in the function
int id2 = rand() % nm;
int id3 = rand() % nm;
int id4 = rand() % nm;
int id5 = rand() % nm;
//argument list tails for functions taking 1 to 5 parameters after t
std::string p1 = "p[m[" + to_string(id1) + "]])";
std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])";
std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]])";
std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]], p[m[" + to_string(id4) + "]])";
std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" +
to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])";
//get a random index from maps and use it to get the parameter value, bind function and parameter
//values to f, and create string for gpu in sfunc
switch (r) {
case 0:
f = std::bind(se, t, &p[m[id1]]);
sf = "se(t," + p1;
break;
case 1:
f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]);
sf = "ge(t," + p2;
break;
case 2:
f = std::bind(sg, t, &p[m[id1]]);
sf = "sg(t, " + p1;
break;
case 3:
f = std::bind(stg, t, &p[m[id1]]);
sf = "stg(t, " + p1;
break;
case 4:
f = std::bind(sekt, t, &p[m[id1]]);
sf = "sekt(t, " + p1;
break;
case 5:
f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]);
sf = "lgkt(t, " + p2;
break;
case 6:
f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]);
sf = "skt(t, " + p2;
break;
case 7:
f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
sf = "spg(t, " + p3;
break;
case 8:
f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]);
sf = "rahf(t, " + p2;
break;
case 9:
f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]);
sf = "tf(t, " + p2;
break;
case 10:
f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "ifld(t, " + p5;
break;
case 11:
f = std::bind(b, t, &p[m[id1]], &p[m[id2]]);
sf = "b(t, " + p2;
break;
case 12:
f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "ib(t, " + p5;
break;
case 13:
f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]);
sf = "ab(t, " + p2;
break;
case 14:
f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]);
sf = "snkzf(t, " + p2;
break;
case 15:
f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]);
sf = "snktf(t, " + p4;
break;
case 16:
f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]);
sf = "dnkzf(t, " + p3;
break;
case 17:
f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]);
sf = "dnktf(t, " + p5;
break;
}
//sign 0 adds the term, sign 1 subtracts it; the first term is always added so that
//the textual form does not start with an operator
int sign = rand() % 2;
if (n == 0) sign = 0;
func.push_back( std::make_pair(sign, f) );
if (n == 0)
sfunc = sf;
else {
switch(sign) {
case 0: sfunc += " + " + sf; break;
case 1: sfunc += " - " + sf; break;
default: sfunc += " + " + sf; break;
}
}
}
}
/** Collects auto-tuning data for the chi^2 kernel and fits a PCE regression to it.
 *
 * Command line flags:
 *   -cuda | -opencl | -mic | -cpu   select the DKS API / device (default: Cuda, -gpu)
 *   -autotune                       switch DKS auto-tuning on
 *   -eval, -test                    run the random kernel loop instead of reading
 *                                   the x/y data from file; -eval additionally uses
 *                                   the collected data as regression input and
 *                                   writes it to xdata_pce.dat / ydata_pce.dat
 *   -nord <n>                       order of the initial basis (default 15)
 *   -loop <n>                       number of samples / loop iterations (default 100)
 *
 * Output files: xdata.dat, ydata.dat and ypc_data.dat.
 */
int main(int argc, char *argv[]) {

  srand(time(NULL));

  bool autotune = false;
  bool eval = false;
  bool test = false;

  //the longest name stored below is "OpenCL", which fits into 10 chars
  char *api_name = new char[10];
  char *device_name = new char[10];
  strcpy(api_name, "Cuda");
  strcpy(device_name, "-gpu");

  int nord = 15; //the order of the initial, overcomplete basis
  int loop = 100;

  for (int i = 1; i < argc; ++i) {

    if (argv[i] == string("-cuda")) {
      strcpy(api_name, "Cuda");
      strcpy(device_name, "-gpu");
    }

    if (argv[i] == string("-opencl")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-gpu");
    }

    if (argv[i] == string("-mic")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-mic");
    }

    if (argv[i] == string("-cpu")) {
      strcpy(api_name, "OpenCL");
      strcpy(device_name, "-cpu");
    }

    if (argv[i] == string("-autotune")) {
      autotune = true;
    }

    if (argv[i] == string("-eval"))
      eval = true;

    if (argv[i] == string("-test"))
      test = true;

    //options with a value: only read argv[i+1] when it exists, argv[argc] is a
    //null pointer and must not be handed to atoi
    if (argv[i] == string("-nord") && i + 1 < argc)
      nord = atoi(argv[i+1]);

    if (argv[i] == string("-loop") && i + 1 < argc)
      loop = atoi(argv[i+1]);
  }

  //init dks and set chi^2 constants
  DKSBaseMuSR dksbase;
  dksbase.setAPI(api_name);
  dksbase.setDevice(device_name);
  dksbase.initDevice();

  if (autotune)
    dksbase.setAutoTuningOn();

  int nydim = 2; //the dimensionality of input
  int nxdim = 5;

  //UQTk arrays
  Array2D<double> xdata(loop, nxdim, 0.0);
  Array2D<double> ydata(loop, nydim, 0.0);

  Array2D<double> xdata_pce(loop, nxdim, 0.0);
  Array2D<double> ydata_pce(loop, nydim, 0.0);

  int size = 10000;
  Array2D<double> xtmp(size, nxdim, 0.0);
  Array2D<double> ytmp(size, nydim, 0.0);

  if (eval || test) {
    for (int l = 0; l < loop; l++) {

      int ierr;

      //create a random number of parameters
      int n = rand() % 9 + 1;
      int Ndata = n * 100000; //number of data points, 100k to 900k in steps of 100k

      int np = ( rand() % (1000 - 50) ) + 50; //50 to 999, for different shared memory needs
      int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 49 of the parameters, for different memory access
      int nf = ( rand() % (50 - 5) ) + 5; //5 to 49, not used in the test case, but changes the shared memory
      int nfunc = (rand() % (10 - 1) ) + 1; //1 to 9 user defined functions

      //allocate storage for parameters, maps and functions
      int *m = new int[nm];
      double *p = new double[np];
      double *f = new double[nf];

      //fill with random numbers
      randomParams(p, np);
      randomMaps(m, nm, np);
      randomParams(f, nf);

      //create a random user function that can be passed to GPU kernel and evaluated on the host
      double dt = 1e-10;
      double t = 1e-10;
      std::vector< std::pair<int, doubleF> > func;
      std::string sfunc;
      generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc);

      //create a data array and fill with random values
      double *data = new double[Ndata];
      randomParams(data, Ndata);

      //allocate device memory for the data and transfer to the GPU
      void *data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);
      dksbase.writeData<double>(data_ptr, data, Ndata);

      //init chi^2
      dksbase.initChiSquare(Ndata, np, nf, nm);
      dksbase.callSetConsts(N0, TAU, BKG);

      //write params to the device
      dksbase.writeParams(p, np);
      dksbase.writeFunctions(f, nf);
      dksbase.writeMaps(m, nm);

      //compile the kernel with the new function
      dksbase.callCompileProgram(sfunc);

      //run the kernel on the GPU and evaluate the function on the host
      double result_dks, result_cpu, tmp_result;
      ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm,
                                         t, dt, result_dks);

      if (ierr == DKS_SUCCESS) {
        result_cpu = cpuChiSq(data, func, Ndata, &t, dt);

        std::vector<int> config;
        dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm,
                                        t, dt, tmp_result, config);

        cout << "DKS: " << result_dks << endl;
        cout << "CPU: " << result_cpu << endl;
        cout << "Launch parameters: " << config[0] << ", " << config[1] << endl;

        cout << sfunc << endl;
        cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;

        //kernel description is the regression input, launch parameters the output
        xdata(l,0) = np;
        xdata(l,1) = nm;
        xdata(l,2) = nf;
        xdata(l,3) = nfunc;
        xdata(l,4) = Ndata;

        ydata(l,0) = config[0];
        ydata(l,1) = config[1];

        std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl;
      } else {
        cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl;
        cout << sfunc << endl;
      }

      //free temporary resources
      delete[] m;
      delete[] p;
      delete[] f;
      delete[] data;
      dksbase.freeChiSquare();
      dksbase.freeMemory<double>(data_ptr, Ndata);
    }
  } else {
    //read_datafileVS(xdata, "xdata.dat");
    //read_datafileVS(ydata, "ydata.dat");
    //NOTE(review): the evaluation data is read from the *_pce.dat files although the
    //disabled lines above refer to xdata.dat / ydata.dat - confirm this is intended
    xtmp.SetValue(0.0);
    ytmp.SetValue(0.0);
    read_datafileVS(xtmp, "xdata_pce.dat");
    read_datafileVS(ytmp, "ydata_pce.dat");

    for (int i = 0; i < loop; i++) {
      for (int j = 0; j < nxdim; j++)
        xdata(i,j) = xtmp(i,j);

      for (int j = 0; j < nydim; j++)
        ydata(i,j) = ytmp(i,j);
    }
  }

  if (eval) {
    //use the freshly collected data to build the regression; x has nxdim columns
    //and y only nydim columns, so each array is copied with its own column count
    for (int j = 0; j < loop; j++) {
      for (int i = 0; i < nxdim; i++)
        xdata_pce(j,i) = xdata(j,i);

      for (int i = 0; i < nydim; i++)
        ydata_pce(j,i) = ydata(j,i);
    }
  } else {
    //read_datafileVS(xdata_pce, "xdata_pce.dat");
    //read_datafileVS(ydata_pce, "ydata_pce.dat");
    xtmp.SetValue(0.0);
    ytmp.SetValue(0.0);
    read_datafileVS(xtmp, "xdata_pce.dat");
    read_datafileVS(ytmp, "ydata_pce.dat");

    for (int i = 0; i < loop; i++) {
      for (int j = 0; j < nxdim; j++)
        xdata_pce(i,j) = xtmp(i,j);

      for (int j = 0; j < nydim; j++)
        ydata_pce(i,j) = ytmp(i,j);
    }
    std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl;
  }

  //default input settings
  string which_chaos="LU"; //PC type
  string msc="m";

  Lreg* reg;
  reg = new PCreg(which_chaos,nord,nxdim);
  int nbas = reg->GetNbas();

  Array2D<double> ypc_data(xdata.XSize(), nydim, 0.0);

  //one regression per output dimension
  for (int i = 0; i < nydim; i++) {
    std::cout << "start dim " << i+1 << std::endl;
    Array1D<double> ydata_1d(xdata_pce.XSize(), 0.0);
    for (unsigned int j = 0; j < xdata_pce.XSize(); j++)
      ydata_1d(j) = ydata_pce(j,i);

    std::cout << "setup data" << std::endl;
    reg->SetupData(xdata_pce,ydata_1d);

    std::cout << "Comput best lambda" << std::endl;
    double lambda=reg->LSQ_computeBestLambda();
    Array1D<double> lam(nbas,lambda);

    reg->SetWeights(lam);

    std::cout << "LSQ build regr" << std::endl;
    reg->LSQ_BuildRegr();
    std::cout << std::endl << "Lambda : " << lambda << std::endl;

    Array1D<double> ypc;
    Array1D<double> ycheck;
    Array2D<double> ycheck_cov;
    reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov);
    std::cout << std::endl << "Eval" << std::endl;
    for (unsigned int j = 0; j < xdata.XSize(); j++)
      ypc_data(j,i) = ypc(j);
  }

  if (eval) {
    write_datafile(xdata_pce, "xdata_pce.dat");
    write_datafile(ydata_pce, "ydata_pce.dat");
  }
  write_datafile(xdata, "xdata.dat");
  write_datafile(ydata, "ydata.dat");
  write_datafile(ypc_data, "ypc_data.dat");

  //release the name buffers allocated at the top
  delete[] api_name;
  delete[] device_name;

  return 0;
}

View File

@ -0,0 +1,22 @@
#include <iostream>
#include "DKSBaseMuSR.h"
/** No accelerator device is used, this test is used to confirm, that search functions
* used for auto-tuning work properly
*/
int main() {
DKSBaseMuSR base;
std::cout << "Start test" << std::endl;
base.testAutoTuning();
std::cout << "Test finished" << std::endl;
return 0;
}

4
cmake/DKSConfig.cmake.in Normal file
View File

@ -0,0 +1,4 @@
# Settings exported to projects that use the installed library.
# NOTE(review): this is a .in template, presumably expanded by configure_file(),
# so the variable references below are resolved when the project is configured.
set(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}")
# Header and library locations below the install prefix.
set(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
set(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib")
# Name of the library to link against.
set(${PROJECT_NAME}_LIBRARY "dks")

View File

@ -0,0 +1,139 @@
#.rst:
# FindOpenCL
# ----------
#
# Try to find OpenCL
#
# Once done this will define::
#
# OpenCL_FOUND - True if OpenCL was found
# OpenCL_INCLUDE_DIRS - include directories for OpenCL
# OpenCL_LIBRARIES - copy of OpenCL_LIBRARY (the directory that contains libOpenCL.so)
# OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2)
# OpenCL_VERSION_MAJOR - The major version of the OpenCL implementation
# OpenCL_VERSION_MINOR - The minor version of the OpenCL implementation
#
# The module will also define two cache variables::
#
# OpenCL_INCLUDE_DIR - the OpenCL include directory
# OpenCL_LIBRARY - the directory that contains the OpenCL library (libOpenCL.so)
#
#=============================================================================
# Copyright 2014 Matthaeus G. Chajdas
#
# Distributed under the OSI-approved BSD License (the "License");
# see accompanying file Copyright.txt for details.
#
# This software is distributed WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the License for more information.
#=============================================================================
# (To distribute this file outside of CMake, substitute the full
# License text for the above reference.)
# Determine the highest OpenCL version announced by the OpenCL header.
#
# The versions 2.0, 1.2, 1.1 and 1.0 are probed in that order by checking for the
# symbol CL_VERSION_<major>_<minor> in cl.h below OpenCL_INCLUDE_DIR. For the first
# symbol that is found the following variables are set in the calling scope:
#   OpenCL_VERSION_STRING - "<major>.<minor>"
#   OpenCL_VERSION_MAJOR  - "<major>"
#   OpenCL_VERSION_MINOR  - "<minor>"
# Nothing is set when none of the symbols exists. Each probe result is stored by
# check_symbol_exists() in OPENCL_VERSION_<major>_<minor>.
function(_FIND_OPENCL_VERSION)
  include(CheckSymbolExists)
  include(CMakePushCheckState)
  set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY})

  cmake_push_check_state()
  foreach(_cl_tag "2_0" "1_2" "1_1" "1_0")
    set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}")

    if(APPLE)
      # prefer the header from the Framework
      set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/Headers/cl.h")
      if(EXISTS "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
        set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h")
      endif()

      check_symbol_exists(
        CL_VERSION_${_cl_tag}
        ${OSX_OpenCL_HEADER}
        OPENCL_VERSION_${_cl_tag})
    else()
      check_symbol_exists(
        CL_VERSION_${_cl_tag}
        "${OpenCL_INCLUDE_DIR}/CL/cl.h"
        OPENCL_VERSION_${_cl_tag})
    endif()

    if(OPENCL_VERSION_${_cl_tag})
      # "1_2" -> "1.2", then split into the numeric components
      string(REPLACE "_" "." _cl_dotted "${_cl_tag}")
      string(REGEX MATCHALL "[0-9]+" _cl_parts "${_cl_dotted}")
      list(GET _cl_parts 0 _cl_major)
      list(GET _cl_parts 1 _cl_minor)

      set(OpenCL_VERSION_STRING ${_cl_dotted} PARENT_SCOPE)
      set(OpenCL_VERSION_MAJOR ${_cl_major} PARENT_SCOPE)
      set(OpenCL_VERSION_MINOR ${_cl_minor} PARENT_SCOPE)

      # versions are probed from newest to oldest, the first hit is the answer
      break()
    endif()
  endforeach()
  cmake_pop_check_state()
endfunction()
# Locate the directory that contains CL/cl.h or OpenCL/cl.h. Besides the default
# search locations, the SDK roots named by the environment variables below are
# searched, combined with the listed path suffixes.
find_path(OpenCL_INCLUDE_DIR
NAMES
CL/cl.h OpenCL/cl.h
PATHS
ENV "PROGRAMFILES(X86)"
ENV AMDAPPSDKROOT
ENV INTELOCLSDKROOT
ENV NVSDKCOMPUTE_ROOT
ENV CUDA_PATH
ENV ATISTREAMSDKROOT
PATH_SUFFIXES
include
OpenCL/common/inc
"AMD APP/include")
# Derive OpenCL_VERSION_STRING / _MAJOR / _MINOR from the header found above.
_FIND_OPENCL_VERSION()
# Locate the library. find_path() is used, so OpenCL_LIBRARY receives the
# DIRECTORY that contains libOpenCL.so, not the path of the library file.
# The two branches differ only in the path suffixes (32 bit vs. 64 bit); no
# search is done for any other pointer size.
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
find_path(OpenCL_LIBRARY
NAMES libOpenCL.so
PATHS
ENV "PROGRAMFILES(X86)"
ENV AMDAPPSDKROOT
ENV INTELOCLSDKROOT
ENV CUDA_PATH
ENV NVSDKCOMPUTE_ROOT
ENV ATISTREAMSDKROOT
PATH_SUFFIXES
"AMD APP/lib/x86"
lib/x86
lib/Win32
OpenCL/common/lib/Win32)
elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
find_path(OpenCL_LIBRARY
NAMES libOpenCL.so
PATHS
ENV "PROGRAMFILES(X86)"
ENV AMDAPPSDKROOT
ENV INTELOCLSDKROOT
ENV CUDA_PATH
ENV NVSDKCOMPUTE_ROOT
ENV ATISTREAMSDKROOT
PATH_SUFFIXES
"AMD APP/lib/x86_64"
lib/x86_64
lib/x64
OpenCL/common/lib/x64)
endif()
# Plural result variables, copies of the two cache entries.
set(OpenCL_LIBRARIES ${OpenCL_LIBRARY})
set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# Sets OpenCL_FOUND when both required variables are set and reports the
# detected version. The call uses the FOUND_VAR keyword, which old CMake
# releases (e.g. the one shipped with Ubuntu 12.04 / Travis CI) do not support.
find_package_handle_standard_args(
OpenCL FOUND_VAR OpenCL_FOUND
REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
VERSION_VAR OpenCL_VERSION_STRING)
# Hide the two cache entries from the default cache view.
mark_as_advanced(
OpenCL_INCLUDE_DIR
OpenCL_LIBRARY)

BIN
doc/refman.pdf Normal file

Binary file not shown.

97
run_tuning_tests.sh Executable file
View File

@ -0,0 +1,97 @@
#!/bin/bash
# Exports MIC_* environment variables and runs ./testFFT3DRC 256 256 256 for a
# series of thread configurations. Every exported value is echoed. The runs listed
# in comments are disabled; only the final scaling loop starts the test binary.

# Export variable $1 with value $2 and print the value.
export_and_show() {
    export "$1=$2"
    echo "$2"
}

export_and_show MIC_ENV_PREFIX MIC
export_and_show MIC_OMP_NUM_THREADS 236
export_and_show MIC_KMP_PLACE_THREADS 59c4t0o
export_and_show MIC_USE_2MB_BUFFERS 64K
export_and_show MIC_KMP_AFFINITY scatter

# disabled: ./testFFT3DRC 256 256 256

echo 'real strides divisible by 4 but not by 8'
# disabled: ./testFFT3DRC 257 244 268
# disabled: ./testFFT3DRC 244 268 257
# disabled: ./testFFT3DRC 268 257 244
# disabled: ./testFFT3DRC 257 268 244
# disabled: ./testFFT3DRC 244 257 268
# disabled: ./testFFT3DRC 268 244 257

echo 'real strides divisible by 8 but not by 16'
# disabled: ./testFFT3DRC 257 248 263
# disabled: ./testFFT3DRC 248 263 257
# disabled: ./testFFT3DRC 263 257 248
# disabled: ./testFFT3DRC 257 263 248
# disabled: ./testFFT3DRC 248 257 263
# disabled: ./testFFT3DRC 263 248 257

echo 'complex strides divisible by 4 but not by 8'
# disabled: ./testFFT3DRC 257 246 268
# disabled: ./testFFT3DRC 246 268 257
# disabled: ./testFFT3DRC 268 257 246
# disabled: ./testFFT3DRC 257 268 246
# disabled: ./testFFT3DRC 246 257 268
# disabled: ./testFFT3DRC 268 246 257

echo 'complex strides divisible by 8 but not by 16'
# disabled: ./testFFT3DRC 257 206 317
# disabled: ./testFFT3DRC 206 317 257
# disabled: ./testFFT3DRC 317 257 206
# disabled: ./testFFT3DRC 257 317 206
# disabled: ./testFFT3DRC 206 257 317
# disabled: ./testFFT3DRC 317 206 257

echo 'perform scaling tests'

# One core with 1 to 4 threads; only the environment is changed here.
for threads in 1 2 3 4; do
    export_and_show MIC_OMP_NUM_THREADS "${threads}"
    export_and_show MIC_KMP_PLACE_THREADS "1c${threads}t0o"
    # disabled: ./testFFT3DRC 256 256 256
done

# 2 to 59 cores with 4 threads per core; the test binary is run for each setting.
for cores in 2 4 8 16 32 59; do
    threads=$((cores * 4))
    echo "${threads}"
    export_and_show MIC_OMP_NUM_THREADS "${threads}"
    export_and_show MIC_KMP_PLACE_THREADS "${cores}c4t0o"
    ./testFFT3DRC 256 256 256
done

View File

@ -0,0 +1,14 @@
# Algorithm interface headers. This directory has no source files of its own,
# so _SRCS is left without a value.
set(_SRCS)

set(_HDRS
  ChiSquareRuntime.h
  ImageReconstruction.h
  CollimatorPhysics.h
  FFT.h
)

# add_sources / add_headers are project commands defined outside this file.
add_sources(${_SRCS})
add_headers(${_HDRS})

install(FILES ${_HDRS} DESTINATION include/Algorithms)

View File

@ -0,0 +1,158 @@
#ifndef H_CHISQUARE_RUNTIME
#define H_CHISQUARE_RUNTIME
#include <iostream>
#include <string>
#include <sstream>
#include "../DKSDefinitions.h"
#define BLOCK_SIZE 128
#define FITTYPE_UNDEFINED 0
#define FITTYPE_SINGLE_HISTO 1
#define FITTYPE_ASYMMETRY 2
#define FITTYPE_MU_MINUS 3
class DKSBaseMuSR;
/** Interface for run-time compiled chi^2 kernels.
 * Holds the fit constants and launch parameters shared by all implementations and
 * declares the operations an implementation has to provide. DKSBaseMuSR is a
 * friend and therefore has access to the protected members.
 */
class ChiSquareRuntime {
  friend class DKSBaseMuSR;

protected:

  // single histo fit parameter
  double N0_m;
  double tau_m;
  double bkg_m;

  // asymmetry fit parameter
  double alpha_m;
  double beta_m;

  bool initDone_m;

  // opaque memory handles managed by the implementation
  void *mem_chisq_m;
  void *mem_param_m;
  void *mem_func_m;
  void *mem_map_m;

  // launch configuration, see setKernelParams()
  int numBlocks_m;
  int blockSize_m;

  // text of the compiled program that is scanned by getOperations();
  // stays a null pointer until an implementation provides it
  char *ptx_m;

  void setN0(double value) {
    N0_m = value;
  }

  void setTau(double value) {
    tau_m = value;
  }

  void setBKG(double value) {
    bkg_m = value;
  }

  void setAlpha(double value) {
    alpha_m = value;
  }

  void setBeta(double value) {
    beta_m = value;
  }

public:

  /** Default constructor.
   * Puts every member into a defined state: numeric values are zero, handles and
   * ptx_m are null pointers and initDone_m is false. Implementations assign
   * their own values in their constructors, which run after this one.
   */
  ChiSquareRuntime()
    : N0_m(0.0), tau_m(0.0), bkg_m(0.0),
      alpha_m(0.0), beta_m(0.0),
      initDone_m(false),
      mem_chisq_m(0), mem_param_m(0), mem_func_m(0), mem_map_m(0),
      numBlocks_m(0), blockSize_m(0),
      ptx_m(0)
  { }

  /** Default destructor */
  virtual ~ChiSquareRuntime() { };

  /** Compile the kernel program for the given theory function string.
   * mlh is an implementation defined switch - presumably it selects a maximum
   * likelihood variant, confirm with the implementations.
   */
  virtual int compileProgram(std::string function, bool mlh = false) = 0;

  /** Launch the chi^2 kernel and return the computed value in result. */
  virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
                              int numpar, int numfunc, int nummap,
                              double timeStart, double timeStep,
                              double &result) = 0;

  /** Write the parameter, function and map values to the device. */
  virtual int writeParams(const double *params, int numparams) = 0;
  virtual int writeFunc(const double *func, int numfunc) = 0;
  virtual int writeMap(const int *map, int nummap) = 0;

  /** Allocate / release the resources needed for the chi^2 calculation. */
  virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0;
  virtual int freeChiSquare() = 0;

  virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0;

  /** Set N0, tau and bkg values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
  int setConsts(double N0, double tau, double bkg) {
    setN0(N0);
    setTau(tau);
    setBKG(bkg);

    return DKS_SUCCESS;
  }

  /** Set alpha and beta values to use for the kernel.
   * If values changes between data sets this needs to be called before
   * every kernel call. Returns DKS_SUCCESS.
   */
  int setConsts(double alpha, double beta) {
    setAlpha(alpha);
    setBeta(beta);

    return DKS_SUCCESS;
  }

  /** Set number of blocks and threads.
   * Used to set parameters obtained from auto-tuning. A value is only stored when
   * it is positive. Returns DKS_SUCCESS when at least one of the two values was
   * stored and DKS_ERROR when both were rejected.
   */
  int setKernelParams(int numBlocks, int blockSize) {
    int ierr = DKS_ERROR;
    if (numBlocks > 0) {
      numBlocks_m = numBlocks;
      ierr = DKS_SUCCESS;
    }
    if (blockSize > 0) {
      blockSize_m = blockSize;
      ierr = DKS_SUCCESS;
    }
    return ierr;
  }

  /** Get the number of operations in compiled kernel.
   * Count the number of operation in the ptx file for the compiled program:
   * counting starts after the line that contains both "fTheory" and ".visible",
   * every following line that contains a ';' is counted, and counting stops at
   * the next line that contains ".visible".
   * Returns DKS_ERROR and sets oper to 0 when no program text is available
   * (ptx_m is a null pointer), DKS_SUCCESS otherwise.
   */
  int getOperations(int &oper) {

    //constructing a std::string from a null pointer is undefined behaviour
    if (!ptx_m) {
      oper = 0;
      return DKS_ERROR;
    }

    std::string ptx_str(ptx_m);
    std::istringstream is(ptx_str);

    std::string line;
    bool start = false;
    int count = 0;
    while(std::getline(is, line)) {

      //when fTheory start enable counting of operations
      size_t f1 = line.find("fTheory");
      size_t f2 = line.find(".visible");
      size_t f3 = line.find(";");
      if (f1 != std::string::npos && f2 != std::string::npos) {
        start = true;
        continue;
      }

      //exit when the new functions begins
      if (start && f2 != std::string::npos)
        break;

      //count operations
      if (start && f3 != std::string::npos)
        count++;
    }

    oper = count;
    return DKS_SUCCESS;
  }

};
#endif

View File

@ -0,0 +1,47 @@
#ifndef H_COLLIMATOR_PHYSICS
#define H_COLLIMATOR_PHYSICS
#include <iostream>
#include <string>
#include "../DKSDefinitions.h"
class DKSBaseMuSR;
/** Interface for the collimator physics and particle push kernels.
 * Every operation is pure virtual and is provided by an implementation; all
 * of them return an int status code. DKSBaseMuSR is a friend and therefore has
 * access to the protected launch parameters.
 * NOTE(review): the pointer arguments are opaque handles whose layout is defined
 * by the implementations - see those for the expected contents.
 */
class DKSCollimatorPhysics {
friend class DKSBaseMuSR;
protected:
// launch configuration used by the implementations
int numBlocks_m;
int blockSize_m;
public:
virtual ~DKSCollimatorPhysics() { }
/** Collimator physics for particles passed as one buffer (mem_ptr). */
virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0;
/** Collimator physics for particles passed as one buffer per component
* (label, local ID, rx/ry/rz, px/py/pz).
*/
virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles) = 0;
/** Sort step for the single buffer layout; numaddback is an output argument. */
virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0;
/** Sort step for the per component layout; numaddback is an output argument. */
virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
void *rx_ptr, void *ry_ptr, void *rz_ptr,
void *px_ptr, void *py_ptr, void *pz_ptr,
void *par_ptr, int numparticles, int &numaddback) = 0;
/** Particle push; usedt and streamId are optional (defaults: false, -1). */
virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt = false, int streamId = -1) = 0;
/** Particle push with transformation, taking additional lastSec / orient
* buffers and the number of sections nsec.
*/
virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
void *orient_ptr, int npart, int nsec, void *dt_ptr,
double dt, double c, bool usedt = false,
int streamId = -1) = 0;
};
#endif

43
src/Algorithms/FFT.h Normal file
View File

@ -0,0 +1,43 @@
#ifndef H_DKS_FFT
#define H_DKS_FFT
#include <iostream>
#include <math.h>
#include "../DKSDefinitions.h"
/** Interface for FFT implementations.
 * Implementations provide plan setup / teardown and the transforms themselves;
 * this class only stores the dimensions of the default plan and offers
 * useDefaultPlan() to decide whether that plan fits a request.
 */
class DKSFFT {

protected:
  int defaultN[3];   // size of the default plan in each of the three dimensions
  int defaultNdim;   // number of dimensions of the default plan

  /** Check whether the default plan matches the requested transform.
   * The default plan can only be reused when the number of dimensions is the
   * same and all three entries of N are equal to defaultN. A single differing
   * entry is enough to reject the plan.
   * @param ndim number of dimensions of the requested transform
   * @param N    size of the requested transform in each dimension
   * @return true when the default plan can be used, false otherwise
   */
  bool useDefaultPlan(int ndim, int N[3]) {
    if (ndim != defaultNdim)
      return false;

    for (int d = 0; d < 3; d++) {
      if (N[d] != defaultN[d])
        return false;
    }

    return true;
  }

public:

  virtual ~DKSFFT() { }

  /** Create the plans; the RC / CR variants take an additional scale factor. */
  virtual int setupFFT(int ndim, int N[3]) = 0;
  virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0;
  virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0;
  virtual int destroyFFT() = 0;

  /** Transforms operating on a single buffer. */
  virtual int executeFFT(void * mem_ptr, int ndim, int N[3],
                         int streamId = -1, bool forward = true) = 0;
  virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;
  virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0;

  /** Transforms between a real (real_ptr) and a complex (comp_ptr) buffer. */
  virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                           int streamId = -1) = 0;
  virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                           int streamId = -1) = 0;
  virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0;

};
#endif

View File

@ -0,0 +1,117 @@
#ifndef H_IMAGERECONSTRUCTION
#define H_IMAGERECONSTRUCTION
#include "../DKSDefinitions.h"
#define BLOCK_SIZE 128
/** Three float coordinates (x, y, z) of one voxel. */
struct VoxelPosition {
float x;
float y;
float z;
};
/** One list event: two 16 bit wide fields, detA and detB.
 * NOTE(review): presumably the indexes of the two detectors of an event - confirm.
 */
struct ListEvent {
unsigned detA : 16;
unsigned detB : 16;
};
/** Interface for the image reconstruction kernels.
 * All operations are pure virtual, are provided by an implementation and return
 * an int status code. Pointer arguments are opaque memory handles.
 */
class ImageReconstruction {
protected:
void *m_event_branch;
public:
virtual ~ImageReconstruction() { }
/** Calculate source.
* Places a sphere at each voxel position and calculates the avg value and std value of pixels
* that are inside this sphere. All the spheres used have the same diameter.
*/
virtual int calculateSource(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Calculate background.
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by parameter diameter, diameter of the larger sphere is 2*diameter.
*/
virtual int calculateBackground(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Calculate source using different sources.
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of
* each sphere is given by the *diameter array.
* NOTE(review): the two sphere description matches the background calculation rather than
* calculateSource() - confirm against the implementation.
*/
virtual int calculateSources(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Calculate background using per source diameters.
* Places two spheres at each voxel position, calculates the avg value and std value of pixels
* that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the
* smaller sphere is given by the *diameter array, diameter of the larger sphere is 2*diameter of the
* smaller sphere.
*/
virtual int calculateBackgrounds(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0) = 0;
/** Generate normalization.
* Goes through detector pairs and if a detector pair crosses the image launches a separate kernel
* that updates voxel values in the image on the slope between these two detectors.
*/
virtual int generateNormalization(void *recon, void *image_position,
void *det_position, int total_det) = 0;
/** Calculate forward projection.
* For image reconstruction calculates forward projections.
* see recon.cpp for details
*/
virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events) = 0;
/** Calculate backward projection.
* For image reconstruction calculates backward projections.
* see recon.cpp for details
*/
virtual int backwardProjection(void *correction, void *recon_corrector, void *list_data,
void *det_position, void *image_position,
int num_events, int num_voxels) = 0;
/** Set the voxel dimensions on device.
*
*/
virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0;
/** Set the image edge variables on the device.
*
*/
virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0;
/** Set the image edge1 on the device.
*
*/
virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0;
/** Set the minimum crystal in one ring values on the device.
*
*/
virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing,
float min_CrystalDist_InOneRing1) = 0;
/** Set all other required parameters for reconstruction.
*
*/
virtual int setParams(float matrix_distance_factor, float phantom_diameter,
float atten_per_mm, float ring_diameter) = 0;
};
#endif

View File

@ -0,0 +1,21 @@
# Sources and headers of the auto-tuning component.
set(_SRCS
  DKSAutoTuning.cpp
  DKSSearchStates.cpp
  DKSConfig.cpp
)

set(_HDRS
  DKSAutoTuning.h
  DKSSearchStates.h
  DKSAutoTuningTester.h
  DKSConfig.h
)

# Disabled: adding this directory to the include path.
#include_directories (
#  ${CMAKE_CURRENT_SOURCE_DIR}
#)

# add_sources / add_headers are project commands defined outside this file.
add_sources(${_SRCS})
add_headers(${_HDRS})

install(FILES ${_HDRS} DESTINATION include/AutoTuning)

View File

@ -0,0 +1,302 @@
#include "DKSAutoTuning.h"
/** Create an auto-tuner bound to a DKS base object.
 * @param base   DKS object; evaluateFunction() calls syncDevice() on it
 * @param api    name of the API, stored as given
 * @param device name of the device, stored as given
 * @param loops  number of timed runs per evaluation, see evaluateFunction()
 * Evaluation by execution time is switched on by default.
 */
DKSAutoTuning::DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops) {

  //timing based evaluation is the default mode
  evaluate_time_m = true;
  loops_m = loops;

  base_m = base;
  api_name_m = api;
  device_name_m = device;

}
/** Destructor: empties the list of tuning parameters. */
DKSAutoTuning::~DKSAutoTuning() {
params_m.clear();
}
/** Apply the values stored in a search state to the tuning parameters.
 * Entry i of state is applied to parameter i.
 * @param state one value per tuning parameter
 * @return DKS_ERROR when the number of states differs from the number of
 *         parameters (nothing is changed in that case), DKS_SUCCESS otherwise
 */
int DKSAutoTuning::setParameterValues(States state) {

  const unsigned int count = params_m.size();

  //a size mismatch means the state does not belong to this parameter set
  if (state.size() != count) {
    DEBUG_MSG("Parameters and states don't match!");
    return DKS_ERROR;
  }

  //hand every stored value over to the matching parameter
  unsigned int idx = 0;
  while (idx < count) {
    params_m[idx].setValue(state[idx].value);
    ++idx;
  }

  return DKS_SUCCESS;
}
/** Evaluate the function that is being tuned.
 * In timing mode (evaluate_time_m set) f_m is run up to loops_m times, each run
 * bracketed by syncDevice() calls and timer start / stop, and value receives the
 * timer reading divided by loops_m. The loop ends early when f_m does not return
 * DKS_SUCCESS; the status of the last run is returned.
 * Otherwise value receives the result of fd_m() and DKS_SUCCESS is returned.
 *
 * TODO: might need a better timing for GPU code
 */
int DKSAutoTuning::evaluateFunction(double &value) {

  DKSTimer t;
  t.init(function_name_m);

  //value based evaluation: the function itself supplies the figure of merit
  if (!evaluate_time_m) {
    value = fd_m();
    return DKS_SUCCESS;
  }

  int ierr = DKS_ERROR;
  for (int run = 0; run < loops_m; run++) {
    //nothing may be running on the device when the timer starts
    base_m->syncDevice();
    t.start();
    ierr = f_m();
    //wait for the function to complete on the device before stopping the timer
    base_m->syncDevice();
    t.stop();

    //give up on the first failed execution
    if (ierr != DKS_SUCCESS)
      break;
  }

  //timer reading divided by the requested number of runs
  value = t.gettime() / loops_m;

  return ierr;
}
/** Remove all tuning parameters from the parameter list. */
void DKSAutoTuning::clearParameters() {
params_m.clear();
}
/** Exhaustive search over the first two tuning parameters.
 * Every combination of values between min and max (in steps of step) of the
 * first two parameters is evaluated with evaluateFunction(). Afterwards both
 * parameters are set to the combination with the smallest result below the
 * start threshold of 1000000.0.
 *
 * When no combination evaluates successfully below that threshold, the
 * parameters are set back to the values they had before the search, the same
 * fallback lineSearch() uses.
 *
 * Does nothing when fewer than two parameters are registered.
 */
void DKSAutoTuning::exaustiveSearch() {

  DKSTimer t;
  t.init("exaustive search");
  t.start();

  if (params_m.size() < 2)
    return;

  Parameter p1 = params_m[0];
  Parameter p2 = params_m[1];

  double time;
  double mint = 1000000.0;

  //start from the current values so that a search without a single successful
  //evaluation does not end with both parameters set to zero
  int minv1 = p1.getValue();
  int minv2 = p2.getValue();

  for (double v1 = p1.min; v1 <= p1.max; v1 += p1.step) {
    for (double v2 = p2.min; v2 <= p2.max; v2 += p2.step) {
      p1.setValue(v1);
      p2.setValue(v2);

      int ierr = evaluateFunction(time);

      //keep the combination when it ran without error and beats the best so far
      if (ierr == DKS_SUCCESS && time < mint) {
        mint = time;
        minv1 = v1;
        minv2 = v2;
      }
    }
  }

  //apply the best combination found (or restore the initial values)
  p1.setValue(minv1);
  p2.setValue(minv2);

  t.stop();
}
/** Line search: tune the parameters one at a time.
 *  For each parameter all values on its min/max/step grid are evaluated while
 *  the other parameters keep their current values; the parameter is then fixed
 *  to the value that gave the best time so far, or to its previous value if
 *  nothing improved. Parameters with a non-positive step are skipped.
 *  Prints the resulting parameter values and timings to stdout.
 */
void DKSAutoTuning::lineSearch() {

  DKSTimer t;
  t.init("line search");
  t.start();

  if (params_m.size() < 1) {
    DEBUG_MSG("Need some parameters to autotune!");
    return;
  }

  double time;
  double mint = 1000000.0;

  // loop through the parameters one parameter at a time
  for (auto &param : params_m) {

    // a non-positive step would never reach max and loop forever
    if (param.step <= 0) {
      DEBUG_MSG("Parameter step must be positive!");
      continue;
    }

    // kept as double so the best value of a double parameter is not truncated
    double minv = param.getValue();

    // go through all values of this parameter, other parameters stay constant
    for (double v = param.min; v <= param.max; v += param.step) {
      param.setValue(v);

      int ierr = evaluateFunction(time);

      // remember the configuration if it ran without error and is faster
      if (ierr == DKS_SUCCESS && time < mint) {
        mint = time;
        minv = v;
      }
    }

    param.setValue(minv);
  }

  //DEBUG: print out the found best parameters
  for (auto &param : params_m)
    std::cout << "Parameter " << param.name << " set to " << param.getValue() << std::endl;

  std::cout << "Best time: " << mint << std::endl;

  t.stop();
  std::cout << "Line search time: " << t.gettime() << std::endl;
}
/** Hill climbing with random restarts.
 *  Each restart draws a random start state and repeatedly moves to the first
 *  neighbour that improves the evaluated value. When no neighbour improves, the
 *  state is handed to the search object, which keeps the best one. A restart
 *  whose start state fails to evaluate is skipped.
 *  Progress and the best state are printed to stdout.
 */
void DKSAutoTuning::hillClimbing(int restart_loops) {

  DKSTimer t;
  t.init("hill climbing");
  t.start();

  std::cout << "hill climbing" << std::endl;

  double time_current;
  double time_next;

  DKSSearchStates search(params_m);

  std::cout << "start " << restart_loops << std::endl;

  for (int restart = 0; restart < restart_loops; ++restart) {

    // random start state for this restart
    search.initCurrentState();
    setParameterValues(search.getCurrentState());

    // a start state that can not be evaluated ends this restart
    if (evaluateFunction(time_current) == DKS_ERROR)
      continue;

    // climb until no neighbour is better than the current state
    for (;;) {

      search.getNeighbours();

      bool improved = false;
      while (!improved && search.nextNeighbourExists()) {

        setParameterValues(search.getNextNeighbour());
        const int status = evaluateFunction(time_next);

        if (status == DKS_ERROR)
          std::cout << "Error evaluating function" << std::endl;

        // take the first neighbour that improves the solution
        if (status == DKS_SUCCESS && time_next < time_current) {
          time_current = time_next;
          search.moveToNeighbour();
          improved = true;
        }
      }

      // top reached: record the state and go on with the next restart
      if (!improved) {
        search.saveCurrentState(time_current);
        break;
      }
    }
  }

  std::cout << std::endl;
  search.printBest();

  t.stop();
  std::cout << "hill climbing: " << t.gettime() << std::endl;
}
/** Simulated annealing.
 *  Starts from a random state and lowers the temperature from Tstart in steps
 *  of Tstep while it is positive. In every step a random neighbour (distance
 *  up to 10 steps per parameter) is evaluated; it is accepted if it is better,
 *  or otherwise with probability exp(-dE/T). Returns early, without printing
 *  the final state, if an evaluation fails.
 */
void DKSAutoTuning::simulatedAnnealing(double Tstart, double Tstep) {

  DKSTimer t;
  t.init("simulated annealing");
  t.start();

  double time_current;
  double time_next;

  DKSSearchStates search(params_m);

  // random initial guess
  search.initCurrentState();
  setParameterValues(search.getCurrentState());
  if (evaluateFunction(time_current) == DKS_ERROR)
    return;

  double Temp = Tstart;
  while (Temp > 0) {

    search.printCurrentState(time_current);

    // candidate move: a random neighbour of the current state
    search.getNeighbours(10);
    setParameterValues(search.getRandomNeighbour());
    if (evaluateFunction(time_next) == DKS_ERROR)
      return;

    // improvements are always taken, the random number is only drawn for
    // moves that do not improve the solution
    bool accept = time_next < time_current;
    if (!accept) {
      double p = (double)rand() / RAND_MAX;
      double dE = time_next - time_current;
      double P = exp(-dE/Temp);
      accept = P > p;
    }

    if (accept) {
      time_current = time_next;
      search.moveToNeighbour();
    }

    Temp -= Tstep;
  }

  search.printCurrentState(time_current);

  t.stop();
  std::cout << "Simulated annealing: " << t.gettime() << std::endl;
}

View File

@ -0,0 +1,103 @@
#ifndef DKS_AUTOTUNIG
#define DKS_AUTOTUNIG
#include <iostream>
#include <functional>
#include <vector>
#include <string>
#include <fstream>
#include <cstdlib>
#include <chrono>
#include <ctime>
#include "../DKSBase.h"
#include "../Utility/DKSTimer.h"
#include "DKSSearchStates.h"
// Container aliases used by the auto-tuning classes.
typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States;
/** Auto-tuning driver.
* Holds a function to evaluate and a list of parameters (pointers to caller
* owned values with min/max/step bounds) and offers several search strategies
* that modify the parameters in order to minimise the evaluated value.
*/
class DKSAutoTuning {
private:
// true: time f_m over loops_m runs; false: use the return value of fd_m
bool evaluate_time_m;
std::string api_name_m;
std::string device_name_m;
// name handed to the timer when the function is evaluated
std::string function_name_m;
// function evaluated in timing mode, returns a DKS status code
std::function<int()> f_m;
// function evaluated in value mode, returns the value to minimise
std::function<double()> fd_m;
Parameters params_m;
// used to synchronise the device around timed calls
DKSBase *base_m;
// number of repetitions in timing mode
int loops_m;
/** Update parameters from a state.
* Returns DKS_ERROR if the state size does not match the number of parameters.
*/
int setParameterValues(States states);
/** Evaluate the function and set the execution time (or function value).
* Returns DKS_ERROR if errors occurred during function execution.
* Returns DKS_SUCCESS if function executed as planned.
*/
int evaluateFunction(double &value);
public:
/** Constructor.
* loops is the number of repetitions used when timing the function.
*/
DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100);
/** Destructor */
~DKSAutoTuning();
/** Set function to auto tune.
* Caller of setFunction is responsible to bind the correct parameters
* to the function with std::bind.
* This overload takes a function returning a status code; by default its
* execution time is the quantity that is minimised.
*/
void setFunction(std::function<int()> f, std::string name, bool evaluate_time = true) {
f_m = f;
function_name_m = name;
evaluate_time_m = evaluate_time;
}
/** Set function to auto tune (value mode).
* This overload takes a function returning a double; by default the returned
* value itself is minimised instead of the execution time.
*/
void setFunction(std::function<double()> f, std::string name, bool evaluate_time = false) {
fd_m = f;
function_name_m = name;
evaluate_time_m = evaluate_time;
}
/** Set parameter for auto tuning.
* Provide a pointer to a parameter that will be changed during auto-tuning,
* a min-max range and a step size for this element.
* The pointed-to value must stay valid for as long as the parameter is registered.
*/
template <typename T1>
void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) {
Parameter p(value, min, max, step, name);
params_m.push_back(p);
}
/** Delete all added parameters */
void clearParameters();
/** Perform exhaustive search evaluating all the parameter configurations */
void exaustiveSearch();
/** Perform auto-tuning.
* Perform line-search auto-tuning by varying parameters one at a time and keeping other
* parameters constant.
*/
void lineSearch();
/** Perform hill climbing.
* restart_loops is the number of random restarts.
*/
void hillClimbing(int restart_loops = 1);
/** Perform simulated annealing to find the parameters.
* The temperature starts at Tstart and is decreased by Tstep per iteration.
*/
void simulatedAnnealing(double Tstart, double Tstep);
};
#endif

View File

@ -0,0 +1,33 @@
#ifndef DKS_TESTAUTOTUNING
#define DKS_TESTAUTOTUNING
#include <iostream>
#include <cmath>
/** Test function for the auto-tuning searches.
 *  Holds a point (x, y), accessible to the friend class DKSBaseMuSR, and
 *  evaluates a smooth two-dimensional function with several peaks at it.
 */
class DKSAutoTuningTester {

  friend class DKSBaseMuSR;

private:

  double x;
  double y;

public:

  /** Start at the origin. */
  DKSAutoTuningTester() : x(0.0), y(0.0) { }

  // NOTE(review): only declared here; the definition is not visible in this header
  ~DKSAutoTuningTester();

  /** Evaluate the test function at the current (x, y). */
  double peaksZ() {
    // the three terms are evaluated separately and combined left to right
    const double first = 3 * pow(1-x,2) * exp(-pow(x,2) - pow(y+1,2));
    const double second = 10 * (x/5 - pow(x,3) - pow(y,5)) * exp(-pow(x,2) - pow(y,2));
    const double third = (1.0/3.0) * exp( - pow(x+1,2) - pow(y,2));
    return first - second - third;
  }

};
#endif

View File

@ -0,0 +1,163 @@
#include "DKSConfig.h"
/** Constructor.
 *  Reads the HOME environment variable, records whether it is set and then
 *  tries to load the configuration file.
 */
DKSConfig::DKSConfig() {

  //get home directory
  homedir_m = getenv("HOME");
  homeset_m = (homedir_m != NULL);

  // loadConfigFile() does not set this flag on every path, so give it a
  // defined value first
  treeloaded_m = false;

  loadConfigFile();
}
/** Destructor: attempts to save the configuration tree; the result of
 *  saveConfigFile() is ignored.
 */
DKSConfig::~DKSConfig() {
//delete tree_m;
saveConfigFile();
}
/** Load the autotuning configuration into the property tree.
 *  NOTE: the implementation is currently commented out, so nothing is read
 *  and the function always returns DKS_ERROR.
 */
int DKSConfig::loadConfigFile() {
int ierr = DKS_ERROR;
// disabled implementation: read <home>/<config_dir>/<config_file> as XML into tree_m
/*
if (homeset_m) {
//check if $HOME/.config/DKS exists
std::string filename = homedir_m + config_dir + config_file;
std::cout << "Check for: " << filename << std::endl;
if (fs::exists(filename)) {
try {
pt::read_xml(filename, tree_m);
treeloaded_m = true;
ierr = DKS_SUCCESS;
} catch (std::exception &e) {
DEBUG_MSG("Error loading autotuning file!");
treeloaded_m = false;
ierr = DKS_ERROR;
}
}
}
*/
return ierr;
}
/** Save the property tree to the autotuning configuration file.
 *  NOTE: the implementation is currently commented out, so nothing is written
 *  and the function always returns DKS_ERROR.
 */
int DKSConfig::saveConfigFile() {
int ierr = DKS_ERROR;
// disabled implementation: create the config directory if needed and write tree_m as XML
// NOTE(review): as written it would only write the file when the directory was
// newly created (homecreated) - check before re-enabling
/*
std::string savedir = homedir_m + config_dir;
std::string savefile = homedir_m + config_dir + config_file;
std::cout << savedir << std::endl;
std::cout << savefile << std::endl;
if (homeset_m) {
//check if $HOME/.config/DKS directory exists, if not create
bool homecreated = false;
fs::path p (savedir);
if (!fs::is_directory(p))
homecreated = fs::create_directory(p);
try {
if (homecreated) {
pt::write_xml(savefile, tree_m);
ierr = DKS_SUCCESS;
}
} catch(std::exception &e) {
ierr = DKS_ERROR;
}
}
*/
return ierr;
}
int DKSConfig::addConfigParameter(const std::string api, const std::string device,
const std::string name, const std::string func,
int size, std::string param, int value) {
//keys to acces data in the tree
std::string device_name = name;
device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
std::string parameter = key + ".parameter";
std::string attr_size = "<xmlattr>.size";
std::string attr_param = "<xmlattr>." + param;
//tmp node where new attributes are cteated in case the attribute doesn't exist in the tree
pt::ptree *tmp;
bool newNode = true;
//loop trough all the items in the node and see if new param needs to be created
//or old one updated
boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
if (child) {
BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
int oldsize = v.second.get<int>(attr_size,-1);
//if param with the same size already exists in the tree save pointer to this
if (size == oldsize) {
tmp = &v.second;
newNode = false;
}
}
}
//if parameter doesnt exist with this size, create a new parameter
if (newNode) {
tmp = new pt::ptree();
tmp->add(attr_size, size);
tmp->add(attr_param, value);
tree_m.add_child(parameter, *tmp);
} else {
//if parameter exists update the parameter value
tmp->put(attr_param, value);
}
return DKS_SUCCESS;
}
/** Look up a tuned parameter in the configuration tree.
 *  Searches the children of DKS.autotune.<api>.<device>.<name>.<func> (all
 *  whitespace removed from name) and takes the attribute param of the entry
 *  whose size attribute is closest to size. Entries without a positive size
 *  are ignored.
 *  On success value holds the stored value (-1 if the chosen entry has no such
 *  attribute) and DKS_SUCCESS is returned. If the key does not exist or holds
 *  no usable entry, value is set to -1 and DKS_ERROR is returned.
 */
int DKSConfig::getConfigParameter(const std::string api, const std::string device,
                                  const std::string name, const std::string func,
                                  int size, std::string param, int &value) {

  //value defaults to -1 on every path where nothing is found
  value = -1;
  bool found = false;

  //define key and attribute values to find parameters in the tree
  std::string device_name = name;
  device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end());
  std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func;
  std::string attr_size = "<xmlattr>.size";
  std::string attr_param = "<xmlattr>." + param;

  float maxDist = std::numeric_limits<float>::max();

  //check if the parameters exist
  boost::optional< pt::ptree& > child = tree_m.get_child_optional(key);
  if (child) {
    //loop through parameters and get the one that is closest to the size specified
    BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) {
      int param_size = v.second.get<int>(attr_size,-1); //get parameter size
      if (param_size > 0) { // if param_size is -1 param is not defined correctly and not usable
        float dist = abs(param_size - size);
        if (dist < maxDist) {
          value = v.second.get<int>(attr_param,-1);
          maxDist = dist;
          found = true;
        }
      }
    }
  }

  return found ? DKS_SUCCESS : DKS_ERROR;
}

View File

@ -0,0 +1,69 @@
/** Class to save and load DKS autotuning configs.
* Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml.
* Uses boost xml_parser to read and write the xml file and boost property tree to store
* the xml content.
*/
#ifndef DKS_CONFIG
#define DKS_CONFIG
#include <boost/property_tree/ptree.hpp>
#include <boost/optional/optional.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <boost/foreach.hpp>
#include <boost/filesystem.hpp>
#include <string>
#include <iostream>
#include <cstdlib>
#include <exception>
#include <limits>
#include <cmath>
#include <algorithm>
#include <cctype>
#include "../DKSDefinitions.h"
// Short aliases for the boost namespaces used by DKSConfig.
namespace pt = boost::property_tree;
namespace fs = boost::filesystem;
// Directory and file name of the configuration file; both are meant to be
// appended to the home directory ($HOME/.config/DKS/autotuning.xml).
const std::string config_dir = "/.config/DKS";
const std::string config_file = "/autotuning.xml";
/** Stores autotuning parameters in a boost property tree. */
class DKSConfig {
private:
// tree holding the autotuning entries
pt::ptree tree_m;
// value of the HOME environment variable, NULL if it is not set
const char *homedir_m;
// true if HOME was found in the environment
bool homeset_m;
// intended to record whether a config file was loaded into tree_m
bool treeloaded_m;
public:
/** Constructor, set home variable.
* If home directory is not set config file can not be read or saved
*/
DKSConfig();
/** Destructor, calls saveConfigFile() */
~DKSConfig();
/** Load autotuning.xml into tree variable if this file exists */
int loadConfigFile();
/** Save autotuning.xml file */
int saveConfigFile();
/** Add config parameter to tree.
* An entry is identified by api, device, name, func and size; param is the
* attribute name and value the value stored for it.
*/
int addConfigParameter(const std::string api, const std::string device,
const std::string name, const std::string func,
int size, std::string param, int value);
/** Get config parameter from the tree.
* value receives the attribute param of the entry whose size is closest to
* the requested size.
*/
int getConfigParameter(const std::string api, const std::string device,
const std::string name, const std::string func,
int size, std::string param, int &value);
};
#endif

View File

@ -0,0 +1,233 @@
#include "DKSSearchStates.h"
/** Build the search from a parameter list.
 *  The current state gets one entry per parameter (current value and bounds),
 *  the neighbour and best states are sized accordingly and the random number
 *  generator is seeded with the current time.
 */
DKSSearchStates::DKSSearchStates(Parameters params) {

  for (Parameter &param : params) {
    State entry = { param.getValue(), param.min, param.max, param.step };
    current_state_m.push_back(entry);
  }

  const States::size_type count = current_state_m.size();
  neighbour_state_m.resize(count);
  best_state_m.resize(count);

  // no best state and no neighbour list yet
  best_time_m = std::numeric_limits<double>::max();
  next_neighbour_m = -1;

  srand(time(NULL));
}
/** Destructor: empties all state and neighbour containers. */
DKSSearchStates::~DKSSearchStates() {
current_state_m.clear();
neighbour_state_m.clear();
best_state_m.clear();
neighbours_m.clear();
}
/** Get all the possible neighbours of the current state */
void DKSSearchStates::getNeighbours(int dist) {
std::vector< std::vector<double> > values;
for (auto state : current_state_m) {
std::vector<double> s;
for (int d = dist; d > 0; d--) {
if (state.value - d*state.step >= state.min)
s.push_back(state.value - state.step);
}
s.push_back(state.value);
for (int d = 1; d < dist + 1; d++) {
if (state.value + d*state.step <= state.max)
s.push_back(state.value + state.step);
}
values.push_back(s);
}
std::vector< std::vector<double> > s {{}};
for (auto& u : values) {
std::vector< std::vector<double> > r;
for(auto& x : s) {
for( auto y : u) {
r.push_back(x);
r.back().push_back(y);
}
}
s.swap(r);
}
//get current state values
std::vector<double> current;
for (auto state : current_state_m)
current.push_back(state.value);
s.erase(std::remove(s.begin(), s.end(), current));
neighbours_m.clear();
neighbours_m = s;
next_neighbour_m = 0;
}
void DKSSearchStates::setCurrentState(std::vector<Parameter> current_state) {
current_state_m.clear();
for (auto& p : current_state) {
State s;
s.value = p.getValue();
s.min = p.min;
s.max = p.max;
s.step = p.step;
current_state_m.push_back(s);
}
}
void DKSSearchStates::setCurrentState(std::vector<State> current_state) {
current_state_m.clear();
for (auto& p : current_state) {
State s;
s.value = p.value;
s.min = p.min;
s.max = p.max;
s.step = p.step;
current_state_m.push_back(s);
}
}
void DKSSearchStates::initCurrentState() {
//go trough parameters in current state and generate a new random value
for (auto& s : current_state_m) {
//get number of total values
int values = (s.max - s.min) / s.step + 1;
int r = rand() % values;
s.value = s.min + r * s.step;
}
getNeighbours();
}
/** Return a copy of the current state. */
States DKSSearchStates::getCurrentState() {
return current_state_m;
}
/** Return the next neighbour and advance the neighbour index.
 *  If the index is already past the last neighbour the previously returned
 *  neighbour values are returned again; the index is advanced in either case.
 */
States DKSSearchStates::getNextNeighbour() {

  const int total = (int)neighbours_m.size();

  if (next_neighbour_m < total) {
    // values of all parameters for the selected neighbour
    const std::vector<double> &selected = neighbours_m[next_neighbour_m];

    States::size_type idx = 0;
    while (idx < neighbour_state_m.size()) {
      neighbour_state_m[idx].value = selected[idx];
      ++idx;
    }
  }

  ++next_neighbour_m;

  return neighbour_state_m;
}
/** Return a randomly chosen neighbour.
 *  The neighbour index is set to the entry following the chosen one. If the
 *  neighbour list is empty nothing is chosen and the neighbour state is
 *  returned as it is.
 */
States DKSSearchStates::getRandomNeighbour() {

  // rand() % 0 is undefined, so an empty neighbour list has to be handled first
  if (neighbours_m.empty())
    return neighbour_state_m;

  int rand_neighbour = rand() % (int)neighbours_m.size();

  //get the vector of values for each parameter in the neighbour cell
  const std::vector<double> &neighbour_values = neighbours_m[rand_neighbour];

  //set the values to neighbour_state_m
  for (unsigned int n = 0; n < neighbour_state_m.size(); n++)
    neighbour_state_m[n].value = neighbour_values[n];

  next_neighbour_m = rand_neighbour + 1;

  return neighbour_state_m;
}
bool DKSSearchStates::nextNeighbourExists() {
bool neighbourExists = false;
if (next_neighbour_m < (int)neighbours_m.size())
neighbourExists = true;
return neighbourExists;
}
void DKSSearchStates::moveToNeighbour() {
for (unsigned int i = 0; i < current_state_m.size(); i++)
current_state_m[i].value = neighbour_state_m[i].value;
//getNeighbours();
}
void DKSSearchStates::saveCurrentState(double current_time) {
if (current_time < best_time_m) {
for (unsigned int i = 0; i < current_state_m.size(); i++) {
best_state_m[i].value = current_state_m[i].value;
best_state_m[i].min = current_state_m[i].min;
best_state_m[i].max = current_state_m[i].max;
best_state_m[i].step = current_state_m[i].step;
}
best_time_m = current_time;
}
}
void DKSSearchStates::printCurrentState(double time) {
std::cout << "Current state: ";
for (auto s : current_state_m)
std::cout << s.value << "\t";
std::cout << time << std::endl;
}
void DKSSearchStates::printInfo() {
std::cout << "Current state: ";
for (auto s : current_state_m)
std::cout << s.value << "\t";
std::cout << std::endl;
std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
if (next_neighbour_m > 0) {
for (auto s : neighbour_state_m)
std::cout << s.value << "\t";
}
std::cout << std::endl;
}
void DKSSearchStates::printNeighbour(double time) {
std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): ";
if (next_neighbour_m > 0) {
for (auto s : neighbour_state_m)
std::cout << s.value << "\t";
}
std::cout << time << std::endl;
}
void DKSSearchStates::printBest() {
std::cout << "Best state (" << best_time_m << "): ";
if (best_time_m > 0) {
for (auto s : best_state_m)
std::cout << s.value << "\t";
}
std::cout << std::endl;
}

View File

@ -0,0 +1,162 @@
#ifndef DKS_SEARCHSTATES
#define DKS_SEARCHSTATES
#include <iostream>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <limits>
/** Type of the value a Parameter points to. */
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

/** A tunable parameter.
 *  Wraps a pointer to a caller owned int or double together with the range
 *  (min, max) and step in which the value may be varied. The Parameter does
 *  not own the pointed-to value; copies of a Parameter refer to the same value.
 */
class Parameter {

private:
  // exactly one of the two pointers is used, selected by type; the other is NULL
  int *ivalue;
  double *dvalue;
  VALUE_TYPE type;

public:
  double min;
  double max;
  double step;
  std::string name;

  /** Parameter backed by an int; the bounds are stored as double. */
  Parameter(int *_value, int _min, int _max, int _step, std::string _name)
    : ivalue(_value),
      dvalue(NULL),
      type(DKS_INT),
      min((double)_min),
      max((double)_max),
      step((double)_step),
      name(_name)
  {
  }

  /** Parameter backed by a double. */
  Parameter(double *_value, double _min, double _max, double _step, std::string _name)
    : ivalue(NULL),
      dvalue(_value),
      type(DKS_DOUBLE),
      min(_min),
      max(_max),
      step(_step),
      name(_name)
  {
    // NOTE(review): looks like leftover debug output - confirm before removing
    std::cout << "Double" << std::endl;
  }

  /** Store v in the pointed-to value, converted to the type of that value
   *  (conversion to int truncates).
   */
  template <typename T>
  void setValue(T v) {
    if (type == DKS_INT)
      *ivalue = (int)v;
    else if (type == DKS_DOUBLE)
      *dvalue = (double)v;
  }

  /** Current value as double, -1.0 if the type is neither DKS_INT nor DKS_DOUBLE. */
  double getValue() {
    switch (type) {
    case DKS_INT:
      return (double)*ivalue;
    case DKS_DOUBLE:
      return *dvalue;
    }
    return -1.0;
  }

};
/** Value of one parameter in a search state, together with the bounds and
* the step of that parameter.
*/
struct State {
double value;
double min;
double max;
double step;
};
// Container aliases: list of tunable parameters and a search state
// (one State entry per parameter).
typedef std::vector<Parameter> Parameters;
typedef std::vector<State> States;
/** Bookkeeping for the search algorithms: the current state, the neighbours of
* the current state, and the best state found so far.
*/
class DKSSearchStates {
private:
States current_state_m;
// values of the neighbour that was returned last
States neighbour_state_m;
States best_state_m;
double best_time_m;
// all neighbours of the current state, one vector of parameter values each
std::vector< std::vector<double> > neighbours_m;
// index of the neighbour that is returned next
int next_neighbour_m;
public:
/** Constructor always takes params array as variable.
* Params array is needed to know how many params will be searched and what are the bounds
* of each parameter.
*/
DKSSearchStates(Parameters params);
~DKSSearchStates();
/** Set current state using parameter vector */
void setCurrentState(Parameters current_state);
/** set current state using the state vector */
void setCurrentState(States current_state);
/** init random current state */
void initCurrentState();
/** get current state */
States getCurrentState();
/** get next neighbour state.
* if there is no next neighbour stay at the current neighbour
*/
States getNextNeighbour();
/** get random neighbour state */
States getRandomNeighbour();
/** calculate all the neighbour states.
* dist is the maximum number of steps a parameter may differ from its current value.
*/
void getNeighbours(int dist = 1);
/** Check if there are more neighbours to evaluate.
* Return true if more neighbours exist, false if we are at the last neighbour
*/
bool nextNeighbourExists();
/** move to next neighbour.
* set the current state to the values of the neighbour returned last.
* The neighbours of the new current state are not recalculated by this call
* (the getNeighbours() call in the implementation is commented out), so
* getNeighbours() has to be called separately.
*/
void moveToNeighbour();
/** Save the current state and the evaluation time of the current state.
* If evaluation time of the current state is better than the evaluation time of the
* best state, save the current state as best.
*/
void saveCurrentState(double current_time);
//Print functions - mostly useful for debugging purposes, or for benchmark runs to print the
//status of the search
/** Print current state.
* cout the current state. Mostly used for debugging purposes
*/
void printCurrentState(double time = 0.0);
/** Print current neighbour info */
void printNeighbour(double time = 0.0);
/** Print info.
* Print the whole info about the search: current state, current neighbour, total neighbours
*/
void printInfo();
/** Print the best saved state */
void printBest();
};
#endif

130
src/CMakeLists.txt Normal file
View File

@ -0,0 +1,130 @@
cmake_minimum_required (VERSION 2.8)

# Root of the DKS sources; ADD_SOURCES / ADD_HEADERS compute paths relative to it.
set (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
# ADD_SOURCES (<file>...)
# Append the given source files to DKS_SRCS. When called from a sub-directory
# of DKS_SRC_DIR every file is prefixed with the path of that sub-directory
# relative to DKS_SRC_DIR and the updated list is also set in the parent scope.
macro (ADD_SOURCES)
  file (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
  if (_relPath)
    foreach (_src ${ARGN})
      list (APPEND DKS_SRCS "${_relPath}/${_src}")
    endforeach ()
    # propagate SRCS to parent directory
    set (DKS_SRCS ${DKS_SRCS} PARENT_SCOPE)
  else ()
    foreach (_src ${ARGN})
      list (APPEND DKS_SRCS "${_src}")
    endforeach ()
  endif ()
endmacro ()
# ADD_HEADERS (<file>...)
# Append the given header files to DKS_HDRS. When called from a sub-directory
# of DKS_SRC_DIR every file is prefixed with the path of that sub-directory
# relative to DKS_SRC_DIR and the updated list is also set in the parent scope.
macro (ADD_HEADERS)
  file (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
  if (_relPath)
    foreach (_hdr ${ARGN})
      list (APPEND DKS_HDRS "${_relPath}/${_hdr}")
    endforeach ()
    # propagate HDRS to parent directory
    set (DKS_HDRS ${DKS_HDRS} PARENT_SCOPE)
  else ()
    foreach (_hdr ${ARGN})
      list (APPEND DKS_HDRS "${_hdr}")
    endforeach ()
  endif ()
endmacro ()
# Files of the base directory that are always built.
set (DKS_BASEDIR_HDRS
  DKSBase.h
  DKSDefinitions.h
)

set (DKS_BASEDIR_SRCS
  DKSBase.cpp
)

# DKSBaseMuSR is added when either CUDA or OpenCL is enabled.
if (USE_CUDA OR USE_OPENCL)
  list (APPEND DKS_BASEDIR_HDRS DKSBaseMuSR.h)
  list (APPEND DKS_BASEDIR_SRCS DKSBaseMuSR.cpp)
endif ()

# DKSImageReconstruction is added only when CUDA is enabled.
if (USE_CUDA)
  list (APPEND DKS_BASEDIR_HDRS DKSImageReconstruction.h)
  list (APPEND DKS_BASEDIR_SRCS DKSImageReconstruction.cpp)
endif ()

add_headers (${DKS_BASEDIR_HDRS})
add_sources (${DKS_BASEDIR_SRCS})

message (STATUS "HEADERS: ${DKS_BASEDIR_HDRS}")
message (STATUS "SOURCES: ${DKS_BASEDIR_SRCS}")
# Back-end directories are only added for the back-ends that are enabled.
if (USE_OPENCL)
  message (STATUS "Add OpenCL source files")
  add_subdirectory (OpenCL)
endif ()

if (USE_CUDA)
  message (STATUS "Add CUDA source files")
  add_subdirectory (CUDA)
endif ()

if (USE_MIC)
  message (STATUS "Add MIC source files")
  add_subdirectory (MIC)
endif ()

# These directories are always part of the build.
add_subdirectory (Utility)
add_subdirectory (AutoTuning)
add_subdirectory (Algorithms)
# Libraries that are linked in addition when UQTK support is enabled.
set (_dks_uqtk_libs
  lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs
  uqtklapack uqtkslatec uqtkblas gfortran
)

# Build a static (dks) and a shared (dksshared) library from the same sources.
# The plain target_link_libraries signature is kept on purpose: it must not be
# mixed with the keyword signature on targets created by CUDA_ADD_LIBRARY.
if (USE_CUDA)
  CUDA_ADD_LIBRARY (dks ${DKS_SRCS})
  CUDA_ADD_LIBRARY (dksshared SHARED ${DKS_SRCS})

  if (USE_UQTK)
    target_link_libraries (dks cudadevrt ${_dks_uqtk_libs})
    target_link_libraries (dksshared cudadevrt ${_dks_uqtk_libs})
  else ()
    target_link_libraries (dks cudadevrt)
    target_link_libraries (dksshared cudadevrt)
  endif ()
else ()
  message (STATUS "DKS srcs: ${DKS_SRCS}")
  add_library (dks ${DKS_SRCS})
  add_library (dksshared SHARED ${DKS_SRCS})

  # without UQTK there is nothing to link in this configuration
  if (USE_UQTK)
    target_link_libraries (dks ${_dks_uqtk_libs})
    target_link_libraries (dksshared ${_dks_uqtk_libs})
  endif ()
endif ()
# Install both libraries and the headers of the base directory.
install (TARGETS dks DESTINATION lib)
install (TARGETS dksshared DESTINATION lib)
install (FILES ${DKS_BASEDIR_HDRS} DESTINATION include)

#if (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc"))
#  install (FILES ${CMAKE_CURRENT_BINARY_DIR}/libdksMIC.a DESTINATION build/lib)
#endif ()

35
src/CUDA/CMakeLists.txt Normal file
View File

@ -0,0 +1,35 @@
# Headers of the CUDA back-end.
set (_HDRS
  CudaBase.cuh
  CudaFFT.cuh
  CudaGreensFunction.cuh
  CudaChiSquare.cuh
  CudaCollimatorPhysics.cuh
  CudaImageReconstruction.cuh
  CudaChiSquareRuntime.cuh
)

# Sources of the CUDA back-end.
set (_SRCS
  CudaBase.cu
  CudaFFT.cu
  CudaGreensFunction.cu
  CudaChiSquare.cu
  CudaCollimatorPhysics.cu
  CudaImageReconstruction.cu
  CudaChiSquareRuntime.cu
)

#include_directories (
#  ${CMAKE_CURRENT_SOURCE_DIR}
#)

# Register the files with the parent lists and install the headers.
add_sources (${_SRCS})
add_headers (${_HDRS})

install (FILES ${_HDRS} DESTINATION include/CUDA)

# Kernel sources that are installed as files next to the headers.
set (_KERNELS
  NVRTCKernels/CudaChiSquareKernel.cu
)

install (FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels)

View File

@ -0,0 +1,25 @@
cmake_minimum_required (VERSION 2.8)

find_package (CUDA REQUIRED)

# nvcc flags for this build
set (CUDA_NVCC_FLAGS "-arch=sm_30")

set (LIB_TYPE STATIC)

set (DKS_CUDA_HDRS
  CudaBase.cuh
  CudaFFT.cuh
  CudaGreensFunction.cuh
)

set (DKS_CUDA_SRCS
  CudaBase.cu
  CudaFFT.cu
  CudaGreensFunction.cu
)

include_directories (${CMAKE_CURRENT_SOURCE_DIR})

# Library containing only the CUDA sources listed above.
CUDA_ADD_LIBRARY (cudadks ${DKS_CUDA_SRCS})

386
src/CUDA/CudaBase.cu Normal file
View File

@ -0,0 +1,386 @@
#include "CudaBase.cuh"
//=====================================//
//============Cuda kernels=============//
//=====================================//
/** Kernel: initialise one curand state per thread.
 *  Thread idx initialises state[idx] with seed + idx, sequence 0 and offset 0.
 *  Threads with idx >= size do nothing.
 */
__global__ void initcuRandState(curandState *state, int size, int seed = 0) {

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // surplus threads of the last block
  if (tid >= size)
    return;

  curand_init(seed + tid, 0, 0, &state[tid]);
}
//=====================================//
//==========Private functions==========//
//=====================================//
//====================================//
//==========Public functions==========//
//====================================//
/** Constructor.
 *  No stream is selected (-1 stands for the default stream), room for 10
 *  streams is reserved and no curand states or cublas handle exist yet.
 */
CudaBase::CudaBase() {

  currentStream = -1;
  cudaStreams.reserve(10);

  // give the handles returned by cuda_getCurandStates() / cuda_getCublas()
  // a defined value until they are created
  defaultRndState = NULL;
  defaultCublas = NULL;
  defaultRndSet = -1;
}
/** Destructor: destroys all registered streams and frees the curand states. */
CudaBase::~CudaBase() {
cuda_deleteStreams();
cuda_deleteCurandStates();
}
/*
create curandStates
*/
int CudaBase::cuda_createCurandStates(int size) {
if (defaultRndSet == 1)
cuda_deleteCurandStates();
int threads = 128;
int blocks = size / threads + 1;
int seed = time(NULL);
//std::cout << "sizeof: " << sizeof(curandState) << std::endl;
cudaMalloc(&defaultRndState, sizeof(curandState)*size);
initcuRandState<<<blocks, threads>>>(defaultRndState, size, seed);
defaultRndSet = 1;
return DKS_SUCCESS;
}
/** Free the curand states if they were created. Always returns DKS_SUCCESS. */
int CudaBase::cuda_deleteCurandStates() {

  // nothing was created, nothing to free
  if (defaultRndSet != 1)
    return DKS_SUCCESS;

  cudaFree(defaultRndState);
  defaultRndSet = -1;

  return DKS_SUCCESS;
}
/** Return the device pointer to the curand states.
 *  No check is made here that the states have been created.
 */
curandState* CudaBase::cuda_getCurandStates() {
return defaultRndState;
}
/*
add cuda stream
*/
int CudaBase::cuda_createStream(int &streamId) {
cudaStream_t tmpStream;
cudaError_t cerror;
cerror = cudaStreamCreate(&tmpStream);
if (cerror != cudaSuccess) {
DEBUG_MSG("Failed to create new CUDA stream, cuda error: " << cerror);
return DKS_ERROR;
}
cudaStreams.push_back(tmpStream);
streamId = cudaStreams.size() - 1;
return DKS_SUCCESS;
}
/*
  Add existing stream to list.
  The stream is appended to the stream list and its index is stored in
  streamId. Always returns DKS_SUCCESS.
*/
int CudaBase::cuda_addStream(cudaStream_t tmpStream, int &streamId) {

  // the new stream ends up at the position that equals the current size
  const int newId = cudaStreams.size();
  cudaStreams.push_back(tmpStream);
  streamId = newId;

  return DKS_SUCCESS;
}
/*
  Delete stream.
  Not implemented: the id is ignored, no stream is destroyed and DKS_ERROR is
  always returned.
*/
int CudaBase::cuda_deleteStream(int id) {
//TODO: lets see if this is necessary, currently do nothing
return DKS_ERROR;
}
/*
delete all streams
*/
int CudaBase::cuda_deleteStreams() {
//delete all cuda streams
for (unsigned int i = 0; i < cudaStreams.size(); i++) {
cudaStreamDestroy(cudaStreams[i]);
}
cudaStreams.clear();
currentStream = -1;
return DKS_SUCCESS;
}
/*
  Set stream id.
  The id is stored as given, it is not checked against the stream list.
  Always returns DKS_SUCCESS.
*/
int CudaBase::cuda_setStream(int id) {
currentStream = id;
return DKS_SUCCESS;
}
/*
  Return stream id.
  -1 is the value set by the constructor and by cuda_defaultStream().
*/
int CudaBase::cuda_getStreamId() {
return currentStream;
}
/*
  Set default stream as the stream to use (stream id -1).
  Always returns DKS_SUCCESS.
*/
int CudaBase::cuda_defaultStream() {
currentStream = -1;
return DKS_SUCCESS;
}
/* Return the number of streams in the stream list. */
int CudaBase::cuda_numberOfStreams() {
return cudaStreams.size();
}
/*
  Return the stream with the given id.
  An id outside the stream list, including the default stream id -1, returns
  the CUDA default stream (0) instead of reading outside the list.
*/
cudaStream_t CudaBase::cuda_getStream(int id) {

  if (id < 0 || id >= (int)cudaStreams.size())
    return (cudaStream_t)0;

  return cudaStreams[id];
}
/* Return the cublas handle stored in this object. */
cublasHandle_t CudaBase::cuda_getCublas() {
return defaultCublas;
}
/*
  Get all available cuda devices.
  Prints the number of devices and, for every device, its name and PCI ids to
  stdout. Always returns DKS_SUCCESS.
*/
int CudaBase::cuda_getDevices() {

  std::cout << std::endl;
  std::cout << "==============================" << std::endl;
  std::cout << "=============CUDA=============" << std::endl;
  std::cout << "==============================" << std::endl;

  // start at 0 so a failing device query reports no devices
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  std::cout << ndev << std::endl;

  for (int i = 0; i < ndev; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);

    // devices are listed starting from 1
    std::cout << "Device " << i+1 << ":" << std::endl;
    std::cout << "Name: " << prop.name << std::endl;
    std::cout << "PCI bus id: " << prop.pciBusID << std::endl;
    std::cout << "PCI device id: " << prop.pciDeviceID << std::endl;
    std::cout << "PCI domain id: " << prop.pciDomainID << std::endl;
    std::cout << "==============================" << std::endl;
  }

  return DKS_SUCCESS;
}
/*
  Store the number of cuda devices in ndev.
  NOTE(review): the status of cudaGetDeviceCount is not checked, DKS_SUCCESS is
  returned in any case.
*/
int CudaBase::cuda_getDeviceCount(int &ndev) {
cudaGetDeviceCount(&ndev);
return DKS_SUCCESS;
}
/*
  Store the name of the currently selected device in device_name.
  Returns DKS_ERROR, leaving device_name unchanged, if no device is available.
*/
int CudaBase::cuda_getDeviceName(std::string &device_name) {

  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  // without a device there is no name to report
  if (ndev <= 0)
    return DKS_ERROR;

  int device = 0;
  cudaDeviceProp prop;
  cudaGetDevice(&device);
  cudaGetDeviceProperties(&prop, device);
  device_name = prop.name;

  return DKS_SUCCESS;
}
/*
  Select the device to use.
  If device is a valid index it is selected; otherwise device 0 is selected as
  long as at least one device exists. Returns DKS_ERROR if there is no device.
*/
int CudaBase::cuda_setDevice(int device) {

  int ierr = DKS_SUCCESS;
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  std::cout << "Init: " << device << "\t" << ndev << std::endl;

  if (device >= 0 && device < ndev) {
    // report the device that is selected, not the number of devices
    std::cout << "set device to: " << device << std::endl;
    cudaSetDevice(device);
  } else if (ndev > 0) {
    // requested index is not usable, fall back to the first device
    cudaSetDevice(0);
  } else {
    ierr = DKS_ERROR;
  }

  return ierr;
}
/*
  Collect one device index per distinct device name.
  For every device name the index of the first device with that name is
  appended to devices (the vector is not cleared first).
  Always returns DKS_SUCCESS.
*/
int CudaBase::cuda_getUniqueDevices(std::vector<int> &devices) {

  std::vector< std::string > names;

  // start at 0 so a failing device query adds nothing
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  for (int i = 0; i < ndev; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);

    // the name list is empty for the first device, so it is always added
    std::string target = prop.name;
    if (std::find(names.begin(), names.end(), target) == names.end()) {
      devices.push_back(i);
      names.push_back(target);
    }
  }

  return DKS_SUCCESS;
}
/*
  Set up cuda device.
  Currently only prints a message and returns DKS_SUCCESS.
*/
int CudaBase::cuda_setUp() {
std::cout << "set up" << std::endl;
return DKS_SUCCESS;
}
/*
allocate memory on cuda device
*/
void * CudaBase::cuda_allocateMemory(size_t size, int &ierr) {
cudaError cerror;
void * mem_ptr = NULL;
cerror = cudaMalloc((void **) &mem_ptr, size);
if (cerror != cudaSuccess) {
DEBUG_MSG("Failed to allocate memory, cuda error: " << cerror);
std::cout << "Error: " << cudaGetErrorString(cerror) << std::endl;
ierr = DKS_ERROR;
} else {
ierr = DKS_SUCCESS;
}
return mem_ptr;
}
/*
Info: free memory on device
Return: success or error code
*/
int CudaBase::cuda_freeMemory(void * mem_ptr) {
cudaError cerror;
cerror = cudaFree(mem_ptr);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error freeing memory, cuda error: " << cerror);
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
cudaError cerror;
cerror = cudaFreeHost(mem_ptr);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error freeing host memory, cuda error: " << cerror);
return DKS_ERROR;
}
return DKS_SUCCESS;
}
/*
Info: allcate memory and write data (push)
Return: pointer to memory object
*/
/*
void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
void * mem_ptr;
mem_ptr = cuda_allocateMemory(size, ierr);
if (ierr == DKS_SUCCESS)
ierr = cuda_writeData(mem_ptr, in_data, size);
return mem_ptr;
}
*/
/*
Info: read data and free memory (pull)
Return: success or error code
*/
/*
int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
ierr = cuda_readData(mem_ptr, out_data, size);
if (ierr == DKS_SUCCESS)
ierr = cuda_freeMemory(mem_ptr);
else
return DKS_ERROR;
if (ierr == DKS_SUCCESS)
return DKS_SUCCESS;
else
return DKS_ERROR;
}
*/
/*
  Info: placeholder for function execution, only prints a trace message
  Return: always DKS_SUCCESS
*/
int CudaBase::cuda_executeFunction() {
  const int status = DKS_SUCCESS;
  std::cout << "Execute function" << std::endl;
  return status;
}
/*
  Info: placeholder for clean up, only prints a trace message
  Return: always DKS_SUCCESS
*/
int CudaBase::cuda_cleanUp() {
  const int status = DKS_SUCCESS;
  std::cout << "clean up" << std::endl;
  return status;
}

390
src/CUDA/CudaBase.cuh Normal file
View File

@ -0,0 +1,390 @@
#ifndef H_CUDA_BASE
#define H_CUDA_BASE
#include "../DKSDefinitions.h"
#include <iostream>
#include <stdio.h>
#include <vector>
#include <string>
#include <algorithm>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cublas_v2.h>
#include <curand_kernel.h>
#include <nvToolsExt.h>
#include <time.h>
/**
 * Thin wrapper around the CUDA runtime used by the DKS CUDA kernels:
 * device selection, stream bookkeeping, device / pinned host memory
 * management and host <-> device copies. Member functions report
 * DKS_SUCCESS or DKS_ERROR unless stated otherwise.
 */
class CudaBase {

private:

  int currentStream;
  std::vector<cudaStream_t> cudaStreams;

protected:

  cublasHandle_t defaultCublas;
  curandState *defaultRndState;
  int defaultRndSet;

public:

  CudaBase();

  ~CudaBase();

  /**
   * Init cuda random number (cuRand) states.
   * Create an array of type curandState with "size" elements on the GPU
   * and create a curandState with different seed for each array entry.
   * Return success or error code
   */
  int cuda_createCurandStates(int size);

  /**
   * Delete curandState.
   * Delete curandState array on the GPU and free memory.
   * Return success or error code
   */
  int cuda_deleteCurandStates();

  /**
   * Get a pointer to the curand states.
   */
  curandState* cuda_getCurandStates();

  /**
   * Create a cuda stream and set streamId to the index referring to this stream.
   * Return success or error code
   */
  int cuda_createStream(int &streamId);

  /**
   * Add an existing cuda stream to the list.
   * Return: success or error code.
   */
  int cuda_addStream(cudaStream_t tmpStream, int &streamId);

  /**
   * Delete cuda stream.
   * Return: success or error code
   */
  int cuda_deleteStream(int id);

  /**
   * Delete all streams.
   * Return: success or error code
   */
  int cuda_deleteStreams();

  /**
   * Set stream to use.
   * Return: success or error code
   */
  int cuda_setStream(int id);

  /**
   * Info: get stream that is used
   * Return: id of the current stream
   */
  int cuda_getStreamId();

  /**
   * Info: reset to default stream
   * Return: success or error code
   */
  int cuda_defaultStream();

  /**
   * Info: get number of streams
   * Return: value is compared against stream ids by the async copy methods
   * below, i.e. it is used as the number of streams in the list
   */
  int cuda_numberOfStreams();

  /**
   * Info: get stream
   * Return: stream
   */
  cudaStream_t cuda_getStream(int id);

  /**
   * Get default cublas handle
   */
  cublasHandle_t cuda_getCublas();

  /**
   * Info: get information on cuda devices
   * Return: success or error code
   */
  int cuda_getDevices();

  /**
   * Get CUDA device count.
   * Sets the number of devices on the platform that can use CUDA.
   * Returns DKS_SUCCESS
   */
  int cuda_getDeviceCount(int &ndev);

  /**
   * Get the name of the device.
   * Query the device properties of the used device and set the string device_name
   */
  int cuda_getDeviceName(std::string &device_name);

  /**
   * Set CUDA device to use.
   * If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR
   */
  int cuda_setDevice(int device);

  /**
   * Get unique devices.
   * Get array of indices with the unique CUDA devices available on the platform
   */
  int cuda_getUniqueDevices(std::vector<int> &devices);

  /**
   * Info: init device
   * Return: success or error code
   */
  int cuda_setUp();

  /**
   * Info: allocate memory on cuda device
   * Return: pointer to memory object
   */
  void * cuda_allocateMemory(size_t size, int &ierr);

  /**
   * Info: allocate pinned (page locked) host memory for "size" elements of T
   * Return: success or error code
   */
  template<typename T>
  int cuda_allocateHostMemory(T *&ptr, size_t size) {
    cudaError cerror;
    cerror = cudaMallocHost((void**)&ptr, sizeof(T)*size);
    if (cerror != cudaSuccess)
      return DKS_ERROR;

    return DKS_SUCCESS;
  }

  /**
   * Info: write data to device memory.
   * "size" is the number of bytes to copy, "offset" is counted in elements
   * of T (it is added to the typed pointer mem_ptr).
   * Return: success or error code
   */
  template<typename T>
  int cuda_writeData(T * mem_ptr, const void * in_data, size_t size, int offset = 0) {
    cudaError cerror;

    cerror = cudaMemcpy(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error copying data to device, cuda error: " << cerror);
      return DKS_ERROR;
    }

    return DKS_SUCCESS;
  }

  /**
   * Info: write data asynchronously.
   * streamId == -1 selects the synchronous cuda_writeData path. Any other
   * id must be in the range [0, cuda_numberOfStreams()).
   * "size" is in bytes, "offset" in elements of T.
   * Return: success or error code
   */
  template<typename T>
  int cuda_writeDataAsync(T *mem_ptr, const void *in_data, size_t size, int streamId = -1, int offset = 0) {
    cudaError cerror;

    //if default stream or no stream specified, use default write method
    //and hand its result back to the caller instead of dropping it
    if (streamId == -1)
      return cuda_writeData(mem_ptr, in_data, size, offset);

    //negative ids other than -1 are never valid stream indices
    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
      DEBUG_MSG("Error invalid stream id: " << streamId);
      return DKS_ERROR;
    }

    //call async write
    cerror = cudaMemcpyAsync(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice,
                             cuda_getStream(streamId));
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error async data copy, cuda error: " << cerror);
      return DKS_ERROR;
    }

    return DKS_SUCCESS;
  }

  /**
   * Info: read data from device memory.
   * "size" is the number of bytes to copy, "offset" is counted in elements of T.
   * Return: success or error code
   */
  template<typename T>
  int cuda_readData(const T * mem_ptr, void * out_data, size_t size, int offset = 0) {
    cudaError cerror;

    cerror = cudaMemcpy(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error reading data from device");
      return DKS_ERROR;
    }

    return DKS_SUCCESS;
  }

  /**
   * Info: read data async from device memory.
   * streamId == -1 issues the async copy on stream 0. Any other id must be
   * in the range [0, cuda_numberOfStreams()).
   * "size" is in bytes, "offset" in elements of T.
   * Return: success or error code
   */
  template<typename T>
  int cuda_readDataAsync(const T *mem_ptr, void *out_data, size_t size, int streamId = -1, int offset = 0) {
    cudaError cerror;

    if (streamId == -1) {
      cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, 0);
      if (cerror != cudaSuccess) {
        DEBUG_MSG("Error async read from devie default stream");
        return DKS_ERROR;
      }
      return DKS_SUCCESS;
    }

    //negative ids other than -1 are never valid stream indices
    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
      DEBUG_MSG("Error invalid stream id: " << streamId);
      return DKS_ERROR;
    }

    cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost,
                             cuda_getStream(streamId));
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error async read from device, cuda error: " << cerror);
      return DKS_ERROR;
    }

    return DKS_SUCCESS;
  }

  /**
   * Info: free memory on device
   * Return: success or error code
   */
  int cuda_freeMemory(void * mem_ptr);

  /**
   * Info: free page locked memory on host
   * Return: success or error code
   */
  int cuda_freeHostMemory(void * mem_ptr);

  /**
   * Info: allocate "size" bytes on the device and write data (push)
   * Return: pointer to memory object, ierr holds the status of the
   * allocation or, if that succeeded, of the write
   */
  template<typename T>
  void * cuda_pushData(const void * in_data, size_t size, int &ierr) {
    void * mem_ptr;
    mem_ptr = cuda_allocateMemory(size, ierr);

    if (ierr == DKS_SUCCESS)
      ierr = cuda_writeData((T*)mem_ptr, in_data, size);

    return mem_ptr;
  }

  /**
   * Info: read data and free memory (pull).
   * The device memory is only freed when the read succeeded.
   * Return: success or error code (also stored in ierr)
   */
  template<typename T>
  int cuda_pullData(T * mem_ptr, void * out_data, size_t size, int &ierr) {

    ierr = cuda_readData(mem_ptr, out_data, size);
    if (ierr != DKS_SUCCESS)
      return DKS_ERROR;

    ierr = cuda_freeMemory(mem_ptr);
    return (ierr == DKS_SUCCESS) ? DKS_SUCCESS : DKS_ERROR;
  }

  /**
   * Info: execute function
   * Return: success or error code
   */
  int cuda_executeFunction();

  /**
   * Info: clean up
   * Return: success or error code
   */
  int cuda_cleanUp();

  /**
   * Info: sync cuda device
   * Return: success or error code reported by cudaDeviceSynchronize
   */
  int cuda_syncDevice() {
    if (cudaDeviceSynchronize() != cudaSuccess) {
      DEBUG_MSG("Error synchronizing device");
      return DKS_ERROR;
    }
    return DKS_SUCCESS;
  }

  /**
   * Page-lock host memory holding "size" elements of T
   */
  template<typename T>
  int cuda_hostRegister(T *ptr, int size) {
    int cerr = cudaHostRegister(ptr, size*sizeof(T), cudaHostRegisterPortable);
    if (cerr == cudaSuccess) {
      return DKS_SUCCESS;
    } else {
      DEBUG_MSG("Host memroy was not page locked");
      return DKS_ERROR;
    }
  }

  /**
   * Release page locked memory
   */
  template<typename T>
  int cuda_hostUnregister(T *ptr) {
    int cerr = cudaHostUnregister(ptr);
    if (cerr == cudaSuccess)
      return DKS_SUCCESS;
    else
      return DKS_ERROR;
  }

  /**
   * Info: print device memory info (total, used, avail).
   * Values are printed in units of 10^6 bytes.
   * Return: success or error code
   */
  int cuda_memInfo() {
    int ierr;
    size_t avail;
    size_t total;
    double mb = 1000000.0;

    ierr = cudaMemGetInfo( &avail, &total);

    if (ierr != cudaSuccess) {
      DEBUG_MSG("Device mem info could not be obtained!");
      return DKS_ERROR;
    }

    std::cout << "Device memory info, total: " << total / mb << "MB,\t";
    std::cout << "used: " << (total - avail) / mb << "MB,\t";
    std::cout << "avail: " << avail / mb << "MB" << std::endl;

    return DKS_SUCCESS;
  }

};
#endif

287
src/CUDA/CudaChiSquare.cu Normal file
View File

@ -0,0 +1,287 @@
#include "CudaChiSquare.cuh"
//simple kernel version: blockIdx.x selects the time bin and blockIdx.y the
//histogram, every launched thread handles the single element (histogram, bin)
__global__ void kernelPHistoTFFcn(double *data, double *par, double *chisq,
                                  double fTimeResolution, double fRebin, int n) {

  const int bin = blockIdx.x;
  const int hist = blockIdx.y;
  const int idx = hist * n + bin;

  const double tau = 2.197019;

  double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
  double time = dt0 + fTimeResolution * fRebin * bin;
  double w = par[0]*0.08516155035269027;

  //theory value, par[2..5 + hist*4] are the parameters of this histogram
  double theo = par[2 + hist*4] * exp(-time/tau) * (1.0 + par[3 + hist*4] * exp(-0.5 * pow(par[1]*time,2.0) ) * cos(w * time+par[4+hist*4] * 1.74532925199432955e-2) ) + par[5+hist*4];

  //empty data bins contribute theo^2, all others (theo - data)^2 / data
  double ldata = data[idx];
  chisq[idx] = (ldata != 0.0) ? (theo - ldata) * (theo - ldata) / ldata
                              : theo * theo;
}
//second version: blockIdx.x selects the time bin, the time dependent factors
//are computed once and reused in the loop over the s histograms
__global__ void kernelPHistoTFFcn_2(double *data, double *par, double *chisq,
                                    double fTimeResolution, double fRebin, int n, int s) {

  const int bin = blockIdx.x;
  const double tau = 2.197019;

  double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
  double time = dt0 + fTimeResolution * fRebin * bin;
  double w = par[0]*0.08516155035269027;

  //factors that do not depend on the histogram index
  double tt = exp(-time/tau);
  double pp = exp(-0.5 * par[1] * time * par[1] * time);
  double wt = w * time;

  for (int hist = 0; hist < s; hist++) {
    const int idx = hist * n + bin;
    const double ldata = data[idx];
    const double theo = par[2 + hist*4] * tt * (1.0 + par[3 + hist*4] * pp * cos(wt + par[4+hist*4] * 1.74532925199432955e-2) ) + par[5+hist*4];

    //empty data bins contribute theo^2, all others (theo - data)^2 / data
    chisq[idx] = (ldata != 0.0) ? (theo - ldata) * (theo - ldata) / ldata
                                : theo * theo;
  }
}
#define TAU 2.197019
//third version: one thread per time bin (global index j), parameters are
//staged in dynamically sized shared memory, which the launch must size to
//hold at least numpar doubles
__global__ void kernelPHistoTFFcn_3(double *data, double *par, double *chisq,
                                    double fTimeResolution, double fRebin,
                                    int length, int sensors, int numpar) {

  //define shared variable for parameters
  extern __shared__ double p[];

  //get thread id and calc global id
  int tid = threadIdx.x;
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the block strides over the
  //parameter list so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = tid; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {

    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
    double time = dt0 + fTimeResolution * fRebin * j;
    double w = p[0]*0.08516155035269027;
    double tt = exp(-time/TAU);
    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
    double wt = w * time;

    int idx;
    double ldata, theo;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      ldata = data[idx];

      // 1.74532925199432955e-2 = pi/180
      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];

      if (ldata != 0.0)
        chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
      else
        chisq[idx] = theo * theo;
    }
  }
}
//one thread per time bin (global index j), looping over all sensors.
//Bins before t0[i] + fGoodBinOffset/fRebin get result 0, all others get
//(theo - data) + data*log(data/theo), or theo - data when data or |theo|
//is not above 1e-9. Parameters are staged in dynamically sized shared
//memory, which the launch must size to hold at least numpar doubles.
__global__ void kernelSingleGaussTF(double *data, unsigned int *t0, double *par, double *result,
                                    double fTimeResolution, double fRebin, double fGoodBinOffset,
                                    int length, int sensors, int numpar)
{

  //define shared variable for parameters
  extern __shared__ double p[];

  //get thread id and calc global id
  int tid = threadIdx.x;
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the block strides over the
  //parameter list so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = tid; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {

    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    //p[0] holds the same value as par[0] after the copy above
    double w1 = p[0]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0 + fTimeResolution * fRebin* (j - lft0);
        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
                                       *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
        // 1.74532925199432955e-2 = pi/180

        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }
}
//one thread per time bin (global index j), looping over all sensors.
//Bins before t0[i] + fGoodBinOffset/fRebin get result 0, all others get
//(theo - data) + data*log(data/theo), or theo - data when data or |theo|
//is not above 1e-9. p[0..3] are shared by all sensors, sensor i uses
//p[4+i*5] .. p[8+i*5]. Parameters are staged in dynamically sized shared
//memory, which the launch must size to hold at least numpar doubles.
__global__ void kernelDoubleLorentzTF(double *data, unsigned int *t0, double *par, double *result,
                                      double fTimeResolution, double fRebin, double fGoodBinOffset,
                                      int length, int sensors, int numpar)
{

  //define shared variable for parameters
  extern __shared__ double p[];

  //get thread id and calc global id
  int tid = threadIdx.x;
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; the block strides over the
  //parameter list so that all numpar values are copied even when numpar is
  //larger than the number of threads in the block
  for (int k = tid; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {

    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    double w1 = p[0]*0.08516155035269027;
    double w2 = p[2]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0+fTimeResolution*fRebin*(j-lft0);

        theo = p[4+i*5]*exp(-time/TAU)*
          (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
           cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
           (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
           cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5];
        // 1.74532925199432955e-2 = pi/180

        if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }
}
int CudaChiSquare::cuda_PHistoTFFcn(void *mem_data, void *mem_ptr, void *mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar,
double &result)
{
int threads = 128;
int blocks = length / threads + 1;
kernelPHistoTFFcn_3<<<blocks, threads, numpar*sizeof(double) >>>((double*)mem_data,
(double*)mem_ptr,
(double*)mem_chisq,
fTimeResolution,
fRebin, length,
sensors, numpar);
cublasStatus_t status;
status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_chisq, 1, &result);
if (status != CUBLAS_STATUS_SUCCESS) {
DEBUG_MSG("cublas asum failed");
return DKS_ERROR;
}
return DKS_SUCCESS;
}
int CudaChiSquare::cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
int threads = 128;
int blocks = length / threads + 1;
kernelSingleGaussTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
(unsigned int*)mem_t0,
(double*)mem_par,
(double*)mem_result,
fTimeResolution,
fRebin,
fGoodBinOffset,
length, sensors, numpar);
cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
result = 2.0 * result;
return DKS_SUCCESS;
}
int CudaChiSquare::cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
int threads = 128;
int blocks = length / threads + 1;
kernelDoubleLorentzTF<<<blocks, threads, numpar*sizeof(double) >>>( (double*)mem_data,
(unsigned int*)mem_t0,
(double*)mem_par,
(double*)mem_result,
fTimeResolution,
fRebin,
fGoodBinOffset,
length, sensors, numpar);
cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
result = 2.0 * result;
return DKS_SUCCESS;
}

View File

@ -0,0 +1,59 @@
#ifndef H_CUDA_CHISQUARE
#define H_CUDA_CHISQUARE
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include "CudaBase.cuh"
class CudaChiSquare {
private:
bool base_create;
CudaBase *m_base;
public:
/**
* Constructor which gets CudaBase as argument
*/
CudaChiSquare(CudaBase *base) {
m_base = base;
base_create = false;
}
/* constructor */
CudaChiSquare() {
m_base = new CudaBase();
base_create = true;
}
/* destructor */
~CudaChiSquare() {
if (base_create)
delete m_base;
}
/* PHistoTFFcn calculation */
int cuda_PHistoTFFcn(void * mem_data, void * mem_par, void * mem_chisq,
double fTimeResolution, double fRebin,
int sensors, int length, int numpar,
double &result);
int cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result);
int cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result);
};
#endif

View File

@ -0,0 +1,313 @@
#include "CudaChiSquareRuntime.cuh"
//constructor which uses a CudaBase supplied by the caller (not owned, the
//destructor leaves it alone) and creates the context
CudaChiSquareRuntime::CudaChiSquareRuntime(CudaBase *base) {

  //base object handling
  base_create = false;
  m_base = base;

  //launch configuration defaults, no compiled ptx yet
  ptx_m = NULL;
  numBlocks_m = -1;
  blockSize_m = BLOCK_SIZE;

  setUpContext();
}
//constructor which creates its own CudaBase (deleted again in the
//destructor) and creates the context
CudaChiSquareRuntime::CudaChiSquareRuntime() {

  //launch configuration defaults, no compiled ptx yet
  ptx_m = NULL;
  numBlocks_m = -1;
  blockSize_m = BLOCK_SIZE;

  //base object handling
  m_base = new CudaBase();
  base_create = true;

  setUpContext();
}
//free resources
CudaChiSquareRuntime::~CudaChiSquareRuntime() {

  delete[] ptx_m;

  //release the cublas handle and the device buffers while the context
  //created in setUpContext is still alive; destroying the context first
  //leaves freeChiSquare working on resources of a context that is gone
  freeChiSquare();

  cuCtxDestroy(context_m);

  if (base_create)
    delete m_base;
}
//init the driver API, create a context on device 0 and reset the fit
//constants; return codes of the driver calls are not checked here
void CudaChiSquareRuntime::setUpContext() {

  //driver API: init, take device 0, create a context on it
  cuInit(0);
  cuDeviceGet(&cuDevice_m, 0);
  cuCtxCreate(&context_m, 0, cuDevice_m);

  //nothing is allocated until initChiSquare is called
  initDone_m = false;

  //default values of the fit constants
  bkg_m = 1.0;
  tau_m = 1.0;
  N0_m = 1.0;
}
//build program string
std::string CudaChiSquareRuntime::buildProgram(std::string function) {
long fsize;
char *kernel_source;
//get kernel source
char * kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, "CUDA/NVRTCKernels/CudaChiSquareKernel.cu");
//read kernels from file
FILE *fp = fopen(kernel_file, "rb");
if (!fp)
DEBUG_MSG("Can't open kernel file" << kernel_file);
//get file size and allocate memory
fseek(fp, 0, SEEK_END);
fsize = ftell(fp);
kernel_source = new char[fsize+1];
//read file and content in kernel source
rewind(fp);
fread(kernel_source, 1, sizeof(char)*fsize, fp);
kernel_source[fsize] = '\0';
fclose(fp);
std::string kernel_string (kernel_source);
return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter;
}
//compile the program built from "function" with NVRTC, keep the resulting
//ptx in ptx_m and load it into module_m. When mlh is true the program is
//compiled with -DMLH in addition to -fmad=false.
//Return: DKS_SUCCESS, or DKS_ERROR if any step fails. The NVRTC program
//object is destroyed on every path.
int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {

  //build program string
  std::string cudaProg = buildProgram(function);

  //create program
  nvrtcProgram prog;
  //std::cout << cudaProg.c_str() << std::endl;
  if (nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL)
      != NVRTC_SUCCESS) {
    DEBUG_MSG("Create program failed!");
    return DKS_ERROR;
  }

  //compile program
  const char *opts[] = {"-fmad=false", ""};
  int numopts = 1;
  if (mlh) {
    opts[1] = "-DMLH";
    numopts = 2;
  }

  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);

  if (compileResults != NVRTC_SUCCESS) {
    //obtain compilation log
    size_t logSize = 0;
    nvrtcGetProgramLogSize(prog, &logSize);
    //one spare byte and an empty string up front keep the buffer printable
    //even if the log can not be fetched
    char *log = new char[logSize + 1];
    log[0] = '\0';
    nvrtcGetProgramLog(prog, log);

    DEBUG_MSG("Compilation failed!");
    DEBUG_MSG(log);

    delete[] log;
    nvrtcDestroyProgram(&prog);
    return DKS_ERROR;
  } else {
    DEBUG_MSG("Compilation successfull!");
  }

  //obtain PTX from program, drop the ptx of a previous compilation first
  delete[] ptx_m;
  ptx_m = NULL;

  size_t ptxSize = 0;
  if (nvrtcGetPTXSize(prog, &ptxSize) != NVRTC_SUCCESS) {
    DEBUG_MSG("Get PTX failed!");
    nvrtcDestroyProgram(&prog);
    return DKS_ERROR;
  }

  ptx_m = new char[ptxSize];
  nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);

  if (nvrtcPTXResult != NVRTC_SUCCESS) {
    DEBUG_MSG("Get PTX failed!");
    nvrtcDestroyProgram(&prog);
    return DKS_ERROR;
  }

  //the ptx is stored in ptx_m, the program object is not needed any more
  nvrtcDestroyProgram(&prog);

  //load module from ptx
  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0);
  if (loadResult != CUDA_SUCCESS) {
    DEBUG_MSG("Load module from ptx failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
int CudaChiSquareRuntime::launchChiSquare(int fitType,
void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double &result)
{
if (!initDone_m) {
DEBUG_MSG("ChiSquare init needs to be called at some point!");
return DKS_ERROR;
}
int blocks;
int threads = blockSize_m;
if (numBlocks_m < 0)
blocks = length / threads + 1;
else
blocks = numBlocks_m;
CUresult cuStatus;
void **args = 0;
if (fitType == FITTYPE_SINGLE_HISTO) {
cuStatus = cuModuleGetFunction(&kernel_m, module_m, "kernelChiSquareSingleHisto");
if (cuStatus != CUDA_SUCCESS) {
DEBUG_MSG("Failed to get function from module!");
return DKS_ERROR;
}
args = (void**) malloc(15 * sizeof(void*));
args[0] = &mem_data;
args[1] = &mem_err;
args[2] = &mem_param_m;
args[3] = &mem_chisq_m;
args[4] = &mem_map_m;
args[5] = &mem_func_m;
args[6] = &length;
args[7] = &numpar;
args[8] = &numfunc;
args[9] = &nummap;
args[10] = &timeStart;
args[11] = &timeStep;
args[12] = &tau_m;
args[13] = &N0_m;
args[14] = &bkg_m;
} else if (fitType == FITTYPE_ASYMMETRY) {
cuStatus = cuModuleGetFunction(&kernel_m, module_m, "kernelChiSquareAsymmetry");
if (cuStatus != CUDA_SUCCESS) {
DEBUG_MSG("Failed to get function from module!");
return DKS_ERROR;
}
args = (void**) malloc(14 * sizeof(void*));
args[0] = &mem_data;
args[1] = &mem_err;
args[2] = &mem_param_m;
args[3] = &mem_chisq_m;
args[4] = &mem_map_m;
args[5] = &mem_func_m;
args[6] = &length;
args[7] = &numpar;
args[8] = &numfunc;
args[9] = &nummap;
args[10] = &timeStart;
args[11] = &timeStep;
args[12] = &alpha_m;
args[13] = &beta_m;
} else if (fitType == FITTYPE_MU_MINUS) {
DEBUG_MSG("Not Yet Implemented!");
return DKS_ERROR;
} else {
DEBUG_MSG("Undefined Fit Type!");
return DKS_ERROR;
}
cuStatus = cuLaunchKernel(kernel_m,
blocks, 1, 1,
threads, 1, 1,
(numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL,
args, 0);
if (cuStatus != CUDA_SUCCESS) {
std::string msg;
msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")";
DEBUG_MSG(msg);
const char *desc;
cuGetErrorString(cuStatus, &desc);
std::cout << desc << std::endl;
return DKS_ERROR;
}
cublasStatus_t status;
status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result);
if (status != CUBLAS_STATUS_SUCCESS) {
DEBUG_MSG("cublas sum failed!");
return DKS_ERROR;
}
// cleanup
if (args)
free(args);
return DKS_SUCCESS;
}
//copy numparams doubles from params to mem_param_m on the device
int CudaChiSquareRuntime::writeParams(const double *params, int numparams) {
  double *dst = (double*)mem_param_m;
  return m_base->cuda_writeData(dst, params, sizeof(double)*numparams);
}
//copy numfunc doubles from func to mem_func_m on the device
int CudaChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  double *dst = (double*)mem_func_m;
  return m_base->cuda_writeData(dst, func, sizeof(double)*numfunc);
}
//copy nummap ints from map to mem_map_m on the device
int CudaChiSquareRuntime::writeMap(const int *map, int nummap) {
  int *dst = (int*)mem_map_m;
  return m_base->cuda_writeData(dst, map, sizeof(int)*nummap);
}
int CudaChiSquareRuntime::initChiSquare(int size_data, int size_param, int size_func,
int size_map) {
int ierr = DKS_ERROR;
if (initDone_m) {
DEBUG_MSG("Reinitializing ChiSquare");
freeChiSquare();
}
//init cublas
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasCreate(&defaultCublasRT);
if (status != CUBLAS_STATUS_SUCCESS)
DEBUG_MSG("CUBLAS create default handle failed!");
//allocate temporary memory
mem_chisq_m = m_base->cuda_allocateMemory(size_data*sizeof(double), ierr);
mem_param_m = m_base->cuda_allocateMemory(size_param*sizeof(double), ierr);
mem_func_m = m_base->cuda_allocateMemory(size_func*sizeof(double), ierr);
mem_map_m = m_base->cuda_allocateMemory(size_map*sizeof(int), ierr);
initDone_m = true;
return ierr;
}
int CudaChiSquareRuntime::freeChiSquare() {
int ierr = DKS_ERROR;
if (initDone_m) {
//delete cublas
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
status = cublasDestroy(defaultCublasRT);
if (status != CUBLAS_STATUS_SUCCESS) {
DEBUG_MSG("CUBLAS delete default handle failed!");
return DKS_ERROR;
}
//free memory
ierr = m_base->cuda_freeMemory(mem_chisq_m);
ierr = m_base->cuda_freeMemory(mem_param_m);
ierr = m_base->cuda_freeMemory(mem_func_m);
ierr = m_base->cuda_freeMemory(mem_map_m);
initDone_m = false;
}
return ierr;
}

View File

@ -0,0 +1,114 @@
#ifndef H_CUDA_CHISQUARE_RUNTIME
#define H_CUDA_CHISQUARE_RUNTIME
#include <iostream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
#include "../Algorithms/ChiSquareRuntime.h"
#include "CudaBase.cuh"
// Opening of the generated fTheory device function; buildProgram appends
// "return <expression>;" and then cudaFunctFooter to it.
const std::string cudaFunctHeader = "__device__ double fTheory(double t, double *p, double *f, int *m) {";
// Closing brace of the generated fTheory device function.
const std::string cudaFunctFooter = "}\n";
/** CUDA implementation of ChiSquareRuntime.
* Builds the chi square kernels at run time with NVRTC, loads them through the
* CUDA driver API and sums the per-bin results with cublas.
*/
class CudaChiSquareRuntime : public ChiSquareRuntime{
private:
// true when m_base was created by this object and has to be deleted by it
bool base_create;
CudaBase *m_base;
// driver API handles: device, context, module loaded from ptx, kernel to launch
CUdevice cuDevice_m;
CUcontext context_m;
CUmodule module_m;
CUfunction kernel_m;
// cublas handle created in initChiSquare and destroyed in freeChiSquare
cublasHandle_t defaultCublasRT;
/** Setup to init device
* Create context and init device for RT compilation
*/
void setUpContext();
/** Private function to add function to kernel string
* Returns the kernel source followed by a generated fTheory device function
* that returns the given expression.
*/
std::string buildProgram(std::string function);
public:
/** Constructor with CudaBase argument
* The CudaBase object is used but not owned.
*/
CudaChiSquareRuntime(CudaBase *base);
/** Default constructor init cuda device
* Creates its own CudaBase object.
*/
CudaChiSquareRuntime();
/** Default destructor
*
*/
~CudaChiSquareRuntime();
/** Compile program and save ptx.
* Add function string to the calcFunction kernel and compile the program
* Function must be valid C math expression. Parameters can be addressed in
* a form par[map[idx]]
* If mlh is true the program is compiled with -DMLH.
*/
int compileProgram(std::string function, bool mlh = false);
/** Launch selected kernel
* Launches the selected kernel from the compiled code.
* Result is put in &result variable
*/
int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result);
/** Write params to device.
* Write params from double array to mem_param_m memory on the device.
*/
int writeParams(const double *params, int numparams);
/** Write functions to device.
* Write function values from double array to mem_func_m memory on the device.
*/
int writeFunc(const double *func, int numfunc);
/** Write maps to device.
* Write map values from int array to mem_map_m memory on the device.
*/
int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square.
* Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
* be the maximum number of elements in any dataset that will be used for calculations. Size_param,
* size_func and size_map are the maximum number of parameters, functions and maps used in
* calculations.
*/
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square.
* Frees the chisq temporary memory and memory for params, functions and maps
*/
int freeChiSquare();
/** Check if CUDA device is able to run the chi square kernel.
* Redundant - all new CUDA devices that support RT compilation will also support
* double precision, there are no other requirements to run chi square on GPU
* Both arguments are unused; always returns DKS_SUCCESS.
*/
int checkChiSquareKernels(int fitType, int &threadsPerBlock) {
return DKS_SUCCESS;
}
};
#endif

View File

@ -0,0 +1,728 @@
#include "CudaCollimatorPhysics.cuh"
//constants used by the collimator physics device functions
//#define M_P 0.93827231e+00
#define M_P 0.93827204e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
//#define eM_E 0.51099906e-03
#define eM_E 0.51099892e-03
#define Z_P 1
//parenthesized so that the product stays one operand wherever K is used
//(e.g. as a divisor); the existing uses K * ... and -K * ... are unchanged
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)

//indices into the material / degrader parameter array "par"
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11

//threads per block and number of entries in the parameter array
#define BLOCK_SIZE 128
#define NUMPAR 12
//dot product of two double3 vectors
__device__ inline double dot(double3 &d1, double3 &d2) {
  const double result = d1.x * d2.x + d1.y * d2.y + d1.z * d2.z;
  return result;
}
//true when z lies in the interval (par[POSITION], par[POSITION] + par[ZSIZE]],
//i.e. the particle is in the degrader material
__device__ inline bool checkHit(double &z, double *par) {
  if (!(z > par[POSITION]))
    return false;
  return z <= par[POSITION] + par[ZSIZE];
}
//Update the energy Eng of a particle for one step of length par[DT_M] in the
//material described by par, and set pdead when the particle is to be removed.
//Two energy ranges are handled: (0.00001, 0.0006) and >= 0.0006; in both a
//mean loss deltasrho * dEdx plus a random term sigma_E * normal() drawn from
//"state" is added to Eng (scaled by 1/1E3). Energies at or below 0.00001 are
//left unchanged.
//NOTE(review): the formulas look like a low energy stopping power fit and a
//Bethe-Bloch type expression, and the energy unit looks like GeV - confirm.
__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par)
{
volatile double dEdx = 0.0;
//relativistic factors from kinetic energy and the mass constant M_P
volatile double gamma = (Eng + M_P) / M_P;
volatile double gamma2 = gamma * gamma;
double beta = sqrt(1.0 - 1.0 / gamma2);
volatile double beta2 = beta * beta;
//path length of this time step and the same length weighted with the density
double deltas = par[DT_M] * beta * C;
volatile double deltasrho = deltas * 100 * par[RHO_M];
//width of the random energy fluctuation for this step
volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);
//lower energy range, uses the coefficients A2_C .. A5_C
if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
double Ts = (Eng * 1E6) / 1.0073;
double epsilon_low = par[A2_C] * pow(Ts, 0.45);
double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
//upper energy range; tested on the current Eng, which the branch above may
//already have changed
if (Eng >= 0.0006) {
double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state);
Eng = Eng + delta_E / 1E3;
}
//particle is dead when its energy dropped below 1E-4 or dEdx came out positive
pdead = ((Eng<1E-4) || (dEdx>0));
}
//Rotate the momentum components (px, pz) by the angle thetacou and, for
//coord 1 and 2, shift the position components (x, z) by the step deltas and
//the displacement xplane. Any other coord value (0 is used by the caller)
//only rotates the momentum. The argument par is not used.
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
                           double &normP, double &thetacou, double &deltas, int coord,
                           double *par)
{

  //angle of the momentum in the plane; the offset added to atan depends on
  //the signs of px and pz
  double angle;
  if (px>=0 && pz>=0)
    angle = atan(px/pz);
  else if (px<0 && pz>0)
    angle = atan(px/pz) + 2*PI;
  else
    angle = atan(px/pz) + PI;

  //length of the momentum in the plane
  const double pPlane = sqrt(px*px + pz*pz);

  switch (coord) {
  case 1:
    x = x + deltas * px/normP + xplane*cos(angle);
    z = z - xplane * sin(angle);
    break;
  case 2:
    x = x + deltas * px/normP + xplane*cos(angle);
    z = z - xplane * sin(angle) + deltas * pz / normP;
    break;
  default:
    break;
  }

  //rotate the momentum by thetacou
  px = pPlane*cos(angle)*sin(thetacou) + pPlane*sin(angle)*cos(thetacou);
  pz = -pPlane*sin(angle)*sin(thetacou) + pPlane*cos(angle)*cos(thetacou);
}
//Apply random scattering to position R and momentum P of one particle for a
//step of length par[DT_M], first in the x-z plane and then in the y-z plane.
//Per plane: a scattering angle z2*theta0 is drawn (redrawn while it exceeds
//3.5*theta0 in magnitude), position and momentum are updated through Rot, and
//with probability 0.0047 a second, larger rotation thetaru is applied to the
//momentum only (coord 0). All random numbers are drawn from "state", so the
//order of the curand calls fixes the result.
//NOTE(review): theta0 has the form of the usual multiple scattering angle
//formula and the rare branch looks like single large angle scattering - confirm.
__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) {
//kinetic energy and relativistic factors from the momentum P
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
//path length of this time step
double deltas = par[DT_M] * beta * C;
//width of the scattering angle distribution, depends on deltas / par[X0_M]
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double thetacou = z2 * theta0;
//redraw both numbers until the angle is within 3.5 * theta0
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
//displacement in the plane, built from the same two random numbers
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par);
//rare second rotation of the momentum with a random sign
double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P2 < 0.0047) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par);
}
// y-direction: See Physical Review, "Multiple Scattering"
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
//redraw both numbers until the angle is within 3.5 * theta0
while(fabs(thetacou) > 3.5 * theta0) {
z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
//__syncthreads();
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
//coord 2: this call also advances z by deltas * pz / normP
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par);
//rare second rotation of the momentum with a random sign
P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P2 < 0.0047) {
double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par);
}
}
/**
 * Collimator physics step for particles stored as an array of structures.
 *
 * Dynamic shared memory holds the NUMPAR parameters followed by one double3 position
 * per thread, so the launch has to provide
 * sizeof(double) * NUMPAR + sizeof(double3) * blockDim.x bytes.
 *
 * For every particle: if checkHit() accepts its z position the particle loses energy
 * (energyLoss); a particle flagged dead gets label -1, otherwise its momentum is rescaled
 * to the new energy and it is scattered (coulombScat). A particle rejected by checkHit()
 * drifts for one time step and gets label -2.
 *
 * @param data         particle array, elements must provide Rincol, Pincol and label
 * @param par          device array with NUMPAR material / step parameters
 * @param state        one curand state per particle
 * @param numparticles number of particles in data
 */
template <typename T>
__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state,
                                        int numparticles)
{
  // thread index inside the block and global particle index
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  // shared memory layout: [ NUMPAR doubles | blockDim.x double3 ]
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double3 *R = (double3*)&smem[NUMPAR];

  // all threads of the block cooperate in loading the parameters
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

  __syncthreads();

  // no barrier follows, threads without a particle can leave here
  if (idx >= numparticles)
    return;

  curandState s = state[idx];
  R[tid] = data[idx].Rincol;
  double3 P = data[idx].Pincol;

  bool pdead = false;
  volatile double sq = sqrt(1.0 + dot(P, P));

  if (checkHit(R[tid].z, p)) {
    // kinetic energy before the material step
    double Eng = (sq - 1) * M_P;
    energyLoss(Eng, pdead, s, p);

    if (pdead) {
      data[idx].label = -1;
    } else {
      // rescale the momentum to the energy left after the loss, then scatter
      double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
      sq = sqrt(dot(P, P));
      P.x = P.x * ptot / sq;
      P.y = P.y * ptot / sq;
      P.z = P.z * ptot / sq;
      coulombScat(R[tid], P, s, p);
      data[idx].Pincol = P;
    }

    // the random state is only advanced on this branch
    state[idx] = s;
  } else {
    // plain drift over one time step
    R[tid].x += p[DT_M] * C * P.x / sq;
    R[tid].y += p[DT_M] * C * P.y / sq;
    R[tid].z += p[DT_M] * C * P.z / sq;
    data[idx].label = -2;
  }

  data[idx].Rincol = R[tid];
}
/**
 * Collimator physics step for particles stored as a structure of arrays.
 *
 * Parameters and per-thread positions are kept in statically sized shared arrays,
 * therefore the kernel must be launched with at most BLOCK_SIZE threads per block.
 *
 * For every particle: if checkHit() accepts its z position the particle loses energy
 * (energyLoss); a particle flagged dead gets label -1, otherwise its momentum is rescaled
 * to the new energy and it is scattered (coulombScat). A particle rejected by checkHit()
 * drifts for one time step and gets label -2.
 *
 * @param data         structure holding the device arrays Rincol, Pincol and label
 * @param par          device array with NUMPAR material / step parameters
 * @param state        one curand state per particle
 * @param numparticles number of particles
 */
__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par,
                                         curandState *state, int numparticles)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  //transfer params to shared memory
  __shared__ double p[NUMPAR];
  __shared__ double3 R[BLOCK_SIZE];

  // strided copy: every parameter is loaded even when blockDim.x < NUMPAR
  for (int tt = tid; tt < NUMPAR; tt += blockDim.x)
    p[tt] = par[tt];

  __syncthreads();

  curandState s;
  double3 P;

  if (idx < numparticles) {
    R[tid] = data.Rincol[idx];
    P = data.Pincol[idx];
    s = state[idx];

    double sq = sqrt(1.0 + dot(P, P));
    bool pdead = false;

    if (checkHit(R[tid].z, p)) {
      // kinetic energy before the material step
      double Eng = (sq - 1) * M_P;
      energyLoss(Eng, pdead, s, p);

      if (!pdead) {
        // rescale the momentum to the energy left after the loss, then scatter
        double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
        sq = sqrt(dot(P, P));
        P.x = P.x * ptot / sq;
        P.y = P.y * ptot / sq;
        P.z = P.z * ptot / sq;
        coulombScat(R[tid], P, s, p);
        data.Pincol[idx] = P;
      } else {
        data.label[idx] = -1;
      }
    } else {
      // plain drift over one time step
      R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq;
      R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq;
      R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq;
      data.label[idx] = -2;
    }

    data.Rincol[idx] = R[tid];
    state[idx] = s;
  }
}
/** Multiply every component of a by c (switch the unitless representation off). */
inline __device__ void unitlessOff(double3 &a, const double &c) {
  a.x = a.x * c;
  a.y = a.y * c;
  a.z = a.z * c;
}
/** Divide every component of a by c (switch the unitless representation on). */
inline __device__ void unitlessOn(double3 &a, const double &c) {
  a.x = a.x / c;
  a.y = a.y / c;
  a.z = a.z / c;
}
/**
 * Switch to unitless positions: divide gR[i] and gX[i] by the common factor dtc.
 * One thread handles one particle; threads beyond npart do nothing.
 */
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;

  // load both vectors before either one is written back
  double3 vecR = gR[idx];
  double3 vecX = gX[idx];

  unitlessOn(vecR, dtc);
  unitlessOn(vecX, dtc);

  gR[idx] = vecR;
  gX[idx] = vecX;
}
/**
 * Switch to unitless positions with a per-particle time step:
 * divide gR[i] and gX[i] by gdt[i] * c.
 *
 * The previous implementation called unitlessOff() (a multiplication), which made this
 * kernel identical to kernelSwitchOffUnitlessPositions; switching *to* unitless
 * positions has to divide, exactly like the overload taking a single dtc value.
 *
 * @param gR    device array of npart vectors, scaled in place
 * @param gX    device array of npart vectors, scaled in place
 * @param gdt   device array with one time step per particle
 * @param c     factor multiplied with every time step
 * @param npart number of particles
 */
__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < npart) {
    double3 R = gR[idx];
    double3 X = gX[idx];
    double dt = gdt[idx];

    unitlessOn(R, dt*c);
    unitlessOn(X, dt*c);

    gR[idx] = R;
    gX[idx] = X;
  }
}
/**
 * Switch off unitless positions: multiply gR[i] and gX[i] by the common factor dtc.
 * One thread handles one particle; threads beyond npart do nothing.
 */
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;

  // load both vectors before either one is written back
  double3 vecR = gR[idx];
  double3 vecX = gX[idx];

  unitlessOff(vecR, dtc);
  unitlessOff(vecX, dtc);

  gR[idx] = vecR;
  gX[idx] = vecX;
}
/**
 * Switch off unitless positions with a per-particle time step:
 * multiply gR[i] and gX[i] by gdt[i] * c.
 * One thread handles one particle; threads beyond npart do nothing.
 */
__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;

  // load everything before the first store
  double3 vecR = gR[idx];
  double3 vecX = gX[idx];
  double step = gdt[idx];

  unitlessOff(vecR, step*c);
  unitlessOff(vecX, step*c);

  gR[idx] = vecR;
  gX[idx] = vecX;
}
/**
 * Half-step position push with a common dt*c factor.
 *
 * The position is divided by dtc, advanced by 0.5 * P / sqrt(1 + P.P) and multiplied
 * by dtc again. gP is only read.
 *
 * @param gR    device array of positions, updated in place
 * @param gP    device array of momenta
 * @param npart number of particles
 * @param dtc   time step multiplied by c
 */
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) {

  volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= npart)
    return;

  double3 pos = gR[idx];
  double3 mom = gP[idx];

  // work in unitless coordinates for the push itself
  unitlessOn(pos, dtc);

  double gamma = sqrt(1.0 + dot(mom, mom));
  pos.x += 0.5 * mom.x / gamma;
  pos.y += 0.5 * mom.y / gamma;
  pos.z += 0.5 * mom.z / gamma;

  // back to positions with units
  unitlessOff(pos, dtc);

  gR[idx] = pos;
}
/**
 * Half-step position push with a per-particle time step.
 *
 * The position is divided by gdt[i]*c, advanced by 0.5 * P / sqrt(1 + P.P) and
 * multiplied by gdt[i]*c again. gP and gdt are only read.
 *
 * @param gR    device array of positions, updated in place
 * @param gP    device array of momenta
 * @param npart number of particles
 * @param gdt   device array with one time step per particle
 * @param c     factor multiplied with every time step
 */
__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) {

  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {
    double3 R = gR[idx];
    double3 P = gP[idx];
    double dt = gdt[idx];

    //switch to unitless positions with dt*c
    unitlessOn(R, dt*c);

    //push; the normalisation is the same for all three components, compute it once
    const double tmp = sqrt(1.0 + dot(P, P));
    R.x += 0.5 * P.x / tmp;
    R.y += 0.5 * P.y / tmp;
    R.z += 0.5 * P.z / tmp;

    //switch off unitless positions with dt*c
    unitlessOff(R, dt*c);

    gR[idx] = R;
  }
}
//TODO: kernel for push with switch off unitless positions with dt[i]*c
/**
 * Rotate vec with the rotation built from the three angles stored in ori
 * (ori.x, ori.y and ori.z are passed to sin/cos) and return the rotated vector.
 */
__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) {

  // sines and cosines of the three orientation angles
  const double sina = sin(ori.x);
  const double cosa = cos(ori.x);
  const double sinb = sin(ori.y);
  const double cosb = cos(ori.y);
  const double sinc = sin(ori.z);
  const double cosc = cos(ori.z);

  double3 rotated;

  // first row of the rotation matrix
  rotated.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z;

  // second row
  rotated.y = (-cosb * sinc - sina * sinb * cosc) * vec.x +
    (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z;

  // third row
  rotated.z = (-sinb * sinc + sina * cosb * cosc) * vec.x +
    (sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z;

  return rotated;
}
/**
 * Half-step push of gX using the momentum rotated into the orientation of the
 * particle's last section, with a common dt*c factor.
 *
 * If gLastSection[i] is a valid index into gOrient (0 <= index < nsect) the momentum is
 * rotated with those angles, otherwise all angles are zero. The position is divided by
 * dtc, advanced by 0.5 * P' / sqrt(1 + P'.P') and multiplied by dtc again.
 *
 * @param gX           device array of positions, updated in place
 * @param gP           device array of momenta (read only)
 * @param gLastSection device array with the last section index of every particle
 * @param gOrient      device array with nsect orientation angle triples
 * @param npart        number of particles
 * @param nsect        number of entries in gOrient
 * @param dtc          time step multiplied by c
 */
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
                                    int npart, int nsect, double dtc)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {
    double3 X = gX[idx];
    double3 P = gP[idx];
    long lLastSection = gLastSection[idx];

    // orientation of the last section, zero angles when the index is out of range
    double3 ori;
    if (lLastSection > -1 && lLastSection < nsect) {
      ori = gOrient[lLastSection];
    } else {
      ori.x = 0.0;
      ori.y = 0.0;
      ori.z = 0.0;
    }

    double3 tmp = deviceTransformTo(P, ori);

    unitlessOn(X, dtc);

    // the normalisation is the same for all three components, compute it once
    const double norm = sqrt(1.0 + dot(tmp, tmp));
    X.x += 0.5 * tmp.x / norm;
    X.y += 0.5 * tmp.y / norm;
    X.z += 0.5 * tmp.z / norm;

    unitlessOff(X, dtc);

    gX[idx] = X;
  }
}
/**
 * Half-step push of gX using the momentum rotated into the orientation of the
 * particle's last section, with a per-particle time step.
 *
 * If gLastSection[i] is a valid index into gOrient (0 <= index < nsect) the momentum is
 * rotated with those angles, otherwise all angles are zero. The position is divided by
 * gdt[i]*c, advanced by 0.5 * P' / sqrt(1 + P'.P') and multiplied by gdt[i]*c again.
 *
 * @param gX           device array of positions, updated in place
 * @param gP           device array of momenta (read only)
 * @param gLastSection device array with the last section index of every particle
 * @param gOrient      device array with nsect orientation angle triples
 * @param npart        number of particles
 * @param nsect        number of entries in gOrient
 * @param gdt          device array with one time step per particle
 * @param c            factor multiplied with every time step
 */
__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient,
                                    int npart, int nsect, double *gdt, double c)
{
  //get global id and thread id
  volatile int tid = threadIdx.x;
  volatile int idx = blockIdx.x * blockDim.x + tid;

  if (idx < npart) {
    double3 X = gX[idx];
    double3 P = gP[idx];
    long lLastSection = gLastSection[idx];
    double dt = gdt[idx];

    // orientation of the last section, zero angles when the index is out of range
    double3 ori;
    if (lLastSection > -1 && lLastSection < nsect) {
      ori = gOrient[lLastSection];
    } else {
      ori.x = 0.0;
      ori.y = 0.0;
      ori.z = 0.0;
    }

    double3 tmp = deviceTransformTo(P, ori);

    unitlessOn(X, dt*c);

    // the normalisation is the same for all three components, compute it once
    const double norm = sqrt(1.0 + dot(tmp, tmp));
    X.x += 0.5 * tmp.x / norm;
    X.y += 0.5 * tmp.y / norm;
    X.z += 0.5 * tmp.z / norm;

    unitlessOff(X, dt*c);

    gX[idx] = X;
  }
}
/**
 * Functor over CUDA_PART labels.
 * The binary form orders particles by descending label, the unary form tells whether
 * a particle's label lies below the configured threshold (0 unless changed).
 */
struct compare_particle
{
  int threshold;

  compare_particle() : threshold(0) {}

  /** Set the label limit used by the unary predicate. */
  void set_threshold(int t) {
    threshold = t;
  }

  /** Ordering: p1 comes before p2 when its label is larger. */
  __host__ __device__
  bool operator()(CUDA_PART p1, CUDA_PART p2) {
    return p2.label < p1.label;
  }

  /** Predicate: true when the label is smaller than the threshold. */
  __host__ __device__
  bool operator()(CUDA_PART p1) {
    return threshold > p1.label;
  }
};
/**
 * Functor over CUDA_PART_SMALL labels.
 * The binary form orders particles by descending label, the unary form tells whether
 * a particle's label lies below the configured threshold (0 unless changed).
 */
struct compare_particle_small
{
  int threshold;

  compare_particle_small() : threshold(0) {}

  /** Set the label limit used by the unary predicate. */
  void set_threshold(int t) {
    threshold = t;
  }

  /** Ordering: p1 comes before p2 when its label is larger. */
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) {
    return p2.label < p1.label;
  }

  /** Predicate: true when the label is smaller than the threshold. */
  __host__ __device__
  bool operator()(CUDA_PART_SMALL p1) {
    return threshold > p1.label;
  }
};
/** Unary predicate: true for negative integers. */
struct less_then
{
  __host__ __device__
  bool operator()(int x)
  {
    return 0 > x;
  }
};
/**
 * Launch the collimator physics kernel on an array of CUDA_PART_SMALL particles.
 *
 * @param mem_ptr      device pointer to the particle array (CUDA_PART_SMALL)
 * @param par_ptr      device pointer to the parameter array (double, NUMPAR entries)
 * @param numparticles number of particles in the array
 * @return DKS_SUCCESS, or DKS_ERROR when the CUDA runtime reports an error after the
 *         launch (previously the error was printed but success was still returned)
 */
int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles)
{
  int threads = BLOCK_SIZE;
  int blocks = numparticles / threads + 1;

  //shared memory: NUMPAR parameters plus one position per thread of the block
  int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE;

  //call kernel
  kernelCollimatorPhysics<<<blocks, threads, smem_size>>>((CUDA_PART_SMALL*)mem_ptr,
                                                          (double*)par_ptr,
                                                          m_base->cuda_getCurandStates(),
                                                          numparticles);

  //report launch problems to the caller instead of hiding them
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cout << "Err2: " << cudaGetErrorString(err) << std::endl;
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
/**
 * Count the particles with a negative label and, if there are any, sort the array by
 * descending label so that those particles end up at the back.
 *
 * @param mem_ptr      device pointer to the particle array (CUDA_PART_SMALL)
 * @param numparticles number of particles in the array
 * @param numaddback   receives the number of particles with label < 0
 * @return DKS_SUCCESS
 */
int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
                                                 int &numaddback)
{
  //thrust view of the raw device range [first, last)
  thrust::device_ptr<CUDA_PART_SMALL> first( (CUDA_PART_SMALL*)mem_ptr);
  thrust::device_ptr<CUDA_PART_SMALL> last = first + numparticles;

  //the same functor acts as unary predicate (label < 0) and as sort order
  compare_particle_small comp;
  comp.set_threshold(0);

  numaddback = thrust::count_if(first, last, comp);

  //nothing to move when no particle is flagged
  if (numaddback > 0)
    thrust::sort(first, last, comp);

  return DKS_SUCCESS;
}
/**
 * Launch the push kernel on npart particles.
 *
 * @param r_ptr    device pointer to the positions (double3)
 * @param p_ptr    device pointer to the momenta (double3)
 * @param npart    number of particles
 * @param dt_ptr   device pointer to per-particle time steps, used when usedt is true
 * @param dt       common time step, used when usedt is false
 * @param c        factor multiplied with the time step
 * @param usedt    choose the per-particle (true) or the common (false) time step
 * @param streamId index of the stream to launch in, -1 selects the default stream
 * @return DKS_SUCCESS
 */
int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
                                                void *dt_ptr, double dt, double c, bool usedt,
                                                int streamId)
{
  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;

  //stream 0 is the default stream, same as a launch without stream argument
  cudaStream_t cs = 0;
  if (streamId != -1)
    cs = m_base->cuda_getStream(streamId);

  double3 *pos = (double3*)r_ptr;
  double3 *mom = (double3*)p_ptr;

  if (usedt) {
    //individual time step for every particle
    kernelPush<<<blocks, threads, 0, cs>>>(pos, mom, npart, (double*)dt_ptr, c);
  } else {
    //one time step for all particles
    kernelPush<<<blocks, threads, 0, cs>>>(pos, mom, npart, dt*c);
  }

  return DKS_SUCCESS;
}
/**
 * Launch the push-with-transform kernel on npart particles.
 *
 * The kernels read the orientations directly from global memory and declare no dynamic
 * shared memory, so none is requested any more. The former request of
 * sizeof(double3) * nsec bytes was never used and could make the launch fail once it
 * exceeded the per-block shared memory limit.
 *
 * @param x_ptr       device pointer to the positions (double3)
 * @param p_ptr       device pointer to the momenta (double3)
 * @param lastSec_ptr device pointer to the last section index of every particle (long)
 * @param orient_ptr  device pointer to the section orientations (double3, nsec entries)
 * @param npart       number of particles
 * @param nsec        number of sections
 * @param dt_ptr      device pointer to per-particle time steps, used when usedt is true
 * @param dt          common time step, used when usedt is false
 * @param c           factor multiplied with the time step
 * @param usedt       choose the per-particle (true) or the common (false) time step
 * @param streamId    index of the stream to launch in, -1 selects the default stream
 * @return DKS_SUCCESS
 */
int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
                                                         void *lastSec_ptr, void *orient_ptr,
                                                         int npart, int nsec,
                                                         void *dt_ptr, double dt,
                                                         double c, bool usedt,
                                                         int streamId)
{
  int threads = BLOCK_SIZE;
  int blocks = npart / threads + 1;

  //stream 0 is the default stream, same as a launch without stream argument
  cudaStream_t cs = 0;
  if (streamId != -1)
    cs = m_base->cuda_getStream(streamId);

  //call kernel
  if (!usedt) {
    kernelPushTransform<<<blocks, threads, 0, cs>>>((double3*)x_ptr, (double3*)p_ptr,
                                                    (long*)lastSec_ptr, (double3*)orient_ptr,
                                                    npart, nsec, dt*c);
  } else {
    kernelPushTransform<<<blocks, threads, 0, cs>>>((double3*)x_ptr, (double3*)p_ptr,
                                                    (long*)lastSec_ptr, (double3*)orient_ptr,
                                                    npart, nsec, (double*)dt_ptr, c);
  }

  return DKS_SUCCESS;
}

View File

@ -0,0 +1,155 @@
#ifndef H_CUDA_COLLIMATORPHYSICS
#define H_CUDA_COLLIMATORPHYSICS
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector_types.h>
#include <curand_kernel.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/count.h>
#include <cublas_v2.h>
#include "../Algorithms/CollimatorPhysics.h"
#include "CudaBase.cuh"
/**
* Full particle record stored on the GPU as an array of structures (16 byte aligned).
* compare_particle orders and filters these records by their label.
*/
typedef struct __align__(16) {
// particle state flag; the collimator kernels write -1 (dead) and -2 (left the material)
int label;
// NOTE(review): presumably the index of the particle on the host side - confirm
unsigned localID;
// position (three components)
double3 Rincol;
// momentum (three components)
double3 Pincol;
// NOTE(review): the remaining fields are not touched by the code in this file;
// their names suggest id, bin, time step, charge, last section and B/E field - confirm
long IDincol;
int Binincol;
double DTincol;
double Qincol;
long LastSecincol;
double3 Bfincol;
double3 Efincol;
} CUDA_PART;
/**
* Reduced particle record stored on the GPU as an array of structures.
* This is the element type used by CollimatorPhysics() and CollimatorPhysicsSort().
*/
typedef struct {
// particle state flag; the collimator kernels write -1 (dead) and -2 (left the material)
int label;
// NOTE(review): presumably the index of the particle on the host side - confirm
unsigned localID;
// position (three components)
double3 Rincol;
// momentum (three components)
double3 Pincol;
} CUDA_PART_SMALL;
/**
* Full particle data in structure-of-arrays form: one pointer per attribute.
* NOTE(review): the pointers are presumably device arrays with one entry per particle,
* mirroring the fields of CUDA_PART - confirm against the allocation code.
*/
typedef struct {
int *label;
unsigned *localID;
double3 *Rincol;
double3 *Pincol;
long *IDincol;
int *Binincol;
double *DTincol;
double *Qincol;
long *LastSecincol;
double3 *Bfincol;
double3 *Efincol;
} CUDA_PART2;
/**
* Reduced particle data in structure-of-arrays form: one array per attribute,
* indexed by the particle index. Passed by value to kernelCollimatorPhysics2.
*/
typedef struct {
// particle state flags; kernelCollimatorPhysics2 writes -1 (dead) and -2 (left the material)
int *label;
// NOTE(review): presumably the indices of the particles on the host side - confirm
unsigned *localID;
// positions
double3 *Rincol;
// momenta
double3 *Pincol;
} CUDA_PART2_SMALL;
/** CudaCollimatorPhysics class.
 * Contains kernels that execute CollimatorPhysics functions from OPAL.
 * For detailed documentation on CollimatorPhysics functions see the OPAL documentation.
 */
class CudaCollimatorPhysics : public DKSCollimatorPhysics {

private:

  /** True when m_base was allocated by this object and has to be deleted by it. */
  bool base_create;

  /** CUDA base object that provides streams and curand states. */
  CudaBase *m_base;

public:

  /** Constructor that uses an existing CudaBase; the object is not owned. */
  CudaCollimatorPhysics(CudaBase *base) : base_create(false), m_base(base) {}

  /** Default constructor: creates and owns a CudaBase. */
  CudaCollimatorPhysics() : base_create(true), m_base(new CudaBase()) {}

  /** Destructor: deletes the CudaBase only when it was created by this object. */
  ~CudaCollimatorPhysics() {
    if (base_create)
      delete m_base;
  }

  /** Execute the collimator physics kernel on an array of particles. */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr,
                        int numpartices);

  /** Structure-of-arrays variant; not available in the CUDA backend, returns DKS_ERROR. */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                           void *rx_ptr, void *ry_ptr, void *rz_ptr,
                           void *px_ptr, void *py_ptr, void *pz_ptr,
                           void *par_ptr, int numparticles)
  {
    return DKS_ERROR;
  }

  /** Sort the particle array on the GPU.
   * Counts the particles that are dead (label -1) or leaving the material (label -2)
   * and sorts the particle array so that these particles are at the end of the array.
   */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);

  /** Structure-of-arrays variant; not available in the CUDA backend, returns DKS_ERROR. */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                               void *rx_ptr, void *ry_ptr, void *rz_ptr,
                               void *px_ptr, void *py_ptr, void *pz_ptr,
                               void *par_ptr, int numparticles, int &numaddback)
  {
    return DKS_ERROR;
  }

  /** BorisPusher push function for the integration from OPAL.
   * ParallelTTracker integration from OPAL implemented in CUDA.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
                           double dt, double c, bool usedt = false, int streamId = -1);

  /** BorisPusher push function combined with the transformTo function from OPAL.
   * ParallelTTracker integration from OPAL implemented in CUDA.
   * For more details see the ParallelTTracker documentation in OPAL.
   */
  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
                                    void *orient_ptr, int npart, int nsec,
                                    void *dt_ptr, double dt, double c,
                                    bool usedt = false, int streamId = -1);

};
#endif

376
src/CUDA/CudaFFT.cu Normal file
View File

@ -0,0 +1,376 @@
#include "CudaFFT.cuh"
/**
 * Divide the first N complex values of in by N.
 *
 * The element index is the global thread index, so the kernel works with any
 * one-dimensional launch configuration that provides at least N threads. For blocks of
 * a single thread this equals the block index that was used before; with larger blocks
 * the old indexing made all threads of a block race on the same element.
 */
__global__ void normalize(cufftDoubleComplex *in, int N) {

  int id = blockIdx.x * blockDim.x + threadIdx.x;

  if (id < N) {
    in[id].x = in[id].x / N;
    in[id].y = in[id].y / N;
  }
}
/* constructor using an existing CudaBase, which is not owned by this object */
CudaFFT::CudaFFT(CudaBase *base) : base_create(false), m_base(base) {
}
/* default constructor, creates and owns a CudaBase */
CudaFFT::CudaFFT() : base_create(true), m_base(new CudaBase()) {
}
/* destructor, releases the CudaBase only when it was created by this object */
CudaFFT::~CudaFFT() {
  if (!base_create)
    return;

  delete m_base;
}
/*
  Info: execute an in-place complex to complex double precision FFT using the cufft
  library. The shared default plan is used when useDefaultPlan(ndim, N) allows it,
  otherwise a temporary plan is created for this call and destroyed before returning.
  The shared default plan is never destroyed here, not even on failure, and an
  unsupported ndim is rejected instead of executing on an uninitialized plan handle.
  mem_ptr: device pointer to the complex data
  ndim: number of dimensions, 1 to 3
  N: size of every dimension
  streamId: stream to execute in; -1 or an out of range id selects the default stream
  forward: forward transform when true, inverse transform otherwise
  Return: success or error code
*/
int CudaFFT::executeFFT(void * mem_ptr, int ndim, int N[3], int streamId, bool forward) {

  cufftResult cresult;
  cufftHandle plan;

  //decide once whether the shared default plan is used for this call
  const bool sharedPlan = useDefaultPlan(ndim, N);

  if (sharedPlan) {
    plan = defaultPlanZ2Z;
  } else {
    //create a temporary fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2Z, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2Z);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2Z);
      break;
    default:
      //no plan can be created, do not continue with an uninitialized handle
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }

    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      if (cresult == CUFFT_SETUP_FAILED) {
        DEBUG_MSG("Setup failed");
      }
      if (cresult == CUFFT_INVALID_SIZE) {
        DEBUG_MSG("Invalid size");
      }
      if (cresult == CUFFT_INVALID_TYPE) {
        DEBUG_MSG("Invalid type");
      }
      if (cresult == CUFFT_ALLOC_FAILED) {
        DEBUG_MSG("Alloc failed");
      }
      return DKS_ERROR;
    }
  }

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //execute in place FFT on the selected plan
  cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr,
                         (cufftDoubleComplex*)mem_ptr,
                         forward ? CUFFT_FORWARD : CUFFT_INVERSE);

  if (cresult != CUFFT_SUCCESS) {
    if (forward) {
      DEBUG_MSG("Error executing fft, cuda error: " << cresult);
    } else {
      DEBUG_MSG("Error executing ifft, cuda error: " << cresult);
    }
    //only a temporary plan may be destroyed, the default plan is reused by later calls
    if (!sharedPlan)
      cufftDestroy(plan);
    return DKS_ERROR;
  }

  //clean up resources
  if (!sharedPlan)
    cufftDestroy(plan);

  return DKS_SUCCESS;
}
/*
  Info: execute the inverse complex to complex FFT; forwards to executeFFT with the
  direction flag cleared
  Return: success or error code
*/
int CudaFFT::executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId) {
  const bool forward = false;
  return executeFFT(mem_ptr, ndim, N, streamId, forward);
}
/*
  Info: normalize complex data by scaling all N[0]*N[1]*N[2] values with 1/size using
  cublasZscal on the default cublas handle (created in setupFFT). The handle is bound to
  the requested stream on every call; -1 or an out of range id selects the default
  stream, so that a stream chosen by an earlier call is not silently reused.
  Return: success or error code
*/
int CudaFFT::normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId) {

  cublasStatus_t status;
  unsigned int size = N[0]*N[1]*N[2];
  cuDoubleComplex alpha = make_cuDoubleComplex(1.0/size, 0);

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
  else
    cublasSetStream(defaultCublasFFT, 0);

  status = cublasZscal(defaultCublasFFT, size, &alpha, (cuDoubleComplex*)mem_ptr, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS exec Zscal failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
/*
  Info: execute real to complex double precision FFT. The shared default plan is used
  when useDefaultPlan(ndim, N) allows it, otherwise a temporary plan is created for this
  call. A temporary plan is destroyed on every exit path, and an unsupported ndim is
  rejected instead of executing on an uninitialized plan handle.
  real_ptr: device pointer to the real input data
  comp_ptr: device pointer to the complex output data
  ndim: number of dimensions, 1 to 3
  N: size of every dimension
  streamId: stream to execute in; -1 or an out of range id selects the default stream
  Return: success or error code
*/
int CudaFFT::executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {

  cufftResult cresult;
  cufftHandle plan;

  //decide once whether the shared default plan is used for this call
  const bool sharedPlan = useDefaultPlan(ndim, N);

  if (sharedPlan) {
    plan = defaultPlanD2Z;
  } else {
    //create a temporary fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_D2Z, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_D2Z);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_D2Z);
      break;
    default:
      //no plan can be created, do not continue with an uninitialized handle
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }

    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //execute FFT on the selected plan
  cresult = cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr, (cufftDoubleComplex*)comp_ptr);

  if (cresult != CUFFT_SUCCESS) {
    DEBUG_MSG("Error executing fft, cuda error: " << cresult);
    if (cresult == CUFFT_INVALID_PLAN) {
      DEBUG_MSG("invalid plan");
    }
    if (cresult == CUFFT_INVALID_VALUE) {
      DEBUG_MSG("invalid value");
    }
    if (cresult == CUFFT_INTERNAL_ERROR) {
      DEBUG_MSG("internal error");
    }
    if (cresult == CUFFT_EXEC_FAILED) {
      DEBUG_MSG("exec failed");
    }
    if (cresult == CUFFT_SETUP_FAILED) {
      DEBUG_MSG("setup failed");
    }
    //do not leak the temporary plan; the default plan stays alive for later calls
    if (!sharedPlan)
      cufftDestroy(plan);
    return DKS_ERROR;
  }

  //clean up resources
  if (!sharedPlan) {
    cresult = cufftDestroy(plan);
    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  return DKS_SUCCESS;
}
/*
  Info: execute complex to real double precision FFT. The shared default plan is used
  when useDefaultPlan(ndim, N) allows it, otherwise a temporary plan is created for this
  call and destroyed before returning. The shared default plan is never destroyed here,
  not even on failure, and an unsupported ndim is rejected instead of executing on an
  uninitialized plan handle.
  real_ptr: device pointer to the real output data
  comp_ptr: device pointer to the complex input data
  ndim: number of dimensions, 1 to 3
  N: size of every dimension
  streamId: stream to execute in; -1 or an out of range id selects the default stream
  Return: success or error code
*/
int CudaFFT::executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) {

  cufftResult cresult;
  cufftHandle plan;

  //decide once whether the shared default plan is used for this call
  const bool sharedPlan = useDefaultPlan(ndim, N);

  if (sharedPlan) {
    plan = defaultPlanZ2D;
  } else {
    //create a temporary fft plan
    switch (ndim) {
    case 1:
      cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2D, 1);
      break;
    case 2:
      cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2D);
      break;
    case 3:
      cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2D);
      break;
    default:
      //no plan can be created, do not continue with an uninitialized handle
      DEBUG_MSG("Error creating plan, unsupported number of dimensions: " << ndim);
      return DKS_ERROR;
    }

    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error creating plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cufftSetStream(plan, m_base->cuda_getStream(streamId));
  else
    cufftSetStream(plan, 0);

  //execute FFT on the selected plan
  cresult = cufftExecZ2D(plan, (cufftDoubleComplex*)comp_ptr, (cufftDoubleReal*)real_ptr);

  if (cresult != CUFFT_SUCCESS) {
    DEBUG_MSG("Error executing fft, cuda error: " << cresult);
    //only a temporary plan may be destroyed, the default plan is reused by later calls
    if (!sharedPlan)
      cufftDestroy(plan);
    return DKS_ERROR;
  }

  //clean up resources
  if (!sharedPlan) {
    cresult = cufftDestroy(plan);
    if (cresult != CUFFT_SUCCESS) {
      DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult);
      return DKS_ERROR;
    }
  }

  return DKS_SUCCESS;
}
/*
  Info: normalize the real result of a complex to real iFFT by scaling all
  N[0]*N[1]*N[2] values with 1/size using cublasDscal on the default cublas handle
  (created in setupFFT). The handle is bound to the requested stream on every call;
  -1 or an out of range id selects the default stream, so that a stream chosen by an
  earlier call is not silently reused.
  Return: success or error code
*/
int CudaFFT::normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId) {

  cublasStatus_t status;
  unsigned int size = N[0]*N[1]*N[2];
  double alpha = 1.0/size;

  if (streamId != -1 && streamId < m_base->cuda_numberOfStreams())
    cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId));
  else
    cublasSetStream(defaultCublasFFT, 0);

  status = cublasDscal(defaultCublasFFT, size, &alpha, (double*)real_ptr, 1);
  if (status != CUBLAS_STATUS_SUCCESS) {
    //the failing call is Dscal, the message used to name Zscal
    DEBUG_MSG("CUBLAS exec Dscal failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
/*
  Info: create the default cufft plans (Z2Z, D2Z, Z2D) which can be reused for all FFTs
  of the same size and type, and the default cublas handle. For ndim outside 1..3 no
  plan is created and only the cublas handle is set up.
  Return: success or error code
*/
int CudaFFT::setupFFT(int ndim, int N[3]) {

  cufftResult resZ2Z = CUFFT_SUCCESS;
  cufftResult resD2Z = CUFFT_SUCCESS;
  cufftResult resZ2D = CUFFT_SUCCESS;

  //create default fft plans for the requested dimensionality
  switch (ndim) {
  case 1:
    resZ2Z = cufftPlan1d(&defaultPlanZ2Z, N[0], CUFFT_Z2Z, 1);
    resD2Z = cufftPlan1d(&defaultPlanD2Z, N[0], CUFFT_D2Z, 1);
    resZ2D = cufftPlan1d(&defaultPlanZ2D, N[0], CUFFT_Z2D, 1);
    break;
  case 2:
    resZ2Z = cufftPlan2d(&defaultPlanZ2Z, N[1], N[0], CUFFT_Z2Z);
    resD2Z = cufftPlan2d(&defaultPlanD2Z, N[1], N[0], CUFFT_D2Z);
    resZ2D = cufftPlan2d(&defaultPlanZ2D, N[1], N[0], CUFFT_Z2D);
    break;
  case 3:
    resZ2Z = cufftPlan3d(&defaultPlanZ2Z, N[2], N[1], N[0], CUFFT_Z2Z);
    resD2Z = cufftPlan3d(&defaultPlanD2Z, N[2], N[1], N[0], CUFFT_D2Z);
    resZ2D = cufftPlan3d(&defaultPlanZ2D, N[2], N[1], N[0], CUFFT_Z2D);
    break;
  default:
    break;
  }

  const bool plansOk = resZ2Z == CUFFT_SUCCESS &&
                       resD2Z == CUFFT_SUCCESS &&
                       resZ2D == CUFFT_SUCCESS;
  if (!plansOk) {
    DEBUG_MSG("Error creating default plan");
    return DKS_ERROR;
  }

  //create cublas handle
  if (cublasCreate(&defaultCublasFFT) != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS create default handle failed!");
    return DKS_ERROR;
  }

  //remember what the default plans were built for
  defaultNdim = ndim;
  if (ndim > 0) {
    for (int d = 0; d < 3; d++)
      defaultN[d] = N[d];
  }

  return DKS_SUCCESS;
}
/*
  Info: destroy the default FFT plans (when defaultNdim > 0) and the default cublas
  handle (when defaultNdim > -1), then mark the defaults as unset
  Return: success or error code
*/
int CudaFFT::destroyFFT() {

  if (defaultNdim > 0) {
    //all three plans are released before the results are inspected
    const cufftResult resZ2Z = cufftDestroy(defaultPlanZ2Z);
    const cufftResult resD2Z = cufftDestroy(defaultPlanD2Z);
    const cufftResult resZ2D = cufftDestroy(defaultPlanZ2D);

    const bool plansOk = resZ2Z == CUFFT_SUCCESS &&
                         resD2Z == CUFFT_SUCCESS &&
                         resZ2D == CUFFT_SUCCESS;
    if (!plansOk) {
      DEBUG_MSG("Error destroying default cufft plans");
      return DKS_ERROR;
    }
  }

  if (defaultNdim > -1) {
    if (cublasDestroy(defaultCublasFFT) != CUBLAS_STATUS_SUCCESS) {
      DEBUG_MSG("CUBLAS delete default handle failed!");
      return DKS_ERROR;
    }
  }

  //no default plan and no default size any more
  for (int d = 0; d < 3; d++)
    defaultN[d] = -1;
  defaultNdim = -1;

  return DKS_SUCCESS;
}

88
src/CUDA/CudaFFT.cuh Normal file
View File

@ -0,0 +1,88 @@
#ifndef H_CUDA_FFT
#define H_CUDA_FFT
#include <iostream>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "cublas_v2.h"
#include "../Algorithms/FFT.h"
#include "CudaBase.cuh"
/**
* FFT functions of DKS implemented with the cufft and cublas libraries.
*/
class CudaFFT : public DKSFFT{
private:
// true when m_base was created by this object and has to be deleted by it
bool base_create;
// CUDA base object that provides the streams
CudaBase *m_base;
// reusable plans created by setupFFT: complex to complex, real to complex, complex to real
cufftHandle defaultPlanZ2Z;
cufftHandle defaultPlanD2Z;
cufftHandle defaultPlanZ2D;
// cublas handle created by setupFFT, used by the normalize functions
cublasHandle_t defaultCublasFFT;
public:
/** Constructor with CudaBase as argument; the CudaBase is not owned */
CudaFFT(CudaBase *base);
/** Constructor; creates and owns a CudaBase */
CudaFFT();
/** Destructor; deletes the CudaBase only when it was created by this object */
~CudaFFT();
/**
* Info: init cufft plans which can be reused for all FFTs of the same size and type
* Return: success or error code
*/
int setupFFT(int ndim, int N[3]);
/** No-op in the CUDA implementation, always returns DKS_SUCCESS */
int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
/** No-op in the CUDA implementation, always returns DKS_SUCCESS */
int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
/**
* Info: destroy default FFT plans
* Return: success or error code
*/
int destroyFFT();
/*
Info: execute complex to complex double precision FFT using the cufft library
Return: success or error code
*/
int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
/*
Info: execute inverse complex to complex FFT
Return: success or error code
*/
int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: normalize the result of a complex to complex iFFT (scaling done with cublas)
Return: success or error code
*/
int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute real to complex double precision FFT
Return: success or error code
*/
int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: execute complex to real double precision FFT
Return: success or error code
*/
int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1);
/*
Info: normalize the result of a complex to real iFFT (scaling done with cublas)
Return: success or error code
*/
int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1);
};
#endif

View File

@ -0,0 +1,469 @@
#include "CudaGreensFunction.cuh"
/**
 * Evaluate the Green's function integrand on a grid, one thread block per grid point.
 * The grid point (i, j, k) is taken from the block index; the kernel performs no
 * bounds check, so the launch grid has to match the dimensions of tmpgreen.
 *
 * @param tmpgreen output array, element (i, j, k) is stored at i + j*NI + k*NI*NJ
 * @param hr_m0    mesh spacing in the first dimension
 * @param hr_m1    mesh spacing in the second dimension
 * @param hr_m2    mesh spacing in the third dimension
 * @param NI       number of points in the first dimension
 * @param NJ       number of points in the second dimension
 */
__global__ void kernelTmpgreen(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  // coordinates of the grid point, shifted by half a cell
  const double vv0 = i * hr_m0 - hr_m0 / 2;
  const double vv1 = j * hr_m1 - hr_m1 / 2;
  const double vv2 = k * hr_m2 - hr_m2 / 2;

  const double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

  // arctangent terms, halved before the logarithmic terms are added
  double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
  tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
  tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
  tmpgrn = tmpgrn / 2;

  // logarithmic terms
  tmpgrn += vv1 * vv2 * log(vv0 + r);
  tmpgrn += vv0 * vv2 * log(vv1 + r);
  tmpgrn += vv0 * vv1 * log(vv2 + r);

  const double cellVolume = hr_m0 * hr_m1 * hr_m2;
  const int out = i + j * NI + k * NI * NJ;
  tmpgreen[out] = tmpgrn / cellVolume;
}
/**
 * Evaluate the Green's function integrand on an NI x NJ x NK grid, one thread per grid
 * point. The linear thread index is split into (i, j, k) with i running fastest;
 * threads beyond NI*NJ*NK do nothing.
 *
 * @param tmpgreen output array with NI*NJ*NK elements
 * @param hr_m0    mesh spacing in the first dimension
 * @param hr_m1    mesh spacing in the second dimension
 * @param hr_m2    mesh spacing in the third dimension
 */
__global__ void kernelTmpgreen_2(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ, int NK) {

  const int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= NI * NJ * NK)
    return;

  // recover the three grid indices from the linear index
  const int i = id % NI;
  const int k = id / (NI * NJ);
  const int j = (id - k * NI * NJ) / NI;

  // coordinates of the grid point, shifted by half a cell
  const double vv0 = i * hr_m0 - hr_m0 / 2;
  const double vv1 = j * hr_m1 - hr_m1 / 2;
  const double vv2 = k * hr_m2 - hr_m2 / 2;

  const double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

  // arctangent terms, halved before the logarithmic terms are added
  double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
  tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
  tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
  tmpgrn = tmpgrn / 2;

  // logarithmic terms
  tmpgrn += vv1 * vv2 * log(vv0 + r);
  tmpgrn += vv0 * vv2 * log(vv1 + r);
  tmpgrn += vv0 * vv1 * log(vv2 + r);

  const double cellVolume = hr_m0 * hr_m1 * hr_m2;
  tmpgreen[id] = tmpgrn / cellVolume;
}
/**
 * Calculate the Green's function integrand on the CPU for an NI x NJ x NK grid.
 *
 * Element (i, j, k) is stored at i + j*NI + k*NI*NJ, the same layout the GPU kernels
 * use. The row stride used to be NJ instead of NI, which scattered the values to wrong
 * positions (and past the end of the array) whenever NI != NJ.
 *
 * @param tmpgreen output array with room for NI*NJ*NK values
 * @param hr_m0    mesh spacing in the first dimension
 * @param hr_m1    mesh spacing in the second dimension
 * @param hr_m2    mesh spacing in the third dimension
 * @param NI       number of points in the first (fastest running) dimension
 * @param NJ       number of points in the second dimension
 * @param NK       number of points in the third dimension
 */
void kernelTmpgreenCPU(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
                       int NI, int NJ, int NK)
{
  double cellVolume = hr_m0 * hr_m1 * hr_m2;

  for (int k = 0; k < NK; k++) {
    for (int j = 0; j < NJ; j++) {
      for (int i = 0; i < NI; i++) {

        // coordinates of the grid point, shifted by half a cell
        double vv0 = i * hr_m0 - hr_m0 / 2;
        double vv1 = j * hr_m1 - hr_m1 / 2;
        double vv2 = k * hr_m2 - hr_m2 / 2;

        double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

        // arctangent terms, halved before the logarithmic terms are added
        double tmpgrn = 0;
        tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
        tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
        tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
        tmpgrn = tmpgrn / 2;

        // logarithmic terms
        tmpgrn += vv1 * vv2 * log(vv0 + r);
        tmpgrn += vv0 * vv2 * log(vv1 + r);
        tmpgrn += vv0 * vv1 * log(vv2 + r);

        tmpgrn = tmpgrn / cellVolume;

        // a row holds NI values, so consecutive j are NI elements apart
        tmpgreen[k*NJ*NI + j*NI + i] = tmpgrn;
      }
    }
  }
}
/**
 * Combine the eight corner values of tmpgreen around grid point (i, j, k) into rho2_m,
 * one thread block per grid point (indices taken from the block index).
 * Corners outside the NI_tmp x NJ_tmp x NK_tmp grid contribute zero. The point
 * (i, j, k) itself is read without a bounds check, so the launch grid must not exceed
 * the tmpgreen dimensions.
 *
 * @param rho2_m   output array, element (i, j, k) is stored at i + j*NI + k*NI*NJ
 * @param tmpgreen input array with dimensions NI_tmp x NJ_tmp x NK_tmp
 */
__global__ void kernelIngration(double *rho2_m, double *tmpgreen, int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  // which neighbours in +i, +j, +k direction exist
  const bool hasI = i + 1 < NI_tmp;
  const bool hasJ = j + 1 < NJ_tmp;
  const bool hasK = k + 1 < NK_tmp;

  // strides and index of (i, j, k) in tmpgreen
  const int sJ = NI_tmp;
  const int sK = NI_tmp * NJ_tmp;
  const int base = i + j * sJ + k * sK;

  const double g111 = (hasI && hasJ && hasK) ? tmpgreen[base + 1 + sJ + sK] : 0.0;
  const double g100 = hasI ? tmpgreen[base + 1] : 0.0;
  const double g010 = hasJ ? tmpgreen[base + sJ] : 0.0;
  const double g001 = hasK ? tmpgreen[base + sK] : 0.0;
  const double g110 = (hasI && hasJ) ? tmpgreen[base + 1 + sJ] : 0.0;
  const double g101 = (hasI && hasK) ? tmpgreen[base + 1 + sK] : 0.0;
  const double g011 = (hasJ && hasK) ? tmpgreen[base + sJ + sK] : 0.0;
  const double g000 = tmpgreen[base];

  rho2_m[i + j*NI + k*NI*NJ] = g111 + g100 + g010 + g001 - g110 - g101 - g011 - g000;
}
/**
 * Combine the eight corner values of tmpgreen around every grid point into rho2_m,
 * one thread per point of the NI_tmp x NJ_tmp x NK_tmp grid. Corners outside the grid
 * contribute zero; threads beyond the grid do nothing.
 *
 * @param rho2_m   output array, element (i, j, k) is stored at i + j*NI + k*NI*NJ
 * @param tmpgreen input array with dimensions NI_tmp x NJ_tmp x NK_tmp
 */
__global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
                                  int NI, int NJ,
                                  int NI_tmp, int NJ_tmp, int NK_tmp) {

  const int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= NI_tmp * NJ_tmp * NK_tmp)
    return;

  // recover the three grid indices from the linear index
  const int i = id % NI_tmp;
  const int k = id / (NI_tmp * NJ_tmp);
  const int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;

  // which neighbours in +i, +j, +k direction exist
  const bool hasI = i + 1 < NI_tmp;
  const bool hasJ = j + 1 < NJ_tmp;
  const bool hasK = k + 1 < NK_tmp;

  // strides and index of (i, j, k) in tmpgreen
  const int sJ = NI_tmp;
  const int sK = NI_tmp * NJ_tmp;
  const int base = i + j * sJ + k * sK;

  const double g111 = (hasI && hasJ && hasK) ? tmpgreen[base + 1 + sJ + sK] : 0.0;
  const double g100 = hasI ? tmpgreen[base + 1] : 0.0;
  const double g010 = hasJ ? tmpgreen[base + sJ] : 0.0;
  const double g001 = hasK ? tmpgreen[base + sK] : 0.0;
  const double g110 = (hasI && hasJ) ? tmpgreen[base + 1 + sJ] : 0.0;
  const double g101 = (hasI && hasK) ? tmpgreen[base + 1 + sK] : 0.0;
  const double g011 = (hasJ && hasK) ? tmpgreen[base + sJ + sK] : 0.0;
  const double g000 = tmpgreen[base];

  rho2_m[i + j*NI + k*NI*NJ] = g111 + g100 + g010 + g001 - g110 - g101 - g011 - g000;
}
// copies element NI*NJ of rho2_m to element 0; every launched thread does the same
// copy, so the kernel is meant to be launched with a single thread
__global__ void mirroredRhoField0(double *rho2_m, int NI, int NJ) {
  const int src = NI*NJ;
  rho2_m[0] = rho2_m[src];
}
/**
 * Mirror rho2_m along the first dimension: the value at (i, j, k) is copied to
 * (NI - i, j, k) for every i > 0. One thread block per grid point.
 */
__global__ void mirroredRhoFieldI(double *rho2_m, int NI, int NJ) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  // i == 0 has no mirror image inside the array
  if (i > 0) {
    const int rowStart = j*NI + k*NI*NJ;
    rho2_m[(NI-i) + rowStart] = rho2_m[i + rowStart];
  }
}
/**
 * Mirror rho2_m along the second dimension: the value at (i, j, k) is copied to
 * (i, NJ - j, k) for every j > 0. One thread block per grid point.
 */
__global__ void mirroredRhoFieldJ(double *rho2_m, int NI, int NJ) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  // j == 0 has no mirror image inside the array
  if (j > 0) {
    const int src = i + j*NI + k*NI*NJ;
    const int dst = i + (NJ-j)*NI + k*NI*NJ;
    rho2_m[dst] = rho2_m[src];
  }
}
/**
 * Mirror rho2_m along the third dimension: the value at (i, j, k) is copied to
 * (i, j, NK - k) for every k > 0. One thread block per grid point.
 */
__global__ void mirroredRhoFieldK(double *rho2_m, int NI, int NJ, int NK) {

  const int i = blockIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.z;

  // k == 0 has no mirror image inside the array
  if (k > 0) {
    const int src = i + j*NI + k*NI*NJ;
    const int dst = i + j*NI + (NK-k)*NI*NJ;
    rho2_m[dst] = rho2_m[src];
  }
}
/**
 * Mirror the values of the NI_tmp x NJ_tmp x NK_tmp sub-grid of rho2_m into the other
 * seven octants of the NI x NJ x NK array. One thread per sub-grid point; the value at
 * (i, j, k) is copied to every combination of (i | NI-i, j | NJ-j, k | NK-k) for which
 * the mirrored index differs from the original one (index 0 has no mirror image).
 *
 * @param rho2_m array with dimensions NI x NJ x NK, updated in place
 */
__global__ void mirroredRhoField(double *rho2_m,
                                 int NI, int NJ, int NK,
                                 int NI_tmp, int NJ_tmp, int NK_tmp) {

  int tid = threadIdx.x;
  int id = blockIdx.x * blockDim.x + tid;

  int id1, id2, id3, id4, id5, id6, id7, id8;

  if (id < NI_tmp * NJ_tmp * NK_tmp) {
    // recover the three grid indices from the linear index
    int i = id % NI_tmp;
    int k = id / (NI_tmp * NJ_tmp);
    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;

    // mirrored indices
    int ri = NI - i;
    int rj = NJ - j;
    int rk = NK - k;

    id1 = k * NI * NJ + j * NI + i;
    id2 = k * NI * NJ + j * NI + ri;
    id3 = k * NI * NJ + rj * NI + i;
    id4 = k * NI * NJ + rj * NI + ri;
    id5 = rk * NI * NJ + j * NI + i;
    id6 = rk * NI * NJ + j * NI + ri;
    id7 = rk * NI * NJ + rj * NI + i;
    id8 = rk * NI * NJ + rj * NI + ri;

    double data = rho2_m[id1];

    if (i != 0)
      rho2_m[id2] = data;
    if (j != 0)
      rho2_m[id3] = data;
    if (i != 0 && j != 0)
      rho2_m[id4] = data;
    if (k != 0)
      rho2_m[id5] = data;
    if (k != 0 && i != 0)
      rho2_m[id6] = data;
    if (k != 0 && j != 0)
      rho2_m[id7] = data;
    // logical && throughout; a bitwise & was used here by mistake
    if (k != 0 && j != 0 && i != 0)
      rho2_m[id8] = data;
  }
}
/** Product of two complex numbers: (a.x + i a.y) * (b.x + i b.y). */
__device__ inline cuDoubleComplex ComplexMul(cuDoubleComplex a, cuDoubleComplex b) {
  const double re = a.x * b.x - a.y * b.y;
  const double im = a.x * b.y + a.y * b.x;

  cuDoubleComplex prod;
  prod.x = re;
  prod.y = im;
  return prod;
}
/**
 * Element-wise complex product ptr1[n] = ptr1[n] * ptr2[n], one thread block per
 * element (n is the block index). There is no bounds check, so the grid size has to
 * equal the number of elements.
 */
__global__ void multiplyComplexFields(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2) {
  const int n = blockIdx.x;
  const cuDoubleComplex lhs = ptr1[n];
  const cuDoubleComplex rhs = ptr2[n];
  ptr1[n] = ComplexMul(lhs, rhs);
}
/*
  Element-wise complex multiply: ptr1[idx] = ptr1[idx] * ptr2[idx] for idx < size.
  Both operands are first staged in dynamically sized shared memory (two entries per
  thread); with so few global memory accesses this may bring no improvement.
  Uses several threads per block to improve hardware occupancy (block and thread
  sizes should be tested for the best configuration).
*/
__global__ void multiplyComplexFields_2(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2,
                                        int size)
{
  extern __shared__ cuDoubleComplex data[];

  const int tid = threadIdx.x;
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const bool active = (idx < size);

  // each thread owns slots 2*tid (left operand) and 2*tid+1 (right operand)
  if (active) {
    data[2*tid] = ptr1[idx];
    data[2*tid + 1] = ptr2[idx];
  }

  // barrier is reached by every thread of the block, active or not
  __syncthreads();

  if (active)
    ptr1[idx] = ComplexMul(data[2*tid], data[2*tid+1]);
}
/* constructor using an externally owned CudaBase; it is not deleted by this object */
CudaGreensFunction::CudaGreensFunction(CudaBase *base)
  : base_create(false), m_base(base)
{
}
/* default constructor: allocates its own CudaBase and remembers to release it */
CudaGreensFunction::CudaGreensFunction()
  : base_create(true), m_base(new CudaBase())
{
}
/* destructor: releases the CudaBase only when this object created it */
CudaGreensFunction::~CudaGreensFunction() {
  if (!base_create)
    return;
  delete m_base;
}
/*
  Launch kernelTmpgreen_2 over I*J*K elements with 128 threads per block.
  streamId == -1 selects the default stream; any other id must be smaller than the
  number of streams known to the CudaBase, otherwise DKS_ERROR is returned and
  nothing is launched. NI and NJ are accepted but not used here.
  Return: DKS_SUCCESS once the kernel has been queued, DKS_ERROR for an unknown stream
*/
int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
                                            double hr_m0, double hr_m1, double hr_m2,
                                            int streamId)
{
  const int threadsPerBlock = 128;
  const int numBlocks = (I * J * K / threadsPerBlock) + 1;
  const bool defaultStream = (streamId == -1);

  if (!defaultStream && !(streamId < m_base->cuda_numberOfStreams()))
    return DKS_ERROR;

  if (defaultStream) {
    kernelTmpgreen_2<<< numBlocks, threadsPerBlock >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    kernelTmpgreen_2<<< numBlocks, threadsPerBlock, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K);
  }
  return DKS_SUCCESS;
}
/*
  Launch kernelIngration_2 over I*J*K elements with 128 threads per block, passing
  2*(I-1) and 2*(J-1) as the first two extent arguments.
  streamId == -1 selects the default stream; any other id must be smaller than the
  number of streams known to the CudaBase, otherwise DKS_ERROR is returned.
  Return: DKS_SUCCESS once the kernel has been queued, DKS_ERROR for an unknown stream
*/
int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen,
                                                       int I, int J, int K,
                                                       int streamId)
{
  const int threadsPerBlock = 128;
  const int numBlocks = (I * J * K / threadsPerBlock) + 1;
  const bool defaultStream = (streamId == -1);

  if (!defaultStream && !(streamId < m_base->cuda_numberOfStreams()))
    return DKS_ERROR;

  if (defaultStream) {
    kernelIngration_2<<< numBlocks, threadsPerBlock >>>( (double*)rho2_m, (double*)tmpgreen,
                                                         2*(I - 1), 2*(J - 1), I, J, K);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    kernelIngration_2<<< numBlocks, threadsPerBlock, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen,
                                                               2*(I - 1), 2*(J - 1), I, J, K);
  }
  return DKS_SUCCESS;
}
/*
  Mirror the rho field on the device: first mirroredRhoField0 (single thread), then
  mirroredRhoField over (I+1)*(J+1)*(K+1) source elements with 128 threads per block.
  The kernels address the field with extents 2*I, 2*J, 2*K.
  streamId == -1 selects the default stream; any other id must be smaller than the
  number of streams known to the CudaBase, otherwise DKS_ERROR is returned.
  Return: DKS_SUCCESS once both kernels have been queued, DKS_ERROR for an unknown stream
*/
int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {
  const int threadsPerBlock = 128;
  const int numBlocks = ( (I + 1) * (J + 1) * (K + 1) / threadsPerBlock) + 1;
  const bool defaultStream = (streamId == -1);

  if (!defaultStream && !(streamId < m_base->cuda_numberOfStreams()))
    return DKS_ERROR;

  if (defaultStream) {
    mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J);
    mirroredRhoField<<< numBlocks, threadsPerBlock >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J);
    mirroredRhoField<<< numBlocks, threadsPerBlock, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1);
  }
  return DKS_SUCCESS;
}
/*
  Element-wise complex multiply of two device arrays of `size` elements; the result
  is written to ptr1. Launches multiplyComplexFields_2 with 128 threads per block and
  two cuDoubleComplex entries of dynamic shared memory per thread.
  streamId == -1 selects the default stream; any other id must be smaller than the
  number of streams known to the CudaBase, otherwise DKS_ERROR is returned.
  Return: DKS_SUCCESS once the kernel has been queued, DKS_ERROR for an unknown stream
*/
int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2,
                                                   int size, int streamId) {
  const int threads = 128;
  const int blocks = size / threads + 1;
  const int datasize = 2 * threads * sizeof(cuDoubleComplex);
  const bool defaultStream = (streamId == -1);

  if (!defaultStream && !(streamId < m_base->cuda_numberOfStreams()))
    return DKS_ERROR;

  if (defaultStream) {
    multiplyComplexFields_2<<<blocks, threads, datasize>>> ( (cuDoubleComplex*)ptr1,
                                                             (cuDoubleComplex*)ptr2,
                                                             size);
  } else {
    cudaStream_t cs = m_base->cuda_getStream(streamId);
    multiplyComplexFields_2<<<blocks, threads, datasize, cs >>> ( (cuDoubleComplex*)ptr1,
                                                                  (cuDoubleComplex*) ptr2, size);
  }
  return DKS_SUCCESS;
}

View File

@ -0,0 +1,63 @@
#ifndef H_CUDA_GREENSFUNCTION
#define H_CUDA_GREENSFUNCTION
#include <iostream>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include "cublas_v2.h"
#include "CudaBase.cuh"
/**
 * CUDA implementation of the Green's function helper routines.
 * All cuda_* methods take device pointers and an optional stream id; streamId == -1
 * means the default stream.
 */
class CudaGreensFunction {
private:
// true when this object allocated m_base itself and therefore has to delete it
bool base_create;
// CudaBase instance used to look up CUDA streams
CudaBase *m_base;
public:
/** Constructor with CudaBase argument; the passed CudaBase is not owned by this object */
CudaGreensFunction(CudaBase *base);
/* Default constructor: creates (and owns) a new CudaBase */
CudaGreensFunction();
/* Destructor: deletes the CudaBase only if it was created by the default constructor */
~CudaGreensFunction();
/*
Info: calculate the integral on device memory (taken from OPAL src code)
Return: success or error code
*/
int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ,
double hr_m0, double hr_m1, double hr_m2,
int streamId = -1);
/*
Info: integration of rho2_m field (taken from OPAL src code)
Return: success or error code
*/
int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K,
int streamId = -1);
/*
Info: mirror rho field (taken from OPAL src code)
Return: success or error code
*/
int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
/*
Info: multiply complex fields already in GPU memory; the result is put in ptr1
Return: success or error code
*/
int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1);
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,118 @@
#ifndef H_CUDA_IMAGERECONSTRUCTION
#define H_CUDA_IMAGERECONSTRUCTION
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/count.h>
#include "../Algorithms/ImageReconstruction.h"
#include "CudaBase.cuh"
/**
 * CUDA implementation of the ImageReconstruction interface.
 * Only the declarations are visible here; the void* arguments are presumably device
 * pointers - confirm against the implementation file.
 */
class CudaImageReconstruction : public ImageReconstruction {
private:
// true when this object allocated m_base itself and therefore has to delete it
bool base_create;
// CudaBase instance this object works with
CudaBase *m_base;
public:
/** Constructor: creates (and owns) a new CudaBase */
CudaImageReconstruction() {
m_base = new CudaBase();
base_create = true;
};
/** Constructor with base: uses an externally owned CudaBase, which is not deleted here **/
CudaImageReconstruction(CudaBase *base) {
m_base = base;
base_create = false;
}
/** Destructor: deletes the CudaBase only if it was created by the default constructor */
~CudaImageReconstruction() {
if (base_create)
delete m_base;
};
/** CUDA implementation of calculate source
*/
int calculateSource(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0);
/** CUDA implementation of calculate background
*/
int calculateBackground(void *image_space, void *image_position, void *source_position,
void *avg, void *std, float diameter, int total_voxels,
int total_sources, int start = 0);
/**
* Calculate source for different sources
* (diameter is passed as a pointer here, unlike calculateSource)
*/
int calculateSources(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0);
/**
* Calculate background for different sources
* (diameter is passed as a pointer here, unlike calculateBackground)
*/
int calculateBackgrounds(void *image_space, void *image_position, void *source_position,
void *avg, void *std, void *diameter, int total_voxels,
int total_sources, int start = 0);
/** Generate normalization.
* Goes through detector pairs and, if a detector pair crosses the image, launches a separate
* kernel that updates voxel values in the image on the slope between these two detectors.
*/
int generateNormalization(void *recon, void *image_position,
void *det_position, int total_det);
/** Calculate forward projection.
* For image reconstruction calculates forward projections.
* see recon.cpp for details
*/
int forwardProjection(void *correction, void *recon, void *list_data, void *det_position,
void *image_position, int num_events);
/** Calculate backward projection.
* For image reconstruction calculates backward projections.
* see recon.cpp for details
*/
int backwardProjection(void *correction, void *recon_corrector, void *list_data,
void *det_position, void *image_position,
int num_events, int num_voxels);
/** Set the voxel dimensions on device.
*
*/
int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
/** Set the image edge.
*
*/
int setEdge(float x_edge, float y_edge, float z_edge);
/** Set the image edge1.
*
*/
int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
/** Set the minimum crystal in one ring values.
*
*/
int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
/** Set all other required parameters for reconstruction.
*
*/
int setParams(float matrix_distance_factor, float phantom_diameter,
float atten_per_mm, float ring_diameter);
};
#endif

View File

@ -0,0 +1,316 @@
// pi, 2*pi and the degree-to-radian factor pi/180, written out as double literals
#define PI 3.141592653589793115998
#define TWO_PI 6.283185307179586231996
#define DEG_TO_RAD 1.7453292519943295474371681e-2
/** Theory function declaration.
* The definition of the theory function will be built at runtime, before compilation.
* It is called by the kernels below as fTheory(t, p, f, m), where p, f and m are the
* shared-memory copies of the parameter, function-value and map arrays.
*/
__device__ double fTheory(double t, double *p, double *f, int *m);
/** MusrFit predefined functions.
* Predefined functions from MusrFit that can be used to define the theory function.
* The first parameter of every function is always the time t; the remaining parameters
* depend on the function.
*/
/** Simple exponential: exp(-lamda*t). */
__device__ double se(double t, double lamda) {
  const double arg = -lamda*t;
  return exp(arg);
}
/** General (stretched) exponential: exp(-(lamda*t)^beta). */
__device__ double ge(double t, double lamda, double beta) {
  const double powered = pow(lamda*t, beta);
  return exp( -powered );
}
/** Simple Gaussian: exp(-0.5*(sigma*t)^2). */
__device__ double sg(double t, double sigma) {
  const double sq = pow(sigma*t, 2.0);
  return exp( -0.5*sq );
}
/** Returns 1/3 + 2/3*(1 - s)*exp(-s/2) with s = (sigma*t)^2. */
__device__ double stg(double t, double sigma) {
  const double s = pow(sigma*t, 2.0);
  const double damping = exp(-0.5*s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - s) * damping;
}
/** Returns 1/3 + 2/3*(1 - x)*exp(-x) with x = lambda*t. */
__device__ double sekt(double t, double lambda) {
  const double x = lambda*t;
  const double damping = exp(-x);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x) * damping;
}
/** Returns 1/3 + 2/3*(1 - x - s)*exp(-x - s/2) with x = lambda*t and s = (sigma*t)^2. */
__device__ double lgkt(double t, double lambda, double sigma) {
  const double x = lambda*t;
  const double s = pow(sigma*t, 2.0);
  const double damping = exp(-x - 0.5*s);
  return (1.0/3.0) + (2.0/3.0)*(1.0 - x - s) * damping;
}
/**
 * Returns 1/3 + 2/3*(1 - x)*exp(-x/beta) with x = (sigma*t)^beta.
 * For beta below 1e-3 the function returns 0 instead of evaluating the expression.
 */
__device__ double skt(double t, double sigma, double beta) {
  double result = 0.0;
  if (beta < 1.0e-3)
    return result;
  const double x = pow(sigma*t, beta);
  result = (1.0/3.0) + (2.0/3.0)*(1.0 - x) * exp(-x/beta);
  return result;
}
/**
 * Returns 1/3*exp(-rL) + 2/3*(1 - q*lambda^2*t^2 / rT)*exp(-rT), where
 * r2 = 4*lambda^2*(1-q)*t/gamma, rL = sqrt(|r2|) and rT = sqrt(|r2| + q*lambda^2*t^2).
 * Note that rT is 0 at t == 0, so the quotient is 0/0 there.
 */
__device__ double spg(double t, double lambda, double gamma, double q) {
  const double lambdaSq = lambda*lambda;
  const double qTerm = t*t*lambdaSq*q;
  const double r2 = 4.0*lambdaSq*(1.0-q)*t/gamma;
  const double absR2 = fabs(r2);
  const double rL = sqrt(absR2);
  const double rT = sqrt(absR2+qTerm);
  return (1.0/3.0)*exp(-rL) + (2.0/3.0)*(1.0 - qTerm / rT)*exp(-rT);
}
/**
 * Returns 1/6*(1 - nu*t/2)*exp(-nu*t/2)
 *       + 1/3*(1 - nu*t/4)*exp(-0.25*(nu*t + 2.44949*lambda*t)).
 */
__device__ double rahf(double t, double nu, double lambda) {
  const double x = nu*t;
  const double halfX = nu*t/2.0;
  const double y = lambda*t;
  const double first = (1.0/6.0)*(1.0-halfX)*exp(-halfX);
  const double second = (1.0/3.0)*(1.0-x/4.0)*exp(-0.25*(x+2.44949*y));
  return first + second;
}
/** Cosine precession: cos(2*pi*nu*t + phi), with phi given in degrees. */
__device__ double tf(double t, double phi, double nu) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  return cos(angle + phase);
}
/**
 * Returns alpha*cos(2*pi*nu*t + phi)*exp(-lambdaT*t) + (1 - alpha)*exp(-lambdaL*t),
 * with phi given in degrees.
 */
__device__ double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  const double oscillating = alpha*cos(angle+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
/** Bessel function of the first kind, order 0: j0(2*pi*nu*t + phi), with phi given in degrees. */
__device__ double b(double t, double phi, double nu) {
  const double arg = TWO_PI*nu*t + DEG_TO_RAD*phi;
  return j0(arg);
}
/**
 * Returns alpha*j0(2*pi*nu*t + phi)*exp(-lambdaT*t) + (1 - alpha)*exp(-lambdaL*t),
 * with phi given in degrees.
 */
__device__ double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  const double angle = TWO_PI * nu * t;
  const double phase = DEG_TO_RAD * phi;
  const double oscillating = alpha*j0(angle+phase)*exp(-lambdaT*t);
  const double relaxing = (1.0-alpha)*exp(-lambdaL*t);
  return oscillating + relaxing;
}
/** Returns exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)). */
__device__ double ab(double t, double sigma, double gamma) {
  const double x = gamma*t;
  const double ratioSq = pow(sigma/gamma,2.0);
  return exp(-ratioSq*(exp(-x) - 1.0 + x));
}
/**
 * Returns 1/3 + 2/3*a^1.5*(1 - d*a)*exp(-0.5*d*a)
 * with d = (Delta0*t)^2 and a = 1/(1 + Rb^2*d).
 */
__device__ double snkzf(double t, double Delta0, double Rb) {
  const double d = pow(Delta0*t, 2.0);
  const double a = 1.0/(1.0+pow(Rb,2.0)*d);
  return (1.0/3.0) + (2.0/3.0)*pow(a,1.5)*(1.0-d*a)*exp(-0.5*d*a);
}
/**
 * Returns sqrt(a)*exp(-0.5*d*a)*cos(2*pi*nu*t + phi)
 * with d = (Delta0*t)^2, a = 1/(1 + Rb^2*d) and phi given in degrees.
 */
__device__ double snktf(double t, double phi, double nu, double Delta0, double Rb) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  const double d = pow(Delta0*t, 2.0);
  const double a = 1.0/(1.0+pow(Rb,2.0)*d);
  return sqrt(a)*exp(-0.5*d*a)*cos(angle+phase);
}
/**
 * Returns sqrt(a)*exp(-2*Delta0^2*theta*a)
 * with theta = (exp(-nuc*t) - 1 - nuc*t)/nuc^2 and a = 1/(1 + 4*(Rb*Delta0)^2*theta).
 */
__device__ double dnkzf(double t, double Delta0, double Rb, double nuc) {
  const double x = nuc*t;
  const double theta = (exp(-x) - 1.0 -x)/pow(nuc, 2.0);
  const double a = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
  return sqrt(a)*exp(-2.0*Delta0*Delta0*theta*a);
}
/**
 * Returns sqrt(a)*exp(-Delta0^2*theta*a)*cos(2*pi*nu*t + phi)
 * with theta = (exp(-nuc*t) - 1 - nuc*t)/nuc^2, a = 1/(1 + 2*(Rb*Delta0)^2*theta)
 * and phi given in degrees.
 */
__device__ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
  const double angle = TWO_PI*nu*t;
  const double phase = DEG_TO_RAD*phi;
  const double x = nuc*t;
  const double theta = (exp(-x) - 1.0 -x)/pow(nuc, 2.0);
  const double a = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
  return sqrt(a)*exp(-Delta0*Delta0*theta*a)*cos(angle+phase);
}
/** Theory and chi-square functions.
* Depending on the compiler flags, the theory is calculated either in single-histogram mode
* or in asymmetry mode.
* Depending on the compiler flags, either the chi-square or the MLE value is calculated.
*/
/** Single-histogram theory value: N0*exp(-t/tau)*(1 + f) + bkg. */
__device__ inline double singleHist(double &N0, double &tau, double &bkg, double &f, double &t) {
  const double decay = N0 * exp (-t/tau );
  return decay * (1.0 + f) + bkg;
}
/**
 * Asymmetry theory value for theory function value f with a = alpha and b = beta:
 *
 *   (f*(a*b + 1) - (a - 1)) / ((a + 1) - f*(a*b - 1))
 *
 * This is the expression kernelChiSquareAsymmetry evaluates; the previous version of
 * this helper dropped the "+ 1" in the numerator and so disagreed with the kernel.
 */
__device__ inline double asymetry(double &a, double &b, double &f) {
  const double ab = a * b;
  return (f * (ab + 1.0) - (a - 1.0)) / ((a + 1.0) - f * (ab - 1.0));
}
/**
 * Select the theory formula at compile time.
 * Without ASYMETRY defined: singleHist(c1, c2, c3, f, t), i.e. c1 = N0, c2 = tau, c3 = bkg.
 * With ASYMETRY defined: asymetry(c1, c2, f), i.e. c1 = alpha, c2 = beta (c3 and t unused).
 */
__device__ inline double getTheory(double &c1, double &c2, double &c3, double &f, double &t) {
#ifndef ASYMETRY
  return singleHist(c1, c2, c3, f, t);
#else   // was a bare "#elif", which is ill-formed once ASYMETRY is defined
  return asymetry(c1, c2, f);
#endif
}
/**
 * Squared residual (theo - data)^2, divided by err when err is non-zero.
 * NOTE(review): the kernels in this file divide by err*err instead - confirm which
 * meaning of err this helper expects.
 */
__device__ inline double chiSq(double &data, double &theo, double &err) {
  const double diff = theo - data;
  const double sq = diff * diff;
  return (err != 0.0) ? sq / err : sq;
}
/**
 * Likelihood term (theo - data), plus data*log(data/theo) when data > 1e-9 and
 * |theo| > 1e-9. The err argument is not used.
 */
__device__ inline double mle(double &data, double &theo, double &err) {
  const bool logTermDefined = ( data > 1.0e-9 && fabs(theo) > 1.0e-9 );
  double term = (theo - data);
  if (logTermDefined)
    term += data * log(data / theo);
  return term;
}
/**
 * Select the goodness-of-fit term at compile time:
 * chiSq(data, theo, err) by default, mle(data, theo, err) when MLE is defined.
 */
__device__ inline double getChiSq(double &data, double &theo, double &err) {
#ifndef MLE
  return chiSq(data, theo, err);
#else   // was a bare "#elif", which is ill-formed once MLE is defined
  return mle(data, theo, err);
#endif
}
//-----------------------------------------------------------------------------------------------
/**
 * Kernel to calculate theory function and chisquare/mle values for single histogram fits.
 *
 * For every bin j < length the time is t = timeStart + j*timeStep and the theory value is
 * N0*exp(-t/tau)*(1 + fTheory(t, p, f, m)) + bkg. The per-bin result is written to chisq[j]:
 *  - with MLH defined: 2*((theo - data) + data*log(data/theo)), or 2*(theo - data) when
 *    data or |theo| is not above 1e-9;
 *  - otherwise: (theo - data)^2 / err^2, or theo^2 when err is zero.
 *
 * Dynamic shared memory must hold numpar doubles, numfunc doubles and nummap ints,
 * in that order.
 */
extern "C" __global__ void kernelChiSquareSingleHisto(double *data, double *err, double *par,
                                                      double *chisq, int *map, double *funcv, int length,
                                                      int numpar, int numfunc, int nummap,
                                                      double timeStart, double timeStep,
                                                      double tau, double N0, double bkg) {
  // carve the shared buffer into parameters | function values | maps
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double *f = (double*)&smem[numpar];
  int *m = (int*)&smem[numpar + numfunc];

  // cooperative copy of the three input arrays from global to shared memory
  for (int s = threadIdx.x; s < numpar; s += blockDim.x)
    p[s] = par[s];
  for (int s = threadIdx.x; s < numfunc; s += blockDim.x)
    f[s] = funcv[s];
  for (int s = threadIdx.x; s < nummap; s += blockDim.x)
    m[s] = map[s];

  // shared data must be complete before any thread evaluates the theory
  __syncthreads();

  // each thread processes bins j, j + (total threads), j + 2*(total threads), ...
  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += gridDim.x * blockDim.x) {
    const double t = timeStart + j*timeStep;
    const double ldata = data[j];
    const double lerr = err[j];
    const double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;
#ifdef MLH
    const bool logTermDefined = (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9);
    if (logTermDefined)
      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
    else
      chisq[j] = 2.0 * (theo - ldata);
#else
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif
  }
}
//-----------------------------------------------------------------------------------------------
/**
 * Kernel to calculate theory function and chisquare/mle values for asymmetry fits.
 *
 * For every bin j < length the time is t = timeStart + j*timeStep, v = fTheory(t, p, f, m)
 * and the theory value is
 *   ((alpha*beta + 1)*v - (alpha - 1)) / ((alpha + 1) - (alpha*beta - 1)*v).
 * The per-bin result is written to chisq[j]:
 *  - with MLH defined: 0 (log max likelihood is not defined for asymmetry fits);
 *  - otherwise: (theo - data)^2 / err^2, or theo^2 when err is zero.
 *
 * Dynamic shared memory must hold numpar doubles, numfunc doubles and nummap ints,
 * in that order.
 */
extern "C" __global__ void kernelChiSquareAsymmetry(double *data, double *err, double *par,
                                                    double *chisq, int *map, double *funcv, int length,
                                                    int numpar, int numfunc, int nummap,
                                                    double timeStart, double timeStep,
                                                    double alpha, double beta) {
  // carve the shared buffer into parameters | function values | maps
  extern __shared__ double smem[];
  double *p = (double*)smem;
  double *f = (double*)&smem[numpar];
  int *m = (int*)&smem[numpar + numfunc];

  // cooperative copy of the three input arrays from global to shared memory
  for (int s = threadIdx.x; s < numpar; s += blockDim.x)
    p[s] = par[s];
  for (int s = threadIdx.x; s < numfunc; s += blockDim.x)
    f[s] = funcv[s];
  for (int s = threadIdx.x; s < nummap; s += blockDim.x)
    m[s] = map[s];

  // shared data must be complete before any thread evaluates the theory
  __syncthreads();

  // alpha*beta does not depend on the bin
  const double alphaBeta = alpha*beta;

  // each thread processes bins j, j + (total threads), j + 2*(total threads), ...
  for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < length; j += gridDim.x * blockDim.x) {
    const double t = timeStart + j*timeStep;
    const double ldata = data[j];
    const double lerr = err[j];
    const double theoVal = fTheory(t, p, f, m);
    const double theo = ((alphaBeta+1.0)*theoVal - (alpha-1.0))/((alpha+1.0) - (alphaBeta-1.0)*theoVal);
#ifdef MLH
    chisq[j] = 0.0; // log max likelihood not defined here
#else
    if (lerr != 0.0)
      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
    else
      chisq[j] = theo * theo;
#endif
  }
}

861
src/DKSBase.cpp Normal file
View File

@ -0,0 +1,861 @@
#include "DKSBase.h"
#define API_OPENCL "OpenCL"
#define API_CUDA "Cuda"
#define API_OPENMP "OpenMP"
#define DEVICE_GPU "-gpu"
#define DEVICE_CPU "-cpu"
#define DEVICE_MIC "-mic"
//=====================================//
//==========Private functions==========//
//=====================================//
/** True when an API has been set and its name is exactly API_OPENCL. */
bool DKSBase::apiOpenCL() {
  return m_api_set && strcmp(m_api_name, API_OPENCL) == 0;
}
/** True when an API has been set and its name is exactly API_CUDA. */
bool DKSBase::apiCuda() {
  return m_api_set && strcmp(m_api_name, API_CUDA) == 0;
}
/** True when an API has been set and its name is exactly API_OPENMP. */
bool DKSBase::apiOpenMP() {
  return m_api_set && strcmp(m_api_name, API_OPENMP) == 0;
}
/** True when a device has been set and its name is exactly DEVICE_GPU. */
bool DKSBase::deviceGPU() {
  return m_device_set && strcmp(m_device_name, DEVICE_GPU) == 0;
}
/** True when a device has been set and its name is exactly DEVICE_CPU. */
bool DKSBase::deviceCPU() {
  return m_device_set && strcmp(m_device_name, DEVICE_CPU) == 0;
}
/** True when a device has been set and its name is exactly DEVICE_MIC. */
bool DKSBase::deviceMIC() {
  return m_device_set && strcmp(m_device_name, DEVICE_MIC) == 0;
}
/**
 * Load an OpenCL kernel source file.
 * The full path is OPENCL_KERNELS (the kernel directory set at build time) followed
 * by kernel_name.
 *
 * @param kernel_name  path of the kernel file relative to OPENCL_KERNELS
 * @return the code returned through OPENCL_SAFECALL for ocl_loadKernel
 */
int DKSBase::loadOpenCLKernel(const char *kernel_name) {
  // size the buffer from the actual strings; a fixed 500-byte buffer overflows
  // when the install prefix plus the kernel name is long
  const size_t path_len = strlen(OPENCL_KERNELS) + strlen(kernel_name) + 1;
  char * kernel_file = new char[path_len];
  strcpy(kernel_file, OPENCL_KERNELS);
  strcat(kernel_file, kernel_name);

  int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) );

  delete[] kernel_file;
  return ierr;
}
//=====================================//
//==========Public functions===========//
//=====================================//
/**
 * Default constructor: no API, device or function selected, auto tuning and config
 * use switched off. Creates the backend objects of every backend compiled in
 * (DKS_CUDA, DKS_OPENCL, DKS_MIC); within a backend all helpers share one base object.
 */
DKSBase::DKSBase() {

  // selection state: nothing chosen yet
  m_api_name = NULL;
  m_api_set = false;
  m_device_name = NULL;
  m_device_set = false;
  m_function_name = NULL;
  m_function_set = false;

  // optional features are off by default
  m_auto_tuning = false;
  m_use_config = false;

#ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
#endif

#ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif

#ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
#endif

}
/**
 * Construct with a preselected API and device.
 *
 * @param api_name     API name, copied via setAPI (compared against "OpenCL", "Cuda", "OpenMP")
 * @param device_name  device name, copied via setDevice (compared against "-gpu", "-cpu", "-mic")
 *
 * Creates the backend objects of every backend compiled in (DKS_CUDA, DKS_OPENCL, DKS_MIC).
 */
DKSBase::DKSBase(const char* api_name, const char* device_name) {

  // setAPI/setDevice test the *_set flags and delete[] the old name when set, so the
  // flags and pointers must have defined values before the setters are called
  m_device_name = NULL;
  m_api_name = NULL;
  m_device_set = false;
  m_api_set = false;

  setAPI(api_name, strlen(api_name));
  setDevice(device_name, strlen(device_name));

  m_function_name = NULL;
  m_function_set = false;
  m_auto_tuning = false;
  m_use_config = false;

#ifdef DKS_CUDA
  cbase = new CudaBase();
  cfft = new CudaFFT(cbase);
  cgreens = new CudaGreensFunction(cbase);
  cchi = new CudaChiSquare(cbase);
  ccol = new CudaCollimatorPhysics(cbase);
#endif

#ifdef DKS_OPENCL
  oclbase = new OpenCLBase();
  oclfft = new OpenCLFFT(oclbase);
  oclchi = new OpenCLChiSquare(oclbase);
  oclcol = new OpenCLCollimatorPhysics(oclbase);
#endif

#ifdef DKS_MIC
  micbase = new MICBase();
  micfft = new MICFFT(micbase);
  miccol = new MICCollimatorPhysics(micbase);
  micgreens = new MICGreensFunction(micbase);
  micchi = new MICChiSquare(micbase);
#endif

}
/**
 * Destructor: frees the name strings and the backend objects of every backend
 * compiled in. Helper objects are deleted before the base object they were built on.
 */
DKSBase::~DKSBase() {

  // delete[] on a null pointer is a no-op, so no explicit NULL checks are needed
  delete[] m_device_name;
  delete[] m_api_name;
  delete[] m_function_name;

#ifdef DKS_CUDA
  delete cfft;
  delete cgreens;
  delete cchi;
  delete ccol;
  delete cbase;
#endif

#ifdef DKS_OPENCL
  delete oclfft;
  delete oclchi;
  delete oclcol;
  delete oclbase;
#endif

#ifdef DKS_MIC
  delete micfft;
  delete miccol;
  delete micgreens;
  delete micchi;
  delete micbase;
#endif

}
/*
  Name: setDevice
  Info: sets the device to use by storing a private copy of device_name.
        The length argument is deprecated and ignored; the string length is
        taken from device_name itself.
  Return: success or error code
*/
int DKSBase::setDevice(const char* device_name, int length) {

  // release the previously stored name, if any
  if (m_device_set)
    delete[] m_device_name;

  const int len = strlen(device_name);
  m_device_name = new char[len+1];
  strcpy(m_device_name, device_name);   // copies len characters plus the terminator
  m_device_set = true;

  return DKS_SUCCESS;
}
/*
  Name: setAPI
  Info: sets the API (OpenCL, CUDA, OpenACC, OpenMP) to use by storing a private
        copy of api_name. The length argument is ignored; the string length is
        taken from api_name itself.
  Return: success or error code
*/
int DKSBase::setAPI(const char* api_name, int length) {

  // release the previously stored name, if any
  if (m_api_set)
    delete[] m_api_name;

  const int len = strlen(api_name);
  m_api_name = new char[len+1];
  strcpy(m_api_name, api_name);   // copies len characters plus the terminator
  m_api_set = true;

  return DKS_SUCCESS;
}
/*
  Name: getDevices
  Info: query the available devices of all three backends (OpenCL, CUDA, MIC)
  Return: DKS_SUCCESS only when the three return codes add up to DKS_SUCCESS,
          DKS_ERROR otherwise
*/
int DKSBase::getDevices() {

  const int oclStatus = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() );
  const int cudaStatus = CUDA_SAFECALL( cbase->cuda_getDevices() );
  const int micStatus = MIC_SAFECALL( micbase->mic_getDevices() );

  const int combined = oclStatus + cudaStatus + micStatus;
  return (combined != DKS_SUCCESS) ? DKS_ERROR : DKS_SUCCESS;
}
/**
 * Get the number of devices of the selected API (OpenCL or CUDA).
 * ndev is reset to 0 first. OpenMP and an unset/unknown API yield DKS_ERROR.
 */
int DKSBase::getDeviceCount(int &ndev) {
  ndev = 0;

  if (apiOpenCL())
    return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) );
  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) );

  // OpenMP or no/unknown API: not supported
  return DKS_ERROR;
}
/**
 * Get the device name from the selected API (OpenCL or CUDA).
 * OpenMP and an unset/unknown API yield DKS_ERROR.
 */
int DKSBase::getDeviceName(std::string &device_name) {
  if (apiOpenCL())
    return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) );
  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) );

  // OpenMP or no/unknown API: not supported
  return DKS_ERROR;
}
/**
 * Select the device with the given index in the selected API (OpenCL or CUDA).
 * The index is always printed to stdout first. OpenMP and an unset/unknown API
 * yield DKS_ERROR.
 */
int DKSBase::setDefaultDevice(int device) {
  std::cout << "Set device " << device << std::endl;

  if (apiOpenCL())
    return OPENCL_SAFECALL( oclbase->ocl_setDevice(device) );
  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_setDevice(device) );

  // OpenMP or no/unknown API: not supported
  return DKS_ERROR;
}
/**
 * Fill `devices` with the unique devices reported by the selected API (OpenCL or CUDA).
 * OpenMP and an unset/unknown API yield DKS_ERROR.
 */
int DKSBase::getDeviceList(std::vector<int> &devices) {
  if (apiOpenCL())
    return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) );
  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices) );

  // OpenMP or no/unknown API: not supported
  return DKS_ERROR;
}
/*
  init device
  - no API set, or OpenCL without a device: fall back to OpenCL on "-gpu"
  - OpenCL with a device: set up OpenCL on the stored device name
  - Cuda: device is forced to "-gpu"
  - OpenMP: device is forced to "-mic"
  Any other API name yields DKS_ERROR.
*/
int DKSBase::initDevice() {

  // default configuration: OpenCL on the GPU
  if (!m_api_set || (apiOpenCL() && !m_device_set)) {
    setDevice("-gpu", 4);
    setAPI(API_OPENCL, 6);
    return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") );
  }

  if (apiOpenCL()) {
    setAPI(API_OPENCL, 6);
    return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) );
  }

  if (apiCuda()) {
    setDevice("-gpu", 4);
    setAPI(API_CUDA, 4);
    return CUDA_SAFECALL(DKS_SUCCESS);
  }

  if (apiOpenMP()) {
    setDevice("-mic", 4);
    setAPI(API_OPENMP, 6);
    return MIC_SAFECALL(DKS_SUCCESS);
  }

  return DKS_ERROR;
}
/*
  Create a stream for asynchronous data transfer and kernel execution.
  The name "stream" is borrowed from CUDA (an OpenCL context ~ a CUDA stream).
  Implemented for CUDA and MIC (OpenMP API); any other API logs a message and
  returns DKS_ERROR.
  TODO: implementation for OpenCL still needed
*/
int DKSBase::createStream(int &streamId) {

  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_createStream(streamId) );

  if (apiOpenMP())
    return MIC_SAFECALL( micbase->mic_createStream(streamId) );

  DEBUG_MSG("Streams not enbled for this platforms jet");
  return DKS_ERROR;
}
/*
  Send a device pointer to another MPI process (only compiled with DKS_MPI).
  For CUDA an IPC memory handle is obtained for mem_ptr and sent as raw bytes to
  rank `dest` on `comm` with tag 100.
  Return: DKS_SUCCESS when the handle was obtained, DKS_ERROR otherwise
          (including MIC and all other platforms, where this is not implemented)
*/
#ifdef DKS_MPI
int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) {
if ( apiCuda() ) {
#ifdef DKS_CUDA
cudaError cerror;
cudaIpcMemHandle_t shandle;
cerror = cudaIpcGetMemHandle(&shandle, mem_ptr);
// NOTE(review): the handle is sent before cerror is checked, so a failed
// cudaIpcGetMemHandle still sends (an unspecified) handle to the receiver
MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm);
if (cerror != cudaSuccess) {
DEBUG_MSG("Error geting mem handle");
return DKS_ERROR;
}
return DKS_SUCCESS;
#endif
}
else if (apiOpenMP()) {
#ifdef DKS_MIC
//BENI:
DEBUG_MSG("No SendPointer for MIC is implemented");
return DKS_ERROR;
#endif
}
else {
DEBUG_MSG("Send device pointer not implemented on selected platform");
return DKS_ERROR;
}
// reached when the selected API's backend is not compiled in
return DKS_ERROR;
}
#endif
/*
  Receive a device pointer from another MPI process (only compiled with DKS_MPI).
  For CUDA an IPC memory handle is received from rank `hostproc` on `comm` with
  tag 100 and opened in this process.

  hostproc: rank that sends the handle
  comm:     communicator to receive on
  ierr:     set to DKS_SUCCESS when a handle was opened, DKS_ERROR otherwise
  Return:   the opened device pointer, or NULL on any failure or when the
            selected platform has no implementation
*/
#ifdef DKS_MPI
void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) {

  // never hand an indeterminate pointer back to the caller
  void *mem_ptr = NULL;

  if (apiCuda()) {
#ifdef DKS_CUDA
    cudaIpcMemHandle_t rhandle;
    // MPI_STATUS_IGNORE is the portable way to discard the status; a plain NULL
    // is not a valid status argument in every MPI implementation
    MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm,
             MPI_STATUS_IGNORE);
    cudaError cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess);
    if (cerror != cudaSuccess) {
      DEBUG_MSG("Error opening received handle");
      ierr = DKS_ERROR;
      return NULL;
    }
    ierr = DKS_SUCCESS;
#else
    // CUDA selected but not compiled in
    ierr = DKS_ERROR;
#endif
    return mem_ptr;
  }
  else if (apiOpenMP()) {
#ifdef DKS_MIC
    //BENI:
    DEBUG_MSG("No ReceivePointer for MIC is implemented");
#endif
    // not implemented: report the failure instead of returning a status code
    // disguised as a pointer
    ierr = DKS_ERROR;
    return NULL;
  }
  else {
    ierr = DKS_ERROR;
    DEBUG_MSG("Receive device pointer not implemented for selected platform");
    return NULL;
  }
}
#endif
/*
  Close a device memory handle received from another process.
  Only implemented for CUDA; every other case logs a message and returns DKS_ERROR.
*/
int DKSBase::closeHandle(void *mem_ptr) {

  if (apiCuda()) {
#ifdef DKS_CUDA
    const cudaError status = cudaIpcCloseMemHandle(mem_ptr);
    if (status == cudaSuccess)
      return DKS_SUCCESS;

    DEBUG_MSG("Error closing memory handle");
    return DKS_ERROR;
#endif
  }

  DEBUG_MSG("Memory handles not implemented for selected platform");
  return DKS_ERROR;
}
/* Synchronize with the device (CUDA or MIC); any other API returns DKS_ERROR. */
int DKSBase::syncDevice() {

  if (apiCuda())
    return CUDA_SAFECALL( cbase->cuda_syncDevice() );

  if (apiOpenMP())
    return MIC_SAFECALL( micbase->mic_syncDevice() );

  return DKS_ERROR;
}
/*
  Set up FFT plans so they can be reused when several FFTs of the same size are needed.
  CUDA: one plan via CudaFFT::setupFFT.
  MIC:  a real-to-complex plan with scale 1 and a complex-to-real plan with scale
        1/(N[0]*N[1]*N[2]); both are always set up, and the first failing code
        (RC before CR) is returned.
  Any other API returns DKS_ERROR.
*/
int DKSBase::setupFFT(int ndim, int N[3]) {

  if (apiCuda())
    return CUDA_SAFECALL( cfft->setupFFT(ndim, N) );

  if (!apiOpenMP())
    return DKS_ERROR;

  //BENI: setting up RC and CR transformations on MIC
  const int rcStatus = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) );
  const int crStatus = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) );

  if (rcStatus != DKS_SUCCESS)
    return rcStatus;
  return (crStatus != DKS_SUCCESS) ? crStatus : DKS_SUCCESS;
}
//BENI:
/*
  Set up the real-to-complex FFT plan.
  CUDA uses the common CudaFFT::setupFFT (scale is not used there);
  MIC sets up the RC plan with the given scale. Any other API returns DKS_ERROR.
*/
int DKSBase::setupFFTRC(int ndim, int N[3], double scale) {

  if (apiCuda())
    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));

  if (apiOpenMP())
    return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale));

  return DKS_ERROR;
}
//BENI:
/*
  Set up the complex-to-real FFT plan.
  CUDA uses the common CudaFFT::setupFFT (scale is not used there);
  MIC sets up the CR plan with the given scale. Any other API returns DKS_ERROR.
*/
int DKSBase::setupFFTCR(int ndim, int N[3], double scale) {

  if (apiCuda())
    return CUDA_SAFECALL(cfft->setupFFT(ndim, N));

  if (apiOpenMP())
    return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale));

  return DKS_ERROR;
}
/*
  Execute a forward FFT on the selected platform.
  OpenCL loads the FFT kernel file first and fails with DKS_ERROR if that does not
  succeed; streamId is only passed on to the CUDA implementation.
*/
int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {

  if (apiOpenCL()) {
    // the kernel source has to be loaded before it can be executed
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->executeFFT(data_ptr, ndim, dimsize) );
  }

  if (apiCuda())
    return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId));

  if (apiOpenMP())
    return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize));

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Execute an inverse FFT on the selected platform.
  OpenCL loads the FFT kernel file first and fails with DKS_ERROR if that does not
  succeed; streamId is only passed on to the CUDA implementation.
*/
int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {

  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) );
  }

  if (apiCuda())
    return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) );

  if (apiOpenMP())
    return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) );

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Normalize FFT data on the selected platform.
  OpenCL loads the FFT kernel file first and fails with DKS_ERROR if that does not
  succeed; streamId is only passed on to the CUDA implementation.
*/
int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) {

  if (apiOpenCL()) {
    if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") != DKS_SUCCESS )
      return DKS_ERROR;
    return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) );
  }

  if (apiCuda())
    return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) );

  if (apiOpenMP())
    return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) );

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Execute a real-to-complex FFT (CUDA or MIC).
  streamId is only passed on to the CUDA implementation; any other API logs a
  message and returns DKS_ERROR.
*/
int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );

  if (apiOpenMP())
    return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) );

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Execute a complex-to-real FFT (CUDA or MIC).
  streamId is only passed on to the CUDA implementation; any other API logs a
  message and returns DKS_ERROR.
  Note: the MIC call takes (comp_ptr, real_ptr) while the CUDA call takes
  (real_ptr, comp_ptr) - presumably the two signatures differ; verify against MICFFT.
*/
int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) );

  if (apiOpenMP())
    return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) );

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Normalize the result of a complex-to-real inverse FFT.
  Only CUDA has an implementation. For every other API a debug message is logged
  and DKS_SUCCESS is returned (unlike the other call* wrappers, which return
  DKS_ERROR in that situation).
*/
int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) {

  const bool onCuda = apiCuda();
  if (onCuda)
    return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) );

  DEBUG_MSG("No implementation for selected platform");
  return DKS_SUCCESS;
}
/*
  Transpose data on the device.
  Only OpenCL has an implementation: the transpose kernel file is loaded and
  ocl_executeTranspose is run. Any other API logs a message and returns DKS_ERROR.
*/
int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) {

  if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim));
  }

  DEBUG_MSG("No implementation for selected platform");
  return DKS_ERROR;
}
/*
  Calculate the Green's function integral on the device (CUDA or MIC).
  NI, NJ and streamId are only passed on to the CUDA implementation; any other API
  logs a message and returns DKS_ERROR.
*/
int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
                                double hz_m0, double hz_m1, double hz_m2, int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ,
                                                      hz_m0, hz_m1, hz_m2, streamId) );

  //BENI:
  if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2));

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/*
  Integrate the Green's function on the device (CUDA or MIC).
  streamId is only passed on to the CUDA implementation; any other API logs a
  message and returns DKS_ERROR.
*/
int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr,
                                   int I, int J, int K, int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId));

  if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K));

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/*
  Mirror the rho field on the device (CUDA or MIC).
  streamId is only passed on to the CUDA implementation; any other API logs a
  message and returns DKS_ERROR.
*/
int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId));

  if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K));

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/*
  Multiply two complex fields that already reside on the device (CUDA or MIC).
  streamId is only passed on to the CUDA implementation; any other API logs a
  message and returns DKS_ERROR.
*/
int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) {

  if (apiCuda())
    return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId));

  if (apiOpenMP())
    return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size));

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/*
  Run the PHistoTFFcn chi-square calculation on the device (CUDA or OpenCL).
  OpenCL loads the chi-square kernel file first and fails with DKS_ERROR if that
  does not succeed. Any other API logs a message and returns DKS_ERROR.
  The computed value is handed back through `result`.
*/
int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
                             double fTimeResolution, double fRebin,
                             int sensors, int length, int numpar, double &result)
{

  if (apiCuda())
    return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq,
                                                fTimeResolution, fRebin,
                                                sensors, length, numpar,
                                                result));

  if (apiOpenCL()) {
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS)
      return DKS_ERROR;
    return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq,
                                                   fTimeResolution, fRebin,
                                                   sensors, length, numpar, result));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Run the single-Gauss TF chi-square kernel on the selected backend.
 * For OpenCL the chi-square kernel source is loaded first; if loading fails
 * DKS_ERROR is returned without launching anything.
 * Returns the backend call's result, or DKS_ERROR when neither CUDA nor
 * OpenCL is selected.
 */
int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                               double fTimeResolution, double fRebin, double fGoodBinOffset,
                               int sensors, int length, int numpar,
                               double &result)
{
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
                                                  fTimeResolution, fRebin, fGoodBinOffset,
                                                  sensors, length, numpar,
                                                  result));
  }

  if (apiOpenCL()) {
    // the kernel source has to be available before the launch
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS) {
      return DKS_ERROR;
    }
    return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result,
                                                     fTimeResolution, fRebin, fGoodBinOffset,
                                                     sensors, length, numpar, result));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Run the double-Lorentz TF chi-square kernel on the selected backend.
 * For OpenCL the chi-square kernel source is loaded first; if loading fails
 * DKS_ERROR is returned without launching anything.
 * Returns the backend call's result, or DKS_ERROR when neither CUDA nor
 * OpenCL is selected.
 */
int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                                 double fTimeResolution, double fRebin, double fGoodBinOffset,
                                 int sensors, int length, int numpar,
                                 double &result)
{
  if (apiCuda()) {
    return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
                                                    fTimeResolution, fRebin, fGoodBinOffset,
                                                    sensors, length, numpar,
                                                    result));
  }

  if (apiOpenCL()) {
    // the kernel source has to be available before the launch
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") != DKS_SUCCESS) {
      return DKS_ERROR;
    }
    return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result,
                                                       fTimeResolution, fRebin, fGoodBinOffset,
                                                       sensors, length, numpar, result));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Run the collimator-physics kernel on the selected backend (CUDA, OpenCL or
 * OpenMP/MIC). For OpenCL the collimator kernel source is loaded first and
 * DKS_ERROR is returned if that fails.
 * Note: numparams, numaddback and numdead are accepted but not used by this
 * function body.
 * Returns the backend call's result, or DKS_ERROR when no backend matches.
 */
int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr,
                                   int numparticles, int numparams,
                                   int &numaddback, int &numdead)
{
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }

  if (apiOpenCL()) {
    // the kernel source has to be available before the launch
    if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") != DKS_SUCCESS) {
      return DKS_ERROR;
    }
    return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }

  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Run the collimator-physics kernel on CUDA or OpenMP (MIC); this variant has
 * no OpenCL path.
 * Returns the backend call's result, or DKS_ERROR when neither is selected.
 */
int DKSBase::callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles)
{
  if (apiCuda()) {
    return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
  }

  if (apiOpenMP()) {
    return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) );
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Run the structure-of-arrays collimator-physics kernel.
 * Only the OpenMP (MIC) backend is handled here; for any other selection
 * DKS_ERROR is returned.
 */
int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                                      void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                      void *px_ptr, void *py_ptr, void *pz_ptr,
                                      void *par_ptr, int numparticles)
{
  if (!apiOpenMP()) {
    DEBUG_MSG("No implementation for selceted platform");
    return DKS_ERROR;
  }

  return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr,
                                                    rx_ptr, ry_ptr, rz_ptr,
                                                    px_ptr, py_ptr, pz_ptr,
                                                    par_ptr, numparticles) );
}
/**
 * Dispatch the collimator-physics particle sort to CUDA or OpenMP (MIC).
 * numaddback is passed by reference to the backend.
 * Returns the backend call's result, or DKS_ERROR when neither is selected.
 */
int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
{
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  }

  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Dispatch the structure-of-arrays collimator-physics sort.
 * Only the OpenMP (MIC) backend is handled here; for any other selection
 * DKS_ERROR is returned.
 */
int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                                          void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                          void *px_ptr, void *py_ptr, void *pz_ptr,
                                          void *par_ptr, int numparticles, int &numaddback)
{
  if (!apiOpenMP()) {
    DEBUG_MSG("No implementation for selceted platform");
    return DKS_ERROR;
  }

  return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr,
                                                       rx_ptr, ry_ptr, rz_ptr,
                                                       px_ptr, py_ptr, pz_ptr,
                                                       par_ptr, numparticles, numaddback));
}
/**
 * Create the random-number states/streams on the selected backend
 * (CUDA, OpenCL or OpenMP/MIC).
 * Returns the backend call's result, or DKS_ERROR when no backend matches.
 */
int DKSBase::callInitRandoms(int size) {
  if (apiCuda()) {
    return CUDA_SAFECALL(cbase->cuda_createCurandStates(size));
  }

  if (apiOpenCL()) {
    return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size));
  }

  if (apiOpenMP()) {
    return MIC_SAFECALL(micbase->mic_createRandStreams(size));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Dispatch the ParallelTTracker push step to CUDA or OpenMP (MIC).
 * All arguments, including streamId, are forwarded unchanged to the backend.
 * Returns the backend call's result, or DKS_ERROR when neither is selected.
 */
int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
                                      void *dt_ptr, double dt, double c,
                                      bool usedt, int streamId)
{
  if (apiCuda()) {
    return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c,
                                                    usedt, streamId));
  }

  if (apiOpenMP()) {
    return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt,
                                                     c, usedt, streamId));
  }

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}
/**
 * Dispatch the ParallelTTracker push-with-transform step to CUDA or
 * OpenMP (MIC). All arguments are forwarded unchanged to the backend.
 * Returns the backend call's result, or DKS_ERROR when neither is selected.
 */
int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
                                               void *lastSec_ptr, void *orient_ptr,
                                               int npart, int nsec, void *dt_ptr, double dt,
                                               double c, bool usedt, int streamId)
{
  if (apiCuda())
    return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
                                                             lastSec_ptr, orient_ptr,
                                                             npart, nsec, dt_ptr, dt,
                                                             c, usedt, streamId));

  if (apiOpenMP())
    return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr,
                                                              lastSec_ptr, orient_ptr,
                                                              npart, nsec, dt_ptr, dt,
                                                              c, usedt, streamId));

  DEBUG_MSG("No implementation for selceted platform");
  return DKS_ERROR;
}

1133
src/DKSBase.h Normal file

File diff suppressed because it is too large Load Diff

196
src/DKSBaseMuSR.cpp Normal file
View File

@ -0,0 +1,196 @@
#include "DKSBaseMuSR.h"
/**
 * Start without a chi-square runtime; -1 marks that no problem size has
 * been configured yet.
 */
DKSBaseMuSR::DKSBaseMuSR()
  : chiSq(nullptr),
    chiSquareSize_m(-1)
{
}
/** Release the chi-square runtime (if one was created) on destruction. */
DKSBaseMuSR::~DKSBaseMuSR() {
  // freeChiSquare() is a no-op when chiSq is NULL
  freeChiSquare();
}
/**
 * Forward the theory-function string to the chi-square runtime for
 * compilation and return its status code.
 * The chi-square runtime is dereferenced without a null check.
 */
int DKSBaseMuSR::callCompileProgram(std::string function, bool mlh) {
  const int status = chiSq->compileProgram(function, mlh);
  return status;
}
int DKSBaseMuSR::callLaunchChiSquare(int fitType,
void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result)
{
//if we are not auto tuning and the size of the problem has changed find the new parameters
//from autotuning config file
if (!isAutoTuningOn() && length != chiSquareSize_m) {
int numBlocks, blockSize;
std::string device_name;
getDeviceName(device_name);
dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
length, "NumBlocks", numBlocks);
dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
length, "BlockSize", blockSize);
chiSq->setKernelParams(numBlocks, blockSize);
//std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
chiSquareSize_m = length;
}
int ierr = chiSq->launchChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc,
nummap, timeStart, timeStep, result);
if ( isAutoTuningOn() ) {
std::vector<int> config;
callAutoTuningChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, nummap, timeStart,
timeStep, result, config);
}
return ierr;
}
int DKSBaseMuSR::callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result, std::vector<int> &config)
{
int loops = 100;
DKSAutoTuning *autoTuning;
if (apiCuda())
autoTuning = new DKSAutoTuning(this, API_CUDA, DEVICE_GPU_NEW, loops);
else if (apiOpenCL() && deviceGPU())
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_GPU_NEW, loops);
else if (apiOpenCL() && deviceCPU())
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_CPU_NEW, loops);
else if (apiOpenCL() && deviceMIC())
autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_MIC_NEW, loops);
else
autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW, loops);
int maxThreadsPerBlock = 1024;
checkMuSRKernels(fitType, maxThreadsPerBlock);
std::cout << "Max threads for autotune " << maxThreadsPerBlock << std::endl;
//create the function to be timed
std::function<int()> f = std::bind(&ChiSquareRuntime::launchChiSquare, chiSq,
fitType, mem_data, mem_err, length, numpar, numfunc, nummap,
timeStart, timeStep, result);
autoTuning->setFunction(f, "launchChiSquare");
//create the parameters for auto-tuning
autoTuning->addParameter(&chiSq->blockSize_m, 32, maxThreadsPerBlock, 32, "BlockSize");
autoTuning->addParameter(&chiSq->numBlocks_m, 100, 5000, 100, "NumBlocks");
autoTuning->lineSearch();
//autoTuning->hillClimbing(100);
//autoTuning->simulatedAnnealing(1e-3, 1e-6);
//autoTuning->exaustiveSearch();
std::string device_name;
getDeviceName(device_name);
dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
"NumBlocks", chiSq->numBlocks_m);
dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
"BlockSize", chiSq->blockSize_m);
config.push_back(chiSq->blockSize_m);
config.push_back(chiSq->numBlocks_m);
delete autoTuning;
return DKS_SUCCESS;
}
int DKSBaseMuSR::testAutoTuning() {
DKSAutoTuning *autoTuning;
DKSAutoTuningTester *tester;
autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW);
tester = new DKSAutoTuningTester();
std::function<double()> f = std::bind(&DKSAutoTuningTester::peaksZ, tester);
autoTuning->setFunction(f, "testAutoTuner", false);
autoTuning->addParameter(&tester->x, -3.0, 3.0, 0.5, "x");
autoTuning->addParameter(&tester->y, -3.0, 3.0, 0.5, "y");
autoTuning->exaustiveSearch();
autoTuning->hillClimbing(10);
autoTuning->simulatedAnnealing(10, 0.0005);
return DKS_SUCCESS;
}
/** Forward N0, tau and bkg to the chi-square runtime; returns its status code. */
int DKSBaseMuSR::callSetConsts(double N0, double tau, double bkg) {
  const int status = chiSq->setConsts(N0, tau, bkg);
  return status;
}

/** Forward alpha and beta to the chi-square runtime; returns its status code. */
int DKSBaseMuSR::callSetConsts(double alpha, double beta) {
  const int status = chiSq->setConsts(alpha, beta);
  return status;
}
int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int size_map) {
int ierr;
if (apiCuda()) {
ierr = CUDA_SAFECALL( DKS_SUCCESS );
chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
} else {
ierr = OPENCL_SAFECALL( DKS_SUCCESS );
chiSq = OPENCL_SAFECALL(new OpenCLChiSquareRuntime(getOpenCLBase()));
}
if (ierr == DKS_SUCCESS) {
return chiSq->initChiSquare(size_data, size_param, size_func, size_map);
} else {
DEBUG_MSG("DKS API not set, or DKS compiled without sellected API support");
return DKS_ERROR;
}
}
/**
 * Free the chi-square runtime's device resources and destroy the runtime.
 * Returns DKS_SUCCESS when there is nothing to free, otherwise the status
 * reported by the runtime's freeChiSquare().
 */
int DKSBaseMuSR::freeChiSquare() {
  if (chiSq == NULL) {
    return DKS_SUCCESS;
  }

  const int status = chiSq->freeChiSquare();
  delete chiSq;
  chiSq = NULL;
  return status;
}
/** Forward the parameter array to the chi-square runtime; returns its status code. */
int DKSBaseMuSR::writeParams(const double *params, int numparams) {
  const int status = chiSq->writeParams(params, numparams);
  return status;
}
/** Forward the function-value array to the chi-square runtime; returns its status code. */
int DKSBaseMuSR::writeFunctions(const double *func, int numfunc) {
  const int status = chiSq->writeFunc(func, numfunc);
  return status;
}
/** Forward the map-index array to the chi-square runtime; returns its status code. */
int DKSBaseMuSR::writeMaps(const int *map, int numfunc) {
  const int status = chiSq->writeMap(map, numfunc);
  return status;
}
/**
 * Ask the chi-square runtime whether the kernels for fitType can run.
 * The threads-per-block value reported by the runtime is discarded.
 */
int DKSBaseMuSR::checkMuSRKernels(int fitType) {
  int ignoredThreadsPerBlock = 1;
  const int status = chiSq->checkChiSquareKernels(fitType, ignoredThreadsPerBlock);
  return status;
}

/**
 * Same check as above; threadsPerBlock is passed by reference to the runtime
 * so the caller can read back whatever value it stores there.
 */
int DKSBaseMuSR::checkMuSRKernels(int fitType, int &threadsPerBlock) {
  const int status = chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
  return status;
}
/** Query the chi-square runtime for its operation count (written to oper); returns its status code. */
int DKSBaseMuSR::getOperations(int &oper) {
  const int status = chiSq->getOperations(oper);
  return status;
}

137
src/DKSBaseMuSR.h Normal file
View File

@ -0,0 +1,137 @@
#ifndef H_DKS_BASEMUSR
#define H_DKS_BASEMUSR
#include <iostream>
#include <string>
#include "AutoTuning/DKSAutoTuning.h"
#include "AutoTuning/DKSAutoTuningTester.h"
#include "DKSBase.h"
#include "Algorithms/ChiSquareRuntime.h"
#ifdef DKS_CUDA
#include "CUDA/CudaChiSquareRuntime.cuh"
#endif
#ifdef DKS_OPENCL
#include "OpenCL/OpenCLChiSquareRuntime.h"
#endif
/** DKSBase extension that exposes the muSR chi-square runtime (CUDA or OpenCL). */
class DKSBaseMuSR : public DKSBase {

private:

  // backend-specific chi-square runtime; NULL until initChiSquare() succeeds
  ChiSquareRuntime *chiSq;
  // data length the kernel parameters were last configured for (-1 = none yet)
  int chiSquareSize_m;

public:

  DKSBaseMuSR();
  ~DKSBaseMuSR();

  /** Compile the program with kernels to be run.
   *  String function contains the string that will be added to the code to compile in the
   *  function: __device__ double fTheory(double t, double *p, double *f, int *m);
   *  Function string must be a valid C math expression. It can contain operators, math functions
   *  and predefined functions listed in:
   *  http://lmu.web.psi.ch/musrfit/user/MUSR/MusrFit.html#A_4.3_The_THEORY_Block
   *  Predefined functions can be accessed by the abbreviation given in the table.
   *  Parameters can be accessed in the form p[idx] or p[m[idx]] - where p represents the
   *  parameter array, m represents the map array and idx is the index to use from the maps.
   *  Precalculated function values can be accessed the same way - f[idx] or f[m[idx]].
   *  Returns DKS_SUCCESS if everything runs successfully, otherwise returns DKS_ERROR.
   *  If DKS is compiled with the debug flag enabled, prints a DKS error message in case
   *  something fails.
   */
  int callCompileProgram(std::string function, bool mlh = false);

  /** Launch chi square calculation on the data set written in mem_data memory on device.
   *  mem_err is passed along with mem_data to the chi-square runtime. Parameter, function
   *  and map values are not arguments of this call; they are written to the device
   *  beforehand with writeParams(), writeFunctions() and writeMaps().
   *  The resulting chi square value for this data set will be put in the result variable.
   *  Returns DKS_SUCCESS if everything runs successfully, otherwise returns DKS_ERROR.
   *  If DKS is compiled with the debug flag enabled, prints a DKS error message in case
   *  something fails.
   */
  int callLaunchChiSquare(int fitType,
                          void *mem_data, void *mem_err, int length,
                          int numpar, int numfunc, int nummap,
                          double timeStart, double timeStep,
                          double &result);

  /** Launch auto-tuning of the chi square function for the selected device.
   *  Creates a function object for the runtime's launchChiSquare with the necessary
   *  arguments bound to the call. CUDA and OpenCL version - gives the AutoTuning class
   *  access to the block size and number of blocks, which are varied by the AutoTuning
   *  class to find the optimal values. The implementation currently uses the line search
   *  of DKSAutoTuning. On return config holds {blockSize, numBlocks}.
   */
  int callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
                              int numpar, int numfunc, int nummap,
                              double timeStart, double timeStep,
                              double &result, std::vector<int> &config);

  /** Set N0, tau and BKG values for the run.
   *  Needs to be called before kernel launch if these values are changing.
   */
  int callSetConsts(double N0, double tau, double bkg);

  /** Set alpha and beta values for the run.
   *  Needs to be called before kernel launch if these values are changing.
   */
  int callSetConsts(double alpha, double beta);

  /** Init chisquare calculations.
   *  Size is the maximum number of elements in any of the data sets used.
   */
  int initChiSquare(int size_data, int size_param, int size_func, int size_map);

  /** Free temporary device storage allocated for the chi^2 kernel.
   *  Returns an error code if freeing the device storage fails.
   */
  int freeChiSquare();

  /** Write params to device.
   *  Write params from a double array to the device; params device memory is managed by DKS.
   */
  int writeParams(const double *params, int numparams);

  /** Write function values to device.
   *  Write precalculated function values to the device; memory for functions on the device
   *  is handled by DKS.
   */
  int writeFunctions(const double *func, int numfunc);

  /** Write map indexes to device.
   *  Write map indexes to use in the defined theory function to the device. Memory for map
   *  indexes is handled by DKS.
   */
  int writeMaps(const int *map, int numfunc);

  /** Check if device can run necessary kernels.
   *  Check selected device properties to see if the device
   *  supports double precision and if the device can run the
   *  necessary number of work_items / work_groups to successfully
   *  execute CUDA/OpenCL kernels.
   */
  int checkMuSRKernels(int fitType);

  /** Perform the same check as checkMuSRKernels(int fitType) and return max threads per block.
   *  Used for autotuning to check what the device limit for threads per block is, to correctly
   *  set the upper bound when searching the parameter space.
   */
  int checkMuSRKernels(int fitType, int &threadsPerBlock);

  /** Debug function to test the auto-tuning search functions.
   */
  int testAutoTuning();

  /** Get the number of operations in the compiled kernel.
   */
  int getOperations(int &oper);

};
#endif

71
src/DKSDefinitions.h Normal file
View File

@ -0,0 +1,71 @@
#ifndef H_DKS_DEFINITIONS
#define H_DKS_DEFINITIONS
//API names as strings
#define API_OPENCL "OpenCL"
#define API_CUDA "Cuda"
#define API_OPENMP "OpenMP"
#define API_UNKNOWN "Unknown"
//device names as plain strings (the *_NEW variants)
#define DEVICE_GPU_NEW "GPU"
#define DEVICE_CPU_NEW "CPU"
#define DEVICE_MIC_NEW "MIC"
#define DEVICE_UNKNOWN_NEW "Unknown"
//device names in flag-style spelling
#define DEVICE_GPU "-gpu"
#define DEVICE_CPU "-cpu"
#define DEVICE_MIC "-mic"
//define macro for printing debug messages if debug flag is set
//without DEBUG the macro expands to nothing, so its argument is never evaluated
#ifdef DEBUG
#define DEBUG_MSG(x) (std::cout << x << std::endl)
#else
#define DEBUG_MSG(x)
#endif
//define DKS error codes
#define DKS_SUCCESS 0
#define DKS_ERROR 1
#define DKS_API_NOT_ENABLED 100
#define OCL_SUCCESS 0
#define OCL_ERROR 1
//define macros to enable or disable calls to specific frameworks
//if the framework specific flag is set, execute the statement; if not, give the DKS_API_NOT_ENABLED error
//note: when the flag is not set the wrapped expression is discarded entirely and never evaluated
#ifdef DKS_CUDA
#define CUDA_SAFECALL(...) ( __VA_ARGS__ )
#else
#define CUDA_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif
#ifdef DKS_OPENCL
#define OPENCL_SAFECALL(...) ( __VA_ARGS__ )
#else
#define OPENCL_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif
#ifdef DKS_MIC
#define MIC_SAFECALL(...) ( __VA_ARGS__ )
#else
#define MIC_SAFECALL(...) ( DKS_API_NOT_ENABLED )
#endif
//pointer-valued counterparts of the SAFECALL macros: wrap the creation of a
//framework specific object, yielding NULL when the framework flag is not set
//(x is a single macro argument, so it must not contain a top-level comma)
#ifdef DKS_CUDA
#define CUDA_SAFEINIT(x) ( x )
#else
#define CUDA_SAFEINIT(x) ( NULL )
#endif
#ifdef DKS_OPENCL
#define OPENCL_SAFEINIT(x) ( x )
#else
#define OPENCL_SAFEINIT(x) ( NULL )
#endif
#ifdef DKS_MIC
#define MIC_SAFEINIT(x) ( x )
#else
#define MIC_SAFEINIT(x) ( NULL )
#endif
#endif

0
src/DKSDevice.cpp Normal file
View File

37
src/DKSDevice.h Normal file
View File

@ -0,0 +1,37 @@
/*
Author: Uldis Locans
Info: class that holds information about the compute device
Date: 25.09.2014
*/
// Include guard: without it, including this header twice in one translation
// unit redefines class Device and fails to compile.
#ifndef H_DKS_DEVICE
#define H_DKS_DEVICE

// numeric codes for the kind of compute device
#define DKS_DEVICE_TYPE_GPU 1
#define DKS_DEVICE_TYPE_MIC 2
#define DKS_DEVICE_TYPE_CPU 3

/** Holds descriptive information about a single compute device. */
class Device {

private:

  int m_device_id;
  // presumably one of the DKS_DEVICE_TYPE_* codes above - TODO confirm
  int m_device_type;
  char *m_device_name;
  char *m_device_vendor;

  // which programming frameworks the device supports
  bool m_sup_opencl;
  bool m_sup_cuda;
  bool m_sup_openmp;
  bool m_sup_openacc;

  int m_pci_bus_id;

public:

  Device();
  ~Device();

};

#endif

View File

@ -0,0 +1,130 @@
#include "DKSImageReconstruction.h"
/**
 * Configure the base for CUDA on a GPU device, initialise the device and
 * create the CUDA image-reconstruction backend.
 * When DKS is built without DKS_CUDA, CUDA_SAFEINIT yields NULL, so
 * imageRecon is NULL afterwards.
 */
DKSImageRecon::DKSImageRecon() {
  //set up base. since reconstruction is always using cuda, set up base to CUDA
  setAPI("Cuda");
  setDevice("-gpu");
  // the device is initialised before the backend is created from getCudaBase()
  initDevice();
  imageRecon = CUDA_SAFEINIT( new CudaImageReconstruction(getCudaBase()) );
}
/** Destroy the image-reconstruction backend created in the constructor. */
DKSImageRecon::~DKSImageRecon() {
  // The backend is a single object created with scalar `new`, so it must be
  // released with scalar `delete`; `delete[]` here is undefined behaviour.
  // Deleting a NULL pointer (backend compiled out) is a no-op.
  delete imageRecon;
}
/** Forward to ImageReconstruction::calculateSource and return its status code. */
int DKSImageRecon::callCalculateSource(void *image_space, void *image_position,
                                       void *source_position, void *avg, void *std,
                                       float diameter, int total_voxels,
                                       int total_sources, int start)
{
  return imageRecon->calculateSource(image_space, image_position, source_position,
                                     avg, std, diameter, total_voxels,
                                     total_sources, start);
}
/** Forward to ImageReconstruction::calculateBackground and return its status code. */
int DKSImageRecon::callCalculateBackground(void *image_space, void *image_position,
                                           void *source_position, void *avg, void *std,
                                           float diameter, int total_voxels,
                                           int total_sources, int start)
{
  return imageRecon->calculateBackground(image_space, image_position,
                                         source_position, avg, std, diameter,
                                         total_voxels, total_sources, start);
}
/**
 * Forward to ImageReconstruction::calculateSources (diameter is passed as a
 * pointer in this variant) and return its status code.
 */
int DKSImageRecon::callCalculateSources(void *image_space, void *image_position,
                                        void *source_position, void *avg, void *std,
                                        void *diameter, int total_voxels,
                                        int total_sources, int start)
{
  return imageRecon->calculateSources(image_space, image_position,
                                      source_position, avg, std, diameter,
                                      total_voxels, total_sources, start);
}
/**
 * Forward to ImageReconstruction::calculateBackgrounds (diameter is passed as
 * a pointer in this variant) and return its status code.
 */
int DKSImageRecon::callCalculateBackgrounds(void *image_space, void *image_position,
                                            void *source_position, void *avg, void *std,
                                            void *diameter, int total_voxels,
                                            int total_sources, int start)
{
  return imageRecon->calculateBackgrounds(image_space, image_position,
                                          source_position, avg, std, diameter,
                                          total_voxels, total_sources, start);
}
/** Forward to ImageReconstruction::generateNormalization and return its status code. */
int DKSImageRecon::callGenerateNormalization(void *recon, void *image_position,
                                             void *det_position, int total_det)
{
  return imageRecon->generateNormalization(recon, image_position,
                                           det_position, total_det);
}
/** Forward to ImageReconstruction::forwardProjection and return its status code. */
int DKSImageRecon::callForwardProjection(void *correction, void *recon, void *list_data,
                                         void *det_position, void *image_position, int num_events)
{
  return imageRecon->forwardProjection(correction, recon, list_data, det_position,
                                       image_position, num_events);
}
/** Forward to ImageReconstruction::backwardProjection and return its status code. */
int DKSImageRecon::callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
                                          void *det_position, void *image_position,
                                          int num_events, int num_voxels)
{
  return imageRecon->backwardProjection(correction, recon_corrector, list_data,
                                        det_position, image_position, num_events,
                                        num_voxels);
}
/** Forward the voxel dimensions to the reconstruction backend; returns its status code. */
int DKSImageRecon::setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) {
  return imageRecon->setDimensions(voxel_x, voxel_y, voxel_z, voxel_size);
}
/** Forward the image edge values to the reconstruction backend; returns its status code. */
int DKSImageRecon::setEdge(float x_edge, float y_edge, float z_edge) {
  return imageRecon->setEdge(x_edge, y_edge, z_edge);
}
/** Forward the edge1 values (plus z_edge2) to the reconstruction backend; returns its status code. */
int DKSImageRecon::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) {
  return imageRecon->setEdge1(x_edge1, y_edge1, z_edge1, z_edge2);
}
/** Forward the two minimum-crystal-distance values to the reconstruction backend; returns its status code. */
int DKSImageRecon::setMinCrystalInRing(float min_CrystalDist_InOneRing,
                                       float min_CrystalDist_InOneRing1)
{
  return imageRecon->setMinCrystalInRing(min_CrystalDist_InOneRing,
                                         min_CrystalDist_InOneRing1);
}
/** Forward the remaining reconstruction parameters to the backend; returns its status code. */
int DKSImageRecon::setParams(float matrix_distance_factor, float phantom_diameter,
                             float atten_per_mm, float ring_diameter)
{
  return imageRecon->setParams(matrix_distance_factor, phantom_diameter,
                               atten_per_mm, ring_diameter);
}

View File

@ -0,0 +1,120 @@
#ifndef H_DKS_IMAGERECONSTRUCTION
#define H_DKS_IMAGERECONSTRUCTION
#include <iostream>
#include "DKSBase.h"
#include "Algorithms/ImageReconstruction.h"
#ifdef DKS_CUDA
#include "CUDA/CudaImageReconstruction.cuh"
#endif
/** DKSBase extension exposing the image-reconstruction backend (created for CUDA in the constructor). */
class DKSImageRecon : public DKSBase {

private:

  // reconstruction backend; NULL when DKS is built without CUDA support
  ImageReconstruction *imageRecon;

public:

  DKSImageRecon();

  ~DKSImageRecon();

  /** Image reconstruction analysis - calculate source.
   *  Forwards all arguments to ImageReconstruction::calculateSource and returns its result.
   *  diameter is a single float value in this variant.
   */
  int callCalculateSource(void *image_space, void *image_position, void *source_position,
                          void *avg, void *std, float diameter, int total_voxels,
                          int total_sources, int start = 0);

  /** Image reconstruction analysis - calculate background.
   *  Forwards all arguments to ImageReconstruction::calculateBackground and returns its result.
   *  diameter is a single float value in this variant.
   */
  int callCalculateBackground(void *image_space, void *image_position, void *source_position,
                              void *avg, void *std, float diameter, int total_voxels,
                              int total_sources, int start = 0);

  /** Image reconstruction analysis - calculate sources.
   *  Forwards all arguments to ImageReconstruction::calculateSources and returns its result.
   *  diameter is passed as a pointer in this variant.
   */
  int callCalculateSources(void *image_space, void *image_position, void *source_position,
                           void *avg, void *std, void *diameter, int total_voxels,
                           int total_sources, int start = 0);

  /** Image reconstruction analysis - calculate backgrounds.
   *  Forwards all arguments to ImageReconstruction::calculateBackgrounds and returns its result.
   *  diameter is passed as a pointer in this variant.
   */
  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
                               void *avg, void *std, void *diameter, int total_voxels,
                               int total_sources, int start = 0);

  /** Image reconstruction - generate normalization.
   *
   */
  int callGenerateNormalization(void *recon, void *image_position,
                                void *det_position, int total_det);

  /** Image reconstruction - forward projection.
   *
   */
  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
                            void *image_position, int num_events);

  /** Image reconstruction - backward projection.
   *
   */
  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
                             void *det_position, void *image_position,
                             int num_events, int num_voxels);

  /** Set the voxel dimensions on device.
   *  Values are stored in GPU memory and used in forward and backward projection calculations.
   *  Call the set function once to transfer the values from host side to GPU.
   *  If a value changes on the host side, the set function needs to be called again to update
   *  the GPU values.
   */
  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);

  /** Set the image edge.
   *  Values are stored in GPU memory and used in forward and backward projection calculations.
   *  Call the set function once to transfer the values from host side to GPU.
   *  If a value changes on the host side, the set function needs to be called again to update
   *  the GPU values.
   */
  int setEdge(float x_edge, float y_edge, float z_edge);

  /** Set the image edge1.
   *  Values are stored in GPU memory and used in forward and backward projection calculations.
   *  Call the set function once to transfer the values from host side to GPU.
   *  If a value changes on the host side, the set function needs to be called again to update
   *  the GPU values.
   */
  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);

  /** Set the minimum crystal distance in one ring values.
   *  Values are stored in GPU memory and used in forward and backward projection calculations.
   *  Call the set function once to transfer the values from host side to GPU.
   *  If a value changes on the host side, the set function needs to be called again to update
   *  the GPU values.
   */
  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);

  /** Set all other required parameters for reconstruction.
   *  Values are stored in GPU memory and used in forward and backward projection calculations.
   *  Call the set function once to transfer the values from host side to GPU.
   *  If a value changes on the host side, the set function needs to be called again to update
   *  the GPU values.
   */
  int setParams(float matrix_distance_factor, float phantom_diameter,
                float atten_per_mm, float ring_diameter);

};
#endif

24
src/DKSStream.h Normal file
View File

@ -0,0 +1,24 @@
/*
Author: Uldis Locans
Date: 12.12.2014
Comment: depending on the device used, creates different CUDA streams or OpenCL contexts (MIC: not decided yet)
that allow handling of asynchronous data transfer and kernel execution on the device
*/
#ifndef H_DKSSTREAM
#define H_DKSSTREAM

// same values as the DKS_SUCCESS / DKS_ERROR definitions in DKSDefinitions.h
#define DKS_SUCCESS 0
#define DKS_ERROR 1

#include <iostream>
#include <cuda_runtime.h>

/** Placeholder for per-device stream handling; no members yet. */
class DKSStream {

};  // the terminating ';' was missing, which breaks any file including this header

#endif  // H_DKSSTREAM (the include guard was never closed)

25
src/MIC/CMakeLists.txt Normal file
View File

@ -0,0 +1,25 @@
# Source files of the Intel MIC backend.
set (_SRCS
  MICBase.cpp
  MICChiSquare.cpp
  MICFFT.cpp
  MICGreensFunction.cpp
  MICCollimatorPhysics.cpp
)

# Header files of the Intel MIC backend.
set (_HDRS
  MICBase.h
  MICChiSquare.h
  MICFFT.h
  MICCollimatorPhysics.h
  MICGreensFunction.hpp
  MICMergeSort.h
)

# Register the files through the project-provided add_sources / add_headers
# commands and install the headers under include/MIC.
add_sources (${_SRCS})
add_headers (${_HDRS})

install (FILES ${_HDRS} DESTINATION include/MIC)

124
src/MIC/MICBase.cpp Normal file
View File

@ -0,0 +1,124 @@
#include "MICBase.h"
//constructor: device id defaults to 0 and no default random streams exist yet (-1)
MICBase::MICBase()
  : defaultRndSet(-1),
    m_device_id(0)
{
}
//destructor, deletes the default random streams if they are set
//(the return value of mic_deleteRandStreams is ignored)
MICBase::~MICBase() {
  mic_deleteRandStreams();
}
//create default rand streams
//Allocates one MKL VSL stream per OpenMP thread on the MIC device, seeded
//with time(NULL) + thread index. An array from an earlier call is deleted
//first. The size argument is not used: the stream count follows the number
//of OpenMP threads. Always returns DKS_SUCCESS.
int MICBase::mic_createRandStreams(int size) {

  int seed = time(NULL);

#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed)
  {
    //get the number of threads; only one thread writes the shared variable
    int numThreads = 1;
#pragma omp parallel
    {
#pragma omp single
      numThreads = omp_get_num_threads();
    }

    //if default rnd stream already allocated delete the array
    if (defaultRndSet == 1)
      delete[] defaultRndStream;

    //allocate defaultRndStream array
    defaultRndStream = new VSLStreamStatePtr[numThreads];

    //create stream states for each thread
    //The bound must be the allocated element count. Calling
    //omp_get_num_threads() in the loop condition is not a reliable
    //loop-invariant bound (it is 1 outside a parallel region), which could
    //leave most of the array uninitialised.
#pragma omp parallel for
    for (int i = 0; i < numThreads; i++)
      vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i);

    defaultRndSet = 1;
  }

  return DKS_SUCCESS;
}
//delete default rand streams
int MICBase::mic_deleteRandStreams() {
#pragma offload target(mic:m_device_id) inout(defaultRndSet)
{
if (defaultRndSet == 1) {
delete[] defaultRndStream;
defaultRndSet = -1;
}
}
return DKS_ERROR;
}
//create a new signal for the mic
//An int stored in micStreams serves as the signal; its index in the vector
//is both the stored value and the id handed back through streamId.
//No offload is issued to create the signal on the device (the corresponding
//empty offload is disabled).
int MICBase::mic_createStream(int & streamId) {
  const int newId = micStreams.size();
  micStreams.push_back(newId);
  streamId = newId;
  return DKS_SUCCESS;
}
//return a reference to the signal stored at index id (no bounds checking)
int& MICBase::mic_getStream(int id) {
  int &signal = micStreams[id];
  return signal;
}
//delete streams: drops all stored signal ids; always returns DKS_SUCCESS
int MICBase::mic_deleteStreams() {
  micStreams.clear();
  return DKS_SUCCESS;
}
//sets the device id used as the target of subsequent offload pragmas;
//the id is stored without validation, always returns DKS_SUCCESS
int MICBase::mic_setDeviceId(int id) {
  m_device_id = id;
  return DKS_SUCCESS;
}
//print information about the available mic devices: the device count
//reported by the offload runtime and the max OpenMP thread count queried on
//the currently selected device. Always returns DKS_SUCCESS.
int MICBase::mic_getDevices() {

  const int numDevices = _Offload_number_of_devices();

  std::cout << "==============================" << std::endl;
  std::cout << "==========Intel MICs==========" << std::endl;
  std::cout << "==============================" << std::endl;
  std::cout << "Total mic devices: " << numDevices << std::endl;

  //ask the selected device for its thread limit
  int thread_count = 0;
#pragma offload target(mic:m_device_id) inout(thread_count)
  {
    thread_count = omp_get_max_threads();
  }

  std::cout << "Max threads: " << thread_count << std::endl;

  return DKS_SUCCESS;
}

244
src/MIC/MICBase.h Normal file
View File

@ -0,0 +1,244 @@
/*
Name: MIC Base
Author: Uldis Locans
Info: class to handle set up and data transfer from host to Intel MIC devices
Date: 29.09.2014
*/
#ifndef H_MIC_BASE
#define H_MIC_BASE
#include <iostream>
#include <omp.h>
#include <offload.h>
#include <mkl_dfti.h>
#include <mkl_vsl.h>
#include <vector>
#include <time.h>
#include "../DKSDefinitions.h"
#define DKS_ALLOC alloc_if(1)
#define DKS_FREE free_if(1)
#define DKS_RETAIN free_if(0)
#define DKS_REUSE alloc_if(0)
#define MIC_WIDTH 128
class MICBase {
private:
std::vector<int> micStreams;
protected:
int defaultRndSet;
public:
VSLStreamStatePtr *defaultRndStream;
int m_device_id;
/* constructor */
MICBase();
/* destructor */
~MICBase();
/*
Info: create MKL rand streams for each thread
Return: success or error code
*/
int mic_createRandStreams(int size);
/*
Info: delete MKL rand streams
Return: succes or error code
*/
int mic_deleteRandStreams();
/*
Info: create a new signal for the mic
Return: success or error code
*/
int mic_createStream(int & streamId);
/*
Info: get the signal from the vector
Return: mic signal
*/
int& mic_getStream(int id);
/*
Info: delete streams
Return: success or error code
*/
int mic_deleteStreams();
/*
Info: set device id
Return: success or error code
*/
int mic_setDeviceId(int id);
/*
Info: get mic devices
Return: success or error code
*/
int mic_getDevices();
/*
Info: allocate memory on MIC device
Return: success or error code
*/
template<typename T>
void * mic_allocateMemory(int size) {
int padding = size % MIC_WIDTH;
int totalsize = size + padding;
T *tmp = (T*)_mm_malloc(sizeof(T)*totalsize, 64); // = new T[size];
#pragma offload_transfer target(mic:m_device_id) nocopy(tmp:length(totalsize) DKS_ALLOC DKS_RETAIN)
return tmp;
}
/*
Info: transfer data to device
Return: success or error code
*/
template<typename T>
int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) {
T* tmp_ptr = (T*)data_ptr;
T* tmp_data = (T*)data;
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
return DKS_SUCCESS;
}
/*
Info: write data to device, non-blocking
Return: success or error code
*/
template<typename T>
int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0)
{
T* tmp_ptr = (T*)data_ptr;
T* tmp_data = (T*)data;
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) )
return DKS_SUCCESS;
}
/*
Info: read data from device
Return: success or error code
*/
template<typename T>
int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) {
T* tmp_ptr = (T*)data_ptr;
T* tmp_result = (T*)result;
//std::cout << "try to read data with size = " << size << " adn offset = " << offset << std::endl;
#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) )
return DKS_SUCCESS;
}
/*
  Info: copy `size` elements of type T from the device buffer identified by
        `data_ptr`, starting at element `offset`, into the host buffer
        `result`. The transfer pragma carries no signal/wait clause and
        `streamId` is not used, so this currently issues the same transfer
        as mic_readData.
  Return: always DKS_SUCCESS
*/
template<typename T>
int mic_readDataAsync(const void * data_ptr, void * result, int size,
                      int streamId = -1, int offset = 0) {
    T* host_dst = (T*)result;
    T* dev_src = (T*)data_ptr;
#pragma offload_transfer target(mic:m_device_id) out(dev_src[offset:size] : DKS_REUSE DKS_RETAIN into(host_dst[0:size]) )
    {
    }
    return DKS_SUCCESS;
}
/*
  Info: synchronisation point for outstanding device work
  Return: always DKS_SUCCESS
*/
int mic_syncDevice() {
    // The signal based implementation (one empty offload per stream with
    // wait(...) and signal(...) clauses) is disabled, so there is currently
    // nothing to wait for and the call returns immediately.
    return DKS_SUCCESS;
}
/*
  Info: release the device buffer identified by `data_ptr`
  Param size: element count that was requested when the buffer was allocated
  Return: always DKS_SUCCESS
  NOTE(review): only the device side is released here; no host-side free of
  data_ptr is performed in this function.
  NOTE(review): size % MIC_WIDTH is the remainder, so the length below is not
  necessarily a multiple of MIC_WIDTH - confirm it is compatible with the
  length used when the buffer was allocated.
*/
template<typename T>
int mic_freeMemory(void * data_ptr, int size) {
    int totalsize = size + size % MIC_WIDTH;
    T* dev_ptr = (T*)data_ptr;
#pragma offload_transfer target(mic:m_device_id) nocopy(dev_ptr:length(totalsize) DKS_REUSE DKS_FREE)
    {
    }
    return DKS_SUCCESS;
}
/*
  Info: allocate a host buffer of `size` elements of type T, allocate the
        matching device buffer and copy `size` elements from `data` into it
  Return: host pointer (allocated with new[]) that identifies the device
          buffer in later transfers
  NOTE(review): unlike mic_allocateMemory the buffer is neither padded nor
  64-byte aligned - confirm this is acceptable for the kernels that use it.
*/
template<typename T>
void * mic_pushData(const void * data, int size) {
    T* tmp_ptr = new T[size];
    T* tmp_data = (T*)data;
    // The whole directive has to live on one logical line; a line break
    // without a trailing backslash would end the pragma early.
#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_ALLOC DKS_RETAIN into(tmp_ptr[0:size]) )
    {
    }
    return tmp_ptr;
}
/*
  Info: copy `size` elements of type T from the device buffer identified by
        `data_ptr` into the host buffer `result` and release the device buffer
  Return: always DKS_SUCCESS
*/
template<typename T>
int mic_pullData(void * data_ptr, void * result, int size) {
    T* host_dst = (T*)result;
    T* dev_src = (T*)data_ptr;
#pragma offload_transfer target(mic:m_device_id) out(dev_src[0:size] : DKS_REUSE DKS_FREE into(host_dst[0:size]) )
    {
    }
    return DKS_SUCCESS;
}
};
#endif

93
src/MIC/MICChiSquare.cpp Normal file
View File

@ -0,0 +1,93 @@
#include "MICChiSquare.h"
/*
  Element-wise chi^2 terms on the MIC device:
      result[i] = (O[i] - E[i])^2 / E[i]   for i in [0, size)
  O, E and result are passed with length(0), i.e. no data is transferred by
  this call; the buffers must already be present on the device.
  Returns DKS_SUCCESS.
*/
int MICChiSquare::mic_chi2(double *O, double *E, double *result, int size) {
#pragma offload target(mic:m_micbase->m_device_id) \
    in(O:length(0) DKS_RETAIN DKS_REUSE) \
    in(E:length(0) DKS_RETAIN DKS_REUSE) \
    in(result:length(0) DKS_RETAIN DKS_REUSE) \
    in(size)
    {
        #pragma omp parallel for
        for (int k = 0; k < size; ++k) {
            const double residual = O[k] - E[k];
            result[k] = pow(residual, 2) / E[k];
        }
    }
    return DKS_SUCCESS;
}
/*
  Evaluate N(t) for jsize parameter sets on the MIC device.
  For set j the first six entries of p[j*psize ...] are read as
  N0, Nbkg, A0, phi, sigma, B and nsize samples are written to
  nt[j*nsize ...] with t = n*deltaT:
      N(t) = N0 * exp2(-t/tau) * (1 + A0 * exp2(-0.5*sigma^2*t^2) * cos(gamma*B*t + phi)) + Nbkg
  nt and p are passed with length(0): no data is transferred by this call.
  Returns DKS_SUCCESS.
  NOTE(review): gamma and tau are hard-coded placeholders (marked "???" by
  the author), and exp2 is the base-2 exponential - confirm that base 2 and
  not exp() is intended.
*/
int MICChiSquare::mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT) {
#pragma offload target(mic:m_micbase->m_device_id) \
    in(nt:length(0) DKS_RETAIN DKS_REUSE) \
    in(p:length(0) DKS_RETAIN DKS_REUSE) \
    in(psize) in(nsize) in(jsize) in(deltaT)
    {
        double gamma = 0.01; //???
        double tau = 0.01; //???
        for (int set = 0; set < jsize; ++set) {
            // parameters of this set
            const int first = set*psize;
            const double N0 = p[first];
            const double Nbkg = p[first+1];
            const double A0 = p[first+2];
            const double phi = p[first+3];
            const double sigma = p[first+4];
            const double B = p[first+5];
            // loop invariant factors
            const double a1 = -0.5*sigma*sigma;
            const double b1 = gamma*B;
            const int outBase = set*nsize;
            #pragma omp parallel for
            for (int n = 0; n < nsize; ++n) {
                const double t = n*deltaT;
                const double a = a1*t*t;
                const double b = b1*t + phi;
                const double At = A0 * exp2(a) * cos(b);
                const double c = -t/tau;
                nt[outBase + n] = N0 * exp2(c) * (1 + At) + Nbkg;
            }
        }
    }
    return DKS_SUCCESS;
}
/*
  Sum data[0 .. size-1] on the MIC device with an OpenMP reduction and store
  the total in result[0] on the device. data and result are passed with
  length(0): no array data is transferred by this call.
  Returns DKS_SUCCESS.
*/
int MICChiSquare::mic_sum(double *data, double *result, int size) {
    double total = 0;
#pragma offload target(mic:m_micbase->m_device_id) \
    in(data:length(0) DKS_REUSE DKS_RETAIN) \
    in(result:length(0) DKS_REUSE DKS_RETAIN) \
    in(size) in(total)
    {
        #pragma omp parallel for reduction(+:total)
        for (int k = 0; k < size; ++k) {
            total += data[k];
        }
        result[0] = total;
    }
    return DKS_SUCCESS;
}

51
src/MIC/MICChiSquare.h Normal file
View File

@ -0,0 +1,51 @@
/*
Name: MICChiSquare
Info: calculate chi^2 using the Intel MIC coprocessor
Author: Uldis Locans
Date: 29.09.2014
*/
#ifndef H_MIC_CHI_SQUARE
#define H_MIC_CHI_SQUARE
#include <math.h>
#include <omp.h>
#include <offload.h>
#include "MICBase.h"
class MICChiSquare {

    /* non-owning pointer to the MIC base object (provides m_device_id) */
    MICBase *m_micbase;

public:

    /* constructor: remember the MIC base object to run on */
    MICChiSquare(MICBase *base) : m_micbase(base) { }

    /* destructor: nothing to release */
    ~MICChiSquare() { }

    /*
      Info: calculate the chi square terms (O - E)^2 / E element-wise
      Return: success or error code
    */
    int mic_chi2(double *O, double *E, double *result, int size);

    /*
      Info: calculate the N(t) function for jsize parameter sets
      Return: success or error code
    */
    int mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT = 1);

    /*
      Info: calculate the sum of an array, stored in result[0]
      Return: success or error code
    */
    int mic_sum(double *data, double *result, int size);

};
#endif

View File

@ -0,0 +1,876 @@
#include "MICCollimatorPhysics.h"
// Physical constants used by the collimator physics kernels.
// NOTE(review): units are not stated in this file; M_P and eM_E look like
// proton / electron rest masses in GeV - confirm.
#define M_P 0.93827231e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
#define eM_E 0.51099906e-03
#define Z_P 1
// Parenthesised so that the macro expands as a single value in any
// expression (e.g. x / K); the value itself is unchanged.
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)

// Indices into the material / collimator parameter array `par`.
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11
// Scalar product of two 3-vectors.
__declspec(target(mic))
double dot(mic_double3 d1, mic_double3 d2) {
    const double xx = d1.x * d2.x;
    const double yy = d1.y * d2.y;
    const double zz = d1.z * d2.z;
    return (xx + yy + zz);
}
// Squared length of the vector (dx, dy, dz).
__declspec(target(mic))
double dot(double dx, double dy, double dz) {
    const double xx = dx * dx;
    const double yy = dy * dy;
    const double zz = dz * dz;
    return (xx + yy + zz);
}
// True when z lies in the half-open interval
// (par[POSITION], par[POSITION] + par[ZSIZE]].
__declspec(target(mic))
bool checkHit(double &z, double *par) {
    if (!(z > par[POSITION]))
        return false;
    return z <= par[POSITION] + par[ZSIZE];
}
// Rotate the momentum components (px, pz) by the angle thetacou in their
// plane and, depending on coord, shift the position components (x, z):
//   coord == 1 : x advances by deltas*px/normP + xplane*cos(psi),
//                z is shifted by -xplane*sin(psi)
//   coord == 2 : as coord == 1, and z additionally advances by deltas*pz/normP
//   otherwise  : positions are left untouched
// psi is the angle of (px, pz) obtained from atan(px/pz) plus a quadrant
// dependent offset. px, pz, x and z are updated in place.
__declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane,
         double normP, double thetacou, double deltas, int coord)
{
    // quadrant dependent offset added to atan(px/pz)
    const double base = atan(px/pz);
    double Psixz;
    if (px >= 0 && pz >= 0)
        Psixz = base;
    else if (px < 0 && pz > 0)
        Psixz = base + 2*PI;
    else
        Psixz = base + PI;

    const double pxz = sqrt(px*px + pz*pz);
    const double sinPsi = sin(Psixz);
    const double cosPsi = cos(Psixz);

    switch (coord) {
    case 1:
        x = x + deltas * px / normP + xplane*cosPsi;
        z = z - xplane * sinPsi;
        break;
    case 2:
        x = x + deltas * px / normP + xplane * cosPsi;
        z = z - xplane * sinPsi + deltas * pz / normP;
        break;
    default:
        break;
    }

    const double sinTheta = sin(thetacou);
    const double cosTheta = cos(thetacou);
    px = pxz*cosPsi*sinTheta + pxz*sinPsi*cosTheta;
    pz = -pxz*sinPsi*sinTheta + pxz*cosPsi*cosTheta;
}
// Apply scattering to a single particle (position R, momentum P), first in
// the x-z plane and then in the y-z plane. Each plane draws two Gaussian
// numbers (redrawn until |thetacou| <= 3.5*theta0), rotates/shifts via Rot,
// and with probability 0.0047 applies a second, larger-angle rotation of the
// momentum only (coord = 0). R and P are updated in place.
// The order of the random draws from `stream` is part of the behaviour.
__declspec(target(mic))
void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) {
// energy, Lorentz factors and step length derived from the momentum
double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P;
double gamma = (Eng + M_P) / M_P;
double normP = sqrt(dot(P, P));
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double deltas = par[DT_M] * beta * C;
// width of the scattering angle distribution for this step
double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) *
Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1, z2;
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
double thetacou = z2 * theta0;
// reject angles beyond 3.5*theta0; both numbers are redrawn together
while(fabs(thetacou) > 3.5 * theta0) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
}
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1);
double P2;//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
// rare additional deflection: momentum direction only (coord = 0)
if(P2 < 0.0047) {
double P3, P4;
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
// P4 picks the sign of the deflection
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.x ,P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0);
}
// y-direction: See Physical Review, "Multiple Scattering"
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
while(fabs(thetacou) > 3.5 * theta0) {
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 );
vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 );
thetacou = z2 * theta0;
}
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
// coord = 2: Rot also advances z by deltas*pz/normP in this pass
Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1);
if(P2 < 0.0047) {
double P3, P4;
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1);
vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
if(P4 > 0.5)
thetaru = -thetaru;
Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0);
}
}
// Structure-of-arrays variant of coulombScat: processes one chunk of
// MIC_WIDTH particles starting at index ii. Only lanes with label[i] == 0
// are modified. Random numbers are generated MIC_WIDTH at a time; the
// rejection loops redraw per lane and therefore cannot be vectorised.
//
// Params:
//   rx, ry, rz, px, py, pz  particle positions / momenta (updated in place)
//   label                   per particle state; 0 marks an active particle
//   par                     parameter array (par[DT_M], par[X0_M] are read)
//   stream                  VSL random stream used by the calling thread
//   ii                      index of the first particle of the chunk
//   size                    kept for interface compatibility; every scratch
//                           buffer and random batch holds exactly MIC_WIDTH
//                           lanes, so all loops run over MIC_WIDTH lanes
//                           (the caller in this file passes MIC_WIDTH)
__declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
                 double *par, VSLStreamStatePtr &stream, int ii, int size)
{
    (void)size;

    double normP[MIC_WIDTH] __attribute__((aligned(64)));
    double deltas[MIC_WIDTH] __attribute__((aligned(64)));
    double theta0[MIC_WIDTH] __attribute__((aligned(64)));
    double P1[MIC_WIDTH] __attribute__((aligned(64)));
    double P2[MIC_WIDTH] __attribute__((aligned(64)));
    double P3[MIC_WIDTH] __attribute__((aligned(64)));
    double z1[MIC_WIDTH] __attribute__((aligned(64)));
    double z2[MIC_WIDTH] __attribute__((aligned(64)));
    double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

    // per lane kinematics; inactive lanes get defined (zero) values so that
    // the unconditional thetacou computation below never reads garbage
#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        normP[idx] = 0.0;
        deltas[idx] = 0.0;
        theta0[idx] = 0.0;
        if (label[i] == 0) {
            double dotp = dot(px[i], py[i], pz[i]);
            double Eng = sqrt(dotp + 1.0) * M_P - M_P;
            double gamma = (Eng + M_P) / M_P;
            double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
            normP[idx] = sqrt(dotp);
            deltas[idx] = par[DT_M] * beta * C;
            theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) *
                Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M]));
        }
    }

    // x-direction: See Physical Review, "Multiple Scattering"
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        thetacou[idx] = z2[idx] * theta0[idx];
    }

    //unknown number of iterations, cannot vectorize
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
                vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
                vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
                thetacou[idx] = z2[idx] * theta0[idx];
            }
        }
    }

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) +
                z2[idx] * deltas[idx] * theta0[idx] / 2.0;
            Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1);
        }
    }

    //generate arrays of random numbers:
    //P1 selects the rare large-angle event, P2 sets its size, P3 its sign
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            if(P1[idx] < 0.0047) {
                double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
                if(P3[idx] > 0.5)
                    thetaru = -thetaru;
                // coord = 0: only the momentum direction changes
                Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0);
            }
        }
    }

    // y-direction: See Physical Review, "Multiple Scattering"
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
    vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        thetacou[idx] = z2[idx] * theta0[idx];
    }

    //unknown number of iterations, cannot vectorize
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
                vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
                vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
                thetacou[idx] = z2[idx] * theta0[idx];
            }
        }
    }

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0)
                + z2[idx] * deltas[idx] * theta0[idx] / 2.0;
            Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2);
        }
    }

    //generate arrays of random numbers (same roles as in the x pass)
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
    vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

#pragma vector aligned
#pragma simd
    for (int i = ii; i < ii + MIC_WIDTH; i++) {
        int idx = i - ii;
        if (label[i] == 0) {
            if(P1[idx] < 0.0047) {
                double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
                if(P3[idx] > 0.5)
                    thetaru = -thetaru;
                Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0);
            }
        }
    }
}
// Apply one time step of energy loss to a single particle.
//   Eng    kinetic energy, updated in place
//   pdead  set to 1 when the particle ends below 1E-4 or dEdx came out
//          positive; left untouched otherwise
//   par    parameter array (DT_M, RHO_M, Z_M, A_M, A2_C..A5_C, I_M are read)
//   stream VSL random stream; one Gaussian number is drawn per branch taken
// Two energy ranges are handled: 1E-5 < Eng < 6E-4 uses the parametrisation
// with the A2..A5 coefficients, Eng >= 6E-4 uses the formula based on Tmax
// and par[I_M]. The second test is evaluated on the already updated Eng.
__declspec(target(mic))
void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) {
    double dEdx = 0.0;
    const double gamma = (Eng + M_P) / M_P;
    const double gamma2 = gamma * gamma;
    const double beta = sqrt(1.0 - 1.0 / gamma2);
    const double beta2 = beta * beta;
    const double deltas = par[DT_M] * beta * C;
    const double deltasrho = deltas * 100 * par[RHO_M];
    // Width of the energy straggling. The ratio is taken from the parameter
    // array (par[Z_M] / par[A_M]); Z_M on its own is only the array index.
    const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);

    if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
        const double Ts = (Eng * 1E6) / 1.0073;
        const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
        const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
        const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
        dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
        double tmprnd;
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
        const double delta_E = deltasrho * dEdx + tmprnd;
        Eng = Eng + delta_E / 1E3;
    }

    if (Eng >= 0.0006) {
        const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
            (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
        dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
            (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
                             Tmax / par[I_M] / par[I_M]) - beta2);
        double tmprnd;
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E );
        const double delta_E = deltasrho * dEdx + tmprnd;
        Eng = Eng + delta_E / 1E3;
    }

    if ((Eng<1E-4) || (dEdx>0))
        pdead = 1;
}
// Energy loss for one lane of a vectorised chunk. Same physics as the
// stream based overload, but the Gaussian numbers are supplied by the caller:
//   Eng    kinetic energy, updated in place
//   dEdx   written with the stopping power of the branch taken (left
//          untouched when neither branch applies)
//   par    parameter array (DT_M, RHO_M, Z_M, A_M, A2_C..A5_C, I_M are read)
//   randv  2*MIC_WIDTH unit Gaussian numbers; randv[ri] is used by the low
//          energy branch, randv[ri + MIC_WIDTH] by the high energy branch
//   ri     lane index inside the chunk
__declspec(target(mic))
void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) {
    const double gamma = (Eng + M_P) / M_P;
    const double gamma2 = gamma * gamma;
    const double beta = sqrt(1.0 - 1.0 / gamma2);
    const double beta2 = beta * beta;
    const double deltas = par[DT_M] * beta * C;
    const double deltasrho = deltas * 100 * par[RHO_M];
    // Width of the energy straggling. The ratio is taken from the parameter
    // array (par[Z_M] / par[A_M]); Z_M on its own is only the array index.
    const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5);

    if ( (Eng > 0.00001) && (Eng < 0.0006) ) {
        const double Ts = (Eng * 1E6) / 1.0073;
        const double epsilon_low = par[A2_C] * pow(Ts, 0.45);
        const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) );
        const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high);
        dEdx = -epsilon / (1E21 * (par[A_M] / AVO) );
        const double delta_E = deltasrho * dEdx + sigma_E * randv[ri];
        Eng = Eng + delta_E / 1E3;
    }

    if (Eng >= 0.0006) {
        const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 /
            (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P));
        dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) *
            (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 *
                             Tmax / par[I_M] / par[I_M]) - beta2);
        const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH];
        Eng = Eng + delta_E / 1E3;
    }
}
// Array-of-structures collimator step, executed on the MIC device.
//   mem_ptr       device-resident array of MIC_PART_SMALL (numparticles entries)
//   par_ptr       device-resident parameter array (indexed with the *_M macros)
//   numparticles  number of particles to process
// Three passes over all particles inside one OpenMP parallel region:
//   1. particles for which checkHit() is false are drifted by one time step
//      and get label -2
//   2. particles with label 0 lose energy; the momentum is rescaled to the
//      new energy, or the label becomes -1 when energyLoss flags the particle
//   3. particles that still have label 0 are scattered
// Both pointers are passed with length(0), so no particle data is transferred.
// Always returns DKS_SUCCESS.
int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) {
//cast device memory pointers to appropriate types
MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;
double *par = (double*) par_ptr;
#pragma offload target(mic:m_micbase->m_device_id) \
inout(data:length(0) DKS_RETAIN DKS_REUSE) \
in(par:length(0) DKS_RETAIN DKS_REUSE) \
in(numparticles)
{
#pragma omp parallel
{
// one random stream per OpenMP thread, taken from the MICBase object
VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];
//for loop trough particles if not checkhit set label to -2 and update R.x
#pragma omp for simd
for (int i = 0; i < numparticles; i++) {
if ( !checkHit(data[i].Rincol.z, par) ) {
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq;
data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq;
data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq;
data[i].label = -2;
}
}
//for loop trough particles if label == 0 eneregy loss and if pdead update label to -1
// NOTE(review): energyLoss draws from the per-thread stream inside a simd
// loop - confirm this is what is intended.
#pragma omp for simd
for (int i = 0; i < numparticles; i++) {
// -1 means "not flagged"; energyLoss sets it to 1 for a dead particle
int pdead = -1;
double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol));
double Eng = (sq - 1) * M_P;
if (data[i].label == 0) {
energyLoss(Eng, pdead, par, stream);
}
// rescale the momentum to the (possibly reduced) energy, keeping its direction
if (pdead == -1) {
double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
sq = sqrt(dot(data[i].Pincol, data[i].Pincol));
data[i].Pincol.x = data[i].Pincol.x * ptot / sq;
data[i].Pincol.y = data[i].Pincol.y * ptot / sq;
data[i].Pincol.z = data[i].Pincol.z * ptot / sq;
}
if (pdead == 1)
data[i].label = -1;
}
//for loop trough particles if label == 0 coulomb scat
#pragma omp for
for (int i = 0; i < numparticles; i++) {
if (data[i].label == 0) {
coulombScat(data[i].Rincol, data[i].Pincol, par, stream);
}
}
} //end omp parallel
} //end offload
return DKS_SUCCESS;
}
// Structure-of-arrays collimator step, executed on the MIC device selected
// in the MICBase object.
//   label_ptr, localID_ptr          per particle label / id arrays
//   rx_ptr .. pz_ptr                per particle positions and momenta
//   par_ptr                         parameter array (indexed with *_M macros)
//   numparticles                    number of real particles
// The particles are processed in chunks of MIC_WIDTH, therefore the particle
// count is rounded up to the next multiple of MIC_WIDTH; the device arrays
// must hold at least that many elements.
// Three chunked passes inside one OpenMP parallel region: drift + label -2
// for particles outside the collimator, energy loss (label -1 for particles
// that stop), and scattering for particles with label 0.
// All pointers are passed with length(0), so no particle data is transferred.
// Always returns DKS_SUCCESS.
// NOTE(review): the passes are separated by "omp for nowait" only; this
// relies on every thread getting the same chunks in each loop - confirm the
// schedule in use guarantees that.
int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                                               void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                               void *px_ptr, void *py_ptr, void *pz_ptr,
                                               void *par_ptr, int numparticles)
{
    int *label = (int*)label_ptr;
    unsigned *localID = (unsigned*)localID_ptr;
    double *rx = (double*)rx_ptr;
    double *ry = (double*)ry_ptr;
    double *rz = (double*)rz_ptr;
    double *px = (double*)px_ptr;
    double *py = (double*)py_ptr;
    double *pz = (double*)pz_ptr;
    double *par = (double*)par_ptr;

    // Distance to the next multiple of MIC_WIDTH (0 when already aligned);
    // the plain remainder would make the last chunk run past totalpart.
    int padding = (MIC_WIDTH - numparticles % MIC_WIDTH) % MIC_WIDTH;
    int totalpart = numparticles + padding;

    // Run on the device the buffers live on, like the other kernels of this class.
#pragma offload target (mic:m_micbase->m_device_id) \
    in(label:length(0) DKS_REUSE DKS_RETAIN) \
    in(localID:length(0) DKS_REUSE DKS_RETAIN) \
    in(rx:length(0) DKS_REUSE DKS_RETAIN) \
    in(ry:length(0) DKS_REUSE DKS_RETAIN) \
    in(rz:length(0) DKS_REUSE DKS_RETAIN) \
    in(px:length(0) DKS_REUSE DKS_RETAIN) \
    in(py:length(0) DKS_REUSE DKS_RETAIN) \
    in(pz:length(0) DKS_REUSE DKS_RETAIN) \
    in(par:length(0) DKS_RETAIN DKS_REUSE) \
    in(totalpart)
    {
        #pragma omp parallel
        {
            //every thread gets its own rnd stream state
            VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()];

            #pragma omp for nowait
            for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
                //vectorize main loop
                #pragma vector aligned
                #pragma simd
                for (int i = ii; i < ii + MIC_WIDTH; i++) {
                    if ( !checkHit(rz[i], par) ) {
                        double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
                        rx[i] = rx[i] + par[DT_M] * C * px[i] / sq;
                        ry[i] = ry[i] + par[DT_M] * C * py[i] / sq;
                        rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq;
                        label[i] = -2;
                    }
                }
            }

            //array of size 2*WIDTH for storing random values for the energyloss function
            double randv[2*MIC_WIDTH] __attribute__((aligned(64)));

            //loop through particles: if label == 0 apply energy loss, stopped particles get label -1
            #pragma omp for nowait
            for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
                //create array of rand values (2 per particle of the chunk)
                vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0);
                #pragma vector aligned
                #pragma simd
                for (int i = ii; i < ii + MIC_WIDTH; i++) {
                    double sq = sqrt(1.0 + dot(px[i], py[i], pz[i]));
                    double Eng = (sq - 1) * M_P;
                    double dEdx = 0;
                    if (label[i] == 0) {
                        energyLoss(Eng, dEdx, par, randv, i - ii);
                    }
                    // particle survives: rescale the momentum to the new energy
                    if (Eng > 1e-4 && dEdx < 0) {
                        double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P;
                        sq = sqrt(dot(px[i], py[i], pz[i]));
                        px[i] = px[i] * ptot / sq;
                        py[i] = py[i] * ptot / sq;
                        pz[i] = pz[i] * ptot / sq;
                    }
                    if (Eng < 1e-4 || dEdx > 0)
                        label[i] = -1;
                } //end inner energy loss loop
            } //end outer energy loss loop

            //vectorize coulomb scattering as much as possible
            #pragma omp for nowait
            for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) {
                coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH);
            } //end coulomb scattering
        } //end omp parallel
    } //end offload
    return DKS_SUCCESS;
}
// Count the particles with a negative label and move them to the end of the
// device-resident MIC_PART_SMALL array.
//   mem_ptr       device-resident particle array (numparticles entries)
//   numparticles  number of particles
//   numaddback    receives the number of particles with label < 0
// Always returns DKS_SUCCESS.
int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles,
                                                int &numaddback)
{
    //cast device memory pointers to appropriate types
    MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr;

    // Counter that is filled on the device and copied back via out(...).
    // It must not be redeclared inside the offload block, otherwise the
    // value copied back would be the untouched outer variable.
    int privateback = 0;

#pragma offload target(mic:m_micbase->m_device_id) \
    in(data:length(0) DKS_RETAIN DKS_REUSE) \
    in(numparticles) \
    out(privateback)
    {
        //count dead and addback particles
        privateback = 0;
        #pragma omp parallel for reduction(+:privateback)
        for (int i = 0; i < numparticles; i++) {
            if (data[i].label < 0)
                privateback++;
        }

        //move particles with label < 0 to the end of the array (serial)
        if (privateback > 0) {
            int moved = 0;
            // walk from the back down to and including index 0; stop as soon
            // as every counted particle has been placed
            for (int i = numparticles - 1; i >= 0 && moved < privateback; i--) {
                if (data[i].label < 0) {
                    int idx = numparticles - 1 - moved;
                    if (i != idx) {
                        MIC_PART_SMALL tmp = data[i];
                        data[i] = data[idx];
                        data[idx] = tmp;
                    }
                    moved++;
                }
            }
        }
    }

    // hand the result to the caller on the host side
    numaddback = privateback;
    return DKS_SUCCESS;
}
// Exchange the values of two doubles (device callable).
__declspec(target(mic))
void micmove(double &a, double &b) {
    const double held = b;
    b = a;
    a = held;
}
// Exchange the values of two ints (device callable).
__declspec(target(mic))
void micmove(int &a, int &b) {
    const int held = b;
    b = a;
    a = held;
}
// Exchange the values of two unsigned ints (device callable).
__declspec(target(mic))
void micmove(unsigned &a, unsigned &b) {
    const unsigned held = b;
    b = a;
    a = held;
}
// Structure-of-arrays variant of CollimatorPhysicsSort: count the particles
// with a negative label and move them (all eight per particle arrays in
// step) to the end of the arrays.
//   label_ptr .. pz_ptr  device-resident per particle arrays
//   par_ptr              parameter array (not read by the sort itself)
//   numparticles         number of particles
//   numaddback           receives the number of particles with label < 0
// Always returns DKS_SUCCESS.
int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                                                   void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                                   void *px_ptr, void *py_ptr, void *pz_ptr,
                                                   void *par_ptr, int numparticles,
                                                   int &numaddback)
{
    int *label = (int*)label_ptr;
    unsigned *localID = (unsigned*)localID_ptr;
    double *rx = (double*)rx_ptr;
    double *ry = (double*)ry_ptr;
    double *rz = (double*)rz_ptr;
    double *px = (double*)px_ptr;
    double *py = (double*)py_ptr;
    double *pz = (double*)pz_ptr;
    double *par = (double*)par_ptr;

    // Counter that is filled on the device and copied back via out(...).
    // It must not be redeclared inside the offload block, otherwise the
    // value copied back would be the untouched outer variable.
    int privateback = 0;

    // Run on the device the buffers live on, like the other kernels of this class.
#pragma offload target (mic:m_micbase->m_device_id) \
    in(label:length(0) DKS_REUSE DKS_RETAIN) \
    in(localID:length(0) DKS_REUSE DKS_RETAIN) \
    in(rx:length(0) DKS_REUSE DKS_RETAIN) \
    in(ry:length(0) DKS_REUSE DKS_RETAIN) \
    in(rz:length(0) DKS_REUSE DKS_RETAIN) \
    in(px:length(0) DKS_REUSE DKS_RETAIN) \
    in(py:length(0) DKS_REUSE DKS_RETAIN) \
    in(pz:length(0) DKS_REUSE DKS_RETAIN) \
    in(par:length(0) DKS_RETAIN DKS_REUSE) \
    in(numparticles) \
    out(privateback)
    {
        //count dead and addback particles
        privateback = 0;
        #pragma omp parallel for reduction(+:privateback)
        for (int i = 0; i < numparticles; i++) {
            if (label[i] < 0)
                privateback++;
        }

        //move particles with label < 0 to the end of the array (serial)
        if (privateback > 0) {
            int moved = 0;
            // stop as soon as every counted particle has been placed
            for (int i = numparticles - 1; i >= 0 && moved < privateback; i--) {
                if (label[i] < 0) {
                    int idx = numparticles - 1 - moved;
                    if (i != idx) {
                        micmove(rx[i], rx[idx]);
                        micmove(ry[i], ry[idx]);
                        micmove(rz[i], rz[idx]);
                        micmove(px[i], px[idx]);
                        micmove(py[i], py[idx]);
                        micmove(pz[i], pz[idx]);
                        micmove(label[i], label[idx]);
                        micmove(localID[i], localID[idx]);
                    }
                    moved++;
                }
            }
        }
    }

    // hand the result to the caller on the host side
    numaddback = privateback;
    return DKS_SUCCESS;
}
// Multiply every component of `a` by `c`, in place.
__declspec(target(mic))
inline void unitlessOff(mic_double3 &a, const double c) {
    a.x = a.x * c;
    a.y = a.y * c;
    a.z = a.z * c;
}
// Divide every component of `a` by `c`, in place.
__declspec(target(mic))
inline void unitlessOn(mic_double3 &a, const double c) {
    a.x = a.x / c;
    a.y = a.y / c;
    a.z = a.z / c;
}
// Return `vec` transformed by the rotation built from the three angles
// stored in ori.x, ori.y and ori.z. Neither argument is modified.
__declspec(target(mic))
mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) {
    // sines / cosines of the three orientation angles
    const double sa = sin(ori.x);
    const double ca = cos(ori.x);
    const double sb = sin(ori.y);
    const double cb = cos(ori.y);
    const double sc = sin(ori.z);
    const double cc = cos(ori.z);

    mic_double3 rotated;
    rotated.x = (ca * cc) * vec.x + (ca * sc) * vec.y - sa * vec.z;
    rotated.y = (-cb * sc - sa * sb * cc) * vec.x +
        (cb * cc - sa * sb * sc) * vec.y - ca * sb * vec.z;
    rotated.z = (-sb * sc + sa * cb * cc) * vec.x +
        (sb * cc + sa * cb * sc) * vec.y + ca * cb * vec.z;
    return rotated;
}
// Advance the position R by half a step along P: each component is divided
// by dtc, incremented by 0.5 * P / dotp and multiplied by dtc again.
// R is updated in place; P is only read.
__declspec(target(mic))
inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) {
    R.x = (R.x / dtc + 0.5 * P.x / dotp) * dtc;
    R.y = (R.y / dtc + 0.5 * P.y / dotp) * dtc;
    R.z = (R.z / dtc + 0.5 * P.z / dotp) * dtc;
}
// Push npart particles with one common step dtc: r[i] is advanced with
// updateR using p[i] and sqrt(1 + p[i].p[i]). Only r is written.
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) {
#pragma omp parallel for simd
    for (int k = 0; k < npart; ++k) {
        mic_double3 pos = r[k];
        mic_double3 mom = p[k];
        const double norm = sqrt(1.0 + dot(mom, mom));
        updateR(pos, mom, norm, dtc);
        r[k] = pos;
    }
}
// Push npart particles with a per particle step gdt[i] * c: r[i] is advanced
// with updateR using p[i] and sqrt(1 + p[i].p[i]). Only r is written.
__declspec(target(mic))
inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) {
#pragma omp parallel for simd
    for (int k = 0; k < npart; ++k) {
        mic_double3 pos = r[k];
        mic_double3 mom = p[k];
        const double step = gdt[k] * c;
        const double norm = sqrt(1.0 + dot(mom, mom));
        updateR(pos, mom, norm, step);
        r[k] = pos;
    }
}
int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
double dt, double c, bool usedt, int streamId)
{
mic_double3 *r = (mic_double3*)r_ptr;
mic_double3 *p = (mic_double3*)p_ptr;
double *gdt = (double*)dt_ptr;
double dtc = dt * c;
if (!usedt) {
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc)
{
push(r, p, dtc, npart);
}
} else {
#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c)
{
push(r, p, gdt, c, npart);
}
}
return DKS_SUCCESS;
}
// Push npart particles with one common step dtc after rotating the momentum
// into the orientation of the particle's last section. gLastSect[i] selects
// the entry of gOrient; values outside [0, nsec) use a zero orientation.
// Only x is written.
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
                          double dtc, int npart, int nsec)
{
#pragma omp parallel for simd
    for (int k = 0; k < npart; ++k) {
        mic_double3 ori;
        if (gLastSect[k] <= -1 || gLastSect[k] >= nsec) {
            // no valid section: identity orientation
            ori.x = 0.0;
            ori.y = 0.0;
            ori.z = 0.0;
        } else {
            ori = gOrient[gLastSect[k]];
        }
        mic_double3 mom = deviceTransformTo(p[k], ori);
        mic_double3 pos = x[k];
        const double norm = sqrt(1.0 + dot(mom, mom));
        updateR(pos, mom, norm, dtc);
        x[k] = pos;
    }
}
// Push npart particles with a per particle step gdt[i] * c after rotating
// the momentum into the orientation of the particle's last section.
// gLastSect[i] selects the entry of gOrient; values outside [0, nsec) use a
// zero orientation. Only x is written.
__declspec(target(mic))
inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect,
                          double *gdt, double c, int npart, int nsec)
{
#pragma omp parallel for simd
    for (int k = 0; k < npart; ++k) {
        mic_double3 ori;
        if (gLastSect[k] <= -1 || gLastSect[k] >= nsec) {
            // no valid section: identity orientation
            ori.x = 0.0;
            ori.y = 0.0;
            ori.z = 0.0;
        } else {
            ori = gOrient[gLastSect[k]];
        }
        mic_double3 mom = deviceTransformTo(p[k], ori);
        mic_double3 pos = x[k];
        const double norm = sqrt(1.0 + dot(mom, mom));
        const double step = gdt[k] * c;
        updateR(pos, mom, norm, step);
        x[k] = pos;
    }
}
int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
void *lastSec_ptr,
void *orient_ptr, int npart,
int nsec, void *dt_ptr, double dt,
double c, bool usedt, int streamId)
{
mic_double3 *x = (mic_double3*)x_ptr;
mic_double3 *p = (mic_double3*)p_ptr;
mic_double3 *gOrient = (mic_double3*)orient_ptr;
double *gdt = (double*)dt_ptr;
long *gLastSect = (long*)lastSec_ptr;
double dtc = dt * c;
if (!usedt) {
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE) \
in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc)
{
pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec);
}
} else {
#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \
in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \
in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \
in(npart, nsec, c)
{
pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec);
}
}
return DKS_SUCCESS;
}

View File

@ -0,0 +1,68 @@
#ifndef H_MIC_COLLIMATORPHYSICS
#define H_MIC_COLLIMATORPHYSICS
#include <iostream>
#include <cstdio>
#include <cmath>
#include <omp.h>
#include <vector>
#include "../Algorithms/CollimatorPhysics.h"
#include "MICBase.h"
// Plain 3-component vector of doubles, declared for use in code compiled
// for the MIC target.
__declspec(target(mic))
typedef struct {
double x;
double y;
double z;
} mic_double3;
// Per particle record used by the array-of-structures collimator kernels.
__declspec(target(mic))
typedef struct {
// particle state; the kernels treat 0 as active and write -1 / -2
int label;
// particle identifier; not modified by the kernels in this file
unsigned localID;
// position and momentum of the particle
mic_double3 Rincol;
mic_double3 Pincol;
} MIC_PART_SMALL;
// Collimator physics and particle push kernels for the Intel MIC.
// Every method takes device-resident buffers as void pointers and returns a
// DKS status code; the implementations live in MICCollimatorPhysics.cpp.
class MICCollimatorPhysics : DKSAlogorithms{

private:

    // non-owning pointer to the MIC base object (device id, random streams)
    MICBase *m_micbase;

public:

    // remember the MIC base object to run on
    MICCollimatorPhysics(MICBase *base) : m_micbase(base) { }

    // nothing to release
    ~MICCollimatorPhysics() { }

    // collimator step on an array of MIC_PART_SMALL
    int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);

    // collimator step on separate label / id / position / momentum arrays
    int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                             void *rx_ptr, void *ry_ptr, void *rz_ptr,
                             void *px_ptr, void *py_ptr, void *pz_ptr,
                             void *par_ptr, int numparticles);

    // move particles with label < 0 to the end; numaddback receives their count
    int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);

    // same as CollimatorPhysicsSort for the separate arrays layout
    int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                                 void *rx_ptr, void *ry_ptr, void *rz_ptr,
                                 void *px_ptr, void *py_ptr, void *pz_ptr,
                                 void *par_ptr, int numparticles, int &numaddback);

    // push particles with a common (dt) or per particle (dt_ptr) time step
    int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
                             double dt, double c, bool usedt = false, int streamId = -1);

    // push particles after rotating the momentum into the section orientation
    int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
                                      void *orient_ptr, int npart, int nsec,
                                      void *dt_ptr, double dt, double c,
                                      bool usedt = false, int streamId = -1);

};
#endif

210
src/MIC/MICFFT.cpp Normal file
View File

@ -0,0 +1,210 @@
#include "MICFFT.h"
#include<stdio.h>
#include<complex>
#include <time.h>
#include <sys/time.h>
// Constructor: store the (non-owned) MICBase pointer; no FFT descriptor is
// created here.
MICFFT::MICFFT(MICBase *base) {
m_micbase = base;
}
// Destructor: release the DFTI descriptors FFTHandle_m and handle on MIC
// device 0.
// NOTE(review): the descriptors are freed unconditionally, also when no
// setupFFT* call created them - confirm DftiFreeDescriptor tolerates that.
// NOTE(review): the setup functions go through getHandle(), getHandleRC()
// and getHandleCR(); verify that all descriptors they create are covered by
// the two frees below.
// NOTE(review): the device is hard-coded to mic:0 rather than taken from
// m_micbase - confirm.
MICFFT::~MICFFT() {
#pragma offload target(mic:0)
{
DftiFreeDescriptor(&FFTHandle_m);
DftiFreeDescriptor(&handle);
}
}
//setup fft
int MICFFT::setupFFT(int ndim, int N[3]) {
//set up FFT engine
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
{
MKL_LONG sizes[3], strides[4];
sizes[0] = N[0]; sizes[1] = N[1]; sizes[2] = N[2];
//strides[0] = 0; strides[1] = sizes[1]; strides[2] = 1; strides[3] = sizes[0]*sizes[1];
strides[0] = 0; strides[1] = sizes[0]*sizes[1]; strides[2] = sizes[0]; strides[3] = 1;
MKL_LONG dims = 3;
DftiCreateDescriptor(&(this->getHandle()), DFTI_DOUBLE, DFTI_COMPLEX, dims, sizes);
DftiSetValue(this->getHandle(), DFTI_INPUT_STRIDES, strides);
DftiSetValue(this->getHandle(), DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
DftiCommitDescriptor(this->getHandle());
}
return DKS_SUCCESS;
}
//BENI:
//setup fft
int MICFFT::setupFFTRC(int ndim, int N[3], double scale) {
//set up FFT engine for REAL->COMPLEX
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
{
MKL_LONG sizes[3], real_strides[4], complex_strides[4];
sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0];
//real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1;
real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1;
//real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1];
//complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1;
complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1;
//complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1;
MKL_LONG dims = 3;
DftiCreateDescriptor(&(this->getHandleRC()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
DftiSetValue(this->getHandleRC(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
DftiSetValue(this->getHandleRC(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
DftiSetValue(this->getHandleRC(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
DftiSetValue(this->getHandleRC(), DFTI_INPUT_STRIDES, real_strides);
DftiSetValue(this->getHandleRC(), DFTI_OUTPUT_STRIDES, complex_strides);
DftiSetValue(this->getHandleRC(), DFTI_FORWARD_SCALE, scale);
DftiCommitDescriptor(this->getHandleRC());
}
return DKS_SUCCESS;
}
//BENI:
//setup fft
int MICFFT::setupFFTCR(int ndim, int N[3], double scale) {
//set up FFT engine for COMPLEX->REAL
#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE)
{
MKL_LONG sizes[3], real_strides[4], complex_strides[4];
sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0];
//real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1;
real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1;
//real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1];
//complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1;
complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1;
//complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1;
MKL_LONG dims = 3;
DftiCreateDescriptor(&(this->getHandleCR()), DFTI_DOUBLE, DFTI_REAL, dims, sizes);
DftiSetValue(this->getHandleCR(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
DftiSetValue(this->getHandleCR(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT);
DftiSetValue(this->getHandleCR(), DFTI_PLACEMENT, DFTI_NOT_INPLACE);
DftiSetValue(this->getHandleCR(), DFTI_INPUT_STRIDES, complex_strides);
DftiSetValue(this->getHandleCR(), DFTI_OUTPUT_STRIDES, real_strides);
DftiSetValue(this->getHandleCR(), DFTI_BACKWARD_SCALE, scale);
DftiCommitDescriptor(this->getHandleCR());
}
return DKS_SUCCESS;
}
//execute COMPLEX->COMPLEX FFT
/*
  Info: run the in-place COMPLEX->COMPLEX transform described by the handle
  returned from getHandle() on the buffer mem_ptr. `forward` selects the
  direction. ndim, N and streamId are not used here.
  Return: DKS_SUCCESS (the DFTI status code is not checked)
*/
int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool forward) {

  _Complex double *data = (_Complex double*) mem_ptr;

#pragma offload target(mic:0) in(data:length(0) DKS_RETAIN DKS_REUSE) in(forward)
  {
    if (!forward) {
      DftiComputeBackward(this->getHandle(), data);
    } else {
      DftiComputeForward(this->getHandle(), data);
    }
  }

  return DKS_SUCCESS;
}
//execute iFFT
int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) {
return mic_executeFFT(mem_ptr, ndim, N, -1, false);
}
//execute REAL->COMPLEX FFT
/*
  Info: run the REAL->COMPLEX forward transform configured by setupFFTRC().
  in_ptr is used as the real input buffer and out_ptr as the complex output
  buffer. ndim, N and streamId are not used here.
  NOTE(review): both pointers are passed with length(0) and
  DKS_RETAIN DKS_REUSE, which presumably means the buffers already live on
  the device and nothing is copied - confirm against the macro definitions.
  Return: DKS_SUCCESS (the DFTI status code is not checked)
*/
int MICFFT::executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {

  double *real_ptr = (double*) in_ptr;
  _Complex double *compl_ptr = (_Complex double *) out_ptr;

#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE)
  {
    DftiComputeForward(this->getHandleRC(), real_ptr, compl_ptr);
  }

  return DKS_SUCCESS;
}
//execute COMPLEX->REAL FFT
/*
  Info: run the COMPLEX->REAL backward transform configured by setupFFTCR().
  in_ptr is used as the complex input buffer and out_ptr as the real output
  buffer. ndim, N and streamId are not used here.
  NOTE(review): both pointers are passed with length(0) and
  DKS_RETAIN DKS_REUSE, which presumably means the buffers already live on
  the device and nothing is copied - confirm against the macro definitions.
  Return: DKS_SUCCESS (the DFTI status code is not checked)
*/
int MICFFT::executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) {

  double *real_ptr = (double*) out_ptr;
  _Complex double *compl_ptr = (_Complex double *) in_ptr;

#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE)
  {
    DftiComputeBackward(this->getHandleCR(), compl_ptr, real_ptr);
  }

  return DKS_SUCCESS;
}
//normalize IFFT
/*
  Info: divide the real and the imaginary part of the first
  N[0]*N[1]*N[2] complex values in mem_ptr by that element count.
  ndim and streamId are not used here.
  Return: DKS_SUCCESS
*/
int MICFFT::normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId) {

  _Complex double *values = (_Complex double*) mem_ptr;
  int count = N[0] * N[1] * N[2];

#pragma offload target(mic:0) in(values:length(0) DKS_RETAIN DKS_REUSE) in(count)
  {
#pragma omp parallel for
    for (int idx = 0; idx < count; idx++) {
      __real__ values[idx] /= count;
      __imag__ values[idx] /= count;
    }
  }

  return DKS_SUCCESS;
}

79
src/MIC/MICFFT.h Normal file
View File

@ -0,0 +1,79 @@
#ifndef H_MIC_FFT
#define H_MIC_FFT
#include <iostream>
#include <complex>
#include <offload.h>
#include <mkl_dfti.h>
#include "../Algorithm/DKSFFT.h"
#include "MICBase.h"
/*
  FFT interface for the Intel MIC built on MKL DFTI descriptors.
  Derives from DKSFFT and keeps one DFTI descriptor handle per kind of
  transform (COMPLEX->COMPLEX, REAL->COMPLEX, COMPLEX->REAL). The setup*
  methods configure a handle, the execute* methods run the transform.
*/
class MICFFT : public DKSFFT {
private:
//MIC base object handed to the constructor
MICBase *m_micbase;
/// Internal FFT object for performing serial FFTs.
//the handles are declared inside an offload_attribute push/pop region with target(mic)
#pragma offload_attribute(push,target(mic))
DFTI_DESCRIPTOR_HANDLE FFTHandle_m; //declspec only works for global variables
DFTI_DESCRIPTOR_HANDLE handle;
DFTI_DESCRIPTOR_HANDLE rc_handle; //handle for REAL->COMPLEX
DFTI_DESCRIPTOR_HANDLE cr_handle; //handle for COMPLEX->REAL
#pragma offload_attribute(pop)
/* reference to FFTHandle_m, the handle used for COMPLEX->COMPLEX transforms */
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle(void) {
return FFTHandle_m;
}
/* reference to the additional handle `handle` */
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle1(void) {
return handle;
}
/* reference to the REAL->COMPLEX handle */
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleRC(void) {
return rc_handle;
}
/* reference to the COMPLEX->REAL handle */
__attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleCR(void) {
return cr_handle;
}
public:
/* constructor */
MICFFT(MICBase *base);
/* destructor */
~MICFFT();
/*
Info: setup mkl fft
Return: success or error code
*/
int setupFFT(int ndim, int N[3]);
//BENI:
/* setup mkl fft for REAL->COMPLEX transforms, scale is the forward scale factor */
int setupFFTRC(int ndim, int N[3], double scale = 1.0);
//BENI:
/* setup mkl fft for COMPLEX->REAL transforms, scale is the backward scale factor */
int setupFFTCR(int ndim, int N[3], double scale = 1.0);
/* execute FFT on MIC */
int executeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true);
/* execute IFFT on MIC */
int executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
/* execute REAL->COMPLEX FFT on MIC */
int executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
/* execute COMPLEX->REAL FFT on MIC */
int executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1);
/* normalize IFFT on MIC */
int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1);
};
#endif

View File

@ -0,0 +1,307 @@
#include "MICGreensFunction.hpp"
#include<stdio.h>
#include<complex>
#include <cstring>
/* constructor */
/* keep the pointer to the MIC base object handed in by the caller */
MICGreensFunction::MICGreensFunction(MICBase *base)
  : m_micbase(base)
{
}
/* destructor */
/* the body is empty: nothing is released here, m_micbase is left untouched */
MICGreensFunction::~MICGreensFunction() {
}
/* compute greens integral analytically */
// Version with extended domain
/*
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,double hr_m1, double hr_m2) {
double *tmp_ptr = (double*) tmp_ptr_;
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
{
std::memset(tmp_ptr,0,(I+1)*(J+1)*(K+1));
double cellVolume = hr_m0 * hr_m1 * hr_m2;
#pragma omp parallel for collapse(3) schedule(dynamic)
for (int k = 0; k < K; k++) {
for (int j = 0; j < J; j++) {
for (int i = 0; i < I; i++) {
double vv0 = i * hr_m0 - hr_m0 / 2;
double vv1 = j * hr_m1 - hr_m1 / 2;
double vv2 = k * hr_m2 - hr_m2 / 2;
double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
double tmpgrn = 0;
tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
tmpgrn = tmpgrn / 2;
tmpgrn += vv1 * vv2 * log(vv0 + r);
tmpgrn += vv0 * vv2 * log(vv1 + r);
tmpgrn += vv0 * vv1 * log(vv2 + r);
tmpgrn = tmpgrn / cellVolume;
tmp_ptr[k*(J+1)*(I+1) + j*(I+1) + i] = tmpgrn;
}
}
}
}
return 0;
}
*/
/*
  Info: fill tmp_ptr_ (I*J*K doubles, index k*J*I + j*I + i) with the
  analytically evaluated Greens integral. For every grid index the
  coordinates are vv = index * hr - hr / 2 and the result is divided by the
  cell volume hr_m0 * hr_m1 * hr_m2.
  The buffer is zeroed first; the size handed to memset is in bytes, so the
  element count is multiplied by sizeof(double).
  Return: 0
*/
int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,
                                          double hr_m1, double hr_m2)
{

  double *tmp_ptr = (double*) tmp_ptr_;

#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2)
  {
    //zero the whole buffer (byte count, not element count)
    if (I > 0 && J > 0 && K > 0)
      std::memset(tmp_ptr, 0, sizeof(double) * I * J * K);

    double cellVolume = hr_m0 * hr_m1 * hr_m2;

#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int k = 0; k < K; k++) {
      for (int j = 0; j < J; j++) {
        for (int i = 0; i < I; i++) {

          double vv0 = i * hr_m0 - hr_m0 / 2;
          double vv1 = j * hr_m1 - hr_m1 / 2;
          double vv2 = k * hr_m2 - hr_m2 / 2;

          double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);

          double tmpgrn = 0;
          tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
          tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
          tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );

          tmpgrn = tmpgrn / 2;

          tmpgrn += vv1 * vv2 * log(vv0 + r);
          tmpgrn += vv0 * vv2 * log(vv1 + r);
          tmpgrn += vv0 * vv1 * log(vv2 + r);

          tmpgrn = tmpgrn / cellVolume;

          //index computed in size_t so large grids do not overflow int
          tmp_ptr[(size_t)k * J * I + (size_t)j * I + i] = tmpgrn;
        }
      }
    }
  }

  return 0;
}
/* perform the actual integration */
// version with extended domain
/*
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
double *tmp_ptr = (double*) tmp_ptr_;
double *mem_ptr = (double*) mem_ptr_;
// the actual integration
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{
int Ii = I;
int Jj = J;
int Kk = K;
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
std::memset(mem_ptr,0,II*JJ*KK);
I=I+1; J=J+1; K=K+1;
#pragma omp parallel for collapse(3)
for (int i=0; i<Ii; i++) {
for (int j=0; j<Jj; j++) {
for (int k=0; k<Kk; k++) {
//mem_ptr[k*JJ*II + j*II + i] = 0.0;
mem_ptr[k*JJ*II + j*II + i] = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
}
}
}
}
return 0;
}
*/
/*
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {
double *tmp_ptr = (double*) tmp_ptr_;
double *mem_ptr = (double*) mem_ptr_;
// the actual integration
#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
{
int Ii = I;
int Jj = J;
int Kk = K;
int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1);
std::memset(mem_ptr,0,II*JJ*KK);
//I=I+1; J=J+1; K=K+1;
#pragma omp parallel for collapse(3)
for (int i=0; i<Ii; i++) {
for (int j=0; j<Jj; j++) {
for (int k=0; k<Kk; k++) {
//mem_ptr[k*JJ*II + j*II + i] = 0.0;
mem_ptr[k*JJ*II + j*II + i] = tmp_ptr[(k+1)*J*I + (j+1)*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + j*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[k*J*I + (j+1)*I + i];
mem_ptr[k*JJ*II + j*II + i] += tmp_ptr[(k+1)*J*I + j*I + i];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + (j+1)*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + j*I + (i+1)];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[(k+1)*J*I + (j+1)*I + i];
mem_ptr[k*JJ*II + j*II + i] -= tmp_ptr[k*J*I + j*I + i];
}
}
}
}
return 0;
}
*/
//CUDA similar version:
/*
  Info: combine the eight neighbouring values of tmp_ptr_ (I*J*K doubles,
  index i + j*I + k*I*J) into mem_ptr_ (index i + j*II + k*II*JJ with
  II = 2*(I-1), JJ = 2*(J-1)). Neighbours outside the I/J/K range
  contribute 0.
  mem_ptr_ is zeroed first over II*JJ*KK doubles; the size handed to memset
  is in bytes. All temporaries are declared inside the loop body so every
  OpenMP thread works on its own copies.
  Return: 0
*/
int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) {

  double *tmpgreen = (double*) tmp_ptr_;
  double *mem_ptr = (double*) mem_ptr_;

  // the actual integration
#pragma offload target(mic:0) in(tmpgreen:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*(I-1); int JJ = 2*(J-1); int KK = 2*(K-1);

    //zero the doubled grid (byte count, not element count)
    if (II > 0 && JJ > 0 && KK > 0)
      std::memset(mem_ptr, 0, sizeof(double) * II * JJ * KK);

    int NI_tmp = I;
    int NJ_tmp = J;
    int NK_tmp = K;

#pragma omp parallel for collapse(3)
    for (int i = 0; i < I; i++) {
      for (int j = 0; j < J; j++) {
        for (int k = 0; k < K; k++) {

          //thread private temporaries, 0 when the neighbour is out of range
          double tmp0 = 0, tmp1 = 0, tmp2 = 0, tmp3 = 0;
          double tmp4 = 0, tmp5 = 0, tmp6 = 0, tmp7 = 0;

          if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
            tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];

          if (i+1 < NI_tmp)
            tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];

          if (j+1 < NJ_tmp)
            tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];

          if (k+1 < NK_tmp)
            tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];

          if (i+1 < NI_tmp && j+1 < NJ_tmp)
            tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];

          if (i+1 < NI_tmp && k+1 < NK_tmp)
            tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];

          if (j+1 < NJ_tmp && k+1 < NK_tmp)
            tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];

          tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];

          double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;

          mem_ptr[i + j*II + k*II*JJ] = tmp_rho;
        }
      }
    }
  }

  return 0;
}
/*
  Info: mirror the values stored in mem_ptr_ (row length II = 2*I, plane
  size II*JJ with JJ = 2*J) along the three index directions, one direction
  per loop nest:
    1. first index  I+1 .. 2I-1 is copied from index 2I - ie
    2. second index J+1 .. 2J-1 is copied from index 2J - je
    3. third index  K+1 .. 2K-1 is copied from index 2K - ke
  Before that element 0 is overwritten with element II*JJ.
  The index variables are declared inside the loop bodies so that every
  OpenMP thread uses its own copies.
  Return: 0
*/
int MICGreensFunction::mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K) {

  double *mem_ptr = (double*) mem_ptr_;

#pragma offload target(mic:0) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K)
  {
    int II = 2*I; int JJ = 2*J;

    mem_ptr[0] = mem_ptr[II*JJ];

#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ie = I+1; ie < 2*I; ++ie) {
      for (int j = 0; j <= J; ++j) {
        for (int k = 0; k <= K; ++k) {
          int id      = k * II * JJ + j * II + ie;
          int id_mirr = k * II * JJ + j * II + (2*I-ie);
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }

#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ai = 0; ai < 2*I; ++ai) {
      for (int je = J+1; je < 2*J; ++je) {
        for (int k = 0; k <= K; ++k) {
          int id      = k * II * JJ + je * II + ai;
          int id_mirr = k * II * JJ + (2*J-je) * II + ai;
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }

#pragma omp parallel for collapse(3) schedule(dynamic)
    for (int ai = 0; ai < 2*I; ++ai) {
      for (int aj = 0; aj < 2*J; ++aj) {
        for (int ke = K+1; ke < 2*K; ++ke) {
          int id      = ke * II * JJ + aj * II + ai;
          int id_mirr = (2*K-ke) * II * JJ + aj * II + ai;
          mem_ptr[id] = mem_ptr[id_mirr];
        }
      }
    }
  }

  return 0;
}
/*multiply complex fields*/
/*
  Info: element-wise complex multiplication of the first `size` values,
  the product is stored back into the first field (mem_ptr1_).
  Return: 0
*/
int MICGreensFunction::mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size) {

  _Complex double *lhs = (_Complex double *) mem_ptr1_;
  _Complex double *rhs = (_Complex double *) mem_ptr2_;

#pragma offload target(mic:0) in(lhs:length(0) DKS_RETAIN DKS_REUSE) in (rhs:length(0) DKS_RETAIN DKS_REUSE) in(size)
  {
#pragma omp parallel for
    for (int idx = 0; idx < size; ++idx) {
      lhs[idx] = lhs[idx] * rhs[idx];
    }
  }

  return 0;
}

View File

@ -0,0 +1,44 @@
//AUTHOR: Benjamin Ulmer
#ifndef H_MIC_GREENS
#define H_MIC_GREENS
#include <iostream>
#include <complex>
#include <offload.h>
#include <mkl_dfti.h>
#include "MICBase.h"
#define DKS_SUCCESS 0
#define DKS_ERROR 1
/*
  Greens function helper routines executed on the Intel MIC.
  Every method takes untyped device buffer pointers plus the grid
  dimensions; the implementations live in the matching .cpp file.
*/
class MICGreensFunction {
private:
//MIC base object handed to the constructor
MICBase *m_micbase;
public:
/* constructor */
MICGreensFunction(MICBase *base);
/* destructor */
~MICGreensFunction();
/* compute greens integral analytically */
int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2);
/* perform the actual integration */
int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K);
/* Mirror rho-Field */
int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K);
/* multiply complex fields, size is the number of complex elements */
int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size);
};
#endif

116
src/MIC/MICMergeSort.h Normal file
View File

@ -0,0 +1,116 @@
#include <iostream>
#include <cstdlib>
#include <omp.h>
/* default comparison function */
/* default comparison: true when the first argument is greater than the second */
template<typename T>
inline bool greaterThan(T x, T y) {
  const bool firstIsGreater = (x > y);
  return firstIsGreater;
}
/* swap a and b */
/* exchange the values of a and b through a temporary copy */
template<typename T>
void mergeswap(T &a, T &b) {
  T held = a;   //keep the old value of a
  a = b;
  b = held;
}
/* forward declarations: split_merge uses these templates before they are
   defined further down in this header; for built-in element types they can
   not be found through argument dependent lookup */
template <typename T>
void quick_sort(T *list, int start, int end, bool (*comp)(T, T));
template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T));

/* Sort a[ibegin, iend) recursively.
   Ranges shorter than 500 elements are handed to quick_sort. Longer ranges
   are split in the middle: the first half is sorted in an OpenMP task, the
   second half by the current thread, and after the taskwait both halves
   are merged using b as scratch space (b is indexed like a). */
template <typename T>
void split_merge(T *a, int ibegin, int iend, T *b, bool (*comp)(T, T) ) {

  if (iend - ibegin < 500) {
    quick_sort(a + ibegin, 0, iend - ibegin - 1, comp);
    return;
  }

  int imiddle = (iend + ibegin) / 2;

#pragma omp task
  split_merge(a, ibegin, imiddle, b, comp);
  split_merge(a, imiddle, iend, b, comp);

  //both halves must be sorted before they are merged
#pragma omp taskwait
  merge(a, ibegin, imiddle, iend, b, comp);
}
/* Merge the two consecutive runs a[ibegin, imiddle) and a[imiddle, iend)
   into b[ibegin, iend) and copy the result back into a.
   An element of the first run is taken when the second run is used up or
   when comp(head of second run, head of first run) is true. */
template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T)) {

  int left = ibegin;     //next unread element of the first run
  int right = imiddle;   //next unread element of the second run

  for (int out = ibegin; out < iend; ++out) {
    bool takeLeft = false;
    if (left < imiddle)
      takeLeft = (right >= iend) || comp(a[right], a[left]);

    if (takeLeft) {
      b[out] = a[left];
      ++left;
    } else {
      b[out] = a[right];
      ++right;
    }
  }

  //copy the merged range from the scratch array back to a
  for (int pos = ibegin; pos < iend; ++pos)
    a[pos] = b[pos];
}
/* Partition a[start, end] (end inclusive) around the pivot a[start].
   Every element for which comp(pivot, element) is true is swapped into the
   front part, then the pivot is swapped to its final position.
   Returns the final index of the pivot. */
template <typename T>
int partition(T *a, int start, int end, bool (*comp)(T, T) ) {

  T pivot = a[start];
  int boundary = start;   //last index of the front part

  int scan = start + 1;
  while (scan <= end) {
    if ( comp(pivot, a[scan]) ) {
      ++boundary;
      mergeswap(a[scan], a[boundary]);
    }
    ++scan;
  }

  mergeswap(a[boundary], a[start]);
  return boundary;
}
/* Sort the n elements of list with split_merge inside an OpenMP parallel
   region; one thread starts the recursion and the tasks it creates are run
   by the team. comp defaults to greaterThan.
   The scratch array needed by the merge step is allocated here and released
   once split_merge has returned. Nothing is done for n <= 1. */
template <typename T>
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) {

  //nothing to sort, avoid allocating a scratch array
  if (n <= 1)
    return;

#pragma omp parallel
  {
#pragma omp single
    {
      T *b = new T[n];
      split_merge(list, 0, n, b, comp);
      //split_merge waits for its tasks before merging, b is no longer in use
      delete[] b;
    }
  }
}
/* forward declaration: quick_sort uses insertion_sort before it is defined
   further down in this header; for built-in element types it can not be
   found through argument dependent lookup */
template <typename T>
void insertion_sort(T *list, int start, int end, bool (*comp)(T, T));

/* Sort list[start, end] (end inclusive).
   Ranges where end - start < 9 are handed to insertion_sort, larger ranges
   are partitioned and both sides are sorted recursively. */
template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  if (start < end) {
    //for small elements move to insertion sort
    if ( (end - start) < 9 ) {
      //insertion_sort takes an exclusive upper bound
      insertion_sort(list, start, end + 1, comp);
    } else {
      int part = partition(list, start, end, comp);
      quick_sort(list, start, part - 1, comp);
      quick_sort(list, part + 1, end, comp);
    }
  }
}
/* Insertion sort of list[start, end) (end exclusive).
   An element is moved one position up while comp(element, key) is true.
   The inner loop stops at start, so elements in front of the requested
   range are never read or moved. */
template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  for (int i = start + 1; i < end; i++) {
    T key = list[i];
    int j = i - 1;
    //stay inside [start, end): do not step below start
    while ( j >= start && comp(list[j], key) ) {
      list[j + 1] = list[j];
      j--;
    }
    list[j + 1] = key;
  }
}

34
src/OpenCL/CMakeLists.txt Normal file
View File

@ -0,0 +1,34 @@
# Source files of the OpenCL backend.
set(_SRCS
  OpenCLBase.cpp
  OpenCLFFT.cpp
  OpenCLChiSquare.cpp
  OpenCLCollimatorPhysics.cpp
  OpenCLChiSquareRuntime.cpp
)

# Public headers of the OpenCL backend.
set(_HDRS
  OpenCLBase.h
  OpenCLFFT.h
  OpenCLChiSquare.h
  OpenCLCollimatorPhysics.h
  OpenCLChiSquareRuntime.h
)

# OpenCL kernel files that are installed next to the headers.
set(_KERNELS
  OpenCLKernels/OpenCLChiSquare.cl
  OpenCLKernels/OpenCLFFT.cl
  OpenCLKernels/OpenCLFFTStockham.cl
  OpenCLKernels/OpenCLTranspose.cl
  OpenCLKernels/OpenCLCollimatorPhysics.cl
  OpenCLKernels/OpenCLChiSquareRuntime.cl
)

# Hand the file lists to the project helper commands (defined outside this file).
add_sources(${_SRCS})
add_headers(${_HDRS})

# Headers go to include/OpenCL, kernel files to include/OpenCL/OpenCLKernels.
install(FILES ${_HDRS} DESTINATION include/OpenCL)
install(FILES ${_KERNELS} DESTINATION include/OpenCL/OpenCLKernels)

1132
src/OpenCL/OpenCLBase.cpp Normal file

File diff suppressed because it is too large Load Diff

303
src/OpenCL/OpenCLBase.h Normal file
View File

@ -0,0 +1,303 @@
/*
Name: OpenCLBase
Author: Uldis Locans
Info: OpenCL base class to handle all the common details associated
with kernel launch on OpenCL device
Date: 2014.09.18
*/
#ifndef H_OPENCL_BASE
#define H_OPENCL_BASE
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <string.h>
#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#include <OpenCL/cl_ext.h>
#else
#include <CL/cl.h>
#include <CL/cl_ext.h>
#endif
#include "../DKSDefinitions.h"
/* struct for random number state */
/* State record of the random number generator: six double values
   (s10..s12, s20..s22), one extra double z and a bool flag gen.
   NOTE(review): the meaning of the individual fields is not visible in
   this header - confirm against the kernels that use the state. */
typedef struct {
double s10;
double s11;
double s12;
double s20;
double s21;
double s22;
double z;
bool gen;
} RNDState;
/*
  Base class wrapping the OpenCL objects (context, command queue, platform,
  device, program, kernel, events) together with the helper calls to set
  them up, move data and launch kernels.
  The context, command queue, platform id, device id and last event are
  static members and therefore shared by all OpenCLBase objects.
*/
class OpenCLBase {
private:
static cl_context m_context;
static cl_command_queue m_command_queue;
static cl_platform_id m_platform_id;
static cl_device_id m_device_id;
cl_context_properties m_context_properties[3];
cl_program m_program;
cl_kernel m_kernel;
static cl_event m_last_event;
cl_int m_num_events;
std::vector<cl_event> m_events;
char * m_kernel_file;
cl_device_type m_device_type;
/*
Name: getPlatforms
Info: get all available platforms and save in m_platform_ids, save number of platforms
Return: success or error code
*/
int ocl_getPlatforms();
/*
Name: getDevice
Info: get first available devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu)
Return: success or error code
*/
int ocl_getDevice(const char* device_name);
/*
Name: getDeviceType
Info: get device type from device name (-gpu, -cpu, -mic)
Return: success or error code
*/
int ocl_getDeviceType(const char* device_name, cl_device_type &device_type);
/*
Name: createContext
Info: create context with specified device
Return: success or error code
*/
int ocl_createContext();
/*
Name: buildProgram
Info: build program from specified kernel file
Return: success or error code
*/
int ocl_buildProgram(const char* kernel_file);
/** Compile program from kernel source string
*
*/
int ocl_compileProgram(const char* kernel_source, const char* opts = NULL);
protected:
//flag and device buffer for the default random number states
int defaultRndSet;
cl_mem defaultRndState;
public:
/*
constructor
*/
OpenCLBase();
/*
destructor
*/
~OpenCLBase();
/*
Create RND states
Return: success or error code
*/
int ocl_createRndStates(int size);
/*
Destroy rnd states
Return: success or error code
*/
int ocl_deleteRndStates();
/*
Name: getAllDevices
Info: get all available devices
Return: success or error code
*/
int ocl_getAllDevices();
/** Get the OpenCL device count for the set type of device
*
*/
int ocl_getDeviceCount(int &ndev);
/** Get the name of the device used
*/
int ocl_getDeviceName(std::string &device_name);
/** Set the device to use for OpenCL kernels.
* device id to use is passed as integer.
*/
int ocl_setDevice(int device);
/** Get a list of all the unique devices of the same type that can run OpenCL kernels
* Used when GPUs of different types might be present on the system.
*/
int ocl_getUniqueDevices(std::vector<int> &devices);
/*
Name: setUp
Info: set up opencl resources
Return: success or error code
*/
int ocl_setUp(const char* device_name);
/*
Name: loadKernel
Info: load and compile opencl kernel file if it has changed
Return: success or error code
*/
int ocl_loadKernel(const char* kernel_file);
/** Build program from kernel source.
* Builds a program from source code provided in kernel_source.
* If compilation fails will return DKS_ERROR
*/
int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL);
/*
Name: allocateMemory
Info: allocate memory on device
Return: return pointer to memory
*/
cl_mem ocl_allocateMemory(size_t size, int &ierr);
/*
Name: allocateMemory
Info: allocate memory on device, with an additional integer type argument
Return: return pointer to memory
*/
cl_mem ocl_allocateMemory(size_t size, int type, int &ierr);
/*
Name: writeData
Info: write data to device memory (needs ptr to mem object)
Return: success or error code
*/
int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/*
Name: copyData
Info: copy data from one buffer on the device to another
Return: success or error code
*/
int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size);
/*
Name: createKernel
Info: create kernel from program
Return: success or error code
*/
int ocl_createKernel(const char* kernel_name);
/*
Name: setKernelArgs
Info: set opencl kernel arguments
Return: success or error code
*/
int ocl_setKernelArg(int idx, size_t size, const void *arg_value);
/*
Name: executeKernel
Info: execute selected kernel (needs kernel parameters)
Return: success or error code
*/
int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL);
/*
Name: readData
Info: read data from device (needs pointer to mem object)
Return: success or error code
*/
int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE);
/*
Name: freeMemory
Info: free device memory (needs ptr to mem object)
Return: success or error code
*/
int ocl_freeMemory(cl_mem mem_ptr);
/*
Name: cleanUp
Info: free opencl resources
Return: success or error code
*/
int ocl_cleanUp();
/*
Name: deviceInfo
Info: print device info (mostly for debugging purposes)
Return: success or error code
*/
int ocl_deviceInfo(bool verbose = true);
/* Check OpenCL kernel.
* Query device and check if it can run the kernel with required parameters
*/
int ocl_checkKernel(const char* kernel_name, int work_group_size,
bool double_precision, int &threadsPerBlock);
/*
Name: clearEvents
Info: clear saved events (for debugging purposes)
Return: nothing
*/
void ocl_clearEvents();
/*
Name: eventInfo
Info: print information about kernel timings (for debugging purposes)
Return: nothing
*/
void ocl_eventInfo();
/*
Return current command queue
*/
cl_command_queue ocl_getQueue() { return m_command_queue; }
};
#endif

View File

@ -0,0 +1,157 @@
#include "OpenCLChiSquare.h"
/*
  Info: sum `length` doubles stored in the device buffer `data`.
  The "parallelReductionSum" kernel is launched with work groups of 128
  items; the partial sums (one per launched work group) are read back and
  added up on the host.
  The number of partial sums is derived from the launched global size, so
  only values written by a work group are added (length / 128 + 1 would be
  one too many when length is a multiple of 128).
  NOTE(review): the status returned by the memory allocation (ierr) is not
  checked.
  Return: the sum, 0 when length <= 0
*/
double OpenCLChiSquare::ocl_sum(cl_mem data, int length) {

  //nothing to sum up, do not launch a kernel with an empty range
  if (length <= 0)
    return 0.0;

  int ierr;

  //number of work groups and global size rounded up to full work groups
  size_t work_size_sum = 128;
  size_t work_groups = ((size_t)length + work_size_sum - 1) / work_size_sum;
  size_t work_items = work_groups * work_size_sum;

  //create tmp array for partial sums
  std::vector<double> partial_sums(work_groups, 0.0);
  cl_mem tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr);

  //execute sum kernel, argument 2 is local scratch memory of one double per work item
  m_oclbase->ocl_createKernel("parallelReductionSum");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr);
  m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL);
  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum);

  //read partial sums and free temp memory
  m_oclbase->ocl_readData(tmp_ptr, &partial_sums[0], sizeof(double)*work_groups);
  m_oclbase->ocl_freeMemory(tmp_ptr);

  //sum up partial sums on the host
  double result = 0;
  for (size_t i = 0; i < work_groups; i++)
    result += partial_sums[i];

  return result;
}
/*
  Info: launch the "kernelPHistoTFFcn" kernel on the device buffers
  mem_data, mem_par and mem_result and store the sum of the first
  sensors * length values of mem_result in `result`.
  The global size covers sensors * length work items, rounded up to a
  multiple of the work group size of 128 (the rounding used to be based on
  length alone, which dropped the sensors factor whenever length was not a
  multiple of 128).
  NOTE(review): assumes the kernel ignores work items beyond its data
  range - confirm against the kernel source.
  Return: DKS_SUCCESS (the status codes of the OpenCL calls are not checked)
*/
int OpenCLChiSquare::ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result,
                                     double fTimeResolution, double fRebin,
                                     int sensors, int length, int numpar,
                                     double &result)
{

  //set number of work items and work group sizes for kernel execution
  size_t work_size = 128;
  size_t total = (size_t)length * sensors;
  size_t work_items = ((total + work_size - 1) / work_size) * work_size;

  cl_mem data = (cl_mem)mem_data;
  cl_mem par = (cl_mem)mem_par;
  cl_mem chi = (cl_mem)mem_result;

  //load and execute PHistotFFcn kernel, argument 8 is local memory of numpar doubles
  m_oclbase->ocl_createKernel("kernelPHistoTFFcn");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &par);
  m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &chi);
  m_oclbase->ocl_setKernelArg(3, sizeof(double), &fTimeResolution);
  m_oclbase->ocl_setKernelArg(4, sizeof(double), &fRebin);
  m_oclbase->ocl_setKernelArg(5, sizeof(int), &length);
  m_oclbase->ocl_setKernelArg(6, sizeof(int), &sensors);
  m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
  m_oclbase->ocl_setKernelArg(8, sizeof(double)*numpar, NULL);
  m_oclbase->ocl_executeKernel(1, &work_items, &work_size);

  result = ocl_sum(chi, sensors*length);

  return DKS_SUCCESS;
}
int OpenCLChiSquare::ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
//set number of work items and work group sizes for kernel execution
size_t work_size = 128;
size_t work_items = (size_t)length * sensors;
if (length % work_size > 0)
work_items = (length / work_size + 1) * work_size;
cl_mem data = (cl_mem)mem_data;
cl_mem t0 = (cl_mem)mem_t0;
cl_mem par = (cl_mem)mem_par;
cl_mem chi = (cl_mem)mem_result;
//load and execute PHistotFFcn kernel
m_oclbase->ocl_createKernel("kernelSingleGaussTF");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi);
m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
result = ocl_sum(chi, length);
return DKS_SUCCESS;
}
int OpenCLChiSquare::ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int sensors, int length, int numpar,
double &result)
{
//set number of work items and work group sizes for kernel execution
size_t work_size = 128;
size_t work_items = (size_t)length * sensors;
if (length % work_size > 0)
work_items = (length / work_size + 1) * work_size;
cl_mem data = (cl_mem)mem_data;
cl_mem t0 = (cl_mem)mem_t0;
cl_mem par = (cl_mem)mem_par;
cl_mem chi = (cl_mem)mem_result;
//load and execute PHistotFFcn kernel
m_oclbase->ocl_createKernel("kernelDoubleLorentzTF");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi);
m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution);
m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin);
m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset);
m_oclbase->ocl_setKernelArg(7, sizeof(int), &length);
m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors);
m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar);
m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL);
m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
result = ocl_sum(chi, length);
return DKS_SUCCESS;
}

View File

@ -0,0 +1,53 @@
#ifndef H_OPENCL_CHI_SQUARE
#define H_OPENCL_CHI_SQUARE
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "OpenCLBase.h"
#define DKS_SUCCESS 0
#define DKS_ERROR 1
/*
  Chi square calculations on an OpenCL device.
  Every public method launches one kernel through the OpenCLBase object
  handed to the constructor and returns the summed result in `result`.
*/
class OpenCLChiSquare {

public:

  /* remember the OpenCL base object that is used for all device calls */
  OpenCLChiSquare(OpenCLBase *base) : m_oclbase(base) { }

  /* nothing to release */
  ~OpenCLChiSquare() { }

  /* run kernelPHistoTFFcn and sum up the result buffer */
  int ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result,
                      double fTimeResolution, double fRebin,
                      int sensors, int length, int numpar,
                      double &result);

  /* run kernelSingleGaussTF and sum up the result buffer */
  int ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                        double fTimeResolution, double fRebin, double fGoodBinOffset,
                        int sensors, int length, int numpar,
                        double &result);

  /* run kernelDoubleLorentzTF and sum up the result buffer */
  int ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
                          double fTimeResolution, double fRebin, double fGoodBinOffset,
                          int sensors, int length, int numpar,
                          double &result);

private:

  OpenCLBase *m_oclbase;

  /* sum `length` doubles of the device buffer `data` */
  double ocl_sum(cl_mem data, int length);

};
#endif

View File

@ -0,0 +1,316 @@
#include "OpenCLChiSquareRuntime.h"
//store the OpenCL base object and reset all members to their start values
OpenCLChiSquareRuntime::OpenCLChiSquareRuntime(OpenCLBase *base) {

  //device access
  m_oclbase = base;

  //launch configuration, the number of blocks is not known yet
  blockSize_m = BLOCK_SIZE;
  numBlocks_m = -1;

  //all five constants start at 1.0
  N0_m = 1.0;
  tau_m = 1.0;
  bkg_m = 1.0;
  alpha_m = 1.0;
  beta_m = 1.0;

  //nothing compiled or initialized yet
  ptx_m = NULL;
  initDone_m = false;
}
//free temporary resources
OpenCLChiSquareRuntime::~OpenCLChiSquareRuntime() {
//release the ptx_m array; it is set to NULL in the constructor and delete[] on NULL does nothing
delete[] ptx_m;
//release the remaining resources through freeChiSquare(), which is defined outside this part of the file
freeChiSquare();
}
//build program string
std::string OpenCLChiSquareRuntime::buildProgram(std::string function) {
long fsize;
char *kernel_source;
//get kernel source
char * kernel_file = new char[500];
kernel_file[0] = '\0';
strcat(kernel_file, OPENCL_KERNELS);
strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl");
//read kernels from file
FILE *fp = fopen(kernel_file, "rb");
if (!fp)
DEBUG_MSG("Can't open kernel file" << kernel_file);
//get file size and allocate memory
fseek(fp, 0, SEEK_END);
fsize = ftell(fp);
kernel_source = new char[fsize+1];
//read file and content in kernel source
rewind(fp);
fread(kernel_source, 1, sizeof(char)*fsize, fp);
kernel_source[fsize] = '\0';
fclose(fp);
std::string kernel_string (kernel_source);
return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter;
}
int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) {
//build program string
std::string openclProg = buildProgram(function);
//compile flags
std::string opts("");
if (mlh)
opts = "-DMLH";
//compile opencl program from source string
int ierr = m_oclbase->ocl_loadKernelFromSource(openclProg.c_str(), opts.c_str());
return ierr;
}
/**
 * Sum the values of a device buffer.
 * Runs the parallelReductionTwoPhase kernel with a fixed launch of 80 work groups of 128
 * work items, reads one partial sum per work group back to the host and adds them up there.
 * Status codes of the individual device calls are not inspected.
 *
 * @param data   device buffer holding the values
 * @param length number of values, passed to the kernel as its fourth argument
 * @return sum of the partial sums read back from the device
 */
double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {

  int ierr;

  //fixed launch shape, independent of length
  size_t group_size = 128;
  int num_groups = 80;
  size_t total_items = 80 * group_size;

  //host and device storage for one partial sum per work group
  double *host_partials = new double[num_groups];
  cl_mem dev_partials = m_oclbase->ocl_allocateMemory(num_groups * sizeof(double), ierr);

  //run the reduction; argument 2 carries only a size (one double per work item) and no value
  m_oclbase->ocl_createKernel("parallelReductionTwoPhase");
  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data);
  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &dev_partials);
  m_oclbase->ocl_setKernelArg(2, group_size*sizeof(double), NULL);
  m_oclbase->ocl_setKernelArg(3, sizeof(int), &length);
  m_oclbase->ocl_executeKernel(1, &total_items, &group_size);

  //fetch the partial sums, the device buffer is not needed afterwards
  m_oclbase->ocl_readData(dev_partials, host_partials, sizeof(double)*num_groups);
  m_oclbase->ocl_freeMemory(dev_partials);

  //add the partial sums in index order on the host
  double total = 0;
  const double *cursor = host_partials;
  const double *last = host_partials + num_groups;
  while (cursor != last)
    total += *cursor++;

  delete[] host_partials;
  return total;

}
int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep, double &result)
{
int ierr;
//convert memory to cl_mem
cl_mem cl_mem_data = (cl_mem)mem_data;
cl_mem cl_mem_err = (cl_mem)mem_err;
cl_mem cl_param = (cl_mem)mem_param_m;
cl_mem cl_chisq = (cl_mem)mem_chisq_m;
cl_mem cl_map = (cl_mem)mem_map_m;
cl_mem cl_func = (cl_mem)mem_func_m;
//set work item size
size_t work_items;
size_t work_size = (size_t)blockSize_m;
if (numBlocks_m < 0)
work_items = (size_t)length;
else
work_items = (size_t)numBlocks_m * (size_t)blockSize_m;
if (work_items % work_size > 0)
work_items = (work_items / work_size + 1) * work_size;
if (fitType == FITTYPE_SINGLE_HISTO) {
//create kernel
ierr = m_oclbase->ocl_createKernel("kernelChiSquareSingleHisto");
if (ierr != DKS_SUCCESS)
return ierr;
//set kernel args
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m);
m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m);
m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m);
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL);
m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL);
m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL);
if (ierr != DKS_SUCCESS)
return ierr;
} else if (fitType == FITTYPE_ASYMMETRY) {
//create kernel
ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry");
if (ierr != DKS_SUCCESS)
return ierr;
//set kernel args
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map);
m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func);
m_oclbase->ocl_setKernelArg(6, sizeof(int), &length);
m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar);
m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc);
m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap);
m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart);
m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep);
m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m);
m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m);
m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL);
m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL);
m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL);
if (ierr != DKS_SUCCESS)
return ierr;
} else if (fitType == FITTYPE_MU_MINUS) {
// not yet implemented
} else {
return DKS_ERROR;
}
//execute kernel
ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size);
if (ierr != DKS_SUCCESS)
return ierr;
//execute sum kernel
result = calculateSum((cl_mem)mem_chisq_m, length);
return ierr;
}
/**
 * Copy numparams doubles from params into the mem_param_m device buffer.
 * @return status of the underlying ocl_writeData call
 */
int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) {
  return m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams);
}
/**
 * Copy numfunc doubles from func into the mem_func_m device buffer.
 * With numfunc == 0 nothing is written and DKS_SUCCESS is returned.
 * @return DKS_SUCCESS for an empty list, otherwise the status of ocl_writeData
 */
int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) {
  return (numfunc == 0)
    ? DKS_SUCCESS
    : m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc);
}
/**
 * Copy nummap ints from map into the mem_map_m device buffer.
 * With nummap == 0 nothing is written and DKS_SUCCESS is returned.
 * @return DKS_SUCCESS for an empty list, otherwise the status of ocl_writeData
 */
int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) {
  return (nummap == 0)
    ? DKS_SUCCESS
    : m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap);
}
int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param,
int size_func, int size_map)
{
int ierr = DKS_ERROR;
if (initDone_m) {
DEBUG_MSG("Reinitializing ChiSquare");
freeChiSquare();
}
//allocate temporary memory
mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr);
mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr);
if (size_func == 0)
size_func = 1;
mem_func_m = m_oclbase->ocl_allocateMemory(size_func*sizeof(double), ierr);
if (size_map == 0)
size_map = 1;
mem_map_m = m_oclbase->ocl_allocateMemory(size_map*sizeof(int), ierr);
initDone_m = true;
return ierr;
}
int OpenCLChiSquareRuntime::freeChiSquare() {
int ierr = DKS_ERROR;
if (initDone_m) {
//free memory
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m);
ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m);
initDone_m = false;
}
return ierr;
}
/**
 * Check the kernel that belongs to a fit type through ocl_checkKernel.
 *
 * @param fitType         FITTYPE_SINGLE_HISTO or FITTYPE_ASYMMETRY; any other value,
 *                        including FITTYPE_MU_MINUS, yields DKS_ERROR
 * @param threadsPerBlock forwarded by reference to ocl_checkKernel
 * @return DKS_ERROR for an unsupported fit type, otherwise the status of ocl_checkKernel
 */
int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBlock) {

  //name of the kernel to check
  char kernel[64];

  if (fitType == FITTYPE_SINGLE_HISTO)
    strncpy(kernel, "kernelChiSquareSingleHisto", sizeof(kernel));
  else if (fitType == FITTYPE_ASYMMETRY)
    strncpy(kernel, "kernelChiSquareAsymmetry", sizeof(kernel));
  else
    return DKS_ERROR; //FITTYPE_MU_MINUS is not yet implemented

  return m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);

}

View File

@ -0,0 +1,103 @@
#ifndef H_OPENCL_CHISQUARE_RUNTIME
#define H_OPENCL_CHISQUARE_RUNTIME
#include <iostream>
#include <string>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "../Algorithms/ChiSquareRuntime.h"
#include "OpenCLBase.h"
/** Opening of the generated theory function: signature and opening brace of fTheory.
 * Inside the generated body the arguments are therefore named t, p, f and m.
 */
const std::string openclFunctHeader = "double fTheory(double t, __local double *p, __local double *f, __local int *m) {";
/** Closing brace and newline that end the generated theory function. */
const std::string openclFunctFooter = "}\n";
/** OpenCL implementation of the ChiSquareRuntime interface.
 * The theory function is supplied as a string at runtime, appended to the kernel source
 * and compiled; the chi-square kernels are then launched through an OpenCLBase object.
 */
class OpenCLChiSquareRuntime : public ChiSquareRuntime {
private:
/** OpenCL base object used for all device calls. */
OpenCLBase *m_oclbase;
/** Private function to add the user defined function to the kernel string.
 * Returns the kernel source followed by the generated fTheory definition.
 */
std::string buildProgram(std::string function);
/** Sum up length doubles of a device buffer and return the result. */
double calculateSum(cl_mem data, int length);
public:
/** Constructor with OpenCLBase argument.
 *
 */
OpenCLChiSquareRuntime(OpenCLBase *base);
/** Default constructor.
 * NOTE(review): no definition of this constructor appears next to the other member
 * definitions - confirm it exists before using it.
 */
OpenCLChiSquareRuntime();
/** Destructor.
 * Frees the temporary device memory if it is still allocated.
 */
~OpenCLChiSquareRuntime();
/** Compile the program.
 * Adds the function string to the kernel source as the body of fTheory and compiles the
 * program; if mlh is true the option -DMLH is passed to the compiler.
 * The function must be a valid C math expression. In the generated function the time is
 * named t and the parameter, function and map arrays are named p, f and m, so a mapped
 * parameter is addressed as p[m[idx]].
 */
int compileProgram(std::string function, bool mlh = false);
/** Launch the selected kernel.
 * Launches the kernel for the given fit type from the compiled code.
 * The summed result is stored in the result argument.
 */
int launchChiSquare(int fitType,
void *mem_data, void *mem_err, int length,
int numpar, int numfunc, int nummap,
double timeStart, double timeStep,
double &result);
/** Write params to device.
 * Write params from double array to mem_param_m memory on the device.
 */
int writeParams(const double *params, int numparams);
/** Write functions to device.
 * Write function values from double array to mem_func_m memory on the device.
 */
int writeFunc(const double *func, int numfunc);
/** Write maps to device.
 * Write map values from int array to mem_map_m memory on the device.
 */
int writeMap(const int *map, int nummap);
/** Allocate temporary memory needed for chi square.
 * Initializes the necessary temporary memory for the chi square calculations. size_data needs
 * to be the maximum number of elements in any dataset that will be used for calculations.
 * size_param, size_func and size_map are the maximum number of parameters, functions and maps
 * used in calculations.
 */
int initChiSquare(int size_data, int size_param, int size_func, int size_map);
/** Free temporary memory allocated for chi square.
 * Frees the chisq temporary memory and the memory for params, functions and maps.
 */
int freeChiSquare();
/** Check the chi-square kernel of a fit type for necessary resources.
 * Queries through the OpenCL base object whether the kernel can be run; threadsPerBlock is
 * passed on by reference.
 */
int checkChiSquareKernels(int fitType, int &threadsPerBlock);
};
#endif

View File

@ -0,0 +1,107 @@
#include "OpenCLCollimatorPhysics.h"
//physical constants
#define M_P 0.93827231e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
#define eM_E 0.51099906e-03
#define Z_P 1
//the product is parenthesized so that K also expands correctly next to other operators, e.g. x / K
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)
//index constants, presumably positions in the parameter array (NUMPAR entries) - TODO confirm
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11
//launch configuration and parameter count
#define BLOCK_SIZE 128
#define NUMPAR 12
/*
TODO:
1. test OpenCL kernel
- is it launched for all particles
- does the random number generator function properly
- is particle structure updated correctly in memory
2. boost.compute sort for user defined structure crashes
*/
/*
  Collimator physics entry point of the OpenCL backend.
  The implementation is disabled: the former body is kept below as a comment and the
  function only prints a notice and returns DKS_ERROR. None of the arguments is used.
*/
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
int numparticles)
{
/*
//set number of total threads, and number threads per block
size_t threads = 1;
size_t blocks = numparticles;
//cast void ptrs to cl_mem ptrs
cl_mem data = (cl_mem)mem_ptr;
cl_mem params = (cl_mem)par_ptr;
int numparams = 19;
//set kernel to execute and kernel arguments
ocl_createKernel("kernelCollimatorPhysics");
ocl_setKernelArg(0, sizeof(cl_mem), &data);
ocl_setKernelArg(1, sizeof(cl_mem), &params);
ocl_setKernelArg(2, sizeof(cl_mem), &defaultRndState);
ocl_setKernelArg(3, sizeof(int), &numparticles);
ocl_setKernelArg(4, sizeof(double)*numparams, NULL);
std::cout << "blocks: " << blocks << ", threads: " << threads << std::endl;
//execute kernel on device
ocl_executeKernel(1, &blocks, &threads);
//create functions for comparing two particles and counting particles with labels < 0
BOOST_COMPUTE_FUNCTION(bool, sort_by_label, (PART_OPENCL a, PART_OPENCL b),
{
return a.label < b.label;
});
BOOST_COMPUTE_FUNCTION(bool, count_by_label, (PART_OPENCL a),
{
return a.label < 0;
});
//wrap cl_mem memory object in Boost.Compute buffer
std::cout << "wrap buffer" << std::endl;
boost::compute::buffer buf(data);
//count particles with labels < 0
std::cout << "wrap command queue" << std::endl;
boost::compute::command_queue queue(ocl_getQueue());
std::cout << "count if" << std::endl;
numaddback = boost::compute::count_if(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0),
boost::compute::make_buffer_iterator<PART_OPENCL>(buf,numparticles),
count_by_label, queue);
//sort particles with dead and leaving particles at the end using boost::compute
numaddback = 0;
if (numaddback > 0) {
std::cout << "sort" << std::endl;
boost::compute::sort(boost::compute::make_buffer_iterator<PART_OPENCL>(buf,0),
boost::compute::make_buffer_iterator<PART_OPENCL>(buf, numparticles),
sort_by_label, queue);
}
return DKS_SUCCESS;
*/
//the OpenCL path is switched off, report that to the caller
std::cout << "OpenCL implementation disabled" << std::endl;
return DKS_ERROR;
}

View File

@ -0,0 +1,85 @@
#ifndef H_OPENCL_DEGRADER
#define H_OPENCL_DEGRADER
#include <iostream>
#include <math.h>
#include "../Algorithms/CollimatorPhysics.h"
#include "OpenCLBase.h"
/*
#include "boost/compute/types/struct.hpp"
#include "boost/compute/type_traits/type_name.hpp"
#include "boost/compute/algorithm/count_if.hpp"
#include "boost/compute/algorithm/sort.hpp"
#include "boost/compute/container/vector.hpp"
#include "boost/compute/iterator/buffer_iterator.hpp"
#include "boost/compute/core.hpp"
*/
/* Three double components x, y and z; used below for the Rincol and Pincol members of PART_OPENCL. */
typedef struct {
double x;
double y;
double z;
} Double3;
/* Particle record: an integer label, an unsigned local id and two Double3 vectors.
   NOTE(review): Rincol and Pincol are presumably position and momentum inside the
   collimator - confirm against the code that fills this struct. */
typedef struct {
int label;
unsigned localID;
Double3 Rincol;
Double3 Pincol;
} PART_OPENCL;
//adapt struct PART for use in Boost.Compute
//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
/*
  OpenCL backend of the DKSCollimatorPhysics interface.
  Only CollimatorPhysics is defined outside this header; every other member is an inline
  stub that returns DKS_ERROR.
*/
class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {

public:

  /* constructor: remember the OpenCL base object */
  OpenCLCollimatorPhysics(OpenCLBase *base) : m_oclbase(base) { }

  /* destructor: nothing to release */
  ~OpenCLCollimatorPhysics() { }

  /* execute degrader code on device */
  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);

  /* structure-of-arrays variant: stub, always DKS_ERROR */
  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
                           void *rx_ptr, void *ry_ptr, void *rz_ptr,
                           void *px_ptr, void *py_ptr, void *pz_ptr,
                           void *par_ptr, int numparticles)
  {
    return DKS_ERROR;
  }

  /* sort: stub, always DKS_ERROR; numaddback is left untouched */
  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback)
  {
    return DKS_ERROR;
  }

  /* structure-of-arrays sort: stub, always DKS_ERROR; numaddback is left untouched */
  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
                               void *rx_ptr, void *ry_ptr, void *rz_ptr,
                               void *px_ptr, void *py_ptr, void *pz_ptr,
                               void *par_ptr, int numparticles, int &numaddback)
  {
    return DKS_ERROR;
  }

  /* tracker push: stub, always DKS_ERROR */
  int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr,
                           double dt, double c, bool usedt = false, int streamId = -1)
  {
    return DKS_ERROR;
  }

  /* tracker push with transform: stub, always DKS_ERROR */
  int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr,
                                    void *orient_ptr, int npart, int nsec, void *dt_ptr,
                                    double dt, double c, bool usedt = false,
                                    int streamId = -1)
  {
    return DKS_ERROR;
  }

private:

  /* OpenCL base object supplied at construction */
  OpenCLBase *m_oclbase;

};
#endif

303
src/OpenCL/OpenCLFFT.cpp Normal file
View File

@ -0,0 +1,303 @@
#include "OpenCLFFT.h"
//=====================================//
//==========Private functions==========//
//=====================================//
/*
  Call the FFT3D kernel for one dimension of the domain.
  data - device memory ptr, cdim - current dim to transform (0, 1 or 2),
  ndim - total number of dimensions, N - size of each dimension,
  forward - passed to the kernel as 1 (true) or 0 (false).
  The kernel is launched once per step = 1, 2, 4, ... < N.
  Returns OCL_SUCCESS, or OCL_ERROR if cdim is out of range or any OpenCL call fails.
*/
int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) {

  //cdim is used as an index into work_items, so it has to name one of the three dimensions
  if (cdim < 0 || cdim > 2)
    return OCL_ERROR;

  //set the number of work items in each dimension; unused dimensions get one item, the
  //same rule the bit reverse and normalize launches use; the transformed dimension gets N / 2
  size_t work_items[3];
  work_items[0] = N;
  work_items[1] = (ndim > 1) ? N : 1;
  work_items[2] = (ndim > 2) ? N : 1;
  work_items[cdim] = N / 2;

  int f = (forward) ? 1 : 0;

  //create kernel and set the arguments that stay the same for every step
  if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
    return OCL_ERROR;

  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
    return OCL_ERROR;

  //execute kernel once per step, only argument 1 changes between launches
  for (int step = 1; step < N; step <<= 1) {
    if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
      return OCL_ERROR;

    if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
      return OCL_ERROR;
  }

  return OCL_SUCCESS;
}
/*
  Call the BitReverseSort3D kernel for one dimension of the domain.
  data - device memory ptr, cdim - current dim to transform,
  ndim - total number of dimensions, N - size of each dimension.
  Returns OCL_SUCCESS, or OCL_ERROR if any OpenCL call fails.
*/
int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {

  //one work item per element, dimensions that are not used get a single item
  size_t global_size[3] = { 1, 1, 1 };
  global_size[0] = N;
  if (ndim > 1)
    global_size[1] = N;
  if (ndim > 2)
    global_size[2] = N;

  if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS)
    return OCL_ERROR;

  //kernel argument 1: log2 of the dimension size, truncated to an integer
  int bits = log2(N);

  //arguments are set in order, the first failure ends the call
  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS
      || m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS
      || m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
  {
    return OCL_ERROR;
  }

  if (m_oclbase->ocl_executeKernel(ndim, global_size) == OCL_SUCCESS)
    return OCL_SUCCESS;

  DEBUG_MSG("Error executing kernel");
  return OCL_ERROR;
}
//=====================================//
//==========Public functions==========//
//=====================================//
/*
  Execute the FFT on the device: for every dimension the bit reverse kernel is called
  first and the FFT kernel second. Only N[0] is used as the size of each dimension;
  streamId is not used.
  Returns OCL_SUCCESS, or OCL_ERROR as soon as one of the two steps fails.
*/
int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) {

  cl_mem inout = (cl_mem)data;
  const int n = N[0];

  int dim = 0;
  while (dim < ndim) {

    if (ocl_callBitReverseKernel(inout, dim, ndim, n) != OCL_SUCCESS) {
      DEBUG_MSG("Error executing bit reverse");
      return OCL_ERROR;
    }

    if (ocl_callFFTKernel(inout, dim, ndim, n, forward) != OCL_SUCCESS) {
      DEBUG_MSG("Error executing fft reverse");
      return OCL_ERROR;
    }

    ++dim;
  }

  return OCL_SUCCESS;
}
/*
  Execute the inverse FFT: executeFFT with forward = false.
  Returns the status of executeFFT, so a failed transform is reported to the caller
  instead of being masked by OCL_SUCCESS.
*/
int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) {
  return executeFFT(data, ndim, N, streamId, false);
}
/*
  Call the normalizeFFT kernel on the data set. Only N[0] is used as the size of each
  dimension; the element count passed to the kernel is N[0] to the power of ndim.
  streamId is not used.
  Returns OCL_SUCCESS, or OCL_ERROR if any OpenCL call fails.
*/
int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) {

  cl_mem inout = (cl_mem)data;
  const int n = N[0];

  //one work item per element, dimensions that are not used get a single item
  size_t global_size[3] = { 1, 1, 1 };
  global_size[0] = n;
  if (ndim > 1)
    global_size[1] = n;
  if (ndim > 2)
    global_size[2] = n;

  if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS)
    return OCL_ERROR;

  //total number of elements in the data set
  unsigned int elements = pow(n, ndim);

  //arguments are set in order, the first failure ends the call
  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS
      || m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS)
  {
    return OCL_ERROR;
  }

  if (m_oclbase->ocl_executeKernel(ndim, global_size) == OCL_SUCCESS)
    return OCL_SUCCESS;

  DEBUG_MSG("Error executing kernel");
  return OCL_ERROR;
}
int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) {
int ierr;
int size = sizeof(cl_double2)*pow(N,ndim);
cl_mem mem_tmp;
cl_mem mem_src = (cl_mem)src;
cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr);
//set the number of work items in each dimension
size_t work_items[3];
int p = 1;
int threads = N / 2;
int f = (forward) ? -1 : 1;
//execute kernel
int n = (int)log2(N);
for (int i = 0; i < ndim; i++) {
int dim = i+1;
p = 1;
work_items[0] = (dim == 1) ? N/2 : N;
work_items[1] = (dim == 2) ? N/2 : N;
work_items[2] = (dim == 3) ? N/2 : N;
//transpose array if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
//create kernel and set kernel arguments
if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS)
return OCL_ERROR;
for (int t = 1; t <= log2(N); t++) {
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &p);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim);
m_oclbase->ocl_setKernelArg(5, sizeof(int), &f);
if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
return OCL_ERROR;
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
p = 2*p;
}
//transpose array back if calculating dimension larger than 1
//if (dim > 1)
// ocl_executeTranspose(mem_src, N, ndim, dim);
}
if (ndim*n % 2 == 1) {
m_oclbase->ocl_copyData(mem_src, mem_dst, size);
mem_tmp = mem_src;
mem_src = mem_dst;
mem_dst = mem_tmp;
}
m_oclbase->ocl_freeMemory(mem_dst);
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) {
cl_mem mem_src = (cl_mem)src;
size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N};
size_t work_group_size[3] = {(size_t)N/2, 1, 1};
m_oclbase->ocl_createKernel("fft_batch3D");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL);
m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL);
m_oclbase->ocl_setKernelArg(4, sizeof(int), &N);
for (int dim = 1; dim < ndim+1; dim++) {
m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim);
m_oclbase->ocl_executeKernel(3, work_items, work_group_size);
}
return OCL_SUCCESS;
}
int OpenCLFFT::ocl_executeTranspose(void *src, int N[3], int ndim, int dim) {
cl_mem mem_src = (cl_mem)src;
if (ndim == 1)
return OCL_SUCCESS;
size_t work_items[3];
work_items[0] = N[0];
work_items[1] = N[1];
work_items[2] = 1;
size_t work_group_size[3];
work_group_size[0] = N[0];
work_group_size[1] = N[1];
work_group_size[2] = 1;
size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
m_oclbase->ocl_createKernel("transpose");
m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
return OCL_SUCCESS;
}
/*
void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
for (int k = 0; k < N; k++) {
double d = data[i*N*N + j*N + k].x;
if (d > 10e-5 || d < -10e-5)
std::cout << d << "\t";
else
std::cout << 0 << "\t";
}
}
std::cout << std::endl;
}
std::cout << std::endl;
}
*/

113
src/OpenCL/OpenCLFFT.h Normal file
View File

@ -0,0 +1,113 @@
/*
Name: OpenCLFFT
Author: Uldis Locans
Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
Data: 19.09.2014
*/
#ifndef H_OPENCL_FFT
#define H_OPENCL_FFT
#include <iostream>
#include <math.h>
#include <complex>
#include "../Algorithms/FFT.h"
#include "OpenCLBase.h"
/*
  OpenCL backend of the DKSFFT interface. All device calls go through the OpenCLBase
  object handed to the constructor.
*/
class OpenCLFFT : public DKSFFT {

public:

  /* constructor - stores the OpenCL base object */
  OpenCLFFT(OpenCLBase *base) : m_oclbase(base) { }

  /* destructor - currently does nothing */
  ~OpenCLFFT() { }

  /*
    Info: execute forward fft function with data set on device
    Return: success or error code
  */
  int executeFFT(void *data, int ndim, int N[3], int streamId = -1, bool forward = true);

  /*
    Info: execute inverse fft with data set on device
    Return: success or error code
  */
  int executeIFFT(void *data, int ndim, int N[3], int streamId = -1);

  /*
    Info: execute normalize kernel
    Return: success or error code
  */
  int normalizeFFT(void *data, int ndim, int N[3], int streamId = -1);

  /*
    Info: FFT setup and teardown; nothing is done here
    Return: always DKS_SUCCESS
  */
  int setupFFT(int ndim, int N[3])
  {
    return DKS_SUCCESS;
  }

  int setupFFTRC(int ndim, int N[3], double scale = 1.0)
  {
    return DKS_SUCCESS;
  }

  int setupFFTCR(int ndim, int N[3], double scale = 1.0)
  {
    return DKS_SUCCESS;
  }

  int destroyFFT()
  {
    return DKS_SUCCESS;
  }

  /*
    Info: real to complex and complex to real transforms; stubs in this class
    Return: always DKS_ERROR
  */
  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                   int streamId = -1)
  {
    return DKS_ERROR;
  }

  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
                   int streamId = -1)
  {
    return DKS_ERROR;
  }

  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
  {
    return DKS_ERROR;
  }

  /*
    Info: alternative FFT implementations and the transpose helper
    Return: success or error code
  */
  int ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward = true);

  int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true);

  int ocl_executeTranspose(void *src, int N[3], int ndim, int dim);

private:

  /* OpenCL base object supplied at construction */
  OpenCLBase *m_oclbase;

  /*
    Info: call fft kernels to execute FFT of the given domain,
    data - device memory ptr, cdim - current dim to transform,
    ndim - total number of dimensions, N - size of dimension
    Return: success or error code
  */
  int ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward = true);

  /*
    Info: call the kernel that executes the bit reverse sort,
    data - device memory ptr, cdim - current dim to transform,
    ndim - total number of dimensions, N - size of dimension
    Return: success or error code
  */
  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);

};
#endif

View File

@ -0,0 +1,175 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define TAU 2.197019
/* Sum reduction inside each work group.
 * Every work item copies one element of data_in into local memory (0 if its global id is
 * not below size); the work group then adds the values pairwise and work item 0 writes
 * the group's sum to data_out[group id].
 * data_local needs one double per work item of the group.
 * The stride starts at half the work group size and is halved with integer division, so
 * the scheme assumes a power-of-two work group size.
 */
__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
__local double *data_local, int size)
{
//get local and global ids, and work group size
int local_id = get_local_id(0);
int global_id = get_global_id(0);
int group_size = get_local_size(0);
//copy from global memory to local, if global id out of bounds fill with 0s
if (global_id < size)
data_local[local_id] = data_in[global_id];
else
data_local[local_id] = 0;
//loop through reduction steps
for (uint stride = group_size / 2; stride > 0; stride /= 2) {
//sync all work items in the work group so the values written in the previous step are visible
barrier(CLK_LOCAL_MEM_FENCE);
//create partial sums each step
if (local_id < stride)
data_local[local_id] += data_local[local_id + stride];
}
//local thread 0 writes final partial sum to global memory
if (local_id == 0)
data_out[get_group_id(0)] = data_local[0];
}
/* Per-bin chi-square terms for all sensors.
 * Work item j handles bin j of every sensor; data and chisq are indexed as
 * sensor * length + bin. The parameters are first copied from par to the local array p:
 * p[0] and p[1] are shared by all sensors, p[2+4*i] .. p[5+4*i] belong to sensor i.
 * Only work items with a local id below numpar copy a parameter, so the work group must
 * have at least numpar work items for p to be complete.
 */
__kernel void kernelPHistoTFFcn(__global double *data, __global double *par, __global double *chisq,
double fTimeResolution, double fRebin,
int length, int sensors, int numpar,
__local double *p)
{
//get work item id and calc global id
int tid = get_local_id(0);
int j = get_global_id(0);
//load parameters from global to local memory
if (tid < numpar)
p[tid] = par[tid];
//sync work items inside work group
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
if (j < length) {
//time of bin j
double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
double time = dt0 + fTimeResolution * fRebin * j;
//terms that do not depend on the sensor
double w = p[0]*0.08516155035269027;
double tt = exp(-time/TAU);
double pp = exp(-0.5 * pow(p[1]*time, 2.0));
double wt = w * time;
int idx;
double ldata, theo;
for (int i = 0; i < sensors; i++) {
idx = i * length + j;
ldata = data[idx];
theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
//squared difference divided by the data value; for empty bins the squared theory value is used
if (ldata != 0.0)
chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
else
chisq[idx] = theo * theo;
}
}
}
/* Per-bin terms of the single Gauss TF fit for all sensors.
 * Work item j handles bin j of every sensor; data and result are indexed as
 * sensor * length + bin and t0 holds one value per sensor. The parameters are first
 * copied from par to the local array p: p[0] and p[1] are shared by all sensors,
 * p[2+4*i] .. p[5+4*i] belong to sensor i.
 * Only work items with a local id below numpar copy a parameter, so the work group must
 * have at least numpar work items for p to be complete.
 * Bins before t0[i] + fGoodBinOffset/fRebin get a result of 0.
 */
__kernel void kernelSingleGaussTF(__global double *data, __global unsigned int *t0,
                                  __global double *par, __global double *result,
                                  double fTimeResolution, double fRebin, double fGoodBinOffset,
                                  int length, int sensors, int numpar, __local double *p)
{

  //get work item id and calc global id
  int tid = get_local_id(0);
  int j = get_global_id(0);

  //load parameters from global to local memory
  if (tid < numpar)
    p[tid] = par[tid];

  //sync work items inside work group
  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

  if (j < length) {

    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    //read from the local copy like the other kernels do, not from global memory again
    double w1 = p[0]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0 + fTimeResolution * fRebin* (j - lft0);
        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
               *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
        // 1.74532925199432955e-2 = pi/180

        //the logarithmic term is only added when data and theory are clearly away from zero
        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }
}
/* Per-bin terms of the double Lorentz TF fit for all sensors.
 * Work item j handles bin j of every sensor; data and result are indexed as
 * sensor * length + bin and t0 holds one value per sensor. The parameters are first
 * copied from par to the local array p: p[0] .. p[3] are shared by all sensors,
 * p[4+5*i] .. p[8+5*i] belong to sensor i.
 * Only work items with a local id below numpar copy a parameter, so the work group must
 * have at least numpar work items for p to be complete.
 * Bins before t0[i] + fGoodBinOffset/fRebin get a result of 0.
 */
__kernel void kernelDoubleLorentzTF(__global double *data, __global unsigned int *t0,
__global double *par, __global double *result,
double fTimeResolution, double fRebin, double fGoodBinOffset,
int length, int sensors, int numpar, __local double *p)
{
//get work item id and calc global id
int tid = get_local_id(0);
int j = get_global_id(0);
//load parameters from global to local memory
if (tid < numpar)
p[tid] = par[tid];
//sync work items inside work group
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
if (j < length) {
double dt0 = fTimeResolution*0.5*(fRebin - 1);
//the two frequencies, scaled by the same factor
double w1 = p[0]*0.08516155035269027;
double w2 = p[2]*0.08516155035269027;
int idx;
double ldata, lft0, theo, time;
for (int i = 0; i < sensors; i++) {
idx = i * length + j;
lft0 = t0[i];
if (j >= lft0 + fGoodBinOffset/fRebin) {
ldata = data[idx];
time = dt0+fTimeResolution*fRebin*(j-lft0);
//two damped cosine terms weighted with p[8+i*5] and 1 - p[8+i*5]
theo = p[4+i*5]*exp(-time/TAU)*
(1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
(1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5];
// 1.74532925199432955e-2 = pi/180
//the logarithmic term is only added when data and theory are clearly away from zero
if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
result[idx] = (theo - ldata) + ldata*log(ldata/theo);
else
result[idx] = theo - ldata;
} else {
result[idx] = 0;
}
}
}
}

View File

@ -0,0 +1,344 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define PI 3.141592653589793115998
#define TWO_PI 6.283185307179586231996
#define DEG_TO_RAD 1.7453292519943295474371681e-2
/** From 'Numerical Recipes in C' by Press et al., 1992.
 * Returns the Bessel function J0(x) for any real x.
 * For |x| < 8 a ratio of two polynomials in x*x is evaluated, otherwise a fitting
 * function in 8/|x| multiplied by sqrt(0.636619772/|x|).
 */
double bessj0(double x) {
double ax,z;
double xx,y,ans,ans1,ans2; //Accumulate polynomials in double precision.
if ((ax=fabs(x)) < 8.0) { //Direct rational function fit.
y=x*x;
ans1=57568490574.0+y*(-13362590354.0+y*(651619640.7+y*(-11214424.18+y*(77392.33017+y*(-184.9052456)))));
ans2=57568490411.0+y*(1029532985.0+y*(9494680.718+y*(59272.64853+y*(267.8532712+y*1.0))));
ans=ans1/ans2;
} else { //Fitting function (6.5.9).
z=8.0/ax;
y=z*z;
xx=ax-0.785398164; //argument shifted by 0.785398164
ans1=1.0+y*(-0.1098628627e-2+y*(0.2734510407e-4+y*(-0.2073370639e-5+y*0.2093887211e-6)));
ans2 = -0.1562499995e-1+y*(0.1430488765e-3+y*(-0.6911147651e-5+y*(0.7621095161e-6-y*0.934945152e-7)));
ans=sqrt(0.636619772/ax)*(cos(xx)*ans1-z*sin(xx)*ans2);
}
return ans;
}
/** Theory function declaration.
* The definition of the theory function is built at runtime, before compilation.
*/
double fTheory(double t, __local double *p, __local double *f, __local int *m);
/** MusrFit predefined functions.
* Predefined functions from MusrFit that can be used to define the theory function.
* The first parameter of every function is always the time t; the rest of the parameters depend
* on the function.
*/
double se(double t, double lamda) {
return exp( -lamda*t );
}
double ge(double t, double lamda, double beta) {
return exp( -pow(lamda*t, beta) );
}
double sg(double t, double sigma) {
return exp( -0.5 * pow(sigma*t, 2) );
}
double stg(double t, double sigma) {
double sigmatsq = pow(sigma*t,2);
return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
}
double sekt(double t, double lambda) {
double lambdat = lambda*t;
return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
}
double lgkt(double t, double lambda, double sigma) {
double lambdat = lambda*t;
double sigmatsq = pow(sigma*t, 2.0);
return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
}
double skt(double t, double sigma, double beta) {
if (beta < 1.0e-3)
return 0.0;
double sigmatb = pow(sigma*t, beta);
return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
}
/* Computes
 *   1/3 * exp(-rateL) + 2/3 * (1 - lambda^2 t^2 q / rateT) * exp(-rateT)
 * with rateL = sqrt(|4 lambda^2 (1-q) t / gamma|) and
 *      rateT = sqrt(|4 lambda^2 (1-q) t / gamma| + lambda^2 t^2 q).
 *
 * At t == 0 (or lambda == 0) both lambda^2 t^2 q and rateT are zero and the
 * quotient would be 0/0 = NaN; the function returns its limit 1.0 instead so
 * that a histogram starting at t = 0 does not poison the chi-square sum.
 */
double spg(double t, double lambda, double gamma, double q) {
    double lam2 = lambda*lambda;
    double lamt2q = t*t*lam2*q;
    double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
    double rateL = sqrt(fabs(rate2));
    double rateT = sqrt(fabs(rate2)+lamt2q);

    /* 0/0 case: rateL is zero as well here, so the limit is 1/3 + 2/3 */
    if (rateT == 0.0 && lamt2q == 0.0)
        return 1.0;

    return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
}
/* Returns
 *   1/6 * (1 - nu t / 2) * exp(-nu t / 2)
 * + 1/3 * (1 - nu t / 4) * exp(-0.25 * (nu t + 2.44949 * lambda t)). */
double rahf(double t, double nu, double lambda) {
    double x = nu*t;
    double xHalf = nu*t/2.0;
    double lt = lambda*t;
    double first = (1.0/6.0)*(1.0-xHalf)*exp(-xHalf);
    double second = (1.0/3.0)*(1.0-x/4.0)*exp(-0.25*(x+2.44949*lt));
    return first + second;
}
/* Returns cos(TWO_PI * nu * t + DEG_TO_RAD * phi). */
double tf(double t, double phi, double nu) {
    double wt = TWO_PI*nu*t;
    double ph = DEG_TO_RAD * phi;
    return cos(wt + ph);
}
/* Returns alpha * cos(TWO_PI nu t + DEG_TO_RAD phi) * exp(-lambdaT t)
 *       + (1 - alpha) * exp(-lambdaL t). */
double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
    double omegaT = TWO_PI*nu*t;
    double phase = DEG_TO_RAD*phi;
    double oscillating = alpha*cos(omegaT+phase)*exp(-lambdaT*t);
    double relaxing = (1.0-alpha)*exp(-lambdaL*t);
    return oscillating + relaxing;
}
/* Returns bessj0(TWO_PI * nu * t + DEG_TO_RAD * phi). */
double b(double t, double phi, double nu) {
    double arg = TWO_PI*nu*t + DEG_TO_RAD*phi;
    return bessj0(arg);
}
/* Returns alpha * bessj0(TWO_PI nu t + DEG_TO_RAD phi) * exp(-lambdaT t)
 *       + (1 - alpha) * exp(-lambdaL t). */
double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
    double omegaT = TWO_PI * nu * t;
    double phase = DEG_TO_RAD * phi;
    double oscillating = alpha*bessj0(omegaT+phase)*exp(-lambdaT*t);
    double relaxing = (1.0-alpha)*exp(-lambdaL*t);
    return oscillating + relaxing;
}
/* Returns exp(-(sigma/gamma)^2 * (exp(-gamma t) - 1 + gamma t)). */
double ab(double t, double sigma, double gamma) {
    double x = gamma*t;
    double bracket = exp(-x) - 1.0 + x;
    return exp(-pow(sigma/gamma,2.0)*bracket);
}
/* With D = (Delta0 t)^2 and a = 1 / (1 + Rb^2 D), returns
 *   1/3 + 2/3 * a^1.5 * (1 - D a) * exp(-0.5 D a). */
double snkzf(double t, double Delta0, double Rb) {
    double d2 = pow(Delta0*t, 2.0);
    double inv = 1.0/(1.0+pow(Rb,2.0)*d2);
    double damping = exp(-0.5*d2*inv);
    return (1.0/3.0) + (2.0/3.0)*pow(inv,1.5)*(1.0-d2*inv)*damping;
}
/* With D = (Delta0 t)^2 and a = 1 / (1 + Rb^2 D), returns
 *   sqrt(a) * exp(-0.5 D a) * cos(TWO_PI nu t + DEG_TO_RAD phi). */
double snktf(double t, double phi, double nu, double Delta0, double Rb) {
    double omegaT = TWO_PI*nu*t;
    double phase = DEG_TO_RAD*phi;
    double d2 = pow(Delta0*t, 2.0);
    double inv = 1.0/(1.0+pow(Rb,2.0)*d2);
    double envelope = sqrt(inv)*exp(-0.5*d2*inv);
    return envelope*cos(omegaT+phase);
}
/* With theta = (exp(-nuc t) - 1 - nuc t) / nuc^2 and
 * a = 1 / (1 + 4 (Rb Delta0)^2 theta), returns
 *   sqrt(a) * exp(-2 Delta0^2 theta a).
 *
 * NOTE(review): ab() in this file uses "- 1.0 + gt" in the analogous bracket,
 * whereas this function subtracts nuct; confirm the sign against the MusrFit
 * reference implementation. */
double dnkzf(double t, double Delta0, double Rb, double nuc) {
    double x = nuc*t;
    double theta = (exp(-x) - 1.0 -x)/pow(nuc, 2.0);
    double inv = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
    double damping = exp(-2.0*Delta0*Delta0*theta*inv);
    return sqrt(inv)*damping;
}
/* With theta = (exp(-nuc t) - 1 - nuc t) / nuc^2 and
 * a = 1 / (1 + 2 (Rb Delta0)^2 theta), returns
 *   sqrt(a) * exp(-Delta0^2 theta a) * cos(TWO_PI nu t + DEG_TO_RAD phi).
 *
 * NOTE(review): ab() in this file uses "- 1.0 + gt" in the analogous bracket,
 * whereas this function subtracts nuct; confirm the sign against the MusrFit
 * reference implementation. */
double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
    double omegaT = TWO_PI*nu*t;
    double phase = DEG_TO_RAD*phi;
    double x = nuc*t;
    double theta = (exp(-x) - 1.0 -x)/pow(nuc, 2.0);
    double inv = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
    double envelope = sqrt(inv)*exp(-Delta0*Delta0*theta*inv);
    return envelope*cos(omegaT+phase);
}
/* Per-bin chi-square (or, with MLH defined, log-likelihood) terms for a
 * single-histogram fit.
 *
 * Every work-group first copies par, funcv and map into the local arrays
 * p, f and m. Each work-item then walks the bins j, j + global_size, ...
 * and stores one term per bin in chisq[j]; the terms are not summed here.
 * The model value for a bin at time t = timeStart + j * timeStep is
 *   N0 * exp(-t / tau) * (1 + fTheory(t, p, f, m)) + bkg.
 */
__kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
    __global double *par, __global double *chisq, __global int *map, __global double *funcv,
    int length, int numpar, int numfunc, int nummap,
    double timeStart, double timeStep,
    double tau, double N0, double bkg,
    __local double *p, __local double *f, __local int *m)
{
    const int lid = get_local_id(0);
    const int lsize = get_local_size(0);

    /* cooperative copy of parameters, functions and maps to local memory */
    for (int i = lid; i < numpar; i += lsize)
        p[i] = par[i];
    for (int i = lid; i < numfunc; i += lsize)
        f[i] = funcv[i];
    for (int i = lid; i < nummap; i += lsize)
        m[i] = map[i];

    /* local copies must be complete before any work-item evaluates fTheory */
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = get_global_id(0); j < length; j += get_global_size(0)) {
        double t = timeStart + j*timeStep;
        double ldata = data[j];
        double lerr = err[j];
        double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;
#ifdef MLH
        if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
            chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
        else
            chisq[j] = 2.0 * (theo - ldata);
#else
        /* bins without an error estimate contribute theo^2 */
        if (lerr != 0.0)
            chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
        else
            chisq[j] = theo * theo;
#endif
    }
}
/* Per-bin chi-square terms for an asymmetry fit.
 *
 * Every work-group first copies par, funcv and map into the local arrays
 * p, f and m. Each work-item then walks the bins j, j + global_size, ...
 * and stores one term per bin in chisq[j]; the terms are not summed here.
 * With A = fTheory(t, p, f, m) the model value is
 *   ((alpha*beta + 1) * A - (alpha - 1)) / ((alpha + 1) - (alpha*beta - 1) * A).
 * When MLH is defined every term is set to 0.0 (no likelihood is defined
 * for this fit type).
 */
__kernel void kernelChiSquareAsymmetry(__global double *data, __global double *err,
    __global double *par, __global double *chisq, __global int *map, __global double *funcv,
    int length, int numpar, int numfunc, int nummap,
    double timeStart, double timeStep,
    double alpha, double beta,
    __local double *p, __local double *f, __local int *m)
{
    //get thread id and calc global id
    int tid = get_local_id(0);
    int j = get_global_id(0);
    int lsize = get_local_size(0);

    //load parameters from global to shared memory
    while (tid < numpar) {
        p[tid] = par[tid];
        tid += lsize;
    }

    //load functions from global to shared memory
    tid = get_local_id(0);
    while (tid < numfunc) {
        f[tid] = funcv[tid];
        tid += lsize;
    }

    //load maps from global memory; this must be a strided loop like the two
    //copies above, otherwise entries beyond the work-group size are never copied
    tid = get_local_id(0);
    while (tid < nummap) {
        m[tid] = map[tid];
        tid += lsize;
    }

    //sync threads: local copies must be complete before fTheory is evaluated
    barrier(CLK_LOCAL_MEM_FENCE);

    while (j < length) {
        double t = timeStart + j*timeStep;
        double ldata = data[j];
        double lerr = err[j];
        double ab = alpha*beta;
        double theoVal = fTheory(t, p, f, m);
        double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0)-(ab-1.0)*theoVal);
#ifdef MLH
        chisq[j] = 0.0; // max log likelihood not defined for asymmetry fit
#else
        //bins without an error estimate contribute theo^2
        if (lerr != 0.0)
            chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
        else
            chisq[j] = theo * theo;
#endif
        j += get_global_size(0);
    }
}
/* Work-group tree reduction: each group sums up to local_size consecutive
 * elements of data_in and writes one partial sum to data_out[group id].
 * Work-items whose global id is >= size contribute 0.
 */
__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
                                   __local double *data_local, int size)
{
    const int lid = get_local_id(0);
    const int gid = get_global_id(0);
    const int lsize = get_local_size(0);

    /* stage one element per work-item in local memory, zero-padded */
    if (gid < size)
        data_local[lid] = data_in[gid];
    else
        data_local[lid] = 0;

    /* halve the active range each step; the barrier at the top of each step
       makes the previous step's writes visible */
    uint stride = lsize / 2;
    while (stride > 0) {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lid < stride)
            data_local[lid] += data_local[lid + stride];
        stride /= 2;
    }

    /* work-item 0 holds the group's partial sum */
    if (lid == 0)
        data_out[get_group_id(0)] = data_local[0];
}
/* Two-phase sum: each work-item first accumulates the elements
 * global_id, global_id + global_size, ... of data_in privately, then the
 * work-group reduces those accumulators in local memory and writes one
 * partial sum to data_out[group id].
 */
__kernel void parallelReductionTwoPhase(__global double *data_in, __global double *data_out,
                                        __local double *data_local, int size)
{
    const int lid = get_local_id(0);
    const int gsize = get_global_size(0);
    const int lsize = get_local_size(0);

    /* phase 1: private strided accumulation */
    double acc = 0;
    for (int i = get_global_id(0); i < size; i += gsize)
        acc += data_in[i];

    /* phase 2: tree reduction inside the work-group */
    data_local[lid] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);
    uint stride = lsize / 2;
    while (stride > 0) {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lid < stride)
            data_local[lid] += data_local[lid + stride];
        stride /= 2;
    }

    /* work-item 0 holds the group's partial sum */
    if (lid == 0)
        data_out[get_group_id(0)] = data_local[0];
}

View File

@ -0,0 +1,362 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION
/******Random numbers********/
/* Per-work-item random number state.
 * s10..s12 and s20..s22 are the two three-value recurrences advanced by
 * rand_uniform(); z and gen are the cache used by rand_normal(), which
 * produces two normal values per pair of uniform draws. */
typedef struct {
double s10; /* component 1, oldest value */
double s11;
double s12; /* component 1, newest value */
double s20; /* component 2, oldest value */
double s21;
double s22; /* component 2, newest value */
double z;   /* cached second normal value */
bool gen;   /* true: rand_normal() must generate a new pair; false: z is pending */
} RNDState;
/* Constants used by rand_uniform(): NORM scales the combined value to the
 * returned uniform number, M1/M2 are the moduli of the two components and
 * A12, A13N, A21, A23N are the recurrence multipliers (the "N" ones are
 * subtracted). */
#define NORM 2.328306549295728e-10
#define M1 4294967087.0
#define M2 4294944443.0
#define A12 1403580.0
#define A13N 810728.0
#define A21 527612.0
#define A23N 1370589.0
/* MRG32k3a uniform random number generator.
 * Advances both components of *s by one step and returns the combined,
 * NORM-scaled value. */
double rand_uniform(RNDState *s) {
    long q;
    double c1, c2;

    /* Component 1: reduce modulo M1 and shift the history */
    c1 = A12 * s->s11 - A13N * s->s10;
    q = c1 / M1;
    c1 -= q * M1;
    if (c1 < 0.0)
        c1 += M1;
    s->s10 = s->s11;
    s->s11 = s->s12;
    s->s12 = c1;

    /* Component 2: reduce modulo M2 and shift the history */
    c2 = A21 * s->s22 - A23N * s->s20;
    q = c2 / M2;
    c2 -= q * M2;
    if (c2 < 0.0)
        c2 += M2;
    s->s20 = s->s21;
    s->s21 = s->s22;
    s->s22 = c2;

    /* Combination: wrap the difference into (0, M1] before scaling */
    return (c1 <= c2) ? ((c1 - c2 + M1) * NORM) : ((c1 - c2) * NORM);
}
/* Normally distributed random value with mean mu and standard deviation sigma.
 * Two uniform draws yield two normal values (cosine and sine branch); the
 * sine value is cached in s->z and handed out on the following call. */
double rand_normal(RNDState *s, double mu, double sigma) {
    const double two_pi = 2.0 * 3.141592653589793223846;

    /* a cached value is pending: return it and request a fresh pair next time */
    if (!s->gen) {
        s->gen = true;
        return s->z * sigma + mu;
    }

    double u1 = rand_uniform(s);
    double u2 = rand_uniform(s);
    double radius = sqrt(-2.0 * log(u1));
    double z0 = radius * cos(two_pi * u2);
    s->z = radius * sin(two_pi * u2);
    s->gen = false;
    return z0 * sigma + mu;
}
/* Initialise one random state per work-item (for the first N global ids).
 * The state values are fixed offsets plus the work-item's global id.
 * NOTE(review): the seed argument is currently not used, so every run starts
 * from the same states - confirm whether that is intended. */
__kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
    int id = get_global_id(0);
    if (id >= N)
        return;

    int offset = id;// * 0x100000000ULL;
    int big = 12345 + offset;
    int small = 123 + offset;

    RNDState init;
    init.s10 = big;
    init.s11 = big;
    init.s12 = small;
    init.s20 = big;
    init.s21 = big;
    init.s22 = small;
    init.z = 0;
    init.gen = true;   /* no cached normal value yet */
    s[id] = init;
}
/**********Degrader**********/
/* Indices into the degrader parameter array (par / p) used by the collimator
 * physics code. There are 19 entries; the NUMPARAMS macro further down in
 * this file has to stay in sync with this list.
 * POSITION and ZSIZE bound the material in z (see checkHit), DT_M is the time
 * step handed to energyLoss/coulombScat, C multiplies the time step to obtain
 * path lengths. The remaining entries are constants consumed by
 * energyLoss(), Rot() and coulombScat(). */
enum PARAMS { POSITION,
ZSIZE,
M_P,
C,
RHO_M,
PI,
AVO,
R_E,
eM_E,
Z_M,
A_M,
A2_C,
A3_C,
A4_C,
A5_C,
Z_P,
X0_M,
I_M,
DT_M};
/* Particle record processed by kernelCollimatorPhysics. */
typedef struct {
int label;        /* status written by the kernel: 0, -1 (dead) or -2 (outside the material) */
unsigned localID; /* not touched by the kernels in this file */
double3 Rincol;   /* position, read and written by the kernel */
double3 Pincol;   /* momentum, read and written by the kernel */
} PART;
/* Scalar product of two 3-vectors. */
double Dot(double3 d1, double3 d2) {
    double xx = d1.x * d2.x;
    double yy = d1.y * d2.y;
    double zz = d1.z * d2.z;
    return (xx + yy + zz);
}
/* True when z lies inside the material, i.e. in (position, position + zsize]. */
bool checkHit(double z, double position, double zsize) {
    bool pastEntry = z > position;
    bool beforeExit = z <= position + zsize;
    return ( pastEntry && beforeExit );
}
/* Calculate a particle's energy loss over one time step.
 * Eng    in/out: energy, updated in place
 * pdead  out: set to true when the updated energy is below 1E-4 or the
 *        computed dEdx is positive
 * deltat time step
 * s      random state, used for the normally distributed straggling term
 * par    parameter array indexed by enum PARAMS
 * gamma, beta and the path length deltas are derived from the energy at
 * entry and are not recomputed between the two branches below. */
void energyLoss(double *Eng, bool *pdead, double deltat, RNDState *s, __local double *par) {
double dEdx = 0.0;
double gamma = ( (*Eng) + par[M_P]) / par[M_P];
double gamma2 = gamma * gamma;
double beta = sqrt(1.0 - 1.0 / gamma2);
double beta2 = beta * beta;
double deltas = deltat * beta * par[C];
double deltasrho = deltas * 100 * par[RHO_M];
double K = 4.0 * par[PI] * par[AVO] * par[R_E] * par[R_E] * par[eM_E] * 1E7;
double sigma_E = sqrt(K * par[eM_E] * par[RHO_M] * (par[Z_M]/par[A_M])* deltas * 1E5);
/* low-energy branch: coefficient-based fit using A2_C..A5_C */
if (((*Eng) > 0.00001) && ((*Eng) < 0.0006)) {
double Ts = ((*Eng)*1E6)/1.0073;
double epsilon_low = par[A2_C]*pow(Ts,0.45);
double epsilon_high = (par[A3_C]/Ts)*log(1+(par[A4_C]/Ts)+(par[A5_C]*Ts));
double epsilon = (epsilon_low*epsilon_high)/(epsilon_low + epsilon_high);
dEdx = - epsilon /(1E21*(par[A_M]/par[AVO]));
double delta_Eave = deltasrho * dEdx;
double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
(*Eng) = (*Eng) + delta_E / 1E3;
}
/* high-energy branch; note that this tests the energy as possibly updated
   by the branch above, so both branches can run in one call */
if ((*Eng) >= 0.0006) {
double Tmax = 2.0 * par[eM_E] * 1e9 * beta2 * gamma2 /
(1.0 + 2.0 * gamma * par[eM_E] / par[M_P] +
(par[eM_E] / par[M_P]) * (par[eM_E] / par[M_P]));
dEdx = -K * par[Z_P] * par[Z_P] * par[Z_M] / (par[A_M] * beta2) *
(1.0 / 2.0 * log(2 * par[eM_E] * 1e9 * beta2 * gamma2 *
Tmax / par[I_M] / par[I_M]) - beta2);
double delta_Eave = deltasrho * dEdx;
double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
(*Eng) = (*Eng)+delta_E / 1E3;
}
(*pdead) = (((*Eng)<1E-4) || (dEdx>0));
}
/* Rotate a particle's momentum in the x-z plane by thetacou and, depending on
 * coord, displace its position.
 * P, R      in/out: momentum and position; only the x and z components are used
 * xplane    lateral displacement applied along the rotated direction
 * normP     momentum norm used to scale the drift deltas * p / normP
 * thetacou  rotation angle
 * deltas    path length of the step
 * coord     1: update x (with drift) and z (without drift);
 *           2: update x and z, both with drift;
 *           any other value: position is left unchanged
 * par       parameter array, only par[PI] is read here */
void Rot(double3 *P, double3 *R, double xplane,
double normP, double thetacou, double deltas, int coord,
__local double *par)
{
double Psixz;
double pxz;
double px = (*P).x;
double pz = (*P).z;
double x = (*R).x;
double z = (*R).z;
/* angle of the momentum in the x-z plane, shifted by PI or 2*PI depending on
   the signs of px and pz */
if (px>=0 && pz>=0) Psixz = atan(px/pz);
else if (px>0 && pz<0)
Psixz = atan(px/pz) + par[PI];
else if (px<0 && pz>0)
Psixz = atan(px/pz) + 2*par[PI];
else
Psixz = atan(px/pz) + par[PI];
pxz = sqrt(px*px + pz*pz);
if(coord==1) {
(*R).x = x + deltas * px/normP + xplane*cos(Psixz);
(*R).z = z - xplane * sin(Psixz);
}
if(coord==2) {
(*R).x = x + deltas * px/normP + xplane*cos(Psixz);
(*R).z = z - xplane * sin(Psixz) + deltas * pz / normP;
}
/* rotate the in-plane momentum by thetacou; its magnitude pxz is kept */
(*P).x = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
(*P).z = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
}
/* Apply scattering to a particle for one time step.
 * R, P    in/out: position and momentum, modified through Rot()
 * deltat  time step
 * s       random state; the number and order of draws below determines the
 *         random stream, so statements must not be reordered
 * par     parameter array indexed by enum PARAMS
 * The routine performs two passes (labelled x- and y-direction). Each pass
 * draws a scattering angle limited to 3.5 * theta0, rotates/displaces the
 * particle with Rot(), and with probability 0.0047 applies one additional
 * large-angle rotation of the momentum only (coord = 0).
 * NOTE(review): both passes call Rot(), which only touches the x and z
 * components; confirm that this is intended for the "y-direction" pass. */
void coulombScat(double3 *R, double3 *P, double deltat,
RNDState *s, __local double* par) {
double dotP = Dot((*P), (*P));
double Eng = sqrt(dotP + 1.0) * par[M_P] - par[M_P];
double gamma = (Eng + par[M_P]) / par[M_P];
double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
double normP = sqrt(dotP);
double deltas = deltat * beta * par[C];
double theta0 = 13.6e6 / (beta * sqrt(dotP) * par[M_P] * 1e9) *
par[Z_P] * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
// x-direction: See Physical Review, "Multiple Scattering"
double z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
double thetacou = z2 * theta0;
// redraw until the angle is within 3.5 * theta0
while(fabs(thetacou) > 3.5 * theta0) {
z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
int coord = 1;
Rot(P, R, xplane, normP, thetacou, deltas, coord, par);
double P2 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
// rare additional large-angle rotation with random sign
if(P2 < 0.0047) {
double P3 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P4 > 0.5)
thetaru = -thetaru;
coord = 0; // no change in coordinates but one in momenta-direction
Rot(P, R, xplane, normP, thetaru, deltas, coord, par);
}
// y-direction: See Physical Review, "Multiple Scattering"
z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
// redraw until the angle is within 3.5 * theta0
while(fabs(thetacou) > 3.5 * theta0) {
z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0);
thetacou = z2 * theta0;
}
double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
coord = 2;
Rot(P, R, yplane, normP, thetacou, deltas, coord, par);
P2 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
// rare additional large-angle rotation with random sign
if(P2 < 0.0047) {
double P3 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
double P4 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m);
if(P4 > 0.5)
thetaru = -thetaru;
coord = 0; // no change in coordinates but one in momenta-direction
Rot(P, R, yplane, normP, thetaru, deltas, coord, par);
}
}
#define NUMPARAMS 19
/* Advance every particle by one time step p[DT_M].
 *
 * data          particle array; Rincol, Pincol and label are updated in place
 * par           NUMPARAMS degrader parameters in global memory (enum PARAMS order)
 * state         one random state per particle, updated in place
 * numparticles  number of valid entries in data / state
 * p             local scratch array with room for NUMPARAMS doubles
 *
 * A particle inside the material (checkHit) loses energy and, if it survives,
 * is scattered; a particle outside the material drifts freely.
 * Label written back: 0 = still in material, -1 = dead, -2 = outside material.
 */
__kernel void kernelCollimatorPhysics(__global PART *data, __global double *par,
                                      __global RNDState *state, int numparticles,
                                      __local double *p)
{
    int tid = get_local_id(0);
    int idx = get_global_id(0);
    int lsize = get_local_size(0);

    //transfer params to local memory; strided so that work-groups with fewer
    //than NUMPARAMS work-items still copy every parameter
    for (int i = tid; i < NUMPARAMS; i += lsize)
        p[i] = par[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    //surplus work-items have no particle; no barrier follows, so they can leave
    if (idx >= numparticles)
        return;

    double3 R = data[idx].Rincol;
    double3 P = data[idx].Pincol;
    RNDState s = state[idx];
    int l = 0;

    double sq = sqrt(1.0 + Dot(P, P));
    bool pdead = false;
    bool hit = checkHit(R.z, p[POSITION], p[ZSIZE]);
    double Eng = 0.0;

    if (hit) {
        Eng = (sq - 1) * p[M_P];
        energyLoss(&Eng, &pdead, p[DT_M], &s, p);
    } else {
        //free drift outside the material
        R.x = R.x + p[DT_M] * p[C] * P.x / sq;
        R.y = R.y + p[DT_M] * p[C] * P.y / sq;
        R.z = R.z + p[DT_M] * p[C] * P.z / sq;
        l = -2;
    }

    if (hit && !pdead) {
        //rescale the momentum to the reduced energy, keep its direction
        double ptot = sqrt((p[M_P] + Eng) * (p[M_P] + Eng) - (p[M_P] * p[M_P])) / p[M_P];
        sq = sqrt(Dot(P, P));
        P.x = P.x * ptot / sq;
        P.y = P.y * ptot / sq;
        P.z = P.z * ptot / sq;
        coulombScat(&R, &P, p[DT_M], &s, p);
    }

    if (hit && pdead)
        l = -1;

    data[idx].Rincol = R;
    data[idx].Pincol = P;
    data[idx].label = l;
    state[idx] = s;
}
/* count dead particles and particles leaving material - boost compute? */
/* sort particles so dead and leaving particles are at the end of PART array - boost compute */

View File

@ -0,0 +1,181 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/* 3D normalize FFT kernel: divides the element addressed by this work-item
 * by N.
 * NOTE(review): the linear index uses the size of dimension 1 for both the
 * row and the slab stride, so it is only a dense index when all dimensions
 * have the same size - confirm that callers always use cubic grids. */
__kernel void normalizeFFT(__global double2 *input, int N) {
    const int i1 = get_global_id(0);
    const int i2 = get_global_id(1);
    const int i3 = get_global_id(2);
    const int n2 = get_global_size(1);
    const int n3 = get_global_size(2);

    int id = i1;
    if (n2 > 1)
        id += i2*n2;
    if (n3 > 1)
        id += i3*n2*n2;

    double2 value = input[id];
    value.x = value.x / N;
    value.y = value.y / N;
    input[id] = value;
}
/* 3D radix 2 FFT kernel: one in-place butterfly per work-item.
 * input    complex data, updated in place
 * step     distance (in elements along the transformed dimension) between the
 *          two butterfly inputs
 * dim      dimension to transform: 0, 1 or 2 (other values leave the indices
 *          uninitialised)
 * forward  1 selects the negative twiddle exponent, anything else the positive
 * The index arithmetic uses "& (d-1)" as modulo, so d must be a power of two.
 * NOTE(review): presumably launched with half the grid size in the
 * transformed dimension and a cubic grid - verify against the host code. */
__kernel void FFT3D(__global double2 *input, int step, int dim, int forward) {
int n1 = get_global_size(0);
int n2 = get_global_size(1);
int n3 = get_global_size(2);
int i1 = get_global_id(0);
int i2 = get_global_id(1);
int i3 = get_global_id(2);
int jump = step << 1;
int d, idGroup, idLoc, idTwidle, id, match;
if (dim == 0) {
d = n1 / step; // n1 >> log2(step)
idLoc = i1 / d;
idGroup = i1 & (d-1); //modulo
idTwidle = idGroup * jump + idLoc;
id = i3*n3*n3 + i2*n2 + idTwidle;
match = id + step;
} else if (dim == 1) {
d = n2 / step;
idLoc = i2 / d;
idGroup = i2 & (d-1);
idTwidle = idGroup * jump + idLoc;
id = i3*n3*n3 + idTwidle*n1 + i1;
match = id + step*n1;
} else if (dim == 2) {
d = n3 / step;
idLoc = i3 / d;
idGroup = i3 & (d-1);
idTwidle = idGroup * jump + idLoc;
id = idTwidle*n1*n1 + i2*n2 + i1;
match = id + step*n1*n1;
}
// twiddle angle; the sign selects forward or inverse transform
double alpha;
if (forward == 1)
alpha = -( 2 * M_PI / jump ) * idTwidle;
else
alpha = ( 2 * M_PI / jump ) * idTwidle;
double wr, wi;
// sincos returns the sine and stores the cosine through the pointer
wi = sincos(alpha, &wr);
double2 cTemp;
double2 cTempId = input[id];
double2 cTempMatch = input[match];
// complex multiplication: cTemp = (wr + i*wi) * input[match]
cTemp.x = wr*cTempMatch.x - wi*cTempMatch.y;
cTemp.y = wr*cTempMatch.y + wi*cTempMatch.x;
input[match] = cTempId - cTemp;
input[id] = cTempId + cTemp;
}
/* 3D bit reversal sort: swaps each element with the element whose index
 * along dimension dim (0, 1 or 2) is the bit-reversed value of its own index.
 * bits is the number of index bits; n = get_global_size(0) is used as the
 * edge length for all three dimensions. Only the work-item with the smaller
 * index of a pair performs the swap. */
__kernel void BitReverseSort3D(__global double2 *input, int bits, int dim) {
    const int n = get_global_size(0);
    const int i1 = get_global_id(0);
    const int i2 = get_global_id(1);
    const int i3 = get_global_id(2);

    /* index along the dimension being reordered */
    int own;
    if (dim == 0)
        own = i1;
    else if (dim == 1)
        own = i2;
    else if (dim == 2)
        own = i3;

    /* build the bit-reversed index */
    int reversed = own;
    int shifted = own;
    for (int j = 1; j < bits; j++) {
        shifted >>= 1;
        reversed <<= 1;
        reversed |= shifted & 1;
    }
    reversed &= n - 1;

    if (own < reversed) {
        const int id1 = i3*n*n + i2*n + i1;
        int id2;
        if (dim == 0)        //i1, irev - w, i2 - h, i3 - d
            id2 = i3*n*n + i2*n + reversed;
        else if (dim == 1)   // i1 - w, i2, irev - h, i3 - d
            id2 = i3*n*n + reversed*n + i1;
        else if (dim == 2)   // i1 - w, i2 - h, i3, irev - d
            id2 = reversed*n*n + i2*n + i1;
        else
            return;

        double2 tmp = input[id1];
        input[id1] = input[id2];
        input[id2] = tmp;
    }
}
/* 3D FFT kernel based on Stockham's out-of-place algorithm: one radix-2
 * butterfly per work-item, reading from src and writing to dst.
 * p        current sub-transform length (used as a bit mask, so a power of two)
 * t        distance between the two butterfly inputs along the transformed
 *          dimension; 2*t is used as the edge length of the cubic grid
 * ndim     dimension to transform: 1, 2 or 3 (other values leave the indices
 *          uninitialised)
 * forward  multiplies the twiddle angle; NOTE(review): presumably -1 for the
 *          forward and +1 for the inverse transform - confirm with the caller */
__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim, const int forward) {
const int gid1 = get_global_id(0);
const int gid2 = get_global_id(1);
const int gid3 = get_global_id(2);
int t2 = 2*t;
int k, m, in1, in2, out1, out2;
in1 = gid3*t2*t2 + gid2*t2 + gid1;
if (ndim == 1) {
k = gid1 & (p - 1);
m = (gid1 << 1) - k;
in2 = in1 + t;
out1 = gid3*t2*t2 + gid2*t2 + m;
out2 = out1 + p;
} else if (ndim == 2) {
k = gid2 & (p - 1);
m = (gid2 << 1) - k;
in2 = in1 + t2*t;
out1 = gid3*t2*t2 + m*t2 + gid1;
out2 = out1 + t2*p;
} else if (ndim == 3) {
k = gid3 & (p - 1);
m = (gid3 << 1) - k;
in2 = in1 + t2*t2*t;
out1 = m*t2*t2 + gid2*t2 + gid1;
out2 = out1 + p*t2*t2;
}
const double2 d1 = src[in1];
const double2 d2 = src[in2];
const double theta = (forward*2*M_PI*k) / (p << 1);
double cs;
// sincos returns the sine and stores the cosine through the pointer
double sn = sincos(theta, &cs);
// temp = d2 * (cs + i*sn)
const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn);
dst[out1] = d1 + temp;
dst[out2] = d1 - temp;
}

View File

@ -0,0 +1,214 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define TWOPI 6.28318530718
/* One stage of a 1D radix-2 out-of-place FFT: each work-item reads
 * src[gid] and src[gid + t], applies the twiddle factor for sub-transform
 * length p and writes the two butterfly outputs to dst[m] and dst[m + p]. */
__kernel void fft_radix2(__global double2* src, __global double2* dst, const int p, const int t) {
    const int gid = get_global_id(0);
    const int k = gid & (p - 1);       /* position inside the sub-transform */
    const int m = (gid << 1) - k;      /* first output index */

    const double2 lower = src[gid];
    const double2 upper = src[gid+t];

    const double theta = (-2*M_PI*k) / (p << 1);
    double cs;
    double sn = sincos(theta, &cs);    /* returns sin, stores cos */

    /* twiddled = upper * (cs + i*sn) */
    const double2 twiddled = (double2) (upper.x * cs - upper.y * sn, upper.y * cs + upper.x * sn);

    dst[m] = lower + twiddled;
    dst[m+p] = lower - twiddled;
}
/* One radix-2 out-of-place butterfly along the first dimension of a cubic
 * grid with edge length 2*t. The ndim argument is accepted but not used. */
__kernel void fft3d_radix2_transpose(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {
    const int gid1 = get_global_id(0);
    const int gid2 = get_global_id(1);
    const int gid3 = get_global_id(2);

    /* index arithmetic */
    const int edge = 2*t;
    const int k = gid1 & (p - 1);
    const int m = (gid1 << 1) - k;
    const int rowStart = gid3*edge*edge + gid2*edge;
    const int in1 = rowStart + gid1;
    const int in2 = in1 + t;
    const int out1 = rowStart + m;
    const int out2 = out1 + p;

    /* butterfly */
    const double2 lower = src[in1];
    const double2 upper = src[in2];
    const double theta = (-2*M_PI*k) / (p << 1);
    double cs;
    double sn = sincos(theta, &cs);    /* returns sin, stores cos */
    const double2 twiddled = (double2) (upper.x * cs - upper.y * sn, upper.y * cs + upper.x * sn);

    dst[out1] = lower + twiddled;
    dst[out2] = lower - twiddled;
}
/* One radix-2 out-of-place butterfly per work-item on a cubic grid with edge
 * length 2*t, reading from src and writing to dst.
 * p     current sub-transform length (used as a bit mask, so a power of two)
 * t     distance between the two butterfly inputs along the transformed dimension
 * ndim  dimension to transform: 1, 2 or 3 (other values leave the indices
 *       uninitialised)
 * The twiddle angle is always negative in this variant. */
__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) {
const int gid1 = get_global_id(0);
const int gid2 = get_global_id(1);
const int gid3 = get_global_id(2);
int t2 = 2*t;
int k, m, in1, in2, out1, out2;
in1 = gid3*t2*t2 + gid2*t2 + gid1;
if (ndim == 1) {
k = gid1 & (p - 1);
m = (gid1 << 1) - k;
in2 = in1 + t;
out1 = gid3*t2*t2 + gid2*t2 + m;
out2 = out1 + p;
} else if (ndim == 2) {
k = gid2 & (p - 1);
m = (gid2 << 1) - k;
in2 = in1 + t2*t;
out1 = gid3*t2*t2 + m*t2 + gid1;
out2 = out1 + t2*p;
} else if (ndim == 3) {
k = gid3 & (p - 1);
m = (gid3 << 1) - k;
in2 = in1 + t2*t2*t;
out1 = m*t2*t2 + gid2*t2 + gid1;
out2 = out1 + p*t2*t2;
}
const double2 d1 = src[in1];
const double2 d2 = src[in2];
const double theta = (-2*M_PI*k) / (p << 1);
double cs;
// sincos returns the sine and stores the cosine through the pointer
double sn = sincos(theta, &cs);
// temp = d2 * (cs + i*sn)
const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn);
dst[out1] = d1 + temp;
dst[out2] = d1 - temp;
}
/* In-place transpose of a 3D array: with dim == 2 the two fastest indices are
 * exchanged, otherwise the fastest and the slowest. Each pair is swapped once,
 * by the work-item with the smaller linear index. ndim is not used.
 * NOTE(review): the slab stride is ni*ni and the row stride nj, so the
 * indexing presumably assumes a cubic grid - confirm with the caller. */
__kernel void transpose(__global double2 *data, int ndim, int dim) {
    const int k = get_global_id(0);
    const int j = get_global_id(1);
    const int i = get_global_id(2);
    const int nj = get_global_size(1);
    const int ni = get_global_size(2);

    const int n = i*ni*ni + j*nj + k;
    const int m = (dim == 2) ? (i*ni*ni + k*nj + j)
                             : (k*ni*ni + j*nj + i);

    if (n < m) {
        double2 held = data[m];
        data[m] = data[n];
        data[n] = held;
    }
}
#define PI2 6.28318530718
/* Complete radix-2 FFT of one line of length N per work-group, carried out in
 * local memory.
 * data_in  3D array (edge length N), transformed in place
 * d, r     local buffers of N elements each; they are used alternately as
 *          input and output of each stage
 * tmp      only used as a scratch pointer for swapping d and r
 * N        line length; each work-item handles elements id1 and id1 + N/2
 * dim      1, 2 or 3: direction of the line (other values leave sid/offset
 *          uninitialised)
 * NOTE(review): presumably launched with N/2 work-items per work-group in
 * dimension 0 - verify against the host code. */
__kernel void fft_batch3D(__global double2 *data_in, __local double2 *d, __local double2 *r, __local double2 *tmp, int N, int dim) {
int id1 = get_global_id(0);
int id2 = get_global_id(1);
int id3 = get_global_id(2);
//calc indexes
int sid, offset;
if (dim == 1) {
sid = id3*N*N + id2*N;
offset = 1;
} else if (dim == 2) {
sid = id3*N*N + id2;
offset = N;
} else if (dim == 3) {
sid = id3*N + id2;
offset = N*N;
}
//copy data from global memory to local
int i1 = id1;
int i2 = id1+N/2;
d[i1] = data_in[sid + i1*offset];
d[i2] = data_in[sid + i2*offset];
barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_GLOBAL_MEM_FENCE);
//exec fft
int p1, p2, j, k, out1, step, jump, t;
double theta, cs, sn;
t = 1;
step = 1;
while (step < N) {
jump = step << 1;
j = i1 >> (t - 1); // same as i1 / step, because t-1 = log2(step)
k = i2 & (step - 1); // same as i2 % step
out1 = j * jump + k;
theta = -PI2 * k / jump;
// sincos returns the sine and stores the cosine through the pointer
sn = sincos(theta, &cs);
double2 temp = (double2) (d[i2].x*cs - d[i2].y*sn, d[i2].y*cs + d[i2].x * sn);
r[out1] = d[i1] + temp;
r[out1+step] = d[i1] - temp;
t++;
step = jump;
//swap local arrays
tmp = r;
r = d;
d = tmp;
//wait for all threads to finish this iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
//undo the last swap so that r points at the buffer written by the final stage
tmp = r;
r = d;
d = tmp;
//copy data from local memory to global
data_in[sid + i1*offset] = r[i1];
data_in[sid + i2*offset] = r[i2];
}

View File

@ -0,0 +1,41 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/* Tiled matrix transpose: each work-group stages one tile of the
 * width x height input in local memory and writes it transposed to the
 * height x width output. Tile rows are padded by one element
 * (block_dim + 1), so block must hold local_size(1) * (block_dim + 1) items. */
__kernel void transpose(__global double2 *input, __global double2 *output,
                        int width, int height, __local double2 *block)
{
    const int block_dim = get_local_size(0);

    /* load phase: (xIdx, yIdx) address the input element */
    unsigned int xIdx = get_global_id(0);
    unsigned int yIdx = get_global_id(1);
    if ( (xIdx < width) && (yIdx < height) ) {
        unsigned int idx_in = yIdx * width + xIdx;
        block[get_local_id(1)*(block_dim+1)+get_local_id(0)] = input[idx_in];
    }

    /* the whole tile must be staged before any element is read back */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* store phase: group ids are exchanged, (xIdx, yIdx) address the output */
    xIdx = get_group_id(1) * block_dim + get_local_id(0);
    yIdx = get_group_id(0) * block_dim + get_local_id(1);
    if ( (xIdx < height) && (yIdx < width) ) {
        unsigned int idx_out = yIdx * height + xIdx;
        output[idx_out] = block[get_local_id(0)*(block_dim+1)+get_local_id(1)];
    }
}
/* Naive matrix transpose: every work-item copies one element of the
 * width x height input straight to its transposed position in output. */
__kernel void transpose_naive(__global double2 *input, __global double2 *output, int width, int height)
{
    unsigned int col = get_global_id(0);
    unsigned int row = get_global_id(1);

    /* work-items outside the matrix do nothing */
    if (!(col < width && row < height))
        return;

    unsigned int idx_in = col + width * row;
    unsigned int idx_out = row + height * col;
    output[idx_out] = input[idx_in];
}

View File

@ -0,0 +1,18 @@
# Sources and headers of the Utility component (timers and time stamps).
set (_SRCS
  TimeStamp.cpp
  DKSTimer.cpp
  )

set (_HDRS
  TimeStamp.h
  DKSTimer.h
  )

#include_directories (
#  ${CMAKE_CURRENT_SOURCE_DIR}
#)

# Hand the files to the project's source/header collection commands
# and install the public headers.
add_sources (${_SRCS})
add_headers (${_HDRS})

install (FILES ${_HDRS} DESTINATION include/Utility)

53
src/Utility/DKSTimer.cpp Normal file
View File

@ -0,0 +1,53 @@
#include "DKSTimer.h"
// A new timer is stopped, has no accumulated time and an empty name.
DKSTimer::DKSTimer()
  : running(false),
    timervalue(0.0),
    name("")
{
}
// Nothing to release.
DKSTimer::~DKSTimer() {}
// (Re)initialise the timer: store the name, clear the accumulated time and
// mark the timer as stopped.
void DKSTimer::init(std::string n) {
  name = n;
  timervalue = 0.0;
  running = false;
}
// Start the timer: record the current time in timeStart and mark the timer
// as running. Calling start() on a running timer has no effect.
void DKSTimer::start() {
  if (running)
    return;

  gettimeofday(&timeStart, NULL);
  running = true;
}
// Stop the timer: record the current time in timeEnd, add the time elapsed
// since start() (in seconds) to timervalue and mark the timer as stopped.
// Calling stop() on a timer that is not running has no effect.
void DKSTimer::stop() {
  if (running) {
    gettimeofday(&timeEnd, NULL);
    // Convert each difference to double before scaling: multiplying the
    // second difference by 1000000 in integer arithmetic overflows after
    // about 35 minutes on platforms where the time fields are 32 bits wide.
    const double seconds = static_cast<double>(timeEnd.tv_sec - timeStart.tv_sec);
    const double micros = static_cast<double>(timeEnd.tv_usec - timeStart.tv_usec);
    timervalue += seconds + micros * 1e-6;
    running = false;
  }
}
// Clear the accumulated time and mark the timer as stopped; the name is kept.
void DKSTimer::reset() {
  timervalue = 0.0;
  running = false;
}
// Accumulated time in seconds over all completed start()/stop() intervals.
double DKSTimer::gettime() {
  const double elapsed = timervalue;
  return elapsed;
}
// Write the timer's name and accumulated time to standard output.
void DKSTimer::print() {
  std::cout << "DKSTimer " << name
            << " elapsed time\t" << timervalue << "s"
            << std::endl;
}

59
src/Utility/DKSTimer.h Normal file
View File

@ -0,0 +1,59 @@
#ifndef H_DKSTIMER
#define H_DKSTIMER
#include <iostream>
#include <string>
#include <sys/time.h>
/** Simple accumulating wall-clock timer based on gettimeofday. */
class DKSTimer {
private:
bool running;              // true between start() and stop()
double timervalue;         // accumulated time in seconds
struct timeval timeStart;  // time of the last start()
struct timeval timeEnd;    // time of the last stop()
std::string name;          // label printed by print()
public:
/** Create a stopped timer with zero accumulated time and an empty name. */
DKSTimer();
~DKSTimer();
/** Init the timer.
* Set the name of the timer, clear the accumulated time and mark the timer
* as stopped.
*/
void init(std::string n);
/** Start the timer.
* Get the current time with gettimeofday and save it in timeStart.
* Has no effect if the timer is already running.
*/
void start();
/** Stop the timer.
* Get the current time with gettimeofday and save it in timeEnd.
* Calculate the elapsed time as timeEnd - timeStart and add it to timervalue.
* Has no effect if the timer is not running.
*/
void stop();
/** Reset the timer.
* Set timervalue to zero and mark the timer as stopped; timeStart and
* timeEnd are left untouched.
*/
void reset();
/** Return elapsed time in seconds.
* Return the value of timervalue.
*/
double gettime();
/** Print timer.
* Print the name and the elapsed time of the timer.
*/
void print();
};
#endif

11
src/Utility/TimeStamp.cpp Normal file
View File

@ -0,0 +1,11 @@
#include "TimeStamp.h"
// Current wall-clock time in microseconds, as reported by gettimeofday.
timestamp_t get_timestamp() {
  struct timeval current;
  gettimeofday (&current, NULL);
  const timestamp_t wholeSeconds = (timestamp_t)current.tv_sec * 1000000;
  return current.tv_usec + wholeSeconds;
}
// Seconds between two time stamps given in microseconds.
// The difference is unsigned, so t_end is expected to be >= t_start.
double get_secs(timestamp_t t_start, timestamp_t t_end) {
  const timestamp_t elapsedMicros = t_end - t_start;
  return elapsedMicros / 1000000.0L;
}

14
src/Utility/TimeStamp.h Normal file
View File

@ -0,0 +1,14 @@
#ifndef H_TIMESTAMPE
#define H_TIMESTAMPE
#include <iostream>
#include <time.h>
#include <sys/time.h>
/** Time stamp in microseconds. */
typedef unsigned long long timestamp_t;
/** Return the current time (from gettimeofday) in microseconds. */
timestamp_t get_timestamp();
/** Return the time between t_start and t_end in seconds. */
double get_secs(timestamp_t t_start, timestamp_t t_end);
#endif

84
test/CMakeLists.txt Normal file
View File

@ -0,0 +1,84 @@
# Headers and libraries of the DKS library live under <top-level source>/src.
include_directories (${CMAKE_SOURCE_DIR}/src)
link_directories (${CMAKE_SOURCE_DIR}/src)
#ADD_EXECUTABLE(testDKS testDKS.cpp)
#ADD_EXECUTABLE(testChi testChi.cpp)
#ADD_EXECUTABLE(testFFT testFFT.cpp)
#ADD_EXECUTABLE(testMIC testMIC.cpp)
#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp)
#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp)
#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp)
#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp)
#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp)
#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp)
#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp)
#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp)
#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp)
#ADD_EXECUTABLE(testOffset testOffset.cpp)
#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp)
#ADD_EXECUTABLE(testMPI testMPI.cpp)
#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp)
#ADD_EXECUTABLE(testGather testGather.cpp)
#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp)
#ADD_EXECUTABLE(testTranspose testTranspose.cpp)
# The only test currently enabled.
add_executable (testCollimatorPhysics testCollimatorPhysics.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp)
#ADD_EXECUTABLE(testPush testPush.cpp)
#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp)
#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp)
#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp)
#shared library
#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp)
#TARGET_LINK_LIBRARIES(testDKS dks)
#TARGET_LINK_LIBRARIES(testChi dks)
#TARGET_LINK_LIBRARIES(testFFT dks)
#TARGET_LINK_LIBRARIES(testMIC dks)
#TARGET_LINK_LIBRARIES(testMICOpenCL dks)
#TARGET_LINK_LIBRARIES(testFFT3D dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC dks)
#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks)
#TARGET_LINK_LIBRARIES(testFFT3DTiming dks)
#TARGET_LINK_LIBRARIES(testStockhamFFT dks)
#TARGET_LINK_LIBRARIES(testStockFFT3D dks)
#TARGET_LINK_LIBRARIES(testMemObjects dks)
#TARGET_LINK_LIBRARIES(testRCFFT dks)
#TARGET_LINK_LIBRARIES(testOffset dks)
#TARGET_LINK_LIBRARIES(testOffsetMPI dks)
#TARGET_LINK_LIBRARIES(testMPI dks)
#TARGET_LINK_LIBRARIES(testMPIFFT dks)
#TARGET_LINK_LIBRARIES(testGather dks)
#TARGET_LINK_LIBRARIES(testGatherAsync dks)
#TARGET_LINK_LIBRARIES(testTranspose dks)
# Link the test against the DKS library. The explicit PRIVATE keyword selects
# the modern signature of target_link_libraries; for an executable it links
# exactly the same libraries as the keyword-less form.
target_link_libraries (testCollimatorPhysics PRIVATE dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks)
#TARGET_LINK_LIBRARIES(testPush dks)
#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks)
#TARGET_LINK_LIBRARIES(testIntegration dks)
#TARGET_LINK_LIBRARIES(testImageReconstruction dks)
#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared)
#IF (${COMPILER_NAME} STREQUAL "mpicxx")
#ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp)
#ADD_EXECUTABLE(testGreens testGreens.cpp)
#ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp)
#ADD_EXECUTABLE(testCollimatorPhysicsMPI testCollimatorPhysicsMPI.cpp)
#TARGET_LINK_LIBRARIES(testGatherAsync2 dks)
#TARGET_LINK_LIBRARIES(testGreens dks)
#TARGET_LINK_LIBRARIES(testFFTSolver dks)
#TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks)
#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx")
#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp)
#TARGET_LINK_LIBRARIES(testChiSquare dks)
#IF (NOT CUDA_VERSION VERSION_LESS "7.0")
#ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp)
#TARGET_LINK_LIBRARIES(testChiSquareRT dks)
#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0")

141
test/testChi.cpp Normal file
View File

@ -0,0 +1,141 @@
#include <iostream>
#include <complex>
#include <cstdlib>
#include "DKSBase.h"
#include "Utility/TimeStamp.h"
using namespace std;
/**
 * Smoke test for the DKS N(t) / chi^2 / sum device calls.
 *
 * Usage: testChi [api [device]]
 *   api    - name handed to DKSBase::setAPI    (default "OpenCL")
 *   device - name handed to DKSBase::setDevice (default "-gpu")
 * Any other argument count falls back to both defaults.
 *
 * Returns 0 unconditionally; the status written to ierr by the allocation
 * calls is not inspected.
 */
int main(int argc, char *argv[]) {

    // Fixed-size, always NUL-terminated name buffers. The previous version
    // allocated only 4 bytes for the device name, so copying "-gpu" (5 bytes
    // including the terminator) overran the allocation, and command-line
    // values were copied with an unbounded strcpy.
    enum { NAME_LEN = 64 };
    char api_name[NAME_LEN];
    char device_name[NAME_LEN];

    const char *api_src = (argc == 2 || argc == 3) ? argv[1] : "OpenCL";
    const char *device_src = (argc == 3) ? argv[2] : "-gpu";

    strncpy(api_name, api_src, NAME_LEN - 1);
    api_name[NAME_LEN - 1] = '\0';
    strncpy(device_name, device_src, NAME_LEN - 1);
    device_name[NAME_LEN - 1] = '\0';

    cout << "Use api: " << api_name << endl;

    cout << "Begin DKS Base tests" << endl;

    /* init data */
    int ierr;
    int nsize = 4000000;
    int jsize = 16;
    int psize = 6;
    double *data = new double[nsize*jsize];
    double *p = new double[psize*jsize];
    double data_out = 0;

    // rand() is seeded, but the deterministic fills below do not draw from it
    srand(time(NULL));
    for (int i = 0; i < nsize*jsize; i++)
        data[i] = (double)i / (nsize*jsize);

    // parameters are scaled by the data-set size (nsize*jsize), as before
    for (int i = 0; i < psize*jsize; i++)
        p[i] = (double)i / (nsize*jsize);
    /* end init */

    timestamp_t tstart, tend;

    tstart = get_timestamp();

    // init dks base class, select the API and open the device
    DKSBase base;
    base.setAPI(api_name, strlen(api_name));
    base.setDevice(device_name, strlen(device_name));
    base.initDevice();

    // handles to device memory
    void *dptr, *ntptr, *pptr;

    // allocate memory on device
    dptr = base.allocateMemory<double>(nsize*jsize, ierr);
    ntptr = base.allocateMemory<double>(nsize*jsize, ierr);
    pptr = base.allocateMemory<double>(psize*jsize, ierr);

    // write the data set to the device once
    base.writeData<double>(dptr, data, nsize*jsize);

    for (int i = 0; i < 5; i++) {
        // write parameters to device
        base.writeData<double>(pptr, p, psize*jsize);

        // calculate N(t) into ntptr
        base.callNt<double>(ntptr, pptr, psize, nsize, jsize, 0.025);

        // chi^2 is written back into ntptr
        base.callChi2<double>(ntptr, dptr, ntptr, nsize*jsize);

        // sum is written back into ntptr
        base.callSum<double>(ntptr, ntptr, nsize*jsize);

        // read calculated sum (one value)
        base.readData<double>(ntptr, &data_out, 1);

        cout << "Sum nt: " << data_out << endl;
    }

    // free device memory
    base.freeMemory<double>(dptr, nsize*jsize);
    base.freeMemory<double>(ntptr, nsize*jsize);
    base.freeMemory<double>(pptr, psize*jsize);

    tend = get_timestamp();

    cout << endl << "time: " << get_secs(tstart, tend) << endl;

    // release host buffers (previously leaked)
    delete[] data;
    delete[] p;

    return 0;
}

168
test/testChiSquare.cpp Normal file
View File

@ -0,0 +1,168 @@
#include <iostream>
#include <vector>
#include "DKSBase.h"
using namespace std;
// Appends the ramp 0, 1, ..., length-1 to every row already present in v.
// Rows are extended, not cleared; the number of rows is left unchanged.
void initData(std::vector< std::vector<double> > &v, int length) {
    typedef std::vector< std::vector<double> >::iterator RowIt;
    for (RowIt row = v.begin(); row != v.end(); ++row) {
        int col = 0;
        while (col < length) {
            row->push_back(col);
            ++col;
        }
    }
}
// Writes v to stdout: one line per row, every value followed by a tab.
void printData(std::vector< std::vector<double> > &v) {
    typedef std::vector< std::vector<double> >::const_iterator RowIt;
    typedef std::vector<double>::const_iterator CellIt;
    for (RowIt row = v.begin(); row != v.end(); ++row) {
        for (CellIt cell = row->begin(); cell != row->end(); ++cell)
            std::cout << *cell << "\t";
        std::cout << std::endl;
    }
}
// Fills a row-major sensors x length array so that each row holds the
// ramp 0, 1, ..., length-1.
void initData(double *data, int sensors, int length) {
    for (int row = 0; row < sensors; ++row) {
        const int base = row * length;
        int col = 0;
        while (col < length) {
            data[base + col] = col;
            ++col;
        }
    }
}
// Prints a row-major sensors x length array: one line per sensor, every
// value followed by a tab.
void printData(double *data, int sensors, int length) {
    for (int row = 0; row < sensors; ++row) {
        const int base = row * length;
        int col = 0;
        while (col < length) {
            std::cout << data[base + col] << "\t";
            ++col;
        }
        std::cout << std::endl;
    }
}
// Sets par[k] = k / npar for k in [0, npar).
void initPar(double *par, int npar) {
    int k = 0;
    while (k < npar) {
        par[k] = static_cast<double>(k) / npar;
        ++k;
    }
}
// Prints a divider made of `size` '=' characters followed by a newline.
// A non-positive size prints only the newline.
void printDiv(int size) {
    for (int left = size; left > 0; --left)
        std::cout << "=";
    std::cout << std::endl;
}
/**
 * Host-side reference chi^2, printed term by term.
 *
 * For sensor i and bin j the model value is
 *   par[2+4i] * exp(-t/tau) * (1 + par[3+4i] * exp(-0.5*(par[1]*t)^2)
 *                                  * cos(w*t + par[4+4i]*1.74532925199432955e-2))
 *   + par[5+4i]
 * with t = dt0 + fTimeResolution*fRebin*j and w = par[0]*0.08516155035269027.
 * A bin contributes (theo-data)^2/data, or theo^2 when data is exactly 0.
 *
 * @param fData           one row of bin values per sensor (read only)
 * @param par             parameter array; entries up to index
 *                        5 + 4*(fData.size()-1) are read
 * @param fTimeResolution time step multiplier
 * @param fRebin          rebinning factor
 *
 * Every term is written to stdout (tab separated, one line per sensor)
 * followed by "Chisq: <total>".
 *
 * The matrix is taken by const reference (it used to be copied on every
 * call) and each row is walked to its own length; the previous bound of
 * fData[0].size() read past the end of shorter rows.
 */
void calcChisq(const vector< vector<double> > &fData, double * par, double fTimeResolution, double fRebin)
{
    double chisq = 0.0;

    double theo, data;
    const double tau = 2.197019;
    const double dt0 = fTimeResolution*0.5*(fRebin-1);
    double time;
    double w = par[0]*0.08516155035269027;

    for (unsigned int i = 0; i < fData.size(); i++) {
        const vector<double> &row = fData[i];
        for (unsigned int j = 0; j < row.size(); j++) {
            data = row[j];
            time = dt0+fTimeResolution*fRebin*j;
            theo = par[2 + i*4] * exp(-time/tau)*(1.0 + par[3 + i*4]*exp(-0.5 * pow(par[1]*time,2.0))*cos(w*time+par[4+i*4]*1.74532925199432955e-2))+par[5+i*4];
            // a zero measurement cannot be used as the divisor
            const double term = (data != 0.0) ? (theo-data)*(theo-data)/data
                                              : theo*theo;
            chisq += term;
            cout << term << "\t";
        }
        cout << endl;
    }

    cout << "Chisq: " << chisq << endl;
}
/**
 * Runs callPHistoTFFcn on a small 5 x 10 data set and prints the device
 * output next to the host reference from calcChisq.
 *
 * Usage: testChiSquare [1]
 *   With the single argument "1" the OpenCL API is selected, otherwise Cuda.
 *
 * Returns 0 unconditionally; ierr from the allocation calls is not inspected.
 */
int main(int argc, char *argv[]) {

    bool useCuda = true;
    if (argc == 2 && atoi(argv[1]) == 1)
        useCuda = false;

    int ierr;
    int sensors = 5;
    int length = 10;
    int npar = 4 * sensors + 2;   // two shared parameters plus four per sensor
    int ndata = sensors * length;
    // initialised so a defined value is printed even if the call below
    // never writes it
    double result = 0.0;
    double fTimeResolution = 0.05;
    double fRebin = 5;

    double *par = new double[npar];
    initPar(par, npar);

    vector< vector< double > > fData;
    fData.resize(sensors);
    initData(fData, length);

    printData(fData);
    printDiv(75);

    DKSBase dksbase;
    if (useCuda)
        dksbase.setAPI("Cuda", 4);
    else
        dksbase.setAPI("OpenCL", 6);
    dksbase.setDevice("-gpu", 4);
    dksbase.initDevice();
    dksbase.setupFFT(0, NULL);

    void *mem_data, *mem_par, *mem_chisq;
    cout << "Allocate memory" << endl;
    mem_par = dksbase.allocateMemory<double>(npar, ierr);
    mem_data = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);
    mem_chisq = dksbase.allocateMemory<double>(fData.size() * fData[0].size(), ierr);

    cout << "Write data" << endl;
    dksbase.writeData<double>(mem_par, par, npar);
    // each sensor row goes to its own offset in the flat device buffer
    for (int i = 0; i < sensors; i++)
        dksbase.writeData<double>(mem_data, &fData[i][0], length, i*length);

    cout << "Call PHistoTFFcn" << endl;
    dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq,
                            fTimeResolution, fRebin,
                            sensors, length, npar, result);
    cout << "Result: " << result << endl;

    double *out_data = new double[ndata];
    dksbase.readData<double>(mem_chisq, out_data, ndata);

    printDiv(75);
    printData(out_data, sensors, length);
    printDiv(75);

    calcChisq(fData, par, fTimeResolution, fRebin);
    printDiv(75);

    cout << "Free memory" << endl;
    dksbase.freeMemory<double>(mem_par, npar);
    dksbase.freeMemory<double>(mem_data, ndata);
    dksbase.freeMemory<double>(mem_chisq, ndata);

    // host buffers were previously leaked
    delete[] out_data;
    delete[] par;

    return 0;
}

193
test/testChiSquareRT.cpp Normal file
View File

@ -0,0 +1,193 @@
#include <iostream>
#include <cstdlib>
#include <string>
#include <cmath>
#include <omp.h>
#include "DKSBaseMuSR.h"
#include "Utility/DKSTimer.h"
// Fills data[0..N) with 1.0 when `ones` is set, otherwise with successive
// rand() draws scaled by RAND_MAX.
void initData(double *data, int N, bool ones = false) {
    int idx = 0;
    while (idx < N) {
        data[idx] = ones ? 1.0 : (double)rand() / RAND_MAX;
        ++idx;
    }
}
// Prints the N elements of data on one line, each followed by a tab.
template <typename T>
void printData(T *data, int N) {
    int idx = 0;
    while (idx < N) {
        std::cout << data[idx] << "\t";
        ++idx;
    }
    std::cout << std::endl;
}
// Theory expression handed to DKSBaseMuSR::callCompileProgram in main().
// fTheory() below evaluates the same formula on the host
// (cos(t*p[0]) - exp(-t*p[m[0]])), so the two must be kept in sync.
// The commented-out strings are alternative expressions kept for reference.
const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])";
//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])";
//const std::string funct = "p[m[0]] * se(t, p[m[1]])";
//const std::string funct = "p[m[1]] + p[m[0]]";
// Host evaluation of the theory term: cos(time*par[0]) - exp(-time*par[map[0]]).
// `func` is accepted for signature symmetry but is not read.
double fTheory(double time, double *par, double *func, int *map) {
    const double oscillation = cos(time*par[0]);
    const double decay = exp(-time*par[map[0]]);
    return oscillation - decay;
}
// Serial host reference: accumulates (data[i]-theo)^2 / data[i] over the bins
// [startTimeBin, endTimeBin), where
//   theo = N0 * exp(-t/tau) * (1 + fTheory(t, ...)) + bkg   and   t = i*timeStep.
double testFunctionSerial(double *data, double *par, double *func, int *map,
                          double N0, double tau, double bkg, double timeStep,
                          int startTimeBin, int endTimeBin)
{
    double total = 0;
    int bin = startTimeBin;
    while (bin < endTimeBin) {
        const double t = bin * timeStep;
        const double expected = N0 * exp(-t/tau) * (1.0 + fTheory(t, par, func, map)) + bkg;
        const double residual = data[bin] - expected;
        total += residual * residual / data[bin];
        ++bin;
    }
    return total;
}
// OpenMP version of testFunctionSerial: same per-bin term, summed with a
// reduction over [startTimeBin, endTimeBin).
//
// The dynamic-schedule chunk is the bin count divided by the number of
// processors, with a lower bound of 10.
//
// `theo` is now listed in the private clause. With default(shared) it used to
// be a single variable shared by all threads, so concurrent iterations could
// overwrite each other's model value before it was used (a data race).
double testFunctionParallel(double *data, double *par, double *func, int *map,
                            double N0, double tau, double bkg, double timeStep,
                            int startTimeBin, int endTimeBin)
{
    int i, chunk;
    double time, diff, theo;
    double chisq = 0;

    chunk = (endTimeBin - startTimeBin) / omp_get_num_procs();
    if (chunk < 10)
        chunk = 10;

#pragma omp parallel for default(shared) private (i,time,diff,theo) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq)
    for (i = startTimeBin; i < endTimeBin; ++i) {
        time = i * timeStep;
        theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg;
        diff = data[i] - theo;
        chisq += diff * diff / data[i];
    }

    return chisq;
}
/**
 * Times the serial host chi^2 against the run-time compiled device version.
 *
 * Usage: testChiSquareRT [Ndata [api]]
 *   Ndata - number of data points (default 8, must be positive)
 *   api   - 1 selects "Cuda" (default), any other value "OpenCL"
 *
 * Exits with EXIT_FAILURE when Ndata is not positive or when
 * callLaunchChiSquare reports a non-zero status.
 */
int main(int argc, char *argv[]) {

    int Loop = 100;

    //init test data on the host
    int Ndata = 8;
    if (argc > 1)
        Ndata = atoi(argv[1]);

    // atoi yields 0 for non-numeric text; a non-positive size would be passed
    // straight to new[] below
    if (Ndata <= 0) {
        std::cerr << "Ndata must be a positive integer" << std::endl;
        return EXIT_FAILURE;
    }

    int api = 1;
    if (argc > 2)
        api = atoi(argv[2]);

    int Npar = 66;
    int Nfunc = 1;
    int Nmap = 4;

    double *data = new double[Ndata];
    double *par = new double[Npar];
    double *func = new double[Nfunc];
    int *map = new int[Nmap];

    initData(data, Ndata);
    initData(par, Npar);
    initData(func, Nfunc);
    map[0] = 1;
    map[1] = 2;
    map[2] = 3;
    map[3] = 4;

    //create timers
    DKSTimer serialTimer;
    DKSTimer cudaTimer;
    DKSTimer ompTimer;
    DKSTimer gpuOverhead;
    serialTimer.init("Serial timer");
    cudaTimer.init("Cuda timer");
    ompTimer.init("OpenMP timer");
    gpuOverhead.init("Overhead for gpu");

    //serial version
    double resultSerial = 0.0;
    serialTimer.start();
    for (int i = 0; i < Loop; i++)
        resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
    serialTimer.stop();

    //openmp version (currently disabled, so the timer brackets no work and
    //the reported result stays 0)
    double resultOMP = 0.0;
    ompTimer.start();
    //for (int i = 0; i < Loop; i++)
    //  resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata);
    ompTimer.stop();

    //create and init dksbase
    gpuOverhead.start();
    DKSBaseMuSR dksbase;
    if (api == 1)
        dksbase.setAPI("Cuda");
    else
        dksbase.setAPI("OpenCL");

    dksbase.setDevice("-gpu");
    dksbase.initDevice();
    dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap);

    //allocate memory on the device
    int ierr;
    void *data_ptr;

    data_ptr = dksbase.allocateMemory<double>(Ndata, ierr);

    dksbase.writeData<double>(data_ptr, data, Ndata);
    dksbase.writeFunctions(func, Nfunc);
    dksbase.writeMaps(map, Nmap);
    dksbase.callCompileProgram(funct);
    gpuOverhead.stop();

    double resultCuda = 0.0;
    cudaTimer.start();
    for (int i = 0; i < Loop; i++) {
        dksbase.writeParams(par, Npar);
        // reuse the outer ierr; the loop used to declare a second variable
        // of the same name that shadowed it
        ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap,
                                           0.0, 0.1, 0, resultCuda);

        if (ierr != 0)
            exit (EXIT_FAILURE);
    }
    cudaTimer.stop();

    std::cout << std::endl;
    std::cout << "=======================Results=======================" << std::endl;
    std::cout << "Result serial = " << resultSerial << std::endl;
    std::cout << "Result prallel = " << resultOMP << std::endl;
    std::cout << "Result cuda = " << resultCuda << std::endl;

    std::cout << std::endl;
    std::cout << "=======================Timings=======================" << std::endl;
    serialTimer.print();
    ompTimer.print();
    cudaTimer.print();
    gpuOverhead.print();
    std::cout << std::endl;

    dksbase.freeMemory<double>(data_ptr, Ndata);

    // host arrays were previously leaked
    delete[] data;
    delete[] par;
    delete[] func;
    delete[] map;

    return 0;
}

View File

@ -0,0 +1,248 @@
#include <iostream>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
using namespace std;
// Per-particle record used by this test. Arrays of PART_SMALL are copied to
// and from the device with writeData<PART_SMALL> / readData<PART_SMALL>.
// NOTE(review): the field order and types presumably have to match the
// particle layout expected by the device kernels - confirm before changing.
typedef struct {
int label;           // initialised to 0 by initPartSmall
unsigned localID;    // initialised to the particle index
double Rincol[3];    // three-component vector; presumably position - confirm
double Pincol[3];    // three-component vector; presumably momentum - confirm
} PART_SMALL;
// Plain three-component double vector used by the initVector / printVector
// helpers in this file.
typedef struct {
double x;
double y;
double z;
} Vector;
// Builds the start-state particle for index d: label 0, localID d,
// Rincol = (0, 0, 0.02) and Pincol = (0, 0, 3.9920183237269791e-01).
PART_SMALL initPartSmall(int d) {
    static const double startR[3] = {0.0, 0.0, 0.02};
    static const double startP[3] = {0.0, 0.0, 3.9920183237269791e-01};

    PART_SMALL particle;
    particle.label = 0;
    particle.localID = d;
    for (int axis = 0; axis < 3; ++axis) {
        particle.Rincol[axis] = startR[axis];
        particle.Pincol[axis] = startP[axis];
    }
    return particle;
}
// Returns a Vector with every component set to 0.5.
Vector initVector() {
    const double half = 0.5;
    Vector v;
    v.x = half;
    v.y = half;
    v.z = half;
    return v;
}
// Prints one particle on a single line: label, localID, Rincol, Pincol.
void printPart(PART_SMALL p) {
    std::cout << "label: " << p.label << ", "
              << "localid: " << p.localID << ","
              << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
              << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]
              << std::endl;
}
// Prints the three components of v, tab separated, followed by a newline.
void printVector(Vector v) {
    std::cout << v.x << "\t";
    std::cout << v.y << "\t";
    std::cout << v.z << std::endl;
}
// Fills p[0..N) with start-state particles; element k gets localID k.
void initParts(PART_SMALL *p, int N) {
    int idx = 0;
    while (idx < N) {
        p[idx] = initPartSmall(idx);
        ++idx;
    }
}
// Prints the first N particles, one per line, then a blank line.
void printParts(PART_SMALL *p, int N) {
    int idx = 0;
    while (idx < N) {
        printPart(p[idx]);
        ++idx;
    }
    std::cout << std::endl;
}
// Sets each of the first N vectors to the value returned by initVector().
void initVectors(Vector *v, int N) {
    int idx = 0;
    while (idx < N) {
        v[idx] = initVector();
        ++idx;
    }
}
// Prints the first N vectors, one per line, then a blank line.
void printVectors(Vector *v, int N) {
    int idx = 0;
    while (idx < N) {
        printVector(v[idx]);
        ++idx;
    }
    std::cout << std::endl;
}
// Writes the 12 fixed test parameters into data[0..11].
// The original source noted 2.0e-02 and 1.0e-02 as alternative values for
// the first two entries.
void initParams(double *data) {
    static const double defaults[12] = {
        0.0,
        1.0,
        2.2100000000000000e+00,
        6.0000000000000000e+00,
        1.2010700000000000e+01,
        2.6010000000000000e+00,
        1.7010000000000000e+03,
        1.2790000000000000e+03,
        1.6379999999999999e-02,
        1.9321266968325795e-01,
        7.9000000000000000e+01,
        1.0000000000000002e-12
    };
    for (int k = 0; k < 12; ++k)
        data[k] = defaults[k];
}
// Prints data[0..N) on one line, each value followed by a tab.
void printDouble(double *data, int N) {
    int idx = 0;
    while (idx < N) {
        std::cout << data[idx] << "\t";
        ++idx;
    }
    std::cout << std::endl;
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of particles: " << numpart << endl;
cout << "Number of loops: " << loop << endl;
cout << "------------------------------------------------------------" << endl;
//init part vector to test mc
PART_SMALL *parts = new PART_SMALL[numpart];
initParts(parts, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *part_ptr, *param_ptr;
//allocate memory for particles
part_ptr = base.allocateMemory<PART_SMALL>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<PART_SMALL>(part_ptr, parts, numpart);
base.writeData<double>(param_ptr, params, 12);
int numaddback;
//test calls to do some first executions
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
//std::cout << "particles to add back: " << numaddback << std::endl;
struct timeval timeStart, timeEnd;
std::cout << "Start MC" << std::endl;
gettimeofday(&timeStart, NULL);
for (int i = 0; i < loop; i++) {
base.callCollimatorPhysics2(part_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback);
base.syncDevice();
}
gettimeofday(&timeEnd, NULL);
std::cout << "addback: " << numaddback << std::endl;
std::cout << "End MC" << std::endl;
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec));
std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl;
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
//read data from device
base.readData<PART_SMALL>(part_ptr, parts, numpart);
//free memory
base.freeMemory<PART_SMALL>(part_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << parts[i].label << "\t"
<< parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t"
<< parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t"
<< parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t"
<< std::endl;
}
double arx = 0, ary = 0, arz = 0;
double apx = 0, apy = 0, apz = 0;
for (int i = 0; i < numpart; i++) {
arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart;
ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart;
arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart;
apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart;
apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart;
apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart;
}
std::cout << std::fixed << std::setprecision(10);
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
cout << "==========================END TEST==========================" << endl;
return 0;
}

View File

@ -0,0 +1,126 @@
#include <iostream>
#include <vector>
#include "DKSBase.h"
#include "cuda_runtime.h"
#include <mpi.h>
using namespace std;
// Full particle record used by this test. A vector<PART> is copied to and
// from the device with writeData<PART> / readData<PART>.
// NOTE(review): the field order and types presumably have to match the
// device-side particle definition - confirm before changing.
typedef struct {
int label;
unsigned localID;
double Rincol[3];      // three-component vector; presumably position - confirm
double Pincol[3];      // three-component vector; presumably momentum - confirm
long IDincol;
int Binincol;
double DTincol;
double Qincol;
long LastSecincol;
double Bfincol[3];     // three-component vector
double Efincol[3];     // three-component vector
} PART;
// Builds a test particle from the integer d: every scalar field is set to d,
// Rincol/Pincol components to 0.5 and Bfincol/Efincol components to 1/(d+1).
PART initPart(int d) {
    const double fieldValue = 1.0 / (d+1);

    PART particle;
    particle.label = d;
    particle.localID = d;
    particle.IDincol = d;
    particle.Binincol = d;
    particle.DTincol = d;
    particle.Qincol = d;
    particle.LastSecincol = d;

    int axis = 0;
    while (axis < 3) {
        particle.Rincol[axis] = 0.5;
        particle.Pincol[axis] = 0.5;
        particle.Bfincol[axis] = fieldValue;
        particle.Efincol[axis] = fieldValue;
        ++axis;
    }
    return particle;
}
// Prints label, Rincol and Pincol of one particle on a single line.
// The remaining PART fields are not printed.
void printPart(PART p) {
    std::cout << "label: " << p.label << ", "
              << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "
              << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "
              << std::endl;
}
/**
 * MPI-launched collimator-physics test: initialises MPI, runs
 * callCollimatorPhysics 100 times on 500501 particles with the Cuda API and
 * prints the add-back / dead counters written by the last call.
 *
 * Returns 0 unconditionally; ierr from the allocation calls is not inspected.
 */
int main(int argc, char *argv[]) {

    int ierr;
    int rank, nprocs;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int numpart = 500501;
    const int numpar = 19;   // number of entries in the parameter array
    DKSBase base;

    base.setAPI("Cuda", 4);
    base.setDevice("-gpu", 4);
    base.initDevice();
    base.callInitRandoms(numpart);

    PART tmp;
    vector<PART> p;
    vector<PART> p_out;
    p_out.resize(numpart);

    for (int i = 0; i < numpart; i++) {
        tmp = initPart(i + 1);
        p.push_back(tmp);
    }

    // preview of the input, only for tiny runs; clamped so fewer than 10
    // particles cannot index past the end of p
    if (numpart <= 20) {
        const int preview = (numpart < 10) ? numpart : 10;
        for (int i = 0; i < preview; i++)
            printPart(p[i]);
        cout << endl;
    }

    double params[numpar];
    for (int i = 0; i < numpar; i++)
        params[i] = 0.05;
    params[0] = 0;
    params[1] = 1;

    void *mem_ptr, *par_ptr;

    par_ptr = base.allocateMemory<double>(numpar, ierr);
    base.writeData<double>(par_ptr, params, numpar);

    mem_ptr = base.allocateMemory<PART>(numpart, ierr);
    base.writeData<PART>(mem_ptr, &p[0], numpart);

    int addback, dead;
    for (int i = 0; i < 100; i++)
        base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, numpar, addback, dead);

    cout << "Add back: " << addback << ", dead: " << dead << endl;

    base.readData<PART>(mem_ptr, &p_out[0], numpart);

    // freeMemory takes the element count, as in the other DKS tests; the
    // status variable ierr was being passed in its place
    base.freeMemory<PART>(mem_ptr, numpart);
    base.freeMemory<double>(par_ptr, numpar);

    if (numpart <= 20) {
        for (int i = 0; i < numpart; i++)
            printPart(p_out[i]);
    }

    MPI_Finalize();
    return 0;
}

View File

@ -0,0 +1,250 @@
#include <iostream>
#include <iomanip>
#include <vector>
#include <sys/time.h>
#include "DKSBase.h"
#include <vector_types.h>
#include "cuda_runtime.h"
#include <omp.h>
using namespace std;
// Structure-of-arrays particle container used on the host: one array per
// particle attribute, each allocated in main() with numpart elements.
typedef struct {
int *label;
unsigned *localID;
double *rx;
double *ry;
double *rz;
double *px;
double *py;
double *pz;
} PART;
// Initialises npart particles in structure-of-arrays form: label 0,
// localID equal to the index, r = (0, 0, 0.02) and
// p = (0, 0, 3.9920183237269791e-01).
void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz,
               double *px, double *py, double *pz, int npart) {
    const double startRz = 0.02;
    const double startPz = 3.9920183237269791e-01;

    int idx = 0;
    while (idx < npart) {
        label[idx] = 0;
        localID[idx] = idx;
        rx[idx] = ry[idx] = 0.0;
        rz[idx] = startRz;
        px[idx] = py[idx] = 0.0;
        pz[idx] = startPz;
        ++idx;
    }
}
// Writes the 12 fixed test parameters into data[0..11].
// The original source noted 2.0e-02 and 1.0e-02 as alternative values for
// the first two entries.
void initParams(double *data) {
    static const double defaults[12] = {
        0.0,
        1.0,
        2.2100000000000000e+00,
        6.0000000000000000e+00,
        1.2010700000000000e+01,
        2.6010000000000000e+00,
        1.7010000000000000e+03,
        1.2790000000000000e+03,
        1.6379999999999999e-02,
        1.9321266968325795e-01,
        7.9000000000000000e+01,
        1.0000000000000002e-12
    };
    for (int k = 0; k < 12; ++k)
        data[k] = defaults[k];
}
int main(int argc, char *argv[]) {
int loop = 10;
int numpart = 1e5;
char *api_name = new char[10];
char *device_name = new char[10];
strcpy(api_name, "Cuda");
strcpy(device_name, "-gpu");
for (int i = 1; i < argc; i++) {
if (argv[i] == string("-mic")) {
strcpy(api_name, "OpenMP");
strcpy(device_name, "-mic");
}
if (argv[i] == string("-npart")) {
numpart = atoi(argv[i+1]);
i++;
}
if (argv[i] == string("-loop")) {
loop = atoi(argv[i+1]);
i++;
}
}
int threads = 0;
/*
#pragma offload target(mic:0) out(threads)
{
#pragma omp parallel
{
threads = omp_get_num_threads();
}
}
*/
cout << "=========================BEGIN TEST=========================" << endl;
cout << "Use api: " << api_name << "\t" << device_name << endl;
cout << "Number of particles: " << numpart << endl;
cout << "Number of loops: " << loop << endl;
cout << "Number of threads: " << threads << endl;
cout << "------------------------------------------------------------" << endl;
//init part vector to test mc
//int *label;
//unsigned *localID;
//double *rx, *ry, *rz, *px, *py, *pz;
PART p;
p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64);
p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64);
p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64);
p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64);
initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart);
double *params = new double[12];
initParams(params);
//init dks
int ierr;
DKSBase base;
base.setAPI(api_name, strlen(api_name));
base.setDevice(device_name, strlen(api_name));
base.initDevice();
//init random
base.callInitRandoms(numpart);
//**test collimator physics and sort***//
void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr;
//allocate memory for particles
label_ptr = base.allocateMemory<int>(numpart, ierr);
localID_ptr = base.allocateMemory<unsigned>(numpart, ierr);
rx_ptr = base.allocateMemory<double>(numpart, ierr);
ry_ptr = base.allocateMemory<double>(numpart, ierr);
rz_ptr = base.allocateMemory<double>(numpart, ierr);
px_ptr = base.allocateMemory<double>(numpart, ierr);
py_ptr = base.allocateMemory<double>(numpart, ierr);
pz_ptr = base.allocateMemory<double>(numpart, ierr);
param_ptr = base.allocateMemory<double>(12, ierr);
//transfer data to device
base.writeData<int>(label_ptr, p.label, numpart);
base.writeData<unsigned>(localID_ptr, p.localID, numpart);
base.writeData<double>(rx_ptr, p.rx, numpart);
base.writeData<double>(ry_ptr, p.ry, numpart);
base.writeData<double>(rz_ptr, p.rz, numpart);
base.writeData<double>(px_ptr, p.px, numpart);
base.writeData<double>(py_ptr, p.py, numpart);
base.writeData<double>(pz_ptr, p.pz, numpart);
//transfer params to device
base.writeData<double>(param_ptr, params, 12);
std::cout << "test runs" << std::endl;
int numaddback;
//test calls to do some first executions
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
base.syncDevice();
struct timeval timeStart, timeEnd;
std::cout << "Start MC" << std::endl;
gettimeofday(&timeStart, NULL);
for (int i = 0; i < loop; i++) {
base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart);
base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr,
py_ptr, pz_ptr, param_ptr, numpart, numaddback);
base.syncDevice();
}
gettimeofday(&timeEnd, NULL);
std::cout << "addback: " << numaddback << std::endl;
std::cout << "End MC" << std::endl;
double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
(timeEnd.tv_usec - timeStart.tv_usec));
std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl;
std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl;
//read data from device
base.readData<int>(label_ptr, p.label, numpart);
base.readData<unsigned>(localID_ptr, p.localID, numpart);
base.readData<double>(rx_ptr, p.rx, numpart);
base.readData<double>(ry_ptr, p.ry, numpart);
base.readData<double>(rz_ptr, p.rz, numpart);
base.readData<double>(px_ptr, p.px, numpart);
base.readData<double>(py_ptr, p.py, numpart);
base.readData<double>(pz_ptr, p.pz, numpart);
//free memory
base.freeMemory<int>(label_ptr, numpart);
base.freeMemory<unsigned>(localID_ptr, numpart);
base.freeMemory<double>(rx_ptr, numpart);
base.freeMemory<double>(ry_ptr, numpart);
base.freeMemory<double>(rz_ptr, numpart);
base.freeMemory<double>(px_ptr, numpart);
base.freeMemory<double>(py_ptr, numpart);
base.freeMemory<double>(pz_ptr, numpart);
base.freeMemory<double>(param_ptr, 12);
/*
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < 10; i++) {
std::cout << p.label[i] << "\t" << p.rx[i]
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
}
std:: cout << "..." << std::endl;
for (int i = numpart - 10; i < numpart; i++) {
std::cout << p.label[i] << "\t" << p.rx[i]
<< "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i]
<< "\t" << p.py[i] << "\t" << p.pz[i] << std::endl;
}
double arx = 0, ary = 0, arz = 0;
double apx = 0, apy = 0, apz = 0;
for (int i = 0; i < numpart; i++) {
arx += sqrt(p.rx[i] * p.rx[i]) / numpart;
ary += sqrt(p.ry[i] * p.ry[i]) / numpart;
arz += sqrt(p.rz[i] * p.rz[i]) / numpart;
apx += sqrt(p.px[i] * p.px[i]) / numpart;
apy += sqrt(p.py[i] * p.py[i]) / numpart;
apz += sqrt(p.pz[i] * p.pz[i]) / numpart;
}
std::cout << std::fixed << std::setprecision(10);
std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl
<< "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl;
*/
cout << "==========================END TEST==========================" << endl;
return 0;
}

15
test/testDKS.cpp Normal file
View File

@ -0,0 +1,15 @@
#include <iostream>
#include <complex>
#include "DKSBase.h"
using namespace std;
int main(int argc, char *argv[]) {
DKSBase base = DKSBase();
base.getDevices();
return 0;
}

83
test/testFFT.cpp Normal file
View File

@ -0,0 +1,83 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
/**
 * 1-D FFT round trip (FFT, inverse FFT, normalise) on a two-element complex
 * array, printing the input and the data read back from the device.
 *
 * Usage: testFFT [api [device]]
 *   api    - name handed to DKSBase::setAPI    (default "OpenCL")
 *   device - name handed to DKSBase::setDevice (default "-gpu")
 * Any other argument count falls back to both defaults.
 *
 * Returns 0 unconditionally; ierr from pushData is not inspected.
 */
int main(int argc, char *argv[]) {

    // Fixed-size, always NUL-terminated name buffers; command-line values
    // used to be copied into 10-byte heap buffers with an unbounded strcpy.
    enum { NAME_LEN = 64 };
    char api_name[NAME_LEN];
    char device_name[NAME_LEN];

    const char *api_src = (argc == 2 || argc == 3) ? argv[1] : "OpenCL";
    const char *device_src = (argc == 3) ? argv[2] : "-gpu";

    strncpy(api_name, api_src, NAME_LEN - 1);
    api_name[NAME_LEN - 1] = '\0';
    strncpy(device_name, device_src, NAME_LEN - 1);
    device_name[NAME_LEN - 1] = '\0';

    cout << "Use api: " << api_name << "\t" << device_name << endl;

    cout << "Begin DKS Base tests" << endl;

    int N = 2;
    // all three entries carry N although only one dimension is requested in
    // the calls below
    int dimsize[3] = {N, N, N};

    complex<double> *cdata = new complex<double>[N];
    complex<double> *cfft = new complex<double>[N];
    for (int i = 0; i < N; i++) {
        cdata[i] = complex<double>(0, 0);
        cfft[i] = complex<double>(0, 0);
    }

    cdata[0] = complex<double>(1.73205, 1.73205);

    /* init DKSBase */
    cout << "Init device and set function" << endl;
    DKSBase base;
    base.setAPI(api_name, strlen(api_name));
    // was strlen(api_name): wrong length whenever the two names differ in size
    base.setDevice(device_name, strlen(device_name));
    base.initDevice();

    void *mem_ptr;
    int ierr;

    /* write data to device */
    mem_ptr = base.pushData< complex<double> >( (const void*)cdata, N, ierr);

    /* execute fft */
    base.callFFT(mem_ptr, 1, dimsize);

    /* execute ifft */
    base.callIFFT(mem_ptr, 1, dimsize);

    /* execute normalize */
    base.callNormalizeFFT(mem_ptr, 1, dimsize);

    /* read data from device */
    base.pullData< complex<double> >(mem_ptr, cfft, N);

    /* print results */
    cout << "Data" << endl;
    for (int i = 0; i < N; i++)
        cout << cdata[i] << "\t";
    cout << endl;

    cout << "FFT" << endl;
    for (int i = 0; i < N; i++)
        cout << cfft[i] << "\t";
    cout << endl;

    // host buffers were previously leaked
    delete[] cdata;
    delete[] cfft;

    return 0;
}

159
test/testFFT3D.cpp Normal file
View File

@ -0,0 +1,159 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
// Forward declarations of the helpers defined after main().
// printData: dumps an N^dim complex array; `normalize` defaults to false.
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
// printData3DN4: dumps an N x N x N complex array with tiny values shown as 0.
void printData3DN4(complex<double>* &data, int N, int dim);
// compareData: prints the summed absolute difference between two arrays.
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/**
 * 3-D FFT round trip (FFT, inverse FFT, normalise) on an N x N x N complex
 * array; prints the summed absolute difference from the input.
 *
 * Usage: ./testFFT3D [N [api [device]]]
 *   N      - points per dimension (default 16, must be positive)
 *   api    - name handed to DKSBase::setAPI; "Cuda" when only N is given,
 *            "OpenCL" when no (or more than three) arguments are given
 *   device - name handed to DKSBase::setDevice (default "-gpu")
 *
 * Returns 0 on completion and 1 when N is not positive.
 */
int main(int argc, char *argv[]) {

    // Fixed-size, always NUL-terminated name buffers; command-line values
    // used to be copied into 10-byte heap buffers with an unbounded strcpy.
    enum { NAME_LEN = 64 };
    char api_name[NAME_LEN];
    char device_name[NAME_LEN];

    int N = 16;
    const char *api_src = "OpenCL";
    const char *device_src = "-gpu";

    if (argc >= 2 && argc <= 4) {
        N = atoi(argv[1]);
        api_src = (argc >= 3) ? argv[2] : "Cuda";
        if (argc == 4)
            device_src = argv[3];
    }

    strncpy(api_name, api_src, NAME_LEN - 1);
    api_name[NAME_LEN - 1] = '\0';
    strncpy(device_name, device_src, NAME_LEN - 1);
    device_name[NAME_LEN - 1] = '\0';

    // atoi yields 0 for non-numeric text; N sizes every allocation below
    if (N <= 0) {
        cerr << "N must be a positive integer" << endl;
        return 1;
    }

    cout << "Use api: " << api_name << ", " << device_name << endl;

    int dimsize[3] = {N, N, N};

    cout << "Begin DKS Base tests, N = " << N << endl;

    int dim = 3;
    complex<double> *cdata = new complex<double>[N*N*N];
    complex<double> *cfft = new complex<double>[N*N*N];
    complex<double> *cifft = new complex<double>[N*N*N];

    // input is a ramp k/N along the fastest index; both outputs start at zero
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++) {
                cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
                cfft[i*N*N + j*N + k] = complex<double>(0, 0);
                cifft[i*N*N + j*N + k] = complex<double>(0, 0);
            }
        }
    }

    /* init DKSBase */
    cout << "Init device and set function" << endl;

    DKSBase base;
    base.setAPI(api_name, strlen(api_name));
    base.setDevice(device_name, strlen(device_name));
    base.initDevice();
    base.setupFFT(3, dimsize);

    void *mem_ptr;
    int ierr;

    /* allocate memory on device */
    mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);

    /* write data to device */
    ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);

    /* execute fft */
    base.callFFT(mem_ptr, 3, dimsize);

    /* execute ifft */
    base.callIFFT(mem_ptr, 3, dimsize);

    /* execute normalize */
    base.callNormalizeFFT(mem_ptr, 3, dimsize);

    /* read data from device */
    base.readData< complex<double> >(mem_ptr, cifft, N*N*N);

    /* free device memory */
    base.freeMemory< complex<double> >(mem_ptr, N*N*N);

    /* compare results */
    compareData(cdata, cifft, N, dim);

    // host buffers were previously leaked
    delete[] cdata;
    delete[] cfft;
    delete[] cifft;

    return 0;
}
// Dumps a complex array of extent N in each of `dim` dimensions.
// Without `normalize` every element is printed as "re im<TAB>"; with it only
// re / N is printed. A newline ends each innermost row and a blank line
// follows each block of rows.
void printData(complex<double>* &data, int N, int dim, bool normalize) {
    const int ni = (dim > 2) ? N : 1;
    const int nj = (dim > 1) ? N : 1;
    const int nk = N;

    for (int i = 0; i < ni; i++) {
        for (int j = 0; j < nj; j++) {
            for (int k = 0; k < nk; k++) {
                const std::complex<double> &value = data[i*ni*ni + j*nj + k];
                if (normalize) {
                    std::cout << value.real() / N << "\t";
                } else {
                    std::cout << value.real() << " ";
                    std::cout << value.imag() << "\t";
                }
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
// Dumps an N x N x N complex array as "re; im<TAB>" entries, one output line
// per j index (i and k vary within the line). Components whose magnitude is
// below 10e-5 are printed as 0. `dim` is not used.
void printData3DN4(complex<double>* &data, int N, int dim) {
    const double cutoff = 10e-5;

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) {
            for (int k = 0; k < N; k++) {
                const std::complex<double> &value = data[i*N*N + j*N + k];
                double re = value.real();
                double im = value.imag();

                if (-cutoff < re && re < cutoff)
                    re = 0;
                if (-cutoff < im && im < cutoff)
                    im = 0;

                std::cout << re << "; " << im << "\t";
            }
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Sums |re1 - re2| + |im1 - im2| over two complex arrays of extent N in each
// of `dim` dimensions and prints the total as
// "Size <N> CC <--> CC diff: <sum>".
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
    const int ni = (dim > 2) ? N : 1;
    const int nj = (dim > 1) ? N : 1;
    const int nk = N;

    double total = 0;
    for (int i = 0; i < ni; i++) {
        for (int j = 0; j < nj; j++) {
            const int rowStart = i*ni*ni + j*nj;
            for (int k = 0; k < nk; k++) {
                const std::complex<double> &a = data1[rowStart + k];
                const std::complex<double> &b = data2[rowStart + k];
                total += fabs(a.real() - b.real());
                total += fabs(a.imag() - b.imag());
            }
        }
    }
    std::cout << "Size " << N << " CC <--> CC diff: " << total << std::endl;
}

199
test/testFFT3DRC.cpp Normal file
View File

@ -0,0 +1,199 @@
#include <cmath>
#include <complex>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <sys/time.h>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim);
void initData(double *data, int dimsize[3]);
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop);
void printHelp();
int main(int argc, char *argv[]) {
int N1 = 8;
int N2 = 8;
int N3 = 8;
int dim = 3;
int loop = 10;
if ( readParams(argc, argv, N1, N2, N3, loop) )
return 0;
int dimsize[3] = {N3, N2, N1};
int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2];
double *rdata = new double[sizereal];
double *outdata = new double[sizereal];
complex<double> *cfft = new complex<double>[sizecomp];
for (int i=0; i<sizecomp; ++i) {
cfft[i].real() = 7.;
cfft[i].imag() = 3.33;
}
initData(rdata, dimsize);
/* init DKSBase */
cout << "Init device and set function" << endl;
#ifdef DKS_MIC
DKSBase base;
base.setAPI("OpenMP", 6);
base.setDevice("-mic", 4);
base.initDevice();
base.setupFFTRC(dim, dimsize);
/* setup backward fft (COMPLEX->REAL) */
base.setupFFTCR(dim, dimsize,1./(N1*N2*N3));
#endif
#ifdef DKS_CUDA
DKSBase base;
base.setAPI("Cuda", 4);
base.setDevice("-gpu", 4);
base.initDevice();
base.setupFFT(dim, dimsize);
#endif
// allocate memory on device
int ierr;
void *real_ptr, *comp_ptr, *real_res_ptr;
real_ptr = base.allocateMemory<double>(sizereal, ierr);
real_res_ptr = base.allocateMemory<double>(sizereal, ierr);
comp_ptr = base.allocateMemory< std::complex<double> >(sizecomp, ierr);
// execute one run before starting the timers
base.writeData<double>(real_ptr, rdata, sizereal);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
base.readData<double>(real_res_ptr, outdata, sizereal);
//timer for total loop time, FFT and IFFT calls
struct timeval timeStart, timeEnd;
struct timeval timeFFTStart[loop], timeFFTEnd[loop];
struct timeval timeIFFTStart[loop], timeIFFTEnd[loop];
gettimeofday(&timeStart, NULL);
for (int i=0; i<loop; ++i){
// write data to device
base.writeData<double>(real_ptr, rdata, sizereal);
// execute rcfft
gettimeofday(&timeFFTStart[i], NULL);
base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeFFTEnd[i], NULL);
// execute crfft
gettimeofday(&timeIFFTStart[i], NULL);
base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize);
gettimeofday(&timeIFFTEnd[i], NULL);
//normalize
#ifdef DKS_CUDA
base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize);
#endif
// read IFFT data from device
base.readData<double>(real_res_ptr, outdata, sizereal);
}
gettimeofday(&timeEnd, NULL);
// free device memory
base.freeMemory< std::complex<double> >(comp_ptr, sizecomp);
base.freeMemory<double>(real_ptr, sizereal);
base.freeMemory<double>(real_res_ptr, sizereal);
// compare in and out data to see if we get back the same results
compareData(rdata, outdata, N1, N2, N3, dim);
//calculate seconds for total time and fft times
double tfft = 0;
double tifft = 0;
double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 +
(timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6;
for (int i = 0; i < loop; i++) {
tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 +
(timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6;
tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 +
(timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6;
}
//print timing results
std::cout << std::fixed << std::setprecision(5) << "\nTiming results"
<< "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s"
<< "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s"
<< "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s"
<< "\n\n";
return 0;
}
/*
 * Print the summed absolute difference of two real arrays.
 *
 * All NI*NJ*NK entries are visited with the first index as the fastest
 * varying one (offset k*NI*NJ + j*NI + i). The dim argument is not used.
 */
void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) {
  double total = 0;

  for (int i = 0; i < NI; ++i) {
    for (int j = 0; j < NJ; ++j) {
      for (int k = 0; k < NK; ++k) {
        const int pos = k*NI*NJ + j*NI + i;
        const double delta = data1[pos] - data2[pos];
        total += fabs(delta);
      }
    }
  }

  std::cout << "RC <--> CR diff: " << total << std::endl;
}
/*
 * Fill a row-major 3D array so that every entry holds its own position
 * along the fastest dimension.
 *
 * dimsize[0] is the length of the fastest dimension, dimsize[2] the length
 * of the slowest one.
 */
void initData(double *data, int dimsize[3]) {
  const int rowLength = dimsize[0];
  const int rowCount = dimsize[1];
  const int planeCount = dimsize[2];

  for (int plane = 0; plane < planeCount; ++plane) {
    for (int row = 0; row < rowCount; ++row) {
      const int rowStart = plane*rowCount*rowLength + row*rowLength;
      for (int col = 0; col < rowLength; ++col) {
        data[rowStart + col] = col;
      }
    }
  }
}
/*
 * Print the command line help for this test to stdout.
 *
 * The text is assembled from adjacent string pieces, so each piece that is
 * followed by more words must end with a space.
 */
void printHelp() {
  std::cout << std::endl;

  std::cout << "testFFT3DRC executes 3D real complex and 3D complex real "
            << "function on the Intel MIC.\n";
  std::cout << "Operations performed by testFFT3DRC are: "
            << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n";

  std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z "
            << "-loop $l\n";
  std::cout << "where $x $y $z are number of elements in each dimension and "
            << "$l is the number of times all the operations will be performed.\n";

  std::cout << std::endl;
}
/*
 * Parse the command line of the test.
 *
 * Recognised options:
 *   -grid x y z   stores the three values in N1, N2, N3
 *   -loop l       stores the value in loop
 *   -h, -help     prints the help text
 * Unknown arguments are ignored and leave the outputs untouched.
 *
 * Returns true when the caller should stop (help was requested or an
 * option is missing its values), false when the run should continue.
 */
bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) {

  for (int i = 1; i < argc; i++) {
    const std::string option(argv[i]);

    if (option == "-grid") {
      // all three values must exist before they are read
      if (i + 3 >= argc) {
        std::cerr << "Option -grid expects three values" << std::endl;
        printHelp();
        return true;
      }
      N1 = atoi(argv[i + 1]);
      N2 = atoi(argv[i + 2]);
      N3 = atoi(argv[i + 3]);
      i += 3;
    } else if (option == "-loop") {
      if (i + 1 >= argc) {
        std::cerr << "Option -loop expects a value" << std::endl;
        printHelp();
        return true;
      }
      loop = atoi(argv[i + 1]);
      i += 1;
    } else if (option == "-h" || option == "-help") {
      printHelp();
      return true;
    }
  }

  return false;
}

220
test/testFFT3DRC_MIC.cpp Normal file
View File

@ -0,0 +1,220 @@
#include <iostream>
#include <stdlib.h>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
void printData3DN4(complex<double>* &data, int N, int dim);
void printData3DN4(double* data, int N, int dim);
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
void compareData(double* data1, double* data2, int N, int dim);
/*
 * Compute (K*L)%M accurately.
 * The product is formed in long long so it cannot overflow int before the
 * remainder is taken; the remainder is returned as a double.
 */
static double moda(int K, int L, int M)
{
  const long long product = static_cast<long long>(K) * L;
  return static_cast<double>(product % M);
}
/*
 * Initialize array x(N) to produce unit peaks at x(H) and x(N-H).
 *
 * x is addressed row-major with a padded fastest dimension: consecutive
 * rows are (N3/2+1)*2 doubles apart, so the padding entries of each row are
 * left untouched. Every addressed entry receives
 *   factor * cos(2*pi*(n1*H1/N1 + n2*H2/N2 + n3*H3/N3)) / (N1*N2*N3)
 * with the products reduced modulo the matching N.
 */
static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4)
{
  const double TWOPI = 6.2831853071795864769;

  /* Generalized strides for row-major addressing of x */
  const int stride3 = 1;
  const int stride2 = (N3/2+1)*2;
  const int stride1 = N2*(N3/2+1)*2;

  double factor;
  if ((N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0)
    factor = 1.0;
  else
    factor = 2.0;

  for (int n1 = 0; n1 < N1; n1++)
  {
    const double phase1 = moda(n1,H1,N1) / N1;
    for (int n2 = 0; n2 < N2; n2++)
    {
      const double phase12 = phase1 + moda(n2,H2,N2) / N2;
      for (int n3 = 0; n3 < N3; n3++)
      {
        const double phase = phase12 + moda(n3,H3,N3) / N3;
        const int index = n1*stride1 + n2*stride2 + n3*stride3;
        x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3);
      }
    }
  }
}
/*
 * Real -> complex -> real 3D FFT round trip on the MIC through DKSBase.
 *
 * Expects the cube edge length N as the only argument. The input produced
 * by init_r is transformed forward, the spectrum is copied to the host and
 * back, transformed backward with a 1/(N*N*N) scale and compared with the
 * input.
 *
 * Returns 0 on success, 1 on a missing/invalid argument or failed host
 * allocation.
 */
int main(int argc, char *argv[]) {

  if (argc < 2) {
    cerr << "Usage: testFFT3DRC_MIC N" << endl;
    return 1;
  }

  int N = atoi(argv[1]);
  if (N < 1) {
    cerr << "N must be a positive integer" << endl;
    return 1;
  }

  int dim = 3;
  int dimsize[3] = {N, N, N};
  int sizereal = dimsize[0] * dimsize[1] * dimsize[2];
  int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2];

  // Host buffers use the padded row length (N/2+1)*2 that init_r writes
  // with; calloc keeps the padding entries defined because they are read
  // by the transfer and by compareData below.
  // NOTE(review): only sizereal (unpadded) doubles are transferred to and
  // from the device - confirm which layout the device FFT expects.
  size_t hostreal = (size_t)N * N * (N/2+1) * 2;
  double *rdata = (double *)calloc(hostreal, sizeof(double));
  double *outdata = (double *)calloc(hostreal, sizeof(double));
  complex<double> *cfft = (complex<double> *)malloc(sizecomp*sizeof(complex<double>));
  if (rdata == NULL || outdata == NULL || cfft == NULL) {
    cerr << "Host memory allocation failed" << endl;
    free(rdata);
    free(outdata);
    free(cfft);
    return 1;
  }

  init_r(rdata, N,N,N);

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI("OpenMP", 6);
  base.setDevice("-mic", 4);
  base.initDevice();

  /* setup forward fft (REAL->COMPLEX) */
  base.setupFFTRC(dim, dimsize);

  int ierr;
  void *real_ptr, *comp_ptr;

  /* allocate memory on device */
  real_ptr = base.allocateMemory<double>(sizereal, ierr);
  comp_ptr = base.allocateMemory< complex<double> >(sizecomp, ierr);

  /* write data to device */
  base.writeData<double>(real_ptr, rdata, sizereal);

  /* execute rcfft */
  base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize);

  /* read FFT data from device and write it back unchanged; the element
     type and count match the complex device buffer */
  base.readData< complex<double> >(comp_ptr, cfft, sizecomp);
  base.writeData< complex<double> >(comp_ptr, cfft, sizecomp);

  /* setup backward fft (COMPLEX->REAL) */
  base.setupFFTCR(dim, dimsize,1./(N*N*N));

  /* execute crfft */
  base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize);

  /* read IFFT data from device */
  base.readData<double>(real_ptr, outdata, sizereal);

  /* free device memory */
  base.freeMemory< complex<double> >(comp_ptr, sizecomp);
  base.freeMemory<double>(real_ptr, sizereal);

  /* compare data */
  compareData(rdata, outdata, N, dim);

  /* free host memory */
  free(rdata);
  free(outdata);
  free(cfft);

  return 0;
}
/*
 * Print the real parts of a cubic complex array to stdout.
 *
 * The innermost index always runs over N entries; N rows are printed when
 * dim > 1 and N planes when dim > 2, otherwise a single row/plane. With
 * normalize the real part is divided by N before printing. Entries are tab
 * separated, rows end with a newline and planes with an extra blank line.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < cols; ++col) {
        // planes and rows are each N or 1, so this is the row-major offset
        const int pos = plane*planes*planes + row*rows + col;
        if (normalize)
          cout << data[pos].real() / N << "\t";
        else
          cout << data[pos].real() << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Print an N x N x N complex array, one output line per middle index.
 *
 * Entries are written as "real; imag" separated by tabs; components whose
 * magnitude is below 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  const double cutoff = 10e-5;

  for (int mid = 0; mid < N; ++mid) {
    for (int outer = 0; outer < N; ++outer) {
      for (int inner = 0; inner < N; ++inner) {
        const complex<double> &entry = data[outer*N*N + mid*N + inner];
        double re = entry.real();
        double im = entry.imag();
        if (re > -cutoff && re < cutoff)
          re = 0;
        if (im > -cutoff && im < cutoff)
          im = 0;
        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Print an N x N x N real array, one output line per middle index.
 *
 * Values are tab separated; values whose magnitude is below 10e-5 are
 * printed as 0. The dim argument is not used.
 */
void printData3DN4(double* data, int N, int dim) {
  const double cutoff = 10e-5;

  for (int mid = 0; mid < N; ++mid) {
    for (int outer = 0; outer < N; ++outer) {
      for (int inner = 0; inner < N; ++inner) {
        double value = data[outer*N*N + mid*N + inner];
        if (value > -cutoff && value < cutoff)
          value = 0;
        cout << value << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Report how far two complex arrays differ.
 *
 * Sums |real1 - real2| and |imag1 - imag2| over the entries selected by N
 * and dim (N entries per used dimension) and prints the total with N.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int plane = 0; plane < planes; ++plane)
    for (int row = 0; row < rows; ++row)
      for (int col = 0; col < cols; ++col) {
        // planes and rows are each N or 1, so this is the row-major offset
        const int pos = plane*planes*planes + row*rows + col;
        total += fabs(data1[pos].real() - data2[pos].real());
        total += fabs(data1[pos].imag() - data2[pos].imag());
      }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}
/*
 * Report how far two real arrays differ.
 *
 * Sums |data1 - data2| over the entries selected by N and dim (N entries
 * per used dimension) and prints the total with N. The values are compared
 * as they are; no scaling is applied to data2.
 */
void compareData(double* data1, double* data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < cols; ++col) {
        // planes and rows are each N or 1, so this is the row-major offset
        const int pos = plane*planes*planes + row*rows + col;
        total += fabs(data1[pos] - data2[pos]);
      }
    }
  }

  cout << "Size " << N << " RC <--> CR diff: " << total << endl;
}

159
test/testFFT3DSO.cpp Normal file
View File

@ -0,0 +1,159 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void printData(complex<double>* &data, int N, int dim, bool normalize = false);
void printData3DN4(complex<double>* &data, int N, int dim);
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/*
 * usage - <test binary> [N [api [device]]]
 *
 * Complex 3D FFT round trip through DKSBase: FFT, inverse FFT and
 * normalization are run on an N x N x N grid and the result is compared
 * with the input.
 *
 *   1 argument  : N, api "Cuda", device "-gpu"
 *   2 arguments : N and api, device "-gpu"
 *   3 arguments : N, api and device
 *   otherwise   : N = 16, api "OpenCL", device "-gpu"
 *
 * Returns 0 on success, 1 when N is not positive.
 */
int main(int argc, char *argv[]) {

  // Writable default names: api_name/device_name stay plain char* like the
  // argv entries, and nothing is copied into fixed-size buffers.
  char cuda_api[] = "Cuda";
  char opencl_api[] = "OpenCL";
  char gpu_device[] = "-gpu";

  int N = 16;
  char *api_name = opencl_api;
  char *device_name = gpu_device;

  if (argc == 2) {
    N = atoi(argv[1]);
    api_name = cuda_api;
  } else if (argc == 3) {
    N = atoi(argv[1]);
    api_name = argv[2];
  } else if (argc == 4) {
    N = atoi(argv[1]);
    api_name = argv[2];
    device_name = argv[3];
  }

  // a non-positive N would produce an invalid allocation size
  if (N < 1) {
    cerr << "N must be a positive integer" << endl;
    return 1;
  }

  cout << "Use api: " << api_name << ", " << device_name << endl;

  int dimsize[3] = {N, N, N};

  cout << "Begin DKS Base tests, N = " << N << endl;

  int dim = 3;

  complex<double> *cdata = new complex<double>[N*N*N];
  complex<double> *cifft = new complex<double>[N*N*N];

  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        cdata[i*N*N + j*N + k] = complex<double>((double)k / N, 0);
        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
      }
    }
  }

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();
  base.setupFFT(3, dimsize);

  void *mem_ptr;
  int ierr;

  /* allocate memory on device */
  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);

  /* write data to device */
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);

  /* execute fft */
  base.callFFT(mem_ptr, 3, dimsize);

  /* execute ifft */
  base.callIFFT(mem_ptr, 3, dimsize);

  /* execute normalize */
  base.callNormalizeFFT(mem_ptr, 3, dimsize);

  /* read data from device */
  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);

  /* free device memory */
  base.freeMemory< complex<double> >(mem_ptr, N*N*N);

  /* compare results */
  compareData(cdata, cifft, N, dim);

  /* free host memory */
  delete[] cdata;
  delete[] cifft;

  return 0;
}
/*
 * Print a cubic complex array to stdout.
 *
 * The innermost index always runs over N entries; N rows are printed when
 * dim > 1 and N planes when dim > 2, otherwise a single row/plane.
 * Without normalize each entry is written as "real imag"; with normalize
 * only the real part divided by N is written. Entries are tab separated,
 * rows end with a newline and planes with an extra blank line.
 */
void printData(complex<double>* &data, int N, int dim, bool normalize) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      // planes and rows are each N or 1, so this is the row-major row start
      const int rowStart = plane*planes*planes + row*rows;
      for (int col = 0; col < cols; ++col) {
        const complex<double> &entry = data[rowStart + col];
        if (normalize) {
          cout << entry.real() / N << "\t";
          continue;
        }
        cout << entry.real() << " ";
        cout << entry.imag() << "\t";
      }
      cout << endl;
    }
    cout << endl;
  }
}
/*
 * Print an N x N x N complex array, one output line per middle index.
 *
 * Entries are written as "real; imag" separated by tabs; components whose
 * magnitude is below 10e-5 are printed as 0. The dim argument is not used.
 */
void printData3DN4(complex<double>* &data, int N, int dim) {
  const double cutoff = 10e-5;
  const int slab = N*N;

  for (int mid = 0; mid < N; ++mid) {
    for (int outer = 0; outer < N; ++outer) {
      for (int inner = 0; inner < N; ++inner) {
        const int pos = outer*slab + mid*N + inner;
        double re = data[pos].real();
        double im = data[pos].imag();
        // values strictly inside (-cutoff, cutoff) are shown as zero
        if (re < cutoff && re > -cutoff) re = 0;
        if (im < cutoff && im > -cutoff) im = 0;
        cout << re << "; " << im << "\t";
      }
    }
    cout << endl;
  }
  cout << endl;
}
/*
 * Report how far two complex arrays differ.
 *
 * Sums |real1 - real2| and |imag1 - imag2| over the entries selected by N
 * and dim (N entries per used dimension) and prints the total with N.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      // planes and rows are each N or 1, so this is the row-major row start
      const int rowStart = plane*planes*planes + row*rows;
      for (int col = 0; col < cols; ++col) {
        const complex<double> &lhs = data1[rowStart + col];
        const complex<double> &rhs = data2[rowStart + col];
        total += fabs(lhs.real() - rhs.real());
        total += fabs(lhs.imag() - rhs.imag());
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}

130
test/testFFT3DTiming.cpp Normal file
View File

@ -0,0 +1,130 @@
#include <iostream>
#include <cstdlib>
#include <complex>
#include "Utility/TimeStamp.h"
#include "DKSBase.h"
using namespace std;
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim);
/*
 * Timing test for the complex 3D FFT round trip through DKSBase.
 *
 * usage - <test binary> [api [device [N]]]
 * Defaults: api "OpenCL", device "-gpu", N = 4.
 *
 * One untimed round trip (allocate, write, FFT, IFFT, normalize, read,
 * free) is executed first; the same sequence is then repeated `steps`
 * times and the average wall time per repetition is printed.
 *
 * Returns 0 on success, 1 when N is not positive.
 */
int main(int argc, char *argv[]) {

  // Writable default names: api_name/device_name stay plain char* like the
  // argv entries, and nothing is copied into fixed-size buffers.
  char opencl_api[] = "OpenCL";
  char gpu_device[] = "-gpu";

  int N = 4;
  char *api_name = opencl_api;
  char *device_name = gpu_device;

  // each optional argument is read only when it is actually present
  if (argc > 1)
    api_name = argv[1];
  if (argc > 2)
    device_name = argv[2];
  if (argc > 3)
    N = atoi(argv[3]);

  // a non-positive N would produce an invalid allocation size
  if (N < 1) {
    cerr << "N must be a positive integer" << endl;
    return 1;
  }

  int dimsize[3] = {N, N, N};

  cout << "Use api: " << api_name << endl;

  cout << "Begin DKS Base tests, N = " << N << endl;

  complex<double> *cdata = new complex<double>[N*N*N];
  complex<double> *cifft = new complex<double>[N*N*N];

  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < N; k++) {
        cdata[i*N*N + j*N + k] = complex<double>((double)i / N, 0);
        cifft[i*N*N + j*N + k] = complex<double>(0, 0);
      }
    }
  }

  timestamp_t t0, t1;

  /* init DKSBase */
  cout << "Init device and set function" << endl;
  DKSBase base;
  base.setAPI(api_name, strlen(api_name));
  base.setDevice(device_name, strlen(device_name));
  base.initDevice();
  // NOTE(review): no setupFFT call is made before callFFT here - confirm
  // whether the selected backend needs one.

  void *mem_ptr;
  int ierr;

  /* run the sequence once untimed to init the device */
  mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);
  ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);
  base.callFFT(mem_ptr, 3, dimsize);
  base.callIFFT(mem_ptr, 3, dimsize);
  base.callNormalizeFFT(mem_ptr, 3, dimsize);
  base.readData< complex<double> >(mem_ptr, cifft, N*N*N);
  base.freeMemory< complex<double> >(mem_ptr, N*N*N);
  /* end test */

  int steps = 10;
  base.oclClearEvents();
  t0 = get_timestamp();
  for (int i = 0; i < steps; i++) {

    /* allocate memory on device */
    mem_ptr = base.allocateMemory< complex<double> >(N*N*N, ierr);

    /* write data to device */
    ierr = base.writeData< complex<double> >(mem_ptr, cdata, N*N*N);

    /* execute fft */
    base.callFFT(mem_ptr, 3, dimsize);

    /* execute ifft */
    base.callIFFT(mem_ptr, 3, dimsize);

    /* execute normalize */
    base.callNormalizeFFT(mem_ptr, 3, dimsize);

    /* read data from device */
    base.readData< complex<double> >(mem_ptr, cifft, N*N*N);

    /* free device memory; the element count matches the allocation */
    base.freeMemory< complex<double> >(mem_ptr, N*N*N);
  }
  t1 = get_timestamp();

  cout << "=========================" << endl;
  cout << "Average total: " << get_secs(t0, t1) / steps << endl;
  cout << "=========================" << endl;

  /* free host memory */
  delete[] cdata;
  delete[] cifft;

  return 0;
}
/*
 * Report how far two complex arrays differ.
 *
 * Sums |real1 - real2| and |imag1 - imag2| over the entries selected by N
 * and dim (N entries per used dimension) and prints the total with N.
 */
void compareData(complex<double>* &data1, complex<double>* &data2, int N, int dim) {
  const int planes = (dim > 2) ? N : 1;
  const int rows = (dim > 1) ? N : 1;
  const int cols = N;

  double total = 0;
  for (int plane = 0; plane < planes; ++plane) {
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < cols; ++col) {
        // planes and rows are each N or 1, so this is the row-major offset
        const int pos = plane*planes*planes + row*rows + col;
        const double realGap = data1[pos].real() - data2[pos].real();
        const double imagGap = data1[pos].imag() - data2[pos].imag();
        total += fabs(realGap);
        total += fabs(imagGap);
      }
    }
  }

  cout << "Size " << N << " CC <--> CC diff: " << total << endl;
}

Some files were not shown because too many files have changed in this diff Show More