commit 4fa529aaeaa65f3e0af23649bcc893d693a0cc35 Author: Uldis Locans Date: Mon Oct 10 14:49:32 2016 +0200 snapshot of svn diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..9c08e39 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,174 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 3.2) +PROJECT (DKS) +SET (DKS_VERSION_MAJOR 1) +SET (DKS_VERSION_MINOR 0.1) +SET (PACKAGE \"dks\") +SET (PACKAGE_BUGREPORT \"locagoons.uldis@psi.ch\") +SET (PACKAGE_NAME \"DKS\") +SET (PACKAGE_STRING \"DKS\ 1.0.1\") +SET (PACKAGE_TARNAME \"dks\") +SET (PACKAGE_VERSION \"1.0.1\") +SET (VERSION \"1.0.1\") + +SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + +#get compiler name +#STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER}) +STRING (REGEX REPLACE ".*/" "" COMPILER_NAME ${CMAKE_CXX_COMPILER}) +MESSAGE (STATUS "Your compiler is: ${COMPILER_NAME}") +MESSAGE (STATUS "Your compiler is: ${CMAKE_CXX_COMPILER}") + +MESSAGE (STATUS "C compiler: ${CMAKE_C_COMPILER_ID}") +MESSAGE (STATUS "CXX compiler: ${CMAKE_CXX_COMPILER_ID}") + +#opencl and cuda kernel files are in the builds include directory +SET (OPENCL_KERNELS -DOPENCL_KERNELS=\\"${CMAKE_INSTALL_PREFIX}/include/\\") +MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}") + +#find boost +set (BOOSTROOT $ENV{BOOST_DIR}) +SET (Boost_USE_STATIC_LIBS OFF) +SET (Boost_USE_STATIC_RUNTIME OFF) +FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system) +IF (Boost_FOUND) + MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}") + MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}") + MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}") + INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS}) + LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) +ENDIF (Boost_FOUND) + +#enable UQTK +OPTION (USE_UQTK "Use UQTK" OFF) + + +#intel icpc compiler specific flags +IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) + + #for intel compiler turn on openmp 
and opencl + OPTION (USE_OPENCL "Use OpenCL" ON) + OPTION (USE_CUDA "Use CUDA" OFF) + OPTION (USE_MIC "Use intel MIC" ON) + + #find xiar and xild and set flags for offload build on mic + FIND_PROGRAM(XIAR xiar) + IF(XIAR) + MESSAGE(STATUS "xiar found: ${XIAR}") + SET(CMAKE_AR "${XIAR}") + ENDIF(XIAR) + MARK_AS_ADVANCED(XIAR) + SET(CMAKE_CXX_ARCHIVE_CREATE " rcs -qoffload-build ") + SET(CMAKE_C_ARCHIVE_CREATE " rcs -qoffload-build ") + + FIND_PROGRAM(XILD xild) + IF(XILD) + SET(CMAKE_LINKER "${XILD}") + ENDIF(XILD) + MARK_AS_ADVANCED(XILD) + + #set flags for openmp and opencl + #TODO: check which opencl to use: nvidia, amd, intel, apple + SET (CMAKE_CXX_FLAGS "-DDEBUG -O3 -Wall -offload -mkl -openmp -lOpenCL -lpthread -DDKS_MIC -DDKS_OPENCL -qopt-report=5 -qopt-report-phase=vec -std=c++11") + + IF (${COMPILER_NAME} STREQUAL "mpicxx" OR ${COMPILER_NAME} STREQUAL "mpiicpc") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI") + ENDIF (${COMPILER_NAME} STREQUAL "mpicxx" OR ${COMPILER_NAME} STREQUAL "mpiicpc") + +ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) + +#gnu compiler specific flags +IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) + + + OPTION (USE_OPENCL "Use OpenCL" ON) + OPTION (USE_CUDA "Use CUDA" OFF) + OPTION (USE_MIC "Use intel MIC" OFF) + + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu") + + FIND_PACKAGE(CUDA) + IF (CUDA_FOUND) + SET (USE_CUDA ON) + INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) + LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64) + + MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}") + MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64") + MESSAGE (STATUS "cuda version: ${CUDA_VERSION}") + + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas -lnvToolsExt -DDKS_CUDA") + SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false") + + SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} 
${OPENCL_KERNELS}") + + #if cuda version >= 7.0 add runtime compilation flags + IF (NOT CUDA_VERSION VERSION_LESS "7.0") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda") + ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") + + MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}") + + SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) + #set(CUDA_SEPARABLE_COMPILATION ON) + SET(BUILD_SHARED_LIBS OFF) + + ENDIF (CUDA_FOUND) + + IF (NOT CUDA_FOUND) + + MESSAGE(STATUS "CUDA not found, looking for OpenCL") + + FIND_PACKAGE(OpenCL) + IF (OpenCL_FOUND) + MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}") + MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}") + MESSAGE(STATUS "OpenCL library dir: ${OpenCL_LIBRARY}") + INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR}) + LINK_DIRECTORIES(${OpenCL_LIBRARY}) + ENDIF (OpenCL_FOUND) + + ENDIF (NOT CUDA_FOUND) + + #if mac OS and no CUDA set apple opencl flags + IF (APPLE AND NOT CUDA_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -framework opencl -lpthread -DDKS_OPENCL") + ENDIF(APPLE AND NOT CUDA_FOUND) + + #if cuda found set cuda opencl flags + IF (CUDA_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL") + ENDIF (CUDA_FOUND) + + #if cuda not found but amd opencl found set opencl flags + IF (NOT CUDA_FOUND AND OpenCL_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL") + ENDIF(NOT CUDA_FOUND AND OpenCL_FOUND) + + #if mpi compiler used set mpi flag + IF (${COMPILER_NAME} STREQUAL "mpicxx") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI") + ENDIF (${COMPILER_NAME} STREQUAL "mpicxx") + +ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) + +SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}") +MESSAGE (STATUS "Compiler flags: ${CMAKE_CXX_FLAGS}") + +ADD_SUBDIRECTORY (src) + +IF (ENABLE_TESTS) + ADD_SUBDIRECTORY (test) +ENDIF (ENABLE_TESTS) + +ADD_SUBDIRECTORY (auto-tuning) + +### write 
configure files ### +CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake ) + +### install files ### +INSTALL ( + FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake + DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}" + RENAME ${PROJECT_NAME}Config.cmake + ) diff --git a/ReadMe.first b/ReadMe.first new file mode 100644 index 0000000..e781b63 --- /dev/null +++ b/ReadMe.first @@ -0,0 +1,82 @@ +################################################################## +# +# Name: Dynamic Kernel Scheduler +# Version: 1.0 +# Author: Uldis Locans +# Contacts: locans.uldis@psi.ch +# +################################################################## + +Dynamic Kernel Scheduler is a library that provides a software layer between host application +and hardware accelerators. DKS handles communication between host and device and schedules task +execution using predefined algorithms written using CUDA and OpenCL for GPUs, and OpenMP with +offload pragmas for IntelMIC. See DKSBase class documentation for full list of functions provided +by DKS. 
+ +#####Requirements##### + +OpenMPI (Cuda aware OpenMPI enabled for full compatibility) +g++ or icpc compiler +Cuda 7.0 or higher (optional) +Nvidia or Intel OpenCL SDK (optional) +Intel MIC compilers (optional) + + +######Install###### + +#check out DKS +svn co svn+ssh://YOULOGIN@savannah02.psi.ch/repos/amas/users/adelmann/Ph.D-students/Locans/work/DKS/trunk DKS + +#set compilers to use +#supported c++ compilers: g++, icpc, mpicxx with g++ +#supported c compilers: gcc, icc, mpicc with gcc +export CXX_COMPILER=cpp_compiler_name +export CC_COMPILER=c_compiler_name + +#set dks root directory +cd DKS +export DKS_ROOT=$PWD + +#set build directory +mkdir $DKS_BUILD_DIR +cd $DKS_BUILD_DIR + +#set install directory +export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/ + +CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT + +make +make install + + +######DKS usage###### +Make install copies the include files and library files to $DKS_BUILD_DIR/build folder, lib folder +in the build directory contains libdks.a and libdksshared.so, one of these libraries can be used to link +with DKS. All the necessary include files are located in $DKS_BUILD_DIR/build/include. + +Additional flags needed for CUDA and OpenCL mode: +-lcudart -lcufft -lcublas -lnvToolsExt -lOpenCL -lnvrtc -lcuda -DDKS_CUDA -DDKS_OPENCL + +Additional flags needed for IntelMIC and OpenCL mode: +-offload -mkl -openmp -lOpenCL -DDKS_MIC -DDKS_OPENCL + +Note: always run make install, during runtime OpenCL and CUDA will search for kernel files in +$DKS_INSTALL_DIR/build/include directory for runtime compilation. 
+ +######Running DKS###### + +#running with cuda +#nvidia multi process service started for better CUDA and MPI execution + +#to start mps service (if multiple users use DKS start MPS as root) +nvidia-cuda-mps-control -d +#to stop mps service +echo quit | nvidia-cuda-mps-control + + +#running dks with MIC +#Intel Manycore Platform Software Stack (mpss) service started + +#to start mpss +service mpss start diff --git a/auto-tuning/CMakeLists.txt b/auto-tuning/CMakeLists.txt new file mode 100644 index 0000000..e3be789 --- /dev/null +++ b/auto-tuning/CMakeLists.txt @@ -0,0 +1,19 @@ +INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) +LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +#chi square kernel tests +ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) +TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES}) + +ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) +TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES}) + +IF (USE_UQTK) + ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp) + TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) +ENDIF (USE_UQTK) +#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES}) + +#test to verify search functions +ADD_EXECUTABLE(testSearch testSearch.cpp) +TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES}) diff --git a/auto-tuning/testChiSquareRT.cpp b/auto-tuning/testChiSquareRT.cpp new file mode 100644 index 0000000..01e4ae0 --- /dev/null +++ b/auto-tuning/testChiSquareRT.cpp @@ -0,0 +1,385 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +#define N0 0.25 +#define TAU 2.197019 +#define BKG 1.0 + +#define ALPHA 1.0 +#define BETA 1.0 + +using namespace std; + +void randData(double 
*data, int N, int scale = 1) { + for (int i = 0; i < N; i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is always time - t, rest of the parameters depend + * on the function. + */ +double se(double t, double lamda) { + return exp( -lamda*t ); +} + +double ge(double t, double lamda, double beta) { + return exp( -pow(lamda*t, beta) ); +} + +double sg(double t, double sigma) { + return exp( -0.5 * pow(sigma*t, 2) ); +} + +double stg(double t, double sigma) { + double sigmatsq = pow(sigma*t,2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double t, double lambda) { + double lambdat = lambda*t; + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double t, double lambda, double sigma) { + double lambdat = lambda*t; + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double t, double sigma, double beta) { + if (beta < 1.0e-3) + return 0.0; + double sigmatb = pow(sigma*t, beta); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta); +} + +double spg(double t, double lambda, double gamma, double q) { + double lam2 = lambda*lambda; + double lamt2q = t*t*lam2*q; + double rate2 = 4.0*lam2*(1.0-q)*t/gamma; + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double t, double nu, double lambda) { + double nut = nu*t; + double nuth = nu*t/2.0; + double lamt = lambda*t; + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + +double tf(double t, double phi, double nu) { + double tmp_nu = TWO_PI*nu*t; + double tmp_phi = DEG_TO_RAD * phi; + + return 
cos(tmp_nu + tmp_phi); +} + +double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + + return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +double b(double t, double phi, double nu) { + return j0(TWO_PI*nu*t + DEG_TO_RAD*phi); +} + +double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI * nu * t; + double ph = DEG_TO_RAD * phi; + + return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +double ab(double t, double sigma, double gamma) { + double gt = gamma*t; + + return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double t, double Delta0, double Rb) { + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double t, double phi, double nu, double Delta0, double Rb) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double t, double Delta0, double Rb, double nuc) { + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa); +} + +double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph); +} + + +double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, + double timeStart, double timeStep, bool mlh = false) +{ + + double result = 0.0; + for (int i = 0; i < 
Ndata; i++) { + + double t = timeStart + i*timeStep; + double d = data[i]; + double e = data[i]; + + double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]); + double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG; + + if (mlh) { + if ((d > 1.0e-9) && (fabs(theo) > 1.0e-9)) + result += 2.0 * ((theo - d) + d * log(d / theo)); + else + result += 2.0 * (theo - d); + } else { + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + } + } + + return result; +} + +double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, + double timeStart, double timeStep, bool mlh = false) +{ + + double result = 0.0; + for (int i = 0; i < Ndata; i++) { + + double t = timeStart + i*timeStep; + double d = data[i]; + double e = data[i]; + + double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]); + double ab = ALPHA * BETA; + + + double theo = ((ab+1.0)*theoVal - (ALPHA-1.0))/((ALPHA+1.0) - (ab-1.0)*theoVal); + + if (mlh) { + result += 0.0; //log max likelihood not defined here + } else { + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + } + } + + return result; +} + +int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) { + + int ierr; + + /* + * Histogram size used in tests. If autotune run kernes with sizes from 1e5 to 1e6. + * If autotune is off just run the test once (used for debuging to test the kernel) + */ + int Nstart = 1e5; + int Nstep = 1e5; + int Nend = (autotune) ? 
1e6 : 1e5; + + //parameter, function and map sizes used in tests + int Npar = 66; + int Nfnc = 2; + int Nmap = 5; + + //print test info + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Max log likelihood: " << std::boolalpha << mlh << endl; + cout << "Asymetry fit: " << std::boolalpha << asym << endl; + + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + ierr = dksbase.initDevice(); + if (ierr != DKS_SUCCESS) { + std::cout << "Device not supported!" << std::endl; + return DKS_ERROR; + } + + //get the list of different devices + std::vector devices; + dksbase.getDeviceList(devices); + std::cout << "Unique devices: " << devices.size() << std::endl; + + //create the function string to use in test + string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])"; + int map[5] = {0, 0, 1, 2, 1}; + + //runt tests from 100k to 1mil data points + for (unsigned int device = 0; device < devices.size(); device++) { + for (int Ndata = Nstart; Ndata <= Nend; Ndata += Nstep) { + + dksbase.setDefaultDevice(device); + + std::cout << "Ndata: " << Ndata << std::endl; + + //init the chi square calculations + dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap); + + //create random arrays for data, parameter and function storage + double *data = new double[Ndata]; + double *par = new double[Npar]; + double *fnc = new double[Nfnc]; + + randData(data, Ndata); + randData(par, Npar); + randData(fnc, Nfnc, 100); + + //allocate memory on device + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + + //write data, params, functions and maps to the device + dksbase.writeData(data_ptr, data, Ndata); + dksbase.writeParams(par, Npar); + dksbase.writeFunctions(fnc, Nfnc); + dksbase.writeMaps(map, Nmap); + + //set musrfit constants + dksbase.callSetConsts(N0, TAU, BKG); + dksbase.callSetConsts(ALPHA, BETA); + + //compile the program created 
with the function string + dksbase.callCompileProgram(sFnc, mlh); + + //set autotuning on/off + if (autotune) + dksbase.setAutoTuningOn(); + + //tmp values to store results and tmp values for time steps and start time + double result_gpu = 0.0; + double result_cpu = 0.0; + double dt = 1e-12; + double ts = 1e-7; + + //execute kernel on the GPU and execute the same function on the cpu + if (!asym) { + dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, Npar, Nfnc, + Nmap, ts, dt, result_gpu); + result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh); + } else { + dksbase.callLaunchChiSquare(2, data_ptr, data_ptr, Ndata, Npar, Nfnc, + Nmap, ts, dt, result_gpu); + result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh); + } + + //check the results + cout << "DKS: " << result_gpu << endl; + cout << "CPU: " << result_cpu << endl; + + //free CPU and GPU memory + dksbase.freeMemory(data_ptr, Ndata); + dksbase.freeChiSquare(); + + delete[] data; + delete[] par; + delete[] fnc; + cout << "------------------------------------------------------------" << endl; + } + } + + return DKS_SUCCESS; +} + +int main(int argc, char* argv[]) { + + bool asym = false; + bool mlh = false; + bool autotune = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-mlh")) + mlh = true; + + if (argv[i] == string("-asym")) + asym = true; + + if (argv[i] == string("-autotune")) + autotune = true; + + } + + int 
numPlatforms = 2; + const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"}; + const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"}; + + for (int i = 0; i < numPlatforms; i++) { + runTest(api[i], device[i], autotune, mlh, asym); + } + + return 0; +} diff --git a/auto-tuning/testChiSquareRTRandom.cpp b/auto-tuning/testChiSquareRTRandom.cpp new file mode 100644 index 0000000..b9e9b53 --- /dev/null +++ b/auto-tuning/testChiSquareRTRandom.cpp @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +//#define N0 0.25 +#define N0 1e-10 +#define TAU 2.197019 +#define BKG 0.05 + +using namespace std; + +typedef std::function doubleF; + +void randData(double *data, int N, int scale = 1) { + for (int i = 0; i < N; i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is always time - t, rest of the parameters depend + * on the function. 
+ */ +double se(double *t, double *lamda) { + return exp( -*lamda**t ); +} + +double ge(double *t, double *lamda, double *beta) { + return exp( -pow( (*lamda)*(*t), *beta) ); +} + +double sg(double *t, double *sigma) { + return exp( -0.5 * pow((*sigma)*(*t), 2) ); +} + +double stg(double *t, double *sigma) { + double sigmatsq = pow((*sigma)*(*t),2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double *t, double *lambda) { + double lambdat = *lambda*(*t); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double *t, double *lambda, double *sigma) { + double lambdat = *lambda*(*t); + double sigmatsq = pow(*sigma*(*t), 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double *t, double *sigma, double *beta) { + if (*beta < 1.0e-3) + return 0.0; + double sigmatb = pow(*sigma*(*t), (*beta)); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta)); +} + +double spg(double *t, double *lambda, double *gamma, double *q) { + double lam2 = (*lambda)*(*lambda); + double lamt2q = (*t)*(*t)*lam2*(*q); + double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma); + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double *t, double *nu, double *lambda) { + double nut = *nu*(*t); + double nuth = *nu*(*t)/2.0; + double lamt = *lambda*(*t); + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + +double tf(double *t, double *phi, double *nu) { + double tmp_nu = TWO_PI**nu**t; + double tmp_phi = DEG_TO_RAD * *phi; + + return cos(tmp_nu + tmp_phi); +} + +double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + + return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + 
(1.0-*alpha)*exp(-*lambdaL**t); +} + +double b(double *t, double *phi, double *nu) { + return j0(TWO_PI**nu**t + DEG_TO_RAD**phi); +} + +double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI * *nu * *t; + double ph = DEG_TO_RAD * *phi; + + return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double ab(double *t, double *sigma, double *gamma) { + double gt = *gamma**t; + + return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double *t, double *Delta0, double *Rb) { + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) { + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa); +} + +double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph); +} + +double evalf(std::vector< std::pair > func) { + + double result = 0.0; + for (auto f : func) { + switch (f.first) { + case 0: result += f.second(); break; + case 1: result -= f.second(); break; + default: result += f.second(); break; + } + } + + return result; +} + +double cpuChiSq(double *data, std::vector< std::pair > &func, int ndata, 
double *t, double dt) { + + double result = 0.0; + double ts = *t; + + for (int i = 0; i < ndata; i++) { + + *t = ts + i*dt; + double d = data[i]; + double e = data[i]; + + double vf = evalf(func); + double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG; + + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e*e); + else + result += theo * theo; + + } + return result; +} + +//create a random length from 50 - 1000 array and fill with random values from 0 to 1 +void randomParams(double *p, int np) { + for (int i = 0; i < np; i++) + p[i] = (double)rand() / RAND_MAX; +} + +//create map array of random size and fill with indexes from 0 to max, max < size of param array +void randomMaps(int *m, int nm, int max) { + for (int i = 0; i < nm; i++) + m[i] = rand() % max; +} + +int generateRandomFunction(std::vector< std::pair > &func, std::string &sfunc, + double *t, double *p, int *m, int np, int nm) +{ + + //nf defines the number of functions to generate (from 1 to 25) + int nf = rand() % 25 + 1; + + for (int n = 0; n < nf; n++) { + std::string sf = ""; + doubleF f; + + int r = rand() % 18; //choose random function to use + + int id1 = rand() % nm; + int id2 = rand() % nm; + int id3 = rand() % nm; + int id4 = rand() % nm; + int id5 = rand() % nm; + + std::string p1 = "p[m[" + to_string(id1) + "]])"; + std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])"; + std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]])"; + std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]])"; + std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])"; + + //get a random index from maps and use it to get the parameter value, bind function and parameter + //values to f, and create string for gpu in sfunc + switch (r) { 
+ case 0: + f = std::bind(se, t, &p[m[id1]]); + sf = "se(t," + p1; + break; + case 1: + f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]); + sf = "ge(t," + p2; + break; + case 2: + f = std::bind(sg, t, &p[m[id1]]); + sf = "sg(t, " + p1; + break; + case 3: + f = std::bind(stg, t, &p[m[id1]]); + sf = "stg(t, " + p1; + break; + case 4: + f = std::bind(sekt, t, &p[m[id1]]); + sf = "sekt(t, " + p1; + break; + case 5: + f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]); + sf = "lgkt(t, " + p2; + break; + case 6: + f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]); + sf = "skt(t, " + p2; + break; + case 7: + f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "spg(t, " + p3; + break; + case 8: + f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]); + sf = "rahf(t, " + p2; + break; + case 9: + f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]); + sf = "tf(t, " + p2; + break; + case 10: + f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ifld(t, " + p5; + break; + case 11: + f = std::bind(b, t, &p[m[id1]], &p[m[id2]]); + sf = "b(t, " + p2; + break; + case 12: + f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ib(t, " + p5; + break; + case 13: + f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]); + sf = "ab(t, " + p2; + break; + case 14: + f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]); + sf = "snkzf(t, " + p2; + break; + case 15: + f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]); + sf = "snktf(t, " + p4; + break; + case 16: + f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "dnkzf(t, " + p3; + break; + case 17: + f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "dnktf(t, " + p5; + break; + } + + + int sign = rand() % 2; + if (n == 0) sign = 0; + func.push_back( std::make_pair(sign, f) ); + if (n == 0) + sfunc = sf; + else { + switch(sign) { + case 0: sfunc += " + " + sf; break; + case 1: sfunc += " - " + sf; break; + 
default: sfunc += " + " + sf; break; + } + + } + } + + return nf; +} + +int main(int argc, char *argv[]) { + + + srand(time(NULL)); + + int ierr; + int Ndata = 1e6; + + bool autotune = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-autotune")) { + autotune = true; + } + + } + + //create a random number of parameters + int np = ( rand() % (1000 - 50) ) + 50; + int nm = ( rand() % (50 - 5) ) + 5; + int nf = ( rand() % (50 - 5) ) + 5; + + int *m = new int[nm]; + double *p = new double[np]; + double *f = new double[nf]; + + randomParams(p, np); + randomMaps(m, nm, np); + randomParams(f, nf); + + double dt = 1e-10; + double t = 1e-10; + std::vector< std::pair > func; + std::string sfunc; + int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm); + + //create DKS base object, set and init device / framework + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + + dksbase.initDevice(); + dksbase.initChiSquare(Ndata, np, nf, nm); + + dksbase.writeParams(p, np); + dksbase.writeFunctions(f, nf); + dksbase.writeMaps(m, nm); + + dksbase.callSetConsts(N0, TAU, BKG); + + dksbase.callCompileProgram(sfunc); + + if (autotune) + dksbase.setAutoTuningOn(); + + int oper = 0; + dksbase.getOperations(oper); + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of params: " << 
np << endl; + cout << "Number of maps: " << nm << endl; + cout << "Number of predefined functions: " << nfunc << endl; + cout << "Number of ptx instructions: " << oper << endl; + cout << "------------------------------------------------------------" << endl; + cout << sfunc << endl; + cout << "------------------------------------------------------------" << endl; + + //allocate memory on host and device device + double *data = new double[Ndata]; + randomParams(data, Ndata); + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + dksbase.writeData(data_ptr, data, Ndata); + + for (int N = 1e5; N < Ndata + 1; N += 1e5) { + double result_dks, result_cpu; + + t = 1e-10; + + dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks); + result_cpu = cpuChiSq(data, func, N, &t, dt); + + cout << "Npart: " << N << endl; + cout << "DKS: " << result_dks << endl; + cout << "CPU: " << result_cpu << endl; + + } + + dksbase.freeMemory(data_ptr, Ndata); + dksbase.freeChiSquare(); + delete[] data; + delete[] p; + delete[] f; + delete[] m; + + return 0; +} diff --git a/auto-tuning/testChiSquareRTUQTK.cpp b/auto-tuning/testChiSquareRTUQTK.cpp new file mode 100644 index 0000000..c8602fc --- /dev/null +++ b/auto-tuning/testChiSquareRTUQTK.cpp @@ -0,0 +1,618 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#include "Array1D.h" +#include "Array2D.h" +#include "Array3D.h" +#include "error_handlers.h" +#include "PCSet.h" +#include "fast_laplace.h" +#include "uqtktools.h" +#include "lreg.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +//#define N0 0.25 +#define N0 1e-10 +#define TAU 2.197019 +#define BKG 0.05 + +using namespace std; + +typedef std::function doubleF; + +void randData(double *data, int N, int scale = 1) { + for (int i = 0; i < N; 
i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. + */ +double se(double *t, double *lamda) { + return exp( -*lamda**t ); +} +//math func + math oper + memory loads +//1 + 1 + 2 + + +double ge(double *t, double *lamda, double *beta) { + return exp( -pow( (*lamda)*(*t), *beta) ); +} +//2 + 1 + 3 + +double sg(double *t, double *sigma) { + return exp( -0.5 * pow((*sigma)*(*t), 2) ); +} +//2 + 2 + 2 + +double stg(double *t, double *sigma) { + double sigmatsq = pow((*sigma)*(*t),2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double *t, double *lambda) { + double lambdat = *lambda*(*t); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double *t, double *lambda, double *sigma) { + double lambdat = *lambda*(*t); + double sigmatsq = pow(*sigma*(*t), 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double *t, double *sigma, double *beta) { + if (*beta < 1.0e-3) + return 0.0; + double sigmatb = pow(*sigma*(*t), (*beta)); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta)); +} + +double spg(double *t, double *lambda, double *gamma, double *q) { + double lam2 = (*lambda)*(*lambda); + double lamt2q = (*t)*(*t)*lam2*(*q); + double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma); + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double *t, double *nu, double *lambda) { + double nut = *nu*(*t); + double nuth = *nu*(*t)/2.0; + double lamt = *lambda*(*t); + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + 
+double tf(double *t, double *phi, double *nu) { + double tmp_nu = TWO_PI**nu**t; + double tmp_phi = DEG_TO_RAD * *phi; + + return cos(tmp_nu + tmp_phi); +} + +double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + + return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double b(double *t, double *phi, double *nu) { + return j0(TWO_PI**nu**t + DEG_TO_RAD**phi); +} + +double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI * *nu * *t; + double ph = DEG_TO_RAD * *phi; + + return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double ab(double *t, double *sigma, double *gamma) { + double gt = *gamma**t; + + return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double *t, double *Delta0, double *Rb) { + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) { + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa); +} + +double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta); + + return 
sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph); +} + +double evalf(std::vector< std::pair > func) { + + double result = 0.0; + for (auto f : func) { + switch (f.first) { + case 0: result += f.second(); break; + case 1: result -= f.second(); break; + default: result += f.second(); break; + } + } + + return result; +} + +double cpuChiSq(double *data, std::vector< std::pair > &func, int ndata, double *t, double dt) { + + double result = 0.0; + double ts = *t; + + for (int i = 0; i < ndata; i++) { + + *t = ts + i*dt; + double d = data[i]; + double e = data[i]; + + double vf = evalf(func); + double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG; + + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + + } + return result; +} + +//create a random length from 50 - 1000 array and fill with random values from 0 to 1 +void randomParams(double *p, int np) { + for (int i = 0; i < np; i++) + p[i] = (double)rand() / RAND_MAX; +} + +//create map array of random size and fill with indexes from 0 to max, max < size of param array +void randomMaps(int *m, int nm, int max) { + for (int i = 0; i < nm; i++) + m[i] = rand() % max; +} + +void generateRandomFunction(std::vector< std::pair > &func, std::string &sfunc, + double *t, double *p, int *m, int np, int nm, int nfunc) +{ + + for (int n = 0; n < nfunc; n++) { + std::string sf = ""; + doubleF f; + + int r = rand() % 18; //randomly choose one of the predefined functions to use + + int id1 = rand() % nm; //randomly select parameters to use in the function + int id2 = rand() % nm; + int id3 = rand() % nm; + int id4 = rand() % nm; + int id5 = rand() % nm; + + std::string p1 = "p[m[" + to_string(id1) + "]])"; + std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])"; + std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]])"; + std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + 
to_string(id3) + "]], p[m[" + to_string(id4) + "]])"; + std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])"; + + //get a random index from maps and use it to get the parameter value, bind function and parameter + //values to f, and create string for gpu in sfunc + switch (r) { + case 0: + f = std::bind(se, t, &p[m[id1]]); + sf = "se(t," + p1; + break; + case 1: + f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]); + sf = "ge(t," + p2; + break; + case 2: + f = std::bind(sg, t, &p[m[id1]]); + sf = "sg(t, " + p1; + break; + case 3: + f = std::bind(stg, t, &p[m[id1]]); + sf = "stg(t, " + p1; + break; + case 4: + f = std::bind(sekt, t, &p[m[id1]]); + sf = "sekt(t, " + p1; + break; + case 5: + f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]); + sf = "lgkt(t, " + p2; + break; + case 6: + f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]); + sf = "skt(t, " + p2; + break; + case 7: + f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "spg(t, " + p3; + break; + case 8: + f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]); + sf = "rahf(t, " + p2; + break; + case 9: + f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]); + sf = "tf(t, " + p2; + break; + case 10: + f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ifld(t, " + p5; + break; + case 11: + f = std::bind(b, t, &p[m[id1]], &p[m[id2]]); + sf = "b(t, " + p2; + break; + case 12: + f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ib(t, " + p5; + break; + case 13: + f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]); + sf = "ab(t, " + p2; + break; + case 14: + f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]); + sf = "snkzf(t, " + p2; + break; + case 15: + f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]); + sf = "snktf(t, " + p4; + break; + case 16: + f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = 
"dnkzf(t, " + p3; + break; + case 17: + f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "dnktf(t, " + p5; + break; + } + + + int sign = rand() % 2; + if (n == 0) sign = 0; + func.push_back( std::make_pair(sign, f) ); + if (n == 0) + sfunc = sf; + else { + switch(sign) { + case 0: sfunc += " + " + sf; break; + case 1: sfunc += " - " + sf; break; + default: sfunc += " + " + sf; break; + } + + } + } +} + +int main(int argc, char *argv[]) { + + + srand(time(NULL)); + + bool autotune = false; + bool eval = false; + bool test = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + int nord = 15; //the order of the initial, overcomplete basis + int loop = 100; + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-autotune")) { + autotune = true; + } + + if (argv[i] == string("-eval")) + eval = true; + + if (argv[i] == string("-test")) + test = true; + + if (argv[i] == string("-nord")) + nord = atoi(argv[i+1]); + + if (argv[i] == string("-loop")) + loop = atoi(argv[i+1]); + + } + + //init dks and set chi^2 constants + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + dksbase.initDevice(); + + if (autotune) + dksbase.setAutoTuningOn(); + + int nydim = 2; //the dimensionality of input + int nxdim = 5; + //UQTk arrays + Array2D xdata(loop, nxdim, 0.0); + Array2D ydata(loop, nydim, 0.0); + + Array2D xdata_pce(loop, nxdim, 0.0); + Array2D ydata_pce(loop, nydim, 0.0); + + int size = 10000; + Array2D 
xtmp(size, nxdim, 0.0); + Array2D ytmp(size, nydim, 0.0); + + if (eval || test) { + for (int l = 0; l < loop; l++) { + + int ierr; + + //create a random number of parameters + int n = rand() % 9 + 1; + int Ndata = n * 100000; //number of data points 100k to 1milj, with 100k incr. + int np = ( rand() % (1000 - 50) ) + 50; //from 50 to 1000 for different shared memory needs + int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 50 of the parameters, for different memory access + int nf = ( rand() % (50 - 5) ) + 5; //not used in the test case, but changes the shared memory + int nfunc = (rand() % (10 - 1) ) + 1; //1 to 10 user defined functions + + //allocate storage for parameters, maps and functions + int *m = new int[nm]; + double *p = new double[np]; + double *f = new double[nf]; + + //fill with random numbers + randomParams(p, np); + randomMaps(m, nm, np); + randomParams(f, nf); + + //create a random user function that can be passed to GPU kernel and evaluated on the host + double dt = 1e-10; + double t = 1e-10; + std::vector< std::pair > func; + std::string sfunc; + generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc); + + //create a data array and fill with random values + double *data = new double[Ndata]; + randomParams(data, Ndata); + + + //allocate device memory for the data and transfer to the GPU + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + dksbase.writeData(data_ptr, data, Ndata); + + //init chi^2 + dksbase.initChiSquare(Ndata, np, nf, nm); + dksbase.callSetConsts(N0, TAU, BKG); + + //write params to the devic + dksbase.writeParams(p, np); + dksbase.writeFunctions(f, nf); + dksbase.writeMaps(m, nm); + + //compile the kernel with the new function + dksbase.callCompileProgram(sfunc); + + //run the kernel on the GPU and evaluate the function on the host + double result_dks, result_cpu, tmp_result; + + ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, + t, dt, result_dks); + + if (ierr == DKS_SUCCESS) { + result_cpu = 
cpuChiSq(data, func, Ndata, &t, dt); + + std::vector config; + dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, + t, dt, tmp_result, config); + + cout << "DKS: " << result_dks << endl; + cout << "CPU: " << result_cpu << endl; + cout << "Launch parameters: " << config[0] << ", " << config[1] << endl; + cout << sfunc << endl; + cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl; + + xdata(l,0) = np; + xdata(l,1) = nm; + xdata(l,2) = nf; + xdata(l,3) = nfunc; + xdata(l,4) = Ndata; + + ydata(l,0) = config[0]; + ydata(l,1) = config[1]; + + std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl; + } else { + cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl; + cout << sfunc << endl; + } + + + //free temporary resources + delete[] m; + delete[] p; + delete[] f; + delete[] data; + dksbase.freeChiSquare(); + dksbase.freeMemory(data_ptr, Ndata); + } + } else { + //read_datafileVS(xdata, "xdata.dat"); + //read_datafileVS(ydata, "ydata.dat"); + xtmp.SetValue(0.0); + ytmp.SetValue(0.0); + read_datafileVS(xtmp, "xdata_pce.dat"); + read_datafileVS(ytmp, "ydata_pce.dat"); + for (int i = 0; i < loop; i++) { + for (int j = 0; j < nxdim; j++) + xdata(i,j) = xtmp(i,j); + for (int j = 0; j < nydim; j++) + ydata(i,j) = ytmp(i,j); + } + } + + + if (eval) { + for (int i = 0; i < nxdim; i++) { + for (int j = 0; j < loop; j++) { + xdata_pce(j,i) = xdata(j,i); + ydata_pce(j,i) = ydata(j,i); + } + } + + for (int i = 0; i < nydim; i++) { + for (int j = 0; j < loop; j++) { + xdata_pce(j,i) = xdata(j,i); + ydata_pce(j,i) = ydata(j,i); + } + } + } else { + //read_datafileVS(xdata_pce, "xdata_pce.dat"); + //read_datafileVS(ydata_pce, "ydata_pce.dat"); + xtmp.SetValue(0.0); + ytmp.SetValue(0.0); + read_datafileVS(xtmp, "xdata_pce.dat"); + read_datafileVS(ytmp, "ydata_pce.dat"); + for (int i = 0; i < loop; i++) { + for (int j = 0; j < nxdim; j++) + xdata_pce(i,j) = 
xtmp(i,j); + for (int j = 0; j < nydim; j++) + ydata_pce(i,j) = ytmp(i,j); + } + std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl; + } + + //default input settings + string which_chaos="LU"; //PC type + string msc="m"; + + Lreg* reg; + reg = new PCreg(which_chaos,nord,nxdim); + int nbas = reg->GetNbas(); + + Array2D ypc_data(xdata.XSize(), nydim, 0.0); + for (int i = 0; i < nydim; i++) { + + std::cout << "start dim " << i+1 << std::endl; + + Array1D ydata_1d(xdata_pce.XSize(), 0.0); + for (unsigned int j = 0; j < xdata_pce.XSize(); j++) + ydata_1d(j) = ydata_pce(j,i); + + std::cout << "setup data" << std::endl; + reg->SetupData(xdata_pce,ydata_1d); + + std::cout << "Comput best lambda" << std::endl; + double lambda=reg->LSQ_computeBestLambda(); + Array1D lam(nbas,lambda); + + + reg->SetWeights(lam); + + std::cout << "LSQ build regr" << std::endl; + + reg->LSQ_BuildRegr(); + std::cout << std::endl << "Lambda : " << lambda << std::endl; + + Array1D ypc; + Array1D ycheck; + Array2D ycheck_cov; + + reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov); + std::cout << std::endl << "Eval" << std::endl; + + for (unsigned int j = 0; j < xdata.XSize(); j++) + ypc_data(j,i) = ypc(j); + + } + + if (eval) { + write_datafile(xdata_pce, "xdata_pce.dat"); + write_datafile(ydata_pce, "ydata_pce.dat"); + } + + write_datafile(xdata, "xdata.dat"); + write_datafile(ydata, "ydata.dat"); + write_datafile(ypc_data, "ypc_data.dat"); + + return 0; +} diff --git a/auto-tuning/testSearch.cpp b/auto-tuning/testSearch.cpp new file mode 100644 index 0000000..e3b8efe --- /dev/null +++ b/auto-tuning/testSearch.cpp @@ -0,0 +1,22 @@ +#include + +#include "DKSBaseMuSR.h" + +/** No accelerator device is used, this test is used to confirm, that search functions + * used for auto-tuning work properly + */ + +int main() { + + DKSBaseMuSR base; + + std::cout << "Start test" << std::endl; + + base.testAutoTuning(); + + std::cout << "Test finished" << std::endl; + + + + return 0; 
+} diff --git a/cmake/DKSConfig.cmake.in b/cmake/DKSConfig.cmake.in new file mode 100644 index 0000000..d764963 --- /dev/null +++ b/cmake/DKSConfig.cmake.in @@ -0,0 +1,4 @@ +SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}") +SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include") +SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib") +SET(${PROJECT_NAME}_LIBRARY "dks") \ No newline at end of file diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 0000000..c0b848e --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,139 @@ +#.rst: +# FindOpenCL +# ---------- +# +# Try to find OpenCL +# +# Once done this will define:: +# +# OpenCL_FOUND - True if OpenCL was found +# OpenCL_INCLUDE_DIRS - include directories for OpenCL +# OpenCL_LIBRARIES - link against this library to use OpenCL +# OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2) +# OpenCL_VERSION_MAJOR - The major version of the OpenCL implementation +# OpenCL_VERSION_MINOR - The minor version of the OpenCL implementation +# +# The module will also define two cache variables:: +# +# OpenCL_INCLUDE_DIR - the OpenCL include directory +# OpenCL_LIBRARY - the path to the OpenCL library +# + +#============================================================================= +# Copyright 2014 Matthaeus G. Chajdas +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) 
+ +function(_FIND_OPENCL_VERSION) + include(CheckSymbolExists) + include(CMakePushCheckState) + set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY}) + + CMAKE_PUSH_CHECK_STATE() + foreach(VERSION "2_0" "1_2" "1_1" "1_0") + set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}") + + if(APPLE) + # prefer the header from the Framework + set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/Headers/cl.h") + if(EXISTS "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h") + set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h") + endif() + + CHECK_SYMBOL_EXISTS( + CL_VERSION_${VERSION} + ${OSX_OpenCL_HEADER} + OPENCL_VERSION_${VERSION}) + else() + CHECK_SYMBOL_EXISTS( + CL_VERSION_${VERSION} + "${OpenCL_INCLUDE_DIR}/CL/cl.h" + OPENCL_VERSION_${VERSION}) + endif() + + if(OPENCL_VERSION_${VERSION}) + string(REPLACE "_" "." VERSION "${VERSION}") + set(OpenCL_VERSION_STRING ${VERSION} PARENT_SCOPE) + string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}") + list(GET version_components 0 major_version) + list(GET version_components 1 minor_version) + set(OpenCL_VERSION_MAJOR ${major_version} PARENT_SCOPE) + set(OpenCL_VERSION_MINOR ${minor_version} PARENT_SCOPE) + break() + endif() + endforeach() + CMAKE_POP_CHECK_STATE() +endfunction() + +find_path(OpenCL_INCLUDE_DIR + NAMES + CL/cl.h OpenCL/cl.h + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV NVSDKCOMPUTE_ROOT + ENV CUDA_PATH + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + include + OpenCL/common/inc + "AMD APP/include") + +_FIND_OPENCL_VERSION() + +if(CMAKE_SIZEOF_VOID_P EQUAL 4) + find_path(OpenCL_LIBRARY + NAMES libOpenCL.so + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV CUDA_PATH + ENV NVSDKCOMPUTE_ROOT + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + "AMD APP/lib/x86" + lib/x86 + lib/Win32 + OpenCL/common/lib/Win32) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) + find_path(OpenCL_LIBRARY + NAMES libOpenCL.so + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV 
CUDA_PATH + ENV NVSDKCOMPUTE_ROOT + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + "AMD APP/lib/x86_64" + lib/x86_64 + lib/x64 + OpenCL/common/lib/x64) +endif() + +set(OpenCL_LIBRARIES ${OpenCL_LIBRARY}) +set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# Ubuntu 12.04 / Travis CI have an old version of CMake that doesn't +# support "FOUND_VAR OpenCL_FOUND". This could, in principle, be added +# at a later date. +find_package_handle_standard_args( + OpenCL FOUND_VAR OpenCL_FOUND + REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR + VERSION_VAR OpenCL_VERSION_STRING) + +mark_as_advanced( + OpenCL_INCLUDE_DIR + OpenCL_LIBRARY) diff --git a/doc/refman.pdf b/doc/refman.pdf new file mode 100644 index 0000000..41d9b03 Binary files /dev/null and b/doc/refman.pdf differ diff --git a/run_tuning_tests.sh b/run_tuning_tests.sh new file mode 100755 index 0000000..05a4e12 --- /dev/null +++ b/run_tuning_tests.sh @@ -0,0 +1,97 @@ +#!/bin/bash +export MIC_ENV_PREFIX=MIC +echo $MIC_ENV_PREFIX +export MIC_OMP_NUM_THREADS=236 +echo $MIC_OMP_NUM_THREADS +export MIC_KMP_PLACE_THREADS=59c4t0o +echo $MIC_KMP_PLACE_THREADS +export MIC_USE_2MB_BUFFERS=64K +echo $MIC_USE_2MB_BUFFERS +export MIC_KMP_AFFINITY=scatter +echo $MIC_KMP_AFFINITY + +#./testFFT3DRC 256 256 256 + +echo 'real strides divisible by 4 but not by 8' +#./testFFT3DRC 257 244 268 +#./testFFT3DRC 244 268 257 +#./testFFT3DRC 268 257 244 +#./testFFT3DRC 257 268 244 +#./testFFT3DRC 244 257 268 +#./testFFT3DRC 268 244 257 + +echo 'real strides divisible by 8 but not by 16' +#./testFFT3DRC 257 248 263 +#./testFFT3DRC 248 263 257 +#./testFFT3DRC 263 257 248 +#./testFFT3DRC 257 263 248 +#./testFFT3DRC 248 257 263 +#./testFFT3DRC 263 248 257 + +echo 'complex strides divisible by 4 but not by 8' +#./testFFT3DRC 257 246 268 +#./testFFT3DRC 246 268 257 +#./testFFT3DRC 268 257 246 +#./testFFT3DRC 257 268 246 +#./testFFT3DRC 246 257 268 +#./testFFT3DRC 268 246 257 + +echo 'complex strides divisible by 8 but 
not by 16' +#./testFFT3DRC 257 206 317 +#./testFFT3DRC 206 317 257 +#./testFFT3DRC 317 257 206 +#./testFFT3DRC 257 317 206 +#./testFFT3DRC 206 257 317 +#./testFFT3DRC 317 206 257 + +echo 'perform scaling tests' +export MIC_OMP_NUM_THREADS=1 +echo $MIC_OMP_NUM_THREADS +export MIC_KMP_PLACE_THREADS=1c1t0o +echo $MIC_KMP_PLACE_THREADS +#./testFFT3DRC 256 256 256 + +export MIC_OMP_NUM_THREADS=2 +echo $MIC_OMP_NUM_THREADS +export MIC_KMP_PLACE_THREADS=1c2t0o +echo $MIC_KMP_PLACE_THREADS +#./testFFT3DRC 256 256 256 + + + +export MIC_OMP_NUM_THREADS=3 +echo $MIC_OMP_NUM_THREADS +export MIC_KMP_PLACE_THREADS=1c3t0o +echo $MIC_KMP_PLACE_THREADS +#./testFFT3DRC 256 256 256 + + +export MIC_OMP_NUM_THREADS=4 +echo $MIC_OMP_NUM_THREADS +export MIC_KMP_PLACE_THREADS=1c4t0o +echo $MIC_KMP_PLACE_THREADS +#./testFFT3DRC 256 256 256 + +NUM_PROC="2 4 8 16 32 59" +for p in $NUM_PROC; do + t=$(($p * 4)) + echo $t + export MIC_OMP_NUM_THREADS=$t + echo $MIC_OMP_NUM_THREADS + mystring="$p" + mystring+="c4t0o" + export MIC_KMP_PLACE_THREADS=$mystring + echo $MIC_KMP_PLACE_THREADS + ./testFFT3DRC 256 256 256 + + +done + + + + + + + + + diff --git a/src/Algorithms/CMakeLists.txt b/src/Algorithms/CMakeLists.txt new file mode 100644 index 0000000..0a189b6 --- /dev/null +++ b/src/Algorithms/CMakeLists.txt @@ -0,0 +1,14 @@ +SET (_SRCS + ) + +SET (_HDRS + ChiSquareRuntime.h + ImageReconstruction.h + CollimatorPhysics.h + FFT.h + ) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/Algorithms) \ No newline at end of file diff --git a/src/Algorithms/ChiSquareRuntime.h b/src/Algorithms/ChiSquareRuntime.h new file mode 100644 index 0000000..85e8d9f --- /dev/null +++ b/src/Algorithms/ChiSquareRuntime.h @@ -0,0 +1,158 @@ +#ifndef H_CHISQUARE_RUNTIME +#define H_CHISQUARE_RUNTIME + +#include +#include +#include +#include "../DKSDefinitions.h" + +#define BLOCK_SIZE 128 + +#define FITTYPE_UNDEFINED 0 +#define FITTYPE_SINGLE_HISTO 1 +#define FITTYPE_ASYMMETRY 2 
+#define FITTYPE_MU_MINUS 3 + +class DKSBaseMuSR; + +class ChiSquareRuntime { + friend class DKSBaseMuSR; + +protected: + // single histo fit parameter + double N0_m; + double tau_m; + double bkg_m; + // asymmetry fit parameter + double alpha_m; + double beta_m; + + bool initDone_m; + void *mem_chisq_m; + void *mem_param_m; + void *mem_func_m; + void *mem_map_m; + + int numBlocks_m; + int blockSize_m; + + char *ptx_m; + + void setN0(double value) { + N0_m = value; + } + + void setTau(double value) { + tau_m = value; + } + + void setBKG(double value) { + bkg_m = value; + } + + void setAlpha(double value) { + alpha_m = value; + } + + void setBeta(double value) { + beta_m = value; + } + +public: + + /** Default constructor */ + //ChiSquareRuntime(); + + /** Default destructor */ + virtual ~ChiSquareRuntime() { }; + + virtual int compileProgram(std::string function, bool mlh = false) = 0; + virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result) = 0; + + virtual int writeParams(const double *params, int numparams) = 0; + virtual int writeFunc(const double *func, int numfunc) = 0; + virtual int writeMap(const int *map, int nummap) = 0; + virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0; + virtual int freeChiSquare() = 0; + virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0; + + /** Set N0, tau and bgk values to use for the kernel. + * If values changes between data sets this needs to be called before + * every kernel call. Returns DKS_SUCCESS. + */ + int setConsts(double N0, double tau, double bkg) { + setN0(N0); + setTau(tau); + setBKG(bkg); + + return DKS_SUCCESS; + } + + /** Set alpha and beta values to use for the kernel. + * If values changes between data sets this needs to be called before + * every kernel call. Returns DKS_SUCCESS. 
+ */ + int setConsts(double alpha, double beta) { + setAlpha(alpha); + setBeta(beta); + return DKS_SUCCESS; + } + + /** Set number of blocks and threads. + * Used to set parameters obtained from auto-tuning + */ + int setKernelParams(int numBlocks, int blockSize) { + int ierr = DKS_ERROR; + if (numBlocks > 0) { + numBlocks_m = numBlocks; + ierr = DKS_SUCCESS; + } + if (blockSize > 0) { + blockSize_m = blockSize; + ierr = DKS_SUCCESS; + } + + return ierr; + } + + /** Get the number of operations in compiled kernel. + * Count the number of operation in the ptx file for the compiled program. + */ + int getOperations(int &oper) { + + std::string ptx_str(ptx_m); + std::istringstream is(ptx_str); + + std::string line; + bool start = false; + int count = 0; + while(std::getline(is, line)) { + + //when fTheory start enable counting of operations + size_t f1 = line.find("fTheory"); + size_t f2 = line.find(".visible"); + size_t f3 = line.find(";"); + if (f1 != std::string::npos && f2 != std::string::npos) { + start = true; + continue; + } + + //exit when the new functions begins + if (start && f2 != std::string::npos) + break; + + //count opertations + if (start && f3 != std::string::npos) + count++; + } + + oper = count; + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h new file mode 100644 index 0000000..b7e8190 --- /dev/null +++ b/src/Algorithms/CollimatorPhysics.h @@ -0,0 +1,47 @@ +#ifndef H_COLLIMATOR_PHYSICS +#define H_COLLIMATOR_PHYSICS + +#include +#include +#include "../DKSDefinitions.h" + +class DKSBaseMuSR; + +class DKSCollimatorPhysics { + friend class DKSBaseMuSR; + +protected: + + int numBlocks_m; + int blockSize_m; + +public: + + virtual ~DKSCollimatorPhysics() { } + + virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0; + + virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void 
*py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) = 0; + + virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0; + + virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) = 0; + + virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1) = 0; + + virtual int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1) = 0; + + +}; + +#endif diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h new file mode 100644 index 0000000..b16e5f6 --- /dev/null +++ b/src/Algorithms/FFT.h @@ -0,0 +1,43 @@ +#ifndef H_DKS_FFT +#define H_DKS_FFT + +#include +#include + +#include "../DKSDefinitions.h" + +class DKSFFT { + +protected: + int defaultN[3]; + int defaultNdim; + + bool useDefaultPlan(int ndim, int N[3]) { + if (ndim != defaultNdim) + return false; + if (N[0] != defaultN[0] && N[1] != defaultN[1] && N[2] != defaultN[2]) + return false; + return true; + } + +public: + + virtual ~DKSFFT() { } + + virtual int setupFFT(int ndim, int N[3]) = 0; + virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; + virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0; + virtual int destroyFFT() = 0; + virtual int executeFFT(void * mem_ptr, int ndim, int N[3], + int streamId = -1, bool forward = true) = 0; + virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], + int streamId = -1) = 0; + virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int 
ndim, int N[3], + int streamId = -1) = 0; + virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0; + +}; + +#endif diff --git a/src/Algorithms/ImageReconstruction.h b/src/Algorithms/ImageReconstruction.h new file mode 100644 index 0000000..3a6266e --- /dev/null +++ b/src/Algorithms/ImageReconstruction.h @@ -0,0 +1,117 @@ +#ifndef H_IMAGERECONSTRUCTION +#define H_IMAGERECONSTRUCTION + +#include "../DKSDefinitions.h" + +#define BLOCK_SIZE 128 + +struct VoxelPosition { + float x; + float y; + float z; +}; + +struct ListEvent { + unsigned detA : 16; + unsigned detB : 16; +}; + +class ImageReconstruction { + +protected: + void *m_event_branch; + +public: + + virtual ~ImageReconstruction() { } + + /** Caluclate source. + * Places a sphere at each voxel position and calculate the avg value and std value of pixels + * that are inside this sphere. All the sphere used have the same diameter. + */ + virtual int calculateSource(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Calculate background. + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter. + */ + virtual int calculateBackground(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Caluclate source using differente sources. + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * each sphere is given by *diameter array. 
+ */ + virtual int calculateSources(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the + * smaller sphere. + */ + virtual int calculateBackgrounds(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Generate normalization. + * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel + * that updates voxel values in the image on the slope between these two detectors. + */ + virtual int generateNormalization(void *recon, void *image_position, + void *det_position, int total_det) = 0; + + + /** Calculate forward projection. + * For image reconstruction calculates forward projections. + * see recon.cpp for details + */ + virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, + void *image_position, int num_events) = 0; + + /** Calculate backward projection. + * For image reconstruction calculates backward projections. + * see recon.cpp for details + */ + virtual int backwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels) = 0; + + /** Set the voxel dimensins on device. + * + */ + virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0; + + /** Set the image edge variables on the device. + * + */ + virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0; + + /** Set the image edge1 on the device. 
+ * + */ + virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0; + + /** Set the minimum crystan in one ring values on the device. + * + */ + virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, + float min_CrystalDist_InOneRing1) = 0; + + /** Set all other required parameters for reconstruction. + * + */ + virtual int setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter) = 0; + + +}; + +#endif diff --git a/src/AutoTuning/CMakeLists.txt b/src/AutoTuning/CMakeLists.txt new file mode 100644 index 0000000..62cd2b6 --- /dev/null +++ b/src/AutoTuning/CMakeLists.txt @@ -0,0 +1,21 @@ +SET (_SRCS + DKSAutoTuning.cpp + DKSSearchStates.cpp + DKSConfig.cpp + ) + +SET (_HDRS + DKSAutoTuning.h + DKSSearchStates.h + DKSAutoTuningTester.h + DKSConfig.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/AutoTuning) \ No newline at end of file diff --git a/src/AutoTuning/DKSAutoTuning.cpp b/src/AutoTuning/DKSAutoTuning.cpp new file mode 100644 index 0000000..050d1a8 --- /dev/null +++ b/src/AutoTuning/DKSAutoTuning.cpp @@ -0,0 +1,302 @@ +#include "DKSAutoTuning.h" + +DKSAutoTuning::DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops) { + + base_m = base; + api_name_m = api; + device_name_m = device; + loops_m = loops; + + evaluate_time_m = true; +} + +DKSAutoTuning::~DKSAutoTuning() { + params_m.clear(); +} + +int DKSAutoTuning::setParameterValues(States state) { + + //if states and params don't match in size something has gone wrong + if (state.size() != params_m.size()) { + DEBUG_MSG("Parameters and states don't match!"); + return DKS_ERROR; + } + + //set the value pointed by params to value saved in state + for (unsigned int i = 0; i < params_m.size(); i++) + params_m[i].setValue(state[i].value); + + return DKS_SUCCESS; +} + +/** TODO: might need a 
better timing for GPU code */ +int DKSAutoTuning::evaluateFunction(double &value) { + + int ierr = DKS_ERROR; + DKSTimer t; + + t.init(function_name_m); + + if (evaluate_time_m) { + //run for "loop" times and return the average time. + //syncDevice() is used to make sure that nothing is running on the device before the timer starts + // and to make sure the function has completed on the device before the time stops + for (int j = 0; j < loops_m; j++) { + base_m->syncDevice(); + t.start(); + ierr = f_m(); + base_m->syncDevice(); + t.stop(); + if (ierr != DKS_SUCCESS) //exit loop if kernel execution fials + break; + } + + //returns + value = t.gettime() / loops_m; + } else { + value = fd_m(); + ierr = DKS_SUCCESS; + } + + return ierr; +} + +void DKSAutoTuning::clearParameters() { + params_m.clear(); +} + +void DKSAutoTuning::exaustiveSearch() { + + DKSTimer t; + t.init("exaustive search"); + t.start(); + + if (params_m.size() < 2) + return; + + Parameter p1 = params_m[0]; + Parameter p2 = params_m[1]; + + double time; + double mint = 1000000.0; + int minv1 = 0; + int minv2 = 0; + + //std::ofstream myfile; + //std::string filename; + //filename = "search_" + api_name_m + "_" + device_name_m + ".dat"; + //myfile.open(filename); + + for (double v1 = p1.min; v1 <= p1.max; v1 += p1.step) { + for (double v2 = p2.min; v2 <= p2.max; v2 += p2.step) { + p1.setValue(v1); + p2.setValue(v2); + + int ierr = evaluateFunction(time); + + if (ierr == DKS_SUCCESS && time < mint) { + mint = time; + minv1 = v1; + minv2 = v2; + } + if (ierr == DKS_ERROR) + time = 1; + + //myfile << time << "\t"; + } + //myfile << "\n"; + } + //myfile.close(); + + //std::cout << "Optimal launch parameters:" << std::endl; + //std::cout << mint << "\t" << minv1 << "\t" << minv2 << std::endl; + p1.setValue(minv1); + p2.setValue(minv2); + + t.stop(); + //std::cout << "exaustive search: " << t.gettime() << std::endl; +} + +void DKSAutoTuning::lineSearch() { + DKSTimer t; + t.init("line search"); + t.start(); + 
+ double time; + int ierr = DKS_ERROR; + + if (params_m.size() < 1) { + DEBUG_MSG("Need some parameters to autotune!"); + return; + } + + double mint = 1000000.0; + //loop trough parameters one parameter at a time + for (auto param : params_m) { + int minv = param.getValue(); + + //go trough all the values of the parameter, while keeping other parameters const + for (double i = param.min; i <= param.max; i += param.step) { + //adjust parameters + param.setValue(i); + + //run for "loop" times and get average + ierr = evaluateFunction(time); + + //if there was no error executing the function and time is better than previou + //min time, save the parameter configuration + if (ierr == DKS_SUCCESS && time < mint) { + mint = time; + minv = i; + } + + } //repeat + + param.setValue(minv); + } + + //DEBUG: print out the found best parameters + for (auto param : params_m) + std::cout << "Parameter " << param.name << " set to " << param.getValue() << std::endl; + + std::cout << "Best time: " << mint << std::endl; + + t.stop(); + std::cout << "Line search time: " << t.gettime() << std::endl; + +} + +void DKSAutoTuning::hillClimbing(int restart_loops) { + + DKSTimer t; + t.init("hill climbing"); + t.start(); + + std::cout << "hill climbing" << std::endl; + + int ierr; + double time_current; + double time_next; + DKSSearchStates search(params_m); + + std::cout << "start " << restart_loops << std::endl; + + for (int i = 0; i < restart_loops; i++) { + + + //init random current state + search.initCurrentState(); + + //evaluate current state + setParameterValues(search.getCurrentState()); + ierr = evaluateFunction(time_current); + + //std::cout << "Start iteration " << i+1 << std::endl; + //search.printCurrentState(time_current); + + if (ierr == DKS_ERROR) + continue; + + //statr the loop + bool topReached = false; + while(!topReached) { + + search.getNeighbours(); + + //get all the neighbors of the current state + bool neighbourFound = false; + while (!neighbourFound && 
search.nextNeighbourExists()) { + + //evaluate all the neighbors of the current state + setParameterValues(search.getNextNeighbour()); + ierr = evaluateFunction(time_next); + + //search.printNeighbour(time_next); + + if (ierr == DKS_ERROR) + std::cout << "Error evaluating function" << std::endl; + + //move to the first option that improives the solution + if (ierr == DKS_SUCCESS && time_next < time_current) { + time_current = time_next; + search.moveToNeighbour(); + neighbourFound = true; + } + + } + + //if no better option is found save the state and move to step 1 + if (!neighbourFound) { + search.saveCurrentState(time_current); + topReached = true; + } + + } + } + + std::cout << std::endl; + search.printBest(); + + t.stop(); + std::cout << "hill climbing: " << t.gettime() << std::endl; +} + +void DKSAutoTuning::simulatedAnnealing(double Tstart, double Tstep) { + + DKSTimer t; + t.init("simulated annealing"); + t.start(); + + int ierr; + double time_current; + double time_next; + + DKSSearchStates search(params_m); + + //make a random guess + search.initCurrentState(); + + //evaluate current state + setParameterValues(search.getCurrentState()); + ierr = evaluateFunction(time_current); + + if (ierr == DKS_ERROR) + return; + + for (double Temp = Tstart; Temp > 0; Temp -= Tstep) { + + search.printCurrentState(time_current); + + //calucate all the neighbours of current state + search.getNeighbours(10); + + //make a move to random neighbour and evaluate the runtime + setParameterValues(search.getRandomNeighbour()); + ierr = evaluateFunction(time_next); + + if (ierr == DKS_ERROR) + return; + + //if the solution improves move to this point else move to this point with probabily exp(-dE/T) + if (time_next < time_current) { + time_current = time_next; + search.moveToNeighbour(); + } else { + double p = (double)rand() / RAND_MAX; + double dE = time_next - time_current; + double P = exp(-dE/Temp); + + if (P > p) { + time_current = time_next; + search.moveToNeighbour(); + } 
+ } + } + + search.printCurrentState(time_current); + + t.stop(); + std::cout << "Simulated annealing: " << t.gettime() << std::endl; + +} + diff --git a/src/AutoTuning/DKSAutoTuning.h b/src/AutoTuning/DKSAutoTuning.h new file mode 100644 index 0000000..ca8f3a3 --- /dev/null +++ b/src/AutoTuning/DKSAutoTuning.h @@ -0,0 +1,103 @@ +#ifndef DKS_AUTOTUNIG +#define DKS_AUTOTUNIG + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "../DKSBase.h" +#include "../Utility/DKSTimer.h" +#include "DKSSearchStates.h" + +typedef std::vector Parameters; +typedef std::vector States; + +class DKSAutoTuning { + +private: + + bool evaluate_time_m; + + std::string api_name_m; + std::string device_name_m; + std::string function_name_m; + + std::function f_m; + std::function fd_m; + Parameters params_m; + + DKSBase *base_m; + + int loops_m; + + /** Update parameters from a state */ + int setParameterValues(States states); + + /** Evaluate the function and set execution time + * Returns DKS_ERROR if errors occured during function execution. + * Returns DKS_SUCCESS if function executed as planned. + */ + int evaluateFunction(double &value); + +public: + + /** Constructor */ + DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100); + + /** Destructor */ + ~DKSAutoTuning(); + + /** Set function to auto tune. + * Caller of setFunction is responsible to bind the correct parameters + * to the function with std::bind. + */ + void setFunction(std::function f, std::string name, bool evaluate_time = true) { + f_m = f; + function_name_m = name; + evaluate_time_m = evaluate_time; + } + + void setFunction(std::function f, std::string name, bool evaluate_time = false) { + fd_m = f; + function_name_m = name; + evaluate_time_m = evaluate_time; + } + + /** Set parameter for auto tuning. 
+ * Provide a pointer to a parameter that will be changed during auto-tuning + * and a min-max value for this element + */ + template + void addParameter(T1 *value, T1 min, T1 max, T1 step, std::string name) { + Parameter p(value, min, max, step, name); + params_m.push_back(p); + } + + /** Delete all added parameters */ + void clearParameters(); + + /** Perform exaustive search evaluating all the parameter configurations */ + void exaustiveSearch(); + + /** Perform auto-tuning. + * Perform line-search auto-tuning by variying parameters one at a time and keeping other + * parameters constant. + */ + void lineSearch(); + + /** Perform hill climbing + */ + void hillClimbing(int restart_loops = 1); + + /** Perfor simulated annealing to find the parameters */ + void simulatedAnnealing(double Tstart, double Tstep); + +}; + +#endif diff --git a/src/AutoTuning/DKSAutoTuningTester.h b/src/AutoTuning/DKSAutoTuningTester.h new file mode 100644 index 0000000..9c44309 --- /dev/null +++ b/src/AutoTuning/DKSAutoTuningTester.h @@ -0,0 +1,33 @@ +#ifndef DKS_TESTAUTOTUNING +#define DKS_TESTAUTOTUNING + +#include +#include + +class DKSAutoTuningTester { + + friend class DKSBaseMuSR; + +private: + + double x; + double y; + +public: + + DKSAutoTuningTester() { + x = 0.0; + y = 0.0; + } + + ~DKSAutoTuningTester(); + + double peaksZ() { + + double z = 3 * pow(1-x,2) * exp(-pow(x,2) - pow(y+1,2)) - 10 * (x/5 - pow(x,3) - pow(y,5)) * exp(-pow(x,2) - pow(y,2)) - (1.0/3.0) * exp( - pow(x+1,2) - pow(y,2)); + return z; + } + +}; + +#endif diff --git a/src/AutoTuning/DKSConfig.cpp b/src/AutoTuning/DKSConfig.cpp new file mode 100644 index 0000000..645c6ab --- /dev/null +++ b/src/AutoTuning/DKSConfig.cpp @@ -0,0 +1,163 @@ +#include "DKSConfig.h" + +DKSConfig::DKSConfig() { + + //get home directory + homeset_m = true; + if ((homedir_m = getenv("HOME")) == NULL) + homeset_m = false; + + loadConfigFile(); + +} + +DKSConfig::~DKSConfig() { + //delete tree_m; + + saveConfigFile(); +} + + +int 
DKSConfig::loadConfigFile() { + + int ierr = DKS_ERROR; + /* + if (homeset_m) { + //check if $HOME/.config/DKS exists + std::string filename = homedir_m + config_dir + config_file; + std::cout << "Check for: " << filename << std::endl; + if (fs::exists(filename)) { + try { + pt::read_xml(filename, tree_m); + treeloaded_m = true; + ierr = DKS_SUCCESS; + } catch (std::exception &e) { + DEBUG_MSG("Error loading autotuning file!"); + treeloaded_m = false; + ierr = DKS_ERROR; + } + } + } + */ + return ierr; +} + + +int DKSConfig::saveConfigFile() { + + int ierr = DKS_ERROR; + /* + std::string savedir = homedir_m + config_dir; + std::string savefile = homedir_m + config_dir + config_file; + + std::cout << savedir << std::endl; + std::cout << savefile << std::endl; + + if (homeset_m) { + //check if $HOME/.config/DKS directory exists, if not create + bool homecreated = false; + fs::path p (savedir); + if (!fs::is_directory(p)) + homecreated = fs::create_directory(p); + + try { + if (homecreated) { + pt::write_xml(savefile, tree_m); + ierr = DKS_SUCCESS; + } + } catch(std::exception &e) { + ierr = DKS_ERROR; + } + + } + */ + return ierr; +} + + +int DKSConfig::addConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int value) { + + + //keys to acces data in the tree + std::string device_name = name; + device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end()); + std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func; + std::string parameter = key + ".parameter"; + std::string attr_size = ".size"; + std::string attr_param = "." 
+ param; + + //tmp node where new attributes are cteated in case the attribute doesn't exist in the tree + pt::ptree *tmp; + bool newNode = true; + + //loop trough all the items in the node and see if new param needs to be created + //or old one updated + boost::optional< pt::ptree& > child = tree_m.get_child_optional(key); + if (child) { + BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) { + int oldsize = v.second.get(attr_size,-1); + + //if param with the same size already exists in the tree save pointer to this + if (size == oldsize) { + tmp = &v.second; + newNode = false; + } + } + } + + //if parameter doesnt exist with this size, create a new parameter + if (newNode) { + tmp = new pt::ptree(); + tmp->add(attr_size, size); + tmp->add(attr_param, value); + tree_m.add_child(parameter, *tmp); + } else { + //if parameter exists update the parameter value + tmp->put(attr_param, value); + } + + return DKS_SUCCESS; +} + +int DKSConfig::getConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int &value) { + + //get the value of the tree, default to -1 if value doesn't exist + int ierr = DKS_SUCCESS; + + //define key and attribute values to find parameters in the tree + std::string device_name = name; + device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end()); + std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func; + std::string attr_size = ".size"; + std::string attr_param = "." 
+ param; + + float maxDist = std::numeric_limits::max(); + + //check if the parameters exist + boost::optional< pt::ptree& > child = tree_m.get_child_optional(key); + if (child) { + //loop trough parameters and get the one that is closes to the size specified + BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) { + int param_size = v.second.get(attr_size,-1); //get parameter size + if (param_size > 0) { // if param_size is -1 param is not defined correctly and not usable + float dist = abs(param_size - size); + if (dist < maxDist) { + value = v.second.get(attr_param,-1); + maxDist = dist; + } + } + } + } else { + value = -1; + ierr = DKS_ERROR; + } + + return ierr; +} + + + diff --git a/src/AutoTuning/DKSConfig.h b/src/AutoTuning/DKSConfig.h new file mode 100644 index 0000000..bf7255a --- /dev/null +++ b/src/AutoTuning/DKSConfig.h @@ -0,0 +1,69 @@ +/** Class to save and load DKS autotunning configs. + * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml. + * Uses boost xml_parser to read and write the xml file and boost property tree to store + * the xml content. + */ + +#ifndef DKS_CONFIG +#define DKS_CONFIG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../DKSDefinitions.h" + +namespace pt = boost::property_tree; +namespace fs = boost::filesystem; + +const std::string config_dir = "/.config/DKS"; +const std::string config_file = "/autotuning.xml"; + +class DKSConfig { + +private: + + pt::ptree tree_m; + const char *homedir_m; + bool homeset_m; + bool treeloaded_m; + +public: + + /** Constructor, set home variable. 
+ * If home directory is not set config file can not be read or saved + */ + DKSConfig(); + + ~DKSConfig(); + + /** Load autotuinig.xml into tree variable if this file exists */ + int loadConfigFile(); + + /** Save autotuning.xml file */ + int saveConfigFile(); + + /** Add config parameter to tree */ + int addConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int value); + + /** Get config parameter from the tree */ + int getConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int &value); + + +}; + +#endif diff --git a/src/AutoTuning/DKSSearchStates.cpp b/src/AutoTuning/DKSSearchStates.cpp new file mode 100644 index 0000000..4bfcaba --- /dev/null +++ b/src/AutoTuning/DKSSearchStates.cpp @@ -0,0 +1,233 @@ +#include "DKSSearchStates.h" + +/** set the current state so that number of parameters and parameter bounds are known */ +DKSSearchStates::DKSSearchStates(Parameters params) { + + for (auto p : params) { + State s; + s.value = p.getValue(); + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } + + neighbour_state_m.resize(current_state_m.size()); + best_state_m.resize(current_state_m.size()); + + best_time_m = std::numeric_limits::max(); + + next_neighbour_m = -1; + + srand(time(NULL)); + +} + +DKSSearchStates::~DKSSearchStates() { + current_state_m.clear(); + neighbour_state_m.clear(); + best_state_m.clear(); + neighbours_m.clear(); +} + +/** Get all the possible neighbours of the current state */ +void DKSSearchStates::getNeighbours(int dist) { + + std::vector< std::vector > values; + + for (auto state : current_state_m) { + std::vector s; + + for (int d = dist; d > 0; d--) { + if (state.value - d*state.step >= state.min) + s.push_back(state.value - state.step); + } + + s.push_back(state.value); + + for (int d = 1; d < dist + 1; d++) { + if 
(state.value + d*state.step <= state.max) + s.push_back(state.value + state.step); + } + + values.push_back(s); + } + + + std::vector< std::vector > s {{}}; + for (auto& u : values) { + std::vector< std::vector > r; + for(auto& x : s) { + for( auto y : u) { + r.push_back(x); + r.back().push_back(y); + } + } + s.swap(r); + } + + //get current state values + std::vector current; + for (auto state : current_state_m) + current.push_back(state.value); + s.erase(std::remove(s.begin(), s.end(), current)); + + neighbours_m.clear(); + neighbours_m = s; + next_neighbour_m = 0; +} + +void DKSSearchStates::setCurrentState(std::vector current_state) { + + current_state_m.clear(); + for (auto& p : current_state) { + State s; + s.value = p.getValue(); + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } +} + +void DKSSearchStates::setCurrentState(std::vector current_state) { + + current_state_m.clear(); + for (auto& p : current_state) { + State s; + s.value = p.value; + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } +} + +void DKSSearchStates::initCurrentState() { + + //go trough parameters in current state and generate a new random value + for (auto& s : current_state_m) { + //get number of total values + int values = (s.max - s.min) / s.step + 1; + + int r = rand() % values; + + s.value = s.min + r * s.step; + } + + getNeighbours(); +} + +States DKSSearchStates::getCurrentState() { + return current_state_m; +} + +States DKSSearchStates::getNextNeighbour() { + + //check if there are ant neighbours to move on + if (next_neighbour_m < (int)neighbours_m.size()) { + + //get the vector of values for each parameters in the neighbour cell + std::vector neighbour_values = neighbours_m[next_neighbour_m]; + + //set the values to neighbour_state_m + for (unsigned int n = 0; n < neighbour_state_m.size(); n++) + neighbour_state_m[n].value = neighbour_values[n]; + + } + + next_neighbour_m++; + return 
neighbour_state_m; + +} + +States DKSSearchStates::getRandomNeighbour() { + + int rand_neighbour = rand() % (int)neighbours_m.size(); + + //get the vector of values for each parameters in the neighbour cell + std::vector neighbour_values = neighbours_m[rand_neighbour]; + + //set the values to neighbour_state_m + for (unsigned int n = 0; n < neighbour_state_m.size(); n++) + neighbour_state_m[n].value = neighbour_values[n]; + + next_neighbour_m = rand_neighbour + 1; + return neighbour_state_m; + +} + +bool DKSSearchStates::nextNeighbourExists() { + bool neighbourExists = false; + if (next_neighbour_m < (int)neighbours_m.size()) + neighbourExists = true; + + return neighbourExists; +} + +void DKSSearchStates::moveToNeighbour() { + + for (unsigned int i = 0; i < current_state_m.size(); i++) + current_state_m[i].value = neighbour_state_m[i].value; + + //getNeighbours(); + +} + +void DKSSearchStates::saveCurrentState(double current_time) { + + if (current_time < best_time_m) { + for (unsigned int i = 0; i < current_state_m.size(); i++) { + best_state_m[i].value = current_state_m[i].value; + best_state_m[i].min = current_state_m[i].min; + best_state_m[i].max = current_state_m[i].max; + best_state_m[i].step = current_state_m[i].step; + } + + best_time_m = current_time; + } + +} + + +void DKSSearchStates::printCurrentState(double time) { + std::cout << "Current state: "; + for (auto s : current_state_m) + std::cout << s.value << "\t"; + std::cout << time << std::endl; + +} + +void DKSSearchStates::printInfo() { + + std::cout << "Current state: "; + for (auto s : current_state_m) + std::cout << s.value << "\t"; + std::cout << std::endl; + + std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): "; + if (next_neighbour_m > 0) { + for (auto s : neighbour_state_m) + std::cout << s.value << "\t"; + } + std::cout << std::endl; + +} + +void DKSSearchStates::printNeighbour(double time) { + std::cout << "Current neighbour (" << 
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

/** Handle to one tunable variable.
 * Holds a pointer to an int or a double owned by the caller together with the search
 * range [min, max] and the step between candidate values. setValue() writes through the
 * pointer, so copies of a Parameter all modify the same variable. The pointed-to variable
 * must outlive every copy of the Parameter.
 */
class Parameter {

private:
  int *ivalue;      //target when type == DKS_INT, nullptr otherwise
  double *dvalue;   //target when type == DKS_DOUBLE, nullptr otherwise
  VALUE_TYPE type;

public:
  double min;
  double max;
  double step;
  std::string name;

  /** Tune an int variable, the bounds and step are stored as double. */
  Parameter(int *_value, int _min, int _max, int _step, std::string _name)
    : ivalue(_value), dvalue(nullptr), type(DKS_INT),
      min(static_cast<double>(_min)), max(static_cast<double>(_max)),
      step(static_cast<double>(_step)), name(_name) { }

  /** Tune a double variable. */
  Parameter(double *_value, double _min, double _max, double _step, std::string _name)
    : ivalue(nullptr), dvalue(_value), type(DKS_DOUBLE),
      min(_min), max(_max), step(_step), name(_name) { }

  /** Write v to the tuned variable, converted to the variable's type
   * (conversion to int truncates). Does nothing if the target pointer is null.
   */
  template <typename T>
  void setValue(T v) {
    if (type == DKS_INT && ivalue != nullptr)
      *ivalue = static_cast<int>(v);
    if (type == DKS_DOUBLE && dvalue != nullptr)
      *dvalue = static_cast<double>(v);
  }

  /** Current value of the tuned variable as double, -1.0 if it can not be read. */
  double getValue() const {
    switch (type) {
    case DKS_INT:
      if (ivalue != nullptr)
        return static_cast<double>(*ivalue);
      break;
    case DKS_DOUBLE:
      if (dvalue != nullptr)
        return *dvalue;
      break;
    };
    return -1.0;
  }

};
+public: + + /** Constructor alwats takes params array as variable. + * Params array is needed to know how many params will be searched and what are thou bounds + * of each parameter. + */ + DKSSearchStates(Parameters params); + + ~DKSSearchStates(); + + /** Set current state using parameter vector */ + void setCurrentState(Parameters current_state); + + /** set current state using the state vector */ + void setCurrentState(States current_state); + + /** init random current state */ + void initCurrentState(); + + /** get current state */ + States getCurrentState(); + + /** get next neighbour state. + * if there are no next neighbore stay at the curretn neighbour + */ + States getNextNeighbour(); + + /** get random neighbour state */ + States getRandomNeighbour(); + + /** calculate all the neighbour states */ + void getNeighbours(int dist = 1); + + /** Chech if there are more neighbours to evaluate + * Return true if more neighbors exist, false if we are at the last neighbour + */ + bool nextNeighbourExists(); + + /** move to next neighbour. + * set the current state as the next neighbour, + * calculate the neighbours of the new current state. + */ + void moveToNeighbour(); + + /** Save the current state and the evaluation time of the current state. + * If evaluation time of the current state is better than the evaluation time of the + * best state, save the current state as best. + */ + void saveCurrentState(double current_time); + + + //Print functions - mostly usefull for debugging purposes, or for benchmark runs to print the + //status of the search + + /** Print current state. + * cout the current state. Mostly used for debuging purposes + */ + void printCurrentState(double time = 0.0); + + /** Print current neighbour info */ + void printNeighbour(double time = 0.0); + + /** Print info. 
+ * Print the whole info about the search: current state, current neighbour, total neighbors + */ + void printInfo(); + + /** Print the best saved state */ + void printBest(); + +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..df12a31 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,130 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 2.8) + +SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +MACRO (ADD_SOURCES ) + FILE (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") + FOREACH (_src ${ARGN}) + IF (_relPath) + LIST (APPEND DKS_SRCS "${_relPath}/${_src}") + ELSE () + LIST (APPEND DKS_SRCS "${_src}") + ENDIF () + ENDFOREACH () + IF (_relPath) + # propagate SRCS to parent directory + SET (DKS_SRCS ${DKS_SRCS} PARENT_SCOPE) + ENDIF () +ENDMACRO () + +MACRO (ADD_HEADERS ) + FILE (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") + FOREACH (_hdr ${ARGN}) + IF (_relPath) + LIST (APPEND DKS_HDRS "${_relPath}/${_hdr}") + ELSE () + LIST (APPEND DKS_HDRS "${_hdr}") + ENDIF () + ENDFOREACH () + IF (_relPath) + # propagate HDRS to parent directory + SET (DKS_HDRS ${DKS_HDRS} PARENT_SCOPE) + ENDIF () +ENDMACRO () + + +SET (DKS_BASEDIR_HDRS + DKSBase.h + DKSDefinitions.h + ) + +SET (DKS_BASEDIR_SRCS + DKSBase.cpp + ) + +IF (USE_CUDA OR USE_OPENCL) + SET (DKS_BASEDIR_HDRS + ${DKS_BASEDIR_HDRS} + DKSBaseMuSR.h + ) + + SET (DKS_BASEDIR_SRCS + ${DKS_BASEDIR_SRCS} + DKSBaseMuSR.cpp + ) +ENDIF (USE_CUDA OR USE_OPENCL) + +IF (USE_CUDA) + SET (DKS_BASEDIR_HDRS + ${DKS_BASEDIR_HDRS} + DKSImageReconstruction.h + ) + + SET (DKS_BASEDIR_SRCS + ${DKS_BASEDIR_SRCS} + DKSImageReconstruction.cpp + ) +ENDIF (USE_CUDA) + +ADD_HEADERS (${DKS_BASEDIR_HDRS}) +ADD_SOURCES (${DKS_BASEDIR_SRCS}) + +MESSAGE (STATUS "HEADERS: ${DKS_BASEDIR_HDRS}") +MESSAGE (STATUS "SOURCES: ${DKS_BASEDIR_SRCS}") + +#add only those source files that will be used +IF (USE_OPENCL) + MESSAGE (STATUS "Add OpenCL source files") + ADD_SUBDIRECTORY 
(OpenCL) +ENDIF (USE_OPENCL) + +IF (USE_CUDA) + MESSAGE (STATUS "Add CUDA source files") + ADD_SUBDIRECTORY (CUDA) +ENDIF (USE_CUDA) + +IF (USE_MIC) + MESSAGE (STATUS "Add MIC source files") + ADD_SUBDIRECTORY (MIC) +ENDIF (USE_MIC) + +ADD_SUBDIRECTORY (Utility) +ADD_SUBDIRECTORY (AutoTuning) +ADD_SUBDIRECTORY (Algorithms) + +IF (USE_CUDA) + CUDA_ADD_LIBRARY(dks ${DKS_SRCS}) + CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) + + IF (USE_UQTK) + TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + ELSE (USE_UQTK) + TARGET_LINK_LIBRARIES(dks cudadevrt) + TARGET_LINK_LIBRARIES(dksshared cudadevrt) + ENDIF (USE_UQTK) + +ELSE (USE_CUDA) + MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}") + ADD_LIBRARY(dks ${DKS_SRCS}) + ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) + + IF (USE_UQTK) + TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + ELSE (USE_UQTK) + TARGET_LINK_LIBRARIES(dks) + TARGET_LINK_LIBRARIES(dksshared) + ENDIF(USE_UQTK) + +ENDIF (USE_CUDA) + +INSTALL(TARGETS dks DESTINATION lib) +INSTALL(TARGETS dksshared DESTINATION lib) +INSTALL(FILES ${DKS_BASEDIR_HDRS} DESTINATION include) + +#IF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc")) +# INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/libdksMIC.a DESTINATION build/lib) +#ENDIF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc")) + + + diff --git a/src/CUDA/CMakeLists.txt b/src/CUDA/CMakeLists.txt new file mode 100644 index 0000000..977d570 --- /dev/null +++ b/src/CUDA/CMakeLists.txt @@ -0,0 +1,35 @@ +SET (_HDRS + CudaBase.cuh + CudaFFT.cuh + CudaGreensFunction.cuh + 
CudaChiSquare.cuh + CudaCollimatorPhysics.cuh + CudaImageReconstruction.cuh + CudaChiSquareRuntime.cuh + ) + +SET (_SRCS + CudaBase.cu + CudaFFT.cu + CudaGreensFunction.cu + CudaChiSquare.cu + CudaCollimatorPhysics.cu + CudaImageReconstruction.cu + CudaChiSquareRuntime.cu +) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES(${_SRCS}) +ADD_HEADERS(${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/CUDA) + +SET (_KERNELS + NVRTCKernels/CudaChiSquareKernel.cu + ) + +INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels) + diff --git a/src/CUDA/CMakeListsLibcuda.txt b/src/CUDA/CMakeListsLibcuda.txt new file mode 100644 index 0000000..a94b877 --- /dev/null +++ b/src/CUDA/CMakeListsLibcuda.txt @@ -0,0 +1,25 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 2.8) + +FIND_PACKAGE(CUDA REQUIRED) + +SET (CUDA_NVCC_FLAGS "-arch=sm_30") + +SET(LIB_TYPE STATIC) + +SET (DKS_CUDA_HDRS + CudaBase.cuh + CudaFFT.cuh + CudaGreensFunction.cuh + ) + +SET (DKS_CUDA_SRCS + CudaBase.cu + CudaFFT.cu + CudaGreensFunction.cu +) + +INCLUDE_DIRECTORIES ( + ${CMAKE_CURRENT_SOURCE_DIR} +) + +CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS}) \ No newline at end of file diff --git a/src/CUDA/CudaBase.cu b/src/CUDA/CudaBase.cu new file mode 100644 index 0000000..f352cf2 --- /dev/null +++ b/src/CUDA/CudaBase.cu @@ -0,0 +1,386 @@ +#include "CudaBase.cuh" + +//=====================================// +//============Cuda kernels=============// +//=====================================// + +__global__ void initcuRandState(curandState *state, int size, int seed = 0) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + curand_init(seed + idx, 0, 0, &state[idx]); + } + +} + + +//=====================================// +//==========Private functions==========// +//=====================================// + + +//====================================// +//==========Public functions==========// +//====================================// + +CudaBase::CudaBase() { + + 
currentStream = -1;
+  cudaStreams.reserve(10);
+  // No curand state array exists yet. Give the pointer a defined value so
+  // that cuda_getCurandStates() never returns an indeterminate pointer.
+  defaultRndState = NULL;
+  defaultRndSet = -1;
+
+}
+
+/*
+  Destroy all registered streams and release the curand state array.
+*/
+CudaBase::~CudaBase() {
+
+  cuda_deleteStreams();
+  cuda_deleteCurandStates();
+
+}
+
+/*
+  create curandStates
+
+  Allocates "size" curandState entries on the device and initialises entry i
+  with seed (current time + i). A previously created array is released first.
+  Return: DKS_SUCCESS, or DKS_ERROR when size is not positive, the allocation
+  fails or the initialisation kernel cannot be launched.
+*/
+int CudaBase::cuda_createCurandStates(int size) {
+
+  if (defaultRndSet == 1)
+    cuda_deleteCurandStates();
+
+  // a non-positive element count can neither be allocated nor initialised
+  if (size <= 0) {
+    DEBUG_MSG("Invalid number of curand states: " << size);
+    return DKS_ERROR;
+  }
+
+  int threads = 128;
+  int blocks = size / threads + 1;
+  int seed = static_cast<int>(time(NULL));
+
+  // compute the byte count in size_t so a large "size" cannot overflow int
+  cudaError_t cerror = cudaMalloc(&defaultRndState,
+                                  sizeof(curandState) * static_cast<size_t>(size));
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to allocate curand states, cuda error: " << cerror);
+    defaultRndState = NULL;
+    return DKS_ERROR;
+  }
+
+  initcuRandState<<<blocks, threads>>>(defaultRndState, size, seed);
+
+  // a kernel launch returns no status, launch failures are reported here
+  cerror = cudaGetLastError();
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to launch curand init kernel, cuda error: " << cerror);
+    cudaFree(defaultRndState);
+    defaultRndState = NULL;
+    return DKS_ERROR;
+  }
+
+  defaultRndSet = 1;
+
+  return DKS_SUCCESS;
+}
+
+/*
+  delete curandStates
+  Frees the state array if one was created; calling it again is harmless.
+*/
+int CudaBase::cuda_deleteCurandStates() {
+  if (defaultRndSet == 1) {
+    cudaFree(defaultRndState);
+    defaultRndState = NULL;
+    defaultRndSet = -1;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  get device pointer to the curand states (NULL when none were created)
+*/
+curandState* CudaBase::cuda_getCurandStates() {
+  return defaultRndState;
+}
+
+/*
+  add cuda stream
+  Creates a new stream, appends it to the stream list and stores its list
+  index in streamId.
+*/
+int CudaBase::cuda_createStream(int &streamId) {
+
+  cudaStream_t tmpStream;
+  cudaError_t cerror;
+
+  cerror = cudaStreamCreate(&tmpStream);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to create new CUDA stream, cuda error: " << cerror);
+    return DKS_ERROR;
+  }
+
+  cudaStreams.push_back(tmpStream);
+  streamId = static_cast<int>(cudaStreams.size()) - 1;
+
+  return DKS_SUCCESS;
+}
+
+/*
+  add existing stream to list
+  The stream is destroyed together with the other streams in
+  cuda_deleteStreams().
+*/
+int CudaBase::cuda_addStream(cudaStream_t tmpStream, int &streamId) {
+  cudaStreams.push_back(tmpStream);
+  streamId = static_cast<int>(cudaStreams.size()) - 1;
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  delete stream
+  Not implemented: always returns DKS_ERROR and leaves the stream list untouched.
+*/
+int CudaBase::cuda_deleteStream(int id) {
+  //TODO: lets see if this is necessary, currently do nothing
+  return DKS_ERROR;
+}
+
+/*
+  delete all streams
+  Destroys every stream in the list and switches back to the default stream.
+*/
+int CudaBase::cuda_deleteStreams() {
+
+  //delete all cuda streams
+  for (size_t i = 0; i < cudaStreams.size(); i++) {
+    cudaStreamDestroy(cudaStreams[i]);
+  }
+  cudaStreams.clear();
+  currentStream = -1;
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  set stream id
+  The id is stored as given; it is not checked against the stream list.
+*/
+int CudaBase::cuda_setStream(int id) {
+  currentStream = id;
+  return DKS_SUCCESS;
+}
+
+/*
+  return stream id
+  Return: the id last passed to cuda_setStream, or -1 for the default stream
+*/
+int CudaBase::cuda_getStreamId() {
+  return currentStream;
+}
+
+/*
+  set default stream as the stream to use
+*/
+int CudaBase::cuda_defaultStream() {
+  currentStream = -1;
+  return DKS_SUCCESS;
+}
+
+/*
+  Return: number of streams currently held in the stream list
+*/
+int CudaBase::cuda_numberOfStreams() {
+  return static_cast<int>(cudaStreams.size());
+}
+
+/*
+  get stream handle by list index
+  An id that does not index the stream list (including -1, the marker this
+  class uses for the default stream) yields the CUDA default stream instead of
+  reading past the end of the list.
+*/
+cudaStream_t CudaBase::cuda_getStream(int id) {
+  if (id < 0 || id >= cuda_numberOfStreams())
+    return 0;
+  return cudaStreams[id];
+}
+
+/*
+  get the cublas handle stored in this object
+  NOTE(review): nothing in this file creates defaultCublas; confirm it is
+  initialised elsewhere before callers use the returned handle.
+*/
+cublasHandle_t CudaBase::cuda_getCublas() {
+  return defaultCublas;
+}
+
+/*
+  get all available cuda devices
+  Prints name and PCI ids of every CUDA device to stdout.
+*/
+int CudaBase::cuda_getDevices() {
+
+  std::cout << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::cout << "=============CUDA=============" << std::endl;
+  std::cout << "==============================" << std::endl;
+
+  // treat a failed query as "no devices" instead of looping over garbage
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+
+  std::cout << ndev << std::endl;
+
+
+  for (int i = 0; i < ndev; i++) {
+
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
+      continue;
+
+    std::cout << "Device " << i+1 << ":" << std::endl;
+    std::cout << "Name: " << prop.name << std::endl;
+    std::cout << "PCI bus id: " << prop.pciBusID << std::endl;
+    std::cout << "PCI device id: " << prop.pciDeviceID << std::endl;
+    std::cout << "PCI domain id: " << prop.pciDomainID << std::endl;
+    std::cout << "==============================" << std::endl;
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+
+/*
+  get number of cuda devices
+  ndev is set to 0 when the query fails; the return value stays DKS_SUCCESS.
+*/
+int CudaBase::cuda_getDeviceCount(int &ndev) {
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+  return DKS_SUCCESS;
+}
+
+/*
+  get name of the device that is currently active
+  Return: DKS_ERROR when no device is available or its properties cannot be read
+*/
+int CudaBase::cuda_getDeviceName(std::string &device_name) {
+
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess || ndev <= 0)
+    return DKS_ERROR;
+
+  int device = 0;
+  cudaDeviceProp prop;
+  if (cudaGetDevice(&device) != cudaSuccess ||
+      cudaGetDeviceProperties(&prop, device) != cudaSuccess)
+    return DKS_ERROR;
+
+  device_name = prop.name;
+  return DKS_SUCCESS;
+}
+
+/*
+  select the cuda device to use
+  An index outside [0, number of devices) falls back to device 0.
+  Return: DKS_ERROR when there is no device or the device cannot be selected
+*/
+int CudaBase::cuda_setDevice(int device) {
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+
+  std::cout << "Init: " << device << "\t" << ndev << std::endl;
+
+  if (ndev <= 0)
+    return DKS_ERROR;
+
+  int target = 0;
+  if (device >= 0 && device < ndev) {
+    target = device;
+    // report the device that is selected, not the device count
+    std::cout << "set device to: " << device << std::endl;
+  }
+
+  cudaError_t cerror = cudaSetDevice(target);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to set device, cuda error: " << cerror);
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  get one device index per distinct device name
+  Appends to devices the index of the first device found for every name.
+*/
+int CudaBase::cuda_getUniqueDevices(std::vector<int> &devices) {
+
+  std::vector< std::string > names;
+
+  int ndev = 0;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    ndev = 0;
+
+  for (int i = 0; i < ndev; i++) {
+
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess)
+      continue;
+
+    // the search on the still empty list accepts the first device as well
+    std::string target = prop.name;
+    if (std::find(names.begin(), names.end(), target) == names.end()) {
+      devices.push_back(i);
+      names.push_back(target);
+    }
+  }
+
+  return DKS_SUCCESS;
+}
+
+
+/*
+  set up cuda device
+  Placeholder: only prints a message.
+*/
+int CudaBase::cuda_setUp() {
+
+  std::cout << "set up" << std::endl;
+  return DKS_SUCCESS;
+}
+
+/*
+  allocate memory on cuda device
+  size is given in bytes. ierr receives DKS_SUCCESS or DKS_ERROR.
+  Return: device pointer, NULL when the allocation failed
+*/
+void * CudaBase::cuda_allocateMemory(size_t size, int &ierr) {
+
+  void * mem_ptr = NULL;
+
+  cudaError_t cerror = cudaMalloc((void **) &mem_ptr, size);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Failed to allocate memory, cuda error: " << cerror);
+    std::cout << "Error: " << cudaGetErrorString(cerror) << std::endl;
+    mem_ptr = NULL;
+    ierr = DKS_ERROR;
+  } else {
+    ierr = DKS_SUCCESS;
+  }
+
+  return mem_ptr;
+}
+
+/*
+  Info: free memory on device
+  Return: success or error code
+*/
+int CudaBase::cuda_freeMemory(void * mem_ptr) {
+  cudaError_t cerror = cudaFree(mem_ptr);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Error freeing memory, cuda error: " << cerror);
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: free page locked host memory
+  Return: success or error code
+*/
+int CudaBase::cuda_freeHostMemory(void * mem_ptr) {
+  cudaError_t cerror = cudaFreeHost(mem_ptr);
+  if (cerror != cudaSuccess) {
+    DEBUG_MSG("Error freeing host memory, cuda error: " << cerror);
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: allocate memory and
write data (push)
+  Return: pointer to memory object
+*/
+/*
+  Disabled implementation, kept for reference; CudaBase.cuh provides
+  cuda_pushData as a member template.
+
+  void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) {
+
+  void * mem_ptr;
+  mem_ptr = cuda_allocateMemory(size, ierr);
+
+  if (ierr == DKS_SUCCESS)
+  ierr = cuda_writeData(mem_ptr, in_data, size);
+
+  return mem_ptr;
+  }
+*/
+
+/*
+  Info: read data and free memory (pull)
+  Return: success or error code
+*/
+/*
+  Disabled implementation, kept for reference; CudaBase.cuh provides
+  cuda_pullData as a member template.
+
+  int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) {
+
+  ierr = cuda_readData(mem_ptr, out_data, size);
+  if (ierr == DKS_SUCCESS)
+  ierr = cuda_freeMemory(mem_ptr);
+  else
+  return DKS_ERROR;
+
+
+  if (ierr == DKS_SUCCESS)
+  return DKS_SUCCESS;
+  else
+  return DKS_ERROR;
+  }
+*/
+
+/*
+  Info: execute function
+  Placeholder: only prints a message.
+  Return: success or error code
+*/
+int CudaBase::cuda_executeFunction() {
+
+  std::cout << "Execute function" << std::endl;
+  return DKS_SUCCESS;
+}
+
+/*
+  Info: clean up
+  Placeholder: only prints a message, no resources are released here.
+  Return: success or error code
+*/
+int CudaBase::cuda_cleanUp() {
+
+  std::cout << "clean up" << std::endl;
+  return DKS_SUCCESS;
+
+}
diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh
new file mode 100644
index 0000000..325016d
--- /dev/null
+++ b/src/CUDA/CudaBase.cuh
@@ -0,0 +1,390 @@
+#ifndef H_CUDA_BASE
+#define H_CUDA_BASE
+
+#include "../DKSDefinitions.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+class CudaBase {
+
+private:
+
+  int currentStream; /* id of the stream in use, -1 selects the default stream */
+  std::vector cudaStreams; /* streams created by or added to this object */
+
+protected:
+
+  cublasHandle_t defaultCublas; /* handle returned by cuda_getCublas() */
+
+  curandState *defaultRndState; /* device array of curand states */
+  int defaultRndSet; /* 1 while defaultRndState is allocated, -1 otherwise */
+
+public:
+
+  CudaBase();
+
+  ~CudaBase();
+
+  /**
+   * Init cuda random number (cuRand) states.
+   * Create an array of type curandState with "size" elements on the GPU
+   * and create a curandState with different seed for each array entry.
+   * Return success or error code
+   */
+  int cuda_createCurandStates(int size);
+
+  /**
+   * Delete curandState.
+   * Delete curandState array on the GPU and free memory.
+   * Return success or error code
+   */
+  int cuda_deleteCurandStates();
+
+  /** Get a pointer to curand states
+   * Returns the device pointer set by cuda_createCurandStates.
+   */
+  curandState* cuda_getCurandStates();
+
+  /**
+   * Create a cuda stream and set streamId to the index referring to this stream.
+   * Return success or error code
+   */
+  int cuda_createStream(int &streamId);
+
+  /**
+   * Add an existing cuda stream to the list and set streamId to its index.
+   * Return: success or error code.
+   */
+  int cuda_addStream(cudaStream_t tmpStream, int &streamId);
+
+  /**
+   * Delete cuda stream.
+   * Not implemented in CudaBase.cu: always returns DKS_ERROR.
+   */
+  int cuda_deleteStream(int id);
+
+  /**
+   * Delete all streams and switch back to the default stream.
+   * Return: success or error code
+   */
+  int cuda_deleteStreams();
+
+  /**
+   * Set stream to use.
+   * Return: success or error code
+   */
+  int cuda_setStream(int id);
+
+  /**
+   * Info: get stream that is used
+   * Return: id of the current stream
+   */
+  int cuda_getStreamId();
+
+  /**
+   * Info: reset to default stream
+   * Return: success or error code
+   */
+  int cuda_defaultStream();
+
+  /**
+   * Info: get number of streams
+   * Return: number of streams in the stream list
+   */
+  int cuda_numberOfStreams();
+
+  /**
+   * Info: get stream
+   * Return: stream stored at index id of the stream list
+   */
+  cudaStream_t cuda_getStream(int id);
+
+  /**
+   * Get default cuBLAS handle
+   */
+  cublasHandle_t cuda_getCublas();
+
+  /**
+   * Info: print information on cuda devices
+   * Return: success or error code
+   */
+  int cuda_getDevices();
+
+  /** Get CUDA device count.
+   * Sets the number of devices on the platform that can use CUDA.
+   * Returns DKS_SUCCESS
+   */
+  int cuda_getDeviceCount(int &ndev);
+
+  /** Get the name of the device.
+   * Query the device properties of the used device and set the string device_name
+   */
+  int cuda_getDeviceName(std::string &device_name);
+
+  /** Set CUDA device to use.
+   * A device index outside the range of available devices falls back to device 0.
+   * Returns DKS_ERROR if no usable CUDA device could be selected.
+   */
+  int cuda_setDevice(int device);
+
+  /** Get unique devices
+   * Appends to devices the index of one CUDA device per distinct device name
+   * available on the platform.
+   */
+  int cuda_getUniqueDevices(std::vector<int> &devices);
+
+  /**
+   * Info: init device
+   * Return: success or error code
+   */
+  int cuda_setUp();
+
+  /**
+   * Info: allocate memory on cuda device, size is given in bytes
+   * Return: pointer to memory object, ierr is set to success or error code
+   */
+  void * cuda_allocateMemory(size_t size, int &ierr);
+
+  /**
+   * Info: allocate host memory in pinned memory, size is the number of
+   * elements of type T
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_allocateHostMemory(T *&ptr, size_t size) {
+    cudaError_t cerror = cudaMallocHost((void**)&ptr, sizeof(T)*size);
+    if (cerror != cudaSuccess)
+      return DKS_ERROR;
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: write data to memory. size is given in bytes, offset in elements
+   * of type T from the start of mem_ptr.
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_writeData(T * mem_ptr, const void * in_data, size_t size, int offset = 0) {
+    cudaError_t cerror = cudaMemcpy(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice);
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error copying data to device, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: write data asynchronously. size is given in bytes, offset in
+   * elements of type T. streamId -1 uses the synchronous cuda_writeData.
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_writeDataAsync(T *mem_ptr, const void *in_data, size_t size, int streamId = -1, int offset = 0) {
+
+    //if default stream or no stream specified, use default write method
+    //and report its result instead of always claiming success
+    if (streamId == -1)
+      return cuda_writeData(mem_ptr, in_data, size, offset);
+
+    //ids below -1 must not be used as an index into the stream list either
+    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
+      DEBUG_MSG("Error invalid stream id: " << streamId);
+      return DKS_ERROR;
+    }
+
+    //call async write
+    cudaError_t cerror = cudaMemcpyAsync(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice,
+                                         cuda_getStream(streamId));
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error async data copy, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: read data from memory. size is given in bytes, offset in elements
+   * of type T from the start of mem_ptr.
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_readData(const T * mem_ptr, void * out_data, size_t size, int offset = 0) {
+    cudaError_t cerror = cudaMemcpy(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost);
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error reading data from device");
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: read data async from device memory. streamId -1 issues the copy on
+   * the default stream.
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_readDataAsync(const T *mem_ptr, void *out_data, size_t size, int streamId = -1, int offset = 0) {
+    cudaError_t cerror;
+
+    if (streamId == -1) {
+      cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, 0);
+      if (cerror != cudaSuccess) {
+        DEBUG_MSG("Error async read from devie default stream");
+        return DKS_ERROR;
+      }
+      return DKS_SUCCESS;
+    }
+
+    //ids below -1 must not be used as an index into the stream list either
+    if (streamId < 0 || streamId >= cuda_numberOfStreams()) {
+      DEBUG_MSG("Error invalid stream id: " << streamId);
+      return DKS_ERROR;
+    }
+
+    cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost,
+                             cuda_getStream(streamId));
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error async read from device, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: free memory on device
+   * Return: success or error code
+   */
+  int cuda_freeMemory(void * mem_ptr);
+
+  /**
+   * Info: free page locked memory on host
+   * Return: success or error code
+   */
+  int cuda_freeHostMemory(void * mem_ptr);
+
+  /**
+   * Info: allocate memory and write data (push), size is given in bytes
+   * Return: pointer to memory object, ierr is set to success or error code
+   */
+  template <typename T>
+  void * cuda_pushData(const void * in_data, size_t size, int &ierr) {
+
+    void * mem_ptr = cuda_allocateMemory(size, ierr);
+
+    if (ierr == DKS_SUCCESS)
+      ierr = cuda_writeData((T*)mem_ptr, in_data, size);
+
+    return mem_ptr;
+  }
+
+  /**
+   * Info: read data and free memory (pull). The memory is only freed when
+   * the read succeeded.
+   * Return: success or error code
+   */
+  template <typename T>
+  int cuda_pullData(T * mem_ptr, void * out_data, size_t size, int &ierr) {
+
+    ierr = cuda_readData(mem_ptr, out_data, size);
+    if (ierr != DKS_SUCCESS)
+      return DKS_ERROR;
+
+    ierr = cuda_freeMemory(mem_ptr);
+    if (ierr != DKS_SUCCESS)
+      return DKS_ERROR;
+
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Info: execute function
+   * Return: success or error code
+   */
+  int cuda_executeFunction();
+
+  /**
+   * Info: clean up
+   * Return: success or error code
+   */
+  int cuda_cleanUp();
+
+  /**
+   * Info: sync cuda device
+   * Return: success or error code
+   */
+  int cuda_syncDevice() {
+    //the synchronisation call is also where earlier asynchronous errors show up
+    cudaError_t cerror = cudaDeviceSynchronize();
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Error synchronizing device, cuda error: " << cerror);
+      return DKS_ERROR;
+    }
+    return DKS_SUCCESS;
+  }
+
+  /**
+   * Page-lock host memory, size is the number of elements of type T
+   */
+  template <typename T>
+  int cuda_hostRegister(T *ptr, int size) {
+    //a negative count would turn into a huge byte count after conversion
+    if (size < 0)
+      return DKS_ERROR;
+
+    cudaError_t cerr = cudaHostRegister(ptr, static_cast<size_t>(size)*sizeof(T),
+                                        cudaHostRegisterPortable);
+    if (cerr == cudaSuccess) {
+      return DKS_SUCCESS;
+    } else {
+      DEBUG_MSG("Host memroy was not page locked");
+      return DKS_ERROR;
+    }
+  }
+
+  /**
+   * Release page locked memory
+   */
+  template <typename T>
+  int cuda_hostUnregister(T *ptr) {
+    cudaError_t cerr = cudaHostUnregister(ptr);
+    if (cerr == cudaSuccess)
+      return DKS_SUCCESS;
+    else
+      return DKS_ERROR;
+
+  }
+
+  /**
+   * Info: print device memory info (total, used, avail)
+   * Return: success or error code
+   */
+  int cuda_memInfo() {
+    size_t avail;
+    size_t total;
+    double mb = 1000000.0;
+
+    cudaError_t cerror = cudaMemGetInfo( &avail, &total);
+
+    if (cerror != cudaSuccess) {
+      DEBUG_MSG("Device mem info could not be obtained!");
+      return DKS_ERROR;
+    }
+
+    std::cout << "Device memory info, total: " << total / mb << "MB,\t";
+    std::cout << "used: " << (total - avail) / mb << "MB,\t";
+    std::cout << "avail: " << avail / mb << "MB" << std::endl;
+
+    return DKS_SUCCESS;
+  }
+
+};
+
+#endif
diff --git a/src/CUDA/CudaChiSquare.cu b/src/CUDA/CudaChiSquare.cu
new file mode 100644
index 0000000..db7f4f7
--- /dev/null
+++
b/src/CUDA/CudaChiSquare.cu
@@ -0,0 +1,287 @@
+#include "CudaChiSquare.cuh"
+
+//simple kernel version
+//one block per element: blockIdx.x selects the bin j, blockIdx.y the sensor i.
+//there is no bounds check, the grid has to match the data exactly
+__global__ void kernelPHistoTFFcn(double *data, double *par, double *chisq,
+                                  double fTimeResolution, double fRebin, int n) {
+
+  int j = blockIdx.x;
+  int i = blockIdx.y;
+
+  int idx = i * n + j;
+
+  const double tau = 2.197019;
+  double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+  double time = dt0 + fTimeResolution * fRebin * j;
+
+  double w = par[0]*0.08516155035269027;
+
+  double ldata = data[idx];
+
+  double theo = par[2 + i*4] * exp(-time/tau) * (1.0 + par[3 + i*4] * exp(-0.5 * pow(par[1]*time,2.0) ) * cos(w * time+par[4+i*4] * 1.74532925199432955e-2) ) + par[5+i*4];
+
+
+  if (ldata != 0.0)
+    chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
+  else
+    chisq[idx] = theo * theo;
+
+}
+
+//second version: one block per bin j, the s sensors are handled in a loop so
+//that the sensor independent factors are computed once. no bounds check on j
+__global__ void kernelPHistoTFFcn_2(double *data, double *par, double *chisq,
+                                    double fTimeResolution, double fRebin, int n, int s) {
+
+  int j = blockIdx.x;
+
+  const double tau = 2.197019;
+  double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+  double time = dt0 + fTimeResolution * fRebin * j;
+  double w = par[0]*0.08516155035269027;
+  double tt = exp(-time/tau);
+  double pp = exp(-0.5 * par[1] * time * par[1] * time);
+  double wt = w * time;
+
+  int idx;
+  double ldata, theo;
+  for (int i = 0; i < s; i++) {
+    idx = i * n + j;
+    ldata = data[idx];
+
+    theo = par[2 + i*4] * tt * (1.0 + par[3 + i*4] * pp * cos(wt + par[4+i*4] * 1.74532925199432955e-2) ) + par[5+i*4];
+
+    if (ldata != 0.0)
+      chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
+    else
+      chisq[idx] = theo * theo;
+  }
+
+}
+
+#define TAU 2.197019
+
+//third version: one thread per bin j (bounds checked against length), the
+//numpar parameters are staged in dynamic shared memory. the launch has to
+//provide at least numpar*sizeof(double) bytes of shared memory
+__global__ void kernelPHistoTFFcn_3(double *data, double *par, double *chisq,
+                                    double fTimeResolution, double fRebin,
+                                    int length, int sensors, int numpar) {
+
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory, stride by the block size so
+  //that all numpar values are copied even when numpar exceeds blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+
+    double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
+    double time = dt0 + fTimeResolution * fRebin * j;
+    double w = p[0]*0.08516155035269027;
+    double tt = exp(-time/TAU);
+    double pp = exp(-0.5 * pow(p[1]*time, 2.0));
+    double wt = w * time;
+
+    int idx;
+    double ldata, theo;
+    for (int i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      ldata = data[idx];
+
+      theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
+
+      if (ldata != 0.0)
+        chisq[idx] = (theo - ldata) * (theo - ldata) / ldata;
+      else
+        chisq[idx] = theo * theo;
+    }
+  }
+
+
+}
+
+//single Gauss theory function, one thread per bin j. bins before
+//t0[i] + fGoodBinOffset/fRebin of sensor i contribute 0. parameters are
+//staged in dynamic shared memory (numpar doubles)
+__global__ void kernelSingleGaussTF(double *data, unsigned int *t0, double *par, double *result,
+                                    double fTimeResolution, double fRebin, double fGoodBinOffset,
+                                    int length, int sensors, int numpar)
+{
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory, stride by the block size so
+  //that all numpar values are copied even when numpar exceeds blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    //read from the shared copy like every other parameter access
+    double w1 = p[0]*0.08516155035269027;
+
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int i = 0; i < sensors; i++) {
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+        ldata = data[idx];
+        time = dt0 + fTimeResolution * fRebin* (j - lft0);
+        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
+                                        *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
+        // 1.74532925199432955e-2 = pi/180
+
+        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
+          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+        else
+          result[idx] = theo - ldata;
+      } else {
+        result[idx] = 0;
+      }
+    }
+  }
+
+}
+
+//double Lorentz theory function, one thread per bin j. parameters 0-3 are
+//shared by all sensors, sensor i uses parameters 4+5i to 8+5i. parameters are
+//staged in dynamic shared memory (numpar doubles)
+__global__ void kernelDoubleLorentzTF(double *data, unsigned int *t0, double *par, double *result,
+                                      double fTimeResolution, double fRebin, double fGoodBinOffset,
+                                      int length, int sensors, int numpar)
+{
+
+  //define shared variable for parameters
+  extern __shared__ double p[];
+
+  //get thread id and calc global id
+  int tid = threadIdx.x;
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //load parameters from global to shared memory, stride by the block size so
+  //that all numpar values are copied even when numpar exceeds blockDim.x
+  for (int k = tid; k < numpar; k += blockDim.x)
+    p[k] = par[k];
+
+  //sync threads
+  __syncthreads();
+
+  if (j < length) {
+    double dt0 = fTimeResolution*0.5*(fRebin - 1);
+    double w1 = p[0]*0.08516155035269027;
+    double w2 = p[2]*0.08516155035269027;
+
+    int idx;
+    double ldata, lft0, theo, time;
+    for (int i = 0; i < sensors; i++) {
+
+      idx = i * length + j;
+      lft0 = t0[i];
+      if (j >= lft0 + fGoodBinOffset/fRebin) {
+        ldata = data[idx];
+        time = dt0+fTimeResolution*fRebin*(j-lft0);
+
+        theo = p[4+i*5]*exp(-time/TAU)*
+          (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)*
+           cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+
+           (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)*
+           cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5];
+        // 1.74532925199432955e-2 = pi/180
+        if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
+        else
+          result[idx] = theo - ldata;
+      } else {
+        result[idx] = 0;
+      }
+    }
+  }
+}
+
+
+
+//run kernelPHistoTFFcn_3 over length bins and sum the sensors*length values
+//written to mem_chisq into result
+//NOTE(review): the cublas handle comes from CudaBase::cuda_getCublas(); confirm
+//that handle is created before this is called
+int CudaChiSquare::cuda_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
+                                    double fTimeResolution, double fRebin,
+                                    int sensors, int length, int numpar,
+                                    double &result)
+{
+
+  int threads = 128;
+  int blocks = length / threads + 1;
+
+  //the kernel stages numpar doubles in dynamic shared memory
+  kernelPHistoTFFcn_3<<<blocks, threads, numpar*sizeof(double)>>>((double*)mem_data,
+                                                                  (double*)mem_par,
+                                                                  (double*)mem_chisq,
+                                                                  fTimeResolution,
+                                                                  fRebin, length,
+                                                                  sensors, numpar);
+
+
+  cublasStatus_t status;
+  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_chisq, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas asum failed");
+    return DKS_ERROR;
+  }
+
+
+  return DKS_SUCCESS;
+}
+
+
+//run kernelSingleGaussTF and store two times the summed per bin values in result
+//NOTE(review): cublasDasum adds absolute values while the kernel can write
+//negative entries (theo - ldata); confirm that this is intended
+int CudaChiSquare::cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                                      double fTimeResolution, double fRebin, double fGoodBinOffset,
+                                      int sensors, int length, int numpar,
+                                      double &result)
+{
+
+  int threads = 128;
+  int blocks = length / threads + 1;
+
+  //the kernel stages numpar doubles in dynamic shared memory
+  kernelSingleGaussTF<<<blocks, threads, numpar*sizeof(double)>>>( (double*)mem_data,
+                                                                   (unsigned int*)mem_t0,
+                                                                   (double*)mem_par,
+                                                                   (double*)mem_result,
+                                                                   fTimeResolution,
+                                                                   fRebin,
+                                                                   fGoodBinOffset,
+                                                                   length, sensors, numpar);
+
+  //report a failed reduction like cuda_PHistoTFFcn does
+  cublasStatus_t status;
+  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas asum failed");
+    return DKS_ERROR;
+  }
+  result = 2.0 * result;
+
+
+  return DKS_SUCCESS;
+
+}
+
+
+//run kernelDoubleLorentzTF and store two times the summed per bin values in result
+//NOTE(review): cublasDasum adds absolute values while the kernel can write
+//negative entries (theo - ldata); confirm that this is intended
+int CudaChiSquare::cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                                        double fTimeResolution, double fRebin, double fGoodBinOffset,
+                                        int sensors, int length, int numpar,
+                                        double &result)
+{
+
+  int threads = 128;
+  int blocks = length / threads + 1;
+
+  //the kernel stages numpar doubles in dynamic shared memory
+  kernelDoubleLorentzTF<<<blocks, threads, numpar*sizeof(double)>>>( (double*)mem_data,
+                                                                     (unsigned int*)mem_t0,
+                                                                     (double*)mem_par,
+                                                                     (double*)mem_result,
+                                                                     fTimeResolution,
+                                                                     fRebin,
+                                                                     fGoodBinOffset,
+                                                                     length, sensors, numpar);
+
+  //report a failed reduction like cuda_PHistoTFFcn does
+  cublasStatus_t status;
+  status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas asum failed");
+    return DKS_ERROR;
+  }
+  result = 2.0 * result;
+
+
+  return DKS_SUCCESS;
+
+}
diff --git a/src/CUDA/CudaChiSquare.cuh b/src/CUDA/CudaChiSquare.cuh
new file mode 100644
index 0000000..588dec5
--- /dev/null
+++ b/src/CUDA/CudaChiSquare.cuh
@@ -0,0 +1,59 @@
+#ifndef H_CUDA_CHISQUARE
+#define H_CUDA_CHISQUARE
+
+#include
+
+#include
+#include
+
+#include "CudaBase.cuh"
+
+class CudaChiSquare {
+
+private:
+
+  bool base_create;  /* true when m_base was created, and is deleted, by this object */
+  CudaBase *m_base;
+
+public:
+
+  /**
+   * Constructor which gets CudaBase as argument
+   * The passed object is used but not deleted by this class.
+   */
+  CudaChiSquare(CudaBase *base) {
+    m_base = base;
+    base_create = false;
+  }
+
+  /* constructor, creates its own CudaBase */
+  CudaChiSquare() {
+    m_base = new CudaBase();
+    base_create = true;
+  }
+
+  /* destructor, deletes the CudaBase only if this object created it */
+  ~CudaChiSquare() {
+    if (base_create)
+      delete m_base;
+  }
+
+  /* PHistoTFFcn calculation */
+
int cuda_PHistoTFFcn(void * mem_data, void * mem_par, void * mem_chisq,
+                       double fTimeResolution, double fRebin,
+                       int sensors, int length, int numpar,
+                       double &result);
+
+  /* single Gauss TF calculation, result receives two times the summed per bin values */
+  int cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                         double fTimeResolution, double fRebin, double fGoodBinOffset,
+                         int sensors, int length, int numpar,
+                         double &result);
+
+  /* double Lorentz TF calculation, result receives two times the summed per bin values */
+  int cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                           double fTimeResolution, double fRebin, double fGoodBinOffset,
+                           int sensors, int length, int numpar,
+                           double &result);
+
+
+};
+
+#endif
diff --git a/src/CUDA/CudaChiSquareRuntime.cu b/src/CUDA/CudaChiSquareRuntime.cu
new file mode 100644
index 0000000..ebbbcd0
--- /dev/null
+++ b/src/CUDA/CudaChiSquareRuntime.cu
@@ -0,0 +1,313 @@
+#include "CudaChiSquareRuntime.cuh"
+
+//constructor using an existing CudaBase, which is not deleted by this object
+CudaChiSquareRuntime::CudaChiSquareRuntime(CudaBase *base) {
+  blockSize_m = BLOCK_SIZE;
+  numBlocks_m = -1;
+
+  ptx_m = NULL;
+
+  m_base = base;
+  base_create = false;
+  setUpContext();
+}
+
+//constructor, init cuda device and create context
+CudaChiSquareRuntime::CudaChiSquareRuntime() {
+  blockSize_m = BLOCK_SIZE;
+  numBlocks_m = -1;
+
+  ptx_m = NULL;
+
+  m_base = new CudaBase();
+  base_create = true;
+  setUpContext();
+}
+
+//free resources
+CudaChiSquareRuntime::~CudaChiSquareRuntime() {
+  //release the device buffers, the cublas handle and an owned CudaBase while
+  //the context created in setUpContext() still exists, destroy the context last
+  freeChiSquare();
+
+  if (base_create)
+    delete m_base;
+
+  cuCtxDestroy(context_m);
+
+  delete[] ptx_m;
+}
+
+//init the driver api, create a context on device 0 and reset the fit constants
+void CudaChiSquareRuntime::setUpContext() {
+  if (cuInit(0) != CUDA_SUCCESS ||
+      cuDeviceGet(&cuDevice_m, 0) != CUDA_SUCCESS ||
+      cuCtxCreate(&context_m, 0, cuDevice_m) != CUDA_SUCCESS) {
+    DEBUG_MSG("Failed to create CUDA context for runtime compilation!");
+  }
+
+  N0_m = 1.0;
+  tau_m = 1.0;
+  bkg_m = 1.0;
+
+  initDone_m = false;
+}
+
+//build program string
+//reads the kernel source file installed below OPENCL_KERNELS and appends the
+//fTheory device function that returns the given expression.
+//returns an empty string when the kernel file cannot be read
+std::string CudaChiSquareRuntime::buildProgram(std::string function) {
+
+  //get kernel source
+  std::string kernel_file(OPENCL_KERNELS);
+  kernel_file += "CUDA/NVRTCKernels/CudaChiSquareKernel.cu";
+
+  //read kernels from file
+  FILE *fp = fopen(kernel_file.c_str(), "rb");
+  if (!fp) {
+    DEBUG_MSG("Can't open kernel file" << kernel_file);
+    return std::string();
+  }
+
+  //get file size
+  long fsize = -1;
+  if (fseek(fp, 0, SEEK_END) == 0)
+    fsize = ftell(fp);
+  if (fsize < 0) {
+    DEBUG_MSG("Can't get size of kernel file " << kernel_file);
+    fclose(fp);
+    return std::string();
+  }
+
+  //read file content into a zero terminated buffer that frees itself
+  rewind(fp);
+  std::vector<char> kernel_source(static_cast<size_t>(fsize) + 1, '\0');
+  size_t nread = fread(&kernel_source[0], 1, static_cast<size_t>(fsize), fp);
+  fclose(fp);
+  if (nread != static_cast<size_t>(fsize)) {
+    DEBUG_MSG("Can't read kernel file " << kernel_file);
+    return std::string();
+  }
+
+  std::string kernel_string (&kernel_source[0]);
+  return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter;
+}
+
+//compile the kernels together with the given theory function, keep the ptx in
+//ptx_m and load it as module_m. mlh adds -DMLH to the compile options
+int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) {
+
+  //build program string
+  std::string cudaProg = buildProgram(function);
+  if (cudaProg.empty()) {
+    DEBUG_MSG("Kernel source could not be loaded!");
+    return DKS_ERROR;
+  }
+
+  //create program
+  nvrtcProgram prog;
+  nvrtcResult createResult = nvrtcCreateProgram(&prog, cudaProg.c_str(),
+                                                "chiSquareRuntime.cu", 0, NULL, NULL);
+  if (createResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Create program failed!");
+    return DKS_ERROR;
+  }
+
+  //compile program
+  const char *opts[] = {"-fmad=false", ""};
+  int numopts = 1;
+  if (mlh) {
+    opts[1] = "-DMLH";
+    numopts = 2;
+  }
+
+  nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts);
+
+  if (compileResults != NVRTC_SUCCESS) {
+    //obtain compilation log
+    size_t logSize = 0;
+    nvrtcGetProgramLogSize(prog, &logSize);
+    char *log = new char[logSize + 1];
+    log[0] = '\0';
+    nvrtcGetProgramLog(prog, log);
+    DEBUG_MSG("Compilation failed!");
+    DEBUG_MSG(log);
+    delete[] log;
+
+    //the program object has to be released on the failure path as well
+    nvrtcDestroyProgram(&prog);
+    return DKS_ERROR;
+  } else {
+    DEBUG_MSG("Compilation successfull!");
+  }
+
+  //obtain PTX from program
+  delete[] ptx_m;
+  ptx_m = NULL;
+  size_t ptxSize = 0;
+  nvrtcResult nvrtcPTXResult = nvrtcGetPTXSize(prog, &ptxSize);
+  if (nvrtcPTXResult == NVRTC_SUCCESS) {
+    ptx_m = new char[ptxSize];
+    nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m);
+  }
+
+  // Destroy the program, the ptx has been copied to ptx_m at this point
+  nvrtcDestroyProgram(&prog);
+
+  if (nvrtcPTXResult != NVRTC_SUCCESS) {
+    DEBUG_MSG("Get PTX failed!");
+    return DKS_ERROR;
+  }
+
+  //load module from ptx
+  CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0);
+  if (loadResult != CUDA_SUCCESS) {
+    DEBUG_MSG("Load module from ptx failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+}
+
+//launch the kernel that belongs to fitType over length elements and sum the
+//length values written to mem_chisq_m into result.
+//initChiSquare has to be called before
+int CudaChiSquareRuntime::launchChiSquare(int fitType,
+                                          void *mem_data, void *mem_err, int length,
+                                          int numpar, int numfunc, int nummap,
+                                          double timeStart, double timeStep, double &result)
+{
+
+  if (!initDone_m) {
+    DEBUG_MSG("ChiSquare init needs to be called at some point!");
+    return DKS_ERROR;
+  }
+
+  int blocks;
+  int threads = blockSize_m;
+  if (numBlocks_m < 0)
+    blocks = length / threads + 1;
+  else
+    blocks = numBlocks_m;
+
+  //kernel arguments live on the stack so that no error return can leak them,
+  //the single histogram kernel takes the most arguments (15)
+  void *args[15] = {0};
+  args[0] = &mem_data;
+  args[1] = &mem_err;
+  args[2] = &mem_param_m;
+  args[3] = &mem_chisq_m;
+  args[4] = &mem_map_m;
+  args[5] = &mem_func_m;
+  args[6] = &length;
+  args[7] = &numpar;
+  args[8] = &numfunc;
+  args[9] = &nummap;
+  args[10] = &timeStart;
+  args[11] = &timeStep;
+
+  const char *kernelName = NULL;
+  if (fitType == FITTYPE_SINGLE_HISTO) {
+    kernelName = "kernelChiSquareSingleHisto";
+    args[12] = &tau_m;
+    args[13] = &N0_m;
+    args[14] = &bkg_m;
+  } else if (fitType == FITTYPE_ASYMMETRY) {
+    kernelName = "kernelChiSquareAsymmetry";
+    args[12] = &alpha_m;
+    args[13] = &beta_m;
+  } else if (fitType == FITTYPE_MU_MINUS) {
+    DEBUG_MSG("Not Yet Implemented!");
+    return DKS_ERROR;
+  } else {
+    DEBUG_MSG("Undefined Fit Type!");
+    return DKS_ERROR;
+  }
+
+  CUresult cuStatus = cuModuleGetFunction(&kernel_m, module_m, kernelName);
+  if (cuStatus != CUDA_SUCCESS) {
+    DEBUG_MSG("Failed to get function from module!");
+    return DKS_ERROR;
+  }
+
+  cuStatus = cuLaunchKernel(kernel_m,
+                            blocks, 1, 1,
+                            threads, 1, 1,
+                            (numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL,
+                            args, 0);
+
+
+
+  if (cuStatus != CUDA_SUCCESS) {
+    std::string msg;
+    msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")";
+    DEBUG_MSG(msg);
+    const char *desc;
+    cuGetErrorString(cuStatus, &desc);
+    std::cout << desc << std::endl;
+    return DKS_ERROR;
+  }
+
+  cublasStatus_t status;
+  status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("cublas sum failed!");
+    return DKS_ERROR;
+  }
+
+  return DKS_SUCCESS;
+
+}
+
+//copy numparams parameter values to mem_param_m on the device
+int CudaChiSquareRuntime::writeParams(const double *params, int numparams) {
+  int ierr = m_base->cuda_writeData( (double*)mem_param_m, params, sizeof(double)*numparams);
+  return ierr;
+}
+
+//copy numfunc function values to mem_func_m on the device
+int CudaChiSquareRuntime::writeFunc(const double *func, int numfunc) {
+  int ierr = m_base->cuda_writeData( (double*)mem_func_m, func, sizeof(double)*numfunc);
+  return ierr;
+}
+
+//copy nummap map values to mem_map_m on the device
+int CudaChiSquareRuntime::writeMap(const int *map, int nummap) {
+  int ierr = m_base->cuda_writeData( (int*)mem_map_m, map, sizeof(int)*nummap);
+  return ierr;
+}
+
+//create the cublas handle and allocate the temporary device buffers, sizes
+//are element counts. an earlier initialisation is released first.
+//returns DKS_ERROR if any of the steps failed
+int CudaChiSquareRuntime::initChiSquare(int size_data, int size_param, int size_func,
+                                        int size_map) {
+
+  int ierr = DKS_ERROR;
+  if (initDone_m) {
+    DEBUG_MSG("Reinitializing ChiSquare");
+    freeChiSquare();
+  }
+
+  //remember every failure, ierr itself is overwritten by each allocation
+  bool failed = false;
+
+  //init cublas
+  cublasStatus_t status = cublasCreate(&defaultCublasRT);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    DEBUG_MSG("CUBLAS create default handle failed!");
+    failed = true;
+  }
+
+  //allocate temporary memory
+  mem_chisq_m = m_base->cuda_allocateMemory(size_data*sizeof(double), ierr);
+  failed = failed || (ierr != DKS_SUCCESS);
+  mem_param_m = m_base->cuda_allocateMemory(size_param*sizeof(double), ierr);
+  failed = failed || (ierr != DKS_SUCCESS);
+  mem_func_m = m_base->cuda_allocateMemory(size_func*sizeof(double), ierr);
+  failed = failed || (ierr != DKS_SUCCESS);
+  mem_map_m = m_base->cuda_allocateMemory(size_map*sizeof(int), ierr);
+  failed = failed || (ierr != DKS_SUCCESS);
+
+  //set even after a failure so that freeChiSquare releases what was allocated
+  initDone_m = true;
+
+  ierr = failed ? DKS_ERROR : DKS_SUCCESS;
+
+
return ierr;
+}
+
+//destroy the cublas handle and free the temporary device buffers.
+//returns DKS_ERROR when nothing was initialised or when any step failed
+int CudaChiSquareRuntime::freeChiSquare() {
+  int ierr = DKS_ERROR;
+  if (initDone_m) {
+    ierr = DKS_SUCCESS;
+
+    //delete cublas, a failure is reported but must not keep the buffers alive
+    cublasStatus_t status = cublasDestroy(defaultCublasRT);
+    if (status != CUBLAS_STATUS_SUCCESS) {
+      DEBUG_MSG("CUBLAS delete default handle failed!");
+      ierr = DKS_ERROR;
+    }
+
+    //free memory, keep going after a failure so the other buffers are released
+    if (m_base->cuda_freeMemory(mem_chisq_m) != DKS_SUCCESS)
+      ierr = DKS_ERROR;
+    if (m_base->cuda_freeMemory(mem_param_m) != DKS_SUCCESS)
+      ierr = DKS_ERROR;
+    if (m_base->cuda_freeMemory(mem_func_m) != DKS_SUCCESS)
+      ierr = DKS_ERROR;
+    if (m_base->cuda_freeMemory(mem_map_m) != DKS_SUCCESS)
+      ierr = DKS_ERROR;
+
+    initDone_m = false;
+  }
+
+  return ierr;
+}
diff --git a/src/CUDA/CudaChiSquareRuntime.cuh b/src/CUDA/CudaChiSquareRuntime.cuh
new file mode 100644
index 0000000..79a9af5
--- /dev/null
+++ b/src/CUDA/CudaChiSquareRuntime.cuh
@@ -0,0 +1,114 @@
+#ifndef H_CUDA_CHISQUARE_RUNTIME
+#define H_CUDA_CHISQUARE_RUNTIME
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "../Algorithms/ChiSquareRuntime.h"
+#include "CudaBase.cuh"
+
+/* opening of the device function that wraps the user supplied expression */
+const std::string cudaFunctHeader = "__device__ double fTheory(double t, double *p, double *f, int *m) {";
+
+/* closing of the device function */
+const std::string cudaFunctFooter = "}\n";
+
+class CudaChiSquareRuntime : public ChiSquareRuntime{
+
+private:
+
+  bool base_create;  /* true when m_base was created, and is deleted, by this object */
+  CudaBase *m_base;
+
+  CUdevice cuDevice_m;
+  CUcontext context_m;
+  CUmodule module_m;
+  CUfunction kernel_m;
+
+  cublasHandle_t defaultCublasRT;  /* created in initChiSquare, destroyed in freeChiSquare */
+
+  /** Setup to init device
+   * Create context and init device for RT compilation
+   */
+  void setUpContext();
+
+  /** Private function to add function to kernel string
+   * Returns the kernel source followed by the fTheory device function that
+   * returns the given expression.
+   */
+  std::string buildProgram(std::string function);
+
+public:
+
+  /** Constructor with CudaBase argument
+   * The passed CudaBase is used but not deleted by this object.
+   */
+  CudaChiSquareRuntime(CudaBase *base);
+
+  /** Default constructor init cuda device
+   * Creates its own CudaBase.
+   */
+  CudaChiSquareRuntime();
+
+  /** Default destructor
+   *
+   */
+  ~CudaChiSquareRuntime();
+
+  /** Compile program and save ptx.
+   * Add function string to the calcFunction kernel and compile the program
+   * Function must be valid C math expression. Parameters can be addressed in
+   * a form par[map[idx]]
+   */
+  int compileProgram(std::string function, bool mlh = false);
+
+  /** Launch selected kernel
+   * Launches the selected kernel from the compiled code.
+   * Result is put in &result variable
+   */
+  int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length,
+                      int numpar, int numfunc, int nummap,
+                      double timeStart, double timeStep,
+                      double &result);
+
+  /** Write params to device.
+   * Write params from double array to mem_param_m memory on the device.
+   */
+  int writeParams(const double *params, int numparams);
+
+  /** Write functions to device.
+   * Write function values from double array to mem_func_m memory on the device.
+   */
+  int writeFunc(const double *func, int numfunc);
+
+  /** Write maps to device.
+   * Write map values from int array to mem_map_m memory on the device.
+   */
+  int writeMap(const int *map, int nummap);
+
+  /** Allocate temporary memory needed for chi square.
+   * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to
+   * be the maximum number of elements in any dataset that will be used for calculations. Size_param,
+   * size_func and size_map are the maximum number of parameters, functions and maps used in
+   * calculations.
+   */
+  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
+
+
+  /** Free temporary memory allocated for chi square.
+   * Frees the chisq temporary memory and memory for params, functions and maps
+   */
+  int freeChiSquare();
+
+  /** Check if CUDA device is able to run the chi square kernel.
+ * Redundant - all new CUDA devices that support RT compilation will also support + * double precision, there are no other requirements to run chi square on GPU + */ + int checkChiSquareKernels(int fitType, int &threadsPerBlock) { + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu new file mode 100644 index 0000000..495ad59 --- /dev/null +++ b/src/CUDA/CudaCollimatorPhysics.cu @@ -0,0 +1,728 @@ +#include "CudaCollimatorPhysics.cuh" + +//#define M_P 0.93827231e+00 +#define M_P 0.93827204e+00 +#define C 299792458.0 +#define PI 3.14159265358979323846 +#define AVO 6.022e23 +#define R_E 2.81794092e-15 +//#define eM_E 0.51099906e-03 +#define eM_E 0.51099892e-03 +#define Z_P 1 +#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 + +#define POSITION 0 +#define ZSIZE 1 +#define RHO_M 2 +#define Z_M 3 +#define A_M 4 +#define A2_C 5 +#define A3_C 6 +#define A4_C 7 +#define A5_C 8 +#define X0_M 9 +#define I_M 10 +#define DT_M 11 + +#define BLOCK_SIZE 128 +#define NUMPAR 12 + +__device__ inline double dot(double3 &d1, double3 &d2) { + + return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); + +} + +__device__ inline bool checkHit(double &z, double *par) { + + /* check if particle is in the degrader material */ + return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); + +} + + +__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) +{ + + volatile double dEdx = 0.0; + + volatile double gamma = (Eng + M_P) / M_P; + volatile double gamma2 = gamma * gamma; + + double beta = sqrt(1.0 - 1.0 / gamma2); + volatile double beta2 = beta * beta; + + double deltas = par[DT_M] * beta * C; + volatile double deltasrho = deltas * 100 * par[RHO_M]; + volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + double Ts = (Eng * 1E6) / 1.0073; + double epsilon_low = par[A2_C] * pow(Ts, 0.45); + double 
epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) ); + double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high); + + dEdx = -epsilon / (1E21 * (par[A_M] / AVO) ); + + double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state); + Eng = Eng + delta_E / 1E3; + } + + if (Eng >= 0.0006) { + double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P)); + + dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + double delta_E = deltasrho * dEdx + sigma_E * curand_normal_double(&state); + + Eng = Eng + delta_E / 1E3; + } + + pdead = ((Eng<1E-4) || (dEdx>0)); + +} + +__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane, + double &normP, double &thetacou, double &deltas, int coord, + double *par) +{ + double Psixz; + double pxz; + + if (px>=0 && pz>=0) + Psixz = atan(px/pz); + else if (px>0 && pz<0) + Psixz = atan(px/pz) + PI; + else if (px<0 && pz>0) + Psixz = atan(px/pz) + 2*PI; + else + Psixz = atan(px/pz) + PI; + + pxz = sqrt(px*px + pz*pz); + + if(coord==1) { + x = x + deltas * px/normP + xplane*cos(Psixz); + z = z - xplane * sin(Psixz); + } + + if(coord==2) { + x = x + deltas * px/normP + xplane*cos(Psixz); + z = z - xplane * sin(Psixz) + deltas * pz / normP; + } + + px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou); + pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); +} + +__device__ inline void coulombScat(double3 &R, double3 &P, curandState &state, double* par) { + + double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; + double gamma = (Eng + M_P) / M_P; + double normP = sqrt(dot(P, P)); + double beta = sqrt(1.0 - 1.0 / (gamma * gamma)); + double deltas = par[DT_M] * beta * C; + + double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * + Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * 
log(deltas / par[X0_M])); + + // x-direction: See Physical Review, "Multiple Scattering" + double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + //__syncthreads(); + + double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par); + + double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par); + } + + // y-direction: See Physical Review, "Multiple Scattering" + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + //__syncthreads(); + + double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par); + + P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + 
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par); + } + +} + + +template +__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, + int numparticles) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + //transfer params to shared memory + extern __shared__ double smem[]; + double *p = (double*)smem; + double3 *R = (double3*)&smem[NUMPAR]; + + curandState s; + double3 P; + + for (int tt = tid; tt < NUMPAR; tt += blockDim.x) + p[tt] = par[tt]; + + __syncthreads(); + + if (idx < numparticles) { + s = state[idx]; + R[tid] = data[idx].Rincol; + P = data[idx].Pincol; + + bool pdead = false; + volatile double sq = sqrt(1.0 + dot(P, P)); + + double Eng; + + if (checkHit(R[tid].z, p)) { + + Eng = (sq - 1) * M_P; + energyLoss(Eng, pdead, s, p); + + if (!pdead) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(P, P)); + + P.x = P.x * ptot / sq; + P.y = P.y * ptot / sq; + P.z = P.z * ptot / sq; + coulombScat(R[tid], P, s, p); + + data[idx].Pincol = P; + } else { + data[idx].label = -1; + } + + state[idx] = s; + } else { + + R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; + R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; + R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; + data[idx].label = -2; + + } + + data[idx].Rincol = R[tid]; + } + +} + +__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, + curandState *state, int numparticles) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + //transfer params to shared memory + __shared__ double p[NUMPAR]; + __shared__ double3 R[BLOCK_SIZE]; + + if (tid < NUMPAR) + p[tid] = par[tid]; + + __syncthreads(); + + curandState s; + double3 P; + if (idx < numparticles) { + R[tid] = data.Rincol[idx]; + P = data.Pincol[idx]; + s = state[idx]; + + double sq = sqrt(1.0 + dot(P, P)); + bool pdead = false; + + 
if (checkHit(R[tid].z, p)) { + + double Eng = (sq - 1) * M_P; + energyLoss(Eng, pdead, s, p); + + if (!pdead) { + + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(P, P)); + P.x = P.x * ptot / sq; + P.y = P.y * ptot / sq; + P.z = P.z * ptot / sq; + coulombScat(R[tid], P, s, p); + + data.Pincol[idx] = P; + } else { + data.label[idx] = -1; + } + + } else { + R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; + R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; + R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; + + data.label[idx] = -2; + } + + data.Rincol[idx] = R[tid]; + state[idx] = s; + } + +} + + +inline __device__ void unitlessOff(double3 &a, const double &c) { + a.x *= c; + a.y *= c; + a.z *= c; +} + +inline __device__ void unitlessOn(double3 &a, const double &c) { + a.x /= c; + a.y /= c; + a.z /= c; +} + +//swithch to unitless positions with dtc +__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + + unitlessOn(R, dtc); + unitlessOn(X, dtc); + + gR[idx] = R; + gX[idx] = X; + } + +} + +//swithc to unitless positions with dt*c +__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + double dt = gdt[idx]; + + unitlessOff(R, dt*c); + unitlessOff(X, dt*c); + + gR[idx] = R; + gX[idx] = X; + } +} + +//swithc off unitless positions with dtc +__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + + unitlessOff(R, dtc); + unitlessOff(X, dtc); + + gR[idx] = R; + gX[idx] = X; + } + +} + +//switch off unitelss 
positions with dt*c +__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + double dt = gdt[idx]; + + unitlessOff(R, dt*c); + unitlessOff(X, dt*c); + + gR[idx] = R; + gX[idx] = X; + } +} + + +__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + if (idx < npart) { + + double3 R = gR[idx]; + double3 P = gP[idx]; + + //switch to unitless positions + unitlessOn(R, dtc); + + //push + double tmp = sqrt(1.0 + dot(P, P)); + R.x += 0.5 * P.x / tmp; + R.y += 0.5 * P.y / tmp; + R.z += 0.5 * P.z / tmp; + + //switch off unitless positions with dt*c + unitlessOff(R, dtc); + + gR[idx] = R; + } +} + + +__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) { + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + if (idx < npart) { + + double3 R = gR[idx]; + double3 P = gP[idx]; + double dt = gdt[idx]; + + //switch to unitless positions with dt*c + unitlessOn(R, dt*c); + + R.x += 0.5 * P.x / sqrt(1.0 + dot(P, P)); + R.y += 0.5 * P.y / sqrt(1.0 + dot(P, P)); + R.z += 0.5 * P.z / sqrt(1.0 + dot(P, P)); + + //switch off unitless positions with dt*c + unitlessOff(R, dt*c); + + gR[idx] = R; + } +} + +//TODO: kernel for push with switch off unitless positions with dt[i]*c + +__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) { + + const double sina = sin(ori.x); + const double cosa = cos(ori.x); + const double sinb = sin(ori.y); + const double cosb = cos(ori.y); + const double sinc = sin(ori.z); + const double cosc = cos(ori.z); + + double3 temp; + temp.x = 0.0; + temp.y = 0.0; + temp.z = 0.0; + + temp.x = (cosa * cosc) * vec.x + 
(cosa * sinc) * vec.y - sina * vec.z; + temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x + + (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z; + temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x + + (sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z; + + return temp; + +} + +__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient, + int npart, int nsect, double dtc) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + + if (idx < npart) { + + double3 X = gX[idx]; + double3 P = gP[idx]; + long lLastSection = gLastSection[idx]; + + double3 ori; + if (lLastSection > -1 && lLastSection < nsect) { + ori = gOrient[lLastSection]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + double3 tmp = deviceTransformTo(P, ori); + + unitlessOn(X, dtc); + + X.x += 0.5 * tmp.x / sqrt(1.0 + dot(tmp, tmp)); + X.y += 0.5 * tmp.y / sqrt(1.0 + dot(tmp, tmp)); + X.z += 0.5 * tmp.z / sqrt(1.0 + dot(tmp, tmp)); + + unitlessOff(X, dtc); + + gX[idx] = X; + } + +} + +__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient, + int npart, int nsect, double *gdt, double c) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + + if (idx < npart) { + + double3 X = gX[idx]; + double3 P = gP[idx]; + long lLastSection = gLastSection[idx]; + double dt = gdt[idx]; + + double3 ori; + if (lLastSection > -1 && lLastSection < nsect) { + ori = gOrient[lLastSection]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + double3 tmp = deviceTransformTo(P, ori); + + unitlessOn(X, dt*c); + + X.x += 0.5 * tmp.x / sqrt(1.0 + dot(tmp, tmp)); + X.y += 0.5 * tmp.y / sqrt(1.0 + dot(tmp, tmp)); + X.z += 0.5 * tmp.z / sqrt(1.0 + dot(tmp, tmp)); + + unitlessOff(X, dt*c); + + gX[idx] = X; + } + +} + +struct compare_particle +{ + int 
threshold; + + compare_particle() { + threshold = 0; + } + + void set_threshold(int t) { + threshold = t; + } + + __host__ __device__ + bool operator()(CUDA_PART p1, CUDA_PART p2) { + return p1.label > p2.label; + } + + __host__ __device__ + bool operator()(CUDA_PART p1) { + return p1.label < threshold; + } +}; + + +struct compare_particle_small +{ + int threshold; + + compare_particle_small() { + threshold = 0; + } + + void set_threshold(int t) { + threshold = t; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) { + return p1.label > p2.label; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1) { + return p1.label < threshold; + } +}; + + +struct less_then +{ + __host__ __device__ + bool operator()(int x) + { + return x < 0; + } +}; + +int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) +{ + + int threads = BLOCK_SIZE; + int blocks = numparticles / threads + 1; + + //calc shared memory size + int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE; + + //call kernel + kernelCollimatorPhysics<<>>((CUDA_PART_SMALL*)mem_ptr, + (double*)par_ptr, + m_base->cuda_getCurandStates(), + numparticles); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + std::cout << "Err2: " << cudaGetErrorString(err) << std::endl; + + return DKS_SUCCESS; + +} + +int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles, + int &numaddback) +{ + + //wrap mem_ptr with thrust device ptr + thrust::device_ptr dev_ptr( (CUDA_PART_SMALL*)mem_ptr); + + //count -2 and -1 particles + compare_particle_small comp; + comp.set_threshold(0); + numaddback = thrust::count_if(dev_ptr, dev_ptr + numparticles, comp); + + //sort particles + if (numaddback > 0) + thrust::sort(dev_ptr, dev_ptr + numparticles, comp); + + return DKS_SUCCESS; +} + +int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, bool 
usedt, + int streamId) +{ + + int threads = BLOCK_SIZE; + int blocks = npart / threads + 1; + + //call kernel + if (!usedt) { + if (streamId == -1) { + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c); + } + } else { + if (streamId == -1) { + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, + (double*)dt_ptr, c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, + (double*)dt_ptr, c); + } + } + + + return DKS_SUCCESS; +} + +int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, + void *dt_ptr, double dt, + double c, bool usedt, + int streamId) +{ + + int threads = BLOCK_SIZE; + int blocks = npart / threads + 1; + int smem = sizeof(double3) * nsec; + + //call kernel + if (!usedt) { + if (streamId == -1) { + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, dt*c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, dt*c); + } + } else { + if (streamId == -1) { + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, (double*)dt_ptr, c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, (double*)dt_ptr, c); + } + } + + return DKS_SUCCESS; +} + + + diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh new file mode 100644 index 0000000..9808f33 --- /dev/null +++ b/src/CUDA/CudaCollimatorPhysics.cuh @@ -0,0 +1,155 @@ +#ifndef 
H_CUDA_COLLIMATORPHYSICS +#define H_CUDA_COLLIMATORPHYSICS + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include "../Algorithms/CollimatorPhysics.h" +#include "CudaBase.cuh" + +/** + * Structure for storing particle on GPU + */ +typedef struct __align__(16) { + int label; + unsigned localID; + double3 Rincol; + double3 Pincol; + long IDincol; + int Binincol; + double DTincol; + double Qincol; + long LastSecincol; + double3 Bfincol; + double3 Efincol; +} CUDA_PART; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int label; + unsigned localID; + double3 Rincol; + double3 Pincol; +} CUDA_PART_SMALL; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int *label; + unsigned *localID; + double3 *Rincol; + double3 *Pincol; + long *IDincol; + int *Binincol; + double *DTincol; + double *Qincol; + long *LastSecincol; + double3 *Bfincol; + double3 *Efincol; +} CUDA_PART2; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int *label; + unsigned *localID; + double3 *Rincol; + double3 *Pincol; +} CUDA_PART2_SMALL; + +/** CudaCollimatorPhysics class. + * Contains kerenls that execute CollimatorPhysics functions form OPAL. + * For detailed documentation on CollimatorPhysics functions see OPAL documentation + */ +class CudaCollimatorPhysics : public DKSCollimatorPhysics{ + +private: + + bool base_create; + CudaBase *m_base; + +public: + + /** Constructor with CudaBase argument + * + */ + CudaCollimatorPhysics(CudaBase *base) { + m_base = base; + base_create = false; + } + + /** Constructor - empty. */ + CudaCollimatorPhysics() { + m_base = new CudaBase(); + base_create = true; + } + + /** Destructor - empty */ + ~CudaCollimatorPhysics() { + if (base_create) + delete m_base; + }; + + /** Execute collimator physics kernel. 
+ * + */ + int CollimatorPhysics(void *mem_ptr, void *par_ptr, + int numpartices); + + int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) + { + return DKS_ERROR; + } + + /** Sort particle array on GPU. + * Count particles that are dead (label -1) or leaving material (label -2) and sort particle + * array so these particles are at the end of array + */ + int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) + { + return DKS_ERROR; + } + + /** BorisPusher push function for integration from OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1); + + /** BorisPusher push function with transformto function form OPAL + * ParallelTTracker integration from OPAL implemented in cuda. 
+ * For more details see ParallelTTracler docomentation in opal + */ + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + +}; + +#endif diff --git a/src/CUDA/CudaFFT.cu b/src/CUDA/CudaFFT.cu new file mode 100644 index 0000000..88e45ca --- /dev/null +++ b/src/CUDA/CudaFFT.cu @@ -0,0 +1,376 @@ +#include "CudaFFT.cuh" + +__global__ void normalize(cufftDoubleComplex *in, int N) { + + int id = blockIdx.x; //*blockDim.x + threadIdx.x; + if (id < N) { + in[id].x = in[id].x / N; + in[id].y = in[id].y / N; + } + +} + +CudaFFT::CudaFFT(CudaBase *base) { + m_base = base; + base_create = false; +} + +/* constructor */ +CudaFFT::CudaFFT() { + m_base = new CudaBase(); + base_create = true; +} + +/* destructor */ +CudaFFT::~CudaFFT() { + if (base_create) + delete m_base; +} + +/* + Info: execute fft using cufft library + Return: success or error code +*/ +int CudaFFT::executeFFT(void * mem_ptr, int ndim, int N[3], int streamId, bool forward) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanZ2Z; + } else { + switch (ndim) { + case 1: + cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2Z, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2Z); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2Z); + break; + default: + cresult = CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + if (cresult == CUFFT_SETUP_FAILED) + DEBUG_MSG("Setup failed"); + + if (cresult == CUFFT_INVALID_SIZE) + DEBUG_MSG("Invalid size"); + + if (cresult == CUFFT_INVALID_TYPE) + DEBUG_MSG("Invalid type"); + + if (cresult == CUFFT_ALLOC_FAILED) + DEBUG_MSG("Alloc failed"); + + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + 
cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute perform in place FFT on created plan + if (forward) { + cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr, + (cufftDoubleComplex*)mem_ptr, CUFFT_FORWARD); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + cufftDestroy(plan); + return DKS_ERROR; + } + } else { + cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr, + (cufftDoubleComplex*)mem_ptr, CUFFT_INVERSE); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing ifft, cuda error: " << cresult); + cufftDestroy(plan); + return DKS_ERROR; + } + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) + cufftDestroy(plan); + return DKS_SUCCESS; +} + +/* + Info: execute ifft + Return: success or error code +*/ +int CudaFFT::executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId) { + return executeFFT(mem_ptr, ndim, N, streamId, false); +} + +/* + Info: execute normalize using cuda kernel + Return: success or error code +*/ +int CudaFFT::normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId) { + + cublasStatus_t status; + unsigned int size = N[0]*N[1]*N[2]; + cuDoubleComplex alpha = make_cuDoubleComplex(1.0/size, 0); + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId)); + + status = cublasZscal(defaultCublasFFT, size, &alpha, (cuDoubleComplex*)mem_ptr, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS exec Zscal failed!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +/* + Info: execute real to complex double precision FFT + Return: success or error code +*/ +int CudaFFT::executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanD2Z; + } else { + switch (ndim) { + case 1: + cresult = 
cufftPlan1d(&plan, N[0], CUFFT_D2Z, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_D2Z); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_D2Z); + break; + default: + cresult = CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cresult = cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute perform in place FFT on created plan + cresult = cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr, (cufftDoubleComplex*)comp_ptr); + + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + if (cresult == CUFFT_INVALID_PLAN) + DEBUG_MSG("invalid plan"); + if (cresult == CUFFT_INVALID_VALUE) + DEBUG_MSG("invalid value"); + if (cresult == CUFFT_INTERNAL_ERROR) + DEBUG_MSG("internal error"); + if (cresult == CUFFT_EXEC_FAILED) + DEBUG_MSG("exec failed"); + if (cresult == CUFFT_SETUP_FAILED) + DEBUG_MSG("setup failed"); + + return DKS_ERROR; + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) { + cresult = cufftDestroy(plan); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + return DKS_SUCCESS; +} + +/* + Info: exectue complex to real double precision FFT + Return: success or error code +*/ +int CudaFFT::executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanZ2D; + } else { + switch (ndim) { + case 1: + cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2D, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2D); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2D); + break; + default: + cresult = 
CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute perform in place FFT on created plan + cresult = cufftExecZ2D(plan, (cufftDoubleComplex*)comp_ptr, (cufftDoubleReal*)real_ptr); + + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + cufftDestroy(plan); + return DKS_ERROR; + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) { + cresult = cufftDestroy(plan); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + return DKS_SUCCESS; +} + +/* + Info: execute normalize for complex to real iFFT + Return: success or error code +*/ +int CudaFFT::normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId) { + cublasStatus_t status; + unsigned int size = N[0]*N[1]*N[2]; + double alpha = 1.0/size; + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId)); + + status = cublasDscal(defaultCublasFFT, size, &alpha, (double*)real_ptr, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS exec Zscal failed!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +/* + Info: init cufftPlans witch can be reused for all FFTs of the same size and type + Return: success or error code +*/ +int CudaFFT::setupFFT(int ndim, int N[3]) { + + cufftResult cr1 = CUFFT_SUCCESS; + cufftResult cr2 = CUFFT_SUCCESS; + cufftResult cr3 = CUFFT_SUCCESS; + + //create default fft plans + if (ndim == 1) { + cr1 = cufftPlan1d(&defaultPlanZ2Z, N[0], CUFFT_Z2Z, 1); + cr2 = cufftPlan1d(&defaultPlanD2Z, N[0], CUFFT_D2Z, 1); + cr3 = cufftPlan1d(&defaultPlanZ2D, N[0], CUFFT_Z2D, 1); + } + + if (ndim == 2) { + cr1 = 
cufftPlan2d(&defaultPlanZ2Z, N[1], N[0], CUFFT_Z2Z); + cr2 = cufftPlan2d(&defaultPlanD2Z, N[1], N[0], CUFFT_D2Z); + cr3 = cufftPlan2d(&defaultPlanZ2D, N[1], N[0], CUFFT_Z2D); + } + + if (ndim == 3) { + cr1 = cufftPlan3d(&defaultPlanZ2Z, N[2], N[1], N[0], CUFFT_Z2Z); + cr2 = cufftPlan3d(&defaultPlanD2Z, N[2], N[1], N[0], CUFFT_D2Z); + cr3 = cufftPlan3d(&defaultPlanZ2D, N[2], N[1], N[0], CUFFT_Z2D); + } + + if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating default plan"); + return DKS_ERROR; + } + + //create cublas plan + cublasStatus_t status; + status = cublasCreate(&defaultCublasFFT); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS create default handle failed!"); + return DKS_ERROR; + } + //std::cout << "cublas created" << std::endl; + + defaultNdim = ndim; + if (ndim > 0) { + defaultN[0] = N[0]; + defaultN[1] = N[1]; + defaultN[2] = N[2]; + } + + return DKS_SUCCESS; + +} + +/* + Info: destroy default FFT plans + Return: success or error code +*/ +int CudaFFT::destroyFFT() { + + cufftResult cr1 = CUFFT_SUCCESS; + cufftResult cr2 = CUFFT_SUCCESS; + cufftResult cr3 = CUFFT_SUCCESS; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + if (defaultNdim > 0) { + //clean up resources + cr1 = cufftDestroy(defaultPlanZ2Z); + cr2 = cufftDestroy(defaultPlanD2Z); + cr3 = cufftDestroy(defaultPlanZ2D); + + if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying default cufft plans"); + return DKS_ERROR; + } + + } + + if (defaultNdim > -1) { + status = cublasDestroy(defaultCublasFFT); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS delete default handle failed!"); + return DKS_ERROR; + } + } + + defaultN[0] = -1; + defaultN[1] = -1; + defaultN[2] = -1; + defaultNdim = -1; + return DKS_SUCCESS; + +} + + + diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh new file mode 100644 index 0000000..0c22f2c --- /dev/null +++ b/src/CUDA/CudaFFT.cuh @@ 
-0,0 +1,88 @@ +#ifndef H_CUDA_FFT +#define H_CUDA_FFT + +#include +#include +#include +#include +#include "cublas_v2.h" + +#include "../Algorithms/FFT.h" +#include "CudaBase.cuh" + +class CudaFFT : public DKSFFT{ + +private: + + bool base_create; + CudaBase *m_base; + + cufftHandle defaultPlanZ2Z; + cufftHandle defaultPlanD2Z; + cufftHandle defaultPlanZ2D; + cublasHandle_t defaultCublasFFT; + +public: + + /** Constructor with CudaBase as argument */ + CudaFFT(CudaBase *base); + + /** constructor */ + CudaFFT(); + + /** destructor */ + ~CudaFFT(); + + /** + * Info: init cufftPlans witch can be reused for all FFTs of the same size and type + * Return: success or error code + */ + int setupFFT(int ndim, int N[3]); + int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + + /** + * Info: destroy default FFT plans + * Return: success or error code + */ + int destroyFFT(); + + /* + Info: execute complex to complex double precision fft using cufft library + Return: success or error code + */ + int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); + + /* + Info: execute ifft + Return: success or error code + */ + int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute normalize using cuda kernel for complex to complex iFFT + Return: success or error code + */ + int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute real to complex double precision FFT + Return: success or error code + */ + int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: exectue complex to real double precision FFT + Return: success or error code + */ + int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute normalize for complex to real iFFT + Return: success or error code 
+ */ + int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1); + +}; + +#endif diff --git a/src/CUDA/CudaGreensFunction.cu b/src/CUDA/CudaGreensFunction.cu new file mode 100644 index 0000000..140954b --- /dev/null +++ b/src/CUDA/CudaGreensFunction.cu @@ -0,0 +1,469 @@ +#include "CudaGreensFunction.cuh" + +__global__ void kernelTmpgreen(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ) { + + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgreen[i + j * NI + k * NI * NJ] = tmpgrn / cellVolume; + +} + +__global__ void kernelTmpgreen_2(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ, int NK) { + + int tid = threadIdx.x; + int id = blockIdx.x * blockDim.x + tid; + + if (id < NI * NJ * NK) { + int i = id % NI; + int k = id / (NI * NJ); + int j = (id - k * NI * NJ) / NI; + + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgreen[id] = tmpgrn / cellVolume; + + } + +} + +//calculate 
greens integral on cpu and transfer to gpu
+/*
+  Info: host-side evaluation of the Green's integral expression on an NI x NJ x NK grid.
+  Grid point (i, j, k) is evaluated at (i - 1/2) * hr_m0, (j - 1/2) * hr_m1, (k - 1/2) * hr_m2
+  and the result is divided by the cell volume hr_m0 * hr_m1 * hr_m2.
+  The value is stored at tmpgreen[i + j * NI + k * NI * NJ] (i fastest), which is the layout
+  the device kernels in this file read. The previous row stride (NJ) only matched that layout
+  when NI == NJ and produced overlapping writes otherwise.
+  NOTE: this routine only fills the buffer passed in; no host-to-device copy is done here.
+*/
+void kernelTmpgreenCPU(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2,
+                       int NI, int NJ, int NK)
+{
+
+  double cellVolume = hr_m0 * hr_m1 * hr_m2;
+
+  for (int k = 0; k < NK; k++) {
+    for (int j = 0; j < NJ; j++) {
+      for (int i = 0; i < NI; i++) {
+
+        //half-cell shifted coordinates of the grid point
+        double vv0 = i * hr_m0 - hr_m0 / 2;
+        double vv1 = j * hr_m1 - hr_m1 / 2;
+        double vv2 = k * hr_m2 - hr_m2 / 2;
+
+        double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2);
+
+        double tmpgrn = 0;
+        tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) );
+        tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) );
+        tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) );
+
+        tmpgrn = tmpgrn / 2;
+
+        tmpgrn += vv1 * vv2 * log(vv0 + r);
+        tmpgrn += vv0 * vv2 * log(vv1 + r);
+        tmpgrn += vv0 * vv1 * log(vv2 + r);
+
+        tmpgrn = tmpgrn / cellVolume;
+
+        //row stride is NI (was NJ)
+        tmpgreen[k*NJ*NI + j*NI + i] = tmpgrn;
+      }
+    }
+  }
+
+}
+
+
+/*
+  Info: one block per grid point, (i, j, k) = blockIdx.(x, y, z).
+  Combines the eight values of tmpgreen at (i..i+1, j..j+1, k..k+1), indexed on the
+  NI_tmp x NJ_tmp x NK_tmp grid, as tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7.
+  Neighbours that fall outside the tmp grid contribute 0.
+  The result is written to rho2_m[i + j*NI + k*NI*NJ].
+  No bounds check is done on (i, j, k) itself.
+*/
+__global__ void kernelIngration(double *rho2_m, double *tmpgreen, int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  int i = blockIdx.x;
+  int j = blockIdx.y;
+  int k = blockIdx.z;
+
+  int ni = NI;
+  int nj = NJ;
+
+  double tmp0 = 0, tmp1 = 0, tmp2 = 0, tmp3 = 0;
+  double tmp4 = 0, tmp5 = 0, tmp6 = 0, tmp7 = 0;
+
+  if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
+    tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+  if (i+1 < NI_tmp)
+    tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
+
+  if (j+1 < NJ_tmp)
+    tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
+
+  if (k+1 < NK_tmp)
+    tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+  if (i+1 < NI_tmp && j+1 < NJ_tmp)
+    tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
+
+  if (i+1 < NI_tmp && k+1 < NK_tmp)
+    tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+  if (j+1 < NJ_tmp && k+1 < NK_tmp)
+    tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+  tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
+
+
+  double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
+
+  rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
+
+}
+
+/*
+  Info: same computation as kernelIngration, but with a flat 1D launch.
+  The global thread id is decomposed into (i, j, k) on the NI_tmp x NJ_tmp x NK_tmp grid
+  (i fastest); threads with id >= NI_tmp * NJ_tmp * NK_tmp do nothing.
+  The result is written to rho2_m[i + j*NI + k*NI*NJ].
+*/
+__global__ void kernelIngration_2(double *rho2_m, double *tmpgreen,
+                                  int NI, int NJ,
+                                  int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  int tid = threadIdx.x;
+  int id = blockIdx.x * blockDim.x + tid;
+
+  int ni = NI;
+  int nj = NJ;
+
+  if (id < NI_tmp * NJ_tmp * NK_tmp) {
+    int i = id % NI_tmp;
+    int k = id / (NI_tmp * NJ_tmp);
+    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
+
+    double tmp0 = 0, tmp1 = 0, tmp2 = 0, tmp3 = 0;
+    double tmp4 = 0, tmp5 = 0, tmp6 = 0, tmp7 = 0;
+
+    if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp)
+      tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+    if (i+1 < NI_tmp)
+      tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp];
+
+    if (j+1 < NJ_tmp)
+      tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
+
+    if (k+1 < NK_tmp)
+      tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+    if (i+1 < NI_tmp && j+1 < NJ_tmp)
+      tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp];
+
+    if (i+1 < NI_tmp && k+1 < NK_tmp)
+      tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+    if (j+1 < NJ_tmp && k+1 < NK_tmp)
+      tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp];
+
+    tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp];
+
+    double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7;
+
+    rho2_m[i + j*ni + k*ni*nj] = tmp_rho;
+  }
+
+}
+
+
+//just one kernel will be executed
+//copies rho2_m[NI*NJ] into rho2_m[0]
+__global__ void mirroredRhoField0(double *rho2_m, int NI, int NJ) {
+  rho2_m[0] = rho2_m[NI*NJ];
+}
+
+/*
+  Info: one block per grid point; copies the value at (i, j, k) to (NI-i, j, k).
+  The copy is skipped when NI-i is not a valid index (i == 0).
+*/
+__global__ void mirroredRhoFieldI(double *rho2_m, int NI, int NJ) {
+
+  int i = blockIdx.x;
+  int j = blockIdx.y;
+  int k = blockIdx.z;
+
+  int idx1 = i + j*NI + k*NI*NJ;
+  int idx2 = (NI-i) + j*NI + k*NI*NJ;
+
+  if (NI-i < NI)
+    rho2_m[idx2] = rho2_m[idx1];
+
+}
+
+/*
+  Info: one block per grid point; copies the value at (i, j, k) to (i, NJ-j, k).
+  The copy is skipped when NJ-j is not a valid index (j == 0).
+*/
+__global__ void mirroredRhoFieldJ(double *rho2_m, int NI, int NJ) {
+
+  int i = blockIdx.x;
+  int j = blockIdx.y;
+  int k = blockIdx.z;
+
+  int idx1 = i + j*NI + k*NI*NJ;
+  int idx2 = i + (NJ-j)*NI + k*NI*NJ;
+
+  if (NJ-j < NJ)
+    rho2_m[idx2] = rho2_m[idx1];
+
+}
+
+/*
+  Info: one block per grid point; copies the value at (i, j, k) to (i, j, NK-k).
+  The copy is skipped when NK-k is not a valid index (k == 0).
+*/
+__global__ void mirroredRhoFieldK(double *rho2_m, int NI, int NJ, int NK) {
+
+  int i = blockIdx.x;
+  int j = blockIdx.y;
+  int k = blockIdx.z;
+
+  int idx1 = i + j*NI + k*NI*NJ;
+  int idx2 = i + j*NI + (NK-k)*NI*NJ;
+
+  if (NK-k < NK)
+    rho2_m[idx2] = rho2_m[idx1];
+
+}
+
+/*
+  Info: mirror in all three directions with a flat 1D launch.
+  The global thread id is decomposed into (i, j, k) on the NI_tmp x NJ_tmp x NK_tmp grid
+  (i fastest); threads with id >= NI_tmp * NJ_tmp * NK_tmp do nothing.
+  The value at (i, j, k) of the NI x NJ x NK field is copied to every combination of the
+  reflected indices NI-i, NJ-j, NK-k. A reflection along an axis is skipped when the index
+  on that axis is 0.
+*/
+__global__ void mirroredRhoField(double *rho2_m,
+                                 int NI, int NJ, int NK,
+                                 int NI_tmp, int NJ_tmp, int NK_tmp) {
+
+  int tid = threadIdx.x;
+  int id = blockIdx.x * blockDim.x + tid;
+
+  int id1, id2, id3, id4, id5, id6, id7, id8;
+
+  if (id < NI_tmp * NJ_tmp * NK_tmp) {
+    int i = id % NI_tmp;
+    int k = id / (NI_tmp * NJ_tmp);
+    int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp;
+
+    //reflected indices
+    int ri = NI - i;
+    int rj = NJ - j;
+    int rk = NK - k;
+
+    id1 = k * NI * NJ + j * NI + i;
+    id2 = k * NI * NJ + j * NI + ri;
+    id3 = k * NI * NJ + rj * NI + i;
+    id4 = k * NI * NJ + rj * NI + ri;
+
+    id5 = rk * NI * NJ + j * NI + i;
+    id6 = rk * NI * NJ + j * NI + ri;
+    id7 = rk * NI * NJ + rj * NI + i;
+    id8 = rk * NI * NJ + rj * NI + ri;
+
+
+    double data = rho2_m[id1];
+    if (i != 0)
+      rho2_m[id2] = data;
+
+    if (j != 0)
+      rho2_m[id3] = data;
+
+    if (i != 0 && j != 0)
+      rho2_m[id4] = data;
+
+    if (k != 0)
+      rho2_m[id5] = data;
+
+    if (k != 0 && i != 0)
+      rho2_m[id6] = data;
+
+    if (k != 0 && j != 0)
+      rho2_m[id7] = data;
+
+    //logical && (was a bitwise & between the last two comparisons)
+    if (k != 0 && j != 0 && i != 0)
+      rho2_m[id8] = data;
+
+  }
+
+}
+
+//complex product c = a * b, with .x as the real part and .y as the imaginary part
+__device__ inline cuDoubleComplex ComplexMul(cuDoubleComplex a, cuDoubleComplex b) {
+
+  cuDoubleComplex c;
+  c.x = a.x * b.x - a.y * b.y;
+  c.y = a.x * b.y + a.y * b.x;
+
+  return c;
+
+}
+
+//element-wise ptr1[idx] = ptr1[idx] * ptr2[idx], one element per block (idx = blockIdx.x)
+__global__ void multiplyComplexFields(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2) {
+
+  int idx = blockIdx.x;
+
+  ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]);
+}
+
+
+/*
+copy data in shared memory first to improve memory access (few global 
memory accesses, maybo no improvements) +use more threads per block to improve occupancy of hardware (test for best block and thread sizes) +*/ +__global__ void multiplyComplexFields_2(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2, + int size) +{ + + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + + extern __shared__ cuDoubleComplex data[]; + + if (idx < size) { + data[2*tid] = ptr1[idx]; + data[2*tid + 1] = ptr2[idx]; + } + + __syncthreads(); + + if (idx < size) + ptr1[idx] = ComplexMul(data[2*tid], data[2*tid+1]); + + +} + + +CudaGreensFunction::CudaGreensFunction(CudaBase *base) { + m_base = base; + base_create = false; +} + +/* constructor */ +CudaGreensFunction::CudaGreensFunction() { + m_base = new CudaBase(); + base_create = true; +} + +/* destructor */ +CudaGreensFunction::~CudaGreensFunction() { + if (base_create) + delete m_base; +} + +int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId) +{ + + int thread = 128; + int block = (I * J * K / thread) + 1; + + //if no stream specified use default stream + if (streamId == -1) { + kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); + + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} + +int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, + int I, int J, int K, + int streamId) +{ + + int thread = 128; + int block = (I * J * K / thread) + 1; + + if (streamId == -1) { + kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, + 2*(I - 1), 2*(J - 1), I, J, K); + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = 
m_base->cuda_getStream(streamId); + kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, + 2*(I - 1), 2*(J - 1), I, J, K); + return DKS_SUCCESS; + } + + + return DKS_ERROR; +} + +int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { + + int thread = 128; + int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1; + + if (streamId == -1) { + mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J); + mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1); + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J); + mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1); + + return DKS_SUCCESS; + } + + + + return DKS_ERROR; +} + +int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, + int size, int streamId) { + + int threads = 128; + int blocks = size / threads + 1; + int datasize = 2 * threads * sizeof(cuDoubleComplex); + + if (streamId == -1) { + multiplyComplexFields_2<<>> ( (cuDoubleComplex*)ptr1, + (cuDoubleComplex*)ptr2, + size); + return DKS_SUCCESS; + } + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + multiplyComplexFields_2<<>> ( (cuDoubleComplex*)ptr1, + (cuDoubleComplex*) ptr2, size); + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} + + + diff --git a/src/CUDA/CudaGreensFunction.cuh b/src/CUDA/CudaGreensFunction.cuh new file mode 100644 index 0000000..5095e7a --- /dev/null +++ b/src/CUDA/CudaGreensFunction.cuh @@ -0,0 +1,63 @@ +#ifndef H_CUDA_GREENSFUNCTION +#define H_CUDA_GREENSFUNCTION + +#include +#include + +#include +#include +#include +#include "cublas_v2.h" + + +#include "CudaBase.cuh" + +class CudaGreensFunction { + +private: + + bool base_create; + CudaBase *m_base; + 
+public: + + /** Constructor with CudaBase argument */ + CudaGreensFunction(CudaBase *base); + + /* constructor */ + CudaGreensFunction(); + + /* destructor */ + ~CudaGreensFunction(); + + /* + Info: calc itegral on device memory (taken from OPAL src code) + Return: success or error code + */ + int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId = -1); + + /* + Info: integration of rho2_m field (taken from OPAL src code) + Return: success or error code + */ + int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, + int streamId = -1); + + /* + Info: mirror rho field (taken from OPAL src code) + Return: succes or error code + */ + int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); + + /* + Info: multiply complex fields already on the GPU memory, result will be put in ptr1 + Return: success or error code + */ + int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); + + +}; + +#endif diff --git a/src/CUDA/CudaImageReconstruction.cu b/src/CUDA/CudaImageReconstruction.cu new file mode 100644 index 0000000..14ab4ee --- /dev/null +++ b/src/CUDA/CudaImageReconstruction.cu @@ -0,0 +1,1221 @@ +#include "CudaImageReconstruction.cuh" + +//x_edge, y_edge, z_edge and matrix_distance_factor need to be set as const for the run +//voxel_x, voxel_y and voxel_z also need to be set as const for the run +__device__ float d_x_edge = 30.8; +__device__ float d_y_edge = 30.8; +__device__ float d_z_edge = 16.8; + +__device__ float d_matrix_distance_factor = 1.2; + +__device__ int d_voxel_x = 90; +__device__ int d_voxel_y = 90; +__device__ int d_voxel_z = 50; + +__device__ float d_voxel_size = 0.7; + + +//phantom_diameter needs to be defined, atten_per_mm as well +__device__ float d_phantom_diameter = 51; +__device__ float d_atten_per_mm = 0.0095; +__device__ float d_ring_diameter = 138; +__device__ float 
d_minimum_CrystalDistance_InOneRing = 123.489; + + +__device__ float d_x_edge1 = 29.26; +__device__ float d_y_edge1 = 29.26; +__device__ float d_z_edge1 = 15.96; +__device__ float d_z_edge2 = 14.28; +__device__ float d_minimum_CrystalDistance_InOneRing1 = 127.681; + + +__device__ inline float distance(VoxelPosition &a, VoxelPosition &b) { + float dist_x = pow(a.x - b.x, 2); + float dist_y = pow(a.y - b.y, 2); + float dist_z = pow(a.z - b.z, 2); + return sqrt(dist_x + dist_y + dist_z); +} + +__global__ void kernelCalculateSource(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - diameter) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - diameter) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - diameter) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + diameter) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + diameter) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + diameter) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? 
d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + if (dist < diameter * 0.5 ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + } +} + +__global__ void kernelCalculateBackground(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? 
d_voxel_x : ex; + + int ey = floor( ((source.y + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + //if ( dist > diameter * 0.5 && dist < (diameter * 0.5 + 1) ) { + if ( dist > diameter * 0.5 && dist < (diameter) ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + + +__global__ void kernelCalculateSources(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float *diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + float diam = diameter[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - diam) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - diam) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 
0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - diam) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + diam) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + diam) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + diam) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + if (dist < diam * 0.5 ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + +__global__ void kernelCalculateBackgrounds(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float *diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + float diam = diameter[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - (diam + 1.0)) / 
d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - (diam + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - (diam + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + (diam + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + (diam + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + (diam + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + //if ( dist > diam * 0.5 && dist < (diam * 0.5 + 1) ) { + if ( dist > diam * 0.5 && dist < diam ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + +__device__ void localRaytracingX(float *recon, VoxelPosition *image_position, + float &atten_factor, float &slope_y, float &slope_z, + float &a_x, float &a_y, float &a_z) +{ + + for (int x = 0; x < d_voxel_x; x++) { + float lor_x = image_position[x].x; + float lor_y = slope_y * ( lor_x - a_x ) + 
a_y; + float lor_z = slope_z * ( lor_x - a_x ) + a_z; + + if ( pow(lor_x / d_x_edge,2) + pow( lor_y/d_y_edge, 2) < 1.0 && abs(lor_z) < d_z_edge ) { + + int y = floor( (lor_y+d_y_edge) / d_voxel_size); + int z = floor( (lor_z+d_z_edge) / d_voxel_size); + + int voxel_id = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = z * d_voxel_y * d_voxel_x + (y+1) * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = (z+1) * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = (z+1) * d_voxel_y * d_voxel_x + (y+1) * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + } + } + +} + + +__device__ void localRaytracingY(float *recon, VoxelPosition *image_position, + float &atten_factor, float &slope_x, float &slope_z, + float &a_x, float &a_y, float &a_z) +{ + + for (int y=0;y0.001) { + distance_xy = 2.0 * sqrt( distance_tocenter ) ; + } + else + distance_xy = 0.0; + + float distance_z = abs( a_z - b_z ) * distance_xy / distance_lor_xy; + float distance = sqrt( pow(distance_xy,2) + pow(distance_z,2) ); + + return exp(-distance*d_atten_per_mm); + +} + +__global__ void kernelNormalization(float *recon, VoxelPosition *image_position, + VoxelPosition *det_position, int total_det) +{ + + int tidx = threadIdx.x; + int tidy = threadIdx.y; + + int detA = blockIdx.x * blockDim.x + tidx; + int detB = blockIdx.y * blockDim.y + tidy; + + if (detA != 
detB && detA < total_det && detB < total_det) { + + VoxelPosition pA = det_position[detA]; + VoxelPosition pB = det_position[detB]; + + float distance_x = abs( pA.x - pB.x); + float distance_y = abs( pA.y - pB.y); + float distance_z = abs( pA.z - pB.z); + + if( sqrt(pow(distance_x,2) + pow(distance_y,2)) > d_minimum_CrystalDistance_InOneRing) { + float atten_factor; + atten_factor = atten_factor_calcu(pA.x,pA.y,pA.z,pB.x,pB.y,pB.z); + + if (distance_x > distance_y && distance_x > distance_z) { + + float slope_y = ( pB.y - pA.y ) / ( pB.x - pA.x ); + float slope_z = ( pB.z - pA.z ) / ( pB.x - pA.x ); + + localRaytracingX(recon, image_position, atten_factor, slope_y, slope_z, pA.x, pA.y, pA.z); + + } + else if (distance_y > distance_z) { + + float slope_x = ( pB.x - pA.x ) / ( pB.y - pA.y ); + float slope_z = ( pB.z - pA.z ) / ( pB.y - pA.y ); + + localRaytracingY(recon, image_position, atten_factor, slope_x, slope_z, pA.x, pA.y, pA.z); + } + else { + + float slope_x = ( pB.x - pA.x ) / ( pB.z - pA.z ); + float slope_y = ( pB.y - pA.y ) / ( pB.z - pA.z ); + + localRaytracingZ(recon, image_position, atten_factor, slope_x, slope_y, pA.x, pA.y, pA.z); + } + + } + } +} + +__device__ float localRaytracingForwardX(float*recon, VoxelPosition &pos, + float &a_x, float &a_y, float &a_z, + float &b_x, float &b_y, float &b_z) +{ + + float result = 0.000001; + float slope_y = ( b_y - a_y ) / ( b_x - a_x); + float slope_z = ( b_z - a_z ) / ( b_x - a_x); + + for (int x=0; x d_minimum_CrystalDistance_InOneRing1 && + (abs(distance_z1) distance_y && distance_x > distance_z) + branch = 1; + else if (distance_y > distance_z) + branch = 2; + else + branch = 3; + + } + + event_branch[idx] = branch; + } + +} + +__global__ void kernelZeroBackward(float *recon_corrector, int size) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + recon_corrector[idx] = 0; + +} + +__device__ void localRaytracingBackwardX(float &correction, float *recon_corrector, + VoxelPosition 
&pos, + float &a_x, float &a_y, float &a_z, + float &b_x, float &b_y, float &b_z) +{ + + float slope_y = ( b_y - a_y ) / ( b_x - a_x); + float slope_z = ( b_z - a_z ) / ( b_x - a_x); + + for (int x=0;x 0) { + corr = correction[idx]; + pos = image_position[0]; + } + + if (branch == 1) + localRaytracingBackwardX(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + else if (branch == 2) + localRaytracingBackwardY(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + else if (branch == 3) + localRaytracingBackwardZ(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + + } + +} + + +int CudaImageReconstruction::calculateSource(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + //call kernel + kernelCalculateSource<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateBackground(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + + //call kernel + kernelCalculateBackground<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateSources(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + //call 
kernel + kernelCalculateSources<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + (float*) diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateBackgrounds(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + + //call kernel + kernelCalculateBackgrounds<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + (float*) diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::generateNormalization(void *recon, void *image_position, + void *det_position, int total_det) +{ + + int blocksize = 32; + dim3 threads(blocksize, blocksize, 1); + + dim3 blocks(total_det / blocksize + 1, total_det / blocksize + 1); + + kernelNormalization<<>>( (float*) recon, + (VoxelPosition*) image_position, + (VoxelPosition*) det_position, + total_det); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error launching normalization kernel!"); + std::cout << cudaGetErrorString(err); + return DKS_ERROR; + } + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::forwardProjection(void *correction, void *recon, + void *list_data, void *det_position, + void *image_position, int num_events) +{ + + int threads = BLOCK_SIZE; + int blocks = num_events / threads + 1; + + int ierr; + m_event_branch = m_base->cuda_allocateMemory(sizeof(int)*num_events, ierr); + + kernelCheckEvents<<>>((ListEvent*)list_data, + (VoxelPosition*)det_position, + (int*)m_event_branch, + num_events); + + //warp mem pointers with thrust device ptr + thrust::device_ptr t_event_branch( (int*)m_event_branch ); + thrust::device_ptr 
t_list_data( (ListEvent*)list_data ); + + thrust::sort_by_key( t_event_branch, t_event_branch + num_events, t_list_data ); + + kernelForwardProjection<<>>( (float*)correction, + (float*)recon, + (ListEvent*)list_data, + (VoxelPosition*)det_position, + (VoxelPosition*)image_position, + (int*)m_event_branch, + num_events); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cout << "Error launching kernel!" << std::endl; + std::cout << cudaGetErrorString(err) << std::endl; + + } + + return DKS_SUCCESS; + +} + + +int CudaImageReconstruction::backwardProjection(void *correction, void *recon_corrector, + void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels) +{ + + int threads = BLOCK_SIZE; + int blocks1 = num_voxels / threads + 1; + int blocks2 = num_events / threads + 1; + + kernelZeroBackward<<>>((float*)recon_corrector, num_voxels); + + + kernelBackwardProjection<<>>( (float*)correction, + (float*)recon_corrector, + (ListEvent*)list_data, + (VoxelPosition*)det_position, + (VoxelPosition*)image_position, + (int*)m_event_branch, + num_events); + + m_base->cuda_freeMemory( m_event_branch ); + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setDimensions(int voxel_x, int voxel_y, int voxel_z, + float voxel_size) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_voxel_x, &voxel_x, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_y, &voxel_y, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_z, &voxel_z, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_size, &voxel_size, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setEdge(float x_edge, float y_edge, float z_edge) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_x_edge, &x_edge, sizeof(float)); + cudaMemcpyToSymbol(d_y_edge, &y_edge, 
sizeof(float)); + cudaMemcpyToSymbol(d_z_edge, &z_edge, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_x_edge1, &x_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_y_edge1, &y_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_z_edge1, &z_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_z_edge2, &z_edge2, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setMinCrystalInRing(float min_CrystalDist_InOneRing, + float min_CrystalDist_InOneRing1) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing, + &min_CrystalDist_InOneRing, sizeof(float)); + + cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing1, + &min_CrystalDist_InOneRing1, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_matrix_distance_factor, &matrix_distance_factor, sizeof(float)); + cudaMemcpyToSymbol(d_phantom_diameter, &phantom_diameter, sizeof(float)); + cudaMemcpyToSymbol(d_atten_per_mm, &atten_per_mm, sizeof(float)); + cudaMemcpyToSymbol(d_ring_diameter, &ring_diameter, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + 
DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} diff --git a/src/CUDA/CudaImageReconstruction.cuh b/src/CUDA/CudaImageReconstruction.cuh new file mode 100644 index 0000000..4cf532c --- /dev/null +++ b/src/CUDA/CudaImageReconstruction.cuh @@ -0,0 +1,118 @@ +#ifndef H_CUDA_IMAGERECONSTRUCTION +#define H_CUDA_IMAGERECONSTRUCTION + +#include +#include +#include +#include +#include + +#include "../Algorithms/ImageReconstruction.h" +#include "CudaBase.cuh" + +class CudaImageReconstruction : public ImageReconstruction { + +private: + + bool base_create; + CudaBase *m_base; + +public: + + /** Constructor */ + CudaImageReconstruction() { + m_base = new CudaBase(); + base_create = true; + }; + + /** Constructor with base **/ + CudaImageReconstruction(CudaBase *base) { + m_base = base; + base_create = false; + } + + /** Destructor */ + ~CudaImageReconstruction() { + if (base_create) + delete m_base; + }; + + /** CUDA implementation of caluclate source + */ + int calculateSource(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + /** Cuda implementation of calculate background + */ + int calculateBackground(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + /** + * Caluclate source for differente sources + */ + int calculateSources(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** + * Calculate background for differente sources + */ + int calculateBackgrounds(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** Generate normalization. 
+ * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel + * that updates voxel values in the image on the slope between these two detectors. + */ + int generateNormalization(void *recon, void *image_position, + void *det_position, int total_det); + + + /** Calculate forward projection. + * For image reconstruction calculates forward projections. + * see recon.cpp for details + */ + int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, + void *image_position, int num_events); + + /** Calculate backward projection. + * For image reconstruction calculates backward projections. + * see recon.cpp for details + */ + int backwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels); + + /** Set the voxel dimensins on device. + * + */ + int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); + + /** Set the image edge. + * + */ + int setEdge(float x_edge, float y_edge, float z_edge); + + /** Set the image edge1. + * + */ + int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); + + /** Set the minimum crystan in one ring values. + * + */ + int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); + + /** Set all other required parameters for reconstruction. + * + */ + int setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter); + + +}; + +#endif diff --git a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu new file mode 100644 index 0000000..b22cab7 --- /dev/null +++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu @@ -0,0 +1,316 @@ +#define PI 3.141592653589793115998 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +/** Theory function declaration. 
+ * Definition of the theory function will be build during runtime before compilation. + */ +__device__ double fTheory(double t, double *p, double *f, int *m); + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. + */ +__device__ double se(double t, double lamda) { + return exp( -lamda*t ); +} + +__device__ double ge(double t, double lamda, double beta) { + return exp( -pow(lamda*t, beta) ); +} + +__device__ double sg(double t, double sigma) { + return exp( -0.5*pow(sigma*t, 2.0) ); +} + +__device__ double stg(double t, double sigma) { + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5*sigmatsq); +} + +__device__ double sekt(double t, double lambda) { + double lambdat = lambda*t; + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +__device__ double lgkt(double t, double lambda, double sigma) { + double lambdat = lambda*t; + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +__device__ double skt(double t, double sigma, double beta) { + if (beta < 1.0e-3) + return 0.0; + double sigmatb = pow(sigma*t, beta); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta); +} + +__device__ double spg(double t, double lambda, double gamma, double q) { + double lam2 = lambda*lambda; + double lamt2q = t*t*lam2*q; + double rate2 = 4.0*lam2*(1.0-q)*t/gamma; + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +__device__ double rahf(double t, double nu, double lambda) { + double nut = nu*t; + double nuth = nu*t/2.0; + double lamt = lambda*t; + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + 
(1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + +__device__ double tf(double t, double phi, double nu) { + double tmp_nu = TWO_PI*nu*t; + double tmp_phi = DEG_TO_RAD*phi; + + return cos(tmp_nu + tmp_phi); +} + +__device__ double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + + return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +__device__ double b(double t, double phi, double nu) { + return j0(TWO_PI*nu*t + DEG_TO_RAD*phi); +} + +__device__ double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI * nu * t; + double ph = DEG_TO_RAD * phi; + + return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +__device__ double ab(double t, double sigma, double gamma) { + double gt = gamma*t; + + return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt)); +} + +__device__ double snkzf(double t, double Delta0, double Rb) { + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +__device__ double snktf(double t, double phi, double nu, double Delta0, double Rb) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +__device__ double dnkzf(double t, double Delta0, double Rb, double nuc) { + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa); +} + +__device__ double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 
1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph); +} + +/** Theory and chisquare functions. + * Based on the compiler flags set theory is calculated either in single hist mode or asymetric. + * Based on the compiler flags calculate either chisq or MLE + */ + +__device__ inline double singleHist(double &N0, double &tau, double &bkg, double &f, double &t) { + return N0 * exp (-t/tau ) * (1.0 + f) + bkg; +} + +__device__ inline double asymetry(double &a, double &b, double &f) { + return (f * (a * b) - (a - 1.0)) / ((a + 1.0) - f * (a * b - 1.0)); +} + +__device__ inline double getTheory(double &c1, double &c2, double &c3, double &f, double &t) { +#ifndef ASYMETRY + return singleHist(c1, c2, c3, f, t); +#elif + return asymetry(c1, c2, f); +#endif +} + +__device__ inline double chiSq(double &data, double &theo, double &err) { + double res = (theo - data) * (theo - data); + if (err != 0.0) + res /= err; + + return res; +} + +__device__ inline double mle(double &data, double &theo, double &err) { + double res = (theo - data); + if ( data > 1.0e-9 && fabs(theo) > 1.0e-9 ) + res += data * log(data / theo); + + return res; +} + +__device__ inline double getChiSq(double &data, double &theo, double &err) { +#ifndef MLE + return chiSq(data, theo, err); +#elif + return mle(data, theo, err); +#endif +} + +//----------------------------------------------------------------------------------------------- +/** + * Kernel to calculate theory function and chisquare/mle values for single histogram fits. 
+ */ +extern "C" __global__ void kernelChiSquareSingleHisto(double *data, double *err, double *par, + double *chisq, int *map, double *funcv, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double tau, double N0, double bkg) { + //define shared variable for parameters + extern __shared__ double smem[]; + double *p = (double*)smem; + double *f = (double*)&smem[numpar]; + int *m = (int*)&smem[numpar + numfunc]; + + //get thread id and calc global id + int tid; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + //load parameters from global to shared memory + tid = threadIdx.x; + while (tid < numpar) { + p[tid] = par[tid]; + tid += blockDim.x; + } + + //load functions from global to shared memory + tid = threadIdx.x; + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += blockDim.x; + } + + //load maps from global memory + tid = threadIdx.x; + while (tid < nummap) { + m[tid] = map[tid]; + tid += blockDim.x; + } + + //sync threads + __syncthreads(); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg; + + #ifdef MLH + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo)); + else + chisq[j] = 2.0 * (theo - ldata); + #else + if (lerr != 0.0) + chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr); + else + chisq[j] = theo * theo; + #endif + + j += gridDim.x * blockDim.x; + + } +} + +//----------------------------------------------------------------------------------------------- +/** + * Kernel to calculate theory function and chisquare/mle values for asymmetry fits. 
+ */ +extern "C" __global__ void kernelChiSquareAsymmetry(double *data, double *err, double *par, + double *chisq, int *map, double *funcv, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double alpha, double beta) { + //define shared variable for parameters + extern __shared__ double smem[]; + double *p = (double*)smem; + double *f = (double*)&smem[numpar]; + int *m = (int*)&smem[numpar + numfunc]; + + //get thread id and calc global id + int tid; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + //load parameters from global to shared memory + tid = threadIdx.x; + while (tid < numpar) { + p[tid] = par[tid]; + tid += blockDim.x; + } + + //load functions from global to shared memory + tid = threadIdx.x; + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += blockDim.x; + } + + //load maps from global memory + tid = threadIdx.x; + while (tid < nummap) { + m[tid] = map[tid]; + tid += blockDim.x; + } + + //sync threads + __syncthreads(); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double theoVal = fTheory(t, p, f, m); + double ab = alpha*beta; + + double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0) - (ab-1.0)*theoVal); + + #ifdef MLH + chisq[j] = 0.0; // log max likelihood not defined here + #else + if (lerr != 0.0) + chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr); + else + chisq[j] = theo * theo; + #endif + + j += gridDim.x * blockDim.x; + } +} + diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp new file mode 100644 index 0000000..96e9b19 --- /dev/null +++ b/src/DKSBase.cpp @@ -0,0 +1,861 @@ +#include "DKSBase.h" + +#define API_OPENCL "OpenCL" +#define API_CUDA "Cuda" +#define API_OPENMP "OpenMP" + +#define DEVICE_GPU "-gpu" +#define DEVICE_CPU "-cpu" +#define DEVICE_MIC "-mic" + +//=====================================// +//==========Private functions==========// +//=====================================// + +bool 
DKSBase::apiOpenCL() { + + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_OPENCL) != 0) + return false; + + return true; +} + +bool DKSBase::apiCuda() { + + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_CUDA) != 0) + return false; + + return true; +} + +bool DKSBase::apiOpenMP() { + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_OPENMP) != 0) + return false; + + return true; +} + +bool DKSBase::deviceGPU() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_GPU) != 0) + return false; + + return true; +} + +bool DKSBase::deviceCPU() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_CPU) != 0) + return false; + + return true; +} + +bool DKSBase::deviceMIC() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_MIC) != 0) + return false; + + return true; +} + + +int DKSBase::loadOpenCLKernel(const char *kernel_name) { + //load kernel + char * kernel_file = new char[500]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, kernel_name); + int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) ); + delete[] kernel_file; + + return ierr; +} + +//=====================================// +//==========Public functions===========// +//=====================================// + +DKSBase::DKSBase() { + + m_device_name = NULL; + m_api_name = NULL; + m_function_name = NULL; + + m_device_set = false; + m_api_set = false; + m_function_set = false; + + m_auto_tuning = false; + m_use_config = false; + +#ifdef DKS_CUDA + cbase = new CudaBase(); + cfft = new CudaFFT(cbase); + cgreens = new CudaGreensFunction(cbase); + cchi = new CudaChiSquare(cbase); + ccol = new CudaCollimatorPhysics(cbase); +#endif + +#ifdef DKS_OPENCL + oclbase = new OpenCLBase(); + oclfft = new OpenCLFFT(oclbase); + oclchi = new OpenCLChiSquare(oclbase); + oclcol = new OpenCLCollimatorPhysics(oclbase); +#endif + +#ifdef DKS_MIC + micbase = new MICBase(); + 
micfft = new MICFFT(micbase); + miccol = new MICCollimatorPhysics(micbase); + micgreens = new MICGreensFunction(micbase); + micchi = new MICChiSquare(micbase); +#endif + +} + +DKSBase::DKSBase(const char* api_name, const char* device_name) { + + setAPI(api_name, strlen(api_name)); + setDevice(device_name, strlen(device_name)); + m_function_name = NULL; + m_function_set = false; + + m_auto_tuning = false; + m_use_config = false; + +#ifdef DKS_CUDA + cbase = new CudaBase(); + cfft = new CudaFFT(cbase); + cgreens = new CudaGreensFunction(cbase); + cchi = new CudaChiSquare(cbase); + ccol = new CudaCollimatorPhysics(cbase); +#endif + +#ifdef DKS_OPENCL + oclbase = new OpenCLBase(); + oclfft = new OpenCLFFT(oclbase); + oclchi = new OpenCLChiSquare(oclbase); + oclcol = new OpenCLCollimatorPhysics(oclbase); +#endif + +#ifdef DKS_MIC + micbase = new MICBase(); + micfft = new MICFFT(micbase); + miccol = new MICCollimatorPhysics(micbase); + micgreens = new MICGreensFunction(micbase); + micchi = new MICChiSquare(micbase); +#endif + +} + + +DKSBase::~DKSBase() { + + if (m_device_name != NULL) + delete[] m_device_name; + + if (m_api_name != NULL) + delete[] m_api_name; + + if (m_function_name != NULL) + delete[] m_function_name; + + +#ifdef DKS_CUDA + delete cfft; + delete cgreens; + delete cchi; + delete ccol; + delete cbase; +#endif + +#ifdef DKS_OPENCL + delete oclfft; + delete oclchi; + delete oclcol; + delete oclbase; +#endif + +#ifdef DKS_MIC + delete micfft; + delete miccol; + delete micgreens; + delete micchi; + delete micbase; +#endif + +} + +/* + Name: setDevice + Info: sets specific device to use. 
length specifies device_name string length (deprecated) + Return: success or error code +*/ +int DKSBase::setDevice(const char* device_name, int length) { + + if (m_device_set) + delete[] m_device_name; + + int l = strlen(device_name); + m_device_name = new char[l+1]; + + for (int i = 0; i < l; i++) + m_device_name[i] = device_name[i]; + m_device_name[l] = '\0'; + + m_device_set = true; + + return DKS_SUCCESS; + +} + +/* + Name: setAPI + Info: sets specific api (OpenCL, CUDA, OpenACC, OpenMP) to use + Return: success or error code +*/ +int DKSBase::setAPI(const char* api_name, int length) { + + if (m_api_set) + delete[] m_api_name; + + int l = strlen(api_name); + m_api_name = new char[l+1]; + + for (int i = 0; i < l; i++) + m_api_name[i] = api_name[i]; + m_api_name[l] = '\0'; + + m_api_set = true; + + return DKS_SUCCESS; +} + +/* + Name: getDevices + Info: get all available devices + Return: success or error code +*/ +int DKSBase::getDevices() { + + int ierr1 = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() ); + int ierr2 = CUDA_SAFECALL( cbase->cuda_getDevices() ); + int ierr3 = MIC_SAFECALL( micbase->mic_getDevices() ); + + if (ierr1 + ierr2 + ierr3 != DKS_SUCCESS) + return DKS_ERROR; + + return DKS_SUCCESS; +} + +int DKSBase::getDeviceCount(int &ndev) { + ndev = 0; + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::getDeviceName(std::string &device_name) { + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::setDefaultDevice(int device) { + std::cout << "Set device " << device << std::endl; + if (apiOpenCL()) + return OPENCL_SAFECALL( 
oclbase->ocl_setDevice(device) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_setDevice(device) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::getDeviceList(std::vector &devices) { + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +/* + init device +*/ +int DKSBase::initDevice() { + + //if api is not set default is OpenCL + if (!m_api_set) { + setDevice("-gpu", 4); + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); + } else { + if (apiOpenCL()) { + if (!m_device_set) { + setDevice("-gpu", 4); + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); + } else { + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) ); + } + } else if (apiCuda()) { + setDevice("-gpu", 4); + setAPI(API_CUDA, 4); + return CUDA_SAFECALL(DKS_SUCCESS); + } else if (apiOpenMP()) { + setDevice("-mic", 4); + setAPI(API_OPENMP, 6); + return MIC_SAFECALL(DKS_SUCCESS); + } + } + + return DKS_ERROR; +} + +/* + set up cuda, opencl and mic to allow async data transfer and kernel execution. + name stream 'stolen' from cuda. opencl context ~ cuda stream. 
+ TODO: implementations for OpenCL and MIC still needed +*/ +int DKSBase::createStream(int &streamId) { + + if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_createStream(streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micbase->mic_createStream(streamId) ); + + DEBUG_MSG("Streams not enbled for this platforms jet"); + return DKS_ERROR; +} + +/* send device pointer to other processes */ +#ifdef DKS_MPI +int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) { + + if ( apiCuda() ) { +#ifdef DKS_CUDA + cudaError cerror; + cudaIpcMemHandle_t shandle; + cerror = cudaIpcGetMemHandle(&shandle, mem_ptr); + MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error geting mem handle"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +#endif + } + else if (apiOpenMP()) { +#ifdef DKS_MIC + //BENI: + DEBUG_MSG("No SendPointer for MIC is implemented"); + return DKS_ERROR; +#endif + } + else { + DEBUG_MSG("Send device pointer not implemented on selected platform"); + return DKS_ERROR; + } + return DKS_ERROR; +} +#endif + +/* receive device pointer */ +#ifdef DKS_MPI +void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) { + + void *mem_ptr; + if (apiCuda()) { +#ifdef DKS_CUDA + cudaError cerror; + cudaIpcMemHandle_t rhandle; + MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm, NULL); + cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error opening received handle"); + ierr = DKS_ERROR; + } +#endif + return mem_ptr; + } + else if (apiOpenMP()) { +#ifdef DKS_MIC + //BENI: + DEBUG_MSG("No ReceivePointer for MIC is implemented"); + return DKS_SUCCESS; +#endif + return mem_ptr; + } + else { + ierr = DKS_ERROR; + DEBUG_MSG("Receive device pointer not implemented for selected platform"); + return mem_ptr; + } +} +#endif + +/* close received handle */ +int 
DKSBase::closeHandle(void *mem_ptr) { + + if (apiCuda()) { +#ifdef DKS_CUDA + cudaError cerror; + cerror = cudaIpcCloseMemHandle(mem_ptr); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error closing memory handle"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +#endif + } + + DEBUG_MSG("Memory handles not implemented for selected platform"); + return DKS_ERROR; + +} + +/* sync device calls */ +int DKSBase::syncDevice() { + + if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_syncDevice() ); + else if (apiOpenMP()) + return MIC_SAFECALL( micbase->mic_syncDevice() ); + + return DKS_ERROR; +} + +/* setup fft plans to reuse if multiple ffts of same size are needed */ +int DKSBase::setupFFT(int ndim, int N[3]) { + + if (apiCuda()) { + return CUDA_SAFECALL( cfft->setupFFT(ndim, N) ); + } else if (apiOpenMP()) { + //micbase.mic_setupFFT(ndim, N); + //BENI: setting up RC and CR transformations on MIC + int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) ); + int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) ); + if (ierr1 != DKS_SUCCESS) + return ierr1; + if (ierr2 != DKS_SUCCESS) + return ierr2; + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} +//BENI: +int DKSBase::setupFFTRC(int ndim, int N[3], double scale) { + + if (apiCuda()) + return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + else if (apiOpenMP()) + return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale)); + + return DKS_ERROR; + +} + +//BENI: +int DKSBase::setupFFTCR(int ndim, int N[3], double scale) { + + if (apiCuda()) + return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + else if (apiOpenMP()) + return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale)); + + return DKS_ERROR; + +} + +/* call OpenCL FFT function for selected platform */ +int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL()) { + //load kernel and execute + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( 
oclfft->executeFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize)); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call OpenCL IFFT function for selected platform */ +int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + if (apiOpenCL()) { + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) ); + } else if (apiOpenMP()) { + return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) ); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call normalize FFT function for selected platform */ +int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL()) { + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) ); + } else if (apiOpenMP()) { + return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) ); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call real to complex FFT */ +int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) ); + + DEBUG_MSG("No 
implementation for selected platform"); + return DKS_ERROR; +} + +/* call complex to real FFT */ +int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) ); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* normalize complex to real iFFT */ +int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) ); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_SUCCESS; +} + +/* normalize complex to real iFFT */ +int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) { + if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; + +} + +int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, + double hz_m0, double hz_m1, double hz_m2, int streamId) { + + if (apiCuda()) { + return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ, + hz_m0, hz_m1, hz_m2, streamId) ); + } else if (apiOpenMP()) { + //BENI: + return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, + int I, int J, int K, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId)); + else if (apiOpenMP()) + return 
MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + + +int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, double &result) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, 
fRebin, fGoodBinOffset, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, + int numparticles, int numparams, + int &numaddback, int &numdead) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + } else if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + else + return DKS_ERROR; + + } else if (apiOpenMP()) { + return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + } + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + + +int DKSBase::callCollimatorPhysics2(void *mem_ptr, void 
*par_ptr, int numparticles) +{ + + if (apiCuda()) + return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) ); + else if (apiOpenMP()) + return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) ); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) +{ + + if (apiOpenMP()) { + return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr, pz_ptr, + par_ptr, numparticles) ); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + + +int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) +{ + + if (apiCuda()) + return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback)); + else if (apiOpenMP()) + return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) +{ + + if (apiOpenMP()) { + return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr, pz_ptr, + par_ptr, numparticles, numaddback)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + + +int DKSBase::callInitRandoms(int size) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_createCurandStates(size)); + else if (apiOpenCL()) + return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size)); + else if (apiOpenMP()) + return MIC_SAFECALL(micbase->mic_createRandStreams(size)); + + 
DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, + bool usedt, int streamId) +{ + + if (apiCuda()) + return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, + usedt, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, + c, usedt, streamId)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, void *dt_ptr, double dt, + double c, bool usedt, int streamId) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, + lastSec_ptr, orient_ptr, + npart, nsec, dt_ptr, dt, + c, usedt, streamId)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, + lastSec_ptr, orient_ptr, + npart, nsec, dt_ptr, dt, + c, usedt, streamId)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} diff --git a/src/DKSBase.h b/src/DKSBase.h new file mode 100644 index 0000000..ea8bc39 --- /dev/null +++ b/src/DKSBase.h @@ -0,0 +1,1133 @@ +/** DKSBase class. 
+ * DKSBase.h + * Author: Uldis Locans + * Date: 15.09.2014 + * Base class of Dynamic Kernel Scheduler that handles the function calls + * from host application to DKS + */ + +#ifndef H_DKS_BASE +#define H_DKS_BASE + +#include +#include +#include +#include + +#include "DKSDefinitions.h" + +#ifdef DKS_MPI +#include +#endif + +#ifdef DKS_OPENCL + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "OpenCL/OpenCLBase.h" +#include "OpenCL/OpenCLFFT.h" +#include "OpenCL/OpenCLChiSquare.h" +#include "OpenCL/OpenCLCollimatorPhysics.h" +#endif + +#ifdef DKS_CUDA +#include "CUDA/CudaBase.cuh" +#include "CUDA/CudaFFT.cuh" +#include "CUDA/CudaGreensFunction.cuh" +#include "CUDA/CudaChiSquare.cuh" +#include "CUDA/CudaCollimatorPhysics.cuh" +#include "nvToolsExt.h" +#endif + +#ifdef DKS_MIC +#include "MIC/MICBase.h" +#include "MIC/MICChiSquare.h" +#include "MIC/MICFFT.h" +#include "MIC/MICCollimatorPhysics.h" +#include "MIC/MICGreensFunction.hpp" +#endif + +#include "Algorithms/CollimatorPhysics.h" +#include "Algorithms/FFT.h" + +#include "AutoTuning/DKSConfig.h" + +/** DKSBase class for handling function calls to DKS library */ +class DKSBase { + +private: + char *m_device_name; + char *m_api_name; + char *m_function_name; + + bool m_device_set; + bool m_api_set; + bool m_function_set; + + bool m_auto_tuning; + bool m_use_config; + +#ifdef DKS_OPENCL + OpenCLBase *oclbase; + OpenCLFFT *oclfft; + OpenCLChiSquare *oclchi; + OpenCLCollimatorPhysics *oclcol; +#endif + +#ifdef DKS_CUDA + CudaBase *cbase; + CudaFFT *cfft; + CudaGreensFunction *cgreens; + CudaChiSquare *cchi; + CudaCollimatorPhysics *ccol; +#endif + +#ifdef DKS_MIC + MICBase *micbase; + MICFFT *micfft; + MICCollimatorPhysics *miccol; + MICGreensFunction *micgreens; + MICChiSquare *micchi; +#endif + +protected: + + //gives access to dks autotuning config file + DKSConfig dksconfig; + + /** + * Check if current API is set to OpenCL + * Return true/false wether current api is opencl + */ + bool apiOpenCL(); 
+ + /** + * Check if current API is set to CUDA. + * Return true/false wether curretn api is cuda + */ + bool apiCuda(); + + /** + * Check if current API is set to OpenMP. + * Return true/false whether current api is OpenMP + */ + bool apiOpenMP(); + + /** Check if device is GPU */ + bool deviceGPU(); + /** Check if device is CPU */ + bool deviceCPU(); + /** Check if device is MIC */ + bool deviceMIC(); + + /** + * Get cbase pointer + */ +#ifdef DKS_CUDA + CudaBase *getCudaBase() { + return cbase; + } +#endif + +#ifdef DKS_OPENCL + OpenCLBase *getOpenCLBase() { + return oclbase; + } +#endif + + /** Call OpenCL base to load specified kenrel file. + * + */ + int loadOpenCLKernel(const char *kernel_name); + + std::string getAPI() { + std::string api_name(m_api_name); + return api_name; + } + + std::string getDevice() { + std::string device_name(&m_device_name[1]); + return device_name; + } + +public: + + /** + * Default constructor. + */ + DKSBase(); + + /** + * Constructor that sets api and devcie to use with DKS. + */ + DKSBase(const char* api_name, const char* device_name); + + + /** + * Destructor. + * Free DKS resources. + */ + ~DKSBase(); + + /** Turn on auto tuning */ + void setAutoTuningOn() { m_auto_tuning = true; } + + /** Turn of auto tuning */ + void setAutoTuningOff() { m_auto_tuning = false; } + + /** Get status of auto tuning */ + bool isAutoTuningOn() { return m_auto_tuning; } + + /** Turn on use of config file */ + void setUseConfigOn() { m_use_config = true; } + + /** Turn off use of config file */ + void setUseConfigOff() { m_use_config = false; } + + /** Check if using config file */ + bool isUseConfigOn() { return m_use_config; } + + /** + * Set device to use with DKS. + * Sets specific device to use with DKS. Supported devices are -gpu and -mic. + * Length specifies the number of characters in device_name array (length - deprecated). + * Return success or error code. 
 + */ + int setDevice(const char* device_name, int length = -1); + + /** + * Set framework to use with DKS. + * Sets framework and API that DKS uses to execute code on device. Supported APIs + * are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies + * the number of characters in api_name array (length - deprecated). + */ + int setAPI(const char* api_name, int length = -1); + + /** + * Prints information about all available devices. + * Calls CUDA, OpenCL and MIC functions to query for available devices + * for each framework and prints information about each device. Length specifies + * the number of characters in api_name array + * Returns success or error code + */ + int getDevices(); + + /** + * Returns device count. + * Saves the number of the devices available on the platform to ndev. + */ + int getDeviceCount(int &ndev); + + /** Get the name of the device in use. + * Query the device that is used and get the name of the device. The name is saved in the + * device_name string. Returns DKS_SUCCESS + */ + int getDeviceName(std::string &device_name); + + /** Set the device to use. + * Pass the index of the device to use by dks. + */ + int setDefaultDevice(int device); + + /** Get unique devices. + * Get a list of all the unique devices available on the platform. + * When API and device type for DKS is set, getDeviceList can get all the unique devices + * available for this API and device type. Used for autotuning if multiple different GPUs are + * installed on the system. + */ + int getDeviceList(std::vector &devices); + + /** + * Initialize DKS. + * Set framework and device to use. If OpenCL is used create context with device. + * Return success or error code. + */ + int initDevice(); + + /** + * Create stream for async execution. + * Function to create different streams with device to allow async kernel execution and data + * transfer. Currently implemented for CUDA with cuda streams. 
streamId can be used later + * to use the created stream. Returns success or error code. + * TODO: for opencl use different + * contexts similar to cuda streams to achieve async execution. TODO: for intel mic look at + * library (libxstream) from Hans Pabst. + */ + int createStream(int &streamId); + + /** + * Send pointer to device memory from one MPI process to another. + * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses + * cuda IPC. Gets the IPC handle of the memory allocated on the device pointed to by mem_ptr and does an MPI_Send to + * dest process where matching receivePointer should be called. Returns success or error code. + * TODO: opencl and mic cases still need implementations + */ +#ifdef DKS_MPI + int sendPointer(void *mem_ptr, int dest, MPI_Comm comm); +#endif + + /** + * Receive pointer to device memory from another MPI process. + * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses + * cuda IPC. Uses MPI_Recv to get the IPC handle from another MPI process and opens a reference + * to this memory. Together with sendPointer function allows multiple MPI processes to share + * one memory region of the device. Returns success or error code. + * TODO: opencl and mic cases still need implementations + */ +#ifdef DKS_MPI + void * receivePointer(int hostproc, MPI_Comm comm, int &ierr); +#endif + + /** + * Close handle to device memory. + * If receivePointer is used to open memory handle allocated by another MPI process closeHandle + * should be called to free resources instead of freeMemory. Returns success or error code. + * TODO: opencl and mic cases still need implementations. + */ + int closeHandle(void *mem_ptr); + + /** + * Wait till all tasks running on device are completed. + * Forces a device synchronization - waits till all tasks on the device are complete. + * Implemented for cuda. 
Forces sync only in context in witch it is called - only waits + * for tasks launched by process calling syncDevice. If multiple processes launch different + * tasks each process is responsible for its own synchronization. Returns success or error code. + * TODO: opencl and mic implementations still necessary + */ + int syncDevice(); + + /** + * Allocate memory and transfer data to device. + * Returns a void pointer which can be used in later kernels to reference + * allocated device memory. data_in pointer to data to be transfered to device, + * elements is the number of data elements to transfer, T - type of data to transfer. + * If memory allocation or data transfer fails ierr will be set to error code. + */ + template + void * pushData(const void *data_in, int elements, int &ierr) { + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem mem_ptr; + size_t size = sizeof(T)*elements; + mem_ptr = oclbase->ocl_allocateMemory(size, ierr); + oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE); + + ierr = DKS_SUCCESS; + return mem_ptr; +#endif + } else if (apiCuda()){ +#ifdef DKS_CUDA + //cuda version + void * mem_ptr = NULL; + size_t size = sizeof(T)*elements; + mem_ptr = cbase->cuda_allocateMemory(size, ierr); + cbase->cuda_writeData((T*)mem_ptr, data_in, size); + + ierr = DKS_SUCCESS; + return mem_ptr; +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + void * mem_ptr = NULL; + mem_ptr = micbase.mic_pushData(data_in, elements); + + return mem_ptr; +#endif + } + + ierr = DKS_ERROR; + return NULL; + } + + /** + * Read data from device and free device memory. + * Reads data from device pointed by mem_ptr into data_out pointer. Elements + * specifies the number of data elements to read, T specifies the datatype of + * elements to copy. Returns error code if read data or free memory fails. 
+ */ + template + int pullData(void *mem_ptr, void* data_out, int elements) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + oclbase->ocl_readData(clmem_ptr, data_out, size); + oclbase->ocl_freeMemory(clmem_ptr); +#endif + } else if (apiCuda()) { +#ifdef DKS_CUDA + //cuda version + size_t size = sizeof(T)*elements; + cbase->cuda_readData((T*)mem_ptr, data_out, size); + cbase->cuda_freeMemory(mem_ptr); +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + micbase.mic_pullData(mem_ptr, data_out, elements); +#endif + } + + return DKS_SUCCESS; + } + + /** + * Allocate memory on device and return pointer to device memory. + * Allocates memory of type T, elements specifies the number of + * elements for which memory should be allocated. If memory allocation + * fails ierr is set to error code. Returns void pointer to device memory. + */ + template + void * allocateMemory(int elements, int &ierr) { + ierr = DKS_SUCCESS; + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem mem_ptr; + size_t size = sizeof(T)*elements; + mem_ptr = oclbase->ocl_allocateMemory(size, ierr); + return mem_ptr; +#endif + } else if (apiCuda()) { +#ifdef DKS_CUDA + //cuda version + void * mem_ptr = NULL; + size_t size = sizeof(T)*elements; + mem_ptr = cbase->cuda_allocateMemory(size, ierr); + return mem_ptr; +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + void * mem_ptr = NULL; + mem_ptr = micbase.mic_allocateMemory(elements); + return mem_ptr; +#endif + } + + ierr = DKS_ERROR; + return NULL; + } + + /** + * Allocates host memory as page-locked. + * Used for memroy allocation on the host side for pointer ptr for size elements. + * Page locked memory improves + * data transfer rates between host and device and allows async data transfer + * and kernel execution. Reurns succes or error code. + * TODO: opencl and mic implementations needed. 
+ */ + template + int allocateHostMemory(T *&ptr, int size) + { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size)); + + DEBUG_MSG("Pinned memory allocation not implemented for this platform"); + return DKS_ERROR; + } + + /** + * Free host page-locked memory. + * Used to free page-locked memory on the host that was allocated using + * allocateHostMemory. ptr is the host pointer where page-locked memory was allocated, + * size - number of elements held by the memroy. + */ + template + int freeHostMemory(T* &ptr, int size) + { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr)); + + return DKS_ERROR; + } + + /** + * Page lock allocated host memory. + * Page locked memory improves data transfer between host and device (true for cuda and + * opencl, maybe also mic). ptr - pointer to memory that needs to be page locked, + * size - number of elements in array. + * TODO: mic and opencl implementations needed + */ + template + int registerHostMemory(T *ptr, int size) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size)); + + return DKS_ERROR; + } + + /** + * Unregister page locked memory. + * TODO: opencl and mic implementations needed· + */ + template + int unregisterHostMemory(T *ptr) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr)); + return DKS_ERROR; + } + + /** + * Write data from host to device. + * Write data from data to device memory referenced by mem_ptr. Elements spicify the + * number of elements to write, offset specifies the offset from the first element. + * Returns success or error code. Performs a blocking write - control to the host + * is returned only when data transfer is complete. 
+ */ + template + int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + size_t offset_bytes = sizeof(T)*offset; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE); +#endif + + } else if (apiCuda()){ + //cuda version + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset)); + + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_writeData(mem_ptr, data, elements, offset)); + + } + + return DKS_ERROR; + + } + + /** + * Write data to device using async write. + * Queue a async data write and return control to host imediately. + * mem_ptr - device memory pointer, data - host memory pointer, + * elements - number of data elements to write + * stremaId - stream id to use, offset - offset on device from first element + * For trully async execution on cuda stream other than default needs to be created + * and device memory must be page-locked. Otherwise functions just asynchronosly with + * respect to host. + * TODO: mic and opencl implementations needed (goes to blocking writes) + */ + template + int writeDataAsync(void *mem_ptr, const void *data, int elements, + int streamId = -1, int offset = 0) { + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + oclbase->ocl_writeData(clmem_ptr, data, size, 0, CL_FALSE); +#endif + } else if (apiCuda()){ + //cuda version + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_writeDataAsync(mem_ptr, data, elements, streamId, offset)); + } + + return DKS_ERROR; + + } + + /** + * Gather 3D data from multiple mpi processes to one memory region. 
+ * When multiple processes share the same device memory using sendPointer and receivePointer + * gather3DDataAsync allows each process to write data to its memory region. Uses async writes. + * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local + * data dimensions, id - starting indexes in global domain for each process + * streamId - stream to use for data transfers. + * Returns success or error code. + */ +#ifdef DKS_MPI + template + int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3], + int id[3], int streamId = -1 ) { + + + //int p = 1; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int hoffset, doffset, ierr; + + //number of continuous memory elements + int elements = Nl[0]; + if (Nl[0] == Ng[0]) { + elements *= Nl[1]; + if (Nl[1] == Ng[1]) + elements *= Nl[2]; + } + + //starting index + int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0]; + + //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split + if (Nl[0] != Ng[0]) { + for (int i = 0; i < Nl[2]; i++) { + for (int j = 0; j < Nl[1]; j++) { + doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid; + hoffset = (i * Nl[1] + j) * elements; + ierr = writeDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + } + return DKS_SUCCESS; + } + + //copy piece by piece 3rd dim if 2nd dim is split + if (Nl[1] != Ng[1]) { + for (int i = 0; i < Nl[2]; i++) { + doffset = i* Ng[1] * Ng[0] + sid; + ierr = writeDataAsync(mem_ptr, data + i*elements, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + return DKS_SUCCESS; + } + + //if only 3rd dim is split all elements are continuous so write one chunk + doffset = sid; + return writeDataAsync(mem_ptr, data, elements, streamId, doffset); + + } +#endif + + /** + * Scatter 3D data to multiple MPI processes from one device memory region. 
+ * When multiple processes share the same device memory using sendPointer and receivePointer + * scatter3DDataAsync allows each process to read data from its memory region. Uses async reads. + * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local + * data dimensions, id - starting indexes in global domain for each process + * streamId - stream to use for data transfers. + * Returns success or error code. + */ +#ifdef DKS_MPI + template + int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3], + int id[3], int streamId = -1) { + + //int p = 1; + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int hoffset, doffset, ierr; + + //number of continuous memory elements + int elements = Nl[0]; + if (Nl[0] == Ng[0]) { + elements *= Nl[1]; + if (Nl[1] == Ng[1]) + elements *= Nl[2]; + } + + //starting index + int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0]; + + //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split + if (Nl[0] != Ng[0]) { + for (int i = 0; i < Nl[2]; i++) { + for (int j = 0; j < Nl[1]; j++) { + doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid; + hoffset = (i * Nl[1] + j) * elements; + ierr = readDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + } + return DKS_SUCCESS; + } + + //copy piece by piece 3rd dim if 2nd dim is split + if (Nl[1] != Ng[1]) { + for (int i = 0; i < Nl[2]; i++) { + doffset = i* Ng[1] * Ng[0] + sid; + hoffset = i * elements; + ierr = readDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + return DKS_SUCCESS; + } + + //if only 3rd dim is split all elements are continuous so write one chunk + doffset = sid; + return readDataAsync(mem_ptr, data, elements, streamId, doffset); + + } +#endif + + /** + * Create MPI subarray for 3D data gather and scatter using cuda aware MPI. 
+ * If multiple MPI processes share device and cuda aware MPI is used for data transfer + * creates a MPI subarray so each MPI process can write and read to its own memory region. + * N_global - global domain dimensions, N_local - local domain dimensions, datatype - MPI datatype + */ +#ifdef DKS_MPI + template + MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) { + //create MPI datatypes to transfer decomposed domain from GPU memory + int sizes[3] = {N_global[2], N_global[1], N_global[0]}; + int subsizes[3] = {N_local[2], N_local[1], N_local[0]}; + int starts[3] = {0, 0, 0}; + + MPI_Datatype stype, rtype; + MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype); + MPI_Type_create_resized(stype, 0, sizeof(T), &rtype); + MPI_Type_commit(&rtype); + + return rtype; + } +#endif + + /** + * Gather 3D data from multiple MPI processes to device using cuda aware MPI. + * Using cuda aware mpi allows to gather data to one device memory region allocated + * by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer, + * size - number of elements to transfer, stype - data type of elements, N_global - + * global dimensions of the domain, N_local - local domain dimensions, + * idx,idy,idz - starting indexes in global domain for each process, numNodes - number + * of processes, myNode - current node, rootNode - node that allocated device memory, + * comm - MPI communicator + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, MPI_Comm comm) + { + + MPI_Datatype rtype = create3DMPISubarray(N_global, N_local, stype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } else if (apiCuda()) { + MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm ); + } else if (apiOpenMP()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather. + * For detailed parameter description see gather3DData docs. + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, + MPI_Comm comm, MPI_Request &request) + { + + MPI_Datatype rtype = create3DMPISubarray(N_global, N_local, stype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } else if (apiCuda()) { + MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm, &request ); + + } else if (apiOpenMP()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Scatter 3D data from device to multiple MPI processes using cuda aware MPI. + * If multiple MPI prcesses share one device allows to scatter 3D data regions + * from device memory allocated by one of the processes to all other MPI processes. + * For detailed parameter description see gather3DData docs. + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, MPI_Comm comm) + { + + MPI_Datatype stype = create3DMPISubarray(N_global, N_local, rtype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + } else if (apiCuda()) { + + //async scatter + //use cuda aware mpi + MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, rootNode, comm ); + return DKS_ERROR; + } else if (apiOpenMP()) { + + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Read data from device memory. + * Read data referenced by mem_ptr int out_data. Elements indicates the number of data + * elements to read and offset is the offset on the device from start of the memroy. + * Data type to read is specified by T. Performs a blocking read. 
+   */
+  template <typename T>
+  int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) {
+
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version, size and offset are passed in bytes
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      size_t size = sizeof(T)*elements;
+      size_t offset_bytes = sizeof(T)*offset;
+      return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
+#endif
+    } else if (apiCuda()){
+      size_t size = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
+    } else if (apiOpenMP()) {
+      //the MIC call takes element counts, so the element type is passed explicitly
+      return MIC_SAFECALL(micbase.mic_readData<T>(mem_ptr, out_data, elements, offset));
+    }
+
+    //no API selected, or OpenCL selected in a build without DKS_OPENCL
+    return DKS_ERROR;
+  }
+
+  /**
+   * Performs an async data read from device.
+   * Queues data read from device and returns control to host. stream id specifies stream to use for
+   * the read. Device async read can be performed if host memory is page-locked and stream other than
+   * default -1 is used. For other parameter detailed description see readData function.
+   * TODO: opencl and mic implementations (currently reverts to blocking reads).
+   */
+  template <typename T>
+  int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) {
+
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      //OpenCL version: falls back to the blocking read, streamId is not used
+      cl_mem clmem_ptr = (cl_mem)mem_ptr;
+      size_t size = sizeof(T)*elements;
+      //honour the requested offset the same way readData does (it used to be dropped)
+      size_t offset_bytes = sizeof(T)*offset;
+      return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
+#endif
+    } else if (apiCuda()){
+      //cuda version
+      size_t size = sizeof(T)*elements;
+      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
+    } else if (apiOpenMP()) {
+      return MIC_SAFECALL(micbase.mic_readDataAsync<T>(mem_ptr, out_data, elements,
+                                                       streamId, offset));
+    }
+
+    //no API selected, or OpenCL selected in a build without DKS_OPENCL
+    return DKS_ERROR;
+  }
+
+
+  /**
+   * Free memory allocated on device.
+   * Free memory referenced by mem_ptr, elements - number of elements in memory,
+   * T - data type.
+   */
+  template <typename T>
+  int freeMemory(void *mem_ptr, int elements) {
+    if (apiOpenCL())
+      return OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr));
+    else if (apiCuda())
+      return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
+    else if (apiOpenMP())
+      //only the MIC back end needs the element type and count to release the buffer
+      return MIC_SAFECALL(micbase.mic_freeMemory<T>(mem_ptr, elements));
+
+    //no API selected
+    return DKS_ERROR;
+  }
+
+
+  ///////////////////////////////////////////////
+  ///////Function library part of dksbase////////
+  ///////////////////////////////////////////////
+
+  /**
+   * Setup FFT function.
+   * Initializes parameters for fft execution. If ndim > 0 initializes handles for fft calls.
+   * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case
+   * each fft will do its own setup according to fft size and dimensions.
+   * TODO: opencl and mic implementations
+   */
+  int setupFFT(int ndim, int N[3]);
+  //BENI:
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0);
+  //BENI:
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0);
+
+  /**
+   * Call complex-to-complex fft.
+   * Executes in place complex to complex fft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation
+   */
+  int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Call complex-to-complex ifft.
+   * Executes in place complex to complex ifft on the device on data pointed by data_ptr.
+   * stream id can be specified to use other streams than default.
+   * TODO: mic implementation.
+   */
+  int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Normalize complex to complex ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
+   * fft size
+   * TODO: mic implementation.
+   */
+  int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Call real to complex FFT.
+   * Executes out of place real to complex fft, real_ptr - points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize - size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations
+   */
+  int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Call complex to real iFFT.
+   * Executes out of place complex to real ifft, real_ptr - points to real data, comp_ptr - points
+   * to complex data, ndim - dimension of data, dimsize - size of each dimension. real_ptr size
+   * should be dimsize[0]*dimsize[1]*dimsize[2], comp_ptr size should be at least
+   * (dimsize[0]/2+1)*dimsize[1]*dimsize[2]
+   * TODO: opencl and mic implementations.
+   */
+  int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Normalize complex to real ifft.
+   * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by
+   * fft size.
+   * TODO: opencl and mic implementations.
+   */
+  int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1);
+
+  /**
+   * Transpose 2D and 3D arrays, OpenCL implementation
+   * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose
+   */
+  int callTranspose(void *mem_ptr, int N[3], int ndim, int dim);
+
+  /**
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ,
+                         double hz_m0, double hz_m1, double hz_m2, int streamId = -1);
+
+  /**
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * TODO: opencl and mic implementations.
+   */
+  int callGreensIntegration(void *mem_ptr, void *tmp_ptr,
+                            int I, int J, int K, int streamId = -1);
+
+  /**
+   * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device.
+   * For specifics check OPAL docs.
+   * NOTE(review): this description is identical to the one of callGreensIntegral and
+   * callGreensIntegration, it does not say what the mirror step does - confirm against OPAL.
+   * TODO: opencl and mic implementations.
+   */
+  int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1);
+
+  /**
+   * Element by element multiplication.
+   * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies
+   * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1.
+   * TODO: opencl and mic implementations.
+   */
+  int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1);
+
+  /**
+   * Chi square for parameter fitting on device.
+   * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for
+   * intermediate results. Chi square results are put in result.
+   */
+  int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq,
+                      double fTimeResolution, double fRebin,
+                      int sensors, int length, int numpar, double &result);
+
+  /**
+   * max-log-likelihood for parameter fitting on device.
+   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
+   * mem_par - pointer to parameter set, mem_result - pointer for
+   * intermediate results. Chi square results are put in result.
+   * TODO: opencl and mic implementations.
+   */
+  int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                        double fTimeResolution, double fRebin, double fGoodBinOffser,
+                        int sensors, int length, int numpar,
+                        double &result);
+
+  /**
+   * max-log-likelihood for parameter fitting on device.
+   * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor,
+   * mem_par - pointer to parameter set, mem_result - pointer for
+   * intermediate results. Chi square results are put in result.
+   * TODO: opencl and mic implementations.
+   */
+  int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result,
+                          double fTimeResolution, double fRebin, double fGoodBinOffser,
+                          int sensors, int length, int numpar,
+                          double &result);
+
+  /**
+   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics(void *mem_ptr, void *par_ptr,
+                            int numparticles, int numparams,
+                            int &numaddback, int &numdead);
+
+
+
+  /**
+   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles);
+
+  /**
+   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * Test function for the MIC to test SoA layout vs AoS layout used in previous versions
+   */
+  int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
+                               void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                               void *px_ptr, void *py_ptr, void *pz_ptr,
+                               void *par_ptr, int numparticles);
+
+  /**
+   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback);
+
+  /**
+   * Monte Carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class documentation.
+   * TODO: opencl and mic implementations.
+   */
+  int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
+                                   void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                                   void *px_ptr, void *py_ptr, void *pz_ptr,
+                                   void *par_ptr, int numparticles, int &numaddback);
+
+  /**
+   * Init random number states and save for reuse on device.
+   * TODO: opencl and mic implementations.
+   */
+  int callInitRandoms(int size);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart,
+                               void *dt_ptr, double dt, double c,
+                               bool usedt = false, int streamId = -1);
+
+  /**
+   * Integration code from ParallelTTracker from OPAL.
+   * For specifics check OPAL docs and CudaCollimatorPhysics class docs
+   */
+  int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr,
+                                        void *lastSec_ptr, void *orient_ptr,
+                                        int npart, int nsec, void *dt_ptr,
+                                        double dt, double c, bool usedt = false,
+                                        int streamId = -1);
+
+  /**
+   * Print memory information on device (total, used, available)
+   * Returns DKS_ERROR when the selected API is not CUDA.
+   * TODO: opencl and mic implementation
+   */
+  int callMemInfo() {
+    if (apiCuda())
+      return CUDA_SAFECALL(cbase->cuda_memInfo());
+
+    return DKS_ERROR;
+  }
+
+  /**
+   * Test function to profile opencl kernel calls.
+   * Used for debugging and timing purposes only. Does nothing unless OpenCL is the
+   * selected API and DKS was compiled with DKS_OPENCL.
+   */
+  void oclEventInfo() {
+    if (apiOpenCL()) {
+      //guarded with #ifdef instead of OPENCL_SAFECALL: without DKS_OPENCL the macro
+      //expands to an error code, which can not be returned from a void function
+#ifdef DKS_OPENCL
+      oclbase->ocl_eventInfo();
+#endif
+    }
+  }
+
+  /**
+   * Test function to profile opencl kernel calls.
+   * Used for debugging and timing purposes only.
+   */
+  void oclClearEvents() {
+    if (apiOpenCL()) {
+#ifdef DKS_OPENCL
+      oclbase->ocl_clearEvents();
+#endif
+    }
+  }
+
+
+};
+
+#endif
diff --git a/src/DKSBaseMuSR.cpp b/src/DKSBaseMuSR.cpp
new file mode 100644
index 0000000..3df59e9
--- /dev/null
+++ b/src/DKSBaseMuSR.cpp
@@ -0,0 +1,196 @@
+#include "DKSBaseMuSR.h"
+
+//no chi square runtime exists until initChiSquare is called
+DKSBaseMuSR::DKSBaseMuSR() {
+  chiSq = nullptr;
+  chiSquareSize_m = -1;
+}
+
+DKSBaseMuSR::~DKSBaseMuSR() {
+  freeChiSquare();
+}
+
+//forwards to the chi square runtime, initChiSquare must have been called before
+int DKSBaseMuSR::callCompileProgram(std::string function, bool mlh) {
+  return chiSq->compileProgram(function, mlh);
+}
+
+int DKSBaseMuSR::callLaunchChiSquare(int fitType,
+                                     void *mem_data, void *mem_err, int length,
+                                     int numpar, int numfunc, int nummap,
+                                     double timeStart, double timeStep,
+                                     double &result)
+{
+
+
+  //if we are not auto tuning and the size of the problem has changed find the new parameters
+  //from autotuning config file
+  if (!isAutoTuningOn() && length != chiSquareSize_m) {
+    //NOTE(review): both values are passed on uninitialized if getConfigParameter does not
+    //write them - confirm that it always sets its output argument
+    int numBlocks, blockSize;
+    std::string device_name;
+    getDeviceName(device_name);
+    dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
+                                 length, "NumBlocks", numBlocks);
+    dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare",
+                                 length, "BlockSize", blockSize);
+    chiSq->setKernelParams(numBlocks, blockSize);
+
+    //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
+
+    //remember the size the kernel parameters were looked up for
+    chiSquareSize_m = length;
+  }
+
+  int ierr = chiSq->launchChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc,
+                                    nummap, timeStart, timeStep, result);
+
+  //with auto-tuning enabled every launch is followed by a tuning run, the selected
+  //configuration is collected in config and not used any further here
+  if ( isAutoTuningOn() ) {
+    std::vector<int> config;
+    callAutoTuningChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, nummap, timeStart,
+                            timeStep, result, config);
+  }
+
+  return ierr;
+}
+
+int DKSBaseMuSR::callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
+                                         int numpar, int numfunc, int nummap,
+                                         double timeStart, double timeStep,
+                                         double &result, std::vector<int> &config)
+{
+
+
int loops = 100;
+  //select the tuner that matches the API and device that are currently in use
+  DKSAutoTuning *autoTuning;
+  if (apiCuda())
+    autoTuning = new DKSAutoTuning(this, API_CUDA, DEVICE_GPU_NEW, loops);
+  else if (apiOpenCL() && deviceGPU())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_GPU_NEW, loops);
+  else if (apiOpenCL() && deviceCPU())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_CPU_NEW, loops);
+  else if (apiOpenCL() && deviceMIC())
+    autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_MIC_NEW, loops);
+  else
+    autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW, loops);
+
+
+  //1024 is the starting value handed to the device check, which may overwrite it
+  int maxThreadsPerBlock = 1024;
+  checkMuSRKernels(fitType, maxThreadsPerBlock);
+  std::cout << "Max threads for autotune " << maxThreadsPerBlock << std::endl;
+
+  //create the function to be timed
+  //every argument is bound, so the callable takes no arguments and returns the launch status;
+  //std::bind stores a copy of result, the caller's variable is not written by the timed runs
+  std::function<int()> f = std::bind(&ChiSquareRuntime::launchChiSquare, chiSq,
+                                     fitType, mem_data, mem_err, length, numpar, numfunc, nummap,
+                                     timeStart, timeStep, result);
+  autoTuning->setFunction(f, "launchChiSquare");
+
+  //create the parameters for auto-tuning
+  autoTuning->addParameter(&chiSq->blockSize_m, 32, maxThreadsPerBlock, 32, "BlockSize");
+  autoTuning->addParameter(&chiSq->numBlocks_m, 100, 5000, 100, "NumBlocks");
+
+  autoTuning->lineSearch();
+
+  //autoTuning->hillClimbing(100);
+
+  //autoTuning->simulatedAnnealing(1e-3, 1e-6);
+
+  //autoTuning->exaustiveSearch();
+
+  //store the values the search left in the runtime under the current problem size
+  std::string device_name;
+  getDeviceName(device_name);
+  dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
+                               "NumBlocks", chiSq->numBlocks_m);
+  dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length,
+                               "BlockSize", chiSq->blockSize_m);
+
+
+  //report the configuration to the caller: block size first, then number of blocks
+  config.push_back(chiSq->blockSize_m);
+  config.push_back(chiSq->numBlocks_m);
+
+  delete autoTuning;
+
+  return DKS_SUCCESS;
+
+}
+
+int DKSBaseMuSR::testAutoTuning() {
+
+  DKSAutoTuning *autoTuning;
+  DKSAutoTuningTester *tester;
+
+  autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW);
+  tester = new
DKSAutoTuningTester();
+
+  //NOTE(review): the template argument of std::function was lost in extraction; double() is
+  //assumed because the tuned parameters x and y are searched over floating point ranges - confirm
+  std::function<double()> f = std::bind(&DKSAutoTuningTester::peaksZ, tester);
+  autoTuning->setFunction(f, "testAutoTuner", false);
+
+  autoTuning->addParameter(&tester->x, -3.0, 3.0, 0.5, "x");
+  autoTuning->addParameter(&tester->y, -3.0, 3.0, 0.5, "y");
+
+  autoTuning->exaustiveSearch();
+
+  autoTuning->hillClimbing(10);
+
+  autoTuning->simulatedAnnealing(10, 0.0005);
+
+  //both helpers were allocated with new at the top of the function and used to be leaked;
+  //the tuner goes first because it holds pointers into tester
+  delete autoTuning;
+  delete tester;
+
+  return DKS_SUCCESS;
+}
+
+//forwards N0, tau and bkg to the chi square runtime
+int DKSBaseMuSR::callSetConsts(double N0, double tau, double bkg) {
+  return chiSq->setConsts(N0, tau, bkg);
+}
+
+//forwards alpha and beta to the chi square runtime
+int DKSBaseMuSR::callSetConsts(double alpha, double beta) {
+  return chiSq->setConsts(alpha, beta);
+}
+
+//creates the chi square runtime for the selected API and lets it set up its buffers,
+//returns DKS_ERROR if the API is not compiled in
+int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int size_map) {
+  int ierr;
+
+  //release a runtime left over from an earlier init, otherwise it would be leaked
+  freeChiSquare();
+
+  if (apiCuda()) {
+    ierr = CUDA_SAFECALL( DKS_SUCCESS );
+    chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase()));
+  } else {
+    ierr = OPENCL_SAFECALL( DKS_SUCCESS );
+    //SAFEINIT, not SAFECALL: in a build without DKS_OPENCL this has to yield NULL,
+    //SAFECALL would try to assign the integer error code to the pointer
+    chiSq = OPENCL_SAFEINIT(new OpenCLChiSquareRuntime(getOpenCLBase()));
+  }
+
+  if (ierr == DKS_SUCCESS) {
+    return chiSq->initChiSquare(size_data, size_param, size_func, size_map);
+  } else {
+    DEBUG_MSG("DKS API not set, or DKS compiled without sellected API support");
+    return DKS_ERROR;
+  }
+}
+
+//releases the device buffers and the runtime object, safe to call when nothing was initialized
+int DKSBaseMuSR::freeChiSquare() {
+  int ierr = DKS_SUCCESS;
+  if (chiSq != nullptr) {
+    ierr = chiSq->freeChiSquare();
+    delete chiSq;
+    chiSq = nullptr;
+  }
+  return ierr;
+}
+
+int DKSBaseMuSR::writeParams(const double *params, int numparams) {
+  return chiSq->writeParams(params, numparams);
+}
+
+int DKSBaseMuSR::writeFunctions(const double *func, int numfunc) {
+  return chiSq->writeFunc(func, numfunc);
+}
+
+int DKSBaseMuSR::writeMaps(const int *map, int numfunc) {
+  return chiSq->writeMap(map, numfunc);
+
+}
+
+//check only, the thread limit reported by the runtime is discarded
+int DKSBaseMuSR::checkMuSRKernels(int fitType) {
+  int threadsPerBlock = 1;
+  return chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
+}
+
+int DKSBaseMuSR::checkMuSRKernels(int fitType, int &threadsPerBlock) {
+  return
chiSq->checkChiSquareKernels(fitType, threadsPerBlock);
+}
+
+//forwards to the chi square runtime, the result is written to oper
+int DKSBaseMuSR::getOperations(int &oper) {
+  return chiSq->getOperations(oper);
+}
diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h
new file mode 100644
index 0000000..30f2d89
--- /dev/null
+++ b/src/DKSBaseMuSR.h
@@ -0,0 +1,137 @@
+#ifndef H_DKS_BASEMUSR
+#define H_DKS_BASEMUSR
+
+//NOTE(review): the header names of the two original #include lines were lost in extraction;
+//the headers below cover what this class and its implementation use directly
+//(std::cout, std::string, std::vector) - confirm against the repository
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "AutoTuning/DKSAutoTuning.h"
+#include "AutoTuning/DKSAutoTuningTester.h"
+
+#include "DKSBase.h"
+
+#include "Algorithms/ChiSquareRuntime.h"
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaChiSquareRuntime.cuh"
+#endif
+
+#ifdef DKS_OPENCL
+#include "OpenCL/OpenCLChiSquareRuntime.h"
+#endif
+
+class DKSBaseMuSR : public DKSBase {
+
+private:
+
+  //API specific chi square runtime, created by initChiSquare and released by freeChiSquare
+  ChiSquareRuntime *chiSq;
+
+  //data length the kernel launch parameters were last looked up for, -1 if never
+  int chiSquareSize_m;
+
+public:
+
+  DKSBaseMuSR();
+
+  ~DKSBaseMuSR();
+
+  /** Compile the program with kernels to be run.
+   *  String function contains the string that will be added to the code to compile in the
+   *  function: __device__ double fTheory(double t, double *p, double *f, int *m);
+   *  Function string must be a valid C math expression. It can contain operators, math functions
+   *  and predefined functions listed in:
+   *  http://lmu.web.psi.ch/musrfit/user/MUSR/MusrFit.html#A_4.3_The_THEORY_Block
+   *  Predefined functions can be accessed by the abbreviation given in the table
+   *  Parameters can be accessed in form p[idx] or p[m[idx]] - where p represents parameter array
+   *  m represents map array and idx is the index to use from the maps. Precalculated function
+   *  values can be accessed the same way - f[idx] or f[m[idx]]. Returns DKS_SUCCESS if everything
+   *  runs successfully, otherwise returns DKS_ERROR. If DKS is compiled with debug flag enabled
+   *  prints DKS error message in case something fails
+   */
+  int callCompileProgram(std::string function, bool mlh = false);
+
+  /** Launch chi square calculation on data set written in mem_data memory on device.
+   *  mem_par, mem_map and mem_func hold pointers to parameter, function and map values
+   *  for this data set (parameter array is one for all the data sets, maps and functions
+   *  change between data sets). Resulting chi square value for this dataset will be put in
+   *  result variable. Returns DKS_SUCCESS if everything runs successfully, otherwise returns
+   *  DKS_ERROR. If DKS is compiled with debug flag enabled prints DKS error message in case
+   *  something fails
+   *  NOTE(review): mem_par, mem_map and mem_func are not parameters of this function, the
+   *  values are written beforehand with writeParams, writeFunctions and writeMaps.
+   */
+  int callLaunchChiSquare(int fitType,
+                          void *mem_data, void *mem_err, int length,
+                          int numpar, int numfunc, int nummap,
+                          double timeStart, double timeStep,
+                          double &result);
+
+  /** Launch auto-tuning of chisquare function for the selected device.
+   *  Binds the chi square launch with the given arguments into a callable that is handed to
+   *  the AutoTuning class. CUDA and OpenCL version - gives AutoTuning class access to the
+   *  block size and number of blocks of the runtime, which are varied to find the optimal
+   *  values. The implementation currently runs a line search over both parameters. On return
+   *  config holds the selected block size followed by the number of blocks.
+   */
+  int callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length,
+                              int numpar, int numfunc, int nummap,
+                              double timeStart, double timeStep,
+                              double &result, std::vector<int> &config);
+
+  /** Set N0, tau and BKG values for the run.
+   *  Needs to be called before kernel launch if these values are changing
+   */
+  int callSetConsts(double N0, double tau, double bkg);
+
+  /** Set alpha and beta values for the run.
+   *  Needs to be called before kernel launch if these values are changing
+   */
+  int callSetConsts(double alpha, double beta);
+
+  /** Init chisquare calculations.
+   *  Size is the maximum number of elements in any of the data sets used.
+   */
+  int initChiSquare(int size_data, int size_param, int size_func, int size_map);
+
+  /** Free temporary device storage allocated for chi^2 kernel.
+   *  Return error code if freeing the device fails.
+   */
+  int freeChiSquare();
+
+  /** Write params to device.
+   *  Write params from double array to device, params device memory is managed by DKS.
+   */
+  int writeParams(const double *params, int numparams);
+
+  /** Write function values to device.
+   *  Write precalculated function values to device, memory for functions on device is handled
+   *  by DKS.
+   */
+  int writeFunctions(const double *func, int numfunc);
+
+  /** Write map indexes to device.
+   *  Write map indexes to use in defined theory function to device. Memory for map indexes is
+   *  handled by DKS.
+   */
+  int writeMaps(const int *map, int numfunc);
+
+  /** Check if device can run necessary kernels.
+   *  Check selected device properties to see if device
+   *  supports double precision and if device can run the
+   *  necessary number of work_items / work_groups to successfully
+   *  execute CUDA/OpenCL kernels.
+   */
+  int checkMuSRKernels(int fitType);
+
+  /** Perform the same check as checkMuSRKernels(int fitType) and return max threads per block.
+   *  Used for autotuning to check what is the device limit for threads per block to correctly
+   *  set the upper bound when searching the parameter space.
+   */
+  int checkMuSRKernels(int fitType, int &threadsPerBlock);
+
+  /** Debug function to test auto-tuning search functions
+   */
+  int testAutoTuning();
+
+  /** Get the number of operations in compiled kernel.
+   */
+  int getOperations(int &oper);
+
+};
+
+#endif
diff --git a/src/DKSDefinitions.h b/src/DKSDefinitions.h
new file mode 100644
index 0000000..63fba34
--- /dev/null
+++ b/src/DKSDefinitions.h
@@ -0,0 +1,71 @@
+#ifndef H_DKS_DEFINITIONS
+#define H_DKS_DEFINITIONS
+
+//names of the supported APIs
+#define API_OPENCL "OpenCL"
+#define API_CUDA "Cuda"
+#define API_OPENMP "OpenMP"
+#define API_UNKNOWN "Unknown"
+
+//device names in the newer spelling
+#define DEVICE_GPU_NEW "GPU"
+#define DEVICE_CPU_NEW "CPU"
+#define DEVICE_MIC_NEW "MIC"
+#define DEVICE_UNKNOWN_NEW "Unknown"
+
+//device names in the older, command line style spelling
+#define DEVICE_GPU "-gpu"
+#define DEVICE_CPU "-cpu"
+#define DEVICE_MIC "-mic"
+
+//define macro for printing debug messages if debug flag is set
+//without DEBUG the macro expands to nothing, so its argument is never evaluated
+#ifdef DEBUG
+#define DEBUG_MSG(x) (std::cout << x << std::endl)
+#else
+#define DEBUG_MSG(x)
+#endif
+
+//define DKS error codes
+#define DKS_SUCCESS 0
+#define DKS_ERROR 1
+#define DKS_API_NOT_ENABLED 100
+
+#define OCL_SUCCESS 0
+#define OCL_ERROR 1
+
+//define macros to enable or disable calls to specific frameworks
+//if framework specific flag is set execute the statement, if not give DKS_API_NOT_ENABLED error
+//the disabled variants drop their argument, so it is neither compiled nor evaluated
+#ifdef DKS_CUDA
+#define CUDA_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define CUDA_SAFECALL(...) ( DKS_API_NOT_ENABLED )
+#endif
+
+#ifdef DKS_OPENCL
+#define OPENCL_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define OPENCL_SAFECALL(...) ( DKS_API_NOT_ENABLED )
+#endif
+
+#ifdef DKS_MIC
+#define MIC_SAFECALL(...) ( __VA_ARGS__ )
+#else
+#define MIC_SAFECALL(...)
( DKS_API_NOT_ENABLED )
+#endif
+
+//SAFEINIT variants are for expressions that create an object: they yield the object if the
+//framework is compiled in and NULL otherwise
+#ifdef DKS_CUDA
+#define CUDA_SAFEINIT(x) ( x )
+#else
+#define CUDA_SAFEINIT(x) ( NULL )
+#endif
+
+#ifdef DKS_OPENCL
+#define OPENCL_SAFEINIT(x) ( x )
+#else
+#define OPENCL_SAFEINIT(x) ( NULL )
+#endif
+
+#ifdef DKS_MIC
+#define MIC_SAFEINIT(x) ( x )
+#else
+#define MIC_SAFEINIT(x) ( NULL )
+#endif
+
+#endif
diff --git a/src/DKSDevice.cpp b/src/DKSDevice.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/src/DKSDevice.h b/src/DKSDevice.h
new file mode 100644
index 0000000..79a69fe
--- /dev/null
+++ b/src/DKSDevice.h
@@ -0,0 +1,37 @@
+/*
+
+Author: Uldis Locans
+
+Info: class that holds information about the compute device
+
+Date: 25.09.2014
+
+NOTE(review): this header has no include guard, and DKSDevice.cpp is added empty, so the
+constructor and destructor declared below have no definition in this patch.
+
+*/
+
+#define DKS_DEVICE_TYPE_GPU 1
+#define DKS_DEVICE_TYPE_MIC 2
+#define DKS_DEVICE_TYPE_CPU 3
+
+class Device {
+
+ private:
+  int m_device_id;
+  int m_device_type;
+  char *m_device_name;
+  char *m_device_vendor;
+
+  //which programming models the device supports
+  bool m_sup_opencl;
+  bool m_sup_cuda;
+  bool m_sup_openmp;
+  bool m_sup_openacc;
+
+  int m_pci_bus_id;
+
+ public:
+
+  Device();
+  ~Device();
+
+
+
+};
\ No newline at end of file
diff --git a/src/DKSImageReconstruction.cpp b/src/DKSImageReconstruction.cpp
new file mode 100644
index 0000000..5f2222a
--- /dev/null
+++ b/src/DKSImageReconstruction.cpp
@@ -0,0 +1,130 @@
+#include "DKSImageReconstruction.h"
+
+DKSImageRecon::DKSImageRecon() {
+
+  //set up base.
since reconstruction is always using cuda, set up base to CUDA
+  setAPI("Cuda");
+  setDevice("-gpu");
+  initDevice();
+
+  //NULL in a build without DKS_CUDA; the call wrappers below do not check for that
+  imageRecon = CUDA_SAFEINIT( new CudaImageReconstruction(getCudaBase()) );
+}
+
+DKSImageRecon::~DKSImageRecon() {
+  //imageRecon is a single object created with new, so it is released with delete
+  //(delete[] on it is undefined behaviour); deleting a null pointer is a no-op
+  //NOTE(review): deleting through the ImageReconstruction base pointer assumes that class
+  //has a virtual destructor - confirm
+  delete imageRecon;
+  imageRecon = NULL;
+}
+
+//the functions below forward their arguments unchanged to the reconstruction
+//implementation and return its status code
+int DKSImageRecon::callCalculateSource(void *image_space, void *image_position,
+                                       void *source_position, void *avg, void *std,
+                                       float diameter, int total_voxels,
+                                       int total_sources, int start)
+{
+  int ierr;
+  ierr = imageRecon->calculateSource(image_space, image_position, source_position,
+                                     avg, std, diameter, total_voxels,
+                                     total_sources, start);
+  return ierr;
+}
+
+int DKSImageRecon::callCalculateBackground(void *image_space, void *image_position,
+                                           void *source_position, void *avg, void *std,
+                                           float diameter, int total_voxels,
+                                           int total_sources, int start)
+{
+
+  int ierr;
+  ierr = imageRecon->calculateBackground(image_space, image_position,
+                                         source_position, avg, std, diameter,
+                                         total_voxels, total_sources, start);
+  return ierr;
+}
+
+int DKSImageRecon::callCalculateSources(void *image_space, void *image_position,
+                                        void *source_position, void *avg, void *std,
+                                        void *diameter, int total_voxels,
+                                        int total_sources, int start)
+{
+  int ierr;
+  ierr = imageRecon->calculateSources(image_space, image_position,
+                                      source_position, avg, std, diameter,
+                                      total_voxels, total_sources, start);
+  return ierr;
+}
+
+int DKSImageRecon::callCalculateBackgrounds(void *image_space, void *image_position,
+                                            void *source_position, void *avg, void *std,
+                                            void *diameter, int total_voxels,
+                                            int total_sources, int start)
+{
+
+  int ierr;
+  ierr = imageRecon->calculateBackgrounds(image_space, image_position,
+                                          source_position, avg, std, diameter,
+                                          total_voxels, total_sources, start);
+
+return ierr;
+}
+
+
+int DKSImageRecon::callGenerateNormalization(void *recon, void *image_position,
+                                             void *det_position, int total_det)
+{
+
+  int ierr = imageRecon->generateNormalization(recon, image_position,
+                                               det_position,
total_det);
+  return ierr;
+}
+
+
+//the functions below forward their arguments unchanged to the reconstruction
+//implementation and return its status code
+int DKSImageRecon::callForwardProjection(void *correction, void *recon, void *list_data,
+                                         void *det_position, void *image_position, int num_events)
+{
+
+  int ierr;
+  ierr = imageRecon->forwardProjection(correction, recon, list_data, det_position,
+                                       image_position, num_events);
+  return ierr;
+}
+
+int DKSImageRecon::callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
+                                          void *det_position, void *image_position,
+                                          int num_events, int num_voxels)
+{
+
+  int ierr;
+  ierr = imageRecon->backwardProjection(correction, recon_corrector, list_data,
+                                        det_position, image_position, num_events,
+                                        num_voxels);
+  return ierr;
+}
+
+int DKSImageRecon::setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) {
+  int ierr = imageRecon->setDimensions(voxel_x, voxel_y, voxel_z, voxel_size);
+  return ierr;
+}
+
+int DKSImageRecon::setEdge(float x_edge, float y_edge, float z_edge) {
+  int ierr = imageRecon->setEdge(x_edge, y_edge, z_edge);
+  return ierr;
+}
+
+int DKSImageRecon::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) {
+  int ierr = imageRecon->setEdge1(x_edge1, y_edge1, z_edge1, z_edge2);
+  return ierr;
+}
+
+int DKSImageRecon::setMinCrystalInRing(float min_CrystalDist_InOneRing,
+                                       float min_CrystalDist_InOneRing1)
+{
+  int ierr = imageRecon->setMinCrystalInRing(min_CrystalDist_InOneRing,
+                                             min_CrystalDist_InOneRing1);
+  return ierr;
+}
+
+int DKSImageRecon::setParams(float matrix_distance_factor, float phantom_diameter,
+                             float atten_per_mm, float ring_diameter)
+{
+  int ierr = imageRecon->setParams(matrix_distance_factor, phantom_diameter,
+                                   atten_per_mm, ring_diameter);
+  return ierr;
+}
diff --git a/src/DKSImageReconstruction.h b/src/DKSImageReconstruction.h
new file mode 100644
index 0000000..32f67ef
--- /dev/null
+++ b/src/DKSImageReconstruction.h
@@ -0,0 +1,120 @@
+#ifndef H_DKS_IMAGERECONSTRUCTION
+#define H_DKS_IMAGERECONSTRUCTION
+
+//NOTE(review): the header name of the next include was lost when this patch was extracted
+#include
+#include "DKSBase.h"
+
+#include "Algorithms/ImageReconstruction.h"
+
+#ifdef DKS_CUDA
+#include "CUDA/CudaImageReconstruction.cuh"
+#endif
+
+class DKSImageRecon : public DKSBase {
+
+private:
+
+  //reconstruction implementation, created in the constructor (CUDA only)
+  ImageReconstruction *imageRecon;
+
+public:
+
+  DKSImageRecon();
+
+  ~DKSImageRecon();
+
+  /** Image reconstruction analysis calculate source.
+   *  diameter is passed as a single value.
+   *
+   */
+  int callCalculateSource(void *image_space, void *image_position, void *source_position,
+                          void *avg, void *std, float diameter, int total_voxels,
+                          int total_sources, int start = 0);
+
+  /** Image reconstruction analysis calculate background.
+   *  diameter is passed as a single value.
+   *
+   */
+  int callCalculateBackground(void *image_space, void *image_position, void *source_position,
+                              void *avg, void *std, float diameter, int total_voxels,
+                              int total_sources, int start = 0);
+
+
+  /** Image reconstruction analysis calculate sources.
+   *  Same as callCalculateSource, but diameter is passed as a pointer.
+   *
+   */
+  int callCalculateSources(void *image_space, void *image_position, void *source_position,
+                           void *avg, void *std, void *diameter, int total_voxels,
+                           int total_sources, int start = 0);
+
+  /** Image reconstruction analysis calculate backgrounds.
+   *  Same as callCalculateBackground, but diameter is passed as a pointer.
+   *
+   */
+  int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position,
+                               void *avg, void *std, void *diameter, int total_voxels,
+                               int total_sources, int start = 0);
+
+  /** Image reconstruction - generate normalization.
+   *
+   */
+  int callGenerateNormalization(void *recon, void *image_position,
+                                void *det_position, int total_det);
+
+  /** Image reconstruction - forward projection.
+   *
+   */
+  int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position,
+                            void *image_position, int num_events);
+
+  /** Image reconstruction - backward projection.
+   *
+   */
+  int callBackwardProjection(void *correction, void *recon_corrector, void *list_data,
+                             void *det_position, void *image_position,
+                             int num_events, int num_voxels);
+
+  /** Set the voxel dimensions on device.
+   *  Values are stored in GPU memory and used in forward and backward projection calculations.
+   *  Call set function once to transfer the values from host side to GPU.
+   *  If value changes on the host side set function needs to be called again to update GPU values.
+   */
+  int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size);
+
+  /** Set the image edge.
+   *  Values are stored in GPU memory and used in forward and backward projection calculations.
+   *  Call set function once to transfer the values from host side to GPU.
+   *  If value changes on the host side set function needs to be called again to update GPU values.
+   */
+  int setEdge(float x_edge, float y_edge, float z_edge);
+
+  /** Set the image edge1.
+   *  Values are stored in GPU memory and used in forward and backward projection calculations.
+   *  Call set function once to transfer the values from host side to GPU.
+   *  If value changes on the host side set function needs to be called again to update GPU values.
+   */
+  int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2);
+
+  /** Set the minimum crystal in one ring values.
+   *  Values are stored in GPU memory and used in forward and backward projection calculations.
+   *  Call set function once to transfer the values from host side to GPU.
+   *  If value changes on the host side set function needs to be called again to update GPU values.
+   */
+  int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1);
+
+  /** Set all other required parameters for reconstruction.
+   *  Values are stored in GPU memory and used in forward and backward projection calculations.
+   *  Call set function once to transfer the values from host side to GPU.
+   *  If value changes on the host side set function needs to be called again to update GPU values.
/**
 * Placeholder for a per-device stream abstraction (CUDA streams, OpenCL
 * contexts, ...) intended to support asynchronous data transfer and kernel
 * execution. The class has no members yet.
 *
 * NOTE(review): the include guard opened with `#ifndef H_DKSSTREAM` in this
 * header is never closed in the visible file; a trailing `#endif` is needed
 * after this class.
 */
class DKSStream {

};  // the terminating semicolon was missing, which broke every includer
delete the array + if (defaultRndSet == 1) + delete[] defaultRndStream; + + //allocate defaultRndStream array + defaultRndStream = new VSLStreamStatePtr[numThreads]; + + //create stream states for each thread +#pragma omp parallel for + for (int i = 0; i < omp_get_num_threads(); i++) + vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i); + + defaultRndSet = 1; + } + + return DKS_SUCCESS; + +} + +//delete default rand streams +int MICBase::mic_deleteRandStreams() { + +#pragma offload target(mic:m_device_id) inout(defaultRndSet) + { + if (defaultRndSet == 1) { + delete[] defaultRndStream; + defaultRndSet = -1; + } + } + + return DKS_ERROR; +} + +//create a new signal for the mic +int MICBase::mic_createStream(int & streamId) { + + //use int as signal, create a new int in micStreams vector, return the id + int tmpStream = micStreams.size(); + micStreams.push_back(tmpStream); + streamId = micStreams.size() - 1; + + //empty offload to create the signal on the mic + /* +#pragma offload target(mic:m_device_id) signal(mic_getStream(streamId)) + { + } + */ + return DKS_SUCCESS; +} + +//get the signal from the vector +int& MICBase::mic_getStream(int id) { + return micStreams[id]; +} + +//delete streams +int MICBase::mic_deleteStreams() { + micStreams.clear(); + + return DKS_SUCCESS; +} + + +//sets device id +int MICBase::mic_setDeviceId(int id) { + m_device_id = id; + + return DKS_SUCCESS; +} + +//get information abaut all available mic devices +//TODO: find a way to check system for avaialbel mic devices + +int MICBase::mic_getDevices() { + + int devices = _Offload_number_of_devices(); + int thread_count = 0; + + std::cout << "==============================" << std::endl; + std::cout << "==========Intel MICs==========" << std::endl; + std::cout << "==============================" << std::endl; + + std::cout << "Total mic devices: " << devices << std::endl; + //std::cout << "Total mic devices: currently cant be found, but it's 1 on kraftwerk" << std::endl; + 
+#pragma offload target(mic:m_device_id) inout(thread_count) + { + thread_count = omp_get_max_threads(); + } + + std::cout << "Max threads: " << thread_count << std::endl; + + + return DKS_SUCCESS; +} diff --git a/src/MIC/MICBase.h b/src/MIC/MICBase.h new file mode 100644 index 0000000..92b4fe9 --- /dev/null +++ b/src/MIC/MICBase.h @@ -0,0 +1,244 @@ +/* + + Name: MIC Base + Author: Uldis Locans + Info: class to handle set up and data transfer from host to Intel MIC devices + Date: 29.09.2014 + +*/ +#ifndef H_MIC_BASE +#define H_MIC_BASE + +#include +#include +#include +#include +#include +#include +#include + +#include "../DKSDefinitions.h" + +#define DKS_ALLOC alloc_if(1) +#define DKS_FREE free_if(1) +#define DKS_RETAIN free_if(0) +#define DKS_REUSE alloc_if(0) + +#define MIC_WIDTH 128 + +class MICBase { + +private: + std::vector micStreams; + +protected: + + + int defaultRndSet; + +public: + VSLStreamStatePtr *defaultRndStream; + int m_device_id; + + /* constructor */ + MICBase(); + + /* destructor */ + ~MICBase(); + + /* + Info: create MKL rand streams for each thread + Return: success or error code + */ + int mic_createRandStreams(int size); + + /* + Info: delete MKL rand streams + Return: succes or error code + */ + int mic_deleteRandStreams(); + + /* + Info: create a new signal for the mic + Return: success or error code + */ + int mic_createStream(int & streamId); + + /* + Info: get the signal from the vector + Return: mic signal + */ + int& mic_getStream(int id); + + /* + Info: delete streams + Return: success or error code + */ + int mic_deleteStreams(); + + /* + Info: set device id + Return: success or error code + */ + int mic_setDeviceId(int id); + + /* + Info: get mic devices + Return: success or error code + */ + int mic_getDevices(); + + /* + Info: allocate memory on MIC device + Return: success or error code + */ + template + void * mic_allocateMemory(int size) { + + int padding = size % MIC_WIDTH; + int totalsize = size + padding; + + T *tmp = 
(T*)_mm_malloc(sizeof(T)*totalsize, 64); // = new T[size]; +#pragma offload_transfer target(mic:m_device_id) nocopy(tmp:length(totalsize) DKS_ALLOC DKS_RETAIN) + + return tmp; + } + + /* + Info: transfer data to device + Return: success or error code + */ + template + int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) ) + + return DKS_SUCCESS; + } + + /* + Info: write data to device, non-blocking + Return: success or error code + */ + template + int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) + { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) ) + + return DKS_SUCCESS; + } + + + /* + Info: read data from device + Return: success or error code + */ + template + int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_result = (T*)result; + + //std::cout << "try to read data with size = " << size << " adn offset = " << offset << std::endl; +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) ) + + return DKS_SUCCESS; + } + + /* + Info: read data from device waiting for signal + Return: success or error code + */ + template + int mic_readDataAsync(const void * data_ptr, void * result, int size, + int streamId = -1, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_result = (T*)result; + +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) ) + { + } + + return DKS_SUCCESS; + + } + + /* + Info: wait till all the signals are complete + Return siccess or error code + */ + int 
mic_syncDevice() { + + //empty offload to wait for all the signals to finish and launch a new empy signal + /* + for (int i = 0; i < micStreams.size(); i++) { +#pragma offload target(mic:m_device_id) wait(mic_getStream(i)) signal(mic_getStream(i)) + { + } + } + */ + + //std::cout << "done read data" << std::endl; + + return DKS_SUCCESS; + + } + + /* + Info: free memory on device + Return: success or error code + */ + template + int mic_freeMemory(void * data_ptr, int size) { + + int padding = size % MIC_WIDTH; + int totalsize = size + padding; + + T* tmp_ptr = (T*)data_ptr; +#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE) + { + } + + return DKS_SUCCESS; + } + + /* + Info: allocate memory and write data to device + Return: success or error code + */ + template + void * mic_pushData(const void * data, int size) { + T* tmp_ptr = new T[size]; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_ALLOC DKS_RETAIN + into(tmp_ptr[0:size]) ) + { + } + + return tmp_ptr; +} + +/* + Info: read data and free memory on device + Return: success or erro code +*/ + template + int mic_pullData(void * data_ptr, void * result, int size) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)result; + +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[0:size] : DKS_REUSE DKS_FREE into(tmp_data[0:size]) ) + { + } + + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/MIC/MICChiSquare.cpp b/src/MIC/MICChiSquare.cpp new file mode 100644 index 0000000..35b6d77 --- /dev/null +++ b/src/MIC/MICChiSquare.cpp @@ -0,0 +1,93 @@ +#include "MICChiSquare.h" + +/* + calculate chi^2 on intel mic, use data already loaded on device +*/ +int MICChiSquare::mic_chi2(double *O, double *E, double *result, int size) { + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(O:length(0) DKS_RETAIN DKS_REUSE) \ + in(E:length(0) DKS_RETAIN DKS_REUSE) \ + in(result:length(0) DKS_RETAIN 
DKS_REUSE) \ + in(size) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) { + result[i] = pow(O[i] - E[i], 2) / E[i]; + } + } + + return DKS_SUCCESS; +} + + +/* + calculate function N(t), use data already loaded on device +*/ +int MICChiSquare::mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT) { + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(nt:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) \ + in(psize) in(nsize) in(jsize) in(deltaT) + { + + double gamma = 0.01; //??? + double tau = 0.01; //??? + + for (int j = 0; j < jsize; j++) { + + int pid = j*psize; + double N0 = p[pid]; + double Nbkg = p[pid+1]; + double A0 = p[pid+2]; + double phi = p[pid+3]; + double sigma = p[pid+4]; + double B = p[pid+5]; + + int idj = j*nsize; + + double a1 = -0.5*sigma*sigma; + double b1 = gamma*B; + +#pragma omp parallel for + for (int n = 0; n < nsize; n++) { + + int id = idj + n; + double t = n*deltaT; + + double a = a1*t*t; + double b = b1*t + phi; + double At = A0 * exp2(a) * cos(b); + + double c = -t/tau; + double Nt = N0 * exp2(c) * (1 + At) + Nbkg; + + nt[id] = Nt; + } + } + + } + + return DKS_SUCCESS; +} + +/* + calculate sum of array +*/ +int MICChiSquare::mic_sum(double *data, double *result, int size) { + double sum = 0; +#pragma offload target(mic:m_micbase->m_device_id) \ + in(data:length(0) DKS_REUSE DKS_RETAIN) \ + in(result:length(0) DKS_REUSE DKS_RETAIN) \ + in(size) in(sum) + { +#pragma omp parallel for reduction(+:sum) + for (int i = 0; i < size; i++) { + sum += data[i]; + } + result[0] = sum; + } + return DKS_SUCCESS; +} + diff --git a/src/MIC/MICChiSquare.h b/src/MIC/MICChiSquare.h new file mode 100644 index 0000000..c62de0b --- /dev/null +++ b/src/MIC/MICChiSquare.h @@ -0,0 +1,51 @@ +/* + + Name: MICChiSquare + Info: calculate chi^2 using intel mic coporcessor + Author: Uldis Locans + Date: 29.09.2014 + +*/ +#ifndef H_MIC_CHI_SQUARE +#define H_MIC_CHI_SQUARE + +#include +#include 
+#include +#include "MICBase.h" + +class MICChiSquare { + + MICBase *m_micbase; + +public: + + /* constructor */ + MICChiSquare(MICBase *base) { + m_micbase = base; + } + + /* destructor */ + ~MICChiSquare() { } + + /* + Info: calucate chi square + Return: success or error code + */ + int mic_chi2(double *O, double *E, double *result, int size); + + /* + Info: calculate Nt function + Return: success or error code + */ + int mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT = 1); + + /* + Info: calculate sum of array + Return: success or error code + */ + int mic_sum(double *data, double *result, int size); + +}; + +#endif diff --git a/src/MIC/MICCollimatorPhysics.cpp b/src/MIC/MICCollimatorPhysics.cpp new file mode 100644 index 0000000..6a1b937 --- /dev/null +++ b/src/MIC/MICCollimatorPhysics.cpp @@ -0,0 +1,876 @@ +#include "MICCollimatorPhysics.h" + +#define M_P 0.93827231e+00 +#define C 299792458.0 +#define PI 3.14159265358979323846 +#define AVO 6.022e23 +#define R_E 2.81794092e-15 +#define eM_E 0.51099906e-03 +#define Z_P 1 +#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 + +#define POSITION 0 +#define ZSIZE 1 +#define RHO_M 2 +#define Z_M 3 +#define A_M 4 +#define A2_C 5 +#define A3_C 6 +#define A4_C 7 +#define A5_C 8 +#define X0_M 9 +#define I_M 10 +#define DT_M 11 + +__declspec(target(mic)) +double dot(mic_double3 d1, mic_double3 d2) { + return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); +} + +__declspec(target(mic)) +double dot(double dx, double dy, double dz) { + return (dx * dx + dy * dy + dz * dz); +} + +__declspec(target(mic)) +bool checkHit(double &z, double *par) { + return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); +} + + +__declspec(target(mic)) +void Rot(double &px, double &pz, double &x, double &z, double xplane, + double normP, double thetacou, double deltas, int coord) +{ + double Psixz = 1; + double pxz = 1; + + if ( px >= 0 && pz >= 0 ) + Psixz = atan(px/pz); + else if ( px > 0 && pz < 0 ) + Psixz = 
atan(px/pz) + PI; + else if (px < 0 && pz > 0) + Psixz = atan(px/pz) + 2*PI; + else + Psixz = atan(px/pz) + PI; + + pxz = sqrt(px*px + pz*pz); + + if(coord == 1) { + x = x + deltas * px / normP + xplane*cos(Psixz); + z = z - xplane * sin(Psixz); + } + + if(coord == 2) { + x = x + deltas * px / normP + xplane * cos(Psixz); + z = z - xplane * sin(Psixz) + deltas * pz / normP; + } + + px = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou); + pz = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); +} + +__declspec(target(mic)) +void coulombScat(mic_double3 &R, mic_double3 &P, double *par, VSLStreamStatePtr &stream) { + double Eng = sqrt(dot(P, P) + 1.0) * M_P - M_P; + double gamma = (Eng + M_P) / M_P; + double normP = sqrt(dot(P, P)); + double beta = sqrt(1.0 - 1.0 / (gamma * gamma)); + double deltas = par[DT_M] * beta * C; + + double theta0 = 13.6e6 / (beta * normP * M_P * 1e9) * + Z_P * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M])); + + // x-direction: See Physical Review, "Multiple Scattering" + double z1, z2; + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 ); + double thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 ); + thetacou = z2 * theta0; + } + + double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1); + + double P2;//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1); + if(P2 < 0.0047) { + double P3, P4; + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + + if(P4 > 
0.5) + thetaru = -thetaru; + Rot(P.x ,P.z, R.x, R.z, xplane, normP, thetaru, deltas, 0); + } + + // y-direction: See Physical Review, "Multiple Scattering" + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 ); + thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1, 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2, 0.0, 1.0 ); + thetacou = z2 * theta0; + } + + double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.y, P.z, R.y, R.z, yplane, normP, thetacou, deltas, 2); + + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P2, 0, 1); + if(P2 < 0.0047) { + double P3, P4; + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P3, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, 1, &P4, 0, 1); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + if(P4 > 0.5) + thetaru = -thetaru; + Rot(P.y, P.z, R.y, R.z, yplane, normP, thetaru, deltas, 0); + } + +} + +__declspec(target(mic)) +void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label, + double *par, VSLStreamStatePtr &stream, int ii, int size) +{ + + double normP[MIC_WIDTH] __attribute__((aligned(64))); + double deltas[MIC_WIDTH] __attribute__((aligned(64))); + double theta0[MIC_WIDTH] __attribute__((aligned(64))); + double P1[MIC_WIDTH] __attribute__((aligned(64))); + double P2[MIC_WIDTH] __attribute__((aligned(64))); + double P3[MIC_WIDTH] __attribute__((aligned(64))); + + double z1[MIC_WIDTH] __attribute__((aligned(64))); + double z2[MIC_WIDTH] __attribute__((aligned(64))); + double thetacou[MIC_WIDTH] __attribute__((aligned(64))); + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + double dotp = dot(px[i], py[i], pz[i]); + double Eng = sqrt(dotp + 
1.0) * M_P - M_P; + double gamma = (Eng + M_P) / M_P; + double beta = sqrt(1.0 - 1.0 / (gamma * gamma)); + + normP[idx] = sqrt(dotp); + deltas[idx] = par[DT_M] * beta * C; + theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) * + Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M])); + } + } + + // x-direction: See Physical Review, "Multiple Scattering" + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0); + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + size; i++) { + int idx = i - ii; + thetacou[idx] = z2[idx] * theta0[idx]; + } + + //unknown number of iterations, cannot vectorize + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) { + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 ); + thetacou[idx] = z2[idx] * theta0[idx]; + } + } + } + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + size; i++) { + int idx = i - ii; + if (label[i] == 0) { + double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) + + z2[idx] * deltas[idx] * theta0[idx] / 2.0; + Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1); + } + } + + + //generate array of random numbers + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1); + + //P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH] + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + if(P1[idx] < 0.0047) { + double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx]; + + if(P3[idx] > 0.5) + 
thetaru = -thetaru; + + Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0); + } + } + } + + // y-direction: See Physical Review, "Multiple Scattering" + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0); + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + thetacou[idx] = z2[idx] * theta0[idx]; + } + + //unknown number of iterations, cannot vectorize + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) { + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 ); + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 ); + thetacou[idx] = z2[idx] * theta0[idx]; + } + } + } + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0) + + z2[idx] * deltas[idx] * theta0[idx] / 2.0; + Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2); + } + } + + //generate array of random numbers + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1); + vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1); + + //P2 = P[i], P3 = P[i+WIDTH], P4 = P[i+2*WIDTH] + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + int idx = i - ii; + if (label[i] == 0) { + if(P1[idx] < 0.0047) { + double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx]; + if(P3[idx] > 0.5) + thetaru = -thetaru; + Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0); + } + } + } + +} + +__declspec(target(mic)) +void energyLoss(double &Eng, int &pdead, double *par, VSLStreamStatePtr &stream) { + + double dEdx = 0.0; + 
const double gamma = (Eng + M_P) / M_P; + const double gamma2 = gamma * gamma; + const double beta = sqrt(1.0 - 1.0 / gamma2); + const double beta2 = beta * beta; + + const double deltas = par[DT_M] * beta * C; + const double deltasrho = deltas * 100 * par[RHO_M]; + const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + const double Ts = (Eng * 1E6) / 1.0073; + const double epsilon_low = par[A2_C] * pow(Ts, 0.45); + const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) ); + const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high); + + dEdx = -epsilon / (1E21 * (par[A_M] / AVO) ); + + double tmprnd; + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E ); + const double delta_E = deltasrho * dEdx + tmprnd; + Eng = Eng + delta_E / 1E3; + } + + if (Eng >= 0.0006) { + const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P)); + + dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + double tmprnd; + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E ); + const double delta_E = deltasrho * dEdx + tmprnd; + + Eng = Eng + delta_E / 1E3; + } + + + if ((Eng<1E-4) || (dEdx>0)) + pdead = 1; +} + +__declspec(target(mic)) +void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) { + + const double gamma = (Eng + M_P) / M_P; + const double gamma2 = gamma * gamma; + const double beta = sqrt(1.0 - 1.0 / gamma2); + const double beta2 = beta * beta; + + const double deltas = par[DT_M] * beta * C; + const double deltasrho = deltas * 100 * par[RHO_M]; + const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + const double Ts = (Eng * 
1E6) / 1.0073; + const double epsilon_low = par[A2_C] * pow(Ts, 0.45); + const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) ); + const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high); + + dEdx = -epsilon / (1E21 * (par[A_M] / AVO) ); + + const double delta_E = deltasrho * dEdx + sigma_E * randv[ri]; + + Eng = Eng + delta_E / 1E3; + } + + if (Eng >= 0.0006) { + const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P)); + + dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH]; + + Eng = Eng + delta_E / 1E3; + } + +} + +int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) { + + //cast device memory pointers to appropriate types + MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; + double *par = (double*) par_ptr; + +#pragma offload target(mic:m_micbase->m_device_id) \ + inout(data:length(0) DKS_RETAIN DKS_REUSE) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) + { + +#pragma omp parallel + { + VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()]; + + //for loop trough particles if not checkhit set label to -2 and update R.x + +#pragma omp for simd + for (int i = 0; i < numparticles; i++) { + if ( !checkHit(data[i].Rincol.z, par) ) { + double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol)); + data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq; + data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq; + data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq; + data[i].label = -2; + } + } + + //for loop trough particles if label == 0 eneregy loss and if pdead update label to -1 +#pragma omp for simd + for (int i = 0; i < 
numparticles; i++) { + + int pdead = -1; + double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol)); + double Eng = (sq - 1) * M_P; + + if (data[i].label == 0) { + energyLoss(Eng, pdead, par, stream); + } + + if (pdead == -1) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(data[i].Pincol, data[i].Pincol)); + data[i].Pincol.x = data[i].Pincol.x * ptot / sq; + data[i].Pincol.y = data[i].Pincol.y * ptot / sq; + data[i].Pincol.z = data[i].Pincol.z * ptot / sq; + } + + if (pdead == 1) + data[i].label = -1; + } + + //for loop trough particles if label == 0 coulomb scat +#pragma omp for + for (int i = 0; i < numparticles; i++) { + if (data[i].label == 0) { + coulombScat(data[i].Rincol, data[i].Pincol, par, stream); + } + } + + } //end omp parallel + + } //end offload + return DKS_SUCCESS; + +} + + + +int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) +{ + + + + int *label = (int*)label_ptr; + unsigned *localID = (unsigned*)localID_ptr; + double *rx = (double*)rx_ptr; + double *ry = (double*)ry_ptr; + double *rz = (double*)rz_ptr; + double *px = (double*)px_ptr; + double *py = (double*)py_ptr; + double *pz = (double*)pz_ptr; + double *par = (double*)par_ptr; + + int padding = numparticles % MIC_WIDTH; + int totalpart = numparticles + padding; + +#pragma offload target (mic:0) \ + in(label:length(0) DKS_REUSE DKS_RETAIN) \ + in(localID:length(0) DKS_REUSE DKS_RETAIN) \ + in(rx:length(0) DKS_REUSE DKS_RETAIN) \ + in(ry:length(0) DKS_REUSE DKS_RETAIN) \ + in(rz:length(0) DKS_REUSE DKS_RETAIN) \ + in(px:length(0) DKS_REUSE DKS_RETAIN) \ + in(py:length(0) DKS_REUSE DKS_RETAIN) \ + in(pz:length(0) DKS_REUSE DKS_RETAIN) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(totalpart) + { + +#pragma omp parallel + { + //every thread gets its own rnd stream state + VSLStreamStatePtr stream = 
m_micbase->defaultRndStream[omp_get_thread_num()]; + + + #pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + //vectorize main loop + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + if ( !checkHit(rz[i], par) ) { + double sq = sqrt(1.0 + dot(px[i], py[i], pz[i])); + rx[i] = rx[i] + par[DT_M] * C * px[i] / sq; + ry[i] = ry[i] + par[DT_M] * C * py[i] / sq; + rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq; + label[i] = -2; + } + } + } + + + //array of size 2*WIDTH for storing random values for the energyloss function + double randv[2*MIC_WIDTH] __attribute__((aligned(64))); + + //for loop trough particles if label == 0 eneregy loss and if pdead update label to -1 + #pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + //create array of rand values (2 per thread) + vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0); + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + + double sq = sqrt(1.0 + dot(px[i], py[i], pz[i])); + double Eng = (sq - 1) * M_P; + double dEdx = 0; + + if (label[i] == 0) { + energyLoss(Eng, dEdx, par, randv, i - ii); + } + + if (Eng > 1e-4 && dEdx < 0) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(px[i], py[i], pz[i])); + px[i] = px[i] * ptot / sq; + py[i] = py[i] * ptot / sq; + pz[i] = pz[i] * ptot / sq; + } + + if (Eng < 1e-4 || dEdx > 0) + label[i] = -1; + + } //end inner energy loss loop + + } //end outer energy loss loop + + //vectorize coulomb scattering as much as possible +#pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH); + } //end coulomb scattering + + } //end omp parallel + + } //end offload + + return DKS_SUCCESS; +} + +int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles, + int &numaddback) +{ + + //cast device memory 
pointers to appropriate types + MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; + int privateback; + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(data:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) \ + out(privateback) + { + //count dead and addback particles + int privateback = 0; +#pragma omp parallel for reduction(+:privateback) + for (int i = 0; i < numparticles; i++) { + if (data[i].label < 0) + privateback++; + } + //move particles with label < 0 to the end of the array (serial. can we do this parallel?) + if (privateback > 0) { + + int moved = 0; + for (int i = numparticles - 1; i > 0; i--) { + if (data[i].label < 0) { + int idx = numparticles - 1 - moved; + if (i != idx) { + MIC_PART_SMALL tmp = data[i]; + data[i] = data[idx]; + data[idx] = tmp; + } + moved++; + } + } + } + numaddback = privateback; + } + return DKS_SUCCESS; +} + +__declspec(target(mic)) +void micmove(double &a, double &b) { + double tmp = a; + a = b; + b = tmp; +} + +__declspec(target(mic)) +void micmove(int &a, int &b) { + int tmp = a; + a = b; + b = tmp; +} + +__declspec(target(mic)) +void micmove(unsigned &a, unsigned &b) { + unsigned tmp = a; + a = b; + b = tmp; +} + + +int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, + int &numaddback) +{ + + int *label = (int*)label_ptr; + unsigned *localID = (unsigned*)localID_ptr; + double *rx = (double*)rx_ptr; + double *ry = (double*)ry_ptr; + double *rz = (double*)rz_ptr; + double *px = (double*)px_ptr; + double *py = (double*)py_ptr; + double *pz = (double*)pz_ptr; + double *par = (double*)par_ptr; + + //int padding = numparticles % WIDTH; + //int totalpart = numparticles + padding; + + int privateback; + +#pragma offload target (mic:0) \ + in(label:length(0) DKS_REUSE DKS_RETAIN) \ + in(localID:length(0) DKS_REUSE DKS_RETAIN) \ + in(rx:length(0) DKS_REUSE 
DKS_RETAIN) \ + in(ry:length(0) DKS_REUSE DKS_RETAIN) \ + in(rz:length(0) DKS_REUSE DKS_RETAIN) \ + in(px:length(0) DKS_REUSE DKS_RETAIN) \ + in(py:length(0) DKS_REUSE DKS_RETAIN) \ + in(pz:length(0) DKS_REUSE DKS_RETAIN) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) \ + out(privateback) + { + + //count dead and addback particles + int privateback = 0; +#pragma omp parallel for reduction(+:privateback) + for (int i = 0; i < numparticles; i++) { + if (label[i] < 0) + privateback++; + } + + //move particles with label < 0 to the end of the array (serial. can we do this parallel?) + if (privateback > 0) { + int moved = 0; + for (int i = numparticles - 1; i >= 0; i--) { + if (label[i] < 0) { + int idx = numparticles - 1 - moved; + if (i != idx) { + micmove(rx[i], rx[idx]); + micmove(ry[i], ry[idx]); + micmove(rz[i], rz[idx]); + micmove(px[i], px[idx]); + micmove(py[i], py[idx]); + micmove(pz[i], pz[idx]); + micmove(label[i], label[idx]); + micmove(localID[i], localID[idx]); + } + moved++; + } + } + } + numaddback = privateback; + } + + return DKS_SUCCESS; +} + +__declspec(target(mic)) +inline void unitlessOff(mic_double3 &a, const double c) { + a.x *= c; + a.y *= c; + a.z *= c; +} + +__declspec(target(mic)) +inline void unitlessOn(mic_double3 &a, const double c) { + a.x /= c; + a.y /= c; + a.z /= c; +} + +__declspec(target(mic)) +mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) { + const double sina = sin(ori.x); + const double cosa = cos(ori.x); + const double sinb = sin(ori.y); + const double cosb = cos(ori.y); + const double sinc = sin(ori.z); + const double cosc = cos(ori.z); + + mic_double3 temp; + temp.x = 0.0; + temp.y = 0.0; + temp.z = 0.0; + + temp.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z; + temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x + + (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z; + temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x + + (sinb * cosc + sina 
* cosb * sinc) * vec.y + cosa * cosb * vec.z; + + return temp; +} + +/* Per component: R = (R / dtc + 0.5 * P / dotp) * dtc, i.e. R advances by 0.5 * dtc * P / dotp. dtc must be non-zero because R is divided by it first. */ +__declspec(target(mic)) +inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) { + R.x /= dtc; + R.x += 0.5 * P.x / dotp; + R.x *= dtc; + + R.y /= dtc; + R.y += 0.5 * P.y / dotp; + R.y *= dtc; + + R.z /= dtc; + R.z += 0.5 * P.z / dotp; + R.z *= dtc; +} + +/* Apply updateR to r[0..npart) with one shared step dtc; dotp is sqrt(1 + dot(P, P)) for each particle. p is only read. */ +__declspec(target(mic)) +inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) { +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 R = r[i]; + mic_double3 P = p[i]; + double dotp = sqrt(1.0 + dot(P, P)); + updateR(R, P, dotp, dtc); + r[i] = R; + } +} + +/* Same as the overload above, but the step is per particle: dtc = gdt[i] * c. */ +__declspec(target(mic)) +inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) { +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 R = r[i]; + mic_double3 P = p[i]; + double dtc = gdt[i] * c; + double dotp = sqrt(1.0 + dot(P, P)); + updateR(R, P, dotp, dtc); + r[i] = R; + } +} + + +/* Offload the push to device m_micbase->m_device_id. r_ptr, p_ptr and dt_ptr are cast to mic_double3*, mic_double3* and double*. With usedt == false every particle uses dt * c; otherwise particle i uses gdt[i] * c. streamId is not used. Always returns DKS_SUCCESS. */ +int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt, int streamId) +{ + + mic_double3 *r = (mic_double3*)r_ptr; + mic_double3 *p = (mic_double3*)p_ptr; + double *gdt = (double*)dt_ptr; + double dtc = dt * c; + + if (!usedt) { +#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc) + { + push(r, p, dtc, npart); + } + + } else { + +#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c) + { + push(r, p, gdt, c, npart); + } + } + + return DKS_SUCCESS; +} + +/* Like push with a shared step dtc, but p[i] is first passed through deviceTransformTo with the orientation gOrient[gLastSect[i]]; an index outside [0, nsec) selects zero angles. */ +__declspec(target(mic)) +inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect, + double dtc, int npart, int nsec) +{ + +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 ori; + if
(gLastSect[i] > -1 && gLastSect[i] < nsec) { + ori = gOrient[gLastSect[i]]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + mic_double3 tmp = deviceTransformTo(p[i], ori); + mic_double3 X = x[i]; + double dotp = sqrt(1.0 + dot(tmp, tmp)); + updateR(X, tmp, dotp, dtc); + x[i] = X; + } + +} + +/* Per-particle-step variant of pushTransform: dtc = gdt[i] * c. Orientation lookup is the same: gOrient[gLastSect[i]] when the index is in [0, nsec), zero angles otherwise. */ +__declspec(target(mic)) +inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect, + double *gdt, double c, int npart, int nsec) +{ + +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 ori; + if (gLastSect[i] > -1 && gLastSect[i] < nsec) { + ori = gOrient[gLastSect[i]]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + mic_double3 tmp = deviceTransformTo(p[i], ori); + mic_double3 X = x[i]; + double dotp = sqrt(1.0 + dot(tmp, tmp)); + double dtc = gdt[i] * c; + + updateR(X, tmp, dotp, dtc); + x[i] = X; + } + +} + +/* Offload pushTransform to device m_micbase->m_device_id. The void pointers are cast to mic_double3* (x, p, orient), double* (dt) and long* (lastSec). usedt selects between the shared step dt * c and the per-particle step gdt[i] * c. streamId is not used. Always returns DKS_SUCCESS. */ +int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, + void *orient_ptr, int npart, + int nsec, void *dt_ptr, double dt, + double c, bool usedt, int streamId) +{ + + mic_double3 *x = (mic_double3*)x_ptr; + mic_double3 *p = (mic_double3*)p_ptr; + mic_double3 *gOrient = (mic_double3*)orient_ptr; + double *gdt = (double*)dt_ptr; + long *gLastSect = (long*)lastSec_ptr; + double dtc = dt * c; + + if (!usedt) { + +#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE) \ + in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc) + { + pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec); + } + + } else { + +#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \ + in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \ + in(npart, nsec, c) + { +
pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec); + } + } + + return DKS_SUCCESS; + +} + + diff --git a/src/MIC/MICCollimatorPhysics.h b/src/MIC/MICCollimatorPhysics.h new file mode 100644 index 0000000..0795779 --- /dev/null +++ b/src/MIC/MICCollimatorPhysics.h @@ -0,0 +1,68 @@ +#ifndef H_MIC_COLLIMATORPHYSICS +#define H_MIC_COLLIMATORPHYSICS + +#include +#include +#include +#include +#include + +#include "../Algorithms/CollimatorPhysics.h" +#include "MICBase.h" + +/* Three doubles (x, y, z), declared for the MIC target so offloaded code can use it. */ +__declspec(target(mic)) +typedef struct { + double x; + double y; + double z; +} mic_double3; + +/* One particle record: label, localID and two mic_double3 members (Rincol, Pincol). */ +__declspec(target(mic)) +typedef struct { + int label; + unsigned localID; + mic_double3 Rincol; + mic_double3 Pincol; +} MIC_PART_SMALL; + + +/* MIC implementation of the collimator-physics and tracker-push entry points; keeps a non-owning MICBase pointer. NOTE(review): no access specifier is given, so DKSAlogorithms is inherited privately - confirm that is intended. */ +class MICCollimatorPhysics : DKSAlogorithms{ + +private: + + MICBase *m_micbase; + +public: + + /* Stores base; the pointer is not deleted by the destructor. */ + MICCollimatorPhysics(MICBase *base) { + m_micbase = base; + }; + + ~MICCollimatorPhysics() { }; + + int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles); + + int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles); + + int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback); + + /* usedt == false: one step dt * c for all particles; usedt == true: per-particle steps read from dt_ptr. */ + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1); + + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + +}; + + +#endif diff --git a/src/MIC/MICFFT.cpp b/src/MIC/MICFFT.cpp new file mode 100644 index 0000000..ab82c83 --- /dev/null +++ b/src/MIC/MICFFT.cpp @@ -0,0 +1,210 @@ +#include
"MICFFT.h" +#include +#include +#include +#include + +/* Stores the MICBase pointer; no DFTI descriptor is created or initialised here. */ +MICFFT::MICFFT(MICBase *base) { + m_micbase = base; +} + +/* NOTE(review): this frees FFTHandle_m and handle only. setupFFTRC/setupFFTCR create descriptors through getHandleRC()/getHandleCR() (rc_handle, cr_handle), which are never freed here, and nothing visible in this file creates handle - confirm which descriptors should be released. The device is hard-coded as mic:0 rather than taken from m_micbase. */ +MICFFT::~MICFFT() { +#pragma offload target(mic:0) + { + DftiFreeDescriptor(&FFTHandle_m); + DftiFreeDescriptor(&handle); + } +} + +/* Create and commit the double-precision COMPLEX descriptor returned by getHandle() for sizes N[0], N[1], N[2]. ndim is not used: dims is fixed to 3. Return values of the Dfti* calls are not checked; always returns DKS_SUCCESS. */ +//setup fft +int MICFFT::setupFFT(int ndim, int N[3]) { + //set up FFT engine +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + + MKL_LONG sizes[3], strides[4]; + sizes[0] = N[0]; sizes[1] = N[1]; sizes[2] = N[2]; + //strides[0] = 0; strides[1] = sizes[1]; strides[2] = 1; strides[3] = sizes[0]*sizes[1]; + strides[0] = 0; strides[1] = sizes[0]*sizes[1]; strides[2] = sizes[0]; strides[3] = 1; + + MKL_LONG dims = 3; + DftiCreateDescriptor(&(this->getHandle()), DFTI_DOUBLE, DFTI_COMPLEX, dims, sizes); + DftiSetValue(this->getHandle(), DFTI_INPUT_STRIDES, strides); + DftiSetValue(this->getHandle(), DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiCommitDescriptor(this->getHandle()); + + } + + + return DKS_SUCCESS; +} +/* Create and commit the REAL descriptor returned by getHandleRC(); sizes are taken in reverse order (N[2], N[1], N[0]) and scale is applied as DFTI_FORWARD_SCALE. ndim is not used. */ +//BENI: +//setup fft +int MICFFT::setupFFTRC(int ndim, int N[3], double scale) { + + //set up FFT engine for REAL->COMPLEX + +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + + MKL_LONG sizes[3], real_strides[4], complex_strides[4]; + sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0]; + //real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1; + real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1; + //real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1]; + //complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1; + complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1; + //complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] =
sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1; + + MKL_LONG dims = 3; + /* out-of-place real transform with complex (CCE) output; real strides describe the input, complex strides the output */ + DftiCreateDescriptor(&(this->getHandleRC()), DFTI_DOUBLE, DFTI_REAL, dims, sizes); + DftiSetValue(this->getHandleRC(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiSetValue(this->getHandleRC(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT); + DftiSetValue(this->getHandleRC(), DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DftiSetValue(this->getHandleRC(), DFTI_INPUT_STRIDES, real_strides); + DftiSetValue(this->getHandleRC(), DFTI_OUTPUT_STRIDES, complex_strides); + DftiSetValue(this->getHandleRC(), DFTI_FORWARD_SCALE, scale); + DftiCommitDescriptor(this->getHandleRC()); + + } + + return DKS_SUCCESS; +} + +/* Create and commit the descriptor returned by getHandleCR() for the COMPLEX->REAL direction: same sizes and stride arrays as setupFFTRC, but the complex strides are the input strides, the real strides the output strides, and scale is applied as DFTI_BACKWARD_SCALE. ndim is not used: dims is fixed to 3. Always returns DKS_SUCCESS. */ +//BENI: +//setup fft +int MICFFT::setupFFTCR(int ndim, int N[3], double scale) { + + //set up FFT engine for COMPLEX->REAL + +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + MKL_LONG sizes[3], real_strides[4], complex_strides[4]; + sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0]; + //real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1; + real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1; + //real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1]; + //complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1; + complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1; + //complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1; + + MKL_LONG dims = 3; + DftiCreateDescriptor(&(this->getHandleCR()), DFTI_DOUBLE, DFTI_REAL, dims, sizes); + DftiSetValue(this->getHandleCR(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiSetValue(this->getHandleCR(), DFTI_PACKED_FORMAT,
DFTI_CCE_FORMAT); + DftiSetValue(this->getHandleCR(), DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DftiSetValue(this->getHandleCR(), DFTI_INPUT_STRIDES, complex_strides); + DftiSetValue(this->getHandleCR(), DFTI_OUTPUT_STRIDES, real_strides); + DftiSetValue(this->getHandleCR(), DFTI_BACKWARD_SCALE, scale); + DftiCommitDescriptor(this->getHandleCR()); + + + + } + + return DKS_SUCCESS; +} + +/* In-place transform of mem_ptr (treated as _Complex double*) with the descriptor from getHandle(): forward == true calls DftiComputeForward, otherwise DftiComputeBackward. ndim, N and streamId are not used. Always returns DKS_SUCCESS. */ +//execute COMPLEX->COMPLEX FFT +int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool forward) { + + _Complex double *ptr = (_Complex double*) mem_ptr; + +#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(forward) + { + if (forward) + DftiComputeForward(this->getHandle(), ptr); + else + DftiComputeBackward(this->getHandle(), ptr); + } + + return DKS_SUCCESS; +} + +/* Backward transform: forwards to executeFFT with forward == false. The signature now carries streamId so it matches the declaration in MICFFT.h, and the call goes to executeFFT (the class declares no mic_executeFFT). */ +//execute iFFT +int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId) { + return executeFFT(mem_ptr, ndim, N, streamId, false); +} + +/* Out-of-place forward transform with the descriptor from getHandleRC(): in_ptr is read as double*, out_ptr written as _Complex double*. sizereal and sizecompl are only referenced by the commented-out pragma. */ +//execute REAL->COMPLEX FFT +int MICFFT::executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) { + + double *real_ptr = (double*) in_ptr; + //std::complex *compl_ptr = (std::complex *) out_ptr; + _Complex double *compl_ptr = (_Complex double *) out_ptr; + int sizereal = N[0]*N[1]*N[2]; + int sizecompl = (N[0]/2+1)*N[1]*N[2]; + +//std::cout << "start real-compl fft on mic " << std::endl; + + //std::cout << "real_ptr = " << real_ptr << std::endl; + //std::cout << "compl_ptr = " << compl_ptr << std::endl; + //std::cout << "EXECUTE AVERAGING OVER 10 LOOPS OF FFT" << std::endl; + +#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) + //#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE) + { + //for (int i=0;i<10;++i){ //loop 10 times for benchmarking + DftiComputeForward(this->getHandleRC(), real_ptr, compl_ptr); + //} + } + +//std::cout << "end real-compl fft on mic " << std::endl; + + +
return DKS_SUCCESS; +} + +/* Out-of-place backward transform with the descriptor from getHandleCR(): in_ptr is read as _Complex double*, out_ptr written as double*. ndim and streamId are not used. Always returns DKS_SUCCESS. */ +//execute COMPLEX->REAL FFT +int MICFFT::executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) { + + //_Complex double *ptr = (_Complex double*) mem_ptr; + + double *real_ptr = (double*) out_ptr; + _Complex double *compl_ptr = (_Complex double *) in_ptr; + + //std::cout << "real_ptr = " << real_ptr << std::endl; + //std::cout << "compl_ptr = " << compl_ptr << std::endl; + /* sizereal and sizecompl are only referenced by the commented-out pragma below */ + int sizereal = N[0]*N[1]*N[2]; + int sizecompl = (N[0]/2+1)*N[1]*N[2]; + + //std::cout << "offload to perform backward fft ... " << std::endl; +//struct timeval start, end; +//gettimeofday(&start,NULL); +#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) + //#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE) + { + //for (int i=0;i<10;++i){ //loop 10 times for benchmarking + DftiComputeBackward(this->getHandleCR(), compl_ptr, real_ptr); + //} + } + +// End timing offloaded FFT. +//gettimeofday(&end,NULL); +// Print execution time of offloaded computational loop. +//printf ("Total time for IFFT spent = %f seconds\n", +//(double) (end.tv_usec-start.tv_usec) /1000000+(double) (end.tv_sec-start.tv_sec)); + //std::cout << "IFFT DONE!"
<< std::endl; + return DKS_SUCCESS; +} + + +/* Divide the real and imaginary part of each of the N[0]*N[1]*N[2] complex values at mem_ptr by that element count. ndim and streamId are not used. Always returns DKS_SUCCESS. */ +//normalize IFFT +int MICFFT::normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId) { + + int size = N[0] * N[1] * N[2]; + + _Complex double *ptr = (_Complex double*) mem_ptr; +#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(size) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) { + __real__ ptr[i] = __real__ ptr[i] / size; + __imag__ ptr[i] = __imag__ ptr[i] / size; + } + } + + return DKS_SUCCESS; + +} + diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h new file mode 100644 index 0000000..626fc19 --- /dev/null +++ b/src/MIC/MICFFT.h @@ -0,0 +1,79 @@ +#ifndef H_MIC_FFT +#define H_MIC_FFT + +#include +#include + +#include +#include + +#include "../Algorithm/DKSFFT.h" +#include "MICBase.h" + +/* MKL DFTI based FFT for the MIC: holds four descriptor handles and one accessor per handle. */ +class MICFFT : public DKSFFT { + +private: + + MICBase *m_micbase; + + /// Internal FFT object for performing serial FFTs. +#pragma offload_attribute(push,target(mic)) + DFTI_DESCRIPTOR_HANDLE FFTHandle_m; //declspec only works for global variables + DFTI_DESCRIPTOR_HANDLE handle; + DFTI_DESCRIPTOR_HANDLE rc_handle; //handle for REAL->COMPLEX + DFTI_DESCRIPTOR_HANDLE cr_handle; //handle for COMPLEX->REAL + +#pragma offload_attribute(pop) + + /* returns FFTHandle_m (the COMPLEX->COMPLEX descriptor used by setupFFT/executeFFT) */ + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle(void) { + return FFTHandle_m; + } + + /* returns handle */ + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle1(void) { + return handle; + } + + /* returns rc_handle */ + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleRC(void) { + return rc_handle; + } + + /* returns cr_handle */ + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleCR(void) { + return cr_handle; + } + +public: + + /* constructor */ + MICFFT(MICBase *base); + + /* destructor */ + ~MICFFT(); + + /* + Info: setup mkl fft + Return: success or error code + */ + int setupFFT(int ndim, int N[3]); + //BENI: + int setupFFTRC(int ndim, int N[3], double scale = 1.0); + //BENI: + int setupFFTCR(int ndim, int N[3], double scale = 1.0); + + /* execute FFT on MIC */ +
int executeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); + + /* execute IFFT on MIC (declared with a streamId parameter, default -1) */ + int executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); + + /* execute REAL->COMPLEX FFT on MIC: in_ptr is the real input, out_ptr the complex output */ + int executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1); + + /* execute COMPLEX->REAL FFT on MIC: in_ptr is the complex input, out_ptr the real output */ + int executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1); + + /* normalize IFFT on MIC */ + int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); + +}; + +#endif diff --git a/src/MIC/MICGreensFunction.cpp b/src/MIC/MICGreensFunction.cpp new file mode 100644 index 0000000..6725a1e --- /dev/null +++ b/src/MIC/MICGreensFunction.cpp @@ -0,0 +1,307 @@ +#include "MICGreensFunction.hpp" +#include +#include +#include + +/* constructor: stores base in m_micbase */ +MICGreensFunction::MICGreensFunction(MICBase *base) { + m_micbase = base; +} + +/* destructor: empty, releases nothing */ +MICGreensFunction::~MICGreensFunction() { +} + + +/* compute greens integral analytically */ +/* NOTE(review): the block that follows is a disabled variant kept inside a comment; only the definition after it is compiled. */ +// Version with extended domain +/* + int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,double hr_m1, double hr_m2) { + double *tmp_ptr = (double*) tmp_ptr_; + #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) + { + std::memset(tmp_ptr,0,(I+1)*(J+1)*(K+1)); + double cellVolume = hr_m0 * hr_m1 * hr_m2; + #pragma omp parallel for collapse(3) schedule(dynamic) + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + for (int i = 0; i < I; i++) { + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = 0; + tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2
* log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgrn = tmpgrn / cellVolume; + + tmp_ptr[k*(J+1)*(I+1) + j*(I+1) + i] = tmpgrn; + } + } + } + } + return 0; + } +*/ + +/* Fill tmp_ptr_ (treated as double[I*J*K], index k*J*I + j*I + i) with the value computed at the cell-shifted point (i*hr_m0 - hr_m0/2, j*hr_m1 - hr_m1/2, k*hr_m2 - hr_m2/2), divided by the cell volume hr_m0*hr_m1*hr_m2. Runs offloaded on mic:0. Always returns 0. */ +int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0, + double hr_m1, double hr_m2) +{ + + double *tmp_ptr = (double*) tmp_ptr_; +#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) + { + /* memset takes a byte count: clear I*J*K doubles, not I*J*K bytes (sizeof first keeps the product in size_t) */ + std::memset(tmp_ptr, 0, sizeof(double) * I * J * K); + double cellVolume = hr_m0 * hr_m1 * hr_m2; +#pragma omp parallel for collapse(3) schedule(dynamic) + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + for (int i = 0; i < I; i++) { + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = 0; + tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgrn = tmpgrn / cellVolume; + + tmp_ptr[k*(J)*(I) + j*(I) + i] = tmpgrn; + } + } + } + } + return 0; +} + + + +/* perform the actual integration */ +// version with extended domain +/* + int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) { + double *tmp_ptr = (double*) tmp_ptr_; + double *mem_ptr = (double*) mem_ptr_; + +// the actual integration +#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) +{ +int Ii = I; +int Jj = J; +int Kk = K; +int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); +std::memset(mem_ptr,0,II*JJ*KK); +I=I+1; J=J+1; K=K+1; + +#pragma omp parallel for collapse(3) +for (int i=0; i +#include +#include + +#include +#include +
+#include "MICBase.h" + +#define DKS_SUCCESS 0 +#define DKS_ERROR 1 + +/* Green's function helpers for the MIC; keeps a non-owning MICBase pointer. */ +class MICGreensFunction { + +private: + MICBase *m_micbase; + +public: + + /* constructor */ + MICGreensFunction(MICBase *base); + + /* destructor */ + ~MICGreensFunction(); + + /* compute greens integral analytically */ + int mic_GreensIntegral(void * tmp_ptr_, int I, int J, int K, double hr_m0, double hr_m1, double hr_m2); + + /* perform the actual integration */ + int mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K); + + /* Mirror rho-Field */ + int mic_MirrorRhoField(void * mem_ptr_, int I, int J, int K); + + /*multiply complex fields*/ + int mic_MultiplyCompelxFields(void * mem_ptr1_, void * mem_ptr2_, int size); + +}; + +#endif diff --git a/src/MIC/MICMergeSort.h b/src/MIC/MICMergeSort.h new file mode 100644 index 0000000..408037b --- /dev/null +++ b/src/MIC/MICMergeSort.h @@ -0,0 +1,116 @@ +#include +#include +#include + +/* default comparison function */ +template <typename T> +inline bool greaterThan(T x, T y) { + return x > y; +} + +/* swap a and b */ +template <typename T> +void mergeswap(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +/* Sort a[ibegin, iend) using b as scratch space of the same indexing. Ranges shorter than 500 elements go to quick_sort; larger ranges are split in half, the first half as an OpenMP task, then merged after taskwait. */ +template <typename T> +void split_merge(T *a, int ibegin, int iend, T *b, bool (*comp)(T, T) ) { + + if (iend - ibegin < 500) { + quick_sort(a + ibegin, 0, iend - ibegin - 1, comp); + return; + } + + int imiddle = (iend + ibegin) / 2; + +#pragma omp task + split_merge(a, ibegin, imiddle, b, comp); + split_merge(a, imiddle, iend, b, comp); +#pragma omp taskwait + + merge(a, ibegin, imiddle, iend, b, comp); + +} + +/* Merge the sorted runs a[ibegin, imiddle) and a[imiddle, iend) into b, then copy b[ibegin, iend) back to a. An element of the first run is taken when the second run is exhausted or comp(a[i1], a[i0]) holds. */ +template <typename T> +void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T)) { + + int i0 = ibegin; + int i1 = imiddle; + + //merge two halfs of array a to tmp array b + int i = ibegin; + while (i < iend) { + if (i0 < imiddle && ( i1 >= iend || comp(a[i1], a[i0]) ) ) + b[i++] = a[i0++]; + else + b[i++] = a[i1++]; + } + + //copy b back to a + for (int i = ibegin; i < iend; i++) + a[i] = b[i]; + +} + +/* Partition a[start..end] (end inclusive) around the pivot a[start]: elements for which comp(pivot, element) holds are moved in front of the pivot. Returns the pivot's final index. */ +template <typename T> +int partition(T *a, int
start, int end, bool (*comp)(T, T) ) { + int p = start; + T x = a[start]; + + for (int i = start + 1; i <= end; i++) { + if ( comp(x, a[i]) ) { + p++; + mergeswap(a[i], a[p]); + } + } + mergeswap(a[p], a[start]); + return p; +} + +/* Sort list[0, n) in place. One thread of an OpenMP parallel region starts split_merge, which spawns tasks for the other threads. The scratch buffer is released once split_merge has returned (it finishes with taskwait + merge, so no task still uses the buffer). */ +template <typename T> +void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan) { + +#pragma omp parallel + { +#pragma omp single + { + T *b = new T[n]; + split_merge(list, 0, n, b, comp); + delete[] b; + } + } +} + +/* Sort list[start..end] (end inclusive). Ranges of fewer than 10 elements are handed to insertion_sort, which takes an exclusive end, hence end + 1. */ +template <typename T> +void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) { + + if (start < end) { + //for small elements move to insertion sort + if ( (end - start) < 9 ) { + insertion_sort(list, start, end + 1, comp); + } else { + int part = partition(list, start, end, comp); + quick_sort(list, start, part - 1, comp); + quick_sort(list, part + 1, end, comp); + } + } + +} + +/* Sort list[start, end) (end exclusive) by insertion. The inner scan stops at start, so elements in front of the requested range are never read or moved. */ +template <typename T> +void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) { + + for (int i = start + 1; i < end; i++) { + T key = list[i]; + int j = i - 1; + while ( j >= start && comp(list[j], key) ) { + list[j + 1] = list[j]; + j--; + } + list[j + 1] = key; + } + +} diff --git a/src/OpenCL/CMakeLists.txt b/src/OpenCL/CMakeLists.txt new file mode 100644 index 0000000..19cedbe --- /dev/null +++ b/src/OpenCL/CMakeLists.txt @@ -0,0 +1,34 @@ +SET (_SRCS + OpenCLBase.cpp + OpenCLFFT.cpp + OpenCLChiSquare.cpp + OpenCLCollimatorPhysics.cpp + OpenCLChiSquareRuntime.cpp + ) + +SET (_HDRS + OpenCLBase.h + OpenCLFFT.h + OpenCLChiSquare.h + OpenCLCollimatorPhysics.h + OpenCLChiSquareRuntime.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +SET (_KERNELS + OpenCLKernels/OpenCLChiSquare.cl + OpenCLKernels/OpenCLFFT.cl + OpenCLKernels/OpenCLFFTStockham.cl + OpenCLKernels/OpenCLTranspose.cl + OpenCLKernels/OpenCLCollimatorPhysics.cl + OpenCLKernels/OpenCLChiSquareRuntime.cl + ) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/OpenCL) +INSTALL(FILES ${_KERNELS} DESTINATION include/OpenCL/OpenCLKernels) diff --git
a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp new file mode 100644 index 0000000..b40fd64 --- /dev/null +++ b/src/OpenCL/OpenCLBase.cpp @@ -0,0 +1,1132 @@ +#include "OpenCLBase.h" + +cl_context OpenCLBase::m_context = NULL; +cl_command_queue OpenCLBase::m_command_queue = NULL; +cl_platform_id OpenCLBase::m_platform_id = NULL; +cl_device_id OpenCLBase::m_device_id = NULL; +cl_event OpenCLBase::m_last_event = NULL; + +/* Resets the per-object program, kernel and kernel-file members and the static m_last_event; the static context, queue, platform and device are left as they are. */ +OpenCLBase::OpenCLBase() { + //m_context = NULL; + //m_command_queue = NULL; + m_program = NULL; + m_kernel = NULL; + //m_device_id = NULL; + //m_platform_id = NULL; + m_kernel_file = NULL; + + m_last_event = NULL; + + //m_events = new cl_event[500]; + //m_num_events = 0; + + defaultRndSet = 0; + +} + +/* NOTE(review): ocl_cleanUp() runs before ocl_deleteRndStates(); confirm the random-state buffer can still be released after the clean-up. */ +OpenCLBase::~OpenCLBase() { + ocl_cleanUp(); + m_last_event = NULL; + + if (defaultRndSet == 1) + ocl_deleteRndStates(); +} + +/* Load the collimator-physics kernel file, allocate size RNDState entries in defaultRndState and run the "initRand" kernel over them with seed 0 and a work-group size of 1. Sets defaultRndSet to 1. Results of the individual ocl_* calls are not checked; always returns OCL_SUCCESS. */ +/* create random states */ +int OpenCLBase::ocl_createRndStates(int size) { + //load kernel + const char *kernel_name = "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl"; + /* size the path buffer from the actual string lengths instead of a fixed 500 bytes, so a long OPENCL_KERNELS prefix cannot overflow it */ + char * kernel_file = new char[strlen(OPENCL_KERNELS) + strlen(kernel_name) + 1]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, kernel_name); + ocl_loadKernel(kernel_file); + delete[] kernel_file; + + //allocate memory for rand states + int ierr; + defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr); + + //exec kernel + int seed = 0; + ocl_createKernel("initRand"); + ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState); + ocl_setKernelArg(1, sizeof(unsigned int), &seed); + ocl_setKernelArg(2, sizeof(int), &size); + + size_t work_items = size; + size_t work_group_size = 1; + + ocl_executeKernel(1, &work_items, &work_group_size); + + defaultRndSet = 1; + + return OCL_SUCCESS; + +} + +/* Free defaultRndState and clear defaultRndSet. Always returns OCL_SUCCESS. */ +/* destroy rnd states */ +int OpenCLBase::ocl_deleteRndStates() { + + ocl_freeMemory(defaultRndState); + defaultRndSet = 0; + + return OCL_SUCCESS; + +} + + +/* + get platform id and device id of device specified by device_name (device name can be -mic, -cpu, -gpu, -all) + finds the first device of the
specified type and saves device id and platform id +*/ +int OpenCLBase::ocl_getDevice(const char* device_name) { + + int ierr = 0; + + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices; + + //get device type from name, return with error on failure + ierr = ocl_getDeviceType(device_name, m_device_type); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Can't find device, OpenCL error: " << ierr << ", " << device_name); + return ierr; + } + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return ierr; + } + + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return ierr; + } + + //search each platform for specified device + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get number of devices and first avaialble device id + ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 1, &m_device_id, &num_of_devices); + + if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) { + DEBUG_MSG("Can't find device id's, OpenCL error: " << ierr); + return ierr; + } + + //if device exists in current platform + if (num_of_devices > 0) { + //save platform id + m_platform_id = tmp_platform_ids[i]; + + //get the name of device that will be used and print its name + size_t size; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size); + + char* info = new char[size]; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, info, NULL); + + DEBUG_MSG("Accelerator device: " << info); + delete[] info; + + //get the name of the platform + clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, 0, NULL, &size); + info = new char[size]; + clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, size, info, NULL); + + DEBUG_MSG("Accelerator platform: " << info); + + 
return OCL_SUCCESS; + } + } + + return OCL_ERROR; +} + +int OpenCLBase::ocl_getDeviceCount(int &ndev) { + int ierr = DKS_SUCCESS; + + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices, total_devices; + + //find platform count + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //find all platform IDs + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return ierr; + } + + //for each platform find number of devices + total_devices = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device count for platform + ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + total_devices += num_of_devices; + num_of_devices = 0; + } + + ndev = total_devices; + return DKS_SUCCESS; + +} + +int OpenCLBase::ocl_getDeviceName(std::string &device_name) { + + int ierr = DKS_SUCCESS; + size_t size; + + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size); + char* name = new char[size]; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, name, NULL); + + device_name = name; + delete[] name; + return ierr; +} + +int OpenCLBase::ocl_setDevice(int device) { + + int ierr; + + cl_device_id *tmp_device_ids; + cl_platform_id *tmp_platform_ids; + cl_int *tmp_device_counts; + cl_uint num_of_platforms, num_of_devices; + cl_uint total_devices = 0; + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + tmp_platform_ids = 
new cl_platform_id[num_of_platforms]; + tmp_device_counts = new cl_int[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //search each platform for specified device + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get the number of devices in the platform + num_of_devices = 0; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + tmp_device_counts[i] = num_of_devices; + total_devices += num_of_devices; + } + + //check in which platform the selected device is located + int tmp_count = 0; + int checked_count = 0; + int id = -1; + int platform = -1; + for (unsigned int i = 0; i < num_of_platforms; i++) { + tmp_count += tmp_device_counts[i]; + if (device < tmp_count) { + id = device - checked_count; + platform = i; + break; + } + checked_count += tmp_device_counts[i]; + } + + ierr = DKS_ERROR; + if (id > 0) { + num_of_devices = tmp_device_counts[platform]; + tmp_device_ids = new cl_device_id[num_of_devices]; + clGetDeviceIDs(tmp_platform_ids[platform], m_device_type, num_of_devices, tmp_device_ids, NULL); + + m_device_id = tmp_device_ids[id]; + m_platform_id = tmp_platform_ids[platform]; + ierr = ocl_createContext(); + + delete[] tmp_device_ids; + } + + delete[] tmp_platform_ids; + delete[] tmp_device_counts; + + return ierr; +} + +int OpenCLBase::ocl_getUniqueDevices(std::vector &devices) { + + int ierr; + + size_t size; + cl_device_id *tmp_device_ids; + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices; + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr 
!= CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return DKS_ERROR; + } + + std::vector< std::string > names; + int checked_count = 0; + int id = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get the number of devices in the platform + num_of_devices = 0; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + tmp_device_ids = new cl_device_id[num_of_devices]; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, num_of_devices, tmp_device_ids, NULL); + + for (unsigned int j = 0; j < num_of_devices; j++) { + id = checked_count + j; + clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, 0, NULL, &size); + char* name = new char[size]; + clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, size, name, NULL); + std::string target = name; + if (id == 0) { + devices.push_back(id); + names.push_back(target); + } else { + bool isPresent = (std::find(names.begin(), names.end(), target) != names.end()); + if (!isPresent) { + devices.push_back(id); + names.push_back(target); + } + } + delete[] name; + } + + checked_count += num_of_devices; + delete[] tmp_device_ids; + } + + delete[] tmp_platform_ids; + + return DKS_SUCCESS; +} + +/* + checks wether device name is specified and sets device type to search for + if invalid device name is specified set device type to default +*/ +int OpenCLBase::ocl_getDeviceType(const char* device_name, cl_device_type &device_type) { + + device_type = CL_DEVICE_TYPE_DEFAULT; + + if (strcmp(device_name, "-mic") == 0) + device_type = CL_DEVICE_TYPE_ACCELERATOR; + + if (strcmp(device_name, "-cpu") == 0) + device_type = CL_DEVICE_TYPE_CPU; + + if (strcmp(device_name, "-gpu") == 0) + device_type = CL_DEVICE_TYPE_GPU; + + if (strcmp(device_name, "-all") == 0) + device_type = CL_DEVICE_TYPE_ALL; + + return OCL_SUCCESS; +} + +/* + creates a context and command queue between host and device +*/ +int OpenCLBase::ocl_createContext() { + int ierr; + + //context properties list + 
m_context_properties[0] = CL_CONTEXT_PLATFORM; + m_context_properties[1] = (cl_context_properties) m_platform_id; + m_context_properties[2] = 0; + + //create a context with specified device + m_context = clCreateContext(m_context_properties, 1, &m_device_id, NULL, NULL, &ierr); + + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't create context, OpenCL error: " << ierr); + return ierr; + } + + //create command queue using context and device + //m_command_queue = clCreateCommandQueue(m_context, m_device_id, CL_QUEUE_PROFILING_ENABLE, &ierr); + m_command_queue = clCreateCommandQueue(m_context, m_device_id, 0, &ierr); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't create command queue, OpenCL error: " << ierr); + return ierr; + } + + return OCL_SUCCESS; +} + +/* + read file specified by kernel_file and compile the kernel code contained in kernel_file + save reference to the built program to m_program, from witch individual kernels can be extracted +*/ +int OpenCLBase::ocl_buildProgram(const char *kernel_file) { + + cl_int ierr; + long fsize; + char *kernel_source; + + //open file + FILE *fp = fopen(kernel_file, "rb"); + if (!fp) { + DEBUG_MSG("Can't open kernel file: " << kernel_file); + return OCL_ERROR; + } + + //get file size and allocate memory + fseek(fp, 0, SEEK_END); + fsize = ftell(fp); + kernel_source = new char[fsize+1]; + + //read file and content in kernel source + rewind(fp); + fread(kernel_source, 1, sizeof(char)*fsize, fp); + kernel_source[fsize] = '\0'; + fclose(fp); + + ierr = ocl_compileProgram(kernel_source); + + //save currently loaded kernel file + m_kernel_file = new char[strlen(kernel_file) + 1]; + strcpy(m_kernel_file, kernel_file); + + return ierr; + +} + +//given kernel source compile the OpenCL programm +int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts) { + + int ierr; + + //create program from kernel + m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr); + if (ierr != 
CL_SUCCESS) { + DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //compile the program, if compilation + ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL); + + /* + check if compileng kernel source succeded, if failed return error code + if in debug mode get compilation info and print program build log witch + will give indication what made the compilation fail + */ +#ifdef DEBUG + if (ierr != CL_SUCCESS) { + + //get build status + cl_build_status status; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); + + //get log size + size_t log_size; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + //get log message + char *log = new char[log_size]; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, log_size+1, log, NULL); + + //print log messsage + DEBUG_MSG("Build failed! Status:" << status); + DEBUG_MSG("LOG: " << log); + + delete[] log; + + return DKS_ERROR; + } +#else + if (ierr != CL_SUCCESS) + return DKS_ERROR; +#endif + + return DKS_SUCCESS; + +} + + + +//=========================================// +//===============public functions==========// +//=========================================// + +/* + get all device from all platforms +*/ +int OpenCLBase::ocl_getAllDevices() { + + int ierr = DKS_SUCCESS; + + cl_platform_id *tmp_platform_ids, *platform_ids; + cl_uint num_of_platforms, num_of_devices, total_devices; + cl_device_id *tmp_device_ids, *device_ids; + + //find platform count + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr); + return OCL_ERROR; + } + + //find all platform IDs + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL 
error: " << ierr); + return ierr; + } + + //for each platform find number of devices + total_devices = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device count for platform + ierr = clGetDeviceIDs(tmp_platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_of_devices); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + total_devices += num_of_devices; + } + + //get all device ids + int idx = 0; + platform_ids = new cl_platform_id[total_devices]; + device_ids = new cl_device_id[total_devices]; + tmp_device_ids = new cl_device_id[total_devices]; + + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device ids + ierr = clGetDeviceIDs(tmp_platform_ids[i], CL_DEVICE_TYPE_ALL, total_devices, tmp_device_ids, &num_of_devices); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + + for (unsigned j = 0; j < num_of_devices; j++) { + platform_ids[idx] = tmp_platform_ids[i]; + device_ids[idx] = tmp_device_ids[j]; + idx++; + } + } + + std::cout << std::endl; + std::cout << "==============================" << std::endl; + std::cout << "============OpenCL============" << std::endl; + std::cout << "==============================" << std::endl; + + for (unsigned int i = 0; i < total_devices; i++) { + + //get the name of device that will be used and print its name + size_t size; + + DEBUG_MSG("Device " << i+1 << ":"); + + clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 0, NULL, &size); + char *device_name = new char[size]; + clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, size, device_name, NULL); + DEBUG_MSG("Name: \"" << device_name << "\""); + + clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, 0, NULL, &size); + char *device_vendor = new char[size]; + clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, size, device_vendor, NULL); + DEBUG_MSG("Vendor: \"" << device_vendor << "\""); + + cl_device_type device_type; + 
clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + + if (device_type == CL_DEVICE_TYPE_GPU) { + DEBUG_MSG("Device type: GPU"); + } else if (device_type == CL_DEVICE_TYPE_CPU) { + DEBUG_MSG("Device type: CPU"); + } else if (device_type == CL_DEVICE_TYPE_ACCELERATOR) { + DEBUG_MSG("Device type: Accelerator"); + } + + std::cout << "==============================" << std::endl; + + } + + return OCL_SUCCESS; +} + + +/* + find available device, create context and command queue, load kernel file and kompile kernel code +*/ +int OpenCLBase::ocl_setUp(const char *device_name) { + cl_int ierr; + ierr = ocl_getDevice(device_name); + if (ierr != CL_SUCCESS) + return ierr; + + ocl_deviceInfo(false); + + ierr = ocl_createContext(); + if (ierr != CL_SUCCESS) + return ierr; + + return DKS_SUCCESS; +} + +/* + load and compile kernel file if it has changed +*/ +int OpenCLBase::ocl_loadKernel(const char * kernel_file) { + int ierr = OCL_SUCCESS; + + //kernel file has changed + if (m_kernel_file == NULL) { + ierr = ocl_buildProgram(kernel_file); + } else { + if (strcmp(m_kernel_file, kernel_file) != 0) { + ierr = ocl_buildProgram(kernel_file); + } + } + + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Failed to build kernel file " << kernel_file); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + +//compile kernel form source code provided +int OpenCLBase::ocl_loadKernelFromSource(const char *kernel_source, const char *opts) { + + int ierr = ocl_compileProgram(kernel_source, opts); + + return ierr; +} + +/* + Allocate memory buffer of specified size and type, + available types (read only, write only, read/write) + return memory object +*/ +cl_mem OpenCLBase::ocl_allocateMemory(size_t size, int type, cl_int &ierr) { + cl_mem mem; + mem = clCreateBuffer(m_context, type, size, NULL, &ierr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error allocating memory, OpenCL error: " << ierr); + + return mem; +} + +/* + Allocate memory buffer of specified size, 
type is set to read/write + return memory object +*/ +cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) { + cl_mem mem; + + mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, size, NULL, &ierr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error allocating memory, OpenCL error: " << ierr); + + return mem; +} + +/* + write data specified by in_data to device memory, device memory space defined by cl_mem +*/ +int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) { + + cl_int ierr; + + + //std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl; + ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event); + + //m_events[m_num_events] = m_last_event; + m_events.push_back(m_last_event); + + + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr); + return ierr; + } + + return OCL_SUCCESS; +} + +/* + copy src buffer into dst buffer +*/ +int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) { + + int ierr; + ierr = clEnqueueCopyBuffer(m_command_queue, src_ptr, dst_ptr, 0, 0, size, 0, NULL, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error copying buffers, OpenCL error: " << ierr); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + + +/* + create kernel specified by kernel_name from compiled program +*/ +int OpenCLBase::ocl_createKernel(const char* kernel_name) { + cl_int ierr; + m_kernel = clCreateKernel(m_program, kernel_name, &ierr); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr); + return ierr; + } + return OCL_SUCCESS; +} + +/* + set kernel argument, idx is the index of arument, size specifies data size, arg_value value of the argument +*/ +int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) { + cl_int ierr; + ierr = clSetKernelArg(m_kernel, idx, size, arg_value); + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error 
setting kernel arg, OpenCL error: " << ierr); + + return ierr; +} + +/* + executes set kernel, must provide dimensions ndim (1, 2 or 3) and total number of work items + work_items should be an arry of size ndim + optional: work_group_size - can specify how work items are divided in work groups, + if left NULL OpenCL implementation handles this part. +*/ +int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) { + cl_int ierr; + + cl_event tmp_event; + if (m_last_event == NULL) { + ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, + 0, NULL, &tmp_event); + } else { + ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, + 1, &m_last_event, &tmp_event); + } + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr); + + m_last_event = tmp_event; + m_events.push_back(m_last_event); + + return ierr; +} + +/* + read data from device, mem_ptr points to data on device out_data points to memory in host + blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE) +*/ +int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) { + cl_int ierr; + + ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event); + + m_events.push_back(m_last_event); + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr); + + return ierr; +} + +/* + free device memory specified by mem_ptr +*/ +int OpenCLBase::ocl_freeMemory(cl_mem mem_ptr) { + cl_int ierr; + ierr = clReleaseMemObject(mem_ptr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error freeing memory on device, OpenCL error: " << ierr); + + return ierr; +} + +/* + delete created OpenCL resources +*/ +int OpenCLBase::ocl_cleanUp() { + + if (m_kernel != NULL) { + clReleaseKernel(m_kernel); + m_kernel 
= NULL; + } + + if (m_program != NULL) { + clReleaseProgram(m_program); + m_program = NULL; + } + + if (m_command_queue != NULL) { + clReleaseCommandQueue(m_command_queue); + m_command_queue = NULL; + } + + if (m_context != NULL) { + clReleaseContext(m_context); + m_context = NULL; + } + + return OCL_SUCCESS; +} + +int OpenCLBase::ocl_deviceInfo(bool verbose) { + + + if (m_device_id == NULL) { + std::cout << "Device not set" << std::endl; + return OCL_ERROR; + } + + + char *info; + cl_bool b_info; + cl_ulong ul_info; + cl_uint ui_info; + size_t info_size; + //size_t *wi_info; + cl_device_type device_type; + + const int count = 12; + const char *info_type[count] = {"char", "cl_device_type", "cl_bool", + "cl_bool", "cl_ulong", "cl_uint", + "cl_uint", "cl_ulong", "size_t", + "size_t[]", "cl_ulong", "char"}; + const char* info_name[count] = {"Name", "Device type","Device available", + "Compiler available", "Global mem size (gb)", "Max clock freq (MHz)", + "Max compute units", "Max buffer size (bytes)", "Max work group size", + "Max work item sizes", "Local mem size (bytes)", "Extensions"}; + const cl_device_info info_value[count] = {CL_DEVICE_NAME, CL_DEVICE_TYPE, CL_DEVICE_AVAILABLE, + CL_DEVICE_COMPILER_AVAILABLE, CL_DEVICE_GLOBAL_MEM_SIZE, CL_DEVICE_MAX_CLOCK_FREQUENCY, + CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, CL_DEVICE_MAX_WORK_GROUP_SIZE, + CL_DEVICE_MAX_WORK_ITEM_SIZES, CL_DEVICE_LOCAL_MEM_SIZE, CL_DEVICE_EXTENSIONS}; + + int print_count; + if (verbose) + print_count = count; + else + print_count = 3; + + + std::cout << "--------------------" << std::endl; + std::cout << "OpenCL device information" << std::endl; + std::cout << "--------------------" << std::endl; + + for (int k = 0; k < print_count; k++) { + if (strcmp(info_type[k], "char") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], 0, NULL, &info_size); + info = new char[info_size]; + clGetDeviceInfo(m_device_id, info_value[k], info_size, info, NULL); + std::cout << 
info_name[k] << ": " << info << std::endl; + delete[] info; + + } else if (strcmp(info_type[k], "cl_bool") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_bool), &b_info, NULL); + std::cout << info_name[k] << ": " << b_info << std::endl; + + } else if (strcmp(info_type[k], "cl_ulong") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_ulong), &ul_info, NULL); + + if (info_value[k] == CL_DEVICE_GLOBAL_MEM_SIZE) { + double gb = (double)ul_info*1e-9; + std::cout << info_name[k] << ": " << gb << std::endl; + } else if (info_value[k] == CL_DEVICE_LOCAL_MEM_SIZE) { + std::cout << info_name[k] << ": " << ul_info << std::endl; + std::cout << "512^2 bytes: " << sizeof(cl_double2)*512*5 << std::endl; + } else { + std::cout << info_name[k] << ": " << ul_info << std::endl; + } + } else if (strcmp(info_type[k], "cl_uint") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_uint), &ui_info, NULL); + std::cout << info_name[k] << ": " << ui_info << std::endl; + + } else if (strcmp(info_type[k], "size_t") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(size_t), &info_size, NULL); + std::cout << info_name[k] << ": " << info_size << std::endl; + + } else if (strcmp(info_type[k], "size_t[]") == 0 ){ + clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &ui_info, NULL); + size_t wi_info[3];// = new size_t[ui_info]; + clGetDeviceInfo(m_device_id, info_value[k], 3 * sizeof(size_t), &wi_info, NULL); + std::cout << info_name[k] << ": "; + for (unsigned int m = 0; m < ui_info; m++) + std::cout << wi_info[m] << " "; + std::cout << std::endl; + + } else if (strcmp(info_type[k], "cl_device_type") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_device_type), &device_type, NULL); + switch (device_type) { + case CL_DEVICE_TYPE_CPU: + std::cout << info_name[k] << ": CPU" << std::endl; + break; + case CL_DEVICE_TYPE_GPU: + std::cout << info_name[k] << ": GPU" << std::endl; + break; + case 
CL_DEVICE_TYPE_ACCELERATOR: + std::cout << info_name[k] << ": Accelerator" << std::endl; + break; + case CL_DEVICE_TYPE_DEFAULT: + std::cout << info_name[k] << ": Default" << std::endl; + break; + default: + std::cout << info_name[k] << ": Unknown" << std::endl; + break; + } + } + } + return OCL_SUCCESS; +} + +int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size, + bool double_precision, int &threadsPerBlock) +{ + + //build kernel + int ierr = ocl_createKernel(kernel_name); + if (ierr != DKS_SUCCESS) + return ierr; + + //get device properties + size_t max_group_size; + clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0); + cl_ulong local_mem_size; + clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0); + size_t ext_size; + clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size); + char *ext = new char[ext_size]; + clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0); + + //get kernel properties + size_t kernel_group_size; + clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &kernel_group_size, 0); + threadsPerBlock = kernel_group_size; + + cl_ulong kernel_local_mem; + clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(cl_ulong), &kernel_local_mem, 0); + + + std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl; + + + std::cout << "Work groups: device limit " << max_group_size << ", " + << "kernel limit " << kernel_group_size << ", " + << "required " << work_group_size << std::endl; + + + std::cout << "Local memory: device limit " << local_mem_size << std::endl; + + + + std::cout << "Available extensions: " << ext << std::endl; + + std::cout << "End " << kernel_name << " check..." 
<< std::endl << std::endl; + + return DKS_SUCCESS; +} + +void OpenCLBase::ocl_clearEvents() { + + m_events.clear(); + + //delete[] m_events; + //m_num_events = 0; + //m_events = new cl_event[500]; + +} + + + +void OpenCLBase::ocl_eventInfo() { + + std::cout << "Number of events launched: " << m_events.size() << std::endl; + + if (m_events.size() > 0) { + + cl_ulong twrite = 0; + cl_ulong texec = 0; + cl_ulong tread = 0; + int cw = 0; + int ce = 0; + int cr = 0; + + for (unsigned i = 0; i < m_events.size(); i++) { + + cl_ulong tqueue, tsubmit, tstart, tend; + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &tqueue, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_SUBMIT, + sizeof(cl_ulong), &tsubmit, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &tstart, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &tend, NULL); + + cl_command_type type; + clGetEventInfo(m_events[i], CL_EVENT_COMMAND_TYPE, sizeof(cl_int), &type, NULL); + + if (type == CL_COMMAND_WRITE_BUFFER) { + twrite += (tend - tstart); + cw++; + } + + if (type == CL_COMMAND_READ_BUFFER) { + tread += (tend - tstart); + cr++; + } + + if (type == CL_COMMAND_NDRANGE_KERNEL) { + texec += (tend - tstart); + ce++; + } + } + + std::cout << "OpenCL write: " << (twrite * 1e-9) << " in: " << cw << std::endl; + std::cout << "OpenCL exec: " << (texec * 1e-9) << " in: " << ce << std::endl; + std::cout << "OpenCL read: " << (tread * 1e-9) << " in: " << cr << std::endl; + + } + + /* + cl_ulong tqueue, tsubmit, tstart, tend, tref; + + int *list_bad_events = new int[m_num_events]; + int num_bad_events = 0; + + if (m_num_events > 0) { + + double *list_ended = new double[m_num_events]; + + clGetEventProfilingInfo(m_events[0], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &tref, NULL); + + std::cout << std::endl; + std::cout << setw(10) << left << "Event\t| "; + std::cout << 
setw(10) << left << "queued\t| "; + std::cout << setw(10) << left << "submited\t| "; + std::cout << setw(10) << left << "started\t| "; + std::cout << setw(10) << left << "ended \t| "; + + std::cout << setw(10) << left << "in queue" << std::endl; + std::cout << setw(10) << "-----------------------------------------------------------------------------------" << std::endl; + for (unsigned int i = 0; i < m_num_events; i++) { + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &tqueue, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &tsubmit, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &tstart, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tend, NULL); + + cl_command_type type; + clGetEventInfo(m_events[i], CL_EVENT_COMMAND_TYPE, sizeof(cl_int), &type, NULL); + + tqueue = (tqueue >= tref) ? tqueue - tref : tqueue; + tsubmit = (tsubmit > tref) ? tsubmit - tref : tsubmit; + tstart = (tstart > tref) ? tstart - tref : tstart; + tend = (tend > tref) ? 
tend - tref : tend; + + if (type == CL_COMMAND_READ_BUFFER || type == CL_COMMAND_WRITE_BUFFER) + std::cout << left << i << "*\t| "; + else + std::cout << left << i << "\t| "; + std::cout << setw(7) << left << tqueue << "\t| "; + std::cout << setw(7) << left << tsubmit << "\t| "; + std::cout << setw(7) << left << tstart << "\t| "; + std::cout << setw(7) << left << tend << "\t| "; + + int count = 0; + if (i > 0) { + for (unsigned int j = 0; j < i; j++) { + if (list_ended[j] > tqueue) + count++; + } + } + list_ended[i] = tend; + + std::cout << setw(7) << left << count << std::endl; + + //this seems to be a problem on MIC sometimes + if (tstart == 0) { + list_bad_events[num_bad_events] = i; + num_bad_events++; + } + } + std::cout << setw(10) << "-----------------------------------------------------------------------------------" << std::endl << std::endl; + + //print info about failed events + for (int i = 0; i < num_bad_events; i++) { + cl_int event_status; + int id = list_bad_events[i]; + clGetEventInfo(m_events[id], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &event_status, NULL); + std::cout << "Event " << id << " : "; + switch(event_status) { + case CL_QUEUED: + std::cout << "queued" << std::endl; + break; + case CL_SUBMITTED: + std::cout << "submited" << std::endl; + break; + case CL_RUNNING: + std::cout << "running" << std::endl; + break; + case CL_COMPLETE: + std::cout << "complete" << std::endl; + break; + default: + std::cout << "error" << std::endl; + break; + } + } + } + */ + +} + + + + + + + + + + diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h new file mode 100644 index 0000000..ae0a15c --- /dev/null +++ b/src/OpenCL/OpenCLBase.h @@ -0,0 +1,303 @@ +/* + + Name: OpenCLBase + + Author: Uldis Locans + + Info: OpenCL base class to handle all the common details associated + with kernel launch on OpenCL device + + Date: 2014.09.18 + +*/ + +#ifndef H_OPENCL_BASE +#define H_OPENCL_BASE + +#include +#include +#include +#include +#include 
+#include + + +#ifdef __APPLE__ +#include +#include +#else +#include +#include +#endif + + + +#include "../DKSDefinitions.h" + +/* struct for random number state */ +typedef struct { + + double s10; + double s11; + double s12; + double s20; + double s21; + double s22; + double z; + bool gen; + +} RNDState; + +class OpenCLBase { + +private: + + static cl_context m_context; + static cl_command_queue m_command_queue; + + static cl_platform_id m_platform_id; + static cl_device_id m_device_id; + + cl_context_properties m_context_properties[3]; + cl_program m_program; + cl_kernel m_kernel; + + static cl_event m_last_event; + cl_int m_num_events; + std::vector m_events; + + char * m_kernel_file; + + cl_device_type m_device_type; + + /* + Name: getPlatforms + Info: get all avaialble platforms and save in m_platform_ids, save number of platforms + Return: success or error code + */ + int ocl_getPlatforms(); + + + /* + Name: getDevice + Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu) + ReturnL success or error code + */ + int ocl_getDevice(const char* device_name); + + /* + Name getDeviceType + Info: get device type from device name (-gpu, -cpu, -mic) + Return: success or error code + */ + int ocl_getDeviceType(const char* device_name, cl_device_type &device_type); + + /* + Name: createContext + Info: create context with specified device + Return: success or error code + */ + int ocl_createContext(); + + /* + Name: buildProgram + Info: build program from specified kernel file + Return: success or error code + */ + int ocl_buildProgram(const char* kernel_file); + + /** Compile program from kernel source string + * + */ + int ocl_compileProgram(const char* kernel_source, const char* opts = NULL); + +protected: + + int defaultRndSet; + cl_mem defaultRndState; + + +public: + + /* + constructor + */ + OpenCLBase(); + + /* + destructor + */ + ~OpenCLBase(); + + /* + Create RND states + Return: success or error 
code + */ + int ocl_createRndStates(int size); + + /* + Destroy rnd states + Return: success or error code + */ + int ocl_deleteRndStates(); + + + /* + Name: getAllDevices + Info: get all available devices + ReturnL success or error code + */ + int ocl_getAllDevices(); + + /** Get the OpenCL device count for the set type of device + * + */ + int ocl_getDeviceCount(int &ndev); + + /** Get the name of the device used + */ + int ocl_getDeviceName(std::string &device_name); + + /** Set the device to use for OpenCL kernels. + * device id to use is passed as integer. + */ + int ocl_setDevice(int device); + + /** Get a list of all the unique devices of the same type that can run OpenCL kernels + * Used when GPUs of different types might be pressent on the system. + */ + int ocl_getUniqueDevices(std::vector &devices); + + /* + Name: setUp + Info: set up opencl resources + Return: success or error code + */ + int ocl_setUp(const char* device_name); + + /* + Name: loadKernel + Info: load and compile opencl kernel file if it has changed + Return: success or error code + */ + int ocl_loadKernel(const char* kernel_file); + + + /** Build program from kernel source. + * Builds a program from source code provided in kernel_source. 
+ * If compilation fails will return DKS_ERROR + */ + int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL); + + /* + Name: allocateMemory + Info: allocate memory on device + Return: return pointer to memory + */ + cl_mem ocl_allocateMemory(size_t size, int &ierr); + + /* + Name: allocateMemory + Info: allocate memory on device + Return: return pointer to memory + */ + cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); + + /* + Name: writeData + Info: write data to device memory (needs ptr to mem object) + Return: success or error code + */ + int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); + + /* + Name: copyData + Info: copy data from one buffer on the device to another + Return: success or error code + */ + int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size); + + /* + Name: createKernel + Info: create kernel from program + Return: success or error code + */ + int ocl_createKernel(const char* kernel_name); + + /* + Name: setKernelArgs + Info: set opencl kernel arguments + Return: success or error code + */ + int ocl_setKernelArg(int idx, size_t size, const void *arg_value); + + /* + Name: executeKernel + Info: execute selected kernel (needs kernel parameters) + Return: success or error code + */ + int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL); + + /* + Name: readData + Info: read data from device (needs pointer to mem object) + Return: success or error code + */ + int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); + + /* + Name: freeMemory + Info: free device memory (needs ptr to mem object) + Return: success or error code + */ + int ocl_freeMemory(cl_mem mem_ptr); + + /* + Name: cleanUp + Info: free opencl resources + Return: success or error code + */ + int ocl_cleanUp(); + + /* + Name: deviceInfo + Info: print device info (mostly for debugging 
purposes) + Return: success or error code + */ + int ocl_deviceInfo(bool verbose = true); + + /* Check OpenCL kernel. + * Query device and check if it can run the kernel with required parameters + */ + int ocl_checkKernel(const char* kernel_name, int work_group_size, + bool double_precision, int &threadsPerBlock); + + /* + Name: clearEvents + Info: clear saved events (for debuging purposes) + Return: nothing + */ + void ocl_clearEvents(); + + /* + Name: eventInfo + Info: print information about kernel timings (for debuging purposes) + Return: nothing + */ + void ocl_eventInfo(); + + /* + Return current command queue + */ + cl_command_queue ocl_getQueue() { return m_command_queue; } +}; + +#endif + + + + + + + + diff --git a/src/OpenCL/OpenCLChiSquare.cpp b/src/OpenCL/OpenCLChiSquare.cpp new file mode 100644 index 0000000..7de4a62 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquare.cpp @@ -0,0 +1,157 @@ +#include "OpenCLChiSquare.h" + +double OpenCLChiSquare::ocl_sum(cl_mem data, int length) { + + + int ierr; + //calc number of thread sper workgroup and nr of work groups + size_t work_size_sum = 128; + size_t work_items = (size_t)length; + if (length % work_size_sum > 0) + work_items = (length / work_size_sum + 1) * work_size_sum; + + int work_groups = length / work_size_sum + 1; + + //create tmp array for partial sums + cl_mem tmp_ptr; + + double *partial_sums = new double[work_groups]; + tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr); + + //execute sum kernel + m_oclbase->ocl_createKernel("parallelReductionSum"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr); + m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum); + + //read partial sums and free temp mempry + m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups); + 
m_oclbase->ocl_freeMemory(tmp_ptr); + + //sumup partial sums on the host + double result = 0; + for (int i = 0; i < work_groups; i++) + result += partial_sums[i]; + + delete[] partial_sums; + + return result; + +} + +int OpenCLChiSquare::ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelPHistoTFFcn"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(3, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fRebin); + m_oclbase->ocl_setKernelArg(5, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, sensors*length); + + return DKS_SUCCESS; +} + +int OpenCLChiSquare::ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem t0 = 
(cl_mem)mem_t0; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelSingleGaussTF"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin); + m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, length); + + return DKS_SUCCESS; + +} + + +int OpenCLChiSquare::ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem t0 = (cl_mem)mem_t0; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelDoubleLorentzTF"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin); 
+ m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, length); + + return DKS_SUCCESS; + +} + + + diff --git a/src/OpenCL/OpenCLChiSquare.h b/src/OpenCL/OpenCLChiSquare.h new file mode 100644 index 0000000..bbc5da6 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquare.h @@ -0,0 +1,53 @@ +#ifndef H_OPENCL_CHI_SQUARE +#define H_OPENCL_CHI_SQUARE + +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "OpenCLBase.h" + +#define DKS_SUCCESS 0 +#define DKS_ERROR 1 + + +class OpenCLChiSquare { + +private: + + OpenCLBase *m_oclbase; + + double ocl_sum(cl_mem data, int length); + +public: + + OpenCLChiSquare(OpenCLBase *base) { + m_oclbase = base; + } + + ~OpenCLChiSquare() { } + + int ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, + double &result); + + int ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result); + + int ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result); + + + +}; + +#endif diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp new file mode 100644 index 0000000..f8e21a6 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp @@ -0,0 +1,316 @@ +#include "OpenCLChiSquareRuntime.h" + +OpenCLChiSquareRuntime::OpenCLChiSquareRuntime(OpenCLBase *base) { + + blockSize_m = BLOCK_SIZE; + 
numBlocks_m = -1; + + m_oclbase = base; + + N0_m = 1.0; + tau_m = 1.0; + bkg_m = 1.0; + alpha_m = 1.0; + beta_m = 1.0; + + ptx_m = NULL; + + initDone_m = false; + +} + +//free temporary resources +OpenCLChiSquareRuntime::~OpenCLChiSquareRuntime() { + delete[] ptx_m; + freeChiSquare(); +} + +//build program string +std::string OpenCLChiSquareRuntime::buildProgram(std::string function) { + + long fsize; + char *kernel_source; + + //get kernel source + char * kernel_file = new char[500]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl"); + + //read kernels from file + FILE *fp = fopen(kernel_file, "rb"); + if (!fp) + DEBUG_MSG("Can't open kernel file" << kernel_file); + + //get file size and allocate memory + fseek(fp, 0, SEEK_END); + fsize = ftell(fp); + kernel_source = new char[fsize+1]; + + //read file and content in kernel source + rewind(fp); + fread(kernel_source, 1, sizeof(char)*fsize, fp); + kernel_source[fsize] = '\0'; + fclose(fp); + + std::string kernel_string (kernel_source); + return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter; + +} + +int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) { + + //build program string + std::string openclProg = buildProgram(function); + + //compile flags + std::string opts(""); + if (mlh) + opts = "-DMLH"; + + //compile opencl program from source string + int ierr = m_oclbase->ocl_loadKernelFromSource(openclProg.c_str(), opts.c_str()); + + return ierr; +} + +double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) { + + + int ierr; + //calc number of thread sper workgroup and nr of work groups + size_t work_size_sum = 128; + + /* + size_t work_items = (size_t)length; + if (length % work_size_sum > 0) + work_items = (length / work_size_sum + 1) * work_size_sum; + int work_groups = length / work_size_sum + 1; + */ + + size_t work_items = 80 * work_size_sum; + int 
work_groups = 80; + + //create tmp array for partial sums + cl_mem tmp_ptr; + + double *partial_sums = new double[work_groups]; + tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr); + + //execute sum kernel + //ocl_createKernel("parallelReductionSum"); + m_oclbase->ocl_createKernel("parallelReductionTwoPhase"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr); + m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum); + + //read partial sums and free temp mempry + m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups); + m_oclbase->ocl_freeMemory(tmp_ptr); + + //sumup partial sums on the host + double result = 0; + for (int i = 0; i < work_groups; i++) + result += partial_sums[i]; + + delete[] partial_sums; + + return result; + +} + +int OpenCLChiSquareRuntime::launchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, double &result) +{ + + int ierr; + + //convert memory to cl_mem + cl_mem cl_mem_data = (cl_mem)mem_data; + cl_mem cl_mem_err = (cl_mem)mem_err; + + cl_mem cl_param = (cl_mem)mem_param_m; + cl_mem cl_chisq = (cl_mem)mem_chisq_m; + cl_mem cl_map = (cl_mem)mem_map_m; + cl_mem cl_func = (cl_mem)mem_func_m; + + //set work item size + size_t work_items; + size_t work_size = (size_t)blockSize_m; + if (numBlocks_m < 0) + work_items = (size_t)length; + else + work_items = (size_t)numBlocks_m * (size_t)blockSize_m; + + if (work_items % work_size > 0) + work_items = (work_items / work_size + 1) * work_size; + + if (fitType == FITTYPE_SINGLE_HISTO) { + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelChiSquareSingleHisto"); + + if (ierr != DKS_SUCCESS) + return ierr; + + //set kernel args + m_oclbase->ocl_setKernelArg(0, 
sizeof(cl_mem), &cl_mem_data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq); + m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map); + m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap); + m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart); + m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep); + m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m); + m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m); + m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m); + m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL); + m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL); + m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL); + + if (ierr != DKS_SUCCESS) + return ierr; + } else if (fitType == FITTYPE_ASYMMETRY) { + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry"); + + if (ierr != DKS_SUCCESS) + return ierr; + + //set kernel args + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq); + m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map); + m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap); + m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart); + m_oclbase->ocl_setKernelArg(11, sizeof(double), 
&timeStep); + m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m); + m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m); + m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL); + m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL); + m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL); + + if (ierr != DKS_SUCCESS) + return ierr; + } else if (fitType == FITTYPE_MU_MINUS) { + // not yet implemented + } else { + return DKS_ERROR; + } + + //execute kernel + ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + if (ierr != DKS_SUCCESS) + return ierr; + + //execute sum kernel + result = calculateSum((cl_mem)mem_chisq_m, length); + + return ierr; + +} + +int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) { + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams); + return ierr; +} + + +int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) { + if (numfunc == 0) + return DKS_SUCCESS; + + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc); + return ierr; +} + +int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) { + if (nummap == 0) + return DKS_SUCCESS; + + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap); + return ierr; +} + +int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, + int size_func, int size_map) +{ + + int ierr = DKS_ERROR; + if (initDone_m) { + DEBUG_MSG("Reinitializing ChiSquare"); + freeChiSquare(); + } + + //allocate temporary memory + mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr); + mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr); + if (size_func == 0) + size_func = 1; + mem_func_m = m_oclbase->ocl_allocateMemory(size_func*sizeof(double), ierr); + if (size_map == 0) + size_map = 1; + mem_map_m = m_oclbase->ocl_allocateMemory(size_map*sizeof(int), ierr); + initDone_m = 
true; + + return ierr; + +} + +int OpenCLChiSquareRuntime::freeChiSquare() { + + int ierr = DKS_ERROR; + if (initDone_m) { + + //free memory + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m); + + initDone_m = false; + } + + return ierr; + +} + +int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBlock) { + + int ierr; + char kernel[64]; + + switch (fitType) { + case FITTYPE_SINGLE_HISTO: + strncpy(kernel, "kernelChiSquareSingleHisto", sizeof(kernel)); + break; + case FITTYPE_ASYMMETRY: + strncpy(kernel, "kernelChiSquareAsymmetry", sizeof(kernel)); + break; + case FITTYPE_MU_MINUS: + // not yet implemented + default: + return DKS_ERROR; + } + + ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); + + return ierr; + +} + diff --git a/src/OpenCL/OpenCLChiSquareRuntime.h b/src/OpenCL/OpenCLChiSquareRuntime.h new file mode 100644 index 0000000..90b5c7c --- /dev/null +++ b/src/OpenCL/OpenCLChiSquareRuntime.h @@ -0,0 +1,103 @@ +#ifndef H_OPENCL_CHISQUARE_RUNTIME +#define H_OPENCL_CHISQUARE_RUNTIME + +#include +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "../Algorithms/ChiSquareRuntime.h" +#include "OpenCLBase.h" + +const std::string openclFunctHeader = "double fTheory(double t, __local double *p, __local double *f, __local int *m) {"; + +const std::string openclFunctFooter = "}\n"; + +class OpenCLChiSquareRuntime : public ChiSquareRuntime { + +private: + + OpenCLBase *m_oclbase; + + /** Private function to add user defined function to kernel string + * + */ + std::string buildProgram(std::string function); + + double calculateSum(cl_mem data, int length); + +public: + + /** Constructor wiht openclbase argument + * + */ + OpenCLChiSquareRuntime(OpenCLBase *base); + + /** Default constructor + * + */ + OpenCLChiSquareRuntime(); 
+ + /** Default destructor + * + */ + ~OpenCLChiSquareRuntime(); + + /** Compile program and save ptx. + * Add function string to the calcFunction kernel and compile the program + * Function must be valid C math expression. Parameters can be addressed in + * a form par[map[idx]] + */ + int compileProgram(std::string function, bool mlh = false); + + /** Launch selected kernel + * Launched the selected kernel from the compiled code. + * Result is put in &result variable + */ + int launchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result); + + /** Write params to device. + * Write params from double array to mem_param_m memory on the device. + */ + int writeParams(const double *params, int numparams); + + /** Write functions to device. + * Write function values from double array to mem_func_m memory on the device. + */ + int writeFunc(const double *func, int numfunc); + + /** Write maps to device. + * Write map values from int array to mem_map_m memory on the device. + */ + int writeMap(const int *map, int nummap); + + /** Allocate temporary memory needed for chi square. + * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to + * the maximum number of elements in any datasets that will be used for calculations. Size_param, + * size_func and size_map are the maximum number of parameters, functions and maps used in + * calculations. + */ + int initChiSquare(int size_data, int size_param, int size_func, int size_map); + + /** Free temporary memory allocated for chi square. + * Frees the chisq temporary memory and memory for params, functions and maps + */ + int freeChiSquare(); + + /** Check MuSR kernels for necessary resources. 
// Numeric constants for the collimator physics code.
// NOTE(review): units and physical meaning are not stated in this file; the
// names suggest proton mass, speed of light, Avogadro number, classical
// electron radius and electron mass - confirm against the kernel source.
#define M_P 0.93827231e+00
#define C 299792458.0
#define PI 3.14159265358979323846
#define AVO 6.022e23
#define R_E 2.81794092e-15
#define eM_E 0.51099906e-03
#define Z_P 1
// Parenthesised so the macro expands as one value in any expression
// (e.g. x / K divides by the whole product, not only by 4.0).
#define K (4.0*PI*AVO*R_E*R_E*eM_E*1e7)

// Indices into the parameter array
#define POSITION 0
#define ZSIZE 1
#define RHO_M 2
#define Z_M 3
#define A_M 4
#define A2_C 5
#define A3_C 6
#define A4_C 7
#define A5_C 8
#define X0_M 9
#define I_M 10
#define DT_M 11

#define BLOCK_SIZE 128
// Number of parameter indices defined above (POSITION .. DT_M)
#define NUMPAR 12
/**
 * Collimator physics on the OpenCL device - currently disabled.
 *
 * The function prints a notice and returns DKS_ERROR without touching any of
 * its arguments. The earlier implementation is kept below as a comment for
 * reference only.
 *
 * @param mem_ptr      unused while the implementation is disabled
 * @param par_ptr      unused while the implementation is disabled
 * @param numparticles unused while the implementation is disabled
 * @return always DKS_ERROR
 */
int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
					       int numparticles)
{
  /*
  //set number of total threads, and number threads per block
  size_t threads = 1;
  size_t blocks = numparticles;

  //cast void ptrs to cl_mem ptrs
  cl_mem data = (cl_mem)mem_ptr;
  cl_mem params = (cl_mem)par_ptr;

  int numparams = 19;

  //set kernel to execute and kernel arguments
  ocl_createKernel("kernelCollimatorPhysics");
  ocl_setKernelArg(0, sizeof(cl_mem), &data);
  ocl_setKernelArg(1, sizeof(cl_mem), &params);
  ocl_setKernelArg(2, sizeof(cl_mem), &defaultRndState);
  ocl_setKernelArg(3, sizeof(int), &numparticles);
  ocl_setKernelArg(4, sizeof(double)*numparams, NULL);

  std::cout << "blocks: " << blocks << ", threads: " << threads << std::endl;

  //execute kernel on device
  ocl_executeKernel(1, &blocks, &threads);

  //create functions for comparing two particles and counting particles with labels < 0

  BOOST_COMPUTE_FUNCTION(bool, sort_by_label, (PART_OPENCL a, PART_OPENCL b),
  {
    return a.label < b.label;
  });



  BOOST_COMPUTE_FUNCTION(bool, count_by_label, (PART_OPENCL a),
  {
    return a.label < 0;
  });


  //wrap cl_mem memory object in Boost.Compute buffer
  std::cout << "wrap buffer" << std::endl;
  boost::compute::buffer buf(data);

  //count particles with labels < 0
  std::cout << "wrap command queue" << std::endl;
  boost::compute::command_queue queue(ocl_getQueue());

  std::cout << "count if" << std::endl;


  numaddback = boost::compute::count_if(boost::compute::make_buffer_iterator(buf,0),
					boost::compute::make_buffer_iterator(buf,numparticles),
					count_by_label, queue);

  //sort particles with dead and leaving particles at the end using boost::compute
  numaddback = 0;
  if (numaddback > 0) {
    std::cout << "sort" << std::endl;
    boost::compute::sort(boost::compute::make_buffer_iterator(buf,0),
			 boost::compute::make_buffer_iterator(buf, numparticles),
			 sort_by_label, queue);
  }


  return DKS_SUCCESS;
*/
  // Tell the user that nothing was computed, then report failure.
  std::cout << "OpenCL implementation disabled" << std::endl;
  return DKS_ERROR;
}
void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) { return DKS_ERROR; } + + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1) + { + return DKS_ERROR; + } + + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1) + { + return DKS_ERROR; + } + +}; + +#endif diff --git a/src/OpenCL/OpenCLFFT.cpp b/src/OpenCL/OpenCLFFT.cpp new file mode 100644 index 0000000..5cbe9e9 --- /dev/null +++ b/src/OpenCL/OpenCLFFT.cpp @@ -0,0 +1,303 @@ +#include "OpenCLFFT.h" + +//=====================================// +//==========Private functions==========// +//=====================================// + +/* + call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform, + ndim - totla number of dimmensions, N - size of dimension +*/ +int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) { + + //set the number of work items in each dimension + size_t work_items[3]; + work_items[0] = N; + work_items[1] = (ndim > 1) ? N : 1; + work_items[2] = (ndim > 1) ? N : 1; + work_items[cdim] = N / 2; + + int f = (forward) ? 
1 : 0; + + //create kernel and set kernel arguments + if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS) + return OCL_ERROR; + + + //execute kernel + for (int step = 1; step < N; step <<= 1) { + if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + +/* + call ifft kernel to execute the bit reverse sort data - devevice memory ptr, cdim - current dim to transform, + ndim - totla number of dimmensions, N - size of dimension +*/ +int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) { + //set work item size + size_t work_items[3]; + work_items[0] = N; + work_items[1] = (ndim > 1) ? N : 1; + work_items[2] = (ndim > 2) ? 
N : 1; + + //create kernel and set kernel arguments + if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS) + return OCL_ERROR; + + int bits = log2(N); + if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS) + return OCL_ERROR; + + //execute kernel + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { + DEBUG_MSG("Error executing kernel"); + return OCL_ERROR; + } + + return OCL_SUCCESS; + +} + + +//=====================================// +//==========Public functions==========// +//=====================================// + +/* + call fft execution on device for every dimension +*/ +int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { + int ierr; + + cl_mem inout = (cl_mem)data; + int n = N[0]; + + for (int dim = 0; dim < ndim; dim++) { + ierr = ocl_callBitReverseKernel(inout, dim, ndim, n); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Error executing bit reverse"); + return OCL_ERROR; + } + + ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Error executing fft reverse"); + return OCL_ERROR; + } + } + + return OCL_SUCCESS; +} + +/* + execute ifft +*/ +int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) { + executeFFT(data, ndim, N, streamId, false); + return OCL_SUCCESS; +} + +/* + call kernel to normalize fft +*/ +int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { + + cl_mem inout = (cl_mem)data; + + int n = N[0]; + + //set work item size + size_t work_items[3]; + work_items[0] = n; + work_items[1] = (ndim > 1) ? n : 1; + work_items[2] = (ndim > 2) ? 
n : 1; + + //create kernel + if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS) + return OCL_ERROR; + + //set kernel args + unsigned int elements = pow(n, ndim); + if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS) + return OCL_ERROR; + if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS) + return OCL_ERROR; + + //execute kernel + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { + DEBUG_MSG("Error executing kernel"); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + +int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) { + + int ierr; + int size = sizeof(cl_double2)*pow(N,ndim); + + cl_mem mem_tmp; + cl_mem mem_src = (cl_mem)src; + cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr); + + //set the number of work items in each dimension + size_t work_items[3]; + int p = 1; + int threads = N / 2; + int f = (forward) ? -1 : 1; + + //execute kernel + int n = (int)log2(N); + for (int i = 0; i < ndim; i++) { + + int dim = i+1; + p = 1; + work_items[0] = (dim == 1) ? N/2 : N; + work_items[1] = (dim == 2) ? N/2 : N; + work_items[2] = (dim == 3) ? 
N/2 : N; + + //transpose array if calculating dimension larger than 1 + //if (dim > 1) + // ocl_executeTranspose(mem_src, N, ndim, dim); + + //create kernel and set kernel arguments + if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS) + return OCL_ERROR; + + for (int t = 1; t <= log2(N); t++) { + + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst); + m_oclbase->ocl_setKernelArg(2, sizeof(int), &p); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads); + m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim); + m_oclbase->ocl_setKernelArg(5, sizeof(int), &f); + + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) + return OCL_ERROR; + + mem_tmp = mem_src; + mem_src = mem_dst; + mem_dst = mem_tmp; + + p = 2*p; + } + + //transpose array back if calculating dimension larger than 1 + //if (dim > 1) + // ocl_executeTranspose(mem_src, N, ndim, dim); + } + + if (ndim*n % 2 == 1) { + m_oclbase->ocl_copyData(mem_src, mem_dst, size); + mem_tmp = mem_src; + mem_src = mem_dst; + mem_dst = mem_tmp; + } + + m_oclbase->ocl_freeMemory(mem_dst); + + return OCL_SUCCESS; + +} + +int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) { + + cl_mem mem_src = (cl_mem)src; + + size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N}; + size_t work_group_size[3] = {(size_t)N/2, 1, 1}; + + m_oclbase->ocl_createKernel("fft_batch3D"); + + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL); + m_oclbase->ocl_setKernelArg(4, sizeof(int), &N); + + + for (int dim = 1; dim < ndim+1; dim++) { + m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim); + m_oclbase->ocl_executeKernel(3, work_items, work_group_size); + } + + return OCL_SUCCESS; +} + +int OpenCLFFT::ocl_executeTranspose(void *src, int 
N[3], int ndim, int dim) { + + cl_mem mem_src = (cl_mem)src; + + if (ndim == 1) + return OCL_SUCCESS; + + size_t work_items[3]; + work_items[0] = N[0]; + work_items[1] = N[1]; + work_items[2] = 1; + + size_t work_group_size[3]; + work_group_size[0] = N[0]; + work_group_size[1] = N[1]; + work_group_size[2] = 1; + + size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2]; + + m_oclbase->ocl_createKernel("transpose"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]); + m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL); + m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size); + + return OCL_SUCCESS; +} + +/* +void OpenCLFFT::printData3DN4(cl_double2* &data, int N) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].x; + if (d > 10e-5 || d < -10e-5) + std::cout << d << "\t"; + else + std::cout << 0 << "\t"; + } + } + std::cout << std::endl; + } + std::cout << std::endl; + +} +*/ + + + + diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h new file mode 100644 index 0000000..31816f9 --- /dev/null +++ b/src/OpenCL/OpenCLFFT.h @@ -0,0 +1,113 @@ +/* + + Name: OpenCLFFT + + Author: Uldis Locans + + Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL + + Data: 19.09.2014 + +*/ +#ifndef H_OPENCL_FFT +#define H_OPENCL_FFT + + +#include +#include +#include + +#include "../Algorithms/FFT.h" +#include "OpenCLBase.h" + +class OpenCLFFT : public DKSFFT { + +private: + + OpenCLBase *m_oclbase; + + /* + Info: call fft kernels to execute FFT of the given domain, + data - devevice memory ptr, cdim - current dim to transform, + ndim - totla number of dimmensions, N - size of dimension + Return: success or error code + */ + int 
ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward = true); + + /* + Info: call ifft kernel to execute the bit reverse sort + data - devevice memory ptr, cdim - current dim to transform, + ndim - totla number of dimmensions, N - size of dimension + Return: success or error code + */ + int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N); + +public: + + /* constructor - currently does nothing*/ + OpenCLFFT(OpenCLBase *base) { + m_oclbase = base; + } + + /* destructor - currently does nothing*/ + ~OpenCLFFT() { } + + /* + Info: execute forward fft function with data set on device + Return: success or error code + */ + //int ocl_executeFFT(cl_mem &data, int ndim, int N, bool forward = true); + int executeFFT(void *data, int ndim, int N[3], int streamId = -1, bool forward = true); + + /* + Info: execute inverse fft with data set on device + Return: success or error code + */ + //int ocl_executeIFFT(cl_mem &data, int ndim, int N); + int executeIFFT(void *data, int ndim, int N[3], int streamId = -1); + + /* + Info: execute normalize kernel + Return: success or error code + */ + //int ocl_normalizeFFT(cl_mem &data, int ndim, int N); + int normalizeFFT(void *data, int ndim, int N[3], int streamId = -1); + + /* + Info: set FFT size + Return: success or error code + */ + int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; } + + int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + + int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + + int destroyFFT() { return DKS_SUCCESS; } + + int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], + int streamId = -1) + { + return DKS_ERROR; + } + int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], + int streamId = -1) + { + return DKS_ERROR; + } + int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) + { + return DKS_ERROR; + } + + int ocl_executeFFTStockham(void* &src, int ndim, int N, 
bool forward = true); + + int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true); + + int ocl_executeTranspose(void *src, int N[3], int ndim, int dim); + + //void printData3DN4(cl_double2* &data, int N); + +}; + +#endif diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl new file mode 100644 index 0000000..f08f268 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl @@ -0,0 +1,175 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define TAU 2.197019 + + +__kernel void parallelReductionSum(__global double *data_in, __global double *data_out, + __local double *data_local, int size) +{ + + //get local and global ids, and work group size + int local_id = get_local_id(0); + int global_id = get_global_id(0); + int group_size = get_local_size(0); + + //copy from global memory to local, if global id out of bounds fill with 0s + if (global_id < size) + data_local[local_id] = data_in[global_id]; + else + data_local[local_id] = 0; + + //loop trough reduction steps + for (uint stride = group_size / 2; stride > 0; stride /= 2) { + + //synch all work items in work group + barrier(CLK_LOCAL_MEM_FENCE); + + //create partials summs each step + if (local_id < stride) + data_local[local_id] += data_local[local_id + stride]; + } + + //local thread 0 writes final partial sum to global memory + if (local_id == 0) + data_out[get_group_id(0)] = data_local[0]; + +} + +__kernel void kernelPHistoTFFcn(__global double *data, __global double *par, __global double *chisq, + double fTimeResolution, double fRebin, + int length, int sensors, int numpar, + __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load parameters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + + double dt0 = 
fTimeResolution * 0.5 * (fRebin - 1); + double time = dt0 + fTimeResolution * fRebin * j; + double w = p[0]*0.08516155035269027; + double tt = exp(-time/TAU); + double pp = exp(-0.5 * pow(p[1]*time, 2.0)); + double wt = w * time; + + + int idx; + double ldata, theo; + for (int i = 0; i < sensors; i++) { + idx = i * length + j; + ldata = data[idx]; + + theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; + + if (ldata != 0.0) + chisq[idx] = (theo - ldata) * (theo - ldata) / ldata; + else + chisq[idx] = theo * theo; + } + } +} + +__kernel void kernelSingleGaussTF(__global double *data, __global unsigned int *t0, + __global double *par, __global double *result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int length, int sensors, int numpar, __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load para,eters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + double dt0 = fTimeResolution*0.5*(fRebin - 1); + double w1 = par[0]*0.08516155035269027; + + int idx; + double ldata, lft0, theo, time; + for (int i = 0; i < sensors; i++) { + idx = i * length + j; + lft0 = t0[i]; + if (j >= lft0 + fGoodBinOffset/fRebin) { + ldata = data[idx]; + time = dt0 + fTimeResolution * fRebin* (j - lft0); + theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0)) + *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; + // 1.74532925199432955e-2 = pi/180 + + if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) ) + result[idx] = (theo - ldata) + ldata*log(ldata/theo); + else + result[idx] = theo - ldata; + } else { + result[idx] = 0; + } + } + } + +} + +__kernel void kernelDoubleLorentzTF(__global double *data, __global unsigned int *t0, + __global double *par, __global double *result, + double fTimeResolution, 
double fRebin, double fGoodBinOffset, + int length, int sensors, int numpar, __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load para,eters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + double dt0 = fTimeResolution*0.5*(fRebin - 1); + double w1 = p[0]*0.08516155035269027; + double w2 = p[2]*0.08516155035269027; + + int idx; + double ldata, lft0, theo, time; + for (int i = 0; i < sensors; i++) { + + idx = i * length + j; + lft0 = t0[i]; + if (j >= lft0 + fGoodBinOffset/fRebin) { + ldata = data[idx]; + time = dt0+fTimeResolution*fRebin*(j-lft0); + + theo = p[4+i*5]*exp(-time/TAU)* + (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)* + cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+ + (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)* + cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5]; + // 1.74532925199432955e-2 = pi/180 + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + result[idx] = (theo - ldata) + ldata*log(ldata/theo); + else + result[idx] = theo - ldata; + } else { + result[idx] = 0; + } + } + } + +} + diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl new file mode 100644 index 0000000..bdc9374 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl @@ -0,0 +1,344 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define PI 3.141592653589793115998 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +/** From 'Numerical Recipes in C' by Press et.al, 1992. */ +//Returns the Bessel function J0(x) for any real x. +double bessj0(double x) { + double ax,z; + double xx,y,ans,ans1,ans2; //Accumulate polynomials in double precision. + + if ((ax=fabs(x)) < 8.0) { //Direct rational function fit. 
+ y=x*x; + ans1=57568490574.0+y*(-13362590354.0+y*(651619640.7+y*(-11214424.18+y*(77392.33017+y*(-184.9052456))))); + ans2=57568490411.0+y*(1029532985.0+y*(9494680.718+y*(59272.64853+y*(267.8532712+y*1.0)))); + ans=ans1/ans2; + } else { //Fitting function (6.5.9). + z=8.0/ax; + y=z*z; + xx=ax-0.785398164; + ans1=1.0+y*(-0.1098628627e-2+y*(0.2734510407e-4+y*(-0.2073370639e-5+y*0.2093887211e-6))); + ans2 = -0.1562499995e-1+y*(0.1430488765e-3+y*(-0.6911147651e-5+y*(0.7621095161e-6-y*0.934945152e-7))); + ans=sqrt(0.636619772/ax)*(cos(xx)*ans1-z*sin(xx)*ans2); + } + return ans; +} + +/** Theory function declaration. + * Definition of the theory function will be build during runtime before compilation. + */ +double fTheory(double t, __local double *p, __local double *f, __local int *m); + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. 
+ */
+
+/* NOTE(review): the short function names presumably mirror the MusrFit
+ * theory-function abbreviations - confirm against the MusrFit manual.
+ * Angles (phi) are given in degrees and converted with DEG_TO_RAD,
+ * frequencies (nu) are multiplied by TWO_PI before use.
+ */
+
+/* se(t) = exp(-lamda*t) */
+double se(double t, double lamda) {
+  return exp( -lamda*t );
+}
+
+/* ge(t) = exp(-(lamda*t)^beta) */
+double ge(double t, double lamda, double beta) {
+  return exp( -pow(lamda*t, beta) );
+}
+
+/* sg(t) = exp(-(sigma*t)^2 / 2) */
+double sg(double t, double sigma) {
+  return exp( -0.5 * pow(sigma*t, 2) );
+}
+
+/* stg(t) = 1/3 + 2/3 * (1 - (sigma*t)^2) * exp(-(sigma*t)^2 / 2) */
+double stg(double t, double sigma) {
+  double sigmatsq = pow(sigma*t,2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+/* sekt(t) = 1/3 + 2/3 * (1 - lambda*t) * exp(-lambda*t) */
+double sekt(double t, double lambda) {
+  double lambdat = lambda*t;
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+/* lgkt(t) = 1/3 + 2/3 * (1 - lambda*t - (sigma*t)^2) * exp(-lambda*t - (sigma*t)^2 / 2) */
+double lgkt(double t, double lambda, double sigma) {
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+/* skt(t) = 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta);
+ * returns 0 when beta < 1e-3, which also avoids the division by beta.
+ */
+double skt(double t, double sigma, double beta) {
+  if (beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+/* spg(t) = 1/3 * exp(-rateL) + 2/3 * (1 - lamt2q/rateT) * exp(-rateT)
+ * with rate2  = 4*lambda^2*(1-q)*t/gamma,
+ *      rateL  = sqrt(|rate2|),
+ *      lamt2q = t^2*lambda^2*q,
+ *      rateT  = sqrt(|rate2| + lamt2q).
+ * For t == 0 or lambda == 0 both lamt2q and rateT vanish and the ratio
+ * lamt2q/rateT would be 0/0 (NaN); the limit of the expression there is
+ * 1/3 + 2/3 = 1, so that value is returned directly.
+ */
+double spg(double t, double lambda, double gamma, double q) {
+  if (t == 0.0 || lambda == 0.0)
+    return 1.0;
+
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q;
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+/* rahf(t) = 1/6 * (1 - nu*t/2) * exp(-nu*t/2)
+ *         + 1/3 * (1 - nu*t/4) * exp(-(nu*t + 2.44949*lambda*t)/4)
+ */
+double rahf(double t, double nu, double lambda) {
+  double nut = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+/* tf(t) = cos(2*pi*nu*t + phi), phi in degrees */
+double tf(double t, double phi, double nu) {
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD * phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+/* ifld(t) = alpha * cos(2*pi*nu*t + phi) * exp(-lambdaT*t)
+ *         + (1 - alpha) * exp(-lambdaL*t), phi in degrees
+ */
+double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+/* b(t) = J0(2*pi*nu*t + phi), phi in degrees; J0 is evaluated by bessj0() */
+double b(double t, double phi, double nu) {
+  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+/* ib(t) = alpha * J0(2*pi*nu*t + phi) * exp(-lambdaT*t)
+ *       + (1 - alpha) * exp(-lambdaL*t), phi in degrees
+ */
+double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi;
+
+  return alpha*bessj0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+/* ab(t) = exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)) */
+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+/* snkzf(t) = 1/3 + 2/3 * aa^1.5 * (1 - D0t2*aa) * exp(-D0t2*aa/2)
+ * with D0t2 = (Delta0*t)^2 and aa = 1/(1 + Rb^2*D0t2)
+ */
+double snkzf(double t, double Delta0, double Rb) {
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+/* snktf(t) = sqrt(aa) * exp(-D0t2*aa/2) * cos(2*pi*nu*t + phi), phi in degrees,
+ * with D0t2 and aa as in snkzf
+ */
+double snktf(double t, double phi, double nu, double Delta0, double Rb) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+/* dnkzf(t) = sqrt(aa) * exp(-2*Delta0^2*theta*aa)
+ * with theta = (exp(-nuc*t) - 1 + nuc*t)/nuc^2 and
+ *      aa    = 1/(1 + 4*(Rb*Delta0)^2*theta).
+ * theta uses "+ nuc*t" (same form as ab() above): with "- nuc*t" theta is
+ * negative for every nuc*t > 0, which lets the denominator of aa cross zero
+ * and turns sqrt(aa) into NaN. theta -> t^2/2 for nuc -> 0, which is used
+ * for nuc == 0 to avoid 0/0.
+ * NOTE(review): sign corrected on mathematical grounds - confirm against the
+ * MusrFit reference implementation.
+ */
+double dnkzf(double t, double Delta0, double Rb, double nuc) {
+  double nuct = nuc*t;
+  double theta = (nuc == 0.0) ? 0.5*t*t : (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+/* dnktf(t) = sqrt(aa) * exp(-Delta0^2*theta*aa) * cos(2*pi*nu*t + phi),
+ * phi in degrees, theta as in dnkzf and aa = 1/(1 + 2*(Rb*Delta0)^2*theta).
+ * See dnkzf for the sign of the nuc*t term and the nuc == 0 case.
+ */
+double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double nuct = nuc*t;
+  double theta = (nuc == 0.0) ? 0.5*t*t : (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+/* Per-bin chi-square (or, with MLH defined, log-likelihood) term for a single
+ * histogram fit.
+ * par/funcv/map (numpar/numfunc/nummap entries) are first copied into the
+ * local buffers p/f/m by all work items of the group. Each work item then
+ * walks the bins j, j + global_size, ... below length and writes
+ *   theo = N0 * exp(-t/tau) * (1 + fTheory(t, p, f, m)) + bkg,
+ *   t    = timeStart + j*timeStep
+ * compared against data[j] (and err[j]) into chisq[j].
+ */
+__kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
+  __global double *par, __global double *chisq, __global int *map, __global double *funcv,
+  int length, int numpar, int numfunc, int nummap,
+  double timeStart, double timeStep,
+  double tau, double N0, double bkg,
+  __local double *p, __local double *f, __local int *m)
+{
+
+  //get thread id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //load parameters from global to shared memory
+  while (tid < numpar) {
+    p[tid] = par[tid];
+    tid += lsize;
+  }
+
+  //load functions from global to shared memory
+  tid = get_local_id(0);
+  while (tid < numfunc) {
+    f[tid] = funcv[tid];
+    tid += lsize;
+  }
+
+  //load maps from global memory
+  tid = get_local_id(0);
+  while (tid < nummap) {
+    m[tid] = map[tid];
+    tid += lsize;
+  }
+
+  //sync threads
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  while (j < length) {
+
+    double t = timeStart + j*timeStep;
+    double ldata = data[j];
+    double lerr = err[j];
+
+    double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg;
+
+    #ifdef MLH
+    if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9))
+      chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo));
+    else
+      chisq[j] = 2.0 * (theo - ldata);
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+    j += get_global_size(0);
+  }
+
+}
+
+/* Per-bin chi-square term for an asymmetry fit.
+ * Same staging of par/funcv/map into p/f/m as kernelChiSquareSingleHisto.
+ * With a = fTheory(t, p, f, m) the model value is
+ *   theo = ((alpha*beta + 1)*a - (alpha - 1)) / ((alpha + 1) - (alpha*beta - 1)*a).
+ * With MLH defined every chisq[j] is set to 0.
+ */
+__kernel void kernelChiSquareAsymmetry(__global double *data, __global double *err,
+  __global double *par, __global double *chisq, __global int *map, __global double *funcv,
+  int length, int numpar, int numfunc, int nummap,
+  double timeStart, double timeStep,
+  double alpha, double beta,
+  __local double *p, __local double *f, __local int *m)
+{
+
+  //get thread id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //load parameters from global to shared memory
+  while (tid < numpar) {
+    p[tid] = par[tid];
+    tid += lsize;
+  }
+
+  //load functions from global to shared memory
+  tid = get_local_id(0);
+  while (tid < numfunc) {
+    f[tid] = funcv[tid];
+    tid += lsize;
+  }
+
+  //load maps from global memory; this has to be a strided loop like the two
+  //above, a plain 'if' leaves m[lsize..nummap-1] unset when nummap > lsize
+  tid = get_local_id(0);
+  while (tid < nummap) {
+    m[tid] = map[tid];
+    tid += lsize;
+  }
+
+  //sync threads
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  while (j < length) {
+
+    double t = timeStart + j*timeStep;
+    double ldata = data[j];
+    double lerr = err[j];
+
+    double ab = alpha*beta;
+    double theoVal = fTheory(t, p, f, m);
+    double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0)-(ab-1.0)*theoVal);
+
+    #ifdef MLH
+    chisq[j] = 0.0; // max log likelihood not defined for asymmetry fit
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+    j += get_global_size(0);
+  }
+
+}
+
+/* Work-group sum: every work item loads one element of data_in (0 when its
+ * global id is >= size) into data_local, the group folds the buffer by
+ * repeated halving and work item 0 writes the group total to
+ * data_out[group id].
+ * NOTE: the halving loop only folds every element when the work-group size
+ * is a power of two; for other sizes some partial sums are dropped.
+ */
+__kernel void parallelReductionSum(__global double *data_in, __global double *data_out,
+  __local double *data_local, int size)
+{
+
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int group_size = get_local_size(0);
+
+  //copy from global memory to local, if global id out of bounds fill with 0s
+  if (global_id < size)
+    data_local[local_id] = data_in[global_id];
+  else
+    data_local[local_id] = 0;
+
+  //loop through reduction steps
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
+
+/* Two-phase sum: each work item first accumulates data_in[global_id],
+ * data_in[global_id + global_size], ... below size into a private value,
+ * then the work group folds these values as in parallelReductionSum and
+ * work item 0 writes the group total to data_out[group id].
+ * NOTE: as above, the halving loop assumes a power-of-two work-group size.
+ */
+__kernel void parallelReductionTwoPhase(__global double *data_in, __global double *data_out,
+  __local double *data_local, int size)
+{
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int global_size = get_global_size(0);
+  int group_size = get_local_size(0);
+
+  double acc = 0;
+  while (global_id < size) {
+    acc += data_in[global_id];
+    global_id += global_size;
+  }
+
+  //parallel reduction on local work group
+  data_local[local_id] = acc;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
diff --git a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
new file mode 100644
index 0000000..34b08bd
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@@ -0,0 +1,362 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+/******Random numbers********/
+
+/* struct for random number state:
+ * s10..s12 and s20..s22 are the two three-word recurrences advanced by
+ * rand_uniform(); z caches the second Gaussian sample produced by
+ * rand_normal() and gen == false marks that cached sample as pending.
+ */
+typedef struct {
+
+  double s10;
+  double s11;
+  double s12;
+  double s20;
+  double s21;
+  double s22;
+  double z;
+  bool gen;
+
+} RNDState;
+
+#define NORM 2.328306549295728e-10
+#define M1 4294967087.0
+#define M2 4294944443.0
+#define A12 1403580.0
+#define A13N 810728.0
+#define A21 527612.0
+#define A23N 1370589.0
+
+/* MRG32k3a uniform random number generator.
+ * Advances both recurrences in *s and returns their combination scaled by
+ * NORM. The returned value is strictly positive (p1 - p2 + M1 > 0 because
+ * p2 < M2 < M1), so log() of it in rand_normal() stays finite.
+ */
+double rand_uniform(RNDState *s) {
+  long k;
+  double p1, p2;
+
+  /* Component 1 */
+  p1 = A12 * (*s).s11 - A13N * (*s).s10;
+  k = p1 / M1;
+  p1 -= k * M1;
+  if (p1 < 0.0)
+    p1 += M1;
+  (*s).s10 = (*s).s11;
+  (*s).s11 = (*s).s12;
+  (*s).s12 = p1;
+
+  /* Component 2 */
+  p2 = A21 * (*s).s22 - A23N * (*s).s20;
+  k = p2 / M2;
+  p2 -= k * M2;
+  if (p2 < 0.0)
+    p2 += M2;
+  (*s).s20 = (*s).s21;
+  (*s).s21 = (*s).s22;
+  (*s).s22 = p2;
+
+  /* Combination */
+  if (p1 <= p2)
+    return ((p1 - p2 + M1) * NORM);
+  else
+    return ((p1 - p2) * NORM);
+}
+
+/* get random variable with gaussian distribution (mean mu, width sigma).
+ * Two uniform samples are turned into two Gaussian samples z0 and z; z0 is
+ * returned immediately and z is stored in the state and handed out by the
+ * next call (signalled by gen == false).
+ */
+double rand_normal(RNDState *s, double mu, double sigma) {
+
+  const double two_pi = 2.0 * 3.141592653589793223846;
+  double z0;
+
+  //a cached sample is pending: return it and mark the cache as used
+  if (!(*s).gen) {
+    (*s).gen = true;
+    return (*s).z * sigma + mu;
+  }
+
+  double u1, u2;
+  u1 = rand_uniform(s);
+  u2 = rand_uniform(s);
+
+  z0 = sqrt(-2.0 * log(u1)) * cos(two_pi * u2);
+  (*s).z = sqrt(-2.0 * log(u1)) * sin(two_pi * u2);
+  (*s).gen = false;
+
+  return z0 * sigma + mu;
+
+
+}
+
+/* initialize random states: work item id (< N) fills s[id] with fixed base
+ * values offset by id.
+ * NOTE(review): the seed argument is not used by this kernel, so every run
+ * produces the same streams - confirm whether that is intended.
+ */
+__kernel void initRand(__global RNDState *s, unsigned int seed, int N) {
+
+  int id = get_global_id(0);
+
+  if (id < N) {
+    RNDState tmp;
+    int tmp_seed = id;// * 0x100000000ULL;
+    tmp.s10 = 12345 + tmp_seed;
+    tmp.s11 = 12345 + tmp_seed;
+    tmp.s12 = 123 + tmp_seed;
+    tmp.s20 = 12345 + tmp_seed;
+    tmp.s21 = 12345 + tmp_seed;
+    tmp.s22 = 123 + tmp_seed;
+
+    tmp.z = 0;
+    tmp.gen = true;
+
+    s[id] = tmp;
+  }
+
+}
+
+
+/**********Degrader**********/
+
+/* indexes into the parameter array handed to the kernel (NUMPARAMS entries) */
+enum PARAMS { POSITION,
+              ZSIZE,
+              M_P,
+              C,
+              RHO_M,
+              PI,
+              AVO,
+              R_E,
+              eM_E,
+              Z_M,
+              A_M,
+              A2_C,
+              A3_C,
+              A4_C,
+              A5_C,
+              Z_P,
+              X0_M,
+              I_M,
+              DT_M};
+
+
+/* particle record: label is written by kernelCollimatorPhysics
+ * (0, -1 or -2), Rincol / Pincol are its position and momentum vectors
+ */
+typedef struct {
+  int label;
+  unsigned localID;
+  double3 Rincol;
+  double3 Pincol;
+} PART;
+
+/* scalar product of two 3-vectors */
+double Dot(double3 d1, double3 d2) {
+  return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z);
+}
+
+/* check if particle is in degrader material: position < z <= position + zsize */
+bool checkHit(double z, double position, double zsize) {
+  return ( ( z > position) && ( z <= position + zsize) );
+}
+
+/* calculate particles energy loss over the time step deltat.
+ * Updates *Eng in place using one of two dE/dx expressions selected by the
+ * current value of *Eng (0.00001 < Eng < 0.0006, or Eng >= 0.0006), each
+ * with a Gaussian fluctuation of width sigma_E drawn from s.
+ * *pdead is set when the resulting energy is below 1E-4 or dEdx is positive.
+ */
+void energyLoss(double *Eng, bool *pdead, double deltat, RNDState *s, __local double *par) {
+
+  double dEdx = 0.0;
+  double gamma = ( (*Eng) + par[M_P]) / par[M_P];
+
+  double gamma2 = gamma * gamma;
+
+  double beta = sqrt(1.0 - 1.0 / gamma2);
+  double beta2 = beta * beta;
+  double deltas = deltat * beta * par[C];
+  double deltasrho = deltas * 100 * par[RHO_M];
+  double K = 4.0 * par[PI] * par[AVO] * par[R_E] * par[R_E] * par[eM_E] * 1E7;
+  double sigma_E = sqrt(K * par[eM_E] * par[RHO_M] * (par[Z_M]/par[A_M])* deltas * 1E5);
+
+  if (((*Eng) > 0.00001) && ((*Eng) < 0.0006)) {
+    double Ts = ((*Eng)*1E6)/1.0073;
+    double epsilon_low = par[A2_C]*pow(Ts,0.45);
+    double epsilon_high = (par[A3_C]/Ts)*log(1+(par[A4_C]/Ts)+(par[A5_C]*Ts));
+    double epsilon = (epsilon_low*epsilon_high)/(epsilon_low + epsilon_high);
+    dEdx = - epsilon /(1E21*(par[A_M]/par[AVO]));
+    double delta_Eave = deltasrho * dEdx;
+    double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
+
+    (*Eng) = (*Eng) + delta_E / 1E3;
+  }
+
+  if ((*Eng) >= 0.0006) {
+    double Tmax = 2.0 * par[eM_E] * 1e9 * beta2 * gamma2 /
+      (1.0 + 2.0 * gamma * par[eM_E] / par[M_P] +
+       (par[eM_E] / par[M_P]) * (par[eM_E] / par[M_P]));
+    dEdx = -K * par[Z_P] * par[Z_P] * par[Z_M] / (par[A_M] * beta2) *
+      (1.0 / 2.0 * log(2 * par[eM_E] * 1e9 * beta2 * gamma2 *
+                       Tmax / par[I_M] / par[I_M]) - beta2);
+
+    double delta_Eave = deltasrho * dEdx;
+    double delta_E = delta_Eave + rand_normal(s, 0, sigma_E);
+
+    (*Eng) = (*Eng)+delta_E / 1E3;
+  }
+
+  (*pdead) = (((*Eng)<1E-4) || (dEdx>0));
+
+}
+
+/* rotate particle: rotates the (x, z) momentum components of *P by thetacou
+ * and, for coord == 1 or coord == 2, displaces the (x, z) components of *R.
+ * coord == 0 changes the momentum only. The y components are never touched.
+ */
+void Rot(double3 *P, double3 *R, double xplane,
+         double normP, double thetacou, double deltas, int coord,
+         __local double *par)
+{
+  double Psixz;
+  double pxz;
+
+  double px = (*P).x;
+  double pz = (*P).z;
+  double x = (*R).x;
+  double z = (*R).z;
+
+  //angle of the momentum in the x-z plane, shifted by quadrant
+  if (px>=0 && pz>=0) Psixz = atan(px/pz);
+  else if (px>0 && pz<0)
+    Psixz = atan(px/pz) + par[PI];
+  else if (px<0 && pz>0)
+    Psixz = atan(px/pz) + 2*par[PI];
+  else
+    Psixz = atan(px/pz) + par[PI];
+
+  pxz = sqrt(px*px + pz*pz);
+  if(coord==1) {
+    (*R).x = x + deltas * px/normP + xplane*cos(Psixz);
+    (*R).z = z - xplane * sin(Psixz);
+  }
+  if(coord==2) {
+    (*R).x = x + deltas * px/normP + xplane*cos(Psixz);
+    (*R).z = z - xplane * sin(Psixz) + deltas * pz / normP;
+  }
+  (*P).x = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou);
+  (*P).z = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou);
+}
+
+
+/* multiple Coulomb scattering of one particle over the time step deltat.
+ * Draws a scattering angle (|angle| <= 3.5*theta0) and a plane offset, applies
+ * them with Rot(), and with probability 0.0047 applies an additional
+ * large-angle kick; this is done twice ("x-direction" and "y-direction").
+ * The random draws were annotated in the original with their curand_* and
+ * gsl_* counterparts (curand_normal_double / gsl_ran_gaussian for
+ * rand_normal, curand_uniform_double / gsl_rng_uniform for rand_uniform).
+ * NOTE(review): Rot() only reads and writes the x and z components, so the
+ * "y-direction" pass below rotates in the x-z plane a second time and
+ * P.y / R.y are never scattered - confirm against the reference
+ * implementation.
+ */
+void coulombScat(double3 *R, double3 *P, double deltat,
+                 RNDState *s, __local double* par) {
+
+  double dotP = Dot((*P), (*P));
+
+  double Eng = sqrt(dotP + 1.0) * par[M_P] - par[M_P];
+  double gamma = (Eng + par[M_P]) / par[M_P];
+  double beta = sqrt(1.0 - 1.0 / (gamma * gamma));
+  double normP = sqrt(dotP);
+  double deltas = deltat * beta * par[C];
+  double theta0 = 13.6e6 / (beta * sqrt(dotP) * par[M_P] * 1e9) *
+    par[Z_P] * sqrt(deltas / par[X0_M]) * (1.0 + 0.038 * log(deltas / par[X0_M]));
+
+  // x-direction: See Physical Review, "Multiple Scattering"
+  double z1 = rand_normal(s, 0, 1);
+  double z2 = rand_normal(s, 0, 1);
+  double thetacou = z2 * theta0;
+
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = rand_normal(s, 0, 1);
+    z2 = rand_normal(s, 0, 1);
+    thetacou = z2 * theta0;
+  }
+
+  double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  int coord = 1;
+  Rot(P, R, xplane, normP, thetacou, deltas, coord, par);
+
+  double P2 = rand_uniform(s);
+  if(P2 < 0.0047) {
+    double P3 = rand_uniform(s);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = rand_uniform(s);
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    coord = 0; // no change in coordinates but one in momenta-direction
+    Rot(P, R, xplane, normP, thetaru, deltas, coord, par);
+  }
+
+  // y-direction: See Physical Review, "Multiple Scattering"
+  z1 = rand_normal(s, 0, 1);
+  z2 = rand_normal(s, 0, 1);
+  thetacou = z2 * theta0;
+
+  while(fabs(thetacou) > 3.5 * theta0) {
+    z1 = rand_normal(s, 0, 1);
+    z2 = rand_normal(s, 0, 1);
+    thetacou = z2 * theta0;
+  }
+
+  double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0;
+  coord = 2;
+  Rot(P, R, yplane, normP, thetacou, deltas, coord, par);
+
+  P2 = rand_uniform(s);
+  if(P2 < 0.0047) {
+    double P3 = rand_uniform(s);
+    double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0;
+    double P4 = rand_uniform(s);
+    if(P4 > 0.5)
+      thetaru = -thetaru;
+    coord = 0; // no change in coordinates but one in momenta-direction
+    Rot(P, R, yplane, normP, thetaru, deltas, coord, par);
+  }
+
+}
+
+#define NUMPARAMS 19
+/* One time step of the degrader physics for particle idx (< numparticles).
+ * par (NUMPARAMS entries, indexed by enum PARAMS) is staged into the local
+ * buffer p. A particle inside the material (checkHit) loses energy and, if it
+ * survives, has its momentum rescaled and is scattered; a particle outside
+ * the material is drifted by one time step.
+ * On return data[idx].label is 0 (in material, alive), -1 (in material, dead)
+ * or -2 (outside material); position, momentum and RNG state are written back.
+ */
+__kernel void kernelCollimatorPhysics(__global PART *data, __global double *par,
+  __global RNDState *state, int numparticles,
+  __local double *p)
+{
+
+  //get local and global id
+  int tid = get_local_id(0);
+  int idx = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //transfer params to local memory; strided so that work groups with fewer
+  //than NUMPARAMS work items still copy every parameter
+  for (int i = tid; i < NUMPARAMS; i += lsize)
+    p[i] = par[i];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  //work items beyond the particle count have nothing to load or store; no
+  //barrier follows, so they can leave here instead of running the physics
+  //on uninitialised values
+  if (idx >= numparticles)
+    return;
+
+  double3 R = data[idx].Rincol;
+  double3 P = data[idx].Pincol;
+  RNDState s = state[idx];
+  int l = 0;
+
+  double sq = sqrt(1.0 + Dot(P, P));
+  bool pdead = false;
+  bool hit = checkHit(R.z, p[POSITION], p[ZSIZE]);
+  double Eng = 0.0;
+
+  if (hit) {
+    Eng = (sq - 1) * p[M_P];
+    energyLoss(&Eng, &pdead, p[DT_M], &s, p);
+  } else {
+    R.x = R.x + p[DT_M] * p[C] * P.x / sq;
+    R.y = R.y + p[DT_M] * p[C] * P.y / sq;
+    R.z = R.z + p[DT_M] * p[C] * P.z / sq;
+    l = -2;
+  }
+
+  if (hit && !pdead) {
+    //rescale the momentum to the magnitude matching the new energy
+    double ptot = sqrt((p[M_P] + Eng) * (p[M_P] + Eng) - (p[M_P] * p[M_P])) / p[M_P];
+    sq = sqrt(Dot(P, P));
+    P.x = P.x * ptot / sq;
+    P.y = P.y * ptot / sq;
+    P.z = P.z * ptot / sq;
+    coulombScat(&R, &P, p[DT_M], &s, p);
+  }
+
+  if (hit && pdead)
+    l = -1;
+
+  data[idx].Rincol = R;
+  data[idx].Pincol = P;
+  data[idx].label = l;
+  state[idx] = s;
+
+}
+
+
+/* count dead particles and particles leaving material - boost compute? 
*/ + +/* sort particles so dead and leaving particles are at the end of PART array - boost compute */ + + diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFT.cl b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl new file mode 100644 index 0000000..1d4763b --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl @@ -0,0 +1,181 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/* 3D normalize FFT kernel */ +__kernel void normalizeFFT(__global double2 *input, int N) { + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + int n1 = get_global_size(0); + int n2 = get_global_size(1); + int n3 = get_global_size(2); + + int id = i1; + if (n2 > 1) + id += i2*n2; + if (n3 > 1) + id += i3*n2*n2; + + input[id].x = input[id].x / N; + input[id].y = input[id].y / N; +} + +/* 3D radix 2 FFT kernel */ +__kernel void FFT3D(__global double2 *input, int step, int dim, int forward) { + + int n1 = get_global_size(0); + int n2 = get_global_size(1); + int n3 = get_global_size(2); + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + + int jump = step << 1; + + int d, idGroup, idLoc, idTwidle, id, match; + if (dim == 0) { + + d = n1 / step; // n1 >> log2(step) + idLoc = i1 / d; + idGroup = i1 & (d-1); //modulo + + idTwidle = idGroup * jump + idLoc; + id = i3*n3*n3 + i2*n2 + idTwidle; + match = id + step; + } else if (dim == 1) { + + d = n2 / step; + idLoc = i2 / d; + idGroup = i2 & (d-1); + + idTwidle = idGroup * jump + idLoc; + id = i3*n3*n3 + idTwidle*n1 + i1; + match = id + step*n1; + } else if (dim == 2) { + + d = n3 / step; + idLoc = i3 / d; + idGroup = i3 & (d-1); + + idTwidle = idGroup * jump + idLoc; + id = idTwidle*n1*n1 + i2*n2 + i1; + match = id + step*n1*n1; + } + + double alpha; + if (forward == 1) + alpha = -( 2 * M_PI / jump ) * idTwidle; + else + alpha = ( 2 * M_PI / jump ) * idTwidle; + + double wr, wi; + wi = sincos(alpha, &wr); + + double2 cTemp; + double2 cTempId = input[id]; + double2 cTempMatch = input[match]; 
+ + cTemp.x = wr*cTempMatch.x - wi*cTempMatch.y; + cTemp.y = wr*cTempMatch.y + wi*cTempMatch.x; + + input[match] = cTempId - cTemp; + input[id] = cTempId + cTemp; + +} + +/* 3D bit reversal sort */ +__kernel void BitReverseSort3D(__global double2 *input, int bits, int dim) { + + int n = get_global_size(0); + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + + int irev, itmp, istart; + if (dim == 0) { + istart = i1; + irev = i1; + itmp = i1; + } else if (dim == 1) { + irev = i2; + itmp = i2; + istart = i2; + } else if (dim == 2) { + irev = i3; + itmp = i3; + istart = i3; + } + + for (int j = 1; j < bits; j++) { + itmp >>= 1; + irev <<= 1; + irev |= itmp & 1; + } + irev &= n - 1; + + int id1, id2; + if (istart < irev) { + double2 tmp; + id1 = i3*n*n + i2*n + i1; + if (dim == 0) { //i1, irev - w, i2 - h, i3 - d + id2 = i3*n*n + i2*n + irev; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } else if (dim == 1) { // i1 - w, i2, irev - h, i3 - d + id2 = i3*n*n + irev*n + i1; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } else if (dim == 2) { // i1 - w, i2 - h, i3, irev - d + id2 = irev*n*n + i2*n + i1; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } + } +} + + +/* 3D FFT kernel based on Stockham's out-of-place algorithm */ +__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim, const int forward) { + + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + int t2 = 2*t; + int k, m, in1, in2, out1, out2; + in1 = gid3*t2*t2 + gid2*t2 + gid1; + if (ndim == 1) { + k = gid1 & (p - 1); + m = (gid1 << 1) - k; + in2 = in1 + t; + out1 = gid3*t2*t2 + gid2*t2 + m; + out2 = out1 + p; + } else if (ndim == 2) { + k = gid2 & (p - 1); + m = (gid2 << 1) - k; + in2 = in1 + t2*t; + out1 = gid3*t2*t2 + m*t2 + gid1; + out2 = out1 + t2*p; + } else if (ndim == 3) { + k = gid3 
& (p - 1); + m = (gid3 << 1) - k; + in2 = in1 + t2*t2*t; + out1 = m*t2*t2 + gid2*t2 + gid1; + out2 = out1 + p*t2*t2; + } + + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = (forward*2*M_PI*k) / (p << 1); + + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl new file mode 100644 index 0000000..b5d9e51 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl @@ -0,0 +1,214 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define TWOPI 6.28318530718 + +__kernel void fft_radix2(__global double2* src, __global double2* dst, const int p, const int t) { + + const int gid = get_global_id(0); + const int k = gid & (p - 1); + const int m = (gid << 1) - k; + + //src += gid; + //dst += (gid << 1) - k; + + //const double2 in1 = src[0]; + //const double2 in2 = src[t]; + + const double2 in1 = src[gid]; + const double2 in2 = src[gid+t]; + + const double theta = (-2*M_PI*k) / (p << 1); + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (in2.x * cs - in2.y * sn, in2.y * cs + in2.x * sn); + + //dst[0] = in1 + temp; + //dst[p] = in1 - temp; + + dst[m] = in1 + temp; + dst[m+p] = in1 - temp; + +} + +__kernel void fft3d_radix2_transpose(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) { + + /* get ids */ + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + /* calc indexes */ + int t2 = 2*t; + + int k = gid1 & (p - 1); + int m = (gid1 << 1) - k; + + int tmp = gid3*t2*t2 + gid2*t2; + + int in1 = tmp + gid1; + int in2 = in1 + t; + + int out1 = tmp + m; + int out2 = out1 + p; + + /* calc FFT */ + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = 
(-2*M_PI*k) / (p << 1); + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} + +__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) { + + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + int t2 = 2*t; + int k, m, in1, in2, out1, out2; + in1 = gid3*t2*t2 + gid2*t2 + gid1; + if (ndim == 1) { + k = gid1 & (p - 1); + m = (gid1 << 1) - k; + in2 = in1 + t; + out1 = gid3*t2*t2 + gid2*t2 + m; + out2 = out1 + p; + } else if (ndim == 2) { + k = gid2 & (p - 1); + m = (gid2 << 1) - k; + in2 = in1 + t2*t; + out1 = gid3*t2*t2 + m*t2 + gid1; + out2 = out1 + t2*p; + } else if (ndim == 3) { + k = gid3 & (p - 1); + m = (gid3 << 1) - k; + in2 = in1 + t2*t2*t; + out1 = m*t2*t2 + gid2*t2 + gid1; + out2 = out1 + p*t2*t2; + } + + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = (-2*M_PI*k) / (p << 1); + + double cs; + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} + + +__kernel void transpose(__global double2 *data, int ndim, int dim) { + + int k = get_global_id(0); + int j = get_global_id(1); + int i = get_global_id(2); + int nk = get_global_size(0); + int nj = get_global_size(1); + int ni = get_global_size(2); + + int n, m; + n = i*ni*ni + j*nj + k; + if (dim == 2) + m = i*ni*ni + k*nj + j; + else + m = k*ni*ni + j*nj + i; + + if (n < m) { + double2 tmp = data[m]; + data[m] = data[n]; + data[n] = tmp; + } +} + +#define PI2 6.28318530718 + +__kernel void fft_batch3D(__global double2 *data_in, __local double2 *d, __local double2 *r, __local double2 *tmp, int N, int dim) { + + int id1 = get_global_id(0); + int id2 = get_global_id(1); + int id3 = get_global_id(2); + + //calc 
indexes + int sid, offset; + if (dim == 1) { + sid = id3*N*N + id2*N; + offset = 1; + } else if (dim == 2) { + sid = id3*N*N + id2; + offset = N; + } else if (dim == 3) { + sid = id3*N + id2; + offset = N*N; + } + + //copy data from global memory to local + int i1 = id1; + int i2 = id1+N/2; + d[i1] = data_in[sid + i1*offset]; + d[i2] = data_in[sid + i2*offset]; + + barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_GLOBAL_MEM_FENCE); + + //exec fft + int p1, p2, j, k, out1, step, jump, t; + double theta, cs, sn; + + t = 1; + step = 1; + while (step < N) { + jump = step << 1; + + j = i1 >> (t - 1); // same as i1 / step, because t-1 = log2(step) + k = i2 & (step - 1); // same as i2 % step + + out1 = j * jump + k; + + theta = -PI2 * k / jump; + sn = sincos(theta, &cs); + + double2 temp = (double2) (d[i2].x*cs - d[i2].y*sn, d[i2].y*cs + d[i2].x * sn); + + + r[out1] = d[i1] + temp; + r[out1+step] = d[i1] - temp; + + t++; + step = jump; + + //swap local arrays + tmp = r; + r = d; + d = tmp; + + //wait for all threads to finish this iteration + barrier(CLK_LOCAL_MEM_FENCE); + } + + tmp = r; + r = d; + d = tmp; + + //copy data from local memory to global + data_in[sid + i1*offset] = r[i1]; + data_in[sid + i2*offset] = r[i2]; + +} + + + + + + + diff --git a/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl new file mode 100644 index 0000000..ffbd0ba --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl @@ -0,0 +1,41 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/* transpose matrix */ +__kernel void transpose(__global double2 *input, __global double2 *output, + int width, int height, __local double2 *block) +{ + + //transfer row in shared memory + unsigned int xIdx = get_global_id(0); + unsigned int yIdx = get_global_id(1); + int block_dim = get_local_size(0); + + if ( (xIdx < width) && (yIdx < height) ) { + unsigned int idx_in = yIdx * width + xIdx; + block[get_local_id(1)*(block_dim+1)+get_local_id(0)] = input[idx_in]; + 
} + + barrier(CLK_LOCAL_MEM_FENCE); + + xIdx = get_group_id(1) * block_dim + get_local_id(0); + yIdx = get_group_id(0) * block_dim + get_local_id(1); + + if ( (xIdx < height) && (yIdx < width) ) { + unsigned int idx_out = yIdx * height + xIdx; + output[idx_out] = block[get_local_id(0)*(block_dim+1)+get_local_id(1)]; + } + +} + +/* naive transpose matrix kernel */ +__kernel void transpose_naive(__global double2 *input, __global double2 *output, int width, int height) +{ + unsigned int xIdx = get_global_id(0); + unsigned int yIdx = get_global_id(1); + + if (xIdx < width && yIdx < height) { + unsigned int idx_in = xIdx + width * yIdx; + unsigned int idx_out = yIdx + height * xIdx; + output[idx_out] = input[idx_in]; + } +} diff --git a/src/Utility/CMakeLists.txt b/src/Utility/CMakeLists.txt new file mode 100644 index 0000000..8a95785 --- /dev/null +++ b/src/Utility/CMakeLists.txt @@ -0,0 +1,18 @@ +SET (_SRCS + TimeStamp.cpp + DKSTimer.cpp + ) + +SET (_HDRS + TimeStamp.h + DKSTimer.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/Utility) diff --git a/src/Utility/DKSTimer.cpp b/src/Utility/DKSTimer.cpp new file mode 100644 index 0000000..5f495d3 --- /dev/null +++ b/src/Utility/DKSTimer.cpp @@ -0,0 +1,53 @@ +#include "DKSTimer.h" + +//set initial values - running to false, timervalue to zero and name to empty string +DKSTimer::DKSTimer() { + running = false; + timervalue = 0.0; + name = ""; +} + +//destructor does nothing +DKSTimer::~DKSTimer() { + +} + +//init the timer by setting name and clearing timervalue, also sets running to false +void DKSTimer::init(std::string n) { + running = false; + timervalue = 0.0; + name = n; +} + +//if timer is not running get the current time and save to timeStart, set the timer as running +void DKSTimer::start() { + if (!running) { + gettimeofday(&timeStart, NULL); + running = true; + } +} + +//if the timer is running get 
the current time to timeEnd, calculate the elapsed time befor start +//and end, add elapsed time to timervalue, set the timer as not running +void DKSTimer::stop() { + if (running) { + gettimeofday(&timeEnd, NULL); + timervalue += ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6; + running = false; + } +} + +void DKSTimer::reset() { + running = false; + timervalue = 0.0; +} + +//return the accumulated value of timervalue +double DKSTimer::gettime() { + return timervalue; +} + +void DKSTimer::print() { + std::cout << "DKSTimer " << name << " elapsed time\t" << timervalue << "s" << std::endl; +} diff --git a/src/Utility/DKSTimer.h b/src/Utility/DKSTimer.h new file mode 100644 index 0000000..80025c0 --- /dev/null +++ b/src/Utility/DKSTimer.h @@ -0,0 +1,59 @@ +#ifndef H_DKSTIMER +#define H_DKSTIMER + +#include +#include +#include + +class DKSTimer { + +private: + + bool running; + double timervalue; + struct timeval timeStart; + struct timeval timeEnd; + std::string name; + +public: + + /** Init DKSTimer by seting timer to zero */ + DKSTimer(); + + ~DKSTimer(); + + /** Init the timer + * Set the name for timer and clear all values + */ + void init(std::string n); + + /** Start the timer. + * Get the curret time with gettimeofday and save in timeStart + */ + void start(); + + /** Stop the timer + * Get the curretn time with gettimeofday and save in timeEnd + * Calculate elapsed time by timeEnd - timeStart and add to timervalue + */ + void stop(); + + /** Reset timervalue to zero. + * Set timervalue, timeStart and timeEnd to zero + */ + void reset(); + + /** Return elapsed time in seconds. + * Return the value of timervalue + */ + double gettime(); + + /** Print timer. 
+ * Print the elapsed time of the timer + */ + void print(); + + +}; + +#endif diff --git a/src/Utility/TimeStamp.cpp b/src/Utility/TimeStamp.cpp new file mode 100644 index 0000000..1e239fc --- /dev/null +++ b/src/Utility/TimeStamp.cpp @@ -0,0 +1,11 @@ +#include "TimeStamp.h" + +timestamp_t get_timestamp() { + struct timeval now; + gettimeofday (&now, NULL); + return now.tv_usec + (timestamp_t)now.tv_sec * 1000000; +} + +double get_secs(timestamp_t t_start, timestamp_t t_end) { + return (t_end - t_start) / 1000000.0L; +} \ No newline at end of file diff --git a/src/Utility/TimeStamp.h b/src/Utility/TimeStamp.h new file mode 100644 index 0000000..d53104d --- /dev/null +++ b/src/Utility/TimeStamp.h @@ -0,0 +1,14 @@ +#ifndef H_TIMESTAMPE +#define H_TIMESTAMPE + +#include +#include +#include + +typedef unsigned long long timestamp_t; + +timestamp_t get_timestamp(); +double get_secs(timestamp_t t_start, timestamp_t t_end); + + +#endif \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..01f33fb --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,84 @@ +INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +#ADD_EXECUTABLE(testDKS testDKS.cpp) +#ADD_EXECUTABLE(testChi testChi.cpp) +#ADD_EXECUTABLE(testFFT testFFT.cpp) +#ADD_EXECUTABLE(testMIC testMIC.cpp) +#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp) +#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) +#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) +#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp) +#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp) +#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp) +#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp) +#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp) +#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp) +#ADD_EXECUTABLE(testOffset testOffset.cpp) +#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp) +#ADD_EXECUTABLE(testMPI testMPI.cpp) +#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp) 
+#ADD_EXECUTABLE(testGather testGather.cpp) +#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp) +#ADD_EXECUTABLE(testTranspose testTranspose.cpp) +ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) +#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) +#ADD_EXECUTABLE(testPush testPush.cpp) +#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) +#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) +#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp) + +#shared library +#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp) + + +#TARGET_LINK_LIBRARIES(testDKS dks) +#TARGET_LINK_LIBRARIES(testChi dks) +#TARGET_LINK_LIBRARIES(testFFT dks) +#TARGET_LINK_LIBRARIES(testMIC dks) +#TARGET_LINK_LIBRARIES(testMICOpenCL dks) +#TARGET_LINK_LIBRARIES(testFFT3D dks) +#TARGET_LINK_LIBRARIES(testFFT3DRC dks) +#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) +#TARGET_LINK_LIBRARIES(testFFT3DTiming dks) +#TARGET_LINK_LIBRARIES(testStockhamFFT dks) +#TARGET_LINK_LIBRARIES(testStockFFT3D dks) +#TARGET_LINK_LIBRARIES(testMemObjects dks) +#TARGET_LINK_LIBRARIES(testRCFFT dks) +#TARGET_LINK_LIBRARIES(testOffset dks) +#TARGET_LINK_LIBRARIES(testOffsetMPI dks) +#TARGET_LINK_LIBRARIES(testMPI dks) +#TARGET_LINK_LIBRARIES(testMPIFFT dks) +#TARGET_LINK_LIBRARIES(testGather dks) +#TARGET_LINK_LIBRARIES(testGatherAsync dks) +#TARGET_LINK_LIBRARIES(testTranspose dks) +TARGET_LINK_LIBRARIES(testCollimatorPhysics dks) +#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks) +#TARGET_LINK_LIBRARIES(testPush dks) +#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks) +#TARGET_LINK_LIBRARIES(testIntegration dks) +#TARGET_LINK_LIBRARIES(testImageReconstruction dks) + + +#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared) + + +#IF (${COMPILER_NAME} STREQUAL "mpicxx") + #ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp) + #ADD_EXECUTABLE(testGreens testGreens.cpp) + #ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp) + #ADD_EXECUTABLE(testCollimatorPhysicsMPI 
testCollimatorPhysicsMPI.cpp) + #TARGET_LINK_LIBRARIES(testGatherAsync2 dks) + #TARGET_LINK_LIBRARIES(testGreens dks) + #TARGET_LINK_LIBRARIES(testFFTSolver dks) + #TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks) +#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx") + +#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp) +#TARGET_LINK_LIBRARIES(testChiSquare dks) + +#IF (NOT CUDA_VERSION VERSION_LESS "7.0") + #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) + #TARGET_LINK_LIBRARIES(testChiSquareRT dks) +#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") \ No newline at end of file diff --git a/test/testChi.cpp b/test/testChi.cpp new file mode 100644 index 0000000..0181144 --- /dev/null +++ b/test/testChi.cpp @@ -0,0 +1,141 @@ +#include +#include +#include + +#include "DKSBase.h" +#include "Utility/TimeStamp.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[4]; + + + if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 2){ + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests" << endl; + + /* inti data */ + int ierr; + int nsize = 4000000; + int jsize = 16; + int psize = 6; + double *data = new double[nsize*jsize]; + double *p = new double[psize*jsize]; + double data_out = 0; + + srand(time(NULL)); + for (int i = 0; i < nsize*jsize; i++) { + //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1; + //data[i] = sign*(double)rand()/RAND_MAX; + data[i] = (double)i / (nsize*jsize); + //data[i] = 1; + } + for (int i = 0; i < psize*jsize; i++) { + //int sign = ((double)rand()/RAND_MAX > 0.5) ? 
1 : -1; + //p[i] = sign*(double)rand()/RAND_MAX; + p[i] = (double)i / (nsize*jsize); + //p[i] = 1; + } + /* end init */ + + timestamp_t tstart, tend; + //timestamp_t t0, t1; + + tstart = get_timestamp(); + + //init dks base class, set API to opencl and init connection with OpenCL device + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + + //ptrs to hold reference to device memory + void *dptr, *ntptr, *pptr; + + //allocate memory on device + //t0 = get_timestamp(); + dptr = base.allocateMemory(nsize*jsize, ierr); + ntptr = base.allocateMemory(nsize*jsize, ierr); + pptr = base.allocateMemory(psize*jsize, ierr); + //t1 = get_timestamp(); + //cout << "Allocate memory: " << get_secs(t0, t1) << endl; + + //write data to device + //t0 = get_timestamp(); + base.writeData(dptr, data, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Write data set: " << get_secs(t0, t1) << endl << endl; + + for (int i = 0; i < 5; i++) { + //write parameters to device + //t0 = get_timestamp(); + base.writeData(pptr, p, psize*jsize); + //t1 = get_timestamp(); + //cout << "Write parameters: " << get_secs(t0, t1) << endl; + + //set function to calcNt and execute it with necessary parameters + //t0 = get_timestamp(); + base.callNt(ntptr, pptr, psize, nsize, jsize, 0.025); + //t1 = get_timestamp(); + + //cout << "Calc N(t): " << get_secs(t0, t1) << endl; + + //set function to chi2 and execute it with necessary parameters + //t0 = get_timestamp(); + base.callChi2(ntptr, dptr, ntptr, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Calc chi^2: " << get_secs(t0, t1) << endl; + + //set function so sum and execute it with necessary parameters + //t0 = get_timestamp(); + base.callSum(ntptr, ntptr, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Calc sum: " << get_secs(t0, t1) << endl; + + //read calculated sum (one value) + //t0 = get_timestamp(); + base.readData(ntptr, &data_out, 1); + //t1 = 
get_timestamp(); + //cout << "Read sum: " << get_secs(t0, t1) << endl; + cout << "Sum nt: " << data_out << endl; + + /* + for (int i = 0; i < psize*jsize; i++) { + int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1; + p[i] = sign*(double)rand()/RAND_MAX; + } + */ + + + //cout << endl; + } + + //free device memory + //t0 = get_timestamp(); + base.freeMemory(dptr, nsize*jsize); + base.freeMemory(ntptr, nsize*jsize); + base.freeMemory(pptr, psize*jsize); + //t1 = get_timestamp(); + //cout << "Free memory: " << get_secs(t0, t1) << endl; + + tend = get_timestamp(); + + cout << endl << "time: " << get_secs(tstart, tend) << endl; + + return 0; +} + diff --git a/test/testChiSquare.cpp b/test/testChiSquare.cpp new file mode 100644 index 0000000..550f3a4 --- /dev/null +++ b/test/testChiSquare.cpp @@ -0,0 +1,168 @@ +#include +#include +#include "DKSBase.h" + +using namespace std; + +void initData(vector< vector > &v, int length) { + + for (unsigned int i = 0; i < v.size(); i++) { + for (int j = 0; j < length; j++) { + v[i].push_back(j); + } + } + +} + + +void printData(vector< vector > &v) { + for (unsigned int i = 0; i < v.size(); i++) { + for (unsigned int j = 0; j < v[i].size(); j++) { + cout << v[i][j] << "\t"; + } + cout << endl; + } +} + +void initData(double *data, int sensors, int length) { + + for (int i = 0; i < sensors; i++) { + for (int j = 0; j < length; j++) { + data[i*length + j] = j; + } + } + +} + + +void printData(double *data, int sensors, int length) { + for (int i = 0; i < sensors; i++) { + for (int j = 0; j < length; j++) { + cout << data[i*length + j] << "\t"; + } + cout << endl; + } +} + +void initPar(double *par, int npar) { + + for (int i = 0; i < npar; i++) + par[i] = (double)i / npar; + +} + +void printDiv(int size) { + for (int i = 0; i < size; i++) + cout << "="; + cout << endl; +} + +void calcChisq(vector< vector > fData, double * par, double fTimeResolution, double fRebin) +{ + + double chisq = 0.0; + double theo, data; + const double 
tau=2.197019; + const double dt0 = fTimeResolution*0.5*(fRebin-1); + double time; + double w = par[0]*0.08516155035269027; + + unsigned int i, j; + + for (i=0; i > fData; + fData.resize(sensors); + initData(fData, length); + printData(fData); + printDiv(75); + + DKSBase dksbase; + if (useCuda) + dksbase.setAPI("Cuda", 4); + else + dksbase.setAPI("OpenCL", 6); + dksbase.setDevice("-gpu", 4); + dksbase.initDevice(); + dksbase.setupFFT(0, NULL); + + + void *mem_data, *mem_par, *mem_chisq; + cout << "Allocate memory" << endl; + mem_par = dksbase.allocateMemory(npar, ierr); + mem_data = dksbase.allocateMemory(fData.size() * fData[0].size(), ierr); + mem_chisq = dksbase.allocateMemory(fData.size() * fData[0].size(), ierr); + + + cout << "Write data" << endl; + dksbase.writeData(mem_par, par, npar); + for (int i = 0; i < sensors; i++) + dksbase.writeData(mem_data, &fData[i][0], length, i*length); + + + + cout << "Call PHistoTFFcn" << endl; + dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, npar, result); + cout << "Result: " << result << endl; + + + double *out_data = new double[ndata]; + dksbase.readData(mem_chisq, out_data, ndata); + printDiv(75); + printData(out_data, sensors, length); + printDiv(75); + + calcChisq(fData, par, fTimeResolution, fRebin); + printDiv(75); + + cout << "Free memory" << endl; + dksbase.freeMemory(mem_par, npar); + dksbase.freeMemory(mem_data, ndata); + dksbase.freeMemory(mem_chisq, ndata); + + + return 0; + +} diff --git a/test/testChiSquareRT.cpp b/test/testChiSquareRT.cpp new file mode 100644 index 0000000..fcd0b50 --- /dev/null +++ b/test/testChiSquareRT.cpp @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +void initData(double *data, int N, bool ones = false) { + for (int i = 0; i < N; i++) { + if (ones) + data[i] = 1.0; + else + data[i] = (double)rand() / RAND_MAX; + } +} + +template +void printData(T *data, 
int N) { + for (int i = 0; i < N; i++) + std::cout << data[i] << "\t"; + std::cout << std::endl; +} + + +const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])"; +//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])"; +//const std::string funct = "p[m[0]] * se(t, p[m[1]])"; +//const std::string funct = "p[m[1]] + p[m[0]]"; + +double fTheory(double time, double *par, double *func, int *map) { + return cos(time*par[0]) - exp(-time*par[map[0]]); +} + +double testFunctionSerial(double *data, double *par, double *func, int *map, + double N0, double tau, double bkg, double timeStep, + int startTimeBin, int endTimeBin) +{ + double time, diff, theo; + double chisq = 0; + for (int i = startTimeBin; i < endTimeBin; ++i) { + time = i * timeStep; + theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; + diff = data[i] - theo; + + chisq += diff * diff / data[i]; + } + + return chisq; +} + +double testFunctionParallel(double *data, double *par, double *func, int *map, + double N0, double tau, double bkg, double timeStep, + int startTimeBin, int endTimeBin) +{ + int i, chunk; + double time, diff, theo; + double chisq = 0; + + chunk = (endTimeBin - startTimeBin) / omp_get_num_procs(); + if (chunk < 10) + chunk = 10; +#pragma omp parallel for default(shared) private (i,time,diff) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq) + for (i = startTimeBin; i < endTimeBin; ++i) { + time = i * timeStep; + theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; + diff = data[i] - theo; + + chisq += diff * diff / data[i]; + } + + return chisq; +} + +int main(int argc, char *argv[]) { + + int Loop = 100; + + //init test data on the host + int Ndata = 8; + if (argc > 1) + Ndata = atoi(argv[1]); + + int api = 1; + if (argc > 2) + api = atoi(argv[2]); + + int Npar = 66; + int Nfunc = 1; + int Nmap = 4; + + double *data = new double[Ndata]; + double *par = new double[Npar]; + double *func = new 
double[Nfunc]; + int *map = new int[Nmap]; + + initData(data, Ndata); + initData(par, Npar); + initData(func, Nfunc); + map[0] = 1; + map[1] = 2; + map[2] = 3; + map[3] = 4; + + //create timers + DKSTimer serialTimer; + DKSTimer cudaTimer; + DKSTimer ompTimer; + DKSTimer gpuOverhead; + serialTimer.init("Serial timer"); + cudaTimer.init("Cuda timer"); + ompTimer.init("OpenMP timer"); + gpuOverhead.init("Overhead for gpu"); + + + //serial version + double resultSerial; + + serialTimer.start(); + for (int i = 0; i < Loop; i++) + resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); + serialTimer.stop(); + + //openmp version + double resultOMP = 0.0; + + ompTimer.start(); + //for (int i = 0; i < Loop; i++) + // resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); + ompTimer.stop(); + + + //create and init dkabase + gpuOverhead.start(); + + DKSBaseMuSR dksbase; + if (api == 1) + dksbase.setAPI("Cuda"); + else + dksbase.setAPI("OpenCL"); + + dksbase.setDevice("-gpu"); + dksbase.initDevice(); + dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap); + + //allocate memory on the device + int ierr; + void *data_ptr; + + data_ptr = dksbase.allocateMemory(Ndata, ierr); + + dksbase.writeData(data_ptr, data, Ndata); + dksbase.writeFunctions(func, Nfunc); + dksbase.writeMaps(map, Nmap); + + dksbase.callCompileProgram(funct); + gpuOverhead.stop(); + + double resultCuda; + + cudaTimer.start(); + for (int i = 0; i < Loop; i++) { + dksbase.writeParams(par, Npar); + int ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap, + 0.0, 0.1, 0, resultCuda); + + if (ierr != 0) + exit (EXIT_FAILURE); + + } + cudaTimer.stop(); + + std::cout << std::endl; + std::cout << "=======================Results=======================" << std::endl; + std::cout << "Result serial = " << resultSerial << std::endl; + std::cout << "Result prallel = " << resultOMP << std::endl; + std::cout << "Result cuda = " << resultCuda << 
std::endl; + + std::cout << std::endl; + std::cout << "=======================Timings=======================" << std::endl; + serialTimer.print(); + ompTimer.print(); + cudaTimer.print(); + gpuOverhead.print(); + std::cout << std::endl; + + dksbase.freeMemory(data_ptr, Ndata); + + return 0; + + +} diff --git a/test/testCollimatorPhysics.cpp b/test/testCollimatorPhysics.cpp new file mode 100644 index 0000000..bb5d9b5 --- /dev/null +++ b/test/testCollimatorPhysics.cpp @@ -0,0 +1,248 @@ +#include + +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + + +using namespace std; + +typedef struct { + int label; + unsigned localID; + double Rincol[3]; + double Pincol[3]; +} PART_SMALL; + +typedef struct { + double x; + double y; + double z; +} Vector; + +PART_SMALL initPartSmall(int d) { + + PART_SMALL p; + p.label = 0; + p.localID = d; + + p.Rincol[0] = 0.0; + p.Rincol[1] = 0.0; + p.Rincol[2] = 0.02; + + p.Pincol[0] = 0.0; + p.Pincol[1] = 0.0; + p.Pincol[2] = 3.9920183237269791e-01; + + return p; +} + +Vector initVector() { + Vector tmp; + tmp.x = 0.5; + tmp.y = 0.5; + tmp.z = 0.5; + + return tmp; +} + +void printPart(PART_SMALL p) { + cout << "label: " << p.label << ", "; + cout << "localid: " << p.localID << ","; + cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "; + cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]; + cout << endl; +} + +void printVector(Vector v) { + cout << v.x << "\t" << v.y << "\t" << v.z << endl; + } + +void initParts(PART_SMALL *p, int N) { + for (int i = 0; i < N; i++) + p[i] = initPartSmall(i); +} + +void printParts(PART_SMALL *p, int N) { + for (int i = 0; i < N; i++) + printPart(p[i]); + cout << endl; +} + +void initVectors(Vector *v, int N) { + for (int i = 0; i < N; i++) + v[i] = initVector(); +} + +void printVectors(Vector *v, int N) { + for (int i = 0; i < N; i++) + printVector(v[i]); + cout << endl; +} + + +void initParams(double *data) { + 
data[0] = 0.0;//2.0000000000000000e-02; + data[1] = 1.0;//1.0000000000000000e-02; + data[2] = 2.2100000000000000e+00; + data[3] = 6.0000000000000000e+00; + data[4] = 1.2010700000000000e+01; + data[5] = 2.6010000000000000e+00; + data[6] = 1.7010000000000000e+03; + data[7] = 1.2790000000000000e+03; + data[8] = 1.6379999999999999e-02; + data[9] = 1.9321266968325795e-01; + data[10] = 7.9000000000000000e+01; + data[11] = 1.0000000000000002e-12; + +} + +void printDouble(double *data, int N) { + for (int i = 0; i < N; i++) + std::cout << data[i] << "\t"; + std::cout << std::endl; +} + +int main(int argc, char *argv[]) { + + int loop = 10; + int numpart = 1e5; + char *api_name = new char[10]; + char *device_name = new char[10]; + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-npart")) { + numpart = atoi(argv[i+1]); + i++; + } + + if (argv[i] == string("-loop")) { + loop = atoi(argv[i+1]); + i++; + } + + } + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of particles: " << numpart << endl; + cout << "Number of loops: " << loop << endl; + cout << "------------------------------------------------------------" << endl; + + //init part vector to test mc + PART_SMALL *parts = new PART_SMALL[numpart]; + initParts(parts, numpart); + + double *params = new double[12]; + initParams(params); + + //init dks + int ierr; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + //init random + base.callInitRandoms(numpart); + + //**test collimator physics and sort***// + void *part_ptr, *param_ptr; + + //allocate memory for particles + part_ptr = base.allocateMemory(numpart, ierr); + param_ptr = base.allocateMemory(12, 
ierr); + + //transfer data to device + base.writeData(part_ptr, parts, numpart); + base.writeData(param_ptr, params, 12); + + int numaddback; + //test calls to do some first executions + base.callCollimatorPhysics2(part_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback); + base.syncDevice(); + //std::cout << "particles to add back: " << numaddback << std::endl; + + struct timeval timeStart, timeEnd; + std::cout << "Start MC" << std::endl; + + gettimeofday(&timeStart, NULL); + for (int i = 0; i < loop; i++) { + base.callCollimatorPhysics2(part_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback); + base.syncDevice(); + } + gettimeofday(&timeEnd, NULL); + + std::cout << "addback: " << numaddback << std::endl; + + std::cout << "End MC" << std::endl; + double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)); + + std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl; + std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl; + + //read data from device + base.readData(part_ptr, parts, numpart); + + //free memory + base.freeMemory(part_ptr, numpart); + base.freeMemory(param_ptr, 12); + + + std::cout << std::fixed << std::setprecision(4); + for (int i = 0; i < 10; i++) { + std::cout << parts[i].label << "\t" + << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" + << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t" + << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t" + << std::endl; + } + + std:: cout << "..." 
<< std::endl; + + for (int i = numpart - 10; i < numpart; i++) { + std::cout << parts[i].label << "\t" + << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" + << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t" + << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t" + << std::endl; + } + + double arx = 0, ary = 0, arz = 0; + double apx = 0, apy = 0, apz = 0; + for (int i = 0; i < numpart; i++) { + + arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart; + ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart; + arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart; + + apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart; + apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart; + apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart; + + } + + std::cout << std::fixed << std::setprecision(10); + std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl + << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl; + + + cout << "==========================END TEST==========================" << endl; + return 0; + +} diff --git a/test/testCollimatorPhysicsMPI.cpp b/test/testCollimatorPhysicsMPI.cpp new file mode 100644 index 0000000..22e8a84 --- /dev/null +++ b/test/testCollimatorPhysicsMPI.cpp @@ -0,0 +1,126 @@ +#include + +#include + +#include "DKSBase.h" +#include "cuda_runtime.h" + +#include + +using namespace std; + +typedef struct { + int label; + unsigned localID; + double Rincol[3]; + double Pincol[3]; + long IDincol; + int Binincol; + double DTincol; + double Qincol; + long LastSecincol; + double Bfincol[3]; + double Efincol[3]; +} PART; + +PART initPart(int d) { + + PART p; + p.label = d; + p.localID = d; + for (int i = 0; i < 3; i++) { + p.Rincol[i] = 0.5;// / (d+1); + p.Pincol[i] = 0.5;// / (d+1); + p.Bfincol[i] = 1.0 / (d+1); + p.Efincol[i] = 1.0 / (d+1); + } + p.IDincol = d; + p.Binincol = d; + p.DTincol = d; + p.Qincol = d; + p.LastSecincol = d; + + 
return p; + +} + +void printPart(PART p) { + + cout << "label: " << p.label << ", "; + //cout << "localID: " << p.localID << ", "; + cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "; + cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "; + //cout << "IDincol: " << p.IDincol << ", Binincol: " << p.Binincol << ", "; + //cout << "DTincol: " << p.DTincol << ", Qincol: " << p.Qincol << ", LastSecincol: " << p.LastSecincol << ", "; + //cout << "Bfincol: " << p.Bfincol[0] << ", " << p.Bfincol[1] << ", " << p.Bfincol[2] << ", "; + //cout << "Efincol: " << p.Efincol[0] << ", " << p.Efincol[1] << ", " << p.Efincol[2] << endl; + cout << endl; + + +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + int numpart = 500501; + + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.callInitRandoms(numpart); + + PART tmp; + vector p; + vector p_out; + p_out.resize(numpart); + + for (int i = 0; i < numpart; i++) { + tmp = initPart(i + 1); + p.push_back(tmp); + } + + if (numpart <= 20) { + for (int i = 0; i < 10; i++) + printPart(p[i]); + cout << endl; + } + + double params[19]; + for (int i = 0; i < 19; i++) + params[i] = 0.05; + params[0] = 0; + params[1] = 1; + + void *mem_ptr, *par_ptr; + + par_ptr = base.allocateMemory(19, ierr); + base.writeData(par_ptr, params, 19); + + mem_ptr = base.allocateMemory(numpart, ierr); + base.writeData(mem_ptr, &p[0], numpart); + + int addback, dead; + for (int i = 0; i < 100; i++) + base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead); + cout << "Add back: " << addback << ", dead: " << dead << endl; + + base.readData(mem_ptr, &p_out[0], numpart); + base.freeMemory(mem_ptr, ierr); + base.freeMemory(par_ptr, ierr); + + if (numpart <= 20) { + for (int i = 0; i < numpart; i++) + 
printPart(p_out[i]); + } + + MPI_Finalize(); + return 0; + +} diff --git a/test/testCollimatorPhysicsSoA.cpp b/test/testCollimatorPhysicsSoA.cpp new file mode 100644 index 0000000..bc4bf0b --- /dev/null +++ b/test/testCollimatorPhysicsSoA.cpp @@ -0,0 +1,250 @@ +#include +#include + +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" +#include + +using namespace std; + +typedef struct { + int *label; + unsigned *localID; + double *rx; + double *ry; + double *rz; + double *px; + double *py; + double *pz; +} PART; + + +void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz, + double *px, double *py, double *pz, int npart) { + + for (int i = 0; i < npart; i++) { + label[i] = 0; + localID[i] = i; + rx[i] = 0.0; + ry[i] = 0.0; + rz[i] = 0.02; + px[i] = 0.0; + py[i] = 0.0; + pz[i] = 3.9920183237269791e-01; + } +} + +void initParams(double *data) { + data[0] = 0.0;//2.0000000000000000e-02; + data[1] = 1.0;//1.0000000000000000e-02; + data[2] = 2.2100000000000000e+00; + data[3] = 6.0000000000000000e+00; + data[4] = 1.2010700000000000e+01; + data[5] = 2.6010000000000000e+00; + data[6] = 1.7010000000000000e+03; + data[7] = 1.2790000000000000e+03; + data[8] = 1.6379999999999999e-02; + data[9] = 1.9321266968325795e-01; + data[10] = 7.9000000000000000e+01; + data[11] = 1.0000000000000002e-12; + +} + +int main(int argc, char *argv[]) { + + int loop = 10; + int numpart = 1e5; + char *api_name = new char[10]; + char *device_name = new char[10]; + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-npart")) { + numpart = atoi(argv[i+1]); + i++; + } + + if (argv[i] == string("-loop")) { + loop = atoi(argv[i+1]); + i++; + } + + } + + int threads = 0; + /* +#pragma offload target(mic:0) out(threads) + { + #pragma omp parallel + { + threads = 
omp_get_num_threads(); + } + } + */ + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of particles: " << numpart << endl; + cout << "Number of loops: " << loop << endl; + cout << "Number of threads: " << threads << endl; + cout << "------------------------------------------------------------" << endl; + + //init part vector to test mc + //int *label; + //unsigned *localID; + //double *rx, *ry, *rz, *px, *py, *pz; + PART p; + p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64); + p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64); + p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64); + initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart); + + double *params = new double[12]; + initParams(params); + + //init dks + int ierr; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + //init random + base.callInitRandoms(numpart); + + //**test collimator physics and sort***// + void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr; + + //allocate memory for particles + label_ptr = base.allocateMemory(numpart, ierr); + localID_ptr = base.allocateMemory(numpart, ierr); + rx_ptr = base.allocateMemory(numpart, ierr); + ry_ptr = base.allocateMemory(numpart, ierr); + rz_ptr = base.allocateMemory(numpart, ierr); + px_ptr = base.allocateMemory(numpart, ierr); + py_ptr = base.allocateMemory(numpart, ierr); + pz_ptr = base.allocateMemory(numpart, ierr); + + param_ptr = base.allocateMemory(12, ierr); + + //transfer data to 
device + base.writeData(label_ptr, p.label, numpart); + base.writeData(localID_ptr, p.localID, numpart); + base.writeData(rx_ptr, p.rx, numpart); + base.writeData(ry_ptr, p.ry, numpart); + base.writeData(rz_ptr, p.rz, numpart); + base.writeData(px_ptr, p.px, numpart); + base.writeData(py_ptr, p.py, numpart); + base.writeData(pz_ptr, p.pz, numpart); + + //transfer params to device + base.writeData(param_ptr, params, 12); + + std::cout << "test runs" << std::endl; + + int numaddback; + //test calls to do some first executions + base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart, numaddback); + base.syncDevice(); + + struct timeval timeStart, timeEnd; + std::cout << "Start MC" << std::endl; + + gettimeofday(&timeStart, NULL); + for (int i = 0; i < loop; i++) { + base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart, numaddback); + base.syncDevice(); + } + gettimeofday(&timeEnd, NULL); + + std::cout << "addback: " << numaddback << std::endl; + + std::cout << "End MC" << std::endl; + double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)); + + std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl; + std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl; + + //read data from device + base.readData(label_ptr, p.label, numpart); + base.readData(localID_ptr, p.localID, numpart); + base.readData(rx_ptr, p.rx, numpart); + base.readData(ry_ptr, p.ry, numpart); + base.readData(rz_ptr, p.rz, numpart); + base.readData(px_ptr, p.px, numpart); + base.readData(py_ptr, p.py, numpart); + 
base.readData(pz_ptr, p.pz, numpart); + + //free memory + base.freeMemory(label_ptr, numpart); + base.freeMemory(localID_ptr, numpart); + base.freeMemory(rx_ptr, numpart); + base.freeMemory(ry_ptr, numpart); + base.freeMemory(rz_ptr, numpart); + base.freeMemory(px_ptr, numpart); + base.freeMemory(py_ptr, numpart); + base.freeMemory(pz_ptr, numpart); + + base.freeMemory(param_ptr, 12); + + /* + std::cout << std::fixed << std::setprecision(4); + for (int i = 0; i < 10; i++) { + std::cout << p.label[i] << "\t" << p.rx[i] + << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] + << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; + } + std:: cout << "..." << std::endl; + + for (int i = numpart - 10; i < numpart; i++) { + std::cout << p.label[i] << "\t" << p.rx[i] + << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] + << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; + } + + double arx = 0, ary = 0, arz = 0; + double apx = 0, apy = 0, apz = 0; + for (int i = 0; i < numpart; i++) { + + arx += sqrt(p.rx[i] * p.rx[i]) / numpart; + ary += sqrt(p.ry[i] * p.ry[i]) / numpart; + arz += sqrt(p.rz[i] * p.rz[i]) / numpart; + + apx += sqrt(p.px[i] * p.px[i]) / numpart; + apy += sqrt(p.py[i] * p.py[i]) / numpart; + apz += sqrt(p.pz[i] * p.pz[i]) / numpart; + + } + + std::cout << std::fixed << std::setprecision(10); + std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl + << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl; + */ + cout << "==========================END TEST==========================" << endl; + return 0; + +} diff --git a/test/testDKS.cpp b/test/testDKS.cpp new file mode 100644 index 0000000..4b66732 --- /dev/null +++ b/test/testDKS.cpp @@ -0,0 +1,15 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + DKSBase base = DKSBase(); + base.getDevices(); + + return 0; +} + diff --git a/test/testFFT.cpp b/test/testFFT.cpp new file mode 100644 index 0000000..c3fec1b 
--- /dev/null +++ b/test/testFFT.cpp @@ -0,0 +1,83 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << "\t" << device_name << endl; + + cout << "Begin DKS Base tests" << endl; + + int N = 2; + int dimsize[3] = {N, N, N}; + + complex *cdata = new complex[N]; + complex *cfft = new complex[N]; + for (int i = 0; i < N; i++) { + cdata[i] = complex(0, 0); + cfft[i] = complex(0, 0); + } + + cdata[0] = complex(1.73205, 1.73205); + + timestamp_t t0, t1; + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + void *mem_ptr; + int ierr; + + /* write data to device */ + mem_ptr = base.pushData< complex >( (const void*)cdata, N, ierr); + + /* execute fft */ + base.callFFT(mem_ptr, 1, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 1, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 1, dimsize); + + /* read data from device */ + base.pullData< complex >(mem_ptr, cfft, N); + + /* print results */ + + cout << "Data" << endl; + for (int i = 0; i < N; i++) + cout << cdata[i] << "\t"; + cout << endl; + + cout << "FFT" << endl; + for (int i = 0; i < N; i++) + cout << cfft[i] << "\t"; + cout << endl; + + + return 0; +} + diff --git a/test/testFFT3D.cpp b/test/testFFT3D.cpp new file mode 100644 index 0000000..ff14242 --- /dev/null +++ b/test/testFFT3D.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using 
namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void printData3DN4(complex* &data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); + +/* usage - ./testFFT3D */ +int main(int argc, char *argv[]) { + + int N = 16; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + N = atoi(argv[1]); + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, "-gpu"); + } else if (argc == 4) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, argv[3]); + } else { + N = 16; + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << ", " << device_name << endl; + + int dimsize[3] = {N, N, N}; + + cout << "Begin DKS Base tests, N = " << N << endl; + + int dim = 3; + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)k / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + base.setupFFT(3, dimsize); + + void *mem_ptr; + int ierr; + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex 
>(mem_ptr, cifft, N*N*N); + + /* free device memory */ + base.freeMemory< complex >(mem_ptr, N*N*N); + + /* compare results */ + compareData(cdata, cifft, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) { + cout << data[i*ni*ni + j*nj + k].real() << " "; + cout << data[i*ni*ni + j*nj + k].imag() << "\t"; + } else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFT3DRC.cpp b/test/testFFT3DRC.cpp new file mode 100644 index 0000000..b0a0625 --- /dev/null +++ b/test/testFFT3DRC.cpp @@ -0,0 +1,199 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); +void initData(double *data, int dimsize[3]); +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop); +void printHelp(); + +int main(int argc, char *argv[]) { + + int N1 = 8; + int N2 = 8; + int N3 = 8; + int dim = 3; + int loop = 10; + + if ( readParams(argc, argv, N1, N2, N3, loop) ) + return 0; + + int dimsize[3] = {N3, N2, N1}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; + + double *rdata = new double[sizereal]; + double *outdata = new double[sizereal]; + complex *cfft = new complex[sizecomp]; + + for (int i=0; iREAL) */ + base.setupFFTCR(dim, dimsize,1./(N1*N2*N3)); +#endif + +#ifdef DKS_CUDA + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.setupFFT(dim, dimsize); +#endif + + // allocate memory on device + int ierr; + void *real_ptr, *comp_ptr, *real_res_ptr; + real_ptr = base.allocateMemory(sizereal, ierr); + real_res_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< std::complex >(sizecomp, ierr); + + // execute one run before starting the timers + base.writeData(real_ptr, rdata, sizereal); + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); + 
base.readData(real_res_ptr, outdata, sizereal); + + //timer for total loop time, FFT and IFFT calls + struct timeval timeStart, timeEnd; + struct timeval timeFFTStart[loop], timeFFTEnd[loop]; + struct timeval timeIFFTStart[loop], timeIFFTEnd[loop]; + + gettimeofday(&timeStart, NULL); + for (int i=0; i(real_ptr, rdata, sizereal); + + // execute rcfft + gettimeofday(&timeFFTStart[i], NULL); + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + gettimeofday(&timeFFTEnd[i], NULL); + + // execute crfft + gettimeofday(&timeIFFTStart[i], NULL); + base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); + gettimeofday(&timeIFFTEnd[i], NULL); + + //normalize +#ifdef DKS_CUDA + base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize); +#endif + + // read IFFT data from device + base.readData(real_res_ptr, outdata, sizereal); + + } + gettimeofday(&timeEnd, NULL); + + // free device memory + base.freeMemory< std::complex >(comp_ptr, sizecomp); + base.freeMemory(real_ptr, sizereal); + base.freeMemory(real_res_ptr, sizereal); + + // compare in and out data to see if we get back the same results + compareData(rdata, outdata, N1, N2, N3, dim); + + //calculate seconds for total time and fft times + double tfft = 0; + double tifft = 0; + double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + + (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6; + + for (int i = 0; i < loop; i++) { + tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + + (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6; + + tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + + (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6; + } + + //print timing results + std::cout << std::fixed << std::setprecision(5) << "\nTiming results" + << "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s" + << "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s" + << "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s" + << "\n\n"; + + return 0; +} + 
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) { + int id; + double sum = 0; + for (int i = 0; i < NI; i++) { + for (int j = 0; j < NJ; j++) { + for (int k = 0; k < NK; k++) { + id = k*NI*NJ + j*NI + i; + sum += fabs(data1[id] - data2[id]); + } + } + } + std::cout << "RC <--> CR diff: " << sum << std::endl; +} + +void initData(double *data, int dimsize[3]) { + for (int i = 0; i < dimsize[2]; i++) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; + } + } + } +} + +void printHelp() { + std::cout << std::endl; + + std::cout << "testFFT3DRC executes 3D real complex and 3D complex real" + << "function on the Intel MIC.\n"; + std::cout << "Operations performed by testRC are: " + << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n"; + std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z " + << "-loop $l\n"; + std::cout << "where $x $y $z are number of elements in each dimension and " + << "$l is the number of times all the operations will be performed.\n"; + + std::cout << std::endl; +} + +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) { + + for (int i = 1; i < argc; i++) { + + if ( argv[i] == std::string("-grid") ) { + N1 = atoi(argv[i + 1]); + N2 = atoi(argv[i + 2]); + N3 = atoi(argv[i + 3]); + i += 3; + } + + if ( argv[i] == std::string("-loop") ) { + loop = atoi(argv[i + 1]); + i += 1; + } + + if ( argv[i] == std::string("-h") || argv[i] == std::string("-help") ) { + printHelp(); + return true; + } + } + + return false; +} diff --git a/test/testFFT3DRC_MIC.cpp b/test/testFFT3DRC_MIC.cpp new file mode 100644 index 0000000..9eafe04 --- /dev/null +++ b/test/testFFT3DRC_MIC.cpp @@ -0,0 +1,220 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void 
printData3DN4(complex* &data, int N, int dim); +void printData3DN4(double* data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); +void compareData(double* data1, double* data2, int N, int dim); + +/* Compute (K*L)%M accurately */ +static double moda(int K, int L, int M) +{ + return (double)(((long long)K * L) % M); +} +/* Initialize array x(N) to produce unit peaks at x(H) and x(N-H) */ +static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4) +{ + double TWOPI = 6.2831853071795864769, phase, factor; + int n1, n2, n3, S1, S2, S3, index; + + /* Generalized strides for row-major addressing of x */ + S3 = 1; + S2 = (N3/2+1)*2; + S1 = N2*(N3/2+1)*2; + + factor = ((N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0) ? 1.0 : 2.0; + for (n1 = 0; n1 < N1; n1++) + { + for (n2 = 0; n2 < N2; n2++) + { + for (n3 = 0; n3 < N3; n3++) + { + phase = moda(n1,H1,N1) / N1; + phase += moda(n2,H2,N2) / N2; + phase += moda(n3,H3,N3) / N3; + index = n1*S1 + n2*S2 + n3*S3; + //cout << "index = " << index << endl; + x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3); + } + } + } +} + + +int main(int argc, char *argv[]) { + + int N = atoi(argv[1]); + int dim = 3; + int dimsize[3] = {N, N, N}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2]; + + //double *rdata = new double[sizereal]; + //double *outdata = new double[sizereal]; + //complex *cfft = new complex[sizecomp]; + double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double)); + double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double)); + complex *cfft = (complex *)malloc(sizecomp*sizeof(complex)); + + init_r(rdata, N,N,N); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI("OpenMP", 6); + base.setDevice("-mic", 4); + base.initDevice(); + + /* setup forward fft (REAL->COMPLEX) */ + base.setupFFTRC(dim, dimsize); + + int ierr; + void 
*real_ptr, *comp_ptr; + + /* allocate memory on device */; + real_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + /* write data to device */ + base.writeData(real_ptr, rdata, sizereal); + + //printData3DN4(rdata,N,3); + + /* execute rcfft */ + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + + /* read FFT data from device */ + base.readData< complex >(comp_ptr, cfft, sizecomp); + base.writeData(comp_ptr, cfft, sizereal); + + + /* setup backward fft (COMPLEX->REAL) */ + base.setupFFTCR(dim, dimsize,1./(N*N*N)); + /* execute crfft */ + base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize); + + /* normalize */ + //base.callNormalizeC2RFFT(real_ptr, dim, dimsize); + + /* read FFT data from device */ + //base.readData< complex >(comp_ptr, cfft, sizecomp); + + /* read IFFT data from device */ + base.readData(real_ptr, outdata, sizereal); + + /* free device memory */ + base.freeMemory< complex >(comp_ptr, sizecomp); + base.freeMemory(real_ptr, sizereal); + + /* compare data */ + compareData(rdata, outdata, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) + cout << data[i*ni*ni + j*nj + k].real() << "\t"; + else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} +void printData3DN4(double* data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k]; + //double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + //if (a < 10e-5 && a > -10e-5) + // a = 0; + + cout << d << "\t"; + } + } + cout << endl; + } + cout << endl; + +} +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + +void compareData(double* data1, double* data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + //sum += fabs(data1[id] - data2[id]/(N*N*N)); + sum += fabs(data1[id] - data2[id]); + } + } + } + cout << "Size " << N << " RC <--> CR diff: " << sum << endl; +} diff --git a/test/testFFT3DSO.cpp b/test/testFFT3DSO.cpp new file mode 100644 index 0000000..ff14242 --- /dev/null +++ b/test/testFFT3DSO.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void printData3DN4(complex* &data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); + +/* usage - ./testFFT3D */ +int main(int argc, char *argv[]) { + + int N = 16; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + N = atoi(argv[1]); + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, "-gpu"); + } else if (argc == 4) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, argv[3]); + } else { + N = 16; + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << ", " << device_name << endl; + + int dimsize[3] = {N, N, N}; + + cout << "Begin DKS Base tests, N = " << N << endl; + + int dim = 3; + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)k / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI(api_name, 
strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + base.setupFFT(3, dimsize); + + void *mem_ptr; + int ierr; + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex >(mem_ptr, cifft, N*N*N); + + /* free device memory */ + base.freeMemory< complex >(mem_ptr, N*N*N); + + /* compare results */ + compareData(cdata, cifft, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) { + cout << data[i*ni*ni + j*nj + k].real() << " "; + cout << data[i*ni*ni + j*nj + k].imag() << "\t"; + } else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFT3DTiming.cpp b/test/testFFT3DTiming.cpp new file mode 100644 index 0000000..27ef7cf --- /dev/null +++ b/test/testFFT3DTiming.cpp @@ -0,0 +1,130 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void compareData(complex* &data1, complex* &data2, int N, int dim); + + +int main(int argc, char *argv[]) { + + int N = 4; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc > 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + N = atoi(argv[3]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + int dimsize[3] = {N, N, N}; + + + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests, N = " << N << endl; + + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)i / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + timestamp_t t0, t1; + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + void *mem_ptr; + int ierr; + + /* run stest funct to init device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + 
base.callFFT(mem_ptr, 3, dimsize); + base.callIFFT(mem_ptr, 3, dimsize); + base.callNormalizeFFT(mem_ptr, 3, dimsize); + base.readData< complex >(mem_ptr, cifft, N*N*N); + base.freeMemory< complex >(mem_ptr, N*N*N); + /* end test */ + + int steps = 10; + base.oclClearEvents(); + t0 = get_timestamp(); + for (int i = 0; i < steps; i++) { + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex >(mem_ptr, cifft, N*N*N); + + /* free device memory */ + base.freeMemory< complex >(mem_ptr, N); + + //compareData(cdata, cifft, N, 3); + } + t1 = get_timestamp(); + + cout << "=========================" << endl; + //base.oclEventInfo(); + cout << "Average total: " << get_secs(t0, t1) / steps << endl; + cout << "=========================" << endl; + + + + + return 0; +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFTAsync.cpp b/test/testFFTAsync.cpp new file mode 100644 index 0000000..89550a9 --- /dev/null +++ b/test/testFFTAsync.cpp @@ -0,0 +1,117 @@ +#include +#include +#include + +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + + + +using namespace std; + +void initData(double *data, int dimsize[3]) { + for (int i = 0; i < dimsize[2]; i++) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; + } + } + } +} + +int main(int argc, char *argv[]) { + + int N = 8; + if (argc == 2) + N = atoi(argv[1]); + + int N1 = N; + int N2 = N; + int N3 = N; + int dim = 3; + + int dimsize[3] = {N3, N2, N1}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1); + + double *data1 = new double[sizereal]; + double *data2 = new double[sizereal]; + + initData(data1, dimsize); + initData(data2, dimsize); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.setupFFT(3, dimsize); + + /* pagelock data */ + base.allocateHostMemory(data1, sizereal); + base.allocateHostMemory(data2, sizereal); + + /* create streams */ + int fft1, fft2; + base.createStream(fft1); + base.createStream(fft2); + + int ierr; + void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2; + + cout << "allocating memory ..." 
<< endl; + /* allocate memory on device */; + real_ptr1 = base.allocateMemory(sizereal, ierr); + real_ptr2 = base.allocateMemory(sizereal, ierr); + comp_ptr1 = base.allocateMemory< complex >(sizecomp*2, ierr); + comp_ptr2 = base.allocateMemory< complex >(sizecomp*2, ierr); + + cufftHandle defaultPlan; + cudaStream_t cfft1, cfft2; + cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z); + cudaStreamCreate(&cfft1); + cudaStreamCreate(&cfft2); + + + for (int i = 0; i < 5; i++) { + + cufftHandle plan = defaultPlan; + + cout << "Iteration: " << i << endl; + /* write data to device */ + base.writeDataAsync(real_ptr1, data1, sizereal, fft1); + //cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1); + + /* execute rcfft */ + base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1); + //cufftSetStream(plan, cfft1); + //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2); + + /* write data to device */ + base.writeDataAsync(real_ptr2, data2, sizereal, fft2); + //cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2); + + /* execute rcfft */ + base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2); + //cufftSetStream(plan, cfft2); + //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2); + + } + + base.freeMemory(real_ptr1, sizereal); + base.freeMemory(real_ptr2, sizereal); + base.freeMemory< complex >(comp_ptr1, sizereal); + base.freeMemory< complex >(comp_ptr2, sizereal); + + /* free pagelock data */ + base.freeHostMemory(data1, sizereal); + base.freeHostMemory(data2, sizereal); + + return 0; + +} diff --git a/test/testFFTSolver.cpp b/test/testFFTSolver.cpp new file mode 100644 index 0000000..4f01bdc --- /dev/null +++ b/test/testFFTSolver.cpp @@ -0,0 +1,301 @@ +#include +#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void 
printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + } +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + +int main(int argc, char *argv[]) { + + /* mpi init */ + int rank, nprocs; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + if (nprocs != 8) { + cout << "example was set to run with 8 processes" << endl; + cout << "exit..." 
<< endl; + return 0; + } + + /* set domain size */ + int NG[3] = {64, 64, 32}; + int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; + int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; + int sizerho = NG[0] * NG[1] * NG[2]; + int sizegreen = ng[0] * ng[1] * ng[2]; + int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; + int id[3]; + + id[0] = 0; + id[1] = NL[1] * (rank % 4); + id[2] = NL[2] * (rank / 4); + + /* print some messages bout the example in the begginig */ + if (rank == 0) { + cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; + cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; + cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; + cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; + int tmp[3]; + for (int p = 1; p < nprocs; p++) { + MPI_Status mpistatus; + MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus); + cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl; + } + } else { + MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD); + } + + /* dks init and create 2 streams */ + int dkserr; + int streamGreens, streamFFT; + DKSBase base;// = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.createStream(streamFFT); + if (rank == 0) { + base.createStream(streamGreens); + base.setupFFT(3, NG); + } + + /* allocate memory and init rho field */ + double *rho = new double[sizerho]; + double *rho_out = new double[sizerho]; + //double *green_out = new double[sizegreen]; + initMirror(rho, NL[0], NL[1], NL[2]); + + /* + allocate memory on device for + - rho field + - rho FFT + - tmpgreen + - greens integral + - greens integral FFT + */ + void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; + if (rank == 0) { + tmpgreen_ptr = base.allocateMemory(sizegreen, dkserr); + rho2_ptr = base.allocateMemory(sizerho, dkserr); + grn_ptr = base.allocateMemory(sizerho, dkserr); + rho2tr_ptr = 
base.allocateMemory< complex >(sizecomp, dkserr); + grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + } else { + grntr_ptr = NULL; + rho2_ptr = NULL; + grn_ptr = NULL; + rho2tr_ptr = NULL; + tmpgreen_ptr = NULL; + } + + /* send and receive pointer to allocated memory on device */ + if (rank == 0) { + for (int p = 1; p < nprocs; p++) + base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); + } else { + rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* =================================================*/ + /* =================================================*/ + /* =====loop trough fftpoison solver iterations=====*/ + /* =================================================*/ + /* =================================================*/ + + double old_sum = 0; + double tmp_sum = 0; + for (int l = 0; l < 10000; l++) { + MPI_Barrier(MPI_COMM_WORLD); + /* on node 0, calculate tmpgreen on gpu */ + int hr_m[3] = {1, 1, 1}; + if (rank == 0) + base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], + hr_m[0], hr_m[1], hr_m[2], streamGreens); + + /* calculate greens integral on gpu */ + if (rank == 0) + base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens); + + /* mirror the field */ + if (rank == 0) + base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens); + + + /* get FFT of mirrored greens integral */ + if (rank == 0) + base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens); + + /* transfer rho field to device */ + base.gather3DDataAsync ( rho2_ptr, rho, NG, NL, id, streamFFT); + MPI_Barrier(MPI_COMM_WORLD); + + /* get FFT of rho field */ + if (rank == 0) { + base.syncDevice(); + base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); + } + + /* multiply both FFTs */ + if (rank == 0) + base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); + MPI_Barrier(MPI_COMM_WORLD); + + /* inverse fft and transfer data back */ + /* + multiple device syncs and mpi barriers are 
used to make sure data + transfer is started when results are ready and progam moves on + only when data transfer is finished + */ + if (rank == 0) { + base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + //cout << "result: " << sumData(rho_out, sizerho) << endl; + if (l == 0) { + old_sum = sumData(rho_out, sizerho); + } else { + tmp_sum = sumData(rho_out, sizerho); + if (old_sum != tmp_sum) { + cout << "diff in iteration: " << l << endl; + } + } + } else { + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } + + + } + /* =================================================*/ + /* =================================================*/ + /* ==========end fftpoison solver test run==========*/ + /* =================================================*/ + /* =================================================*/ + + + + /* free memory on device */ + if (rank == 0) { + base.freeMemory(tmpgreen_ptr, sizegreen); + base.freeMemory(grn_ptr, sizerho); + base.freeMemory< complex >(rho2tr_ptr, sizecomp); + base.freeMemory< complex >(grntr_ptr, sizecomp); + MPI_Barrier(MPI_COMM_WORLD); + base.freeMemory(rho2_ptr, sizerho); + cout << "Final sum: " << old_sum << endl; + } else { + base.closeHandle(rho2_ptr); + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + + +} diff --git a/test/testFFTSolver_MIC.cpp b/test/testFFTSolver_MIC.cpp new file mode 100644 index 0000000..29f84f0 --- /dev/null +++ b/test/testFFTSolver_MIC.cpp @@ -0,0 +1,319 @@ +#include +//#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, 
"") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + } +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + +int main(int argc, char *argv[]) { + + /* mpi init */ + //int rank, nprocs; + //MPI_Init(&argc, &argv); + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + /* + if (nprocs != 8) { + cout << "example was set to run with 8 processes" << endl; + cout << "exit..." 
<< endl; + return 0; + } + */ + + /* set domain size */ + int NG[3] = {64, 64, 32}; + int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; + int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; + int sizerho = NG[0] * NG[1] * NG[2]; + int sizegreen = ng[0] * ng[1] * ng[2]; + int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; + int id[3]; + + //id[0] = 0; + //id[1] = NL[1] * (rank % 4); + //id[2] = NL[2] * (rank / 4); + + /* print some messages bout the example in the begginig */ + cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; + //cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; + cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; + //cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; + int tmp[3]; + /* for (int p = 1; p < nprocs; p++) { + MPI_Status mpistatus; + MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus); + cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl; + }*/ + // } else { + // MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD); + // } + + /* dks init and create 2 streams */ + int dkserr; + //int streamGreens, streamFFT; +#ifdef DKS_MIC + DKSBase base; + base.setAPI("OpenMP", 6); + base.setDevice("-mic", 4); + base.initDevice(); +#endif + +#ifdef DKS_CUDA + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); +#endif + + //base.createStream(streamFFT); + //if (rank == 0) { + // base.createStream(streamGreens); + base.setupFFT(3, NG); + //} + + /* allocate memory and init rho field */ + double *rho = new double[sizerho]; + double *rho_out = new double[sizerho]; + //double *green_out = new double[sizegreen]; + initMirror(rho, NL[0], NL[1], NL[2]); + + /* + allocate memory on device for + - rho field + - rho FFT + - tmpgreen + - greens integral + - greens integral FFT + */ + void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; + // if (rank == 0) { + tmpgreen_ptr = 
base.allocateMemory(sizegreen, dkserr); + rho2_ptr = base.allocateMemory(sizerho, dkserr); + grn_ptr = base.allocateMemory(sizerho, dkserr); + rho2tr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + /* } else { + grntr_ptr = NULL; + rho2_ptr = NULL; + grn_ptr = NULL; + rho2tr_ptr = NULL; + tmpgreen_ptr = NULL; + }*/ + + + /* send and receive pointer to allocated memory on device */ + /* + if (rank == 0) { + for (int p = 1; p < nprocs; p++) + base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); + } else { + rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr); + } + MPI_Barrier(MPI_COMM_WORLD); + */ + + + /* =================================================*/ + /* =================================================*/ + /* =====loop trough fftpoison solver iterations=====*/ + /* =================================================*/ + /* =================================================*/ + + double old_sum = 0; + double tmp_sum = 0; + for (int l = 0; l < 100; l++) { + //MPI_Barrier(MPI_COMM_WORLD); + /* on node 0, calculate tmpgreen on gpu */ + int hr_m[3] = {1, 1, 1}; + //if (rank == 0) + base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], + hr_m[0], hr_m[1], hr_m[2]); + + /* calculate greens integral on gpu */ + //if (rank == 0) + base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); + + /* mirror the field */ + //if (rank == 0) + base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); + + + /* get FFT of mirrored greens integral */ + //if (rank == 0) + base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG); + + /* transfer rho field to device */ + //base.gather3DDataAsync ( rho2_ptr, rho, NG, NL, id, streamFFT); + base.writeData(rho2_ptr, rho,NG[0]*NG[1]*NG[2]); + //MPI_Barrier(MPI_COMM_WORLD); + + /* get FFT of rho field */ + //if (rank == 0) { + //base.syncDevice(); + base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); + //} + + /* multiply both FFTs */ + //if (rank == 0) + 
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); + //MPI_Barrier(MPI_COMM_WORLD); + + /* inverse fft and transfer data back */ + /* + multiple device syncs and mpi barriers are used to make sure data + transfer is started when results are ready and progam moves on + only when data transfer is finished + */ + //if (rank == 0) { + base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); + //base.syncDevice(); + //MPI_Barrier(MPI_COMM_WORLD); + //base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + base.readData (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]); + //MPI_Barrier(MPI_COMM_WORLD); + //base.syncDevice(); + //MPI_Barrier(MPI_COMM_WORLD); + //cout << "result: " << sumData(rho_out, sizerho) << endl; + if (l == 0) { + old_sum = sumData(rho_out, sizerho); + } else { + tmp_sum = sumData(rho_out, sizerho); + if (old_sum != tmp_sum) { + cout << "diff in iteration: " << l << endl; + } + } + /*} else { + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } + */ + + + } +/* =================================================*/ +/* =================================================*/ +/* ==========end fftpoison solver test run==========*/ +/* =================================================*/ +/* =================================================*/ + + + +/* free memory on device */ +//if (rank == 0) { +base.freeMemory(tmpgreen_ptr, sizegreen); +base.freeMemory(grn_ptr, sizerho); +base.freeMemory< complex >(rho2tr_ptr, sizecomp); +base.freeMemory< complex >(grntr_ptr, sizecomp); +//MPI_Barrier(MPI_COMM_WORLD); +base.freeMemory(rho2_ptr, sizerho); +cout << "Final sum: " << old_sum << endl; +/*} else { + base.closeHandle(rho2_ptr); + MPI_Barrier(MPI_COMM_WORLD); + }*/ + +//MPI_Finalize(); + + +} diff --git a/test/testGather.cpp b/test/testGather.cpp new file mode 100644 index 0000000..e0f8eaf --- /dev/null +++ b/test/testGather.cpp @@ -0,0 +1,172 @@ +#include +#include 
+#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + + +void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { + + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nz; i++) { + for (int j = 0; j < ny; j++) { + for (int k = 0; k < nx; k++) { + cout << data[i*ny*nx + j*nx + k] << "\t"; + } + cout << endl; + } + cout << endl; + } +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int N_global[3] = {64, 64, 32}; + int N_local[3] = {64, 32, 16}; + int n = N_local[0] * N_local[1] * N_local[2]; + + int idx[4] = {0, 0, 0, 0}; + int idy[4] = {0, 32, 0, 32}; + int idz[4] = {0, 0, 16, 16}; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { + hdata_in = new int[n]; + cout << "pinned allocation failed!" 
<< endl; + } + initData(hdata_in, n, rank); + + + for (int i = 0; i < 2; i++) { + + MPI_Barrier(MPI_COMM_WORLD); + if (i == 1) + nvtxMarkA("start gather"); + + if (rank == 0) { + + void *mem_ptr, *tmpgreen_ptr; + + mem_ptr = base.allocateMemory(nprocs*n, ierr); + + //call another kernel + int sizegreen = 33 * 33 * 17; + tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); + nvtxMarkA("call green"); + base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007); + + nvtxMarkA("call gather"); + base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + //read and print data once for debug only + /* + if (i == 0 && nprocs*n < 257) { + int *hdata_out_all = new int[nprocs*n]; + base.readData(mem_ptr, hdata_out_all, n*nprocs); + printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]); + } + + else { + int *hout_data = new int[nprocs*n]; + base.readData(mem_ptr, hout_data, nprocs*n); + int sum = 0; + for (int s = 0; s < nprocs*n; s++) + sum += hout_data[s]; + + cout << "Sum: " << sum << endl; + } + */ + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("call scatter"); + base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + base.freeMemory(mem_ptr, n*nprocs); + base.freeMemory(tmpgreen_ptr, sizegreen); + + } else { + + nvtxMarkA("call gather"); + base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("call scatter"); + base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + } + + if (i == 1) + nvtxMarkA("end gather"); + + } + + MPI_Barrier(MPI_COMM_WORLD); + base.freeHostMemory(hdata_in, n); + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGatherAsync.cpp b/test/testGatherAsync.cpp new file mode 100644 index 0000000..4fe35b5 --- 
/dev/null +++ b/test/testGatherAsync.cpp @@ -0,0 +1,144 @@ +#include +#include +#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + //mpi copy + int n = 32*16*16; + int N_global[3] = {32, 32, 32}; + int N_local[3] = {32, 16, 16}; + int idx[4] = {0, 0, 0, 0}; + int idy[4] = {0, 0, 16, 16}; + int idz[4] = {0, 16, 0, 16}; + + //greens kernel + int n1 = 33; + int n2 = 33; + int n3 = 17; + int sizegreen = n1*n2*n3; + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { + hdata_in = new int[n]; + cout << "pinned allocation failed!" 
<< endl; + } + initData(hdata_in, n, rank); + + int stream2; + for (int i = 0; i < 2; i++) { + + if (rank == 0) { + if (i == 0) { + cudaProfilerStart(); + base.createStream(stream2); + } + + nvtxMarkA("start gather"); + + void *mem_ptr, *green_ptr; + + mem_ptr = base.allocateMemory(nprocs*n, ierr); + green_ptr = base.allocateMemory(sizegreen, ierr); + + nvtxMarkA("call gather"); + MPI_Request request; + MPI_Status status; + + base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, + request); + + + nvtxMarkA("call kernel"); + base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1, + 4.160715e-03, 4.474911e-03, 1.247311e-02, stream2); + + MPI_Wait(&request, &status); + + + base.freeMemory(mem_ptr, n*nprocs); + base.freeMemory(green_ptr, sizegreen); + + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("end gather"); + + if (i == 1) cudaProfilerStop(); + } else { + + MPI_Request request; + base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, + request); + + MPI_Barrier(MPI_COMM_WORLD); + } + + } + + base.freeHostMemory(hdata_in, n); + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGatherAsync2.cpp b/test/testGatherAsync2.cpp new file mode 100644 index 0000000..a2ab21f --- /dev/null +++ b/test/testGatherAsync2.cpp @@ -0,0 +1,205 @@ +#include +#include +#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { + + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < 
nz; i++) { + for (int j = 0; j < ny; j++) { + for (int k = 0; k < nx; k++) { + cout << data[i*ny*nx + j*nx + k] << "\t"; + } + cout << endl; + } + cout << endl; + } +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs*N; i++) + cout << data[i] << "\t"; + cout << endl << endl; + +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + //cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int Ng[3] = {128, 128, 64}; + int Nl[3] = {128, 64, 32}; + int nglobal = Ng[0] * Ng[1] * Ng[2]; + int nlocal = Nl[0] * Nl[1] * Nl[2]; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) { + hdata_in = new int[nlocal]; + cout << "pinned allocation failed!" << endl; + } + initData(hdata_in, nlocal, rank); + + int *hdata_out; + if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) { + hdata_out = new int[nlocal]; + cout << "pinned allocation failed!" 
<< endl; + } + + //create streams for async execution + int stream1, stream2; + base.createStream(stream1); + base.createStream(stream2); + + if (rank == 0) + base.setupFFT(3, Ng); + + for (int i = 0; i < 1; i++) { + + MPI_Barrier(MPI_COMM_WORLD); + if (i == 1) + nvtxMarkA("start gather"); + + if (rank == 0) { + + int id[3] = {0, 0, 0}; + + void *mem_ptr, *tmpgreen_ptr, *comp_ptr; + + //allocate memory on device + int sizegreen = 65 * 65 * 33; + int sizecomp = 65 * 128 * 64; + mem_ptr = base.allocateMemory(nglobal, ierr); + tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + //send pointer to other processes + nvtxMarkA("call gather"); + for (int j = 1; j < nprocs; j++) + base.sendPointer(mem_ptr, j, MPI_COMM_WORLD); + + //call another kernel while data transfer is processing + nvtxMarkA("call green"); + base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2); + + //write data to device + base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); + + /* execute rcfft */ + //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng); + + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + + //read data from device + base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); + + MPI_Barrier(MPI_COMM_WORLD); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + + + base.freeMemory(mem_ptr, nglobal); + base.freeMemory(tmpgreen_ptr, sizegreen); + base.freeMemory< complex >(comp_ptr, sizecomp); + + } else { + + + void *mem_ptr; + int idy = 0; + int idz = 0;//Nl[2]*rank; + if (rank / 2 == 1) idy = Ng[1] / 2; + if (rank % 2 == 1) idz = Ng[2] / 2; + int id[3] = {0, idy, idz}; + + nvtxMarkA("call gather"); + mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr); + base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); + + MPI_Barrier(MPI_COMM_WORLD); + + base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + 
base.closeHandle(mem_ptr); + + } + + int sum1 = 0; + for (int c = 0; c < nlocal; c++) + sum1 += hdata_in[c]; + + int sum2 = 0; + for (int c = 0; c < nlocal; c++) + sum2 += hdata_out[c]; + + cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl; + + + if (i == 1) + nvtxMarkA("end gather"); + + } + + //printData(hdata_in, nlocal, 1); + MPI_Barrier(MPI_COMM_WORLD); + base.freeHostMemory(hdata_in, nlocal); + //delete[] hdata_in; + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGreens.cpp b/test/testGreens.cpp new file mode 100644 index 0000000..8b554eb --- /dev/null +++ b/test/testGreens.cpp @@ -0,0 +1,239 @@ +#include +#include +#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + 
} +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + + + +int main(int argc, char *argv[]) { + + int ierr; + + int N1 = 8; + int N2 = 8; + int N3 = 4; + + int n1 = N1 / 2; + int n2 = N2 / 2; + int n3 = N3 / 2; + + int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1); + int sizerho = N1 * N2 * N3; + + double *data_green; //= new double[sizegreen]; + double *data_rho; //= new double[sizerho]; + + double hr_m0 = +4.0264984513873269e-04; + double hr_m1 = +4.3305596731911289e-04; + double hr_m2 = +8.3154085085560838e-04; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + int stream1, stream2; + base.createStream(stream1); + base.createStream(stream2); + cout << "ID stream1: " << stream1 << endl; + cout << "ID stream2: " << stream2 << endl; + + void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2; + + mem_green1 = base.allocateMemory(sizegreen, ierr); + mem_green2 = base.allocateMemory(sizegreen, ierr); + mem_rho1 = base.allocateMemory(sizerho, ierr); + mem_rho2 = base.allocateMemory(sizerho, ierr); + + printDiv(50); + + data_green = new double[sizegreen]; + data_rho = new double[sizerho]; + + base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1, + hr_m0, hr_m1, hr_m2, stream1); + base.readData(mem_green1, data_green, sizegreen); + cout << "Sum green: " << sumData(data_green, sizegreen) << endl; + cout << scientific << setprecision(16); + for (int p = 0; p < 7; p++) + cout << data_green[p] << "\t"; + cout << endl; + 
//printMirror(data_green, n1 + 1, n2 + 1, n3 + 1); + + base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1); + base.readData(mem_rho1, data_rho, sizerho); + cout << "Sum integral: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1); + base.readData(mem_rho1, data_rho, sizerho); + cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + printDiv(50); + + /* + base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1, + 1, 1, 1, -2); + base.readData(mem_green2, data_green, sizegreen); + cout << "Sum green: " << sumData(data_green, sizegreen) << endl; + //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1); + + base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2); + base.readData(mem_rho2, data_rho, sizerho); + cout << "Sum integral: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2); + base.readData(mem_rho2, data_rho, sizerho); + cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + */ + printDiv(50); + + base.freeMemory(mem_green1, sizegreen); + base.freeMemory(mem_green2, sizegreen); + base.freeMemory(mem_rho1, sizerho); + base.freeMemory(mem_rho2, sizerho); + + delete [] data_green; + delete [] data_rho; + + //test complex multiplication + int compsize = 300; + complex *data1 = new complex[compsize]; + complex *data2 = new complex[compsize]; + for (int i = 0; i < compsize; i++) { + data1[i] = complex(i+1, i+2); + data2[i] = complex(i+3, i+4); + } + + for (int i = 0; i < 3; i++) + cout << data1[i] << "\t"; + cout << endl; + for (int i = 0; i < 3; i++) + cout << data2[i] << "\t"; + cout << endl; + + void *ptr1, *ptr2; + ptr1 = base.allocateMemory< complex >(compsize, ierr); + ptr2 = base.allocateMemory< complex >(compsize, ierr); + + base.writeData< complex 
>(ptr1, data1, compsize); + base.writeData< complex >(ptr2, data2, compsize); + + base.callMultiplyComplexFields(ptr1, ptr2, compsize); + + base.readData< complex >(ptr1, data1, compsize); + + for (int i = 0; i < 3; i++) + cout << data1[i] << "\t"; + cout << endl; + + base.freeMemory< complex >(ptr1, compsize); + base.freeMemory< complex >(ptr2, compsize); + + return 0; +} diff --git a/test/testImageReconstruction.cpp b/test/testImageReconstruction.cpp new file mode 100644 index 0000000..2dbb27d --- /dev/null +++ b/test/testImageReconstruction.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include "DKSImageReconstruction.h" + +struct voxelPosition { + float x; + float y; + float z; +}; + +void initImage(float *image, int size) { + for (int i = 0; i < size; i++) + image[i] = (float)rand() / RAND_MAX; +} + +void initPosition(voxelPosition *voxel, int N) { + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + int idx = i * N * N + j * N + k; + if (k == 0) + voxel[idx].x = 0.0; + else + voxel[idx].x = voxel[idx - 1].x + 0.1; + + if (j == 0) + voxel[idx].y = 0.0; + else + voxel[idx].y = voxel[idx - N].y + 0.1; + + if (i == 0) + voxel[idx].z = 0.0; + else + voxel[idx].z = voxel[idx - N * N].z + 0.1; + } + } + } +} + +void printPosition(voxelPosition *voxel, int size) { + for (int i = 0; i < size; i++) + std::cout << voxel[i].x << "\t"; + std::cout << std::endl; + for (int i = 0; i < size; i++) + std::cout << voxel[i].y << "\t"; + std::cout << std::endl; + for (int i = 0; i < size; i++) + std::cout << voxel[i].z << "\t"; + std::cout << std::endl; +} + +#define DIAMETER 2.0 +bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id) +{ + float distance_x = pow(image_tmp[id].x-source_temp.x,2); + float distance_y = pow(image_tmp[id].y-source_temp.y,2); + float distance_z = pow(image_tmp[id].z-source_temp.z,2); + float distance = sqrt(distance_x + distance_y + distance_z); + + if ( distance < 
DIAMETER*0.5 ) { + return true; + } + else + return false; +} + +void calculate_source(float *image_space , voxelPosition *image_geometry, + voxelPosition source, int total_voxels, + float *average, float *std) +{ + + int number_selected_maximum = 10000; + float *select; + select = new float[number_selected_maximum]; + for (int j=0;j(total, ierr); + image_position = base.allocateMemory(total, ierr); + source_position = base.allocateMemory(total, ierr); + davg = base.allocateMemory(total, ierr); + dstd = base.allocateMemory(total, ierr); + + base.writeData(image_space, image, total); + base.writeData(image_position, geometry, total); + base.writeData(source_position, geometry, total); + + + gettimeofday(&timeStart, NULL); + base.callCalculateSource(image_space, image_position, source_position, + davg, dstd, DIAMETER, total, total); + + + base.readData(davg, avg, total); + base.readData(dstd, stdev, total); + gettimeofday(&timeEnd, NULL); + ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6; + + base.freeMemory(image_space, total); + base.freeMemory(image_position, total); + base.freeMemory(source_position, total); + base.freeMemory(dstd, total); + base.freeMemory(davg, total); + + avgavg = 0; + avgstdev = 0; + for (int i = 0; i < total; i++) { + avgavg += avg[i] / total; + avgstdev += stdev[i] / total; + } + std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl; + + return N; + +} diff --git a/test/testMIC.cpp b/test/testMIC.cpp new file mode 100644 index 0000000..354e9e4 --- /dev/null +++ b/test/testMIC.cpp @@ -0,0 +1,51 @@ +#include +#include "DKSBase.h" + +using namespace std; + +int main() { + + DKSBase base; + + base.setAPI("OpenMP", 6); + base.initDevice(); + + //init data + int ierr; + int N = 8; + double *in_data = new double[N]; + double *in_data2 = new double[N]; + double *out_data = new double[N]; + double *out_data2 = new double[N]; + + for (int i = 0; i 
< N; i++) { + in_data[i] = i; + in_data2[i] = i*i; + } + + //test memory allocation, write and read operations + void *d_ptr, *d2_ptr; + + d_ptr = base.allocateMemory(N, ierr); + d2_ptr = base.allocateMemory(N, ierr); + + base.writeData(d_ptr, in_data, N); + base.writeData(d2_ptr, in_data2, N); + + base.readData(d_ptr, out_data, N); + base.readData(d2_ptr, out_data2, N); + base.freeMemory(d_ptr, N); + base.freeMemory(d2_ptr, N); + + //print results + for (int i = 0; i < N; i++) + cout << out_data[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << out_data2[i] << "\t"; + cout << endl; + + return 0; + +} diff --git a/test/testMICOpenCL.cpp b/test/testMICOpenCL.cpp new file mode 100644 index 0000000..110d797 --- /dev/null +++ b/test/testMICOpenCL.cpp @@ -0,0 +1,94 @@ +#include +#include +#include "DKSBase.h" +#include "Utility/TimeStamp.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[4]; + + if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 2){ + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << endl; + cout << "Use device: " << device_name << endl; + + + int ierr; + int N = 10000; + double *data = new double[N]; + double *data_out = new double[N]; + double *data_out2 = new double[N]; + + for (int i = 0; i < N; i++) { + data[i] = i; + } + + //init dks base class, set API to opencl and init connection with OpenCL device + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + + //data ptr + void *data_ptr, *data_ptr2; + + //allocate memory + data_ptr = base.allocateMemory(N, ierr); + data_ptr2 = base.allocateMemory(N, ierr); + + //write data to memory and fill data on device + base.writeData(data_ptr, data, N); + 
base.writeData(data_ptr2, data, N); + //base.callNt(data_ptr2, data_ptr, 6, N, 1, 0); + + //calc sum + base.callSum(data_ptr2, data_ptr2, N); + + //base.callSum(data_ptr, data_ptr, N); + + //chi^2 + //base.callChi2(data_ptr, data_ptr, data_ptr, N); + //base.callChi2(data_ptr2, data_ptr2, data_ptr2, N); + + //read data + base.readData(data_ptr, data_out, N); + base.readData(data_ptr2, data_out2, N); + + //base.oclEventInfo(); + + //free memory + base.freeMemory(data_ptr, N); + base.freeMemory(data_ptr2, N); + + + /* + for (int i = 0; i < N; i++) { + cout << data[i] << "\t"; + } + cout << endl << endl; + for (int i = 0; i < N; i++) { + cout << data_out[i] << "\t"; + } + cout << endl << endl; + for (int i = 0; i < N; i++) { + cout << data_out2[i] << "\t"; + } + cout << endl; + */ + + + + return 0; +} \ No newline at end of file diff --git a/test/testMICPush.cpp b/test/testMICPush.cpp new file mode 100644 index 0000000..a2f7d2a --- /dev/null +++ b/test/testMICPush.cpp @@ -0,0 +1,68 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +typedef struct { + double x; + double y; + double z; +} Part; + +void initData(Part *data, int N) { + for (int i = 0; i < N; i++) { + data[i].x = rand() / RAND_MAX; + data[i].y = rand() / RAND_MAX; + data[i].z = rand() / RAND_MAX; + } +} + +int main() { + + int ierr; + int N = 100000; + + //__declspec(align(64)) Part *R = new Part[N]; + //__declspec(align(64)) Part *P = new Part[N]; + Part *R = new Part[N]; + Part *P = new Part[N]; + + initData(R, N); + initData(P, N); + + DKSBase dksbase; + dksbase.setAPI("OpenMP", 6); + dksbase.setDevice("-mic", 4); + dksbase.initDevice(); + + void *r_ptr, *p_ptr, *dt_ptr; + r_ptr = dksbase.allocateMemory(N, ierr); + p_ptr = dksbase.allocateMemory(N, ierr); + dt_ptr = dksbase.allocateMemory(N, ierr); + + dksbase.writeData(r_ptr, R, N); + + cout << "====================START PUSH====================" << endl; + + for (int i = 0; i < 5; i++) { + //write r to device + 
dksbase.writeData(r_ptr, R, N); + //calc push + dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr, + 0.001, 1, false, NULL); + //read R from device + dksbase.readDataAsync (r_ptr, R, N, NULL); + } + + cout << "====================END PUSH====================" << endl; + + + + dksbase.freeMemory(r_ptr, N); + dksbase.freeMemory(p_ptr, N); + dksbase.freeMemory(dt_ptr, N); + + return 0; +} diff --git a/test/testMPI.cpp b/test/testMPI.cpp new file mode 100644 index 0000000..aef3cd6 --- /dev/null +++ b/test/testMPI.cpp @@ -0,0 +1,89 @@ +#include +#include +#include + +#include "DKSBase.h" + +using namespace std; + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int n = 8; + int sizen = sizeof(int)*n; + int sizeall = sizeof(int)*n*nprocs; + + int *hdata_in = new int[n]; + int *hdata_out = new int[n]; + initData(hdata_in, n, rank); + cout << "In data for process " << rank+1 << ":\t"; + printData(hdata_in, n, 1); + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + if (rank == 0) { + + int *hdata_out_all = new int[nprocs*n]; + void* mem_ptr; + mem_ptr = base.allocateMemory(nprocs*n, ierr); + + MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD); + + base.readData(mem_ptr, hdata_out_all, n*nprocs); + + MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD); + + base.freeMemory(mem_ptr, n*nprocs); + + printData(hdata_out_all, 
n, nprocs, "Out data 1:\n"); + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_in, n, 1); + } else { + + MPI_Gather(hdata_in, n, MPI_INT, NULL, NULL, NULL, 0, MPI_COMM_WORLD); + + MPI_Scatter(NULL, NULL, NULL, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD); + + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_in, n, 1); + + } + + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testMPIFFT.cpp b/test/testMPIFFT.cpp new file mode 100644 index 0000000..69512ff --- /dev/null +++ b/test/testMPIFFT.cpp @@ -0,0 +1,91 @@ +#include +#include +#include + +#include "DKSBase.h" + +using namespace std; + +void printData(complex *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(complex *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = complex((double)rank+1.0, 0.0); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int n = 8; + + complex *hdata_in = new complex[n]; + complex *hdata_out = new complex[n]; + initData(hdata_in, n, rank); + cout << "In data for process " << rank+1 << ":\t"; + printData(hdata_in, n, 1); + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + if (rank == 0) { + + complex *hdata_out_all = new complex[nprocs*n]; + void* mem_ptr; + mem_ptr = base.allocateMemory< complex >(nprocs*n, ierr); + + + MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + + int dimsize[3] = {n*nprocs, 1, 1}; + base.callFFT(mem_ptr, 1, dimsize); + base.readData< complex >(mem_ptr, hdata_out_all, n*nprocs); 
+ + MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + base.freeMemory< complex >(mem_ptr, n*nprocs); + + printData(hdata_out_all, n, nprocs, "Out data 1:\n"); + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); + } else { + + MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, NULL, NULL, 0, MPI_COMM_WORLD); + + MPI_Scatter(NULL, NULL, NULL, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); + + } + + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testMemObjects.cpp b/test/testMemObjects.cpp new file mode 100644 index 0000000..5a5eaf0 --- /dev/null +++ b/test/testMemObjects.cpp @@ -0,0 +1,75 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int ierr,n, N; + + if (argc > 1) + n = atoi(argv[1]); + else + n = 10; + + N = 2 << n; + cout << "Elements: " << N << endl; + + double *data = new double[N]; + for (int i = 0; i < N; i++) + data[i] = (double)i / N; + + + DKSBase base = DKSBase(); + base.setAPI("OpenCL", 6); + base.setDevice("-gpu", 4); + base.initDevice(); + + void *ptr1; + ptr1 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr1, data, N); + + void *ptr2; + ptr2 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr2, data, N); + + void *ptr3; + ptr3 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr3, data, N); + + void *ptr4; + ptr4 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr4, data, N); + + void *ptr5; + ptr5 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr5, data, N); + + void *ptr6; + ptr6 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr6, data, N); + + void *ptr7; + ptr7 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr7, data, N); + + void *ptr8; + ptr8 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr8, data, N); + + 
base.freeMemory(ptr1, N); + base.freeMemory(ptr2, N); + base.freeMemory(ptr3, N); + base.freeMemory(ptr4, N); + base.freeMemory(ptr5, N); + base.freeMemory(ptr6, N); + base.freeMemory(ptr7, N); + base.freeMemory(ptr8, N); + + + + return 0; +} + diff --git a/test/testOffset.cpp b/test/testOffset.cpp new file mode 100644 index 0000000..cf7e6ec --- /dev/null +++ b/test/testOffset.cpp @@ -0,0 +1,73 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + + int ierr,n, N; + + N = 8; + n = 4; + + double *data_in = new double[N]; + double *data_out_1 = new double[N]; + double *data_out_2 = new double[N]; + for (int i = 0; i < N; i++) { + data_in[i] = (double)i / N; + data_out_1[i] = 0.0; + data_out_2[i] = 0.0; + } + + cout << "Run example on: " << api_name << " using " << device_name << endl; + + DKSBase base = DKSBase(); + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + void *ptr1; + ptr1 = base.allocateMemory(N, ierr); + + ierr = base.writeData(ptr1, data_in, n, 0); + ierr = base.writeData(ptr1, data_in, n, 4); + + ierr = base.readData(ptr1, data_out_1, N); + ierr = base.readData(ptr1, data_out_2, n, 2); + + base.freeMemory(ptr1, N); + + for (int i = 0; i < N; i++) + cout << data_in[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out_1[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out_2[i] << "\t"; + cout << endl; + + + + + return 0; +} + diff --git a/test/testOffsetMPI.cpp b/test/testOffsetMPI.cpp new file mode 100644 index 0000000..066cf63 --- /dev/null +++ b/test/testOffsetMPI.cpp @@ 
-0,0 +1,81 @@ +#include +#include +#include + + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int rank, size; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + cout << "Rank " << rank << " from " << size << endl; + + + int ierr, N, n; + + N = 8; + n = N / 2; + + double *data_in = new double[n]; + + for (int i = 0; i < n; i++) + data_in[i] = (double)rank + 1.0 + (double)i / n; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + if (rank == 0) { + //alocate memory of size N + void *ptr1; + ptr1 = base.allocateMemory(size*N, ierr); + cout << "Sent pointer: " << ptr1 << endl; + + //send ptr to other processes + MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, 1, 123, MPI_COMM_WORLD); + + //wrtie n data with no offset to device and wait for other processes + ierr = base.writeData(ptr1, data_in, n, rank*n); + MPI_Barrier(MPI_COMM_WORLD); + + //read memory of size N from device + double *data_out = new double[N]; + ierr = base.readData(ptr1, data_out, N); + + //free device memory + base.freeMemory(ptr1, size*N); + + //print results + for (int i = 0; i < n; i++) + cout << data_in[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out[i] << "\t"; + cout << endl; + + } else { + //receive device memory pointer + void *ptr2; + MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, NULL); + cout << "Received pointer: " << ptr2 << endl; + //write data with an offset + base.writeData(ptr2, data_in, n, rank*n); + + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + + + return 0; +} + diff --git a/test/testPush.cpp b/test/testPush.cpp new file mode 100644 index 0000000..d2f13b0 --- /dev/null +++ b/test/testPush.cpp @@ -0,0 +1,57 @@ +#include +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + +using namespace std; + + +void initData(double3 *data, int N) { + 
for (int i = 0; i < N; i++) { + data[i].x = rand() / RAND_MAX; + data[i].y = rand() / RAND_MAX; + data[i].z = rand() / RAND_MAX; + } +} + + +int main() { + + int ierr; + int N = 1000000; + double3 *R = new double3[N]; + double3 *P = new double3[N]; + + initData(R, N); + initData(P, N); + + DKSBase dksbase; + dksbase.setAPI("Cuda", 4); + dksbase.setDevice("-gpu", 4); + dksbase.initDevice(); + + void *r_ptr, *p_ptr; + + r_ptr = dksbase.allocateMemory(N, ierr); + p_ptr = dksbase.allocateMemory(N, ierr); + + dksbase.writeData(r_ptr, R, N); + dksbase.writeData(p_ptr, P, N); + + for (int i = 0; i < 100; i++) + dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false); + + + dksbase.readData(r_ptr, R, N); + dksbase.readData(p_ptr, P, N); + + dksbase.freeMemory(r_ptr, N); + dksbase.freeMemory(p_ptr, N); + + + return 0; +} diff --git a/test/testRCFFT.cpp b/test/testRCFFT.cpp new file mode 100644 index 0000000..841c04a --- /dev/null +++ b/test/testRCFFT.cpp @@ -0,0 +1,168 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(double* &data, int N1, int N2); +void printData(complex* &data, int N1, int N2); +void printData3DN4(complex* &data, int N, int dim); +void printData3DN4(double* &data, int N, int dim); + + +void compareData(double* &data1, double* &data2, int N, int dim); + + + +int main(int argc, char *argv[]) { + + int N1 = 4; + int N2 = 4; + + if (argc == 3) { + N1 = atoi(argv[1]); + N2 = atoi(argv[2]); + } + + int dimsize[3] = {N1, N2, 1}; + + cout << "Begin RC 3D FFT tests, grid = " << N1 << "\t" << N2 << endl; + int sizereal = N1*N2; + int sizecomp = N1*(N2/2+1); + + int dim = 3; + double *cdata = new double[sizereal]; + complex *cfft = new complex[sizecomp]; + + for (int i = 0; i < N2; i++) { + for (int j = 0; j < N1; j++) { + cdata[i*N1 + j] = (double)(j) / N1; + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + 
base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + void *real_ptr, *comp_ptr; + int ierr; + /* allocate memory on device */ + real_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + /* write data to device */ + ierr = base.writeData(real_ptr, cdata, sizereal); + + /* execute fft */ + base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize); + + /* read data from device */ + base.readData< complex >(comp_ptr, cfft, sizecomp); + + /* free device memory */ + base.freeMemory(real_ptr, sizereal); + base.freeMemory< complex >(comp_ptr, sizecomp); + + cout << "FFT complete" << endl; + + + /* print results */ + printData(cdata, N1, N2); + printData(cfft, N1, N2); + + + + return 0; +} + +void printData(double* &data, int N1, int N2) { + + for (int i = 0; i < N2; i++) { + for (int j = 0; j < N1; j++) { + cout << data[i*N1 + j] << " "; + } + cout << endl; + } + cout << endl; +} + +void printData(complex* &data, int N1, int N2) { + + complex tmp(0.0, 0.0); + for (int i = 0; i < N2/2+1; i++) { + for (int j = 0; j < N1; j++) { + tmp = data[i*N1 + j]; + if (tmp.real() < 0.00001 && tmp.real() > -0.00001) tmp = complex(0.0, tmp.imag()); + if (tmp.imag() < 0.00001 && tmp.imag() > -0.00001) tmp = complex(tmp.real(), 0.0); + + cout << tmp << " "; + } + cout << endl; + } + cout << endl; +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void printData3DN4(double* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k]; + if (d > 10e-5 || d < -10e-5) + 
cout << d << "\t"; + else + cout << 0 << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(double* &data1, double* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id] - data2[id]); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testStockFFT3D.cpp b/test/testStockFFT3D.cpp new file mode 100644 index 0000000..036a7e2 --- /dev/null +++ b/test/testStockFFT3D.cpp @@ -0,0 +1,181 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData3DN4(complex* &data, int N, int dim); +void compareData(complex* &data1, complex* &data2, int N, int dim); + +int main(int argc, char *argv[]) { + + int n = 2; + if (argc == 2) + n = atoi(argv[1]); + + int N = pow(2,n); + + cout << "Begin DKS Base tests" << endl; + + cout << "FFT size: " << N << endl; + + int dimsize[3] = {N, N, N}; + + + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cfft2 = new complex[N*N*N]; + complex *cfft3 = new complex[N*N*N]; + + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + //cdata[i*N*N + j*N + k] = complex((double)k/(N*N*N), 0); + cdata[i*N*N + j*N + k] = complex(k, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cfft2[i*N*N + j*N + k] = complex(0, 0); + cfft3[i*N + j*N + k] = complex(0, 0); + } + } + } + + if (N == 4) + printData3DN4(cdata, N, 3); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + int ierr; + + + timestamp_t t0, t1; + + /* stockham radix-2 out-of-place fft */ + DKSBase base2; + base2.setAPI("OpenCL", 6); + base2.setDevice("-gpu", 4); + base2.initDevice(); + + cout << endl; + void *src_ptr; + for (int i = 0; i < 5; 
i++) { + t0 = get_timestamp(); + src_ptr = base2.allocateMemory< complex >(N*N*N, ierr); + base2.writeData< complex >(src_ptr, cdata, N*N*N); + base2.callFFTStockham(src_ptr, 3, dimsize); + base2.readData< complex >(src_ptr, cfft2, N*N*N); + base2.freeMemory< complex >(src_ptr, N*N*N); + t1 = get_timestamp(); + cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft2, N, 3); + + //delete base2; + cout << endl; + + /* CUDA cufft */ + DKSBase base3; + base3.setAPI("Cuda", 4); + base3.setDevice("-gpu", 4); + base3.initDevice(); + + cout << endl; + void *cuda_ptr; + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + cuda_ptr = base3.allocateMemory< complex >(N*N*N, ierr); + base3.writeData< complex >(cuda_ptr, cdata, N*N*N); + base3.callFFT(cuda_ptr, 3, dimsize); + base3.readData< complex >(cuda_ptr, cfft3, N*N*N); + base3.freeMemory< complex >(cuda_ptr, N*N*N); + t1 = get_timestamp(); + cout << "Cuda FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft3, N, 3); + + //delete base3; + cout << endl; + + + /* radix-2 in place fft */ + DKSBase base; + base.setAPI("OpenCL", 6); + base.setDevice("-gpu", 4); + base.initDevice(); + + cout << endl; + void *mem_ptr; + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + base.writeData< complex >(mem_ptr, cdata, N*N*N); + base.callFFT(mem_ptr, 3, dimsize); + base.readData< complex >(mem_ptr, cfft, N*N*N); + base.freeMemory< complex >(mem_ptr, N*N*N); + t1 = get_timestamp(); + cout << "in-place FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft, N, 3); + + //delete base; + cout << endl; + + /* compare results */ + cout << endl; + + cout << "Radix 2 vs Stockham: "; + compareData(cfft, cfft2, N, 3); + + cout << "Radix 2 vs Cufft: "; + compareData(cfft, cfft3, N, 3); + + cout << "Stockham vs Cufft: "; + compareData(cfft2, cfft3, N, 3); + + return 0; +} + +void 
printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + if (d > 10e-5 || d < -10e-5) + cout << d << "\t"; + else + cout << 0 << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "CC <--> CC diff: " << sum << endl; +} \ No newline at end of file diff --git a/test/testStockhamFFT.cpp b/test/testStockhamFFT.cpp new file mode 100644 index 0000000..fdc1656 --- /dev/null +++ b/test/testStockhamFFT.cpp @@ -0,0 +1,107 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int n = 2; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 4) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + n = atoi(argv[3]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + int N = pow(2,n); + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests" << endl; + + cout << "FFT size: " << N << endl; + + int dimsize[3] = {N, N, N}; + + complex *cdata = new complex[N]; + complex *cfft = new complex[N]; + complex *cfft2 = new complex[N]; + complex *cfftsrc = new complex[N]; + for (int i = 0; i < N; i++) { + cdata[i] = complex((double)i / N, 0); + cfft[i] = complex(0, 0); + cfft2[i] = 
complex(0, 0); + cfftsrc[i] = complex(0, 0); + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + + timestamp_t t0, t1; + + /* radix-2 in place fft */ + void *mem_ptr; + int ierr; + + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + mem_ptr = base.allocateMemory< complex >(N, ierr); + base.writeData< complex >(mem_ptr, cdata, N); + base.callFFT(mem_ptr, 1, dimsize); + base.readData< complex >(mem_ptr, cfft, N); + base.freeMemory< complex >(mem_ptr, N); + t1 = get_timestamp(); + cout << "in-place FFT time: " << get_secs(t0, t1) << endl; + } + + cout << endl; + + /* stockham radix-2 out-of-place fft */ + void *src_ptr; + + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + src_ptr = base.allocateMemory< complex >(N, ierr); + base.writeData< complex >(src_ptr, cdata, N); + base.callFFTStockham(src_ptr, 1, dimsize); + base.readData< complex >(src_ptr, cfft2, N); + base.freeMemory< complex >(src_ptr, N); + t1 = get_timestamp(); + cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl; + } + + double diff = 0; + for (int i = 0; i < N; i++) { + diff += fabs(cfft[i].real() - cfft2[i].real()); + diff += fabs(cfft[i].imag() - cfft2[i].imag()); + } + + cout << endl << "Difference: " << diff << endl; + + if (diff > 0.00001) { + for (int i = 0; i < 10; i++) { + cout << cfft[i] << "\t" << cfft2[i] << endl; + } + } + + return 0; +} + diff --git a/test/testTimeIntegration.cpp b/test/testTimeIntegration.cpp new file mode 100644 index 0000000..80fec6b --- /dev/null +++ b/test/testTimeIntegration.cpp @@ -0,0 +1,227 @@ +#include +#include +#include +#include +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + +using namespace std; + +typedef struct { + double x; + double y; + double z; +} Vector; + +Vector initVector() { + Vector tmp; + tmp.x = 0.5; + tmp.y = 0.5; + tmp.z = 0.5; + + return tmp; +} + 
+void initVectors(Vector *v, int N) {  // set every element of v[0..N) to initVector()
+  for (int i = 0; i < N; i++)
+    v[i] = initVector();
+}
+// Set every element of data[0..N) to 0.005.
+void initDouble(double *data, int N) {
+  for (int i = 0; i < N; i++)
+    data[i] = 0.005;
+}
+// Set every element of data[0..N) to -1.
+void initLastSect(long *data, int N) {
+  for (int i = 0; i < N; i++)
+    data[i] = -1;
+}
+// Print "checksum: " followed by the sum of all x, y and z components in v[0..N).
+void checkSum(Vector *v, int N) {
+  double sum = 0;
+  for (int i = 0; i < N; i++)
+    sum += v[i].x + v[i].y + v[i].z;
+
+  std::cout << "checksum: " << sum << std::endl;
+}
+// Runs and times callParallelTTrackerPush / callParallelTTrackerPushTransform. Options: -mic, -npart N, -loop N.
+int main(int argc, char *argv[]) {
+  // defaults: "Cuda" API on device "-gpu", 10 particles, 10 timed loops
+  int loop = 10;
+  int numpart = 10;
+  char api_name[10];
+  char device_name[10];
+  strcpy(api_name, "Cuda");
+  strcpy(device_name, "-gpu");
+
+  for (int i = 1; i < argc; i++) {
+    const string arg(argv[i]);  // copied first: i may advance below and argv[argc] is a null pointer
+    if (arg == "-mic") {
+      strcpy(api_name, "OpenMP");
+      strcpy(device_name, "-mic");
+    }
+    // -npart and -loop consume the following argument; they are ignored when none follows
+    if (arg == "-npart" && i + 1 < argc) {
+      numpart = atoi(argv[i+1]);
+      i++;
+    }
+
+    if (arg == "-loop" && i + 1 < argc) {
+      loop = atoi(argv[i+1]);
+      i++;
+    }
+
+  }
+
+  cout << "=========================BEGIN TEST=========================" << endl;
+  cout << "Use api: " << api_name << "\t" << device_name << endl;
+  cout << "Number of particles: " << numpart << endl;
+  cout << "------------------------------------------------------------" << endl;
+
+  //init p,r and dt arrays to test time integration
+  Vector *r = new Vector[numpart];
+  Vector *p = new Vector[numpart];
+  Vector *x = new Vector[numpart];
+  Vector *ori = new Vector[5];
+  initVectors(r, numpart);
+  initVectors(p, numpart);
+  initVectors(x, numpart);
+  initVectors(ori, 5);
+
+  double *dt = new double[numpart];
+  initDouble(dt, numpart);
+
+  long *ls = new long[numpart];
+  initLastSect(ls, numpart);
+
+  //init dks
+  int ierr;
+  DKSBase base;
+  base.setAPI(api_name, strlen(api_name));
+  base.setDevice(device_name, strlen(device_name));
+  base.initDevice();
+
+  int stream1, stream2;
+  base.createStream(stream1);
+  base.createStream(stream2);
+
+  base.registerHostMemory(r, numpart);
+  base.registerHostMemory(p, numpart);
+  base.registerHostMemory(x, numpart);
+  base.registerHostMemory(dt, numpart);
+  base.registerHostMemory(ls, numpart);
+
+  //***test parallelttrackerpush***//
+  void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr;
+
+  //allocate memory on the device
+  r_ptr = base.allocateMemory(numpart, ierr);
+  p_ptr = base.allocateMemory(numpart, ierr);
+  x_ptr = base.allocateMemory(numpart, ierr);
+  dt_ptr = base.allocateMemory(numpart, ierr);
+  ls_ptr = base.allocateMemory(numpart, ierr);
+  ori_ptr = base.allocateMemory(5, ierr);
+
+  //transfer data to device
+  base.writeData(r_ptr, r, numpart);
+  base.writeData(p_ptr, p, numpart);
+  base.writeData(x_ptr, x, numpart);
+  base.writeData(ori_ptr, ori, 5);
+  // NOTE(review): dt is filled on the host but nothing in this file copies it to dt_ptr before dt_ptr is passed to the push calls - confirm an upload is not missing
+
+  //run a few untimed integration loops before the timer is started
+  for (int i = 0; i < 5; i++) {
+    //calc push
+    base.callParallelTTrackerPush (r_ptr, p_ptr, numpart, dt_ptr,
+                                   0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync (r_ptr, r, numpart, stream1);
+
+    //write LastSection to device
+    base.writeDataAsync (ls_ptr, ls, numpart, stream2);
+
+    //calc push transform
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+                                           dt_ptr, 0.05, 1, false, stream2);
+    //read x from device
+    base.readDataAsync(x_ptr, x, numpart, stream2);
+
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+
+
+
+  //start the timing of integration
+  struct timeval timeStart, timeEnd;
+  std::cout << "start integration" << std::endl;
+
+  gettimeofday(&timeStart, NULL);
+  for (int i = 0; i < loop; i++) {
+
+    //calc push
+    base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1);
+
+    //read R from device
+    base.readDataAsync (r_ptr, r, numpart, stream1);
+
+    //write LastSection to device
+    base.writeDataAsync (ls_ptr, ls, numpart, stream2);
+
+    //calc push transform
+    base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5,
+                                           dt_ptr, 0.05, 1, false, stream2);
+
+    //read x from device
+    base.readDataAsync(x_ptr, x, numpart, stream2);
+
+    //sync and wait till all tasks and reads are complete
+    base.syncDevice();
+  }
+  gettimeofday(&timeEnd, NULL);
+
+  std::cout << "end integration" << std::endl;
+  double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 +
+               (timeEnd.tv_usec - timeStart.tv_usec));
+
+  std::cout << "Time for " << loop << " integrations: " << t * 1e-6 << "s" << std::endl;
+  std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl;
+
+  checkSum(r, numpart);
+  checkSum(x, numpart);
+
+
+
+  //free memory
+  base.freeMemory(r_ptr, numpart);
+  base.freeMemory(p_ptr, numpart);
+  base.freeMemory(x_ptr, numpart);
+  base.freeMemory(ori_ptr, 5);
+  base.freeMemory(dt_ptr, numpart);
+  base.freeMemory(ls_ptr, numpart);
+
+  //unregister host memory
+  base.unregisterHostMemory(r);
+  base.unregisterHostMemory(p);
+  base.unregisterHostMemory(x);
+  base.unregisterHostMemory(dt);
+  base.unregisterHostMemory(ls);
+
+  //free host memory
+  delete[] r;
+  delete[] x;
+  delete[] p;
+  delete[] dt;
+  delete[] ls;
+  delete[] ori;
+
+  cout << "==========================END TEST==========================" << endl;
+  return 0;
+
+}
diff --git a/test/testTranspose.cpp b/test/testTranspose.cpp
new file mode 100644
index 0000000..7d7b34c
--- /dev/null
+++ b/test/testTranspose.cpp
@@ -0,0 +1,76 @@
+#include
+#include
+#include
+
+#include "Utility/TimeStamp.h"
+#include "DKSBase.h"
+
+using namespace std;
+// Fill d with d[i] = (i, 0); the element count is N, N*N or N*N*N for dim 1, 2 or 3.
+void initData(complex *d, int N, int dim) {
+  // any dim other than 2 or 3 leaves the element count at N
+  int size = N;
+  if (dim == 2) size = N*N;
+  if (dim == 3) size = N*N*N;
+
+  for (int i = 0; i < size; i++)
+    d[i] = complex(i, 0);
+
+}
+// Print the real parts of d as NX blocks of NY rows, each row holding NZ tab-separated values.
+void printData(complex *d, int N, int dim) {
+  // k always spans N; the j and i extents collapse to 1 unless dim exceeds 1 and 2 respectively
+  int NZ = N;
+  int NY = (dim > 1) ? N : 1;
+  int NX = (dim > 2) ? N : 1;
+
+  for (int i = 0; i < NX; i++) {
+    for (int j = 0; j < NY; j++) {
+      for (int k = 0; k < NZ; k++) {
+        std::cout << d[i*N*N + j*N + k].real() << "\t";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+}
+// Uploads an N x N array (N = argv[1], default 4), calls callTranspose on it and prints the data before and after.
+int main(int argc, char *argv[]) {
+
+  int N = (argc > 1) ? atoi(argv[1]) : 4;
+  int dimN[3] = {N, N, 1};
+  int dim = 2;
+  int ndim = 1;
+  int size = dimN[0] * dimN[1] * dimN[2];
+
+  std::complex *hd_in = new std::complex[size];
+  std::complex *hd_out = new std::complex[size];
+  initData(hd_in, N, dim);
+  printData(hd_in, N, dim);
+
+  DKSBase base;
+  base.setAPI("OpenCL", 6);
+  base.setDevice("-gpu", 4);
+  base.initDevice();
+
+  int ierr;
+  void *mem_ptr;
+
+  mem_ptr = base.allocateMemory< std::complex >(size, ierr);
+  base.writeData< std::complex >(mem_ptr, hd_in, size);
+
+  base.callTranspose(mem_ptr, dimN, dim, ndim);
+
+  base.readData< std::complex >(mem_ptr, hd_out, size);
+  base.freeMemory< std::complex >(mem_ptr, size);
+
+  printData(hd_out, N, 2);
+
+  delete[] hd_in;
+  delete[] hd_out;
+
+  return 0;
+
+}