From 4fa529aaeaa65f3e0af23649bcc893d693a0cc35 Mon Sep 17 00:00:00 2001 From: Uldis Locans Date: Mon, 10 Oct 2016 14:49:32 +0200 Subject: [PATCH] snapshot of svn --- CMakeLists.txt | 174 +++ ReadMe.first | 82 ++ auto-tuning/CMakeLists.txt | 19 + auto-tuning/testChiSquareRT.cpp | 385 ++++++ auto-tuning/testChiSquareRTRandom.cpp | 450 ++++++ auto-tuning/testChiSquareRTUQTK.cpp | 618 +++++++++ auto-tuning/testSearch.cpp | 22 + cmake/DKSConfig.cmake.in | 4 + cmake/Modules/FindOpenCL.cmake | 139 ++ doc/refman.pdf | Bin 0 -> 147817 bytes run_tuning_tests.sh | 97 ++ src/Algorithms/CMakeLists.txt | 14 + src/Algorithms/ChiSquareRuntime.h | 158 +++ src/Algorithms/CollimatorPhysics.h | 47 + src/Algorithms/FFT.h | 43 + src/Algorithms/ImageReconstruction.h | 117 ++ src/AutoTuning/CMakeLists.txt | 21 + src/AutoTuning/DKSAutoTuning.cpp | 302 ++++ src/AutoTuning/DKSAutoTuning.h | 103 ++ src/AutoTuning/DKSAutoTuningTester.h | 33 + src/AutoTuning/DKSConfig.cpp | 163 +++ src/AutoTuning/DKSConfig.h | 69 + src/AutoTuning/DKSSearchStates.cpp | 233 ++++ src/AutoTuning/DKSSearchStates.h | 162 +++ src/CMakeLists.txt | 130 ++ src/CUDA/CMakeLists.txt | 35 + src/CUDA/CMakeListsLibcuda.txt | 25 + src/CUDA/CudaBase.cu | 386 ++++++ src/CUDA/CudaBase.cuh | 390 ++++++ src/CUDA/CudaChiSquare.cu | 287 ++++ src/CUDA/CudaChiSquare.cuh | 59 + src/CUDA/CudaChiSquareRuntime.cu | 313 +++++ src/CUDA/CudaChiSquareRuntime.cuh | 114 ++ src/CUDA/CudaCollimatorPhysics.cu | 728 ++++++++++ src/CUDA/CudaCollimatorPhysics.cuh | 155 +++ src/CUDA/CudaFFT.cu | 376 +++++ src/CUDA/CudaFFT.cuh | 88 ++ src/CUDA/CudaGreensFunction.cu | 469 +++++++ src/CUDA/CudaGreensFunction.cuh | 63 + src/CUDA/CudaImageReconstruction.cu | 1221 +++++++++++++++++ src/CUDA/CudaImageReconstruction.cuh | 118 ++ src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu | 316 +++++ src/DKSBase.cpp | 861 ++++++++++++ src/DKSBase.h | 1133 +++++++++++++++ src/DKSBaseMuSR.cpp | 196 +++ src/DKSBaseMuSR.h | 137 ++ src/DKSDefinitions.h | 71 + src/DKSDevice.cpp | 0 
src/DKSDevice.h | 37 + src/DKSImageReconstruction.cpp | 130 ++ src/DKSImageReconstruction.h | 120 ++ src/DKSStream.h | 24 + src/MIC/CMakeLists.txt | 25 + src/MIC/MICBase.cpp | 124 ++ src/MIC/MICBase.h | 244 ++++ src/MIC/MICChiSquare.cpp | 93 ++ src/MIC/MICChiSquare.h | 51 + src/MIC/MICCollimatorPhysics.cpp | 876 ++++++++++++ src/MIC/MICCollimatorPhysics.h | 68 + src/MIC/MICFFT.cpp | 210 +++ src/MIC/MICFFT.h | 79 ++ src/MIC/MICGreensFunction.cpp | 307 +++++ src/MIC/MICGreensFunction.hpp | 44 + src/MIC/MICMergeSort.h | 116 ++ src/OpenCL/CMakeLists.txt | 34 + src/OpenCL/OpenCLBase.cpp | 1132 +++++++++++++++ src/OpenCL/OpenCLBase.h | 303 ++++ src/OpenCL/OpenCLChiSquare.cpp | 157 +++ src/OpenCL/OpenCLChiSquare.h | 53 + src/OpenCL/OpenCLChiSquareRuntime.cpp | 316 +++++ src/OpenCL/OpenCLChiSquareRuntime.h | 103 ++ src/OpenCL/OpenCLCollimatorPhysics.cpp | 107 ++ src/OpenCL/OpenCLCollimatorPhysics.h | 85 ++ src/OpenCL/OpenCLFFT.cpp | 303 ++++ src/OpenCL/OpenCLFFT.h | 113 ++ src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl | 175 +++ .../OpenCLKernels/OpenCLChiSquareRuntime.cl | 344 +++++ .../OpenCLKernels/OpenCLCollimatorPhysics.cl | 362 +++++ src/OpenCL/OpenCLKernels/OpenCLFFT.cl | 181 +++ src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl | 214 +++ src/OpenCL/OpenCLKernels/OpenCLTranspose.cl | 41 + src/Utility/CMakeLists.txt | 18 + src/Utility/DKSTimer.cpp | 53 + src/Utility/DKSTimer.h | 59 + src/Utility/TimeStamp.cpp | 11 + src/Utility/TimeStamp.h | 14 + test/CMakeLists.txt | 84 ++ test/testChi.cpp | 141 ++ test/testChiSquare.cpp | 168 +++ test/testChiSquareRT.cpp | 193 +++ test/testCollimatorPhysics.cpp | 248 ++++ test/testCollimatorPhysicsMPI.cpp | 126 ++ test/testCollimatorPhysicsSoA.cpp | 250 ++++ test/testDKS.cpp | 15 + test/testFFT.cpp | 83 ++ test/testFFT3D.cpp | 159 +++ test/testFFT3DRC.cpp | 199 +++ test/testFFT3DRC_MIC.cpp | 220 +++ test/testFFT3DSO.cpp | 159 +++ test/testFFT3DTiming.cpp | 130 ++ test/testFFTAsync.cpp | 117 ++ test/testFFTSolver.cpp | 301 ++++ 
test/testFFTSolver_MIC.cpp | 319 +++++ test/testGather.cpp | 172 +++ test/testGatherAsync.cpp | 144 ++ test/testGatherAsync2.cpp | 205 +++ test/testGreens.cpp | 239 ++++ test/testImageReconstruction.cpp | 191 +++ test/testMIC.cpp | 51 + test/testMICOpenCL.cpp | 94 ++ test/testMICPush.cpp | 68 + test/testMPI.cpp | 89 ++ test/testMPIFFT.cpp | 91 ++ test/testMemObjects.cpp | 75 + test/testOffset.cpp | 73 + test/testOffsetMPI.cpp | 81 ++ test/testPush.cpp | 57 + test/testRCFFT.cpp | 168 +++ test/testStockFFT3D.cpp | 181 +++ test/testStockhamFFT.cpp | 107 ++ test/testTimeIntegration.cpp | 227 +++ test/testTranspose.cpp | 76 + 122 files changed, 23153 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 ReadMe.first create mode 100644 auto-tuning/CMakeLists.txt create mode 100644 auto-tuning/testChiSquareRT.cpp create mode 100644 auto-tuning/testChiSquareRTRandom.cpp create mode 100644 auto-tuning/testChiSquareRTUQTK.cpp create mode 100644 auto-tuning/testSearch.cpp create mode 100644 cmake/DKSConfig.cmake.in create mode 100644 cmake/Modules/FindOpenCL.cmake create mode 100644 doc/refman.pdf create mode 100755 run_tuning_tests.sh create mode 100644 src/Algorithms/CMakeLists.txt create mode 100644 src/Algorithms/ChiSquareRuntime.h create mode 100644 src/Algorithms/CollimatorPhysics.h create mode 100644 src/Algorithms/FFT.h create mode 100644 src/Algorithms/ImageReconstruction.h create mode 100644 src/AutoTuning/CMakeLists.txt create mode 100644 src/AutoTuning/DKSAutoTuning.cpp create mode 100644 src/AutoTuning/DKSAutoTuning.h create mode 100644 src/AutoTuning/DKSAutoTuningTester.h create mode 100644 src/AutoTuning/DKSConfig.cpp create mode 100644 src/AutoTuning/DKSConfig.h create mode 100644 src/AutoTuning/DKSSearchStates.cpp create mode 100644 src/AutoTuning/DKSSearchStates.h create mode 100644 src/CMakeLists.txt create mode 100644 src/CUDA/CMakeLists.txt create mode 100644 src/CUDA/CMakeListsLibcuda.txt create mode 100644 src/CUDA/CudaBase.cu create mode 
100644 src/CUDA/CudaBase.cuh create mode 100644 src/CUDA/CudaChiSquare.cu create mode 100644 src/CUDA/CudaChiSquare.cuh create mode 100644 src/CUDA/CudaChiSquareRuntime.cu create mode 100644 src/CUDA/CudaChiSquareRuntime.cuh create mode 100644 src/CUDA/CudaCollimatorPhysics.cu create mode 100644 src/CUDA/CudaCollimatorPhysics.cuh create mode 100644 src/CUDA/CudaFFT.cu create mode 100644 src/CUDA/CudaFFT.cuh create mode 100644 src/CUDA/CudaGreensFunction.cu create mode 100644 src/CUDA/CudaGreensFunction.cuh create mode 100644 src/CUDA/CudaImageReconstruction.cu create mode 100644 src/CUDA/CudaImageReconstruction.cuh create mode 100644 src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu create mode 100644 src/DKSBase.cpp create mode 100644 src/DKSBase.h create mode 100644 src/DKSBaseMuSR.cpp create mode 100644 src/DKSBaseMuSR.h create mode 100644 src/DKSDefinitions.h create mode 100644 src/DKSDevice.cpp create mode 100644 src/DKSDevice.h create mode 100644 src/DKSImageReconstruction.cpp create mode 100644 src/DKSImageReconstruction.h create mode 100644 src/DKSStream.h create mode 100644 src/MIC/CMakeLists.txt create mode 100644 src/MIC/MICBase.cpp create mode 100644 src/MIC/MICBase.h create mode 100644 src/MIC/MICChiSquare.cpp create mode 100644 src/MIC/MICChiSquare.h create mode 100644 src/MIC/MICCollimatorPhysics.cpp create mode 100644 src/MIC/MICCollimatorPhysics.h create mode 100644 src/MIC/MICFFT.cpp create mode 100644 src/MIC/MICFFT.h create mode 100644 src/MIC/MICGreensFunction.cpp create mode 100644 src/MIC/MICGreensFunction.hpp create mode 100644 src/MIC/MICMergeSort.h create mode 100644 src/OpenCL/CMakeLists.txt create mode 100644 src/OpenCL/OpenCLBase.cpp create mode 100644 src/OpenCL/OpenCLBase.h create mode 100644 src/OpenCL/OpenCLChiSquare.cpp create mode 100644 src/OpenCL/OpenCLChiSquare.h create mode 100644 src/OpenCL/OpenCLChiSquareRuntime.cpp create mode 100644 src/OpenCL/OpenCLChiSquareRuntime.h create mode 100644 src/OpenCL/OpenCLCollimatorPhysics.cpp 
create mode 100644 src/OpenCL/OpenCLCollimatorPhysics.h create mode 100644 src/OpenCL/OpenCLFFT.cpp create mode 100644 src/OpenCL/OpenCLFFT.h create mode 100644 src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl create mode 100644 src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl create mode 100644 src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl create mode 100644 src/OpenCL/OpenCLKernels/OpenCLFFT.cl create mode 100644 src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl create mode 100644 src/OpenCL/OpenCLKernels/OpenCLTranspose.cl create mode 100644 src/Utility/CMakeLists.txt create mode 100644 src/Utility/DKSTimer.cpp create mode 100644 src/Utility/DKSTimer.h create mode 100644 src/Utility/TimeStamp.cpp create mode 100644 src/Utility/TimeStamp.h create mode 100644 test/CMakeLists.txt create mode 100644 test/testChi.cpp create mode 100644 test/testChiSquare.cpp create mode 100644 test/testChiSquareRT.cpp create mode 100644 test/testCollimatorPhysics.cpp create mode 100644 test/testCollimatorPhysicsMPI.cpp create mode 100644 test/testCollimatorPhysicsSoA.cpp create mode 100644 test/testDKS.cpp create mode 100644 test/testFFT.cpp create mode 100644 test/testFFT3D.cpp create mode 100644 test/testFFT3DRC.cpp create mode 100644 test/testFFT3DRC_MIC.cpp create mode 100644 test/testFFT3DSO.cpp create mode 100644 test/testFFT3DTiming.cpp create mode 100644 test/testFFTAsync.cpp create mode 100644 test/testFFTSolver.cpp create mode 100644 test/testFFTSolver_MIC.cpp create mode 100644 test/testGather.cpp create mode 100644 test/testGatherAsync.cpp create mode 100644 test/testGatherAsync2.cpp create mode 100644 test/testGreens.cpp create mode 100644 test/testImageReconstruction.cpp create mode 100644 test/testMIC.cpp create mode 100644 test/testMICOpenCL.cpp create mode 100644 test/testMICPush.cpp create mode 100644 test/testMPI.cpp create mode 100644 test/testMPIFFT.cpp create mode 100644 test/testMemObjects.cpp create mode 100644 test/testOffset.cpp create mode 100644 
test/testOffsetMPI.cpp create mode 100644 test/testPush.cpp create mode 100644 test/testRCFFT.cpp create mode 100644 test/testStockFFT3D.cpp create mode 100644 test/testStockhamFFT.cpp create mode 100644 test/testTimeIntegration.cpp create mode 100644 test/testTranspose.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..9c08e39 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,174 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 3.2) +PROJECT (DKS) +SET (DKS_VERSION_MAJOR 1) +SET (DKS_VERSION_MINOR 0.1) +SET (PACKAGE \"dks\") +SET (PACKAGE_BUGREPORT \"locagoons.uldis@psi.ch\") +SET (PACKAGE_NAME \"DKS\") +SET (PACKAGE_STRING \"DKS\ 1.0.1\") +SET (PACKAGE_TARNAME \"dks\") +SET (PACKAGE_VERSION \"1.0.1\") +SET (VERSION \"1.0.1\") + +SET (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + +#get compiler name +#STRING (REGEX REPLACE ".*/([A-Za-z]*)$" "\\1" COMPILER_NAME ${CMAKE_CXX_COMPILER}) +STRING (REGEX REPLACE ".*/" "" COMPILER_NAME ${CMAKE_CXX_COMPILER}) +MESSAGE (STATUS "Your compiler is: ${COMPILER_NAME}") +MESSAGE (STATUS "Your compiler is: ${CMAKE_CXX_COMPILER}") + +MESSAGE (STATUS "C compiler: ${CMAKE_C_COMPILER_ID}") +MESSAGE (STATUS "CXX compiler: ${CMAKE_CXX_COMPILER_ID}") + +#opencl and cuda kernel files are in the builds include directory +SET (OPENCL_KERNELS -DOPENCL_KERNELS=\\"${CMAKE_INSTALL_PREFIX}/include/\\") +MESSAGE (STATUS "OpenCL kernel files: ${OPENCL_KERNELS}") + +#find boost +set (BOOSTROOT $ENV{BOOST_DIR}) +SET (Boost_USE_STATIC_LIBS OFF) +SET (Boost_USE_STATIC_RUNTIME OFF) +FIND_PACKAGE(Boost 1.55.0 REQUIRED COMPONENTS filesystem system) +IF (Boost_FOUND) + MESSAGE (STATUS "Found boost include dir: ${Boost_INCLUDE_DIRS}") + MESSAGE (STATUS "Found boost library dir: ${Boost_LIBRARY_DIRS}") + MESSAGE (STATUS "Found boost libraries: ${Boost_LIBRARIES}") + INCLUDE_DIRECTORIES (${Boost_INCLUDE_DIRS}) + LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) +ENDIF (Boost_FOUND) + +#enable UQTK +OPTION (USE_UQTK 
"Use UQTK" OFF) + + +#intel icpc compiler specific flags +IF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) + + #for intel compiler turn on openmp and opencl + OPTION (USE_OPENCL "Use OpenCL" ON) + OPTION (USE_CUDA "Use CUDA" OFF) + OPTION (USE_MIC "Use intel MIC" ON) + + #find xiar and xild and set flags for offload build on mic + FIND_PROGRAM(XIAR xiar) + IF(XIAR) + MESSAGE(STATUS "xiar found: ${XIAR}") + SET(CMAKE_AR "${XIAR}") + ENDIF(XIAR) + MARK_AS_ADVANCED(XIAR) + SET(CMAKE_CXX_ARCHIVE_CREATE " rcs -qoffload-build ") + SET(CMAKE_C_ARCHIVE_CREATE " rcs -qoffload-build ") + + FIND_PROGRAM(XILD xild) + IF(XILD) + SET(CMAKE_LINKER "${XILD}") + ENDIF(XILD) + MARK_AS_ADVANCED(XILD) + + #set flags for openmp and opencl + #TODO: check which opencl to use: nvidia, amd, intel, apple + SET (CMAKE_CXX_FLAGS "-DDEBUG -O3 -Wall -offload -mkl -openmp -lOpenCL -lpthread -DDKS_MIC -DDKS_OPENCL -qopt-report=5 -qopt-report-phase=vec -std=c++11") + + IF (${COMPILER_NAME} STREQUAL "mpicxx" OR ${COMPILER_NAME} STREQUAL "mpiicpc") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI") + ENDIF (${COMPILER_NAME} STREQUAL "mpicxx" OR ${COMPILER_NAME} STREQUAL "mpiicpc") + +ENDIF (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR USE_INTEL) + +#gnu copmpiler specific flags +IF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) + + + OPTION (USE_OPENCL "Use OpenCL" ON) + OPTION (USE_CUDA "Use CUDA" OFF) + OPTION (USE_MIC "Use intel MIC" OFF) + + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG -O3 -Wall -fopenmp -std=c++11 -D__wsu") + + FIND_PACKAGE(CUDA) + IF (CUDA_FOUND) + SET (USE_CUDA ON) + INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) + LINK_DIRECTORIES(${CUDA_TOOLKIT_ROOT_DIR}/lib64) + + MESSAGE (STATUS "cuda include: ${CUDA_INCLUDE_DIRS}") + MESSAGE (STATUS "cuda libs: ${CUDA_TOOLKIT_ROOT_DIR}/lib64") + MESSAGE (STATUS "cuda version: ${CUDA_VERSION}") + + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lcudart -lcufft -lcublas 
-lnvToolsExt -DDKS_CUDA") + SET (CUDA_NVCC_FLAGS "-arch=sm_35 -DDEBUG -lcufft -lcublas -lcudart -fmad=false") + + SET (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${OPENCL_KERNELS}") + + #if cuda version >= 7.0 add runtime commpilation flags + IF (NOT CUDA_VERSION VERSION_LESS "7.0") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lnvrtc -lcuda") + ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") + + MESSAGE (STATUS "nvcc flags: ${CUDA_NVCC_FLAGS}") + + SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) + #set(CUDA_SEPARABLE_COMPILATION ON) + SET(BUILD_SHARED_LIBS OFF) + + ENDIF (CUDA_FOUND) + + IF (NOT CUDA_FOUND) + + MESSAGE(STATUS "CUDA not found, looking for OpenCL") + + FIND_PACKAGE(OpenCL) + IF (OpenCL_FOUND) + MESSAGE(STATUS "OpenCL version : ${OpenCL_VERSION_STRING}") + MESSAGE(STATUS "OpenCL include dir: ${OpenCL_INCLUDE_DIR}") + MESSAGE(STATUS "OpenCL library dir: ${OpenCL_LIBRARY}") + INCLUDE_DIRECTORIES(${OpenCL_INCLUDE_DIR}) + LINK_DIRECTORIES(${OpenCL_LIBRARY}) + ENDIF (OpenCL_FOUND) + + ENDIF (NOT CUDA_FOUND) + + #if mac OS and no CUDA set apple opencl flags + IF (APPLE AND NOT CUDA_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -framework opencl -lpthread -DDKS_OPENCL") + ENDIF(APPLE AND NOT CUDA_FOUND) + + #if cuda found set cuda opencl flags + IF (CUDA_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL") + ENDIF (CUDA_FOUND) + + #if cuda not found but amd opencl found set opencl flags + IF (NOT CUDA_FOUND AND OpenCL_FOUND) + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL -lpthread -DDKS_OPENCL") + ENDIF(NOT CUDA_FOUND AND OpenCL_FOUND) + + #if mpi compiler used set mpi flag + IF (${COMPILER_NAME} STREQUAL "mpicxx") + SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDKS_MPI") + ENDIF (${COMPILER_NAME} STREQUAL "mpicxx") + +ENDIF ( (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") AND NOT USE_INTEL) + +SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENCL_KERNELS}") +MESSAGE (STATUS "Compiler flags: 
${CMAKE_CXX_FLAGS}") + +ADD_SUBDIRECTORY (src) + +IF (ENABLE_TESTS) + ADD_SUBDIRECTORY (test) +ENDIF (ENABLE_TESTS) + +ADD_SUBDIRECTORY (auto-tuning) + +### write configure files ### +CONFIGURE_FILE ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake ) + +### install files ### +INSTALL ( + FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config_install.cmake + DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PROJECT_NAME}" + RENAME ${PROJECT_NAME}Config.cmake + ) diff --git a/ReadMe.first b/ReadMe.first new file mode 100644 index 0000000..e781b63 --- /dev/null +++ b/ReadMe.first @@ -0,0 +1,82 @@ +################################################################## +# +# Name: Dynamic Kernel Scheduler +# Version: 1.0 +# Author: Uldis Locans +# Contacts: locans.uldis@psi.ch +# +################################################################## + +Dynamic Kernel Scheduler is a library that provides a software layer between host application +and hardware accelerators. DKS handles communication between host and device and schedules task +execution using predefined algorithms writen using CUDA and OpenCL for GPUs, and OpenMP with +offload pragmas for IntelMIC. See DKSBase class documentation for full list of functions provided +by DKS. 
+ +#####Requirements##### + +OpenMPI (Cuda aware OpenMPI enabled for full compatibility) +g++ or icpc compiler +Cuda 7.0 or higher (optional) +Nvidia or Intel OpenCL SDK (optional) +Intel MIC compilers (optional) + + +######Install###### + +#check out DKS +svn co svn+ssh://YOULOGIN@savannah02.psi.ch/repos/amas/users/adelmann/Ph.D-students/Locans/work/DKS/trunk DKS + +#set compilers to use +#supported c++ compilers: g++, icpc, mpicxx with g++ +#supported c compilers: gcc, icc, mpicc with gcc +export CXX_COMPILER=cpp_compiler_name +export CC_COMPILER=c_compiler_name + +#set dks root directory +cd DKS +export DKS_ROOT=$PWD + +#set build directory +mkdir $DKS_BUILD_DIR +cd $DKS_BUILD_DIR + +#set install directory +export DKS_INSTALL_DIR=$DKS_BUILD_DIR #default is /usr/local/ + +CXX=$CXX_COMPILER CC=$CC_COMPILER cmake -DCMAKE_INSTALL_PREFIX=$DKS_BUILD_DIR $DKS_ROOT + +make +make install + + +######DKS usage###### +Make install copies the include files and library files to $DKS_BUILD_DIR/build folder, lib folder +in the build directory contains libdks.a and libdksshared.so, one of these libraries can be used to link +with DKS. All the necessary include files are located in $DKS_BUILD_DIR/build/include. + +Additional flags needed for CUDA and OpenCL mode: +-lcudart -lcufft -lcublas -lnvToolsExt -lOpenCL -lnvrtc -lcuda -DDKS_CUDA -DDKS_OPENCL + +Additional flags needed for IntelMIC and OpenCL mode: +-offload -mkl -openmp -lOpenCL -DDKS_MIC -DDKS_OPENCL + +Note: always run make install, during runtime OpenCL and CUDA will search for kernel files in +$DKS_INSTALL_DIR/build/include directory for runtime compilation. 
+ +######Running DKS###### + +#running with cuda +#nvidia multi process service started for better CUDA and MPI execution + +#to start mps service (if multiple users use DKS start MPS as root) +nvidia-cuda-mps-control -d +#to stop mps service +echo quit | nvidia-cuda-mps-control + + +#runnign dks with MIC +#Intel Manycore Platform Software Stack (mpss) service started + +#to start mpss +service mpss start diff --git a/auto-tuning/CMakeLists.txt b/auto-tuning/CMakeLists.txt new file mode 100644 index 0000000..e3be789 --- /dev/null +++ b/auto-tuning/CMakeLists.txt @@ -0,0 +1,19 @@ +INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) +LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +#chi square kernel tests +ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) +TARGET_LINK_LIBRARIES(testChiSquareRT dks ${Boost_LIBRARIES}) + +ADD_EXECUTABLE(testChiSquareRTRandom testChiSquareRTRandom.cpp) +TARGET_LINK_LIBRARIES(testChiSquareRTRandom dks ${Boost_LIBRARIES}) + +IF (USE_UQTK) + ADD_EXECUTABLE(testChiSquareRTUQTK testChiSquareRTUQTK.cpp) + TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES} lreg UQTk quad bcs uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) +ENDIF (USE_UQTK) +#TARGET_LINK_LIBRARIES(testChiSquareRTUQTK dks ${Boost_LIBRARIES}) + +#test to verify search functions +ADD_EXECUTABLE(testSearch testSearch.cpp) +TARGET_LINK_LIBRARIES(testSearch dks ${Boost_LIBRARIES}) diff --git a/auto-tuning/testChiSquareRT.cpp b/auto-tuning/testChiSquareRT.cpp new file mode 100644 index 0000000..01e4ae0 --- /dev/null +++ b/auto-tuning/testChiSquareRT.cpp @@ -0,0 +1,385 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +#define N0 0.25 +#define TAU 2.197019 +#define BKG 1.0 + +#define ALPHA 1.0 +#define BETA 1.0 + +using namespace std; + +void randData(double 
*data, int N, int scale = 1) { + for (int i = 0; i < N; i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. + */ +double se(double t, double lamda) { + return exp( -lamda*t ); +} + +double ge(double t, double lamda, double beta) { + return exp( -pow(lamda*t, beta) ); +} + +double sg(double t, double sigma) { + return exp( -0.5 * pow(sigma*t, 2) ); +} + +double stg(double t, double sigma) { + double sigmatsq = pow(sigma*t,2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double t, double lambda) { + double lambdat = lambda*t; + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double t, double lambda, double sigma) { + double lambdat = lambda*t; + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double t, double sigma, double beta) { + if (beta < 1.0e-3) + return 0.0; + double sigmatb = pow(sigma*t, beta); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta); +} + +double spg(double t, double lambda, double gamma, double q) { + double lam2 = lambda*lambda; + double lamt2q = t*t*lam2*q; + double rate2 = 4.0*lam2*(1.0-q)*t/gamma; + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double t, double nu, double lambda) { + double nut = nu*t; + double nuth = nu*t/2.0; + double lamt = lambda*t; + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + +double tf(double t, double phi, double nu) { + double tmp_nu = TWO_PI*nu*t; + double tmp_phi = DEG_TO_RAD * phi; + + return 
cos(tmp_nu + tmp_phi); +} + +double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + + return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +double b(double t, double phi, double nu) { + return j0(TWO_PI*nu*t + DEG_TO_RAD*phi); +} + +double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) { + double wt = TWO_PI * nu * t; + double ph = DEG_TO_RAD * phi; + + return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t); +} + +double ab(double t, double sigma, double gamma) { + double gt = gamma*t; + + return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double t, double Delta0, double Rb) { + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double t, double phi, double nu, double Delta0, double Rb) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double D0t2 = pow(Delta0*t, 2.0); + double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double t, double Delta0, double Rb, double nuc) { + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa); +} + +double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) { + double wt = TWO_PI*nu*t; + double ph = DEG_TO_RAD*phi; + double nuct = nuc*t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta); + + return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph); +} + + +double cpuChiSq(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, + double timeStart, double timeStep, bool mlh = false) +{ + + double result = 0.0; + for (int i = 0; i < 
Ndata; i++) { + + double t = timeStart + i*timeStep; + double d = data[i]; + double e = data[i]; + + double fTheory = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]); + double theo = N0 * exp(-t/TAU) * (1.0 + fTheory) + BKG; + + if (mlh) { + if ((d > 1.0e-9) && (fabs(theo) > 1.0e-9)) + result += 2.0 * ((theo - d) + d * log(d / theo)); + else + result += 2.0 * (theo - d); + } else { + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + } + } + + return result; +} + +double cpuChiSqAsym(double *data, double *p, double *f, int Ndata, int Npar, int Nfnc, + double timeStart, double timeStep, bool mlh = false) +{ + + double result = 0.0; + for (int i = 0; i < Ndata; i++) { + + double t = timeStart + i*timeStep; + double d = data[i]; + double e = data[i]; + + double theoVal = p[0] * f[0] * sg(t, p[1]) * tf(t, p[2], f[1]); + double ab = ALPHA * BETA; + + + double theo = ((ab+1.0)*theoVal - (ALPHA-1.0))/((ALPHA+1.0) - (ab-1.0)*theoVal); + + if (mlh) { + result += 0.0; //log max likelihood not defined here + } else { + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + } + } + + return result; +} + +int runTest(const char *api_name, const char *device_name, bool autotune, bool mlh, bool asym) { + + int ierr; + + /* + * Histogram size used in tests. If autotune run kernes with sizes from 1e5 to 1e6. + * If autotune is off just run the test once (used for debuging to test the kernel) + */ + int Nstart = 1e5; + int Nstep = 1e5; + int Nend = (autotune) ? 
1e6 : 1e5; + + //parameter, function and map sizes used in tests + int Npar = 66; + int Nfnc = 2; + int Nmap = 5; + + //print test info + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Max log likelihood: " << std::boolalpha << mlh << endl; + cout << "Asymetry fit: " << std::boolalpha << asym << endl; + + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + ierr = dksbase.initDevice(); + if (ierr != DKS_SUCCESS) { + std::cout << "Device not supported!" << std::endl; + return DKS_ERROR; + } + + //get the list of different devices + std::vector devices; + dksbase.getDeviceList(devices); + std::cout << "Unique devices: " << devices.size() << std::endl; + + //create the function string to use in test + string sFnc = "p[m[0]] * f[m[1]] * sg(t, p[m[2]]) * tf(t, p[m[3]], f[m[4]])"; + int map[5] = {0, 0, 1, 2, 1}; + + //runt tests from 100k to 1mil data points + for (unsigned int device = 0; device < devices.size(); device++) { + for (int Ndata = Nstart; Ndata <= Nend; Ndata += Nstep) { + + dksbase.setDefaultDevice(device); + + std::cout << "Ndata: " << Ndata << std::endl; + + //init the chi square calculations + dksbase.initChiSquare(Ndata, Npar, Nfnc, Nmap); + + //create random arrays for data, parameter and function storage + double *data = new double[Ndata]; + double *par = new double[Npar]; + double *fnc = new double[Nfnc]; + + randData(data, Ndata); + randData(par, Npar); + randData(fnc, Nfnc, 100); + + //allocate memory on device + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + + //write data, params, functions and maps to the device + dksbase.writeData(data_ptr, data, Ndata); + dksbase.writeParams(par, Npar); + dksbase.writeFunctions(fnc, Nfnc); + dksbase.writeMaps(map, Nmap); + + //set musrfit constants + dksbase.callSetConsts(N0, TAU, BKG); + dksbase.callSetConsts(ALPHA, BETA); + + //compile the program created 
with the function string + dksbase.callCompileProgram(sFnc, mlh); + + //set autotuning on/off + if (autotune) + dksbase.setAutoTuningOn(); + + //tmp values to store results and tmp values for time steps and start time + double result_gpu = 0.0; + double result_cpu = 0.0; + double dt = 1e-12; + double ts = 1e-7; + + //execute kernel on the GPU and execute the same function on the cpu + if (!asym) { + dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, Npar, Nfnc, + Nmap, ts, dt, result_gpu); + result_cpu = cpuChiSq(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh); + } else { + dksbase.callLaunchChiSquare(2, data_ptr, data_ptr, Ndata, Npar, Nfnc, + Nmap, ts, dt, result_gpu); + result_cpu = cpuChiSqAsym(data, par, fnc, Ndata, Npar, Nfnc, ts, dt, mlh); + } + + //check the results + cout << "DKS: " << result_gpu << endl; + cout << "CPU: " << result_cpu << endl; + + //free CPU and GPU memory + dksbase.freeMemory(data_ptr, Ndata); + dksbase.freeChiSquare(); + + delete[] data; + delete[] par; + delete[] fnc; + cout << "------------------------------------------------------------" << endl; + } + } + + return DKS_SUCCESS; +} + +int main(int argc, char* argv[]) { + + bool asym = false; + bool mlh = false; + bool autotune = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-mlh")) + mlh = true; + + if (argv[i] == string("-asym")) + asym = true; + + if (argv[i] == string("-autotune")) + autotune = true; + + } + + int 
numPlatforms = 2; + const char *api[] = {"Cuda","OpenCL","OpenCL","OpenCL","OpenMP"}; + const char *device[] = {"-gpu","-gpu","-cpu","-mic","-mic"}; + + for (int i = 0; i < numPlatforms; i++) { + runTest(api[i], device[i], autotune, mlh, asym); + } + + return 0; +} diff --git a/auto-tuning/testChiSquareRTRandom.cpp b/auto-tuning/testChiSquareRTRandom.cpp new file mode 100644 index 0000000..b9e9b53 --- /dev/null +++ b/auto-tuning/testChiSquareRTRandom.cpp @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +//#define N0 0.25 +#define N0 1e-10 +#define TAU 2.197019 +#define BKG 0.05 + +using namespace std; + +typedef std::function doubleF; + +void randData(double *data, int N, int scale = 1) { + for (int i = 0; i < N; i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. 
+ */ +double se(double *t, double *lamda) { + return exp( -*lamda**t ); +} + +double ge(double *t, double *lamda, double *beta) { + return exp( -pow( (*lamda)*(*t), *beta) ); +} + +double sg(double *t, double *sigma) { + return exp( -0.5 * pow((*sigma)*(*t), 2) ); +} + +double stg(double *t, double *sigma) { + double sigmatsq = pow((*sigma)*(*t),2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double *t, double *lambda) { + double lambdat = *lambda*(*t); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double *t, double *lambda, double *sigma) { + double lambdat = *lambda*(*t); + double sigmatsq = pow(*sigma*(*t), 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double *t, double *sigma, double *beta) { + if (*beta < 1.0e-3) + return 0.0; + double sigmatb = pow(*sigma*(*t), (*beta)); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta)); +} + +double spg(double *t, double *lambda, double *gamma, double *q) { + double lam2 = (*lambda)*(*lambda); + double lamt2q = (*t)*(*t)*lam2*(*q); + double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma); + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double *t, double *nu, double *lambda) { + double nut = *nu*(*t); + double nuth = *nu*(*t)/2.0; + double lamt = *lambda*(*t); + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + +double tf(double *t, double *phi, double *nu) { + double tmp_nu = TWO_PI**nu**t; + double tmp_phi = DEG_TO_RAD * *phi; + + return cos(tmp_nu + tmp_phi); +} + +double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + + return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + 
(1.0-*alpha)*exp(-*lambdaL**t); +} + +double b(double *t, double *phi, double *nu) { + return j0(TWO_PI**nu**t + DEG_TO_RAD**phi); +} + +double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI * *nu * *t; + double ph = DEG_TO_RAD * *phi; + + return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double ab(double *t, double *sigma, double *gamma) { + double gt = *gamma**t; + + return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double *t, double *Delta0, double *Rb) { + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) { + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa); +} + +double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph); +} + +double evalf(std::vector< std::pair > func) { + + double result = 0.0; + for (auto f : func) { + switch (f.first) { + case 0: result += f.second(); break; + case 1: result -= f.second(); break; + default: result += f.second(); break; + } + } + + return result; +} + +double cpuChiSq(double *data, std::vector< std::pair > &func, int ndata, 
double *t, double dt) { + + double result = 0.0; + double ts = *t; + + for (int i = 0; i < ndata; i++) { + + *t = ts + i*dt; + double d = data[i]; + double e = data[i]; + + double vf = evalf(func); + double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG; + + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e*e); + else + result += theo * theo; + + } + return result; +} + +//create a random length from 50 - 1000 array and fill with random values from 0 to 1 +void randomParams(double *p, int np) { + for (int i = 0; i < np; i++) + p[i] = (double)rand() / RAND_MAX; +} + +//create map array of random size and fill with indexes from 0 to max, max < size of param array +void randomMaps(int *m, int nm, int max) { + for (int i = 0; i < nm; i++) + m[i] = rand() % max; +} + +int generateRandomFunction(std::vector< std::pair > &func, std::string &sfunc, + double *t, double *p, int *m, int np, int nm) +{ + + //nf defines the number of functions to generate (from 1 to 25) + int nf = rand() % 25 + 1; + + for (int n = 0; n < nf; n++) { + std::string sf = ""; + doubleF f; + + int r = rand() % 18; //choose random function to use + + int id1 = rand() % nm; + int id2 = rand() % nm; + int id3 = rand() % nm; + int id4 = rand() % nm; + int id5 = rand() % nm; + + std::string p1 = "p[m[" + to_string(id1) + "]])"; + std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])"; + std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]])"; + std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]])"; + std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])"; + + //get a random index from maps and use it to get the parameter value, bind function and parameter + //values to f, and create string for gpu in sfunc + switch (r) { 
+ case 0: + f = std::bind(se, t, &p[m[id1]]); + sf = "se(t," + p1; + break; + case 1: + f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]); + sf = "ge(t," + p2; + break; + case 2: + f = std::bind(sg, t, &p[m[id1]]); + sf = "sg(t, " + p1; + break; + case 3: + f = std::bind(stg, t, &p[m[id1]]); + sf = "stg(t, " + p1; + break; + case 4: + f = std::bind(sekt, t, &p[m[id1]]); + sf = "sekt(t, " + p1; + break; + case 5: + f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]); + sf = "lgkt(t, " + p2; + break; + case 6: + f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]); + sf = "skt(t, " + p2; + break; + case 7: + f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "spg(t, " + p3; + break; + case 8: + f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]); + sf = "rahf(t, " + p2; + break; + case 9: + f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]); + sf = "tf(t, " + p2; + break; + case 10: + f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ifld(t, " + p5; + break; + case 11: + f = std::bind(b, t, &p[m[id1]], &p[m[id2]]); + sf = "b(t, " + p2; + break; + case 12: + f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ib(t, " + p5; + break; + case 13: + f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]); + sf = "ab(t, " + p2; + break; + case 14: + f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]); + sf = "snkzf(t, " + p2; + break; + case 15: + f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]); + sf = "snktf(t, " + p4; + break; + case 16: + f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "dnkzf(t, " + p3; + break; + case 17: + f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "dnktf(t, " + p5; + break; + } + + + int sign = rand() % 2; + if (n == 0) sign = 0; + func.push_back( std::make_pair(sign, f) ); + if (n == 0) + sfunc = sf; + else { + switch(sign) { + case 0: sfunc += " + " + sf; break; + case 1: sfunc += " - " + sf; break; + 
default: sfunc += " + " + sf; break; + } + + } + } + + return nf; +} + +int main(int argc, char *argv[]) { + + + srand(time(NULL)); + + int ierr; + int Ndata = 1e6; + + bool autotune = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-autotune")) { + autotune = true; + } + + } + + //create a random number of parameters + int np = ( rand() % (1000 - 50) ) + 50; + int nm = ( rand() % (50 - 5) ) + 5; + int nf = ( rand() % (50 - 5) ) + 5; + + int *m = new int[nm]; + double *p = new double[np]; + double *f = new double[nf]; + + randomParams(p, np); + randomMaps(m, nm, np); + randomParams(f, nf); + + double dt = 1e-10; + double t = 1e-10; + std::vector< std::pair > func; + std::string sfunc; + int nfunc = generateRandomFunction(func, sfunc, &t, p, m, np, nm); + + //create DKS base object, set and init device / framework + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + + dksbase.initDevice(); + dksbase.initChiSquare(Ndata, np, nf, nm); + + dksbase.writeParams(p, np); + dksbase.writeFunctions(f, nf); + dksbase.writeMaps(m, nm); + + dksbase.callSetConsts(N0, TAU, BKG); + + dksbase.callCompileProgram(sfunc); + + if (autotune) + dksbase.setAutoTuningOn(); + + int oper = 0; + dksbase.getOperations(oper); + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of params: " << 
np << endl; + cout << "Number of maps: " << nm << endl; + cout << "Number of predefined functions: " << nfunc << endl; + cout << "Number of ptx instructions: " << oper << endl; + cout << "------------------------------------------------------------" << endl; + cout << sfunc << endl; + cout << "------------------------------------------------------------" << endl; + + //allocate memory on host and device device + double *data = new double[Ndata]; + randomParams(data, Ndata); + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + dksbase.writeData(data_ptr, data, Ndata); + + for (int N = 1e5; N < Ndata + 1; N += 1e5) { + double result_dks, result_cpu; + + t = 1e-10; + + dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, N, np, nf, nm, t, dt, result_dks); + result_cpu = cpuChiSq(data, func, N, &t, dt); + + cout << "Npart: " << N << endl; + cout << "DKS: " << result_dks << endl; + cout << "CPU: " << result_cpu << endl; + + } + + dksbase.freeMemory(data_ptr, Ndata); + dksbase.freeChiSquare(); + delete[] data; + delete[] p; + delete[] f; + delete[] m; + + return 0; +} diff --git a/auto-tuning/testChiSquareRTUQTK.cpp b/auto-tuning/testChiSquareRTUQTK.cpp new file mode 100644 index 0000000..c8602fc --- /dev/null +++ b/auto-tuning/testChiSquareRTUQTK.cpp @@ -0,0 +1,618 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +#include "Array1D.h" +#include "Array2D.h" +#include "Array3D.h" +#include "error_handlers.h" +#include "PCSet.h" +#include "fast_laplace.h" +#include "uqtktools.h" +#include "lreg.h" + +#define PI 3.14159265358979323846 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +//#define N0 0.25 +#define N0 1e-10 +#define TAU 2.197019 +#define BKG 0.05 + +using namespace std; + +typedef std::function doubleF; + +void randData(double *data, int N, int scale = 1) { + for (int i = 0; i < N; 
i++) + data[i] = ((double)rand() / RAND_MAX ) * scale; +} + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. + */ +double se(double *t, double *lamda) { + return exp( -*lamda**t ); +} +//math func + math oper + memory loads +//1 + 1 + 2 + + +double ge(double *t, double *lamda, double *beta) { + return exp( -pow( (*lamda)*(*t), *beta) ); +} +//2 + 1 + 3 + +double sg(double *t, double *sigma) { + return exp( -0.5 * pow((*sigma)*(*t), 2) ); +} +//2 + 2 + 2 + +double stg(double *t, double *sigma) { + double sigmatsq = pow((*sigma)*(*t),2); + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq); +} + +double sekt(double *t, double *lambda) { + double lambdat = *lambda*(*t); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +double lgkt(double *t, double *lambda, double *sigma) { + double lambdat = *lambda*(*t); + double sigmatsq = pow(*sigma*(*t), 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +double skt(double *t, double *sigma, double *beta) { + if (*beta < 1.0e-3) + return 0.0; + double sigmatb = pow(*sigma*(*t), (*beta)); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/(*beta)); +} + +double spg(double *t, double *lambda, double *gamma, double *q) { + double lam2 = (*lambda)*(*lambda); + double lamt2q = (*t)*(*t)*lam2*(*q); + double rate2 = 4.0*lam2*(1.0-*q)*(*t)/(*gamma); + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +double rahf(double *t, double *nu, double *lambda) { + double nut = *nu*(*t); + double nuth = *nu*(*t)/2.0; + double lamt = *lambda*(*t); + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt)); +} + 
+double tf(double *t, double *phi, double *nu) { + double tmp_nu = TWO_PI**nu**t; + double tmp_phi = DEG_TO_RAD * *phi; + + return cos(tmp_nu + tmp_phi); +} + +double ifld(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + + return *alpha*cos(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double b(double *t, double *phi, double *nu) { + return j0(TWO_PI**nu**t + DEG_TO_RAD**phi); +} + +double ib(double *t, double *alpha, double *phi, double *nu, double *lambdaT, double *lambdaL) { + double wt = TWO_PI * *nu * *t; + double ph = DEG_TO_RAD * *phi; + + return *alpha*j0(wt+ph)*exp(-*lambdaT**t) + (1.0-*alpha)*exp(-*lambdaL**t); +} + +double ab(double *t, double *sigma, double *gamma) { + double gt = *gamma**t; + + return exp(-pow(*sigma/(*gamma),2.0)*(exp(-gt) - 1.0 + gt)); +} + +double snkzf(double *t, double *Delta0, double *Rb) { + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa); +} + +double snktf(double *t, double *phi, double *nu, double *Delta0, double *Rb) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double D0t2 = pow(*Delta0**t, 2.0); + double aa = 1.0/(1.0+pow(*Rb,2.0)*D0t2); + + return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph); +} + +double dnkzf(double *t, double *Delta0, double *Rb, double *nuc) { + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+4.0*pow(*Rb**Delta0,2.0)*theta); + + return sqrt(aa)*exp(-2.0**Delta0**Delta0*theta*aa); +} + +double dnktf(double *t, double *phi, double *nu, double *Delta0, double *Rb, double *nuc) { + double wt = TWO_PI**nu**t; + double ph = DEG_TO_RAD**phi; + double nuct = *nuc**t; + double theta = (exp(-nuct) - 1.0 -nuct)/pow(*nuc, 2.0); + double aa = 1.0/(1.0+2.0*pow(*Rb**Delta0,2.0)*theta); + + return 
sqrt(aa)*exp(-*Delta0**Delta0*theta*aa)*cos(wt+ph); +} + +double evalf(std::vector< std::pair > func) { + + double result = 0.0; + for (auto f : func) { + switch (f.first) { + case 0: result += f.second(); break; + case 1: result -= f.second(); break; + default: result += f.second(); break; + } + } + + return result; +} + +double cpuChiSq(double *data, std::vector< std::pair > &func, int ndata, double *t, double dt) { + + double result = 0.0; + double ts = *t; + + for (int i = 0; i < ndata; i++) { + + *t = ts + i*dt; + double d = data[i]; + double e = data[i]; + + double vf = evalf(func); + double theo = N0 * exp(-(*t)/TAU) * (1.0 + vf) + BKG; + + if (e != 0.0) + result += ( (theo - d) * (theo - d) ) / (e * e); + else + result += theo * theo; + + } + return result; +} + +//create a random length from 50 - 1000 array and fill with random values from 0 to 1 +void randomParams(double *p, int np) { + for (int i = 0; i < np; i++) + p[i] = (double)rand() / RAND_MAX; +} + +//create map array of random size and fill with indexes from 0 to max, max < size of param array +void randomMaps(int *m, int nm, int max) { + for (int i = 0; i < nm; i++) + m[i] = rand() % max; +} + +void generateRandomFunction(std::vector< std::pair > &func, std::string &sfunc, + double *t, double *p, int *m, int np, int nm, int nfunc) +{ + + for (int n = 0; n < nfunc; n++) { + std::string sf = ""; + doubleF f; + + int r = rand() % 18; //randomly choose one of the predefined functions to use + + int id1 = rand() % nm; //randomly select parameters to use in the function + int id2 = rand() % nm; + int id3 = rand() % nm; + int id4 = rand() % nm; + int id5 = rand() % nm; + + std::string p1 = "p[m[" + to_string(id1) + "]])"; + std::string p2 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]])"; + std::string p3 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]])"; + std::string p4 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + 
to_string(id3) + "]], p[m[" + to_string(id4) + "]])"; + std::string p5 = "p[m[" + to_string(id1) + "]], p[m[" + to_string(id2) + "]], p[m[" + + to_string(id3) + "]], p[m[" + to_string(id4) + "]], p[m[" + to_string(id5) + "]])"; + + //get a random index from maps and use it to get the parameter value, bind function and parameter + //values to f, and create string for gpu in sfunc + switch (r) { + case 0: + f = std::bind(se, t, &p[m[id1]]); + sf = "se(t," + p1; + break; + case 1: + f = std::bind(ge, t, &p[m[id1]], &p[m[id2]]); + sf = "ge(t," + p2; + break; + case 2: + f = std::bind(sg, t, &p[m[id1]]); + sf = "sg(t, " + p1; + break; + case 3: + f = std::bind(stg, t, &p[m[id1]]); + sf = "stg(t, " + p1; + break; + case 4: + f = std::bind(sekt, t, &p[m[id1]]); + sf = "sekt(t, " + p1; + break; + case 5: + f = std::bind(lgkt, t, &p[m[id1]], &p[m[id2]]); + sf = "lgkt(t, " + p2; + break; + case 6: + f = std::bind(skt, t, &p[m[id1]], &p[m[id2]]); + sf = "skt(t, " + p2; + break; + case 7: + f = std::bind(spg, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = "spg(t, " + p3; + break; + case 8: + f = std::bind(rahf, t, &p[m[id1]], &p[m[id2]]); + sf = "rahf(t, " + p2; + break; + case 9: + f = std::bind(tf, t, &p[m[id1]], &p[m[id2]]); + sf = "tf(t, " + p2; + break; + case 10: + f = std::bind(ifld, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ifld(t, " + p5; + break; + case 11: + f = std::bind(b, t, &p[m[id1]], &p[m[id2]]); + sf = "b(t, " + p2; + break; + case 12: + f = std::bind(ib, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "ib(t, " + p5; + break; + case 13: + f = std::bind(ab, t, &p[m[id1]], &p[m[id2]]); + sf = "ab(t, " + p2; + break; + case 14: + f = std::bind(snkzf, t, &p[m[id1]], &p[m[id2]]); + sf = "snkzf(t, " + p2; + break; + case 15: + f = std::bind(snktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]]); + sf = "snktf(t, " + p4; + break; + case 16: + f = std::bind(dnkzf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]]); + sf = 
"dnkzf(t, " + p3; + break; + case 17: + f = std::bind(dnktf, t, &p[m[id1]], &p[m[id2]], &p[m[id3]], &p[m[id4]], &p[m[id5]]); + sf = "dnktf(t, " + p5; + break; + } + + + int sign = rand() % 2; + if (n == 0) sign = 0; + func.push_back( std::make_pair(sign, f) ); + if (n == 0) + sfunc = sf; + else { + switch(sign) { + case 0: sfunc += " + " + sf; break; + case 1: sfunc += " - " + sf; break; + default: sfunc += " + " + sf; break; + } + + } + } +} + +int main(int argc, char *argv[]) { + + + srand(time(NULL)); + + bool autotune = false; + bool eval = false; + bool test = false; + + char *api_name = new char[10]; + char *device_name = new char[10]; + + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + int nord = 15; //the order of the initial, overcomplete basis + int loop = 100; + + for (int i = 1; i < argc; ++i) { + + if (argv[i] == string("-cuda")) { + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-opencl")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-cpu")) { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-cpu"); + } + + if (argv[i] == string("-autotune")) { + autotune = true; + } + + if (argv[i] == string("-eval")) + eval = true; + + if (argv[i] == string("-test")) + test = true; + + if (argv[i] == string("-nord")) + nord = atoi(argv[i+1]); + + if (argv[i] == string("-loop")) + loop = atoi(argv[i+1]); + + } + + //init dks and set chi^2 constants + DKSBaseMuSR dksbase; + dksbase.setAPI(api_name); + dksbase.setDevice(device_name); + dksbase.initDevice(); + + if (autotune) + dksbase.setAutoTuningOn(); + + int nydim = 2; //the dimensionality of input + int nxdim = 5; + //UQTk arrays + Array2D xdata(loop, nxdim, 0.0); + Array2D ydata(loop, nydim, 0.0); + + Array2D xdata_pce(loop, nxdim, 0.0); + Array2D ydata_pce(loop, nydim, 0.0); + + int size = 10000; + Array2D 
xtmp(size, nxdim, 0.0); + Array2D ytmp(size, nydim, 0.0); + + if (eval || test) { + for (int l = 0; l < loop; l++) { + + int ierr; + + //create a random number of parameters + int n = rand() % 9 + 1; + int Ndata = n * 100000; //number of data points 100k to 1milj, with 100k incr. + int np = ( rand() % (1000 - 50) ) + 50; //from 50 to 1000 for different shared memory needs + int nm = ( rand() % (50 - 5) ) + 5; //use 5 to 50 of the parameters, for different memory access + int nf = ( rand() % (50 - 5) ) + 5; //not used in the test case, but changes the shared memory + int nfunc = (rand() % (10 - 1) ) + 1; //1 to 10 user defined functions + + //allocate storage for parameters, maps and functions + int *m = new int[nm]; + double *p = new double[np]; + double *f = new double[nf]; + + //fill with random numbers + randomParams(p, np); + randomMaps(m, nm, np); + randomParams(f, nf); + + //create a random user function that can be passed to GPU kernel and evaluated on the host + double dt = 1e-10; + double t = 1e-10; + std::vector< std::pair > func; + std::string sfunc; + generateRandomFunction(func, sfunc, &t, p, m, np, nm, nfunc); + + //create a data array and fill with random values + double *data = new double[Ndata]; + randomParams(data, Ndata); + + + //allocate device memory for the data and transfer to the GPU + void *data_ptr = dksbase.allocateMemory(Ndata, ierr); + dksbase.writeData(data_ptr, data, Ndata); + + //init chi^2 + dksbase.initChiSquare(Ndata, np, nf, nm); + dksbase.callSetConsts(N0, TAU, BKG); + + //write params to the devic + dksbase.writeParams(p, np); + dksbase.writeFunctions(f, nf); + dksbase.writeMaps(m, nm); + + //compile the kernel with the new function + dksbase.callCompileProgram(sfunc); + + //run the kernel on the GPU and evaluate the function on the host + double result_dks, result_cpu, tmp_result; + + ierr = dksbase.callLaunchChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, + t, dt, result_dks); + + if (ierr == DKS_SUCCESS) { + result_cpu = 
cpuChiSq(data, func, Ndata, &t, dt); + + std::vector config; + dksbase.callAutoTuningChiSquare(1, data_ptr, data_ptr, Ndata, np, nf, nm, + t, dt, tmp_result, config); + + cout << "DKS: " << result_dks << endl; + cout << "CPU: " << result_cpu << endl; + cout << "Launch parameters: " << config[0] << ", " << config[1] << endl; + cout << sfunc << endl; + cout << "Kernel parameters: " << np << ", " << nm << ", " << nf << ", " << nfunc << endl; + + xdata(l,0) = np; + xdata(l,1) = nm; + xdata(l,2) = nf; + xdata(l,3) = nfunc; + xdata(l,4) = Ndata; + + ydata(l,0) = config[0]; + ydata(l,1) = config[1]; + + std::cout << std::endl << "Loop " << l + 1 << " finished" << std::endl << std::endl; + } else { + cout << "Created kernel failed! " << np << ", " << nm << ", " << nf << ", " << nfunc << endl; + cout << sfunc << endl; + } + + + //free temporary resources + delete[] m; + delete[] p; + delete[] f; + delete[] data; + dksbase.freeChiSquare(); + dksbase.freeMemory(data_ptr, Ndata); + } + } else { + //read_datafileVS(xdata, "xdata.dat"); + //read_datafileVS(ydata, "ydata.dat"); + xtmp.SetValue(0.0); + ytmp.SetValue(0.0); + read_datafileVS(xtmp, "xdata_pce.dat"); + read_datafileVS(ytmp, "ydata_pce.dat"); + for (int i = 0; i < loop; i++) { + for (int j = 0; j < nxdim; j++) + xdata(i,j) = xtmp(i,j); + for (int j = 0; j < nydim; j++) + ydata(i,j) = ytmp(i,j); + } + } + + + if (eval) { + for (int i = 0; i < nxdim; i++) { + for (int j = 0; j < loop; j++) { + xdata_pce(j,i) = xdata(j,i); + ydata_pce(j,i) = ydata(j,i); + } + } + + for (int i = 0; i < nydim; i++) { + for (int j = 0; j < loop; j++) { + xdata_pce(j,i) = xdata(j,i); + ydata_pce(j,i) = ydata(j,i); + } + } + } else { + //read_datafileVS(xdata_pce, "xdata_pce.dat"); + //read_datafileVS(ydata_pce, "ydata_pce.dat"); + xtmp.SetValue(0.0); + ytmp.SetValue(0.0); + read_datafileVS(xtmp, "xdata_pce.dat"); + read_datafileVS(ytmp, "ydata_pce.dat"); + for (int i = 0; i < loop; i++) { + for (int j = 0; j < nxdim; j++) + xdata_pce(i,j) = 
xtmp(i,j); + for (int j = 0; j < nydim; j++) + ydata_pce(i,j) = ytmp(i,j); + } + std::cout << "Built pce with " << xdata_pce.XSize() << " datapoints" << std::endl; + } + + //default input settings + string which_chaos="LU"; //PC type + string msc="m"; + + Lreg* reg; + reg = new PCreg(which_chaos,nord,nxdim); + int nbas = reg->GetNbas(); + + Array2D ypc_data(xdata.XSize(), nydim, 0.0); + for (int i = 0; i < nydim; i++) { + + std::cout << "start dim " << i+1 << std::endl; + + Array1D ydata_1d(xdata_pce.XSize(), 0.0); + for (unsigned int j = 0; j < xdata_pce.XSize(); j++) + ydata_1d(j) = ydata_pce(j,i); + + std::cout << "setup data" << std::endl; + reg->SetupData(xdata_pce,ydata_1d); + + std::cout << "Comput best lambda" << std::endl; + double lambda=reg->LSQ_computeBestLambda(); + Array1D lam(nbas,lambda); + + + reg->SetWeights(lam); + + std::cout << "LSQ build regr" << std::endl; + + reg->LSQ_BuildRegr(); + std::cout << std::endl << "Lambda : " << lambda << std::endl; + + Array1D ypc; + Array1D ycheck; + Array2D ycheck_cov; + + reg->EvalRegr(xdata,msc,ypc,ycheck,ycheck_cov); + std::cout << std::endl << "Eval" << std::endl; + + for (unsigned int j = 0; j < xdata.XSize(); j++) + ypc_data(j,i) = ypc(j); + + } + + if (eval) { + write_datafile(xdata_pce, "xdata_pce.dat"); + write_datafile(ydata_pce, "ydata_pce.dat"); + } + + write_datafile(xdata, "xdata.dat"); + write_datafile(ydata, "ydata.dat"); + write_datafile(ypc_data, "ypc_data.dat"); + + return 0; +} diff --git a/auto-tuning/testSearch.cpp b/auto-tuning/testSearch.cpp new file mode 100644 index 0000000..e3b8efe --- /dev/null +++ b/auto-tuning/testSearch.cpp @@ -0,0 +1,22 @@ +#include + +#include "DKSBaseMuSR.h" + +/** No accelerator device is used, this test is used to confirm, that search functions + * used for auto-tuning work properly + */ + +int main() { + + DKSBaseMuSR base; + + std::cout << "Start test" << std::endl; + + base.testAutoTuning(); + + std::cout << "Test finished" << std::endl; + + + + return 0; 
+} diff --git a/cmake/DKSConfig.cmake.in b/cmake/DKSConfig.cmake.in new file mode 100644 index 0000000..d764963 --- /dev/null +++ b/cmake/DKSConfig.cmake.in @@ -0,0 +1,4 @@ +SET(${PROJECT_NAME}_CMAKE_CXX_FLAGS "${${PROJECT_NAME}_CXX_FLAGS}") +SET(${PROJECT_NAME}_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include") +SET(${PROJECT_NAME}_LIBRARY_DIR "${CMAKE_INSTALL_PREFIX}/lib") +SET(${PROJECT_NAME}_LIBRARY "dks") \ No newline at end of file diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 0000000..c0b848e --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,139 @@ +#.rst: +# FindOpenCL +# ---------- +# +# Try to find OpenCL +# +# Once done this will define:: +# +# OpenCL_FOUND - True if OpenCL was found +# OpenCL_INCLUDE_DIRS - include directories for OpenCL +# OpenCL_LIBRARIES - link against this library to use OpenCL +# OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2) +# OpenCL_VERSION_MAJOR - The major version of the OpenCL implementation +# OpenCL_VERSION_MINOR - The minor version of the OpenCL implementation +# +# The module will also define two cache variables:: +# +# OpenCL_INCLUDE_DIR - the OpenCL include directory +# OpenCL_LIBRARY - the path to the OpenCL library +# + +#============================================================================= +# Copyright 2014 Matthaeus G. Chajdas +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) 
+ +function(_FIND_OPENCL_VERSION) + include(CheckSymbolExists) + include(CMakePushCheckState) + set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY}) + + CMAKE_PUSH_CHECK_STATE() + foreach(VERSION "2_0" "1_2" "1_1" "1_0") + set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}") + + if(APPLE) + # prefer the header from the Framework + set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/Headers/cl.h") + if(EXISTS "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h") + set(OSX_OpenCL_HEADER "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h") + endif() + + CHECK_SYMBOL_EXISTS( + CL_VERSION_${VERSION} + ${OSX_OpenCL_HEADER} + OPENCL_VERSION_${VERSION}) + else() + CHECK_SYMBOL_EXISTS( + CL_VERSION_${VERSION} + "${OpenCL_INCLUDE_DIR}/CL/cl.h" + OPENCL_VERSION_${VERSION}) + endif() + + if(OPENCL_VERSION_${VERSION}) + string(REPLACE "_" "." VERSION "${VERSION}") + set(OpenCL_VERSION_STRING ${VERSION} PARENT_SCOPE) + string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}") + list(GET version_components 0 major_version) + list(GET version_components 1 minor_version) + set(OpenCL_VERSION_MAJOR ${major_version} PARENT_SCOPE) + set(OpenCL_VERSION_MINOR ${minor_version} PARENT_SCOPE) + break() + endif() + endforeach() + CMAKE_POP_CHECK_STATE() +endfunction() + +find_path(OpenCL_INCLUDE_DIR + NAMES + CL/cl.h OpenCL/cl.h + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV NVSDKCOMPUTE_ROOT + ENV CUDA_PATH + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + include + OpenCL/common/inc + "AMD APP/include") + +_FIND_OPENCL_VERSION() + +if(CMAKE_SIZEOF_VOID_P EQUAL 4) + find_path(OpenCL_LIBRARY + NAMES libOpenCL.so + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV CUDA_PATH + ENV NVSDKCOMPUTE_ROOT + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + "AMD APP/lib/x86" + lib/x86 + lib/Win32 + OpenCL/common/lib/Win32) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) + find_path(OpenCL_LIBRARY + NAMES libOpenCL.so + PATHS + ENV "PROGRAMFILES(X86)" + ENV AMDAPPSDKROOT + ENV INTELOCLSDKROOT + ENV 
CUDA_PATH + ENV NVSDKCOMPUTE_ROOT + ENV ATISTREAMSDKROOT + PATH_SUFFIXES + "AMD APP/lib/x86_64" + lib/x86_64 + lib/x64 + OpenCL/common/lib/x64) +endif() + +set(OpenCL_LIBRARIES ${OpenCL_LIBRARY}) +set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# Ubuntu 12.04 / Travis CI have an old version of CMake that doesn't +# support "FOUND_VAR OpenCL_FOUND". This could, in principle, be added +# at a later date. +find_package_handle_standard_args( + OpenCL FOUND_VAR OpenCL_FOUND + REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR + VERSION_VAR OpenCL_VERSION_STRING) + +mark_as_advanced( + OpenCL_INCLUDE_DIR + OpenCL_LIBRARY) diff --git a/doc/refman.pdf b/doc/refman.pdf new file mode 100644 index 0000000000000000000000000000000000000000..41d9b03dc365710e3918f4a89508f667332fdac9 GIT binary patch literal 147817 zcmdSBV{mWVw=Ejmwv!dxwr$(CZT@20wrzVwE4H1iI9a*-;NExM+q>@hzfZUBtodQo zn6qkD^)Y&Dy|+I4Bv%v_r(>dLgCSp79{mNwOvFg!U~CP;%L~Jx>g8xg#Gqhot>S72 z!yr$@#K;K4Aa3RC;!4EH%mKq7Yi4iZYDvVz%*jRc@7KRQwRJUfCSnk`HF7l*H8XKA zHG|>jhjDRrHZ!t=@!aUulyli+N9w)Q*uyd~rtRZOo|L8Ls4JKHVe)YnLqZc()^yfb z!hTbKtK;57i>3@x`U^6ImAJ?6gP;LON}8s=MqNlPwF*s3xg?bdr+vy(H>W8SP*Edc zpbY3?F3EW$HA#6v5K!RfO!Wa$Vm zU`)tqn*tXj=y!?4b5)fkyC~@yMgJzECU<-qkXa<@D}oUMxvRnwAEYd%DF$&lJ}t1o z`&r{CCoXnmIS45dH!6Uv#TgGqQiq=R!xUT$g_ur)6pjgb5aljO+;QtR*&1TL5Gbqy$cpZPM6Sch2#k#=T4Dpox?`%DJ^^c>gvJcil#J7e^oU}@ zn)Gx+nW8P^OL8JH}>5(7QKI8Htx-!P@t5WAOJO_IR( z0p$+Xr7z+gtd|EhF)`9CADXzsXu){`zX?6vvjX@Uhle!l&)v`5b42Zcoe&&>Pw*4O zYh;z zMcdhVY(6&ol1wOEW4`p%qu*B+68t$ft9wUyR6e_%lI^g?oA>*~JWqhbMbVy+to`J^ zO90#l=pCD*wW4MJL_W#NS!epLRnbbE_CuvruQCt%#sGvh)j}cmS-CHT*uZ&QXX?A~ zrpUtk>}zfooumHOPP}ke;Wt;lw8D8`I$3%r^W#VnsK&fIl|cGgYqMT9m>qB zZ?c~%B@SadsiO*J_wz2q7GCRp6Qg$C{U<-F#jtZP!p`tC%U!X1V?xS67J+`3k~O3; zGfvK@x7I8Vxk;484oF0O5y^G`q&Fj2EfVxuwFk0?eftIM%l>f7El^Ecf)nSWcJml~ z-y(#Pf0pp$ zc6nt2qTfnQY&_FMil}1zTyp>Zy%!PW{kx^+G+0`z57TmdO@wH-~8VAqPthee5r#h?~dSCO(OF8W-hA{cB-^ zN#Z?5*G(mvHtUgVVFc@f6@=p 
zzhEX7MM(`tMaNU-0U=Nz6H4&?fA|JF%iouRlZo?RUW)6?Y2{5uG{G0&Fb#YQqvG}& zx}&G#EUB5sHOz z&?YF~$#02DdM6Pp=qH{NVYT`v4(}c~v!M{P{w%1y0P^O-sV4YMR)a_YR&7f@miT0* z#Np8-vB4fvf5VAZA~GvrNHERIMkQlHCv!rxe@R*aiJ%J64`P80;%Hn1n2=$w82s~A zL&Ed|uyyc{9K249mi-!hy-k`62bQZlF}3i97XgNAcaH{pYOcOp z_f#WAqlW8EsRTqIxtw~IRW8-Fju-4Wzb(|iLc55YVfkQXKHC3;r9xrQ4xgjrR}ue)HDj7qT}()jHgnu{!g(J5an2l?1q zRd;yyaS(a3Bs%bMVYWj*k|bIrPE)lSYz__tRyg7J&3Sl`Ap#NSIeuoZgUL+LGMPdw z7bptIVB#1f$O+okGronRMIPvo8@yDWZi_WrUH!};mjkzeziEHUOXa0wre$LKq`A+ z-7qqot%#Tj>jLiCZraVEfr%iyda%|9vRy>p%m5b-JM_D^u<4z$v|J2pkY70Lg!*6v zA#s7T3_R@}b~@^CR?p3N)W{Mr%`wH5*r=ixpwY+wLL{GYl$)~U*58s^Q zr7!Mnxp-1zPPJtTM8;D-o({JEl^A+ium{?%GTF8;Q_YAZJ_nb$e=w(3_{x2bhAs^% zS2F&pwx(D3O1s{$n!4jBt6)uiPeqX?h!*4{=o%(&1}?y2XQrN0pfs1)BiTa#jd!Gu zr-`IGtK%{J?E}*()uwL$i@i=X0dPrx9C&oBq>Lfg?qr76nnSZLQxAsTNf0hGProa8 z(F2}Jv96Hem5Ww&b~moYmcN!wHO-*>V|~Dnhb{Gb_VwIf^*sE7^Z9DxzL#3%=it`t z_d106XTz}xuTT&8#V1(e$&(5^4wa+|0$^CEIBI?!*F#ab#= zRQ@#o9AQ66#>pi%Vk@o8N!ED7AxBrH$!}jcrzV>Ybu9C24(l?IV>MEs@}f!PC%%oE zHhrtPw}%a+7HvHkG7&f;rfa2|m&b;> z$>-EGE)b06IL2sDCB#)04p*XH)D0wTv-IrrYGltFsuR>kF4O=0V zcKBtgF$DxXT7K7rnON{(o+G1|1hDcRo7x(Wqpwi7{an3DiUUnx5) z_4|=0uo;sH*;;U&v!aa#h|ZShlY-m;0UL;ujXLpjXETFi4N5aXKj;`jD-Ic3aZaS~ z;mE_Z+xrP(Cxa=wCI`g81hp=MTHqjDR!1cO*34Ues?D;26ZaZN}-Jlmsu#v zDv3iABaRx#5b6pp4~PX-gl?up0>#u`JdECo${+0B30-N1Yb zJToY;L1M3HG9kD7>?0@NDXiZrK_M_rnQDX(<6(xj##6jtBMAzfF}x3|NgQ?^8nUJ+ zGgv{D0V7kJc@i*mkpIu;0h#}Kgc;2D*B)_QQ-wn@{K*8zO4G@=Ywy=&WK9BpbnSfLbzeTWmKR);}Lp zX}qqNBRtU(k*1Cy5H=5nlV33lia(Vh8zd;Pv1X}Y5ZZ$DC?0yZ16h-(m-o#n2sbaB zCvU-;u2wj1*sUg-$qhmwp+WgK3kt!!U-8vR@3F@!=XE|!wp%+P>M4O8E;rhm=99q zAsA^WL9~LZMKA_1FnBw8Pl~Bx$(BsN6y4Xmb9M$dSt<=hw}SsNc5mg3dyCad6}$)a zW|)GiO};C(36I*O<9$7I>wH~Hl1x6msfJZ6d)=5t`(!t2Z#Tg#_x$C#Q^1 zFv|q=)8;pc$xx9%RZ@Ia;3JqxqBx^`245=E6;=gO@8r0vx9XMDra0giEZS~~^#`&E zB^In;zzpJaox~E`EVwb{%!|i;RznxZs55$w=4P3lstSYU$teW;nZwB!16iAgW^jUzpIZ(O2Aj3L5jMbGkY-aLK_ z@E{5QZ_qDi9mLL-#E;S|22*I-)MPUjO_nQ%VTC^ z`j@r5S#2c;JPri^U4z2$XEB=S5=vm9Ac~+{n3fKb$>D7-p*>C?jbbjIQ#XX(fZh0} 
z9;d4jp6n}5uRF`t;(gyDN`JD_9A`=5aum+uMnN&*GPDZx7Bo%lTClok^$>4?Dj}bK zjMznnQ^Qhv=aawkpgEMRy8?#+N~z z)?Xwa9Lgl0<9AAR9hLOk-%G-2s$VEg@9kjUbm;hc-m`!36tjfF4e>;Gw3v2^Vra2C z3^TOgMAQOQL3M`qO5O_etYNlrz^K8ACnmB&j>6ju@Q}xgP(=L#tAt{`5?jG4S3Wcs zju?sbCUxaq*os(OD~#*FV5&m1A>eI5calJ~38TZ3YTh!02Qx4`TNHoYb(Jcjb$N)T z1RG#CNk0MOMEumPrWh=6+OlNSa1SSu^-TjBeNU)`Xja9*&Tp)$YeTb^J_`X>4VEsL z0MZkdKUOTQRu^75@ncne%&a6|*6xiDOWRVk4ij2#nP>%%h48!+Ms266sDE11L4cod z1ih>rB^(4w3dl1`?--t73g~z|pQ=3oRKM7CEahOt_NBnE&NiR~cQGzD*h<$4%Qw(E zQ^k6bPfg|v!x&wJ2$;mKZ80_nsC*&3TNpr;bkbv~h^YR`XYU_IbL23awpXSv&fFE^ zSC<>ZFFCRo!D1Vf)g$NwP9Ef760`p;7)ma>;H=Nx_KR5K?^lIPB-N(tr z6Nh*FaW{+@bDP(U{<-dAd4k#hd6X{U?{$_=+F$e4^f3KMX0^0G8-c&N#>2bb^H4v# z{dRJ*%rO3S`r=`@T#NN_%+a6odR%Gn=Q7;`*Y?x^+P4$_kAuJ0M_~cN$*F_LZX3;! zgXWjpS|gq<{rnB|b)kwjjKa{v-qAtWu-~0wqwn)~dv(L-)7ftI=)>ar`RK)OV|Z=q z9hXMmChh!Q9s&JYp0B5XuO@Fl=ihf%@s1h6A0BZxan*`7*P|dwbHpIqI=B3N&R>@S z-{S)NT0&iP>g9-WfPsj1)YXNPstKvEmoAf$T z#iffUiwIuKN0SLZMDpPNeNZlvhv3D1be`~E2l`Xpg_Ha{qYvlZ8kk=%^FPY_8PX8v z=M3uSy2pOT92=wy&=KM*fA4eikF_jgpjs-)6?3swi32Cwq-s_bY{u`OfbjphKD!>h z;8iy0^AQ5pkB5g*@$Dm~E=(&9QfW=mwE06BOCuos0p?taAkRdcv0edsNuS!il4zj_ zc}z*5n1TcDUY$uI+{yFrQ>sYd_ms|{dr&`-vM07T_P%UH4B=LJD}mra;s(rP7IHWC zxN1~O#70VPq>Vv{l!m`IF^E1|;$zK}yy;QahusH5(G;YVLw8>}6FlncjMl5Q>lToP zguu=v8yYMVn2+(EP-9LYvivT@ZCGEeAw*h^tc>Eo-zv$XR?dQj(r4M#VL z8!OyWMVH6ZYQD2#|HOMtW39qv&uR9)h+0Y8FcobI5Cx1(qcEPP&7is^yts<|#74r# zXwcBba*qZ=igK6r>LO3YCZgh8h7}83?TJ(}#OK{kvW1XjGLeyQ9Xei*6CF zFMF>=rCzE{vuDO^ZG6WK$LWx)O}-n zFwfj^FO;`{WwEZJcxOTELrB~y`xN5;_dZ%run^J5f4E4{ zi0I)yS|P4S@N%1|{*S4(@E-P~R^s}5{%L%TRkO}#q(|yH7KmNbJbQhXg+}s{)Vy7s z9&hS_48sRE?lSDxjscm<+a@X!ml|jpuITH%T63Z!W5cWweXbAI@qNuzCu$An z^epoUs+HzoDxKXRLS2DGST|PQDG3&ihf`2xDc)`k!K11PRObkNh7cTeC3gz`BSnL2 zV6%}i9vPk3IUTI^*sQz$Q5P6_d$m7*FXUL<9ViQRM7nGZ;T|AaL07)JDh#lAI#Hpl z%vTZ_-Y0SN6)k;{&+iS3`6`Sdt`H>>*ZbSB=Gn{fh9JHOdkP$qptC^n1$|gX6k+}K z7WjS+Vbfp|3sx#o)+YJ9Np3L(LYG=T_X2DYw-9T)$Y{F7-CnFAz(!yiobsy!TX)IT zVUk_K4A1rwolum@-#7w!7JZ}eOy{NlbYudE?-4AMvy(BnQ{8SlZ3!;hF$dXIMW7WD 
z$R7uWbf<~DbGPUb9jPVtOIAM$TiES$4!*PX-;igXlR`@1-m=>;@4w6snyqW?|)!FKrJl7~yw7Z+992@Ikw)wz}b zxCJDndSo%oA&4|V4Aw7j+X2q?qP45}U9jrcyjgC2KzLq$p#*iS&GW6BFXy^x(ayl5c`nh@PIXoc`o6`q#ji2h zom*|1hTC&EKF<@M5UVbsBW_b%Ta4e60?BYZDC2dfOz<#Ps4^#_Pw?=Nz)_a`sINex z-0r%0hxyd&)0x~4*u|?|V=$=yHl_N z*(*_tG(lqkdN{7ohm+wzG3F^Cmd*r0^^$X6hjrMcy^48SUHt9q!Ze@19qfqx_c)S; z^&hmj|3kT4+5auMoK2bG&wnuGUm5WKQ(pg{MINqyy~v}MAg2=YZ(_-3G~T{o(;Kbl zautiWioMKyl^tH-Nsa96=VebmUOM(GJyl}KRqXerfeC1}Yg zl{Pn_Y;tLG>K#YaoYm~q zSz;NND?6hBa(0;4PrC&&;>0WWj*P2t-pq1jkRCmSClr@H=}ejW+&)kmIzX%05nM-p z$C1>up8F!wv@*!T_SnhOb#nAyIFgru@ZSc#)vfrGo^5N2_OBG>ZN!%^3d{&kx?e+@ znppOjWjcO!Nd~LbAtL`^c8-5AI};1@e~MK9eQj+;)6U_qQwx7?42TT{jX+ssl#3|Q zhXdKQx>)JbMj@VwsX(;L?1-6<&&exaEytd!cE^?U$&-Qcu{y%AbIk6 z^22jqB$hpd(cT@g<#UjDwthq^9l(vxd9B_05&Apnrjv;8rCzKHkv#OQmBV=;E>+~n zs6~>D2tbhM|g>^xM$tczkf`&=KS zGaV7BJ6*!T0WZnH!J;qfw}eE-=OhGhN3c}acdyf!OSpvO-d`m0SzSsg6k`13bflT-oqEk&_s%GCYdT}s08~F zT@wp4!9zT9bOl(YBY3uXS{5_f<8O{fXN_^DV9wTXNZsc@NLlmh9r)E+sRw=z+!B6k zjBG&7lrB*3*}o*ZR~*>wlTpwLhsI*IeaV@WTI$CF-ZDTLs^6Mc9g_MRt8X?GJZE~y z87ZfU!`jv!2oxAjiFQp(do}l%g5Dh7={q^6{Vh9gw%k;n4>qcRjoUijAP ze!mpk@RCn`G;PifKPZKOedW3I|Z^y zvs&(Y|8n^RwCVo-_bc>v6zPdTgi zO+*D2pu{!e2w+QkdkK6dX&GEDEbpt?)<{$)Y18h4$99zRkQUpcd^UB9CQ6o62Xp6= z+CF-t%fTz?S=6z%1yV$-yeG#`+FBM-SDn*O6XKP@H962XY9HtuZ8$0FF?Ka+VYd@+ z`u^f`>3(7sayz$>an?%akLb!+wVrY1*A!`f2-rJhsn2*{SU>uy63U~3z-iy?emSMy zX`0T%dleGLgeTpd74>UDtzVFjtLlnJGG-h?2`F1%#Kp>MJ2v7pQZ8@ZJ4N<%02_F= zxbf^4_!)Qt;rq@U1Nvy&Z*tB+J5Yzm4uW}1Do)K?a+{mBZmJAfo4%o<>qdKP)MRR! 
z2lz46;)Tz8uD&|eik6JYHA36MK>f72fG6P%^u}WeWDXlQKb6SODTp65sn+uB7Z@7| z{fwp+8f|YRiD7vCf^gO8oG@Gz9_->_VBJ{-CwIeoiN$zc`X8RAa;Rut3$}n5@)lpd zC$&Ft9F(piXr3*8l1(Rw9(HOPrxfCjkh7HHSk3ISHv%0X^PZrlN+-=CN|Ai(zws3f zaS3nOVs>vL2TsgtHl{meOZdrVHs^OonK%3SkYksuJW?*z;CM3Z%zVp@ zH!XI6GstfGBzB&ZI2z)HL8_Z?d?Fd=_rZ|o^F#yEq|Ft{4xbQsb5h;4P}@2(I~89* z9|aVR1B{IG6@CXRBT_}kRHVJvJd1vl(loKUToYwYI%O`8VAEZid=pMrs$LU)!rBdu zPX8TySsDK!dSGJVVEYd#_`k>AYi%8uY;iRIwfa4(5c8^QeIt$~SBl0z)+7Fs;_`D31Tv0t6e4UfKew!E)@V!Z*vQz1sY zZfNel@kByVEC{(W>BQW4Nwvr{)$x&Wa89(#_UhO}f z@0L98oCFO?uM&=jh9vk==g0_j`-bJAUJSUgn8(Py4a zB@VQV!!LzWhn5}=7x-#3``3ju95KR07xZMAi_mYew!J(hK)tB2HVT7Fb�!RX@2m zX^vaDv!D%$$xwMTLPw9XjRGje2+A3m$z(8gm8o%LXbtX#AY^A~P@EJ7L&%*958r4h zTfBfrEZDd&NaxJNUPYu9H77-c->XyYPq?9tY34kqgJp@KYqZ}9HDFgu?uB4>J;Xza z$VH`Itt1<>#f_vCJmd0=LmMm9ePfuKgK;fQH84m|akW7wKha4CZPVlh!RuvUm)TBr z>BGEYzyV1`DYEmfHOVC3bvbo60@1E%z~t-JeHbm`p~T@_v3OEr(&BmA;y8ht`sI@Z zwy6u$7@6cb{kyVK(0#9r)ao4t`RXYM z5$nPaRVbOYa%QVB`AAXV9@JgiY64BoF5yPH?0vDpD0EuRZ z-*BC#pC+RkN~2Ceua6TV>HZ==k^<)FhXYbA*8^Cq=E?Ege8@qUwcRf(k1YUZGfCEb zRk#Jg`p*)wtTWzvnB6JtpQbY3YU-TgsTxXIc-KIE%HJZGhJ8)+jAS=}KnF|fI>R!( z#{4OAM2U39oCJ9X#>z0lb9nP>@{3G{%bkUeUAXsFi7Mxm#B~%!P(Iw$n#vn}d9XSW z-&NPb-Eb6Mw(K&k9E~5)xWEC|D&(Ibj`L1ycFx1w9pM|NI6K>&;cLZ+S2w`j(x_3q z2fKA?yHMMZ5f%o-@}qr#2B!o=1>B+BQ*94Wl7+GX&wi$XW{k(?=KCF$0kYVGwm&<% zO&$>!bq8HHshH-E&Q!y~Q2;S!svC+^ysBhqtXdKB?cB1X8i((8*t|_ug!UP12ulA$ zOA~w}#Ia@xq{JSp6x4*i3ojEDslXNkaWU>Hx!TH^(`YNF`802*oJumW5mG;&93NJ6 z36NwHIrJ|U+b4(z2S(6RJPQ_KrX=Z4pICEm_bF{?lhRQ5B{tZ{dI!-rH;_ZIM0tK) z$a^LXvV?xMdZ;(bd5SOEwP9W`qt$mxHUjF4izVG`AK*iHx7y8fL3>k?MDj+WjdX5opVDJPe*A^S-tV0}aOs z2uZeb=I}$Q8ELxuZC0tt?ahJzg%wq*Nt()jLNsowoe)dkZ%haF zRvDgn-nk*LQYG440c}#F7yF06$V@AshPIV=7F@XqM@3L(Gx)}?ahf6ej0vh zv9tbWN){lLMr&fPShFiRXv)lnCnJ%1zPSTWKSYNaM}Y~!T_p-{B@Lmvi3(- zDY$l0>UBIL5OX3I`9jE8mNu5reQ!v7+4;TT7gRu8483+ydQh~2R^BTUr-g$j(vrxC zd-%vLg74?ltCA$&ubZ*@`(f@#-adgqmXMD38s9pgU^?Lw$kp z24@X8Dl!=xB*(1RG=u$NKSK1aW%XF%tu|(VFoYCm3eY^BIO7F 
zHtCS6tVte6(xqn69Y1~+-kR^XLMj4_a2gN|G%an@dp5}J&@)O+`s1WyX+P0ghMERP zeDIam8EkXV0y%WTEDKpvCI={S)I8oExOFV1Yy>Ko<7aBupO`t%;J5i?shwb+G65!z zkPrgmUU|o-F}5i8oLi8-hS5@l(0v8~ge#&=_kdy(Om1~2Y`Viukxm!T5PRwzv=1O~ zZsdz=P_&0Yw9y0$cxff+7xT`L+&LA&Sd>gQ9wZ-3m&H1%T8(z+BReA!J_2wxCeOON z;b*zf@2mO>%I0%Z6x%d%%_f??2Ka_OXG2Bd{@-Fil$+>|+*mUH$*%5}KlY?7D&7r| z(FIZ-%yr1!hg615y;JU75V#DAvcn^??ounj3j^MW?C=X$;Dw&3w|T@lK~>v+j~;kULV+%u?nIj30{g z1p9@4zk<-lzBTq!N1vO9fHc@!IF(E|*Axm?Q5j~v1TTw&>4==G9c*Wbm6^}&*>;Jq zBMd`3&ld703aU+^hteQqf_2$ScjI*>I}5g}FPW~>xj+2?G}HRxmgZYPIC`nyxp8Q4 z7&laj4Yzs94Y#$(3_pF0aO~CHD+o@F^ka`_3K_mI1B!1Cg2xv@`Yr_KnJ`RGtudtW}nE?J=u%!PXT;);z9MY|o`vx%)4dv+9BS~z7Fc`Q6cU7pC4 zQ2tfI7UogSt+H^!6GE2zWS*YKfLL;fq5O4)XZR*x#T-hwWF-9F`|Ua0bWMCgOv3j> zJ*xvFVum)-b2i7scKX9Pxb}yBDVtg3DbWnt%@q!%k>@ew?guW9CpB~@;SIQO>4ynL zmj24^vD0Hd#ntWp6u1Lp+fCvyPQr`ZV9=`pIyfZKNJ!BJ5dIG88J@5&hkZ;g!+=yec$nZ)m$e)&8MI8E2(-3TC*th zkDEZGvqvBG>!WA;?BF&1T=465D?MUmkVryQV-BB{@B($E+ob`;qBA6Z8sLv-3DlY?OOka(Z|t1oq}&B z{9A%nmIzX#?wi>-@A2FFQT?Bab7OJ_{`Y7pS%hWZ;VX`m3Le>%o{i+?8lEn>FEW_r z`_hbk^vvw|ekcrH-BGseOP9jps}uLR?jH<$p#i|`R;NJ!Ktyb8eQsLuoYPR8UpfoYQU> zBdc!8W*-zZu%yZ8W!|pJ+kN6Yu7*Henojv(@?YvUJS$Vp9V|J?I}q7TO>uU*C0^>h z+z*A4GehpaIj8o%D(>v9alAei2NzbAB@&Lcm3>4h31rcq`g%9Zz4S3%l?YNxQ+Guw zzM}qW6cM@fSsiSnriVJc>xNM~Wvg%+8tx>IkfHdWXJ{`%RBP}bhY%8sC&G~DxamL2 zvXXO4ucnV;M|_e^9~`E-#~ZmwiOXpbfEJSiBMuvpNM!~>1l|5 zK7HlvMh8N%0m^T4<$!^~5PRt|Fm!_2T_D7*ue@LGi}eRz5oH|hbA!`m3_SZ+psYE^ z_uvsiC)J`tv?1Kl51Tj_{)Fo{uPnJ;(z>yZ)7?FjXDIWmb^z;foJHR7E$?c&j)K3q z)-lmq!--S(c*s!jDGsw=*kxprq@M)V! zF-(3O!6YyTP%Uz30s*3$(k9mvu~aM!8}ZCc2=mh_K(s#dYd674b>( zR$DHZ;UsM%8e{P`aP$3B4tLlp)%zl?C3LVj0L321qy+DyOGU&IvE}RtEDAKJZDy-sLvyAXwjhB(x=NCW?uG$8EEquB>2Q=}I?`)bLYxg76s23-|GjF4_? 
zvyDT|0NCo2Z^y<=V`w$}cAXe%&~?Bg0{%Ko+9YB&`1%9a5&I0;7xCeOC-;gZw=p%v z-ShjyaX9HUYjrcoqI(TBbvBw^r_fz$_$8+breLtK@6?wiLz21-5!E?j8xmrKKt z(fwI$Yxe^gmB)vPO$3mDbJKGoH-vRO|5Oa?=ImXF(6Ytp<7FCzyM#2UP}FXGH|op0 zGk6BDC*U(mf!*+XTj3nfs0XbR{Cl=@$=W|XR z{YL4=nFA@~$ww^iuwy{v1x%CJeE?A}X^ozFTEwIqP9F%^xePj;<;{U{h#h_?4W)NH zYoCH*n|rBpMwQAw%cYIWRMV(d#7BKdin#0`DzEsiwi>(01ZcObf{q_L0&qxG%Yp5^ zc_DG}YwtKAwHg~zI^Nt}(>l5@(b@WFz~i?+g{dEu6T1r3g2`!88tz9-2evt$2qb`= zkEn(-PMdCBQ09Ue;&V){&P3(c5afYz{TcZ;KfQ}iCUkrGs5o8C_ z0r>nc;)oTxl9alxur@k8#`%m344mYP=PR7mznbFMC%Mh?EV?_Wsy%dEzstF0F8&A~ zbg~YBpuAt1lH7&N*(b2X9|I3i5Qlfjy+z@Tn0k(FY-jlC;_Ki7>&vb3XR)AN@(?IJ@$yv@N>@f<|1g0;y)kW(4->X^04M7lp=;fuZmBR$OoO+D{K!c%=aG zTH7UQpi>ymqxir>t{_V2;H83X6PhWN^Y;V}7u+C5yto zu%@i5c~6OL0H~^zzdXdf8fTyX&S<`d*IUOFi;q*Lh0ZUTo;|bcFNP=C)s-6vtj4&9 zGNyX1SF78kb*Nh-7q58DG9NRq>jG~+Rfwo$-Y2fPk}L5F6W|5 zExGb6K6(&n9O*NauD!Fm`k#kL8eKk})|eq+YsxQ>8o{PFY!_nVuk}(%+m9!|SJ+2< zG@NdV?LIVNy%>;;KGdh79!07jZS*XAraYKi<83al2ibQ<>eLgSBVuSO>o{EuQn*VR7OBPaBMp5ry$k9Q_^J5mj6}L9P5K$; zyIY$%u6Cr&#mItJCyV846B=m~9_Fj7eU$KtZdv<_d#fS^`x4{GL`p-ohnmm0BqrE0 z7dx)zCGvrUgtYn{gq~aDS5qO+UDrrj)YfJ8=PB>&Vp?(^#&Xx&Kc&EMgiX!*revIQ zBM7@GloAZcC$ghb-%zvXE?ZNjsgDbvpEE3??I`PE$5C{f!>}vDE_HS-5h6w~HIM%c z%R@r&1qIc_&Bm&cfPI7u8DXtU@j9_V7Z0?nsu6vIzuP)eCkg8{SMON1-0sm-@4~py z?pA7ug6;n8E6scQdHhz0tSMHw9nRl?z+tO}(-$2C>(YNcf5RWpNBi?G-|VT@=QGPV zIAim0EjP~b$fwoddod&+IC0ZcEicTkqcqU=j}6yXfrpYhfz-KPUF*E>g|Wo3A;uWr zsdsW{sb1kJ&zhBa_~Qn5m&VWfJPu8utu;r=2oA55Ev&`IR%yTv$r6_gENQ`Dmgt)@ zsJa4JlwDvohWUbrx%;_USJbs@QdT9+$~~b~AJ|@>AP>-aEAY7+5cFa_=)_v4c)zlg zKZw3hGI%5MMvP2Qg~an1Zu&|Gt@>r5Bk05>kXSTvf#S#)4&@KOu7uO*HhQ55Hzuu1 zZhS|Y2Pi!UZ&xX@w^zRvFjjC$(Y42mMnxR?BcM0A*QRR2J5`7l?Q6qpPvz*@gapJu*7j!iH}^nlbc|8by_Nt_MLIvO^xpJZKp9ePw01pw5bPAv=AxXlJANNNcbVn8?Tc6{G z<4=B1o>}X6nMCfdQJ(ZBdi8z=M8B0oclC+p{AI?Tk?v}*jQA{P{Scq(oHdXrFZE;I z&@>x05`M9H^VfW%& zHnQBWf}r+XmQ7PZ0YFAWr?!L}l+VbewlSTg-Gen-@m4>wSJYfWgMd}DA07;)8%;Pj zVpCEbD<#;%E8@FHe+mH9&gF4?Vw#viWtwBmyzweMnQqL%=dxt}ASL-pKC>m6NSFzd 
zy5-(#8VpuC~EyiTdNhEGn9JfS@-IyifjPrNH-m)4NNX__wi7JuGpPtm~j^S z@Fl|@akTRew?>`XdzlmRa~dW#$U7a}QpJLyl47Lm#4Ht#zO5b%rz%ZbxQ|5mc(5hV zF=u>JO(S*o!y!u=uiwvTKqLfWD4f-R=4~lm52gxfplvuJAk5*zc2>kX6O{xzTItMo zZ%NhH0$^CpZf#U)XMHowGpE+5j{4u7JudwUcW<5gVZi5G%9Tz2YAo5Aa&cx+EHmg* z60?9fZrVv9Xr|Fp%6&bzXja>1NuRh=j^i!nU{iqF%+kPeqE|(bFm?ma)LXqvT;%^G zK8kRLzSURu(WB>*X;woTjv`ouRS^jq;l}`$pdi&8v9lK%z&+TfL7>`dBklHlooaQn zX{wIIsR!^qKrRZ^8iK-DjbT`Q!X;)I26wE{m0Ev<9Cz+gJN-tas>EI%VYS>fgwC#_)_YEsK^*(+n!h3eJ>E0%czrx}Z-}ve$OoS->nIMSaIq zqp;_aZnp*?|4PW48@ohzYeM*Or>d{@tzh(Vi$Fkscsvonqx$WfFyYbK3RI6`3$xWm>`wX z2Pr*)laAdKSTWRu{_!DySG2QF$^vm-9(%t{Jr}dft*J*tgMyCI4iTv;B zxmCx;0lyvjtInX2aFfC3Ed$#bh*XfkQKtYrmW=QxPWZraG24N3r5WgX|7~Wk&9t(z zmDz3#)Cs?5XT{yan;84kX=3`N6}-#2`ND5%7zKX4tIYl33V0VRN70`c6XyxSryeiH@js zek-|OK7(@6jNz?y2)E?2N6bhunWHx269;sE|RZp9xpGi-u2rk3#JtWRtBSpjhRMYF40;!`t|{UsCuHH@FGJ(w-uq0Eye>A9CUF*G)Sfc!l~ZxY5*y+)gRE8@ zLG6DQkmWKagAz%cH3a)HX^D)$ZT8Jvbrb*K8yM(q%ziir1y8u@J^r6wslDU37#3i`SjZMmgB+Rw?jYkbcdjC zZBp!>I-}SDrOvkz_B2!*zZ|~k%e_&&(w(fE!DH;dw}~a83kTSE z^UsCW-cPTqt2=F9vIzWUmLkSH0zX9DXb3ve`0ys*xa8a2)Acs{asP}v5)NXux5AF7 z*TI62+s{rkwVjpvkhRy5=F^xM>UI2{>H~h!#gV!RmSOGS(=-L@XG2SDS{LQYTmTl3 zkh|{2_`2TNHOgfAgc}Q7D9e2Utk(K0UfPOSFo_%W!%!Qnf)Y%Ab)V(`vz^`lduDpx zd!G}Lu!BYV8*R*Waq?I7(@p81tnUw`z(!I+ro2|S@5Ax(Ph|;)inEGP=IA*iDDFUC zw#RI3?RGU%F0lv1X~YxeED;30*MiT+ ze-B#N^XvRCd{M&M?6d;g>dnrIPa{>$)BKKyr*=>Kx3h-zdE8?ZkX6R?J_nda%r+ITI#G6j{4&Gt+)65CnS!+nrEYqQV#rEz& zNf&$q^o^mo=D#VUW_k;g?a@3+Nn333&O|lO`bJ`P_cTIDbG4T<_W2kmmtB|NWFE6O z*wKmA*-Ta%eqMvi+r!axIB_1^Mu3$NkPs`d&e3{iTWlNz;z+h8Lu&DAmf}8R5{?4{ z45@kWl8w38Ji81+DzhHI@eL?dIR~g!Wfh%LO$=$gom1B_Z(DF~B)TF8pKg4<1H}X! 
zm5ID>IomG2jBc)e(4HdOJGMuVy;;st)tV;@$;MAZWTBr9Z<-p~O$#P%A9jbA}jJl9WEVmzPThomThI98yECQ_FqmQHexc^BSe zD03GvPnE=7wy%;gMPpYnA2?5wy(K4Ap?%P`d35n0`aH%By*+_#`XJY@RnFBgN=;pZ zQa}QBHBNAW2;v>iZQb_yXnn!z_5<~HCFgU&+UHF;b)V@Jn0Zdzj)FKBnEsGuT>%)( zKh>G#657p?QmYveGCq##u`Gb(exIU)w351X6i0!3e;VO2r8Mr0@{@r{0T;V^IBR0x zAj;q1fv9c8f(uedazxd4%yJux=I-xoFY&NavHg=VaPS_%Sm`inbhILN>1br!^Sm$5 zdcy(gZi_z!aQIQi()?yYvG}>6`W8ysYtja=0K6c(dw{$9w0bs|1yrlz*qYTt2B`_Y zBqHy!M77gdg2WlmD)SU;T$(uqy*)y|%H3{P$7;uK4{zfnj z2HE>bnem_WCopeiu?x`)d9Bo0{A$R|;KSSu0o)uxsUfr$9L-^wyFr{eodiT%E4;5k zc;T+M0$i{yZ$8L+=VSWGKSq)@TMh8D7K_o`QGrXmC4GO+xQP39*z20>p z#K3ajoCeJ+Wmxi#m_|&Mq?I%qn2vaKQtZ5|B%Kj&sGY-n4Tq?IkFKvzh)L>25;b=W zeTu_be8vw`2<1Oq`Otom^)$SoTIe`j~ zg{OW|gu=7N>YFj;98Pvzm5P?!WwM3E6l^T>seD8WwSFe>Z&Nu+#sD$3CWo$VSNK-B&@Z37kprgJ2;2mlU9=QDLR9m zpzHwdwh#MJoURvei*`!xJ}|TV5@$Qt`XUF6wxnYtE-w%m5jDi9IOH!KSBiSG8|{0A zP_k6f!w4`~Aq=iXj-{T#O9c&Zoe5+jP=L5=t?>~OMsyM`{Y0}+Z`f#29s`KO)4#i% ze77>iAV^<#vCc0nyp`$V-fhyStBnPw(RcfcySj*>FJtd( zm6&&~A;F=Fw1+a>dHygq8KBpdC3}OzRmJrtJOAPRd;B-ILsndd>51ZPp`u2_fzD~p zcIyFJZnLz#EYJ@6bSubf%GXqoPn-%m&A}Uli|X44a|c0{(wmdE5bc! 
zRNv!n)L2J-8Co<*#&U&xPtAZZ)w3F5UkDw;J(hCexm@iW_I2~{?iF|<6$>9MmX#7F zs*m&GRUHjUs*|`&qM7x>z`<_3eSn$4cDKptgjxono|0G&l^nvjYM*pH_=M?PAIFr- z1RX!DVCA2I8UL37Jl;RIaUQ!@O5@EMTf66?m6B>QxJlq7Hj?-1)|3b@E_HBkWFY4{ z7G?ZX?Z&Ubk9EY=@sgohm6VvAjN8f&0u8lqj0HHMg+@Es1ceT}!^C;R=K#S5RRc_k zM(Zvyau=PJgE$wEF?G(^sIeK>4gbvUAsm=7Sp&y&etKcfXsQAqeUQ}ga8d?0yad&i zS0wK1ek0%s?R_0896>cP8Xct&Gg(Rg00PZG6Hw6@M=?E;5i)=DqPQ3n1btfwNQ*ut zSPcw{<_*qzCwaFOKQ2gL35T#2zKM$wv%198+x?{d4#t!q7y7wY!Z}JN5xN{ywEB^l zS6oe-`U$LC4ImjOBS@}rT%vVX(R~K1%zAf7lesxXi?(v%I7h(tlasFEh?9r3I|!CL zx0Su92>z_$Wpca|cg$URRFqdpAjUY>`r&3GMoY-)V4|$Xnwk~4G`{+5ui}-z^ZUmW zzo}UB!@-l4AahO~{wrWxclIaSqa++`{)ps>8!T#ENTBoz>di1Z;VOpLWGE~>eR!r& zbC#ghP0C(G;l++e^Ns;h;5~LCmUwt7LmA&bGL4yCR@FI+fB=wd-tyzW*Y9!o1y{`E zpK{r>k=SZ_ec8FSp3^^7a`hYC28vC$h1ikX-Z_+e$Q!|Zm-+`(v$b6RzjqG5S^oop zfr;&Z&*W>?Q2)0mMe%v5om)gx#$jZK!Y|(ljb@%PGHD#&$S9Sf5{`@;#*T0jqq09< zwPnNJ)mIF?$|m18b2&J@5o8l%?M$y<-)vl+Y)r)jWRd!y3>dcOkXm5`Z>a{%rmSBk zl6y1iLJfDZ^=fy^;7BJw0Iad8bk3V=PPm_)+6T7T5e#D=-cF8cP5J0&dvfgShD^51 zRN@K5Sv0L@dA&^<;Wm7;8Fk6-E<58lW`x=1B!yInClHEH9A9Ala@brUVZ&~L^P+4+^_$VIeBuDdPJ%r2>R?(j~PE|>Nq<9zsOstBR(j+5VoRG7lZ z`UqDbY2QVisN!!WpMb8<=S6=7K%2q>TM%DN&w%&I7sGTbH+R!MW>McD-~ei?pPddb z+U2%8o=zg=NVebTB=z-;Xu7R?0P#&zr$>mb9o#=`$&O2n%f^{4?#iqr>K87msVaR` zB{|ybex>JA=QeEafr^3UF)GI0)T(8%NNGo9`Ih;D(v`6Mz>) zWRI!X_7Y-{L*PR39gZWh0qzm7V!elD>+nbg4_ePiAcuU=z2w zyDsYJT4c>R^A@Y^UKV;rywv3ZvEHN=&KoN(Ps?L)*EDEO3#77LnOMXTA?3LRj5xIZ zJrLU+(*NC^<43Xg^*bNO4+aw0jNr<#I#YXICK|i0YKXA}KDic!!(`(@8lf<5z_RCh z0b3NTP*atKWgu#y+QwP>YInK7`r$sr1HOLG=r{7F1n89n#$z1oU)Y50^QQIXn3FtU zlHD*-9}K3;ICih2mVji}k@$yE!wkJGkyd&ddO!zUgOu~1SRG;%D`VIa>&ixMYufPz z2{v|uTT=}ZBmGvfZYbrBl}+snr-cV{a#tz>(SzNv=(ksetj)Aj3iO9xO^ogWQcLa# zL*Tvnu z63Ol>gT<~@t}NMU7Lnm`2zy@&6kwv2Ic;0K!!>2r2k4M4%)>_LRprWm7(@tjD(90Z z-{QuCV{=9mwWphFt?{xOO@;vY$}+}dbnR=5Hnhc^5?FuZRtV6+B}f?N)y_F#dD1}|nP{N8 zj1P35@`u}fL?y>J1)6^yY7cnmnX_HgUZkNf`%y#aOP@t`PWl+!9m3T^iv`~wt4D>t z6I-O^I_vwup&=OKSqdD}eFgjyeAfd&408BOpbm}d=SF_t!q0c!pfhWkS{z|3ty 
z1BQ=vSdphSF+oq+?tR>euVa4z;N7qua0~eQfy5jV*HHk8>>)p>41WysX8su!#crJG zuTSyEx5c5VZ-VtC>8Tt??i?S%l*`$%h>H1MKgyGHQAMV(+Q3&v_4xIfvmy$xXWkfi z;2=;WZ2bd?Oi4K+e7Ah_%dhMWeK_T>!1opB=3)iqjq4YJ9d%>p=7raO{W2QbN*7R! zq){%Dm04C zybbU}Y}6QZe2(o><;k}ZI=L`V2{PHJm}TOEyI&1lQ`esuuo<~oJ343sj&JJdH!37I zmqWr3i!8bqoHVUb+G4X4{fZ*{ruE2B;iIsfLo%lUHw#(?g@4%5GXG9!ghr^_D(S8& z4Sjq{ZN5Ovu?}EtsFmP&xg}md-gPIj!bmr{%geczPZ!*kUwE5!;mSJvPPB`I!VZu^ zyCdTc-3)YY?NuXl(i?B@F8>vz*H6R8U=|1mLH@BuBp-l_A{hq$Zx`hkj1ODVnNRNi zW9QfXB5M^h91wOr$Z)RrtNnq|K}`s2|yF&IS;CD-YXxiMtE=IwAn2Xj6h{J zn;@}7pQ+4}p5w|}1@ginij8P*y`*qJF8jU}Ui~;vkypzi(s!+8mPQHIqCR7oca4LC zlH#3?8s9p?rQdky% zi>T_fhSM;8w$RtrTR%|fMk+Qy1T~BlG!P*7Oo)aM^tufdWB??fzv68m0VW*d7yTkTyye+i64U(|98yi|X#w_J1ViDK zmZb=JTSR~{1j>`1Sz}4P(8(~Sc1Yg6Xkd)SmZ#=g9lvG?kH+*{;tT8w%&)+j9!y}l zMhE%E&}yNHbGk$z~Y0(J!K4FqEx zGY2&tx`mIa&=*h7wMIj8>zPP{oG&@!Ht4NBsnm;>j}au&D-3=lwW7JD-oXlf4R&T? z+oWVV)^h#4+!$;-40`0na z0({{{mp=acyrD(2Yrd6|+L|}~+fD9Q4vmEaA4?9^1)*`ZO0N=jnuc2NJ_z<*pU0x< zE=C_X&%3oczkoC=x;3nwTBx@|WnV(aGtaL_;j`Tb-+mAwXTSRYL)Fao9|%>9?Ck%y z7x#ajr~g$~|F3oTzpD9mQ2@dqIkb#ejb@I_n(fM^woTSro`MO_l3KEcgnat@J$p~C z!DzBxF^@iykdP9=G{n&hcfrbw%j96S5VLWI|++@OBe6?4%vMxJY8D?)EtSWAq+y`F}HPh|&MJ~D3cxQB`} zyZ4dF=xB@LZ=Pqw9|_y#>^HX!FcLatNMBJTs17S<$4saU_B>NreGUQ7QQt@@l`fm_n? 
zYuv#QZ=f75X)@3Z?iq}SN8`3)X_t;t&=eYCSm&_nK$B-W;&3vES#;LiL%8KR`9P*5 zuk)eDqU1bag;ql+)}EmV>AuOSoxEFTgvGmablF}wkR8+qso1e#L&@AHvNI@KwjUy1 zp(#J429=4#$MKX9e;LJZ9=c}B%|4(#_D z4+Lc93IyaOiw_J$fByjlH1!OWC+!m^EcMOR#2-Hc$>`VVkV&EmcX~u}Db&OToauu< z<8am~5h0U8y~CAsH1IT#M`ZoQ(WeN^SxKDEo2lNeC1})Vzmc0)1KVSkOz+3Mq-GpNRLWHC%57Z-Y># z@s(`x(cLlyCTcI#-w{}n<{R}{i?Ti#LYQBBVhRTrannM-#ePIadxn8ACDMePnf}5^f zmp@~#rbnIhdeNhudd5z51zNDb@hQw;gm^7M)GA{(7f83qS-_LznMa;vUAW}nzBqj+ zRrh(xxM88EOD~{ ze>p$I+OWF4r{8F6%OKjUM-D2l&S}|4%XY$9o+VHDa2@rcbhj*B6IS;KHY#{OXi%U{ zD{K(~7VyP~-9a$B7fYk$ZC@sr+#k=WTv4W53R1iBbs?Rit?&h$8>?I<&`!gxlV2g` zB^yk|^{0bgE2u_cThYi+`w1&HdryH?iEAKq@}T??zy`LzXF2PdGG3}1MttmWdnZ$m zo(W(zvk*O{oiGUI>Gk#NgA?u~ntxuT6TjJ}NoPW-5ClB0E;>8O+?UrOXzXMHPho2M zL~FXprX6cko`TR9Jv3Ag@7WwGS!NeO!IsD%KYN5TUrD3CtK}F-c!@2Hhw(HMdKuL2+qKhw?$9F{+QV#T*e~5)=Hh0gD_tmkS=MoDVY!_P z1o+7eZGe&m%keydP{;yzB~pcqIb!XUPvvZ8@*S(6@JOoJ4a#VbxHl6*L2WD{S~Wm^ zajLp|Wd?=7v$(2JqyeBsH$_?fW39^Dq5rUjJDzW;1(%(U=vbICL|+bAH!HyR2hJl- zT>opgLfwK7Wlk95U-a=XA9py@cSO?C|DMkzC-1STnD z9))n;7(^yUU=z(I`^NB5q#++v083T5(_S5c_Nipd?b6Qp!RrTFSG#A^Jb5LaHT+Du zt9yC2Ex*+ue}hgsgqr@R!}>p%h?yDx_sj2PjlZ^stpDW&V6;5}!pr6TWQptqZISL= zd+zw_qPR8KqQ-7W%E)ObYc1P)rt4cIHog$mIF5cg$fI6DaURK+P(Y&MN?q^aeE0S= zywj^NY>{XK^QyH|!)l<6!8Dz$rOj6P5-ziRq;Ujje?R_v@?>^;5~ zDP{h8t#U)x(~C4uF+5Iz{+SXXvX1w{#sKq6F6XiJbvmIW$D$xDdp*+zJDm4PC_AJB z$Nn;1j8{-0UJG?J0}CJFm@%G&Br_O@wp+{cpZd-=+}QT{>z3c@$!68|qWp>rW6aPQ z1oTgek0+(DLTe{-D)NdRt#QY=V}_Mwi2usBVdtj&SqPiSZ&Y+MKLGG{PJ64jp#qv) z!x16xa0L;xEob(4at@R%3wilYD z>`EoEv!EZ6AhH@QQN;%5A#kYjOl0~bI9458@;%pM&y|a{sgH?%e!co62!s-PWi!=6 z1>jrt$tl)sxk2g?-5Yuuo1FF0v*0SWvz+G=?gvkU3OB^Za%4Vgh);W>sMKJGL#Ks6 z4S%Y5Dv_&>K+njbJmF_6)22KSwf(@G#||Ejjfo6=+OWxQX_(A>s(2!CN26)K1I_VA z&J}m_Iwqp!Q`zP&Kd1z;msZ0o8=M+?$cz*=o|JFF;E(cjU;tn& z7%zQlObX$o#bFvR~K0p%+kWY$CQoPemrtd#2S?ayVN7iY?XP)$q@6}wX?a9 zm@cYjXEnxW6H%$ovxA}2JygCP2`sSurB*$6B_>9Q=3zy{`?pXYot&q+!PsEV(zUic zzaJAGWT%=O2Y#z#>{5#ZFN*jfalUp=7ij3!)2cw%tp-Grad6grnMAtR$1@8$h#Hlg 
z2@|54{(MZZT;}bLC}MGxGX{F(ob7tSw@iorqal334hGZ>IS-vQ$!YF}^v4h`m@%LaKYhwn!HfVaiS_3}gYxJO(n844HpFy&Boy zuc7IgzUT-Qb+%_gJ}QkWnTfI)3|`rhVvwD$5OZ(t?yS^Q|AokAfULF0sAl|pdNjt% zz;e!$qG-4-+uaAIP*4K_XTn-DF|bGb#qS@5*qVJ%P1Aq^tLXXAFQ|mtw}+a5i(BI099{8&2G)ZjAtyY)#@s z2=JQ*kB}2S7s|vfN1c=|AC8^1$k#fpI`E4RX{ZxshFGQAd<|qJi1HW6!GuD_&l~WJ zv_d@vzPsnxw%}FT;m#Us2nQjI>jV$LV^R2gUpEicA#IG0!7~b)LKVHE7(fe_c-1EW zwU5XJ8=0@~r36;JBG99jURo)YV_@hgwvA)o+l4JJ4M#;A(@<5C+2UqZJsMKcp<#PA z^Pu_8t}kYGM){G5@kOuZK!M_Ct(Z2cBZ8HLrEqMck|j2%L!G|KZJr(^1q;0WB2GXK z8FirKRyhTvqAkh_Pjw6K;zCZg3~>(Laahga+{ziE6?baGB7*2Fx9xocZwgPhjOsJ| z6>37TNOwN{pfu8{!^^H#A${Haz;3RlAy(rT#+`hBz@#H4g91MN@{z5we7DQf+Q=|m z4PeM@reY+AJXyP``A2kw`|5Yv#8bAvR44gxk!-&D=I=0ew2mY~Jq{f4aV;{+s^Ry4 zCDF}CSe7!<1Zz^%1WPuG%&TE8VT3391++qni{`BXL{*(!tyL6(*3zJS+8i&V(h||d z`mI3?`;?2YoIMRMLYpEo-Q9-DCC8}&62H~={k2~_)sondQ5-j}N-1=sBZWfRLb!WCt6T9v|`RMF-@)shBc&P}C`CyKOy<*qoVAjarSvRqpJUVg#g_Fl{>93X-yY zRe126t0|Jb8;#yk6!;{kp+6NU4PpPkzw8?pMp$;UZpGN;pXgN|Gl>mEp{4KQ4}WHw z!y`%zzkQR8R0P<(rE563oLpdb&+uRDf2!} zvMzlKiln-DRjd$s{-O(@y!WNHCLaFf)ITJFQScqV279w`kxv-qC{P-t2E1&~=pbe%B4fe~hnP%e89^^yhH0vG(W|PW1-g>zlHq|v@n7h*&Zv8Nu3ah* zD+?y3!6;~{r`;GfpUH{P@SdnNT*}&dg4$?{?=_kEFB=H5H;@V)DIz`9*=Bn~6MT<& zl;pWx5R;FNP+#QI=k9pWpftPw=|O7?qUZ`EWVa#58zdeoyzT|c=7w|IqXN>N7E|Sm z%W_U}>_`>T&l01S9&C$qywx97wfNsY7XzlGsgiv5-^*_kRvg6n#)`Q;wU&R5hSMro z=XS_Ce!))pj~t@-Vu5ae1B)32f|9nFaZbssKU$`E`TiDVo_T{uxj!qZxfyvHROh#X zzRe8Daim>&?8ii$+4|=ZO1&S4*PfYGiPuGI6FT~2AS@LkRoU+7^kJQPv-+Vs)gQ7O zr6=)mbFTNW-9ge42JZER0&b@0qczEK&&*atfP-b*y7Olpu=AdrMPV0>GtCbqjEV0F zeV?hC_c6>i$Coy+$iG^8VX4kStZC*Fg^oHtr3Dqr6#{Bgv$=%-VT=u9M0_28ivoV+ z*Kq4Y(jOAW3mS#TP2jPC(!jZQ73<+~**vQc()ne5)UQsANykI=8kbI=&EjW9%avW> ziYSCV<%P#2W%x;F$ZL;m`8NtpmS>fo>N)ZgLSU)~(rQ$4VkP4BDNw8hX@{O8lS=ZT z!0r9%&)Kwqw;7)rxd-nyoVVcK&4}&vqiKJRiAa}$|>Hko*9{4 zU4-C<&uj3_u!rmLaH|2q=2?zJT@Gb3|JbWIVPVPAA8V z^Zwrft{@@Kn3HxRe~Clv8f%cxMAO3@F|~5W%&=Cfb$-$V;%K`BW2fEWP}@rqgnvyL zayBLRk>06w_4E~gBV>WECPPrHdpHbB`X(z@vdOR|y?+kev`O@hQ`N6(!-4C)M%T5u 
zoq62GJmou>*L&{k+s6F&?DGef0@F_UKXf1L{{bJz#QcA-egE^Qa9I7q@vsH{=epN` zZ-QAponAhn^hyeKG+lB-XYoObkDZN;GyFnXmfHG>ee~@D7O4o|^38tL>D^d3fDkHR z7ZE7c`kK}1VQ{^b%@lK0z;=OQ`L=!CTA?MJ1(CR)jcxdtb@$whIylPtRy*1G@0*Gc zO`FC2p58g{#bgPuukXXb`U{)s5AOkCqIFv`@~}Qlp|lTsr}r!?_?+)nom|QI@wNjV ztAyCAN~WtRUv9nW6Y^X%_!+Sv@`T*P1FdNkxba`EWFLy`A$$(cvh<1Eqs6DGFb z&#;kd+e_Jo(=Y0g61rNBESlk^@@4x_k9DwL18F2RVl!<9yU01E3~ghfoLs*Qw9D|v z<6^6JvTTV$s@}~s1Skv?;-P1&nuWkuS2sScg+Hf3@jIrg~lEVXxo>+YHd2L8L2eyhT`o3UV z+=vM3BYuJP?vm$KEisj4RGo&Y;c$0XdArdZEpm(|R!IwcI}O9kIN*>1+L%UH4%3~h z-V$8E(U+nXD_PW)d7@y@EU%N6!-x<=OIT`Ml1@29v0_$N*%DK$a^Y90q>|Y@w39-c z6ZeP|KdEQlZK($g)`B15TJOIUA!*xS4FA!bAMJ*C&ibrnVnq-cZHoFcJ1!@ysC*k! zaYka*Vd=IqSDsv*C=c{PDe5l5Es#G|KN;WV5RWKqL=O_p*>i8$M6N_OKh2HFaSM?C zA>zWRWt+D^ca@gHyCXcx4G?6Oq%HCht%VB&s)rVmw`kBxgywb;lHhoWSu7__>_D?M z@kXG4{8exi)zXIU#zXyzH`xS_M0l|a@J7mKGwZO0BR#yzfTF0P=+=rS`H3dQ413STQ1%De+ zwTaMXuKdH4%!c`L=z_NudT%Zr*`^1Y>TN8n@f28_$Y!9CkuI0FP>b3U7p&|WhG0iZ z{rMn71l6(S-lc%ver0aVsrtGS9LXP}QS4a5LD(ZJTV4vgT%)q2CoocHKNHU*@S`^H z#&h?4alRkP1iRc!-%XC5u_95pZjuh>6D?B4(%a8jz-y~U6yj-#mQ+yC$1IuzCY~A3 zYS){Tpc3HS<%Uz%4G6~&h(Z;lo|ltfIAW@CrgPf)+9Gig|Zh~!U33Vmtmc-dkl{wJ{Y;^7?z&tP|)&D zP(&`n=g)Swn~E4{U!^M&(JB-77>MGP`p;?5v~^+-caB3^@&@Ws1OfRCELPpH(ZsFo zBsS7mm#*Zu+t7eeDS1C!HVh&o>EPx{jWxqM5bh3}(@=Tj(-{rO?r_SxJ2Wb-dOW|k zT_;2(m{sctK-)$i;~m39zs7Z%0_L*8{`TvVoxqr8l!@Mz9X&8JS{v#5eD8L?NvP=! zXHKM)a3!SdxkNvM)T$)(b#r|Ihq$^nTiO-iJC5a!Go@~)v)OLn+Cp2Jh*h%>AgN6L{?x!t`f0axNvi%R+kJFtPlcbSJSFGN1ISmq z{{zL&H+rhZR@%i)v1(jOkn9k)u3A)Fv!_LsX*BiXX$LY%ys;D@ao;J7hTOQr zpTO~}FU6ZO!Ardr$osVHcYukAD_HyOHovm5ZV+MTt2;wre*9&SYtKlxZknP>HC_O7t2WJVM`8S=myy_~UPblO~1_#pHnMXPYpbuqDK`~Jt)u}XmWX<({P zelw^!tJ+gI-bqPV!U!RA&3bm)i;M{ut`x&4#)+C6l-hp|j;>D&d_FLsHC}}M#h?e& zWjSTBe@u{cj1}3rmMu!n1qqU8=VF*T`nzY zYGT>}mhvV96|9VoK2~V*X!lop!*}*`M;E6?kQ5ockU=7OP=ly|bhR2D2J_=skFEl- zPG2mEu4A@&xx$=J8~0@xya)wTJ5#qU39vt8>&eB2?>i`?(&~Xwcr088uOds ze-BtrmhX4SWI*bEK=%w19FhB%9pjIJ#y6t! 
zla9OX3Bdp+rXsyeq_SkIYl0jBS5~zOUlrDhnT&XRQ{qW@+lxEenVf|S_GSvUY0QyM zrZlO0=we_;Kbe2C(w(q4`iYn%%v9W#T~BhRYE=Htu*EvpslE=p3XFjl?gjncy(qMF zL~DgqeAyf!z>i@YZ4$OhdR}WdQ|er#n(DWgqmD^e5vxo97FIXJ?T_+B2QJU{A&9p~kc^^p_(do_}sdq_BU&r zbt!ln2N6o*o*{sd4((-{xsC||e`o&MK2e1S&a6Tv*8o|3d{}3LOLI>06hh90^CKks zn}E*aQ4Sotod=E_W(jR!?+_bkv+hlw_gWf3}-|E?GPfdK$h^JJX%uN zgD4yZVciO6tasGXRH4zwz=c_&Q+x$=<#P%cV!%zqEF$|evJVAR4_UB(DQu?!A4;g0Y$ zo#MzLUx?haZ7K&G>fQuX{eZCJ=(bU7NJQp#XPt|+FY6ENXmmA`GA+#{_@t}#BA^En zdKHduAJP`D?0wv#xa&maOr>%bBQBu$ud?K{-oavX>JZ66Ctf)YDV5a8mtp5Z-B6H4 zyRPwemENnkzHx0Y=|mOo9Xk-eX09JTNBZ}+4sv=wkAB=CcUFv@szezM6JRH=TUZP6<$4^=`~ zX_ucICB1YzOyFV2b-+O;H$;^-B~ANr3AMl^y)e=Mrt9ovRa*rwv@QmzBAR>P@r$nm zh3n2PQllr7_7Kh0Fx@wND?AMC(rjJELfJQ+1m&nR%I*#L4v84;fSz*1r?@8+S!7e9 zrN$=Q-E*7E?)P8bSYb|7Z){JDk*EH!%{Ci|&iCMXWvG1O!~|Tq%r5M)fa4l*=$CFQ zJhq~Or0|v9mVAPgK~>I?+SSD2eO`PqJqBwDHDi4B)KJ{uCu!T+{|WaQ6pC#XP49VE zIG5>)BX>^!oOMn-zUfa zkN=jP_5VrBV*JhVzs8YGt8K>MvLJci)ZP+zwP|CfTAQO&V$u@&qs4OrRhl(RiPQlu z`*h@9J!PIm%9t0iTv?I#^*yX!(mUSTD!46B6zWilz>m-*V1q&GA(Rn}uq6<(s7g@O zB0<~iM4Rt_^m2Q!|$T&j&JVC06=OyaXBnO3RO2498k<4IGzlurx(L)O_@y`n@GO6MZ zV%laVv9#YL0vzOrEMKm)$vNu6?8_mM8H|;Wdqb9Zxb4C%Ye4+@RrcoDUm2n+W*iia z8|YUY1*D2-1Uf=|Hksj11cpWiW)ka828ZNy?_b0!Ci$yP*6v^Uh7pufP)-A4geY#H zR9NgyayVjMfzc(lj-$02QPuC(885lTJg9CG7kYgyCPY~9KY4pKh)w$7d&Cq=APj=# z16s(xY7kZ~^*$AF+~oYn0fC()?fhI;D4>dC=KKBsz+p*;`hg4ko)OWB`#1}qEP|Nr ziMQc>n6Eb*ca-aAUV^g8dtfK!I0c><+BZXwtH5&tg5O9v@UcO$>q5Rr6CsI-`d%0_ zpK3+7-oJcb?@ECtCB%_+Rb6ejyRh+jG=loW0krQec*8_AjQ(jWB-kJes20Np<1VgRQ#kZjrPK4Qz^ zAgOs8m?>{W&yLOxcye-m?qOg9l7chU(lObnKqo$6P;~LfzeCK%-fVv$4103BOEA}{nh!q?Z=V0tN-ca+1S~+ZDm`+ z#6aua>h0}*XY|6FMM`s+LL6wQ3?2G{-lVN12v%i|>UG^!&2`;!-!^Nfkk<_{xN;(* zoo<5de%CZ>2QwTMm0}s);kLDdnV}CV&l4kv6%)QO`nDlchJtmL)W>SFtMbDvo8uxI8%AvbJH8ZO}%FZ{+NMuVy z%yp)0p<0%YJ*GgJbugDQwEtAb;^(MJfns%7?D2b4JYlUaAM5U3rItGBW^%w@@{flp zXhN9_X@R5Y^7)(2F0%%%lhImHPrAadD}Gb9`O`x6Ta?rSS9k6FHDyimgnWY1ChCMn zk*mF+esf%ID(4AOo_wAC=eN6j)XQ}7?~em3$WbiX573y0*Vl6k$Z*p|(gEp>Mec5b 
z5lmOU9hnDP`dS~H5%SK9)y)fWK@S!m9v^02o0{ZES*2h2J|U8`-at*xxy?QiGyb)U2PbX$3yR4Wq(nGgVZo3qMNu>gXZ z_*C>xJ4UHyXO@1J?Icr|e%AK8E3o;4rK)C9!r55GgK)<4ykUv{`rwnS`H^6dS4Q68 z-6l=!(f6u@orF#29n}Oeb$;6MIy2Q#82mDnbg*Z*a)cM*d3{0H39&Q;fps?Gjc-;| zc>TW@r!4;|50{nW|0LA^&%@$wmCgU^K)g?=AI)Z$1TcdLnZ3uI6QpbzZNYn+4M-S%g$RUj4^Vt4m4O1d8EP% z=urbs!vHDXECM{zv(J3!Ny;a(idcMOedVV~>SnnwYP@D!ZL;B@i8E_E{cNz6K;b>H z05jL1I5h|Vf~}4oQi_mQa6sAzzJ{whhr_Q>(~#p8%*d!y1xUwHV^Cd)N7Bt*NoaQ# z{XFP3BSDw|6Dfn-PM-UA8Kv|UWmP#{@%G=O+S%%-9&u+3RY~IK=7T0yxBgtiJxEiy zeRxtl>jby$?XwGGW!3kdgsauEIC#tE`IWSD$m-_?p`Lwd`m5+XyYPkvLPli#=}N1z zPC3ZGhGQVj|E_RHUl))Va~at~xfJ7~*oG0>15ZT={Zs3I5UexCGn_Y5@LcEmr zD$Drv^(C@b3;n1Nn?;CLawrb2q zYj0@mtO^s*USD?oTTeA%`%D#nbuihkog5D@{dfBpISl!x-l1{oxvgxZN+oUE-R~zT zjHF(tGz7t=;CyJ#bSxM)h#D8sAFIvC{%Jzf>DC1IPg$y`@jU(hz2~<{&z| z&yqf`RO{rkG?(YFP-Ru@bfM6&G%H;9hiGH%>m5|8q1F$5dnDP~fA|*u3AFxO+|j?> z9oGLXQfyf5U-kn#+CPo;t~umvm9chQ5NBd2YpZ$Yb+LW!Mhs-26?6YCh|K=ilPCWG za4Lz+O1Ae{nxE5S_KlWvU87x{BZnIsRup}i9hw4&51FH=z9>upJLwFGLr=#LLCh~> zg|cyKs^NnKo>%ub6D8ZC^VE zg9+E`b7kt$LO37P6OY>$p#O`pcMKLKh_W@0ZQJ+Qwr$(CZQHhO+qTa=wry+f!@TK; z?s(JxDk}14W!BEM_FmuZJ|UiAZaypl)EfzlKL*ke;s8dMU#*L$=QAyNv%m__o5q|A zFG+9JlDry#0Hv)(L=Iw#ID>(4uUr~uoD^=xc9Ts_k^-rodJV50Tz1{Fo73GnWA9G` zq@xWwHZ1?ft*B(YXjphhd-@eLKN`awfBZqV=J>SGa2nWyUX}(@*h#URCkwF4`I2<`rAa=km`LP$jD2{sUk|X%30MMWDM+1!>7U-mN!8E(1BWZX@ zH@ic5be?Ldi27CQk5G9);{2FHF&?y2o zukh6!;9-fMdhjOR0rB@{4din|mT2iE{Sq%`QhnHw@RbZaky8@r!nl;vecl9bugz9ukSmp>c?(1&_M42_@;y3D!~q&KE;|^Y+Ti+D@(nmI-msJ!$r=9+qq9;mKP;+$k_wc!RLP%Kk5>lKZ(%k<>F#7WyT8cg($ zGTF4UvUR0Nb3*W4db3_v`^4~T9Hx~JP0ue3TKeOwx_LB~tCK?YBDw-CJIg;1HZ>p>d=-{w54q+NCJ)LhcSxfMap`y&#Y~by&afkyy zy2aLe@p%yq%iq#bbBHE%gvYgv2C2K!gjuzb=-I;b0>YJP%J@TT6?A%smv3} z%I{{Ws2|1CUQPJM@X-?n!mi(aQMle>o}bPqr)@6ll2g4_Ir8=zYIEC{BoCLlrvRFI z!E^Rj`#b}5B#<`|yvYDiUgL+pJSclJuwISLP9fz?*X4RMOkUfxX$}j_d|l^4In=

=~ zHXFrv_nw$BbG*871p(Mun8R-d$$_al7H{-PJs<7%JvZZ<_#^@*$HBOqVwNXj^N`~D zm5vzugo8X{f)hdidd}OgGK(25j5``0llwK(vhXK&?C(R7KJiIkdhS`eUaXXU_^KKyjg zc;{jkBHh)Rfo)6Yh2jk(!-WOiW}~SEq~v<5mHfKhOpDg^J*75JCtdhFS zX|~5J{99zWC?j#6(RA~byWi;yn>Mlhr5;YP_=CM3D;MoD={mV@Q`aoR@MnH_5pCQ{ zM#?OdoNc*^l=Mt^NLdQ5KWjgP*4LN19Syj&x+0o!*f*tp=W6sWpmotU8K~U1l@9kc zx;EKg@0Hjln!j|JT^+Ok#a8&Q7?$iT|NW|bMs3q}>z`!2s&+>a?@v3z^#|BPqLtOL zVsq5fDUko4U>r`1ErC7u{ShlX(X~Mt6LSOPLp1WoY&JWCa1O~i45HIsS0vVN76rpUKq`+q`J=Q>h80s5P&Q&_Rvy`VVb?PNF4A z5N=3xrR`+<`0t0nIS2|e&2OHyeIPIcG^^>GXD3_FbV_ zVRdvPie=#JBu@l`&7tVK^&yM#AW8nbr+>W~{I{3H(y9X`jTOJj&7fwej&n#~! zGq^6Ayq9Mr=6E-bF#B$0Ofox|E#4a|r7bBiVNykI7%le9RdIEJ2s8{fn>lUZn?whT zBK_v#@Ks42qUS|%1}r{o+O7Ck+V;L>Q9rNV)_RlU?nW~Dk%1PVIk2*c=c>Z}ju+V(6iGeCALKW}J1d!q!w5OAqO4=mbh^1i0aiqHHe@Ry| ziH^|;jy-Skx+w8f<5#w4J70ADr0I%FwJ{#*IHeeqZlfi6_jsiLAbR`xOLCCx+c^ls zR{iI_%F6g3mDv9cNo8aH?|XPXX);b5{}Yl5g=OMWNpt+aF~=~@ZH^&*zyS#i0QAt7RWV+-RE*_fUUn|{{aW|!(>lbpOXtX zgLhZ9SHUA$derhb_WSd`m=Y$vhgD0XmrTb$-;y6N&-ul6HNdTDXh1m5fU}fRP?f%k zpOodF#0^7WO)v(g6AZK3DEaHRSuW_$pDA_9H#&soeqjjRx~-^~NU%~5+Q}|wnk2(d z0Szp0gNY7LVtVdK(+Q~)vEdSqw)BheEm{9UDOADUZ#4HvN1{qnGmKH9fCG`rJIkf& zHBl7LR$t%oL^JiH_>fdWf@s&fDnBqoQ=^;-f%cOH-f$90R8Gzz4Jq+Pj#wO2{Xz{} zZd!9n6&3zRCL)fK?U_-ETat(YE|DV2L+O}D*kW?=KHCGaU@59{19T-i(p(@77>(}C zQcX|oBPQiLs>Ff6h)q#Sh+F4nF*Bsaw>z#~zxOm`!wiffm18sr3rblfOL!tW(IDQT zcKaMY%D0-*B>Nu+RZ@bOrBrw$I3hmoP0>>%&t8gi&9YB;pCMBv@R}D;wAJ7RvA*6$ z{X6NQ8J=T8Wfq(6>?Q7F;wLtb+X1K!@lS(Kmpc}+>Y*jS4NWd@>0q~okehV|YhA1$ zwCcrhsutiTn*_T4$;+v9wGu1q#{?>M6mPY)sO6rI3DeU87g(XFK27uBJf8!Ja{u2$ z?K~x{<;?bL2wvo)D|oe@jV3KK2Hck*Dwe_AC;hKKR5(p^0>cSN^yqhit8;o{J7}Tu# z|LPiIV7CTUVcoryHrKhn$V~iH%BbxG-+cY$l63L$|IZtTg`Vj@)O-Io3xl2M|3ru~ zaIpNhN1kUiZR~K`5Py980`arxxM!&(`XS=c+iB*31g_w1K!xO!t*n+;Iuc=ue|zVy z6w)pyPI)#bN$HIp?aad9!{%Vj+O0yABsMe`6RX{c6q+;FaBQR&c_42p)D5+*SiW|W z5H$yx&}jZ#wlW?ZEj^}o7yglR>7=V@F=tUgBri=3#}Z=tV^})up1f8N#z~Lj+zx`F ziD{eE{07Q6xq#>_m|ieXY9J(56hxs8Oj{d~T@=)?RRlcKl21IZ-BK-`-w|5R?*wfS 
zaAKN{S1M2=2AWeXXMJ38z>Z;eXQ?`NuRDvg&D>jEES)S4cgFC;-@x(`E%5Uj(G z@6Whe%#%7nCaV9wZ4`1b*&Vt^Dr?n3+4b3SytAzm2*&W6htv0~wOs+D@h?QC?sc(^3`Eq7n38y=pY#HNX zmv4Q!Ct6j3N4ZnJpsks{8uq>4h8oQif z{Cu@;`EAoFUru%Y+59u5_X`HOmgWfBV!PAh@o4#coR+lpcs=9e`9fWj7o-C-BrC@a zuj)_N1#K^KF$6k2OYf!4Kl0JFdHozU+pW%<8x;FW&8U1#P;Fzyo}Gz*W2g$`K8QR;SRVo6_W~3>*%kZL35Q> zIRLIV+0V+%3>p?)sXtPDtZ42{-Cw$$Z+kwS-}C30KGn;*wl+Rn+cLz-s%QMae%gLc z0!t$LoukI#6hrfO$1qGoQ%nIs`2w{yf)`a2#&nAxubcleM2&uLK4w<7w~zb&g_G*) z{n)#-y%SY4)$7v`-CeH&T`F1)MF03v#E#S(t8m({B~5q$gYr*laEuKLRA_KKQ$H@* zCunc+=6cBSvwBLierPum_JS0z+2kfVH-t9~Jf~sy?D4^X41k5xb%@YhlyebwCSssv}i&ybz1C;?m5jiGMwRDwGHB<8oFdq=)-)F|QWZ%lWiY zPzUEzBp*%bVYxZK8rsYH)KX9f>tl0fC8US_u`$mUxlSJAc0Iou*30|!Tu=w^<8yW? zq=*0UA)A-RsEI`rrz%=i)UxnlerdjG9>$CHVSZLrV7CmzYYA3XU}p>>D})d0!~Epn zvFLA+h^Yj8g##`cyeHuBH$bz<(aUbw8`|~s#~E}47)+jwYSg#izhv&Fn;Ps8r?z3t zU$Jjv5Q8W&FxCrZJfP>j5Tn@(4t>*zJl1h@ZZ}T zj@YkPf_Py|4J_dX>SF*9OrX_40*rUPZbnZ4#4!bNN;EJ7ma(c-9tg30g*31LX`aP; zo^W=f`x1ftWk)^$0o;KoUVy+{JP?+l+8Qr};O5TDMFJd-Zan{fS5;#R1Vshye~-SR z-Sl9K+6SN)z+kb&V8 ze-l=G^lu-?hW&k%Owx|UOetgH}QrHV;?@=Sz1&2T&ov)_m6{NDyLSM)z?=|~RJ#n2YmlcF}XMDpv_?QQ#Vl;{4yBm%^whXvaoA6QaE>lO+{!ZL=# z`%ib%t6>OMfBo7m_sLgGhJIBWAq+cMR-mPR%0C$02&%+_qjNhn=B$b~!6Su}UGDZ} za>v1p&&lW|PsIHzhL)A3qJ(P{Rc0Usce_A}ESrICl$PL}3&Al46<><-R>b8Eh|$C8 z^7Ik)qzT%uwjPIOS6U&d)6PmmesiemG7uuf(ia{Tu*lRUq##wSt;Nfs$F$hUoSrd2 zCnTVARl;!v>cNT|&@B`zg{DG?SstR?mIQPH-Foag#nG)OmbYTz?3?EAif`v-k|*FL zC$5PEu7)RNOREf}#G{W_NHJ*3$o2!P6Jn(cT#pWO?Tfu50`mS0!l?>ihg{K zxB*Rp;2z0$bq7yyHOTFTbhHck)kc;F&QEI6JwqHGQbpO%4$z&&Zh_B^up7g_>wbC? 
zXo?hnJ9!gh>zIYOgPE(maS(;n7*+;`DgRy)&0?BWUpa1HZZ{5b)hhVy344T4s+dw!ySgQyZ#|{c$u z#tmFKX>*ZvF*sjQgi|3v2vPj3=amw-Bou)lnS>+(DUzT^;#Q@CvL{G9fs#NRu_OqE zf>M$WNKbyQ=XL8>@A6aiujEdKhw01o&Ro+f_H340%p2-;f-5}>3D6Wg8h`{~ii-=& z009962{ix^Bp^FS%)YwX+O97^83jyC05PKCZ-Bh$5P^EP9BAMOuuB3&{$F4<0(A@s zv?QQ^m=Jz`LIm*#I|3vcP~`yk{_B8FAbuJWpin@2N@9f_^Y$WSw7%KhIDT7lcm!$+ z3g1&TPWc7c&`=>lhVpqO}hc9_fWIHWB`S#@ejN%{GuB_)7^P(=Ph z+++6}0G@qyvV1}qU?H#ms{nLj;AaBcdcGn^0QP`US^Mc*f5b(=OGtqsfA=sd94I0E zI|_J&7kXxdKF)b@CGhgNKtkUj7hm8TfM07k1ZsdEwGO^@^pL%9y$J+zl=IL)g6@3? zv3%eyRIp{WCB%MDgIfSXh)-yOVbq}EQz32wd$n*}yt^tF2zi+W0Ehv(H#1%s6j(6> z1|W>~%aQ0nbd$Ty>d1OE(aw$m1a+9bLOxV>EU=N=uC~3rAr}Dx9zCV4Qx;^R1m%Phh}$= z0)z_u;9nT@eC@z6zrc3`{M`Ed)E|J;bEka3zs&&p^z(RdBS6>svGyQ)yDRs$=|gtd zem~~z=<`6qW%w8noo`>46Hs&XUL=DnpSyQ@V$|l&a@Vzq6uWr|zo^QJf}VgrZy_Q6 zUxI}M1Q;YlfGA^pZ{JO`IuKuS_`eBN5%&B*bdfqwQM=`RUN3J#KsS6)`+z@9>3B$T zIvDYvyV7kS5rO${ba%fQw|$B~e$0L44}FZ^eO!uOdlw&6v)|Zvzr%#K_G-6$5Zo^K z`uB|UA_pV>Z+7LFS8c8?1}}Huw!gS5!-98?^1|!~6f^K>hyW2^FGCEL_H^mrI_zi{ zPg$&e-=ye)P9enlZV4D~Bd9@hKmk8gp=y}Rpnfs>cUxb$f%W-q{;1PlIAL1@T8 zP@KA3CT}+V;J%;y`~s^{q_jA{>aZUGF{5pdQKWCGKdpX{_+XEL0)R=#;^ofuPA>LS z1|>vIWWST~FcYd91sw>#@&PQEty(V2rVIn*Tvn2w9UI2z>N6W1GIA5&^kz3hv2%{F zUNvpwynJ07nZ#W>Tf^@^CHKO4z^Piu$uS^u(~8M!#gPl!AZ{8(n*SK9V>=Zt~ zl{a7MXA>f7g`~Bpi-oHjx*CZkLUTkGAIC|AtrN9#?jp`<`*mA#whaPO3#o_|o|tXT zCYo^okh!SJ&^_NiOv^gexC#N9cyX$|PQiv1DkY!l%EGj2>9gEDFHCJ~ModAYW3oA_ z21~u4BiwS61}!_SOAjxlD>665t`20IBw^-v=v##58ID(K_ zzm?Jjx~j)Ta|i2pP7t0OL;IOLO>?-6wxq^{2`9^j)N0Jcm|HmdvNY7*E5Ny`wvICO zH%_<5gI=G!>9}dilP~*9e)2&hssCJb;PE4>QE=CaEf>I2&i%on$CCT`FH!N<()bJNDpFnHK zhDF=RX|Gt1Men0k7~=eT78QxG#YqX?ublp#)YcO-Ey<={vA9O5V!t?bITg=5-%3)K zfdjdB0#CF$(F2tQe%2MuaHKK3WK1BcZ(qJtmY%4ox zd(E-t50G3)GO25ZpXG(T#I z46IaMzW-yr?YXB6B1l$AfwbYC<1dAtk3<7>^wDV}YA+}X&{4EySoG3^hCOCYJ zJb~6`K5>3J`w0d&sO?R(w0w+K!Iz<`3hhFSut{0AQCAqD#dG4B%X_|@qgB6+QFZ&) zewO|!>Iwabt3X}1bQi$1SYf79lKODk3`u^at0Q9_i%kaJ6zKZW?k~9bW)$}f8*Onm zN=j#w^Ob-bb|vl<7Zc%37^-hx(+`*0!Su0TN0=F&JV(9@? 
zLRgQN9|9eA@BX|p-Q%{#=jkB>T$ zewfSOQ{DSpxmeu?_SfoiOkGGK${3Xc*T(Iqxz|6zVx-YZRmV9rb~{ZN$}=LHz9p+A zT%E=Pr;qm`vD?zM#y*o0e}OEk8oP&G5-Q5j(^)$EW`0Qi|-N>VtK?tagU8 z64-xw5&wmTvT=DZE~m0Lut>e&FS#KVQC*vW)VjUdK`l(sGTE4aR>20)SGzgQ${QU= zTnExc+Zbg79@_y0OlnhTF2*@06{3zW^uO@8Rsb&e98`o^hn@Ced|ejv?hiYaSCSe1 zC&t=hH{a9@L0E_T@bk}KIrYrQ6H<}mG`7<MPOdt> z9`-=UZE%uq&v@^1b-U;!;1&7;%|ZW(vQ<9WK?-VKR@4jH)|$JEB1ZBeJu1Tm`7kxo zl;C?}{CIfE*Kbho`sD{P-xQ%B>^_}-&dN3$%zh7@CX4Z6NZ}dQ#bd5B{{~83dE;QT zGuc?!KB^t+D96CqFetI|tgGQdXLdE|lBd38xQR-RNr=QLxt(2|&gPmPzy)>xU^*hdDHrz*Cj^s0cxS08 zSvPItK4N?>58};O-{Kl9N!&^V7t3D{dc!~;2zIQ6TIiv&6P1d+*zqHQuRh3I?J)EF z?bt>~e8hePdvxERUH{W?aN0v@ zSG?NgdHP^iq5HlfTDKTZ-E--Vz;=}7B)y7fvBK1doAA)^%Fx9x_&|N;z!U^$p#sQU zfse!NVMc?4VFKx5aTYS>&aUOlO;sl8?5C?+Hz%`bI^p+g%1o>^TdGRN9~1^^Q;)AU zW=YVQ_dB0HAUh@}Z!5kWasoXRP#tip^v&epxsbQtmQj+(x8-4JC{*X9#@k!l_8Xe( z_+FWC^W*jQ_p?gLabmUHD-?cEt+x6L;pBVId$XEJ%S6bKCep}j+KubfG)VKv`&Zxi z5Oe7xS+=lwM##TMATpIy%*RRcw1@X54(7==^v0?g{9O|b2J#0ESy?#IO;I;|#k^qn0EBwoCyhq~8!#uyQZoPdN_Q7%3Ml#8)eV$mM5gZIe+@3UTq< zg~~m;oRoH(C`&|___?l^s13bLB)}KKJ~NJI_nN4R`?4O);p0`gnd6zEbHm*Dcphjm z9UC0$NXSp;>Hs`qsuIJkMV9x`dF4;d`%^nDE%}N3TUE&^>dqXH$D@PfVg-v%P`H8v zqv>~ zI`Jt9*huS(P-vl=#gBg7*r09Wl!8=5*db`wL&moEHB}s*@S+~Ies?I-wQyl@pwun( zGdi|q29F%?pw90x+ipU}9t5Y$2uedY=fR(|c;kTcHc?#Txx;O8t>f6_7`~t^VJAn$8kZgDirMi<=gF`4w zo0?HNcCG-Jc%EYw`l9>R{&EpT;q{~O@p0kTX`T8ASsH`ION*Q)W$y^yR{tsh8vPh8 zNc-sE_a?_YCJp)#FdzLcx0^+kvUDD0qPeZz+4r{wWx$kG!!yE3w0wl-)$9c2*XbXp z>|F^nHVCSE90tU##5Qo%QgBP2+IHjY5R{v-U}Je?@)}(VU#O!lyVT?Jbox0TMhIsFXd_wt83%67+v>hCZHdomiKGN>pH5icf{&j6gV4rV^qa zL6_9l8qt&P#sw!`EcmJYRMkAZDX zZ=rVLxyiW=Uk9JHm;cGXsMWI~Ov_RcszqsKb#b$AHP!As*fY8f zBOCCwHPyKM;=6E0M-8*b(2CB{)0TZYvLGu}2$&^BTp|)r&ul72+d-4A$jkru5UIRu zE9W4Sa?$FN`b1!j2#f}UyXm;*wL&)7fO?JIxbLYc!8J+LqMPN}D8zcro-2nmJX&&$ zGXq1>7{gK(0hOdztWoWmUc?eSCakMG*4ebIDsX0=G6QjghIh3a9DgWddu!Mm!w1a( z@jARV7ED4mlhu(0QS&~LSQSf*&k*ux9%!)h3Qz4tkt}yo6w43Q+_8RPKKqHslHwK3 z?mh9x0<~t0+VRefUxJa0{bDXbaAYdk@zV(LzG@AAN(qx|{g<{$KV04M7dxdfc5*jw 
zbQNrN62HdAw_hhUwxL1NTHy2W7toSKRT#q1Ptp0NI)Z`^S=)JjQMzHhQQHPz2a%3zr|mn$ulgT$Dc*A90D(hbAREZN zqnCaZS{KveW9wV;OW$ZDs+h47a@lMQs7ESyNA(doRf7qG(5=nR?12LaAG+l-?zP0{ zS|yhI(Av1ERI!f0po+`XBpU3k4C8v1Hd zc?ynA36m~I_P6GYbA-48Us@M8%HwE3WRlkj2um?zje}Hp6iEy|szgE0u^y&DTQiln z(o4Wa?0}~tCFW~sWVCaPlZl$&(EV5O$z%dTXza7a_g92rjZwMhffi5Ej<|W;{M+0K zyg63D{Q95!*U7k=l#~awqQ8{q#%2o+#H|#Y&Kk8hCR>84O*ZGA^e5KWEhO@=m+0>Y z3MlNyd*C51(i<-n1H`bH^%isx28zOdXbQu0#(DsW^g1IB58VpgpAW zgEb&oc{dr{6fW>Y6^DQR{l0qfL|&sHfO&HT zEr#ANgl7js5Z><3Lw)Q`c=ENTS#4b6x5&f<+4jZjYevsXh*n6pc)pmOM$cCXT96|- zaY=ERu)O{yj%@L?>FbkVJY{&=$dpo#BffYbLdQi+ccko)`CDB$(4N2rc|jcf_uY+o z!3tGZ1IlI#iaObmtn~KW>EUl~4UbY%YKL3yC+$(0U)k+TRH(;;gXM;fk%0521)|qT zs$tr;sFa=W^+cl;MH!VXdNSbsn76EQM^!ahp^XM#R}W>K=Vut9;S<68-_gRU>-DW| z5YVD-BRv|Z+(#W)DMM35&OR=QB0SOuN=9x#U3!BNqZTa6Wld+xK zKC29R$t=2DQS;UoDQfjy21!IK`76Hndxwm$Yd);GA8&3|%a;=yjwn7lg5Mi;S3+jC zNfeaS_|Vp?U8i!%6muR*RHHAXYJ(|1U>Ug^iq)T5OS_3LO^~(639~Jy<*2$;73>G5j zjmMj&@&&B1Hd?0H=M@31Rj~_E(>v#j;Xxw#150QvCjfObW+;|U-c6q9_VNaH?lE%N z#vEz`2W#w#*^h_u?O#f8D8;fwdn=-JJ;N|YffZL3c%?Hfe?WD!6I#UBV;!Z7(%AK- zfww=*w-R>#MS08OGbFGy=0+9z?y(olw(jn!-sF*s9Q*c ztz6^V48fN{$9iDAzi9ImJJDCB9WQL_*1W8aG9*@e=ApxdZ+{0hu85W=l3PdFI(=ao z3YSEP%E7gK-Dc3p2q*XYOEpsz_$<8BkG}R?&II+juk7ni!LARX@Ms4I1({%)p^-Nfh04 zLpw=Ns$Q?otnS8l<*e9cQGPjL>>+uR<7CS=Bu)1(I`)h=IA>@F z%{fRT!@0RR`Wu7Mk{a2#;dv$OP0k2yMblTJF3rWm0IAqH=nLno<3i#=h!#3CT@`jj zU%U4iaXHBY3EpqL*x^f&M`O|lC+?}kP}gi?GNtjeQ2%C@4)KO6DV?4lZi}N2NPqM> z&lw((U8o%_^(Pkzy8TcFM+nAgiym{8GIMQlZU+I#F1 zhbhztINYSTRPS?SBv>~~jZMp!6%+)Yc>^nZwqN}N#JKA?vkEY+jO_6Ci!^8}(;TpE%Q{)V2K_WSK*_ z*!uJ)!rSSIdb@};W<~kC(iJDAa-~?*Aga};FXWb4jf&D$(9L74SP>t5PMB}NEP8wWhqt3R1v0d)cbaHRcwA>O=bU9P3m>a*e!*$>5J{VjQx zRc6DBc?Qj3OoUK9da2xmVL%cjm-)u!DHM$4!xVKs}>fEbV%+Q&eu8Fqz!V z5J!|3Cs0;=dbq#mwDNc>e&ZXgpSAQ?qO0=zrgc_{6=`?54lj>XpV^$Q?4*iZ#9Qh$ zFp88nJquL9es{3#Ww$zj+plB_t0@@-CK=jx`!4vRRYTmgm+x=ZjZMZI$p)1Q_#HsCdOADfiT0Nc{QO5Q=tZF_sG-%k#qo_+;cLXvBpEBg`eo{` z-ES3uMpE{O&Ym9U(aE5xh{tb`0uq9sv#ridtOsDgm;~+KjUwk--%WQnb2+>Pt~bZo 
z4c{zc!ZZMU+RBqIt=&+XBpDM#dD{+-SsTEoanv8|meRvhI5oi!wId-?m|T*G&5ZWh zj@xZrVHNG#m`=VmC2Lbi6)*Q38_QSmQ8Uke(o94yrnIWGf_l*6@`8Fdi6$iMeXM%|W9o|)*Zx8n zx5sHWow?VRpQH@hM4Z*$Pf4)S-Z_2_z5yh-o;^c`Sa}Yytw$ejlS@pz|A23h zXX*Y|79Y!hXe|5xEIxYn|JTX;zgc{Y|J!Z;Z&-ZJpvud;e^~>?$rUAo_%Jxl$j{-< zZyfyr!9p?hLvKU{`6&R;k(3b-l;acO&QVYh;t|flJjU8zdw)NDtbBGeTYQ~fcII}> zZk=*Z>lrSw?j|;itPLp1N&d+h@TmYOEzQfr0QBnN_%LWBrbbM`MhNwNwuY=m>EKwg zf|6f!{)ON$kpmye`54r)&>5$h-lw!L`vVq`F`S$;HtvRt$PM6|wXuWpW_@SX5?gLi=dceRN@@i{& z6K@gg077K!A91??$Nu-Bp8WvTz<_N*gbQmha>YE;!2Pyx808i~oVN5w`1L3F0Cab+ z)&T#&Y`e!lroWURVn4E>8|wwy+xg;p?2Cx@{>1wJv5};Fqu@i(|9qi7e|ZqBkeM(~ zfqjT_8nnAEC-QuvI&l65;Gch!5vPD|L;3Fu_QQ3EpuQ;sxs8mAx`@XYP(p_D!+jr0 zf&YBS`mr0TyHiu0cyhbqU%er~*k-pkmf)VUEPq6#n;S?ar4Q(V&hXzur(gpB#QHTI zLIy&B6S#iwt85V86t*Xqpl>)q-;M@kkdLl{EdcNU&lVi;+91xp_`$F8Sf>G;pY_$TSHBMcG<~WU0>Ibh>um~Zs_qLp$>1yB z-R>)uer=saap~Nn*@52}3W_|hKVM!xeLopF5Euah1V($54#-baG%d@)3OM@RoZ$vJwT%J5sh_$nBpTqR0bl&DUG1;zd!NFWTFNi_ z)NdCdsvO9HTl%Bh*Dr*S&OXF$9{{x#4}&_Wc|koV{}0KIh8j=Bbl9&}b z4kp)h&OEtFx=l#AIwz|Ibt}$^SMKJWZe^?D8I|{_tEln9eT>!O;Nfm^&FGgHEERH* zjs2a}0!kXG1P_qEI07&WVI(#-B zvfX@qFWvZ;>Qyk51Ro-j1vI+U9IjD>YxHwlTBgzhyoDzh<~En04|5FPZdY*o4&!ov z1}v1lr*11Y&N7)D!fE#X;%s)@E(tuVK*p}@z_kr{ukVv8Y9j4%(vm`R)&dHJ!4U{f zgv4b|gk|aN$Bx;9DS20__@vlv#Yv+e?QP0t=3a(ALLWUq+$8!6jS&H}Yot;8H;ZRq zcHsHIu$m^M>#;;Eq{r-{c|!){ujKXGQGP785ePV4x^2X+z;1GHn$g_Zja?VH+bLCu zhB&P(-#9SquX+ak{Tm!r8@^-q?R!`#1w1b!Nx8777vsbE1+X*BSe(48w^Bn7m?(~8 z=J%sqyUI$K;Ozs`;w}U;+pb@UcrJ;v+b+hyA~s`FsC;{3vuG0#yf5hb~Y8=f@OPk`9)w!F=$w>)PnYKIAkU*kK2_i@fqn zldiIj9IuR``_xS?NnFQiu9pLK0Cr8n(j~lZseEGGU}lz!VPeh5&$x+wpWV-Ae~Efg zo)$KlbRw!Zn6x@@wAB#c#u%Pgdo;oZHrmmt>q~$UO>1@7VNI}dI)z_a772|izHV%;bYNBe9v&}Ky@B?iHIMqOht(p2*iY&x$L)u z)lOYHk@|i;fyPcv&q6+^oj(AJg7@G>mJN;9b>hFwxO`Nqx3|e8xwe)Z5@i!dtiq*9 zIpCSq(UUlG#Boyy98M5OR-H{Pf>cf6_Q9I%bz#;rPa#@DzG*S9{mlEDXB}yGn-%l5 zmy!`BJJ8gN+}a{*;+vmABbWxa_Qh}+RYoVE zWqE^1%rI?2Re`d*{y{_*BoJkb0e9ALA98*np9GKaCn)l(vMswBuAY*(6&bW*gq?FG 
zX5GrzG#P*;a%2sHnuoce`C{-Z5{rq@*hlf!u{*I>fi9#ZsL6bS7mn6c?zb`{Mr6Fy z{8(nIl}>~cXusNaip}pwv={%eXI^A@LAKxNyzJ-v7e=z#U6M)xUzplP zqtv_InO@UiQ^a3RvYtVwcTa{(?Q`CZ?l;ip46>M8KPad1GJ03G(*d+H;?Ncw^jS!W9*bmJ(xhT#174(=@KQQ7io0F~Uyk zz305vU*u4vK+P&uuP1v{R;e4rX@07vx!59?hMsJy)a*tL zMibSP3+nypI5FBlH{|L;`gw#tqEznlI|# z;Jz^KtTI@lSSu%Q+uIAxndzCJ%+LW3cWu8+(868B%nXC4-c6va41eWHzFBz|q5q|? zIJlt8fb&JMS;dEa8*3V0y-I)HY$dq`kicVOY`AB8a&}0h%AAyk@$wtbkaw9zt@f@d z_Dvf<7#{0=(gHNG;~lNdY1&+xrmHYM+9o#9j!-onrJZzQ+`1Zl`T~f3Fdr{uT|DP5 z8}VF;BIy!EV&bV$k7myB0u%bKWT}yaitA!=eyO_)1$8@RKbNj`!6>;^^B=o9alM-q zoIXKY{q>at_l3Z_dHR7J!6As8#@M_kgXU*05>=|W=$5Xdw6&Gd571{qFEgc8WYd$Q zNll0k6h{jU(mpEQM#!mB*XgWBWtdw=5915xI^vfO(%%!Jtb!ad1mMn1K?Wt6zPwW2 z*48wmLFdVzqBE&j=1ZLn_N}8KYuND^eU2f_L{dgN%n^Hl}?f1;M@QR2GMb`mJ zK&R;eY>L@=@P%OTkh+|U#|u)_i`gRx+a=M{8iK_7NQxt%;{Q zqC**|6QR0iw!n%kt}N=rX?@!d-2*FjjFn8LgHkC=;ga-DObf;Sq71I#TS{T(;X@)M z%(hVqQ#V(gCdg^gJeOx;J3z{9F-Gr3Z5Qh8lGTfm7^TiyhG@q$5_hhoXoVLJyw{oH z#nO?YMwT?K_31$SnU)V)O+d)RgY1oBwr@5p&c2pCfP|n;vD?Q(yW?tB#!E&vSRy0> zeZLgdY0j{7PX$@WAvV}8R}-3UiH3v_lj#9rJ(yDt+1lKBT=d7rOJPigyr#d8AyrOc zd#ne_LDE;fAq;1_!C&fjod|BzM77y*&~-Sr{&G``)%?1{Pj}{a8zgD7!O7mn*fbsI zz4z@7Un&vfw#5*&C`!~c!rDIKICp+*8-Iwdfff&6oz<^x597ETI{9(t3LdH7<;ECVVJ1Pviy{7 z+qP}nwr$(CZQHhO+qSFUxtN%5qWhx%LPng-$g^Uvt=))4Lf%)Nm@R>U?tklX<`vi>Of{MLQf zSPjroMxCfnn++MVA&XjtKt_I-@*2XPP<#ZAn`gzv(_AW$-{1Wq}{gM%8Ct zlPQPFq1$(p6Ym?rTasIG5N@^@bA_GeshqV}bPZ0{4r8Mo2->H=T4k!QVhuPj`$ib4 z?ww??q`zyNV34GkNc7v=gH-feRu-|bpa_x5CdC}A#^t%}YNx}{>TRqXj)$ylfGOqE zK{UGUNuE5fyX^Q}NH43skW@n>&UGy0hEujn8HM*$F*JB!_@L$JT(20ViF7lr1n}I; z?jq*j?cM3N+zr4}D{k%g1@QbjY{r~++q62rRyy{5koXek5g4v*8~>sDNpscgNBr|X zafR|uHGDAaoh@fFF6yRR^ezxr3Wkq1YPswx^HRh4q#cW@JEbr<8B>^5viwx?FBkic zw=*?QN5gfPrOISq@9COix$w!5Igai__j-P=)`xWqt3cD~Ix2{&jg_j(xzz|;`8_!m zM&&~HnMfubbLZ=0qFi_L;xRs3O-T*A3hQYd$})(aVfjLwQmZDP`Z?-}d~X=YHa=Ks zGHDuV9KfWfUQ~~uYmsd(Ud|{w3Z$aF8LYwd$@r;Q_BV+mA;@~Bi+4ixKzCwWk%FE` z#H}ON{k5eo^4knsJgLyWQ^yEH>XL~bIanx92yqEp>WgT8@{D_0YOf#{N=aNbeD%9s 
z)0wp_+&5F-h!73hQqL#H53M$7b$Q!LlanVe(BO>>ydfkGnVC7AAVzDat{38JF3XG+ zrc9=p-v(Bj=9#zc1F&9se1;B2NfRdNv;yi(q%O`KoxBgTeIV#5@r?zP-G=|$Ln1SA zLN3KJwcLDU3XtRdI&wsvGXZf9{U9Gx zC*g<0=egLVXUaOkrdv@*tkK+dr1WlC>DrdJ;ZBC`i%V>*rrGk&I^wQxg30paK@;6f z-Da_<_Oaypyk+X!-BcAQQm^8HR8L#LS4!f*>+e$tjPdw=1$ug6g?rdd((V?BRa^DI ztm_XvQXHwet9P21?>V!0)(IU=GbQg+<2nWeg%IUG+nb=!`_Nv55w z_R180$!nzYogWLNuP2-J$rXu)9`sv&w`GPs)-ch;%fTke#3_I4^G|`?fu*9C@%AWP zF~`?1WUK)98241K+nRAgqmZ@aBf+`h3R)f$afqWW-8b?;)2Zw2Y;T)`Au<#}zW}2P zxN*FS$I0ku0i|R%DGT+_hW&>XeQN5j@J50!tg@R^$jEK>e6#2 ztmq|+Q=KLF=*_VDy1gwPDp z?_XDsaZc#jx+K!DUcY9uMt$$=9wRbXLa2bzr(SDdN0;;+BNz?t7 zOH+efod)x{CZhK5A&^hq7yQV{KbYu5h};jzs>cr;gh8 z@@?EOMX9D@NP_1NaD@9!Y?R5MomNJB)Og*dvX`$*;D?)6D~2wb@l18 zb!I&md{G%F^7XtHC&Ntthd%P&ZxM?`kI*yklRAWbVv$SzQP=HFBVuqzK2yjk zU%t2HeiZ4Lj}_<@)5>CaCJbq}wYg5P45hIqOOqOCoiy?rS&4Vzjs0 z1#_54l0<%U0|b+Sw-$~%P-eM_^j?=6h@7CO+D~IM$F6>K+O2uvXH>+by8BGq^=!Rj2+5xPJ3XVBL zLVL-_2mcO2^}a@!fDmV^ok{PzZp+yF)U#>>;va^E%`&%P&+6$GX1_Uq0+@P{A4-4~ ze-#VNL?cEwEeq-w;_TDPRFv8w6LD4e3*Lsq2XokSE57$rEl-U=F-u3K5;w+1AF)69 zpkXEFcEl!>?AX0%@Y+J1c#10v;C~fboSdFd<+X`!T-5v7fxC+`a2@uNxV7Ol2Yh?hr~j3e<3K8rv*ilsVC3?Cc1_Wy*P3?pXISW05(k zIph+2P*z6jwbEgN_hs7t_k`5_u_wq|x7r8@{Ro_eDG4_`HqqU`Z?i}_YT81A=KX7r zL=uwYpZnwp66Igasc{aPApRG}=bWjT>MHj>d*xG&r17O|>vS+$OSkyi&+jvcIsVE9a_Gyiio;9g8NmCnH zu3TNfu@kkuH8cV=Pd&98zC`vp zK7rF75d&^=m!!;zSw8A-puw?&dqqr+N%l#@_xlNl-^}Fm)roZ#Mj2fDl6Pl$YKr9oX*@1 z*V$IRVy-k(K@B8ETk@yP5mQLg4>wh7=ZBhQA#)G0PlgTA_u09S;DxLdgf)UEf_ORE zT2DIDShwITu1Q0fO&{jGR_Fw)(F&2P5*I5SV?<67=iYp8UftAeTS=>Q9Fk#*XG9eS z3>2ta3h(da@)Gkm9216>APmPQHSEO;(23YO&@%HtioMdAiVX8mCyWsvNO5e|Ex5!J z`$N*r4vkyT=Pvl-22|G*vtF)?={#E1l!ECN>_xbF==Ct?xTQG>>}0kc`Q0yj(c!{- zV}y&okmAxj)S@aRc~;U(=@v8Dui#GUCme9n0(7`WQ3>en{Fnk;;34DCC8529Dyp5u z8LI%9eoW2}#S6}d#KE3j_Z9*+P7|=*rZkUaGPStv%!ok~=IcM+j0oG-`~*6`*+z%V zUx|M047sNjdcV%bvxIB5w->YW*A&e~yx=S2N+Go;CXo=URD`<$|zLFtrqv;5< z=kn`;D_@MR3Mh%e1d!D6WcQ;If2T&m1o9?nw*8C{kk2DRZo00isI`u`K-``=xS|4)pM{(r{!7+6{UU*h-wi}5iq 
zG5p^u9b?=%+u@p0wp!C~hC3$h7p&m;BWq4k3Sgb2v@{EGKI_oD?nq#z(Yy}Y3Ox8s4p z;m9;>dHvatPoU|8@d;BfvyJC1fPk+9j?T&->%kHQ6HQ_r0|Ms5(37JB zi4_WQAHc%H0d#Wqiwj}ppTP$Ffv)~S_5XWs;s783EbuG&Li(XXK>p=I2LU76-WIUe zBj1MD2M-Yd@T}YfBF96<0|0S*2L&032Y2QkhXM%+xL)wzs|fW4!A>Js+t`pa5m z2L9|O)((ulV8jnp5DWi|7Qlyi3<{Xf%}o&S`*-nv7RfITz7GIe1n>$^MxMRDx3ORv zexzsh{Ncs#1E9*!{1Xh&ub1byqfvx$>hC|~{Ri{sGaSe%8F>n6+U+O$o2I0IZ~%ON zhL#F)FBJ(0C@3gs2;ktLz`xJ;bRLxN_xT>D4%1Eqg#V$|b?)=oWm3n7z5huBZ`;r3 zbU`>j9Rkq)EAt%2uYiy4gYe0>^y4=E7kARv_+2ObcNeS7V^F{6TiXZl7l(HY7V_~c zke_iCAy~!#FJKkuyRX1BzOT3n*umek^QTP>7*w!I0Pr^0l(}!-2XOzV?^?c?a{1q3 zKAcEE&Y$6oo@;=P18F(O}U2+CU@#h=BO> zw0tquCw8_Uhu>Dn9WY3O{PV6qx9`@k0`m36L4fMMoohbVr`M0xEqpi}h;%)39Dr0< zS2y%sd$m&j+vy(mn2yFxq(mcnII4&PHTRR->gbeemPfJqnY<9wr?OuIp`$0Sf9~16 zXJMyr=ABqCi}v+W6h(khkH$U~431gS&Vp|E)pFg+;XiqesjK zNTz+kI)d2OjN1++fz|QR7+XgWu$+i&Wb>!xh$SI~`Q4{Q<8fRm^nxPknPsn_p*?wI zE|p3#S;Jy#yNDyT(MAN_Upft@wHz+ugSF$L3S>von|t*}6CeR6UD95c_n>}Pg3-53 ze0V7d$!JF1EkzcgR}Jv!hxn;N4S01vTvjB6|JMA=uc#TSf~v1ULCi-j2kf0gdicy4 zMW$5O`?WEF!Id*c`tW?&S)}17J?Wc*Sd*I?+#(>d^EL7(rg|qeZB=g3+dD`vFP^_@ z223l~_=p1pPAHBT&P$b(HYWG8jb&7?C2i^5w{{*0G|+Eyeg z6T#Tw(-deVpG=@N#UvgdwSDwo!oAB?^+kEVb^*aAB5Kkq4^p%3mv?l-k|TWp!PCpK zuO6A%XKI}1(woz@5#$#FO9jE|J)*jbQ=xg*;+ViBI#QKDyz7_-RIk`wy3x}Hn;=0I z)WsJT!7b~SOsCi_A0-|ICGfZeI~ozO$f}idAP4bJ14NNbP;x}CWIR$pX{KPv*#Qvx zZ0rmO7ClFeo3e<)8-sVTd6gObGWH{|o_M#~ofzXw_8mb*aHt{{dslgV?mryfAb>`dQo@}2Fqo~s=f(<#X#tE?E-q`l^eENHckgOH;^`9u})b z80tIev@%~rik6sd7$y;BZlAlA)V@5zwXOJ3>YP+9I;Ky7R61|_p6bdg0;`p_(`W1d zRNcVXbV4Q1BZ$}9z$Ruunui)5TD{){l?&R5vTBZ%?p|W+sez*?g{?*R{PufQWDbj!^)F+D zN>Qv`x&`05sh;Y_lpjGRKC>V*XGM2G-HB)KU>lekIWm*Z$FrEP$Er#@9;ZViJ>h8O zCc9Tq1i))6;)*9xE=s~Su_B8`(QH`6*jt2v&b?qI3I;Wv+XpeiEBp@|b4=I62yH_v z;8A^L=$|}g>^v^p4{@1|1WK;I_OJRy<9t+BG7LV>ZoKzjR|6&9#q~sGixqc|nmIS~ zQ5$HFtNNr7%bx5x??kion{#c=g>|=GmlN`M-_?nEY=Fhde*O$(oZHo3X#NfP~wNNl>O`e~z%p1(0s<58kd~n(|J+sUn=I z1Q}wpHxl+J-C~~r(QBZpI!BZXu(DQ4?^sDg2Rj8&Hdf8H9G$#_)BlmZmJ6y|mNHeS 
zKgQ@btJT?H)E>;L2N@?K^n*$L`H%Gf9JTyGdTI({Fcv8pZbHWhP=%d6@$o8%3~9x%!N@tACqE z6#TMoovGC*vgO!ygmED$-_qEovD>S@*|;t}wXq`gB8P~RXYIZl(qiD=_he(UAnG2Mg&bCH@7#5RvKtG}!rur0 zZ4V(Z-Md!XE@N4@E~q-WB|I7#7V^ID6U&>KPvHRAE_io6<6<+a}HOFx7#6BF6YmA=A?~b)31uPDq3;A zosz89^k9+ZR-Q`Oy{_GB6brZsn1*9f^_!jiDoQxz^jEC4&Fp>t94FG`8-RnEk@98Q z^hrGctYMgJO9#7qO7hkMt;B9;8(Y-C^s(bmLD!6_9F4PeZ<6dWt-UKWPp4f8X&-r*LMo_!Q1THgu;w2f z$s>qS@hNqjw{?6(?^~jY7zpscMbrqxLeo3Cevj`ZRcb4klT)+6ONrs2h?ytNBgLcX zwVmoExd>b8wZN`v#wK2m^W0A5j>m{l3e`X6aHqszJ7?f1dnkrKP_%N)cac3fQd}ps z&w_LNPF&M1xBWdq8|00_PH%io2p~xn9~Wuj<->MZby%GE(4Q=H0!16_qa${gz*fV@ zMI0+4^t`JOT23<*77XocQ7oa}lA~VP@|C4lTHu~Tut9HPr){wNaPbSqj!uhS!Wl`KiQj12*G1Q_X_>aH zYw$I=kCIFfZaehz^1%Wg2Q4xh@(;b_3|3GRRHRl# z#i)T6FQ_u|{^Ee2?^3b_ExKwt&QO=+Hh^*(+I9VB(9D8aua!0vZ3*QiGN0Ky;74Z; zwBGD)1X{ghm&z1cskiKrm3d^RM>VCa^4sdiv2-mVY`1y{L>t?;EZrJxN`W!DEY){U z`_gJH^;%x66?yKxk+q2E)ScTT&lvI_`G`nn+Cp9<4!TR0*JX}iq>^3^>YQzOhbrBv z*OIAhAIa1b1L&yRcH2SNS&|#(X*Ad3_-A9{u9aLhywxpE&hV?X|pI z9Lk}krDF*ppQuc8{}sdm)b6LU+4@Bn2xFyvFD@oo^AXxhm3imUhJ|h|eaFq2%xLVk zmk9MnkIVI!htJrmn@OiEx6wTj?{+HAksNW~_e`(65CfGQ+@vPE!Gb5l;u;h>V@Z7$ z84S`%XlbJ`R~NNcxX?-jX2wMF?x`D74`iJS9yvHPA)c{MdRQZ(hb3~<^*^E$TQRF) z*M$P(R!c)8^Yxq?W|nf56ZLKAR36I!Bdvk3@sWF9PhGUedB>YHA`xE98g7MXn9e`k zUV)ia*v4ueEl|pqQZtXPA;R?mVF`LapqfVU0Z9t`u1i?(GWKfOB1qZC9#!KKf`z|Co1O` zew@SSVitFeA}swR-U+!;3^hG6dzsw49qec#ZnT6VRO5^&a!>h&gW#4 z(KHd+es$Ka*>{%AytA-iL1bUpBXDo9ya1 zl5|aQ);PZ}7a6qC^X^G*!zR}8qG153r&IYb(xd|ABI;GJT-xILLhy%rS5F{>6?I{) zX~-LBW?gw=$F!W1CM$h$ye)U{U6$9nyP{{RmSO>SgM+CYJ;t1u~ng?pp!}c_{lA(i|pd}t%H_y@VOKKuY z<&!oY#X%9sj5OvggGr{Hz;WU#oxvee=>J<8+N=#2<>0`~zduVwyu!QM5%Dq_Rgauu z@wA@jR;wh8r8frV1#*>d>|==_o-{Z0Ajz#?M?*KVZ(OQ+$^ZRAh z;t^Pf?M%Jv0J4fI15yBI{xvG?DaPGDnxGAPe449q>y_qd-Ep+z*++qNIKd@Pmx4r- z=`Ts?yCVMmq;LJ@UGsSI4Tpq_O>+@oI~I+Js>{qZwdlhJ7}D|iYqTenpkA#kqMr)) zQBViTLN!ZVSu4`^pW;Fz>j(w8BX*!m#H>!E2fQ*r<*IG<U!4tt4 zN+>NoKzN@F>Oen^@rK-bT>5@&rlN>>E$PxIS0dDKuUvU2@2&aDC{IeHg0KFQU9<1J z>;kPH96=W3sNuR2;0%daBHT-mlYq01VpM{N7soS%ge&A!jkk>E!kof-DzMflFRE5? 
z^5Y}Nn5rcJr|f(u)mMQU050H~emN1lRXOeyA}1xIa9}x(z`8FIV$j;Nsm-?Qi*DqB zZUM@jZpYxE!jc7S`U$2dRi6!t_N5IQm=zj|4qfF+q@#x3$E=NZDzHC(YUVw6N$BSfX3l~*{IhDP*dVs zk_F4TJ&`pHb-LEsM(3K)`-rRvOQPP?1@lVk?wpy}Vnl1qPK8Y{1A}>HAJgYsf5# zNlsG`)o!48S)Et5yomh5%qrVWNF7QdsV^Y)DNMdR9MTyiWM~*Ig;~onPBeW|^>`&# zdpbhl5giIO-_l(pxIra3kMIGt8zuz%&}xxS?Bwz@KKT_wt-Wx~VGKTd+O8!_3RV%M zfP>LC?i=f2J5z~V@o1I;iJ_0mTSMVe@1vl$TT@+Na8(MFA=e7B&cZ%~+Z{75lp>&L zlLbxGhzv}VmW}M+)93Rt;`&_$lX$7k9F^0va3XgB{4o_65Bf6ZPP9f`FRi<+G#;); zgm1>TWa+=u6h{~a%M$1sD-vpfyH4)5E_W3?(dgaA9{TqI=9*$3Rp-oY!IPv5pD!gf z3Zr1lDMwh?e+x#d_b@5S8xQ0DrWRb*iGYH2c-Lvuca||Vu>DW%t6~N4)9R`%Wu(ru z4^%4c2fYnr&A{?g=924JKU8z}SFmGBmVLZFh$cjI0!X`q61%|6WU*|J*W7nijN4fbNK@CA4IB*4bj zq+7q(J%wJq9B)$VzRv=(RFhWj-uv_@o-3r_0)Cic@!lynqtyD<*}W0BVjYgD3U_f! zj-I%X4g2n%x7Yvkvv9?JWUw%u2QEF?6ZxP2owjpx20G=zC+m25UlMJrW|r@> zJjxpV7Z^J+4sYacD+Jyoaa4Oy2kt-8#5_o=Y=}XOo$uRov*?cfJv8UG?9LtC&z*@W zqU{q&_)4IOgq{;yead0fP22~?Lw#C%ny)NROQ}B115%w$N1R0H9syJR9QYBbvv6}! zs6oyg?l%lw03oS+LOWd{4F_6kS7^$VUGCmj@Ja_5j2JkUAfRtLzwk2m@fT%=6&RdJ zBnKC$8wLgEA#4@Z#5-W7Hgki|wSBT=duIzzUkjuFwj3^iWhxr>+BM$l+S$e8+j}V+ zNMv$N`4g7hecn#pQ)T#&cnj(ZF*0b=DP5RRGnyogU<9C;E*`DAm~AWor6ftVqgf}$ z(HW-aybZQpH2~?AIkr4c<=hXB@8wrPmwWq;r%JlBQyp7YTDx60BJ{UkApwRgOR;Wd z6Vk8&H~2W(FM$OaM0iIsDT2A8c{x-XKPMR5Yg=Stuk_aWLd{)#z0RqBH_XvMRkLh& zW~Xk0Qbdl}np1XM6n9G^ROabp*OE*sjI8xqj-1XCQmu zz9)@{{J8;h73>*bt)?`Ev)g`=99-XAPOkywwq_(zEP+b#o}^__TCnPO%zd&P{Y811 zN?z8eH(xVc)4_3$zh>3zNE7T`#PyI9!_%`@rCg+W?+@{z5@)R2iiz$lO!j+eb+odC zh@M#vUhAJa$01{=Gc(wEC#w*~5?LDQi`bVi7ZR#`)^;9NhWJ0XuBBy^KmWC31j1EI zK%nyV;m#R0rcLLFAvNs#8o$`ah8(3-kXMBK)67k(H5w>Hiif)+n2?XJ4(7 zf)faY<8M~I?Rv!_3MZ2jFq(o5rDHG&IV%(jN60Rp6D&mRc7@{$hvW8yo(uF!ymFZ3 zS^1rD*Dy_1w1HzHnz(hmHnT!WYv#vmyqdAfq5A07L7~w$`Hi5DUu5!$5=q zu$K{2kd<*_LJkhbgTuh1QvOR9An*r6h7OMaENpCSEY9}lL_7%y-Qdy(#8o0<()INX z;N$621kj4}6SXH}p!Wjl5BYKk8cZNJBL|8Kdar}S6e<8%1%zB5*q7)3sVxYJ4M6QL zGSP=L7eJ6_28TuO+#EE}=Z6CeC8U?v?T3v3+xJ5UK#%y(z}ymwFlbO=2mtRdATU6` z%}~yT9{Gua2p})t%mf`Y@B>T_tse(QP~VQBp8x<27}_Q1i_rgZ6W<3>FfXnh%0wFq 
z%p`1Bk4}#nItbKW8H(BVw;$mGVBohh+HY?FU=cPIj6YD@r}>F9G_YT&4i6mwSTFB( zU8mV94*)cjzmE^6ZG(*t3G*d0gaRzI{x`u7z%O(oU=EHQIk2y9zus5+WR}<8=K?2K zu#ej(N91Sr69o(%6)<=cEG{x)&oZE{ZR3R=sQkWGc#eM^?_WNS=NnFSna99J{`uzb z?t24G7YhO&DBRBj0KOcnybcgCk^4JqI7-ux2M6u%Wf1R=dw&)4;n#t0hQ58oXZ**q0R8!I=nteY2=rhyKeB#I zf`E|tF$^iR1I8ZXY!9$5ch;F0O`mR+Pj|AIX_P~_6%8#*?o1KA=3Y3tM%5Pt9X-Py-?VXi}i);O|XD zBWiHK0h-l5?7l&LHD=7Rv%9WXtM|OuJCrcNodW&E2s!b3?|37}ODJm$8`C%c(V$mV zg}~dg@<7R??Q1fUan!!!d5(>>xLL_ONEMh#on?>cnxa`Wmyc9WExW-EEyRK?jcDHM zZkvs@vJTARk2IiQJXCG31_RWC7N$xzDRT-EM?VYS{oIA2@1`l~Aeck*#2lhsvTNH7 zIa_<|!f~Gg36wbtOp2=e@=3>4w)R8SIM@!ZFVD^{P1lG`RE&3~T)l(je+!?a)lH*D zJh%mKR0C+jWwwj8$9f-`LJ7Oeut-7o#fmMtBNA`E;2s60LendFNtV zC>2%ib)A`x(O>hRYpSX3DzuANP zoOu~cEmN)Ky`VUl+j0*5GWoo;39uaBfcVkE5tY8_rP&EWCU>GkHA^c)(zlZ;NT%_( zYmW|`i7DkXT^QOivA=e_+`g}-J3K{D<{BQ}{vBm7uF3Vp3Wja*o(JZ)Y{kSytM7(e zz@_=~yNC}8rJ$xOirt#i!Ol?_ zY-{1I%N}e{mT9nIg<22S&(trKJ>Or|V_R)p1u1|1>`Jrya3rZ3nnFOBVAv%;1>8fX zpN4tE4Lka)Lu6*F(t(U8$1uWWwDai-GusCyY)*Ah`?DNWNl76it;2ozK4d-PWEqOk zg2^xT?Fw4nB<0)=JK(7jRy-VF&3&#K4aB60UM3YYepHGa!IX0Za=va39K4%J zpre9qY^&R}TGBE}?^Y7o&@!F6%9x!Y?FQisag|u90p;j?{p89Z3)`ICE_ID$9r~Ts zK`%2m;1&2>Lm{AgU9SZH*e@mtj|*o`tL<`r5`HrDufJ`U%kK{t{%Mc%ezu$%RJYk3 z=7#dYU68b>yn^xqGjuKM?&5&$#VSR#ydm*#EC`M{U6yyKAC$%&&Frw{s-JFX(oYU< z-=B9uo8A*rZN^Dq-GD<+Q(%pFZdhHg88=mvvuEW&>8c;DC77R~z!6WM+ta(~ZPpn; z4E-r!40hbS(OiJ9N_IiX)TVfAl+EibQ_x5{OpRghT9)7Bcb$Fpups@6wzeDo4c?8^ z53}6L)-+>vp5gRX7Us?2V(j3?IhapyE?Yu3UjU$@<6I6b)h3jMgSh&nLEc-DCz@i$w4m z;_UGlT?$=ApJ{dm-}6V5c2!v#=>}U) z3h5m$vw{BvvKiLU3F|2$)QiX9=te*Y>=Bp|otshmdpD>UleUz1&12&I9M234U#+CT z6^P(e<7i|!bfn_zQ;{sT?oNL`ei%cPmpdW!oYAx zNjMn9LK;Ob)T5ucT#iEl>42p%Li#b;Ce1G2M@5V`HR8@3b$Q36HsJs&C79N7%AGAQ zVlbK)xBGs?%N-vr#9)_n&Hq?1>Nkcro-6A7KE9GZcx>C7Vp4o7Eq%ZD+Y^+Atl532 z05A!}`jrXkJbYB1;qck$EaiE$5Q^HZzflobA|fSjvN8a@Lsvt@X=!MsN7U)%;UXI)Qet+T zw;ss?@~LGhzQI)XZDf4nDRIwCQ8B&MAPy62%1XWKX?J%qf;-zU3=@oT>KJ{gx>Nev zZgeI;qO7yH)_CNF*heLkvX!sO)1ME(27`RsrTvnK`coD#n+%t-tUmpB2>#!0*A>x* 
z#0}ZZc4omX$%~IDJ_|7G`!>mJ30r1Q2Me~zxabRPFP$?g$*m^lJV(f=Y^q0^5u_rI!7~!OL-J+bsCxa3ML`fuTBO9mB0X_b4rjA#&IsS`e?@=?G1!HlHLo8poSs0$idu_Uv(H&M}YAA5JAm> zZ2m^o`T!T(B_n4iT!w@!nbpT3*ociZe^;roh&({dNcF5<$~xJt0u6(;JKLF6PX-8+ zoS7sWtZD)eG1*Fe*?Fv)Ka(uEx^cQKyjw}mR?&ajF3DPo;5e;W3xw;S8Uv&F?rP8Y zB*mc!^EJISRZn3YN3-@pB#eU`mePgBj~TGBK{DXY)Cbs*ZtF;Ua3iHQyA z@^+D$WK!b&<2-)!lr)2crq4*yM!Yo9(gr62up%T(fN(i&>{h)$F*t&3glt_(bBX^u zbA%CLOC8B&zI!gmImDEyA&BLU#bF-oaMN zGeh2Npj8S`HHFK2{RAX0?C$tXg&Zy;)??eVnXBpj69cE|T#rmhgD)S$y-=N%IPE^c zjk@|u;f9RIoQvNr??tgdt;&%`Fvn$jpFg?g?yF3mpyb?{dx;{piimDd@-Ye>Nswo4 zDPU4|4Hd~=U5mj*qJX)KmhG=d{7NE*5Hg7f4~=4hBCjdxBA33 z4Qv8j`G(X`e+#cY6$ zVUT)$KA;yOM^A;hm{5Xt`zhGTSXMrXT|SZS6$weiDwyLO`9^@d2#X6@azay3fq5a5 zz}~5)E_ZbAA*fINh6{8+NC&AfQ!R#U09SLDu3f~+YEnvraCIvgS{oDhnHRd;`X5oB z{f3f%uaAhstWGH`PW`GpT=_`}qgTpN_R|!F zBV@b-S*xvA`nruB%-8c=~3* z0xyCr}_I6Z~;y7MNG)-Ddk+ZSUks9X=g$g zBd!%Cnug>QG0IN!O%M*{oX4!z?lV&xxSfYbehLq$Zl@oLcr-rMtnctR`(DtuYMmEh zon1kL&nC`KQi7}EL#&dY`q1+}4{OA>VJ!~Z=+>bKc@3~g*3|2T4m0xu=#nbq)E%KU z^ST;qpv?B2O(TrnQw(+X7J>$6SdplY@PKa3-OEj!vQ(v{l-)}%#B(9Z%BcFC3bs)b znldh`hI=$X|85W6E<_E6!4|6F6${=duIEQNuLB)wq> z5cO29*B}bu!By8PG)8OISVW=?Zce4Sa{Z)Scx5N!z(HCY7|1>V$B_ z%qok@LyT*;bTi6Y_!#k#E;?RN{%j`*ASGXwYll0pf? 
zd9CEu?v#n`hl^3B(K6z3x-SAMQ}G~=J>A_IEEcD#qswrIi@*ZR*%fDe#iOyUZRruZ9-{=nZ_@HU4N&Aofn;s6Bn^1w&k?$U| z(g8|whDQ~gp>OV-lT(kHg`xCnMOUqxf$SgN!B*w#AGOD3V#InIio4dMc>aKvff#AW zPAg|Co0INe2i!%Is7Xt)%h@^~jpGlyc}qqdRARH9y&Ti?R6VX$ei%zD3hhrs1%+We z&(gp?Y@GdK05%WSe`m1@irZX1REtR@)`pb0h{J;>#MB2DR*PdGULGzNL5(R?bCXiv zG=!H^$CWhhuGtwzLT-~OtYn))3=KQjeC_QuNG=v?=TP1Y~Q8!z0| zdcA{YSzVR=`Vm|U$BY$RrRw+~k6B-3D{t1rK0=EuHQ-Q(3xZl1P zs>FEm&`7jvZk1k%O>=9>bv*3L?TVBC8=JTb^?XP`E}K-_op>3A{L+6D&2cRUZrX4L z`CUSEsYV*a)WLvJUnO%)O)N`RJ@89Br#Iqt&YUe}iVV1NbaT*2pSqfK+oh2h zuxARCnOwQ24sfZE6g9#GD(0_h4oasRpYuHm4NKE73|GoBUjhYD0f!Z4?~hpj044jI z1l?=60I?T-c`c-M&YKXS67(a#z`d5+ne%Qd^zL3%JA*jFFo6Dcw1!vN?7V-$rknSg zenu-YU0+DG;1AZiDtA3B<2t&@<&q~_XzswP?t%+A0(pWOK6xSWf;D(!WPKlObV(IV z?5#cyOwb5Fa_7?Nv}GT?Gftx!_sTiCQnmBu8o1{4<-k-UZDJ5`(fwbUc; zQSn_R0MX4rSThyb=kKvUvgvMSA~2d-(ss4|6#jAJdI%L-ThwaJte8O_ zdg2Ytm!zno|8|!Kx1WNUx~V_&Q1<9j!`Nbucqe~79Jn!!vHbUIqI~zFSMkPkHE9;F zX!oOed+vnVaq6PJz$_`*KrbYpV~nz2P`oM*R&yD);ZsoIAYUo)sBL6*Y7MGbO2s5m zAtDP%eMCne4ytU}1Z9d7o%aa(P9BEC)P`gZOyR35nr8WT;W^BlSGlGv#8>b8C|hGj z=0{1f?lOso%uFw>U#+G%C|xtLSto8=SkEoo;%%0C2)<_;`87ZZRLv~I{>1W#E7V;F ztBWA@mPW8M*znPFyd);*v*=X$Y}XB*!U|q8eZB9xSop1ssUNIl*imK~~xqi)Z zbh>QO@%$*(5U0MWaPZt^Hr_)O$cmDH_R@)|I$UG_rE&O7l+b9m&wM~ftG1+PYjypP zrI)hme=&AWv7$unmfbeawr$(CZQHhO+qP}nwr%rl{W}O;>-ZYq$PS*hz!*S- z^^`$W#@se$zDZN^Dk=F)yvtR!Rdswrc?y#qFEpEI#3VA7Ii?P4SK{vVSh7%-hm+~jM@ry{5oe5qiRIANcDTF5xs^VA<7Q~gprzY7MCyA~U zZ{;1SGDt-FV2ACswK+QJkN;K?G5EM$PhPOe@)7cQ8j1{4AD-m#3Tm^Rukd1MTnah* zw43x2_*hBTrRdrdtJ!U@z~r4_h^VA9ZkZO29?Iu#FC(LgX?xeh{x!JrcUpk3wjDZI zHf&Xr2_Lr1v*Rb;z9PO2Z2Mp-`O4W}FFQqm_Y(6B9eX8M2*6uUy@33DyN!H_6`l{S z8}DOP({?FnFCM`+-(+EMb$|^(`q_)F77i9Kwol zbc)U_Fk?o1_WJa*94VIm58(yd|2;YHzrzcL|2@cz5uc5Pp5Z_7Q0z?qCh`7{=l|_I ztZ{KsU%qK=b%sI=eEtVOB`)sJPz+5200RRv3BgbxDkdf(N|rf8EkZ07CXWLEL5aJ0 z&c9yw%>DKK{5;G49@AdC({QNss`28Pvv0)Agdh+2#|pW$b4l_G%d3Os!@D@$3tkTF zxPV3z$m;rI6@f62l|cak2@4$DR`MeYgXseT0xJ{X`ab(0`f6sgXHWgTvmEXe%N3cTQHBmvkZGiS88Dey}$B&&w 
zygk_I7&Oq}P9i|^aJox38gK{z7!l8Y|MRR(Hho-(b39C4@$#;vKJUaTvi@fr~z+t}0uztuc^RV;Di9t@u#i|!DE({JO1g837~4hw$RFUv>(%>Bxm4E$z~{{lw* z5&`{+9e?*le7eQ_jFJ5G?Vj}sRe25QfAF_ufxcY@0B(2F+xye*zhVIc6aM7c=VKv< z{;)y%!8iMVi9G%W6@>up!U#?5T!{WL5IE?ImM`X9Mgw9EF65sD5W~MVRrI?yz14>R z0&@x5CjT%?2LMDcIOuzq&MClGd)+Hadz%7v0RPSTiV^oKhpxclQ&dqo`25WJAe+1w z^-8ALJPKX+Ybyi?|9!g064Jo9g}z5??+5@YG=v98WCJY`76$s2Y2l+M{N=m?RDdX_ z>;EVPQxg-+}!>#P`SF5MacfKk2~aC3hXAc+s7HWNQhcIc}Y__2>e^b zxJGk6hlQ&~wfU+B&S{~)jZ+bSvlbn2-L2z+tk<#2yRYR#X{_eWWwb=N>UWkz3$`Yy zxbE#@u|B?DdybfA@mzx*Xnz*`3fbkhi)0yV1mB1+uW9qO)5`U?jlAlKjkP1uFB{wG z6|VR0?rMGLO4DF}DuV1V8FUe>#Jawhzu*QT%q8Ed+vyIRSE^C8z>U%#giVim3=A$` z-Ie+?8S~2_<%1}XcihUkV=RNg06BXpijrlb1ObQI`#ab~vM0c;xl7mG4&>hbEWA87 zTq)44p#o08Cd#4zXpYd@F)=M$FUIR!tML$v=x3%!pu~w%(~%=Nqerd5%%v_x28sA- zsp>BWHCek8Tvg=!m;lccq{4teMfu`S0PSHE`aPE_nQ*1}al5g4Q7ItOd_q$Hr$o6T z*+H^HuH0*Q9{C^u;vf8j_u~LqNOXXXuE6uW(zCXai=xzSmLC-oBAyWzp1*2aTc!K8 zqGe;#lLe5=oZX26cpR*bxLB|}$aA^vWz%eEM}ZopJ|l4MVd84(oMvZok(wtgN2J07 z&OHoZU3XSNTe^McY|fGcH{I2Qpi@2dhaff<(ZNzSK8ly?vbk!FYNpvc8d!)q1#w22 zmnwVBSG1V)S_(2q8;O29O&g?cfD9Mh5x)pc2R|X5`Q3mG4GHJdf0C~p^X^j-)W2NS zIs?_5-e1RZwlUKL|J5L)%t$KQ-FV@-KvBRG>F?Q>*s51loG$~z^b!e^+!_P^)rI?a^wy(Ami>$7ADcj~HH_;sA` zJP(O-8qqf^Sdw~UYnaDx!yv*+L3`m|boH<5x4yEgTXBYj zG7ALm?+maQtB0syyd&sTi|rvNe@9z5ktZU+F(H*1&Caid31&EB(<*$Kb-HB8$GFO&At_TPM<<=>43W|1*osZ3v#0y=aR5(zR4iv$tME+Z^AOCk0$btCtS*m zmG|q*c*j+LXk;d2purD56$Y1KIPY+wWaGMOd72M+q>IJpxvy~$ zNDi!t;1CdkNZ{{p&LZZi&%JFlySr>m>nyG`i@sBSf%4CY6gcb;J4;#c*WTt~DvqAz zgT*bCA0+m3%pqleUM=rYXQJ;X#cj801UMd%e=FR{bD^^IZZ)zN94sFcJ*LVy39-LV zok!0pdM{Bl{;9>V4IUJojm6BfM&tt<)}(JCABGMMcEPiQNYQdZbrGc(n*J>qIIjp8YjV|m01ewTo*w%( zZO#}!uTJhgZw?t~NWh=O^p|>%@;sP*Y?)KmgXFt2$+p>>p~GnPE*~&s@X7i-Q#Ab7 zkkY3Xsr(JGQLx{gFs)4nPWw)CR2Cd$N{u7Lf68U}QWS+{eSDZH6G$f0&`9q%2t<(| zzv?W_D^;m0Ofq>W$IT;DcK1nry>Cl&^93l!$NkIWEWPki;PxhXx*=++SYup2^_ScM z0Y&a#G)J~KNKf7G{<-0X-OcjvO<^zo3;m8x$SVwE)+(q5V z@%c#^KRdc{Oth!r>K-~jN_J~#kd8>z-3M#iys$oVirqQc~ww%qD&oX{KF}N 
zLnWamw3@~7%gMh`GZ(52Z0EGmip6eP@{wH{ZYj(s!Ia&>Kzt+`#Uc&c%X@QwUSV_!Y4I!Cej8n8h#PKKF%?FTp>*Fd>6WAO-nX64AwmE$4JTxFuU~_9ZwFC zJ&60t*3&wbKZU1eFy8CeduN8mubZP_5SQ1io+>qCw~&Lo*ZbI?FOM*GAYjheCg$+M z_SoAN=J+K1V+N*x>QhryXIsUZ9b#LT*?T5cA2MOO#iY`m1+2Pi+~0gtCGzeaj$S7g zyX)+ko;e2{NrzrAbJaZSoU}wZNe&gdT}RaGR1OTaz8RHbomJtf2I=qs#V=h7RhVJd zo1X;(CovH~j0;M72z~}~Yk|t_y?0*%Pj^4fK*3U zbUWryZNLT0NF}XiAtxJCwbcrOjdK}$SwbQfhEHaRIj=h?NZrHp(zCy^7q{=*VJ@yq zg`0(xgmf=#!+A2Ee3jE4I#_WZwcF+2MZeyPx;yb^q6}bT`QWVOYsO451D-`7NeBry zCAboqI$mwo4yGNcbouCm>~oqm{ju7Zxso#SWdlQbqeX;oGm+Kigd-w^TKg9B6ticn zF;4}XC4FZU*NmR7+}x2Q!L?2*Gje^@>`joC$JR*Jnq4=NY{j9M-e8N&7s!}!lS9nq zZ815;4Pf@*bX`twaA3wi&l_;b3>_&DRGq|a;h9oGKT$5<#?zvxIP3*As zArr)gnXX$!s(u^0j;>Rk3zJ;io{ulF!>FMQ#i7QtBEQFku(*y@v5Kca?hj^&P9KHB;odDK!2MZ=}+NNQAW$R zB84|`g#j(I3`!fj4z^iWr}dbSD$tQ)P>V`^iD&z+hov*8!=|yC%<$n(2iS2chsTgX zM~dAu53@{u*}iEj2a*QgZLF4_yxM{PdswRM^>I?2(NX)t^Y)WlnC2-+^8I=2nmaXU zN)K*{yTl{Y8di=CSX!%-1Jyxta>oGH)B7_ZJulri67t<%0asEi#7xX~p~m44`nXzm z<0NfB6AwL1^$V&Z?;!@MXM|j#e(6j8hp^(xU%G6lw^s{1X`;i6-P6-y2HqsqZm zXD>wr<+!{wnD)l^id?&}<9Vx=;n5ys0??74Kn%@yNWXYB&8C9`A(P6y)O5Z2s!;4W zk?0sh0DJLStF1f?)F6j&Js_pUS(#;hRo@$NKV^A^Y~$#IVyy&Co0zL! zr-ja64#{Z)xl1tTlgq7!ovlYDZIWbmg>&5Dz5G_IO&opjc0qgdP-&k;s`B^oOpUeT zc4Nd;^>-iXcCeICG#~Y43Oy^j%dNTIvo5O32hPjCb| z;1q{0;~Jj}H=OKQk?N=A!L*Ra+AVmo@38^)#imDE%Foa4PsQwds2GL1)Wa9)e?3Pi9k8Qe+l zZ46UjT-*RpcD{ zwkM6}F?sUpZjHZao}u5WG1e|*E!Gdosm+d-PUfb_SsgELv(5a+iR|-Iv_E&91*k>g zgFKEVBX5|S){uuU8fDr>=47tb3mO07kTM@ob2haok19=kINy|H++r!L>JlG~+r@{! 
z`cUKC{Sce)rh9{`9?V!oJJVs9D3mTM!^v%7`9|j@nR8-dwJyHX)L{^roG}a>d1+Cb zUp84z*B^HEK2ykr^0{2z__o#B5*tkkjVsmN`!)tvFR)}ZrbS_*jlJ4utA~ewC)w(Spal@;$bY8d z)K7G!p?B~_K3o1A1znFmjivDqQvBemr*I-H5R`c-u4K<*<`mdf!&A`)H~M|JQbS0@ zw6r%gf1W$!VD^{3fO-q>j7U9wr6V^~25?13Ld>TX@KFxuQ^p&p8zpcEuYQ6m08H$u zE49goy%L^Nu&>JStNMwp6atd?>RM&9F!AwqN1VDEpMAHJ9iUbxcf9YQJ%0!Pv)B+i@Z2W-z-I z(I6mTXp_2A8EK%}*n|M>C!Xm8no1H*ueR`MYLEh)@s|@V8vHon;Ou&>ABnf6+H^GR z&%MD^?|uh1t=`8`>GkB-m#lPpLYODir|0QB1+P$JN-1(*@t3X2ZFXe!cTt>>>_^qc zP0`QizfY7N-1W9YDtjryN*_;m#$p@i7q+9J*!vi}p(b>z^<9$im$S6kLv0?hvgdXS zuqr|>%C}&Odg)w8P!ZoAaMXevRQN0H(0gF9ia#ncugOjU9B+QZ5H$?~{VJKw$zj*>ojc(0YL|NQ|tT=GU0=XxsPjFIWvDQMTAs zi@%(gGP~U^APOBS6AuSc6;NQh0$vcbXiq!A-qcF}wv8mKNH~6GG$KM;0^Jt?a_!9D z+qDyoV{W?14}^|NzB&W)mS*!8gjK(8cS7m0dRJX!n)~{yrpWB!ZG(5Q^6V*sNEiZ@ zXK5TjigI`j#`U%|3$zkkj4~_RON7H5m6Plz8Qs7n(fx?dCN&Q*+Vh0)F^aI5-d~wS z(t4t`z~n@W=l$I{yIS(wDUlYga*}vEFL$S=dSHO=v&@-JobU@uSn4mJNWxw;nKl>c zu0F(xZ3y{Z?RW+m_XEwnK>)X`0qWz;&&2@h)1ZKpGla@QO)z%~`^!S(fT5LVdBexO zPwYEpzwt>w)jSLV2L#Va){fBpG=C;x@+HZ~A3TZny^;#jYq!4L%_nPe?v2Cpq_eaN zNejbdUTh$WHIyLo(nR0UjI%W|*ebd)XtMRA>CvPvdRwa?6)w0F!+7upT!+$>wwsYE zj5_317BQ$MZkUAz61Bk&x~ZyKIOqubJ3yDaw2(I9+!&b|BL5G8k^HfThdHLJs_-qaG@DF@oHX33nQfY0NdIM8<7+4k6|IwOpPFaEe%hzlOYj z28GV(7wim;xVDm?tTNBFUq?F`dNU|Fn~sYmh|L6t+{zAkd7_8dZuQi+R>{x^e;DRQ z#jK<$uu|M2Oz;k{p>m(Cid=@Rm)e$wR#9P5a}zK9RwpvYBf%hpJbQyB5q(ge;PP1M zU8s#SG&KXZ1k$--6XsGto?ItO_CHyK-GoY&K|wdB*qF7z_VO&(QX`ZrocchFr;l}- zEHiS_hg=iB9=cr-&`L<9 zWiZXpZP`we)Zj==dgTwxcB|pmuDvmS4zs-^;E^s`3_2;NbBfj;U3gx$j<28p*>-tI z_Kmo~a(xAxLc?ZLI#+&=SD`(R{{3a{GErBpba+IBiG2+fsbXmD6m~^(kjzn}W^N|Z zWqxy#S-ao1z@@;>+>5Rm!#Cd^mhl~?jCzjH2tE;Ul;Af7BpP%T=|=cW_|B->I|jT5 zgbq$=0B)q*0czQT#Z^Tf_BE#&In>`7tv#nQ{F=kkSA4^jrmsgI%lZaUw+fI$0+A&ne-fKWi- zRo>?mBx0Rt;tYkNoXw2dpd<&)IK1cgzUTsYNh9t0?x@%ri=U+wO{k6NPCqfHA@57* z3TWu&ufcc6F3=@>H$Bb(Y}3s-6^59fm(*&(5bPHGyn1Y^{3We)JB`heyjxJ@H^Xj< zrV*Uo>HEW3QS{`*86{V|C4DTL{Wp+FU=eNihW9=Sw7PSa5jg zL6p?ekTdNrBZ}SzsiI3Ur&=jvX@t&Z_Ro}VEay}uq8|*Z>J%wYstS0i$-5>WG`rZ7 
zC#?{~Req<7vZ-F3F|V`k=ygxS=_Fcm09VBYJcp%sdb{Kn8hpRgS$*3X(56%Kr6mp~ zRi+Gk34TLm7f7WZqQS_X^l|*z@@hdgUq)&`7&Sw;-If+M{c`N5IiCC_HC2*V6myTVFY%EYVpw)p zS(h^_by~gyBmv}x*6hll{~#vPD9Tx)i5FfahU|^DM1gW2%60bo8*pGgPYbchk=BCN z1rC%AuW-t8OYj}%1?(OxIw1eNW<-A}qDL%7POG`2-mF$5v_xo3^aQ+n^Vp@pma)^1 zutRemm^oj5pFW}6iIhHbrnhMh`!Y*B2VjOAL-(Nzx}EPOH`nplm75Aicv*|%xM1J)6yy8`YJvPE!g8=bVj(%h9X_fsxCcPEsV zwCy#A54t#m6uHgx|BOy;3aT88bswqZ1qIhkMdeEN@Y5K*QV$}}PQc(8)uP|+=Ht%t zxUdNiL@gfN3m5&#@;WKYFzXX*^fxSZiB??@fKXMdd|&rbItCbJH`Y5WP+VIGj#oJp zZIqhN6S?;z%ucd#As@tIdAt5n;zQ-_UMJO3waA(ZE?57i3`Ub54xru!iKoUJB@Ri& z_fAF+YAEDmXplU&47;=(Jw}2kGyB^SE?Rj=OKtzULWx8(9;yF8`bbA2K^xUP`jc1q zpt3OL1}S(DU$_KhAhlAnopi&e+-U%P;*5=}7qD#5_2>tISPz!Bk7?5_TM90QTuO-O zdXaYR`%_7#>83bWBIW4MEukLkQf#utx7AJzlZk>>Yc^b60)~e9gdyX!l->guWx<2) zXY!pOW+Wh2fTOKt`|AV09I6X6O&MH;@wtL2f~!Rgb*+ZlP>U~ynSwrPi8Z^$ySGH~ zvF~XQ7yZQ~FUe)f<)<`%%KMNcDL)*H_=mMW8k6C7IgQ2CqcIH$I1tHk;v`<(s9CFl z)b%9AK=SOq?t6?Ng<+~{fkqv)>)+a2MqAZay_gheVw1*ND+*Jd<2z)(-jKF%8!H@IrkEgYWq}sw~`rx>i^|>?%|r~ z6&xj<7>UsW5_>&bqN<9#

6q$1>H#E&p)fM}*AWvgY-nV9+A(YLy~W*=fp|f=NIS z+0OxTY~5@xZldya#MqJh@r|cLx6p%wxAj1Erj#;ExW$tSNDGgJc&p_1HB9DLZc3!| zG8|qgH0X$IIDTdNofMeONk-4esYDGDky^qI$a#P)eAC0jsM-HdHHUxHCBB`ZB@{RJ z|EefGvpR~W?*p+P5=yz z4D~nhWb5mhZS588n-pvh6m%3oR90fYlWhCs^9qZ~Ez!K)nE6o-c7Z^WMk8zL@8Ee1GUw@rj8XWNpK0XiO>Ybb$ z!Pxjbh5c3}foA^LM3e`u=xM5_tN+k6JESx|fT)+R1I*M&P|Mu>yj(smiA<#V3{g@=>q( zVN`zM0Og0KE$LbR79op>Mg6+GkwvGc7m-c4iK^1#B8s}gt@gRy(c$aTqVD|)P5OZv ze%Cb>Qp}HxCJF!A?NC?$R{r>L|GF@K#nC6ioyMa6M#pf@FAp5v6@SwW@L4*sp{4bK z{9Zl&)_nNo{<1s)_>=Wx+nP))Wb>`F(lFtuN9AyF6BO+2O+oQuA1W{AD=7bwzga)Z zL+zc>GI-QQ8eC={dEGc|ald?Spc8YWUyL#l+erdypC-E}DZ8QK-^suH$NRhuSStsn z`SA+ImAC`q6+NFvj%xBT|25s^s_K|0D~df-Bd5 z)BEasBq0-1y9q11V3r-VXdV$SoCOF-F*~N(1!NxBTn^!`_fG=uG!#&N%B3DTSm~H$)6%H<>;HFJBqy&0`|T0W#CmZvAZ@LIiBxD=-r@ zH~B~H*<2wE-z(9b^7c-UQPrAGC2C>v>U(zuY80A_xAQ(7lTK#R4!?j_LhreYCUsE&3e_m1-q$Hxub1Q2;= z$kRRN`CG}2-Ly7by8b-Fh-!%(c?JcC?hH3XTyTHk)= zJv<}ZMLu7^Fb#`rB_cfVlZHNCvMnz#54rfU7+^m67%`~ruujk-pNVR_CFLWhycs)d zJ72_zLr42mDCdgq%*#WPOnu+w^v{fp9Bx~mg`$;PoU`!h=5;ZZ4Q5S0N^KtqiNcjk z+tcn9IXkrKl6j=4H6O9qpZ6bc0?_G}3xq@Fuz_h{XHz@|X@G}Ej-$mt<7ckpjmI`? 
zOG%(_6jz#>vXM$J`B_Mpn^Axo>tX)8N@hk#lz&XV#VQ* zcqVO8*=gAuW+`ot+#iXo94hVAZk`tv4t7~u4aFp~jmpibEO#$$Tlpa7DjW_T%%bY( zT?RY{XppJT@iXSlV^##nBE6Vaj$B-5WdkE+Si|GpznmDIwTz`jAG>4URYM-R>(e&z zQU;g+@ysTWq@Dl)XMAtYaWz9#4$#VKsfd3Tzi&k2NgRW1Qy@j(sjXFM3gLv7Ja`7+ zbryz8?H3~Kb#68%JeT_mWGht=Oo~n5b+Q@M(Xs3*z<^x{lo!MliN2mypK`aav&n!l zql`7=sxxywb4=eR^-Cp_-j_+#fUk`PD`TsP#E;??(?WBY9XYMNGeliQDT>7V@=b)6 zm*m}<=76Bkq_$NqTEX=%Xb8ghUrVy(W~AMO=up}k5*N6uo}l6&>sH+odYgGKn!A(E)~EGq$w!;eV1wWWCvBTSKT7`;`?M$H%F;*};Q zB!1v3oC@~32DS%SM0^{K@Se~uzXaEBX$HuQ^0LQ7<~D||i`B^Q6ugi4Gm3?~BHRN=r?gZeq|&%BPG zH(K{bul~RuSVh?pQSzi|ORS%DIS%D}0@S_qfV7Vc#FaSzKHPSsDGrc#E{$((ZG{qb z=p?^d7+AqHsZp`_f%$_b!)q2l9arBwsvR+H3|B7vMN=v-N~L~@|0=n3;fS}teF9VS z0Ta+4ck)ZYBoSzY(tMQ0trE(Pv_PAy9??_!iS9Nb!N@%_&kSEwa{kk%`iFOl+(Zoj zFwdKef;8<{C%4fqjfJGRstx~lVu}`0X|G@lt+I%50KFSR>d+Y7u%&+r-kAn{s@AGp zq7q*;G@?@;XK&R=%SSai%e}lBOY}7zb4KcsNn8p~xf1Mpw&i%JKP+r^c0J~Fo#sF>< z-K7TxCe4=d!#wgZGT9P%c|=iSSUpBrSG$|McxILuCKWvm;O^3Mb~vC9{<>5ODz2Cz z&zDQr+$C3msCjS*$573?j#v!HR|YemSgF>Kmi?#&bD)D)0!l6&%L6HQzPZ_PtE!8^ zA#X!R=xa!~J|#xW76^g-QKjU+K%F3-aEU`wJ}OfEnB+aO=MtTPuSQ1+?y#$#sKvA= z|5OWF$8#A*0Dsidj%R2B$m~Ezd1j=Hpo(@TXUq-(C9NDvp_E0ef@J8Spdx}ti+Na z{&Nz}`7R{dt|A(Q6x>6{K=NQFWm`VQjAB-<`AzzZfiagZkk?8AWaQ&$Ld$Fy za7PO_BEkg&>tJB^17WF{dV&n+0`)F;UQ*Tyy<;L>U4wPJOvi<9Gt{zYAVXBufzY}h zB)wO}Wl;9lBDEi$yWL#`OS>XAuixiPY5Dr53&|t_hz{bNWkb)Q7kS?PMs+5kH z85Ps(L0*}FjhULU2K`il%I)+kX{Sk`=&lyX3i}=CXOwcZm)QDfen)QXs7Iv`kIfaJ z^ZC;-LSwF(Wg?E?nI?eYB_}jx>`vPW`^Z*B%4*vaE&b6V%vgwu!NK>a&Wc9Yr*PVb zoIQ9FkW?6-TV%~`&V+fkTCICpU{t+MT9f_8vs%ZPAkb`Urga_*R=_48hS%P&;OPk*1qDAWP+rX+7ov(*UmgUd~-2+eiBgK&a6 zbGzWqjKSCy> z>hny39U^=PcFb`_#qZg>^!bDH_bH~llHHlAx`{$9U@bmqpYXH0QC%*gRVW(jEo^SmQ(#Vbfkub}O0sGI3f ze|vd%##Ar%@D!2Jv1ap&)o!cSG5*2b{QRO*(ZZfW_FD0441lEP^eBz^zA=Axtm~%| zQc9J)D0iLwFW|da<=udPt?nKA-i73t`3Y0)-15O>>~q@ZpAbgx@Gh|^Efhu|y*ps#6-x>qb))Q23Gc$_Zbot(zQYz3Nvurz;|;ly2h9C!cBQiv91 z&b9!0nM_N#N}$9rrEw;^6TIK6*0{|wVmsp)n=PCJ|1t?u1Vcr+`gyGKWJU?DvgQnH 
zSOxA;PCg+XPX4;OMn&G}Mz?l)qy{C%UqhvcoZf<$oYf35{2SYTY(LJTzAO-I6Xsi@ zCk1k%=ne{M@r?BIZr9H?;stHf#*7FPk{S+D3dw)~=0ateN*tihraZ*?qW~KC74C^_ zR6unk=L@>Waj8-mJ@=f0tvejH+**t?IIr456Y_;NNGrBJtBTh^PaAB?QH}?@5O5Y! zKjE%jZ2XFPNdt&wCNM-0BhVo9xa8*Ab26j?MccilsZ;>tbYF9yqATZ@)ipzb}b37b{r|3UlY2C884!9R?|pF8rM6+-om-y?BFE8mV>hRxXYLbFyg(1N$H>*D~x@cQBH$*!@^-$p$RR#*bjhZ#CQt8-*2k_9}MMiDN07kUr^} z%noQ+#19&`1b&T+OH5|UFFP%-KX9w2WlpS5l>x4teO?9q4~J;9GlN|iK5t+6Y@rF* zKV@o~H3FA}KwCuHJrnWCuV#ORP1fOLvs^a}UWOQ24(pUtyeYJ?o+SaAtSf!rJxCJo z;w1Cp_zLkF->2YO4r;>8yE!eptF0zabQ8!XzK=S|YH{Snl&!$IvGwYJFQp!)e!vB- zBLwQpCT3ynJNkD>EicPrTckVe)VUfi3aA-7quTDo!vih|>dTrqKeHM_ufA>ym0|=H z$tt9AQjLa{v3;MuqoYo;On5o8VNPtP_d-JR$+nEQq@1z^?vpkcq3d&0E?Fm!d9zqBJ!kDi8b?(K!dBjdy)vtk8sSWP>+Sl5JzOe zEbT6(12;6`SbgJ{YI9Scy3_!eyNxqhrZxRnd=%5PR#1KfpCO~$8F^!#ZhS2g)cvx! z;AbT?XxaK2os+Pnoe+kb+`Ss^EYY55`&tld5;XE%Vp`143NXM4^gx^nGRQU}X}n+d zc$q)INC4i)9Dgwg*7y@GN=_J-cqqzqJ~We5pXGpF zN-Loxsz}Opk4`?C)epZh$~q}RiPumC9a~8XBP+cGa$;x9G3%UPz$SV-_)GDzJ)g+U zD5Jsj<|IbJho%TZhM*b5B+=6H&x-e` zqMY9lr5>%hKQ3P#6RgFLhtzw6YKSuJBAz@4zM*`h``-m!L&62NAI)lpstKGGV^0Yi^Q8;FnCFi z<9B_{UUTwy)03OJ8sdGO=kl0o6)*hK_jB+Ow*SDXJh-&AB>odViGqbA76SS=3&9Zy z(_P-!0n!hi+EG+3?IwHrzbW3jrN7?TL=%;j(~^=Z2}aZNbSQySYs~609s4Qy2q4pz z)?`?_`4vJ!*Fe-nmHYusw!J8gCXUjyw`p8E0h>|btvm(Wp~g7n6kh&Ne9ws61y8t z-VNH>aqi*80BZudxi@e?J2U@$a|$I&+S!n7o^iL$WeU zX`oWG*FJv^x+v9zXG28kjUnqZUxwprb#5$A`c~OrdFP9{$1ilTy!c(uq$IOW_Kz*8 z&5bP?V4;boWi_2+|4hR&0T4LyaZ9Qcg5G7A?7y*%KYgG~-w73WL| z#l&E;f!Isy#OY_@6ukC$eI*vkye(8%;^?hh;w}E0s&4iUgsz#XCM_uwq&pR5f`q<_ zT)W$I%ZypV0Fi1`_NN1!+Q{@mrq{EISQ%++$Qb>AY9tCHZdZoHwv%dJ|Cq8w^dHJx z20G)$r^6WcZWC(3jCs242A76boL4v5$cX6Uc0YAaGbtUZKt^tq9V4nSHBzZigugXm;L19bo4}<=1s^dn_U(qz9)5LZcLi({276u=ZKAc={Y=3ze z%H}wzr9NW?DzjcVhQYlnF1f8{kd6*1bf~&ii$R#;vnh;bSjc@b?meI6gf^~jsMQc1 z@t>cvXpbI^dgC4X*Cy}+*~{O1ar}Ev`%n7#e1$lgO@YavE9R3FnkM=9OmYH6iOoe1 z-AObiQ28IzZMz`-A%XWJPJFhO%8MG7SP*d2u`KqWknA~O_NJJr*)?%xC%L%cxHIl( z*-snXjZ`>PA<|1LgVgSQ;+t+{KfhH9hx4!DscU_?^JwG{GqIC6<6W7}iUP#VX|$?s 
z0e-rR$HRhwpVV`e4@T^fRABzMwkFhbscEbdJd=x6H`v1W9jOl9YTA9o1yDSQP&>oG z8^YQL`5x%j9{Lt7iw#rI%|_s*HF86t{>PAoy29P<9TcOk+;#@bLzz$S>4&EAF+M}6 z$oItvx{cgMMC(r}g<`M@@~6&6%VSAXZ%ogt4D^Vm&83Cu)+XAeji#C_j^ifV_2Rnb zuP#%mr>NU6Ev><8b^W|?pBjhjVQmXMkb&zGePM-cu~%_@OVHfY6(THksDx-MIMRSf zVY7-d-Cagm-@Pkn$yfW;Yn8*$O=%2Qzuyj+7m+YQzt2!pPUrWHYTOl}mgVQE*K)J8 z-{;!}@wumgRNwE;C($|iq74D_*u+|uER1Z~)wwEH8KJ>8-cU(vp4<>bIF2Mas&k2$ zsY??AVxckMDWuq==%ZcD4q08@&pEY<(I0sRgE#;`=)LA_5s_$^oN2=@W;?&ZH-Tch zkGes1bscIhp10@my4-ODH4PdWunFEelHaGo;D9{%4*^euze??6wnlO{WGg-j}0H`nVnsNw;hmR+?k8S}bGZk2ql$Ofys(RAT??V7BR z_0@j|j7VL_JQCr4$bIA|R@*9jW?`-qAkXaa0b+eo2tGrmXLaR*vSz3W$$51HMaMeg zX4T4gK{8z|zButv_Y2}7g@E54eC%IYq;e`4OSAE9idoR{8$$LP@}RX5kuv3$X)iCR zBwI%Cse(%>S^K*L4*K!c(wr3drZ+H$_0&UePPOiCoYH&4!pnRneuCwwHMMf7(odwv ztF&-cu{-TKT!PqBUU51Eb*me`1wGjRiU|mHhl~wI^!<1f!m1l}0Pkj|@EL_2Ebm7= zDH@3I)eO$I0*r{m3vm5uEY6fk`VH@pXwcRo`LgKnc7c-s*V_CQB;E~Toe1-+iZO$w-%f!={@kEWE|9%f}ssIX2j!nmgbpT zYqlWA)vTuhU<6utov^>_&cuQ@+ckkSGR$bRb4DWU*qA%N7m67z-byO*6ohWieyg*u z1AHpB9tE%gnlZD2|(_@X2e4%Kpc11=~?^_v&I58H{sG^7sFK%B@16>I1-n`;+;1E0HH;6 z>skQq4&|ky%5Z^cDj_Tdj};&V;Mlvt(6GPVgynIv-q?1MMH!%tuL=dzYUw2z zl|Wb@oD#5AkvIN#)VRfZ2NXDZ3?`P{xtw?jXd9_bs-LViyNoHOOlFFis>H({B95Yd zU2AiuY_{)pSJmFC;Ys|t*+iK4i6R>#J?b0oazYmll7@-L8+JXq*NdJdB(LN2tY=2U zo0lo~jja1EY~RyzH~DnYdD31R0=P8llzl^E;DC)4Gao3@S#|a_jk^@)L)OuHx69O} z>WXadvG%D|4KBh9nV!4Vm43DRM_k@vIgUvwA*%!sM1|e@ilo%>hXV{3$%o+p5T-gI z(p2vEqIt^D#zm%=S(!DldB}#1#Xczt;WHb@ngdO1`RrxC44@fjC%_sCjpd)o2rxqg zK5*$n;|hlryh$?Jh;c4jS>MYQyx(TEU}wuq&n|&2RR2ajo&bgEb#&N^MGU7t*2Xe$ zH7?+1tiF;2j#(6E$qn*dGnm{nm%y;0^1LyagdG>Ue$s#Cg%_QvQ=>9*XBpfA+-urm zypIh{R&FMgxu5xCVZFnks-l0WqjR6>y4C;)%2>w}0+Pg5UqBuIas}hC73huY*?7gz z-YD}us9(`yaeH={^x{ZgmHvEOpb~g}gs^;cJ5CXibfe{txj}4`=;33MlNjo~abNvx zMiwA*dd8ftB6;j)&DzQT

  • ?L5%RTxwdXe;Ccz>ME5mng|%;^v6WH+gUe^SP#X=D^GcO0ILxfAI&XJe9)N`Zp+MjPu)9;m!FyN zrSRl*H4F<=gkS&<_;X~;lqPNsVJuOCTPfB}v>MDH=kM2Mz=1iE>y6nq=i0`ZcAgMK zb?H~6(%NUY1ss$yJcb3>ujUJB*g!H!Vr$G5nu9XXgKhx{%|s7MRz_ME%dH({e|Iuj zGBZd;W(r(|tF=T_)W{L*F4ERD@Xk?SpaM0iH>PIYUyH6!7(AdeAL6z;bl0{1ZJK7^ zY+~T?PhR-Y9z5oEiP(`4J6;I?14m8|Q#!rUZKnli%%WZj8f&-SVt3ncqPKL|B3|U3 zrK#tM8Df>7R1&Em8#^Z|qPBu7g2D4Ei23^-X1Q*Al|-al!Kil2w_bg4@$oMU4d4_G zvYw&QKb3CU5UdJkC2ykYz%GiWEIZ{I?oI80k&4oaC#;odis|Yq2 z+Xd}k&3sHM*54Wkp|EU3X;1%qvF2An;=;E3-%pjPKl{IeD%JmR@<${ARcN#BlBbV3 zFLX6990aD)Qz{Ha@9{c?7kP^31Ie?DOEqv<#JD#2C&f*=EaBaX7!re9`3feqwGn?b z=LoZ#sgZ~qAIl!gFwngq=kC}VMu&FAk%=!gqw28SP)&u(T=Q~KteNokX{6Q=s?*lU zX*+41RX#8=7wx zrMytX0)xpbt?;=yozD4K@ZKwiYG#|*=;@08|4d0`T_oLvs3~fL z4Jhp!cDJ_~u52iclO8hl|4x!Sq7FUz6FcfFwi(xdwEpR zU2fTdvADZ31NZp28=@2-KpwOrQwpEEK#H!HpX7@k)Y+7+lw{h#W073f1{EvPL1HHz zX+cc@Us_%053JO#XB9Atem2`GiTlc*k2h}k;Uf=h7R0{hKqafh##T`~y}lcUnhHwA z(<)MbTC4LY5ld5~E;=8+X{R6a5#B@z>@{)a?^Bryd|D2};NNnN;jUhVOlS8ztU2Vg zq1Mxi$3HGd<2&)|;~__Mn)pdMURt({tQ+^i;o$O5$Q~D%{*ENgiCcYW zmkXH5_+dIk)5rUNGG*|ztdSV^y0!C0EYZ#Dh?2K>*Ye%cI;k+Xe@b`>m)6i|xwm_r zGim(oQNP)&;l5aFk1e8vG+ULXm8EywARlV{CDj- z6?Cps&XP!Py7)y*Ck9j9%Z{C~NX4WN-+!w7BSZYK`a6+|Wrw5mtP(7lmDOvrd_i~* zL6u||W9p1oO`m33|Jl<~m`V%~UbcFn1=O|YuFpzZ(`$lB@%BOU;X40>zrKDPgXHN@ zy{>!0qPzT3cPQOz1@Ek<`QY0RZwL%gA4m+ff{BSIGxIH$ayC8V!{lXXootaBwEV5m zVW8v@cj6?iv$PtEaDA5apJPDQD8$$E}UAJ?oCW6N4JgX$zZHg0Co_ z`8+Zv71bX1W+hV@cS}EZPcQ|A4a@E;laL!s0nV;U9rc&w?pr^X$Mh1WS7_Rm5S2?v zhB&FxifM|yR=rA7Fc@MV-;;mvPeTttiO^Ldd&xaM0^q?%Va9<|h2H-%rl3A8gT zRwcO|a$kZ0YPfW={u;^qA!i7G0M0cO+8 zKR$0yr;bq*{{j|j<70?jTUiD~gu|F0A?MNFBv7EeA;9>Sv;APxgY1J41@ys>(Osu= zo?luN7fvijA4H)$pQ2sd1e9_)sUOAvC#+MN_ugsc97_Raii-yeedDOI{0Wl-Ls*s8 zG+t92sJCPi@uPl?=XSU;pw#dH(0@ zzJUB3(RrBnXJwS)kv{fJdI~;*oljzUw@7sajHUl%-wj?8~j{#V(Y2sS0vdiv#9cD}E<9q7#lTI!a z`u)B5rjhR9_3LswcX5XN(3Rw?&&km`B?$B-xUx4Xh$hrYF6ODE?_2wP)VIj=P0Te9(5C}(JtZeceIU1aCyb4|Xun>Em0J}Gb%;JmTH)JBa6cdt-R z*zqi_Oev&`#pgC)s>wR*c;h_5Rd|(er?h+$lc9ot% z;HVIC)_6nGIaDrRx1*Q!LfPv?mOTO 
z9DN+g3P_Ncqy7_I&X`R6z^wmEC3u8Uy6L@P5l%d8VPl}!jwZJ4BZStr$xQ2QdlCg_ z4lf#6)aPlfU*L@SQ%G;_VW$q2jFq@XdS>!eW^h%M)N>)Jgt|#(fHBJi-Tu$WnPqMs z)z`5+y{JaluiYuMdq+x8+42C%Kj+>RBZAsGp|`onLUa!d>{%kmRi@ZW-~AlF7t0y~Z=C+IMHzvx;-3|NROlO1QG~3$1K7yC z9E{sEZ|F|pP3<+iCK!6YF4Mrz za}c5X@*i&xD_z-ZGnI3LaUG-6`@da#tEd+%Cpg%m2Jh^wI)@9@53i@-b^D3>w97}Z zy>%^k{u=)7&;c?}5@{6u7g0zKadrB$Jv6)XMXMz33AZfxny?&mYlds`%8yriPgefT_c<0*DiV2LCb(J!)s3ZK1Ho8R~qgbGfoU*Z*cB+<8W0aHV zv((B9<1e|dbiSo!N5d_*TZGmU14*{&Zr#6^d6J>jY*!i4Y8Bc2pHZQzz}=Ah5yg(D zSz5r^Vev}Raih|sCys{)|S zJW|$NTf&=?(hBCnic0+(dKnAtc?4_!8R7A=f8vi0wF$t}RaOfP3K}B;urSUBY3>g4 z8W&bx?4^Iav+M0BJB~c_3-ss=PE?gSZH1%b+tOF>J;u9O(xd%>lERsb^VGt( z=N#YS0riT474ig?y)u7k|J{8GF!cM68dKtS%gb}Y@7r5Boeai0A=a3bA5oKoj-NHN z*;hjwn&cQ8-2_D$dr>i}rUlVUDNfgZ%r@U87ZHKcnvQXS6NyaNOgZyrtD{SESW(28 zQu$jK=K?8RM=WuR9(QH>OBECRL*{0*0flq9h~+hD*XY0sbKII3){44&K!1A1eZ{TW8~>Zl%j}A zIsS}U)KIlC14+2z=Bb`dRXChvYm{G)+o&`M{dWa`5P>at$dfGS1 z>N@f;q(y>>8_}UoF2$B6rnPc&7fI-@2H9KVgw8^9DXKhj$ta)p-Agz!KrEK0d{j7RmVfq+b{h=?u%Uyo1-Z$kxI71 z3uF$dSdY*8e01h{D;nsJtV1T}5D7-rz^6qfAO0Zp)@eMIBd2>#=mvn+q7VRUpvK(a z<#Wnn-9MO5Vzj3SXOeMVo1;MXoy3S{OgoD~9)OyGnp?T)Hdu!q*3dwk2!3jeSxJ5c zKQIJ06@>DY4~PiwWh1eVoSlYQ;XWU^0!hb(4a>%+q3pt$+tyl@Z{G}rT9j;$&_{1( z6r#i}I#W!iu(a)y=#xyq!DMU_#~N$om2c7AR=aqfGgzKzH}EjunR-{}<@K~D%K*1r zlW^j;c!N=gZmT_@Ff9<@D$m?MdY=;T4@^=sGxWSS#R|8yr>LaZ6H)Kk6;=wLQQV)H z95TCVn=R-~9<;vaA9052ZQk_h_Ub3ub5=OO7ZQz*Ouo4-_MH2UmKKvj8xd(*Ca9_G zqpw_gAC@j$so}l*vnC;frg#fstoy@Z7`xC|Bxi z&J|AO>v`1*Bg=7@B6UvP37Vx%hhlX(s|q~G)nN<6w%^&NuMk0}2#^yHRY7ibTS8`p z^(Ug}kWH7wa=8~BI>GdlDXnDnD$d;PT%^{MZ0Q#6^gGbz_u}FUJM!)E9Y2cJcF5gz zEw0`*90ThCMJMoDZ+|C^mC&#a7I9{QtQOLqPrPc4m8Pd%BWx0r_vGg8Y8IqP7?KjC zhNlagjA2m?tUynDB4gHy4V<(>rf{#;WIBd7Q^x&VSRfN@m9tG6D4Axgrs-=;-4i;? 
z6YCX|^WD~Zqvdm7dmuKAbu>ibq|3+;6azms=`jzS!lB|03Zad?#Hk=fbYd=RY5|pt zJ%H5qP)vhOPvM($=PgB(AMTL~1I-$Ep`aEQ!(NZ;@`FC{@D7IbF1zvfSGZf1-pL8o0dlo}1)#dx8lU`LUK=TRmKU|Ga*o<>IOLwyTvU{N9zWc`Y8N)Ykfl z-?W8dRcNN2jcdA7Vy|kKE;;Wh_}yu*1*r>~{Z1j$HLpI@_T2c7cS^&nR=SYs+X_=q43FEd9x-vpck13=G>x?{)$N}0*_ zL2@j#ahdW4MCa(PRD<)?$jKdJjA@jm-IozJOFq%h7rJRm4P$IevZxcy6a%h4GK{~6 zeY$k0nPmbe6Vn`Y{y8WUIfs)m!8X(-mi&{RnR2%41}ZmfEk>mHG-RQ77-&z*XIZ6#YHmtszA)HxGsZz(97 zBW$mw6ZrpoCK_02id@1DsA;cnDv;?w6KoK&TJAz0(}xAUt5k9IrG7d8;y-k^N5s&y z>#BK02z?Luv9;Ej$`g{`-FW607hKjdty%cX^>~T0qHTU7SU&<=%DY{hsnsrpki!VP z)RcJe5da+vzptDLD}h};8mw^>IVNdp`l(;vn-6KW8SzvI<{Llk$A^s6D-4OiwJf_@ z95`T{XZrCsDHR>xC^eOcC&Go)xBn=!5)gYdb`VessnWs@_0qQE7<~FpL*#q!GT@L% zhTni)Sn25LA=3lHtFZ{977g^5#l{~UWK*JZ$}Zr+LGV0K11f3rG{&+oybdp?D()cx z3JFIFXHiA*B~PaT)m89mC0(Yu%;tzU#VM&iuCGQB4qEoddjE0^gMD2$H-6&I)5=;T zhk`Y;qi(wCFMQ;vzg8cj$kw@qgK-YES?N{VZe!gUG2;5z{A8iX)UZn>+33pQ^@jKj zkZqS>kw&P6;=*;8p^8_6c?q9G+hqq&CMCZS{?;vpSZLN~qf=Y&P6ccAHbfx6J+-9s zHE}z&LF5oW3Y_7#H$MP}zSq8MMAH4uV%;iz2BC@(bm?KvW1sW@e)aDoO8+s6Qi#PW z(YUIhWQq<^CC#k5*KmSpS>#>i(MGlfmGD5ESZiN#gG&VWB7Ui2e4>*}q_?M$O|p(k zDTb2*Vx4~m>QvZ(HqWG<_bT;q_R#$?ZxVk>87Hlh@y|W}cF4O){t6g`gtQ$|MCZF0 zFD@27#WcjTNWYKIpL}IAKxuF3TSFSRDXjM7gBd_G#)GrO~vXzk!)orC^ zi!b>chqhS9D*}^3nfoTvyeK85IXExv-vm}JmM8Qxv;|ES_d%2botFxqVu0B!>T+~> zxGc8TrTa+h;oqq$rZs;mL1~=yTUSXltxZESb6yXd7(0$ z87e5y5~}#`4TKwgdOYL)kcqxPo(+^94byi)lZ7F&(!wl@NHYTjdex`syO3pA z>l;wwyBJdfbt41S;_LJ}9=O`8G#4FWx_d-AN(HEat_0k7t#)2!f=aDF3cJ*hPWjecPPk63q;yE*G+JT$&z0vJ#`I)!T(i1dEb6mQ8}dNz>_Lue1$ z?QroCERtYWlGA1W``NAjp2VkY;C-DoiL6i7tvR#{%gS(zj5nv+$(|Lccm#eJRR?=y z0~piI^8O5Gx#?kay2wkEu>p3!;BMxJ;y!?u;1-I36}8jZ-;X&FRAOuIb5?6 zZSL*h%>!FuWlVm|_Wn=>#-^6Z#|StK4mbCn6u;{3cV|R}{mUUmLRDNQit7Tr_nRS>ib|O=>WnSM^_1g zum9X6#2XVcFv!e8yNTEr4r78oaC8>nI5*|sLiTXvFNX^PRp0zN9%Qn=H8*60r}TW} zye>(W%+eE-E{vo^1^><7~$mCehk^jEb_6EKwl-ea4{&8ABOC~#yt;rfM z71_-jyZRR)p7S+MNh6=?;g^h4cV?*M$5!*9dWiz(=%hr|331k1%_`i=lGGDFKVraA z#I$C{z+(u>2zi!La|h0S)JWS(Re3}R_Pmm>I*Uh!tL$Hcl+q}E9!0agbsLiQ!GK@* 
zIIyVjUqGzLzj4|&pOgDB=TO*!C$OpRS|+JJC3i7fpen6{EB8ePTs82toN0=)4KyK?}K|jmtqP z=fY#EnP|g-`8;ZLMERl0C@#>^tQrW`KH(jI%rl2^Z;ZQc%HhtX+{JbE^1mUP0m{*~ zu=Y*eu677*Z>9|gA{w892A7b1O`v;XEveO=LB;6Ijir`+G3>q=geRom(YcnM&&Wf7 zk1MhObxm=>7*+cNHoSCkEp*ZhYO7FBexYT3plx3-_AAKSEmk*zYamwPtD$T8?O(i+ zu^#I|?*HNgp~B&1`7I8mFoO&1KrII96(fI(l{7gJf)u2dIuId*F)+sICkYV_b>f+f zSam1mQ@;&l1BK)dun0C*P#xIEXW8|t9Uqez_0GeMV|DfSTBx8K&2NWzit!OFm0wPU&hvuxIDiDnTgO+lgj>@djVqa*f z+|WgEK~wVCDJv+H!-sus5Cv-{_N_gSr5r6_5j!g0S=QDaWMjnmTHv9k1PS;8g2gH?G zNLq>-S{QEdQ#(zj^W7>}1bPtuLqWQ)0l~rb2hz`y_4pddmHsM5U>^X+Vk?Fq)+*M5 zcwDDV=C+=>)bDgn&-maikI7?MCzWpES)D{}R-8m08h1C%N`lUx`zjY?g72fFPtwL* z9$9qB8#<9#SH6!UlTW&Pa?dc#AwN>*SSgj->b?`2;8BT+V_u@aGq9?oqjeD!t!@C7 z#<#rE^sLyLh57OVP5-_5?Wk)(H+m6L_rd>KgU1#l{?b+Ku{02#ev^ZY$6avtN$`JL zYg_Axg{dcT=-D}(UHdedd?Z)gXFM?$H%<6(4HD6(qC?{?d2qK zmKPiOClI;hCe&(Dayd{b7vLkGR*xe4qh<<=0?k%XXrEW0F?9HR?yO(7lOyV~rDXzF6kVg}0y%qV*qsXrOs@=5UE{PV>P8^%qtCcO7)&{G`upH|h<3 zc=St}{R))1((|MA^q)RgWQn!ekvCs$U{>Qfjr9##0$$PDx=?}j##yY{>V&;+tF~)q z-83RzNTsQy)G-I9nqR>`3SHqjoatg&?@-x-+gFrvkzzc^yG9fFhFbN?r?yRjn*g?}$)>iwig2na3oL$zs+emn(f9zEfr2(YLJf@U@=gk(g~| z1I{4yn&Axd!BGwELMg;+GWu~DR40|xbONnGDE9z_*b)D4-4d$G-QX*f;9EBdN(3Bd3D3cF#W@#|WkgQ6o)k zu*O^}4mWJM1gcAIoWA_iP&3lucSzW~ zYv0;rlZUb=R*#k4&@~>jTSLLerZ-zH_rHH(ws?yFFYW-#|H&O-{$F|p26_gD|A!&? 
z-*RXMdJb0B|6jj?tFe;SWdjZV7DzvitLy4sO@FgL{9Lqo|jV+WmRD^H}IvRco?bRbjhQ!GKOv+<{(Mgf;|B4g=KL;LuJJT!pue)*~Yh~+hcj13Hfk99PJ6G(5Nu=1Hg++Sb;g#usY!oAe$ywcemUZQL6CpNK!0M#K6_DKcUeB@=c(sLuZ25Qv*J1vw^m5VW;Xq@NKKV6S{5h`rVMMCgkP{==`!!im^jG_SybG0( zfIfh_GdcF42u{w7gdiXvgFrkwy#M|+`i=~J)HOwOR5gtQ^u+~62L__tJ^aS*_R5U> ziaw%SLDhdS32M}w=vAhcf0pheSe>6-eG$FrM(!2g`)2)S_!7*an1y#RBh;h9`{X`6 zM_Q8Vi~g2H>AL~|UcO<50?{ruExsH@he8r-A$sBKLL!iw*KKo(C{8zX_x76c=B9Ot zeK|eEP;Ln#7n0L!$3u(= z1gWfK&Qjg}rSCzrIQCT4dkJ>Ap1St5Cp?Maae(erySad%M38hW4xQ&}2$@o5>HUs_m<3?~2>w4FNqPturH`}`D`zy;3JC&+QoS7GG& z`GjQEk^G)9oh~n{R>(dCKQ6wZZ;gDHb$a9Z!F>s@d7CU(#>CA|bO6#FwxyCmpo1|X zYl^i$E+sOj8$AHOXRPa0XbNe!X}9Ws%4do6bv!~ZkQ>s#(c@U!rTl)#q$i2bh~WIT0`F zRUkGL$8>%^kQZ|4(oM?gDC1LqPrL}$^$V{m)U~zn&@AG*lFI+(L=}fs<`x-Hc&A)f zfd!Z0uev%uWpJp7&Z=>-gzWGB`;>qu9#$1pu^&2To*C~d4qsNi@HJPEYX`SLmD*M$ zF%#|a{V}g5Cua#6=e=X@ac+Dz#$3dsp(8nJ9`Y!pn=}=H2$a3a02Q5HJjUwtK&Q|R z0ju;5FNrOyRdS5f_=v?yQs~@|BT?0qr7z{X0%iB!6{c0G8k~@6nBf)fQ1BFXY74EF zh8sY0_YAcA`T*(=FY|nrAxzO+-y@KQ{dq~xQxzz#o9?PL;s-Tjw)4;x1G-(RzS2W^ zs#Ede6FBfiRXWP<(2mYI-e`1SAo+C}pkKn)3ixxrwgJsBuZrJz$lw5fFx9$Y(+s2N zmZeSET76g}UR>}csH)HWVQu)3()E-1ekwRpx?XOV;mrldgEBPxU$e$H%?Vv8(KB79 zA=6DRWNmjZS{DlgFi<9*K9p&@l9o+-btxZHjj@yp#nHLem?h1_6=^|YMVj@k3MdyF z6js|Rw0PkOtm)kt)M-Roos$;uZ0vKdu};khLy42=7u7g=KlT| zNZ#oaZ-k>t3!o9`biI_;%dEVXGxX&Rrmm{JQIJQsi=G?1?ofT-?YS{Wq9dw8reQzUS%va0( zGN&UC4fxYWaLDm@8=<)DRGDsTj2Km!s@nOl zAFW|KRWR%67R)%c$(W4xU54Skn`hU@7{3Z&B5I#mB-Qsk6Uf0zymXj=f$C=#npoFI zp_fO@5~B@D;DTk#GjMRMbBq-MLNN(hdjjX!}bRpc@ zko|Dh+Eqm6z`3z2O4=4Q@rBHb1!R~+Pg06tnsvL3Y8_I#gLCHvOJ2f|h+~4=lBmz8 zRjTX)lV@*gWEYpa`t{Ob^>69`?ln*?%F4s|-;SCe<45tfgNp6==Q=`nJ=x&W0@D7^ z4)Y0VhCY6evrXbq@EM*IeDC{-%F~w#12Y1|zTjq2*Sr`HBnW`_N$$4Coi12Uv`IX~ zZ{=+JfXynBnghRa&S08{NOmRo+fR4Pjla}nDmT`_$BTho(tUW5`XI$mQv+9$>%^Ku z6;DzyE<(rP8G6eJ6PB6_u%dtoKY?&42F+xwfC?t}!rmcrI>>ud`h!`Wd@Iuf$IM9q z;}KI=QWm;BqS&&tpfjrt;S%>rA#nu@dWgw>;J4^gVlH0|)@T?L=eY0xQgyk7H9F-J zW1Mi9V{c>#?n4M9x4go?^R-~-;bNAdv|G~cXHg%hFl}$=jqVCpB9=6;fPbX(k>Qy4 
zL;180YGNMYWriU#<`c%ZM~JZQS#^%dA|=j3a9c=XGg_UBT)=wLumkZ!3TY*{@@Cj= zn~%^Lox0tZX5{}v@|+#6=pw^rBt-wr_@qa6bZ-B zk4#S&wPL4Z^2t+_QDSU_$OTQy=qg((Zr=66k7Shu8@w`E*aDbJHEOw%YFK8Q8N@ijRKuwW=+|A~?qzmD|8 z`pT)=>l2o&AGACGNq(+Yev+4^HR%wJA$V!#?3ga3>NTYL*>T9Fwo zww@M*|M|D4(dlKv7mre2yoHmBJ>!Bk=VivR`qGoUzvn4*y*DG%Kq))pJlRu=cY9&) zti$q8@RLb)%Q_tG8E<ru&7 zRelLR#I^sgFeG;5{_{B)ux6%;p7%+KBLP8jKnnS;^x(&tXzrI_$GPVa+hi*5Zta)q zt2>_B6`^Y&%_w-3*WgeENS#*9%X7%S{)(w z#qh=6p)we^tLHdr$cj15RDoc?a2&gAyvGU9=G0qUIB3mrHWMME}9*ercCK zVbZl=@h|tCAj3Zw#tCNpPpz_dV?8a=zLp7<42~5v#V@CmWH!J z5bihlpegTpqmFdL6@4Yx&V^pbVpe;dxPPDh!xE^hQmv%-FIcbwb}&EjBVeIuL%`OcEHnvz%|YNof3a6(U!{rb z`_=TGk-r}(t-Tu?Lh3!mYY5KlBQspmG_f-*LRW65)baq;7V-Wx0UVxplK|ANZ_n`~ zpxJ3nVsY1alFT}l)?1ccX6WsJ`+aJ&N%}-^zrYX8W4wycJ~KSVG2Ro&%7+oFNH@=n zeB|0{zKkoqqNt3FM>!R2%i-((`aS=b3eR6u3G7xO#n*j{W8iI739ZJ9W=Q?7sgLv7 z&D6IvPZbLk&Le)JVs>4gwjFlTaNa^3qaXf5inUW?VBwfAMma^2!)ekaHE2rM-PUi3 zOg)WbR9PpYzokwZv#1ox%ozHqH-c4e3WHWWvt_Q#(gVUsHL1naz=HJ~1PgpY4rYwf7z7>QvgcgyVY;=6*5QocA~k+*u^gT-~eg9ET_@zO5D-UrwU2LdWJj z?8@Hqg-#1W9s{nHE%b0_db&thT@JWNJ$tOj{KH-oxo$O{IyrBIp~~gIpCw%1&qF8R z|KdJRfYB5xl^e2f-t6A$Hc-PV$MU!Wx7Vu~+@Eaa*ASjVPS2`M;GRt>%!5p<0ein7 zoX2bf3D`{ayyXuA1URsUxM@nXwvO=#{0bG8==d6w1}+~CBTq^KHa z=nW#iPNOZ(^pOX>Q>rSaFqq3UoJ5aqOj=Ddx( zz@qzoWt%9J{cq#tU5jJ!1tv_aXhR!L!6S%?*Y0!zGEi4P z8S*TwfT^b!A)167$~$0~HI3E?VBYwfW`WToA^;VQiqQDZ(Se)Pph+m>3jmm;3OdLNWWpiLo7%{qjpE;L zB}=fH48u-*)kBWW_oZ#FcQjP8QYuP^+UAaNO-Fo~QhtxhV{J0yqY=aO0R}6~&V7g9 zA~&E%eGUH0(L5xsDH3OOe)E1**+psP0srHHOFLZg7Li0n)Ya}sGfx2P5}-$ZJo!BS zV~~H%57A1R2s81bUc2-MnUP*Y)zC(#-g;Gh7-QqJ)wd@5jD5XMEF@ z;l;pWd^{TCq~U37>_SkEX!%y<%u&!I@>#v$*t4kyHxjah*H1cDF72rcHRAH6nn@n~ zscy>pEv}MmCWPm7>0sle4a379_W?+%9Hw@WC*Fm_O@h^U+SIk-naL zF+^SFn*SmodOd;N9L%#qo#0KbekI!(9E^ewG9ksFX_4y^@5{Gl`>#a(Ckzg{7S8eb zMZQnxPh)dD&&R{LgH3lAowKeBHiVfm(fgvS6nxC1P7N3Y3TAdm3f@;x%y%aDrP3`K zl1Aw^f)>5}!bFK&Pq_8TNYCSw~O8#9VCq)imJA zm)NcxG6H5rff38VNfGJS9ObtqIjS*5o)GiXQI>KHTppbFH*nRFu4$BmR_dq{Lf|$ta-cI2;ltotn!JJdnEBnW9MTv!4=uO!3Qk&9!q!`(RP71`poFd)s^Qy 
z+`u!WeXc#7j82IzlYDN}QJuuCs;k;>WT7#{Wow*h+R)#7h>jZWje!TQxwguzpa;upz+3a5c+a~KL-3ea7!a~7xD#SaFkcO zS#J2juB;i`YSH;qqd4x$m7)l`MGRYR#QhpyZ471*;Ch=z!!%;Lrn%pKm_IT&ZmIw?~2?}I`sTj10^3_=*|=6odZ(xwr=kw^RPl=WxPTr?<2X-WI~WuX-fHnUBb-PvBL5v5j=-@-dAdJbo@; z(o8Mau)FctX^rDFEYIdg+*ehuL9JP%0x@8TD||Y2A^I<;CXBl$2V4N{L7^%%xO~En zI+*W0-rVcV-gsgO5_<18aEFuf7N7njyDvgNs_zR_dSx}wmS*99{r4diC&I2+;sfW6 zL{)@ger39;u9tDeF0q&u)o%SK%`45st106K&&3;qfQeQNeEv~|j7lLzwtn_XhV1i5 z3!nC>*8?KWTChGz*7X=y?nlP32>BgtDOXb-TxcN2)N^xly3Fi{B~sGo3ji19O} zT|@r#zt*HoHQ35WY-B!?)Pv2KuOQ^k+XjX9*bW)CQn4<|Z0%_K9Fdkt#MdGOt`P=q z=To6~0pNE}a0wQwmbNmmMl4lW}aa81pGL^;xRB(mb3;dyj$?j>WPA zmI(2NpL3Rl#38NHzaq#8vYWRH2~*~L**}Ihxp+daj#1cRy6$pPmZG+$fatIyIv==( zx+Sri_J5UevAHW33f7;X9pnH7!1~9!iv=>+lJ`DdO-d`qfeui(m!Rj-BcyX&MAej| z4=poId&*R9Ih9O%{#D6cQWft`EoQ5L+;Z}VL8nU7*p9yJ8W?kbnUo9qhiavsEpsmb z07s5S;)m(&Jz8b`4_;B2=;<19bVpQiYRmv9p-wNT_tjgCI$1TnH?JnjTxyvmeH10r z;Rrnsx3_)w2St^H8vCQfw@0a3)IfNjI9Rs?zX|muiPe1@nXS_cMG=0#`@|aNRy5OM zlZ~4zK7R2@zEfjJEJ~ST|Ab6)3tx*dgpipDPY*U{Zb7R5O4>WNE^RjvZW-B@Rh~ou zG4_?0zmXXXXV-lN%RB=oIF(pX96S%GdE0MT#ssg?h`>2fY(E zx{REBOOW?NcuKFKepKOE$sx;^;wg$k|Kl>izeC1?458@vbB)JMv^9raqyjfZhOc5K zm~lYSYqUFuE%=7U*GfJ$w1>)dTzLC~e zRg-JEX+rZ3cw_H>lmf01!mLJdPU<1Zu!x~-_>Ba5*hTgw{q;Tl%vKeCks>LyeaBiV z&b2Jx$_ieEO=X*s*nF1T3Me76477I`jl*01oPtZK8XGoeYghXSp3b0(d2=WwiqYM!3_icg=%))&^Q=G}(+lHQJEQ2c)$Wlp(|pDM zW+wQ7uDBk6O4G?}SkrU(xow>*?+|S&?r^Y|(pzKrk`Gk(pnd0e2${tpWlojH`BD86 z!s9$X{S)P|h;3PZ=+5Wlt_@X$O5tjp7K01=?0#*y*b`3oCTI2LRgoo zL-7(+?y~s<*iK{V)1oSp^BsQIHYd-5P@kM=*@|0u(pa?e0b!4Q&r}P)y*zj*-J&T^ zje`W!s@i>dFg!6}kX@a?XO;l%z>_waBHZ)Sw?PFHP%%XB;ijH*J|8oo^errX3%f1@ zlr=>@KvClc)~}Mj)1S5aO2FAY)GaGK)O<4T{t`4@-bh!$Oa+&KWgz2jOlqqV0mHrE}^Iwrn@>O10Il@ zM?2cOfAGe}DGtwKG9;m`Wo#@&O{JgPrJ2D{uZc>t9|1n2cgNX7l8F*4qF|m)KSPDd zZ9V=!jGaS!E=mJsW1rZzZQHhUV%xTDCnvV~#oBeAAmJ;j30l~8LGstH_e=s@`*3)w#3BOjI~kJDvv;2OkrHjG@;Jplf34U$p

    z>TNQ0%1TNhd}Yz7o9}rPYSMJ9?}jogo?DA=6DCw#dn30lb9sECvtsmwx}^1RIlVcL z$8{UE3NZzn=-TV+d3y6iXN)M5!DjOx?j_^GU)+i?>ABQoLs<4nQ{}N~iKcc{1(jkt zD?YZlirO@LqejB8NgAJrp2YH#*gU#l(|BMDr+8*%rf0jR9FQMtToqPNj|T6tZ3$)S z;Fokyfr^rRm_lni%s@QFUYrwo9-@8rw_Z9BFF_1ExBaM1abgJppt}qaXL02j*lAGv z%_95dQUwHj;N#mLO4JDzzim=pVy$r`gU+jrIL^tWR<;jeWA%hnkdA?iykd~A19~yu zUt7P%G*KIoOH-ZsyS``s-9k98+@62Q_c2!``#g;*-YGGV#&M=fK4E_X%#9laWilf? z&|=qRfm7_gisF6_D~MUi-wg9&^a@$>(7~tOv}8c@TKhrK;=@3uWy$0t*@xhB@`~~` zV&)&1u8)tCuKgrnC9Ykb1@XK(4d72&hab_3fOqh_SYe1%NIIn$sgu>3roqk)M zqV!v%fR9i6&$`SRjA<%K+JZAj)7&(HaZ82173fn}^mp5-f9%#s{k~24($IT)J3r2a zp5hooth_SY^{EW0OB(L&;#uKEe&4FY5WDukTL@4*wv&K&{SMDnRcYWs%6JP%VU5V5fX5ZydFu2f6^tZZ;TgZ?NJP7kV*Vr& zrN~A})A2mytBP&+0#QoasG{b%vat4uX^Tem)K1LS7R{@zUm!;OLQ~u(_#S`g7)!57 z{w9L0p7)q-&Z5Rs@jDi(EItABWYw`MN3i&4$Wm{qQ?~C)#jDCE)BUs;6Gzwl-65^| z4g6PE4co%%({zzoh7(TK;d{o^0fZl$L+MF!;+J}1`K71fTvl>osHCs9cmF3C{PjG3 z&cYTuod#5kQ-WEyaohm9H%#Imn6~T~QtdtW4=;-z1#eMUOe<3MFY~U~2dPy5IKvCn z1TuR{XP9%Xa!@5Ua8NN_F%ao^+8>*>q{8k5x~dXfIxq659Gk&orr|i>!X6pG#Ue`7D@2@5Kca(WBLA4?F7+OTDr5 z6w7w@xKf-(u&3!!_9NUadaKxpiHm@M+{v;+$_G;uyDg?aU@`l@%UY7FBOlsoiEl^ajxPA)oaxl z0=2GSJ20|-m&t|l$GL3oVRn_MQvnw7;;eMR>|4gu3zToXWU$ljbqZI_Ob^t8!bEZ{ z=QVc0{+>QzIV<6BIw%)xvmk$xW^i2v8~06aQF38f$=B+C+&zvrsi+ASjkd)Ue1MsE ze9;SU^mitQfNey0D&Fng^zAo4Se zg;Ty!kn}DL!B(z`<+aRcJ8u8P22XVsMCPe2JtRj8#@x|DQp|Gz9tN^x;_>EEC`!GE zFbwrC`p{LNeYtYYK5sl+t1iS#yBnMBRs;P+)jNJ_KRIBJvMlWFO%*Kf4)p#uVQldQ z-8MstPc)GaYec#sjJ@UahBRIsO|?DJN*cOyOdLhB-g}M_1A=Uj#yngr;MV@ijNN0 zl4P4HGr|8?WtCR>n8J^Wg!nR7L>k>LJ+dfLHGGC+0i!a;p-P>M6;eVQty&+ZjN2aZXdZKrBqxljcABv1bfh z5XE&Jj6`HX+-*xpUKVTb^jhJNFEyGvs?T?r60?5&21F~*T}6X@%>CCT!<+93Xm>3P zuX!MW=|}P~os1Jcqbt7c4LvY#acFVB1bp;EOCYoIZCl-AXPSRQoAI2QdN=^DOnDXkr_;|sC;_Zf1rZn;^jF3v_+ zJ|{~bG=>Ty*O-6@b(&MJNyC z%Dy}X9)3o83zivPv*#A#jSXUb`p)JX))uAW2}X^4#`>Wf*`*AU@+3Z7RAox#j_^bL+Gd<06r8_EWTxkl z5yGilBY_-LsupRIZke^2>+u_8{C00N|B!x7%pmt|2l+X~_#3FIK(3itQ>KphJ_fxf z^PngsqVNk#IBMk$Fm6*VqsNiqsEYn}y<+F9jz&RL*z7DQaNLfyzESy`;l@qki3F>X zH3}Rs5wl2AKO^!HZD 
zT6mk{wXI;^iSJw)FWiz&$|nJAnz`Kplq9c!${5Q}aw?YUP`JW!LJ&5pR$xJ+n91c3 z5B`?zx}(YiuEqi>;-#`$OLi}vDLlL-DieHJI!%m|CGSl63!9qHuqv5kGA`t;pF&>z z*dP+aPWE7Vu?t_33CTipdVZ%h4)&IN)n_#=4eSOb@(2}#g0_^dl)8r1fo zMe-U1vJT8)qwxorC8Bf;!kw3aG1t)@?I?Pf9+>hanEeCFVvUj8il`du-t@HxuqKK?VVkrOX27p5>VvNxf&vp*Fc6`dXX zlK^ODb8=O01*8H_lS)+B+1$auQk(h{5+gY?u{$(3HY2e+GMqq{?<>cv-6SHQ9{ziHWR++Pa0*%gbu z*rM;VQ;Oem19L+=Gb1PlMph7mg!Z=&9Hapczj3A?zx+wRIFvv3L~p&JJHH%GzcdoR zlVm^s-Lt=F+FQ#b;yZIMw?JRsqd-6OSRy=qyL_Mt;9pG^HgLH={`~)q5U~IIrT9<7 zRb%&Y>+*|WpiCd;ze#Lj{*#U^?ls~MOP4!L$0sX?N~ae?2V@q;?(hA=@Q%QPqC$Zq z!?W1C^;1;?#7!{6VQqS7j`WX(^>R%;$E&F2)?tlKF@dKBHZG}xO%=()R{>dOR z+#9@CcyWHm0r*vl&FLQZ$NdRJLT`Nq@>pbQa0WTc;L7~j>FtGfWNP~Ou^0YAE>J6(SFH{O&|e(cM^`hqh#mTvl?N3wzo2H_pz+oOCg7^69w@YycK z`1>BMq;23uBXd=rjcjG*6`Ie0zPxi2$Dj_+QW!!<8W=$mUnjvk7J2(9iCbYSU|~y2 z{$4@_eqX;rOl$%?tebAoTY;gH@#t{!L#Cl889oYW$NT>>mNx7^ApXvW4z*e{mMV23 z@<01Gp`Sem(vlw(b{MHtk5Qq0Upi240ZIl&=Z zE{tM*4;v%%cm}z>V`*lPU)spL#E657Lu@}`wPKYOoh+zyXlUm}m17f8s8pm>=(6n7 zf?JRz0%T(dh_vg847#XI73x{l>lTO8fuial%hlP8>zQ7K6YWA~kyGf?&e!-Z<`dJE z6crb#V?KyapdaMgli1`9@-QGtZL7dV+N?Gz%?wTOaFq#**lKawcl1s^0%DH;@Zf4W zO}2(6@Q73AUG<+5snykJbaqmR#Lz+cDfJn=nBcH zsa+nR7><|wwUvab^L(h91F$(x(;c+k>`_1cU=aM+_s+x6jPthS_9P59z7h&bf90dN49qPO|2@CnragPge^gozKSnF50Q!|#H_*BG+IF}BdRO5)w+%|(n z|2I#^bja=V%*tq>x@`GlSE6pYB^~d>@n1XcnqnQdM9N!5i}@K7AB!MsM;jU?$tu>+ zf4o$AU3XraU59kDuO8-GF<#u9t(bx%-T;!dR6s0>R#-49AWt_QTe2_$L0E(Af}v%o zd{2ost&e;{^MlCqQ#uHdkR1F0$IsyUSwlez#s$2kt}uMH+KqPR!wo+Tv)Y?U_}}Y& z@AUg;jYa-DL#i}fB4+Pv+6V0L)thy{W8mnq>*I`xh%3IsX1r+p;O#5T51|Q&tC(t$?2Q z>9pJ|t7wHOZ~)>>96>GAfV$iyWYz=cdf*fBXYxDe*I~hW$q*e~rsNqs=<03ySk6Th1wLw}S}MAnhc7-lVuV1MdZifgp0x zb`kg16XLxX&q$zl+x?~Gc0xH^`#N1+=Kl6ko7oPrW!o_bWmS)-&)MqyPhSPqGDlH; zR>6nWy_Bb?sOAd##(qTJHN8=utVWt9K`+Z0OwKN$u|DCGqj-3 z;>ib{cm)J}vN{1^U zSUq0eQs0Bu`5HdHFi?+CY4!?I;BlTbPN3Kad5GdV+-s8vs!z1L8ri}H-D!h)scf9| z8p89)0yOTCp~$`BKkIuNO4Q;V`>oj{puUO8=trr3s3X`In~9+~W}D$CoqB4$-q{~E zlA&5{1AC!Wv=6@JGu|g`n`RZP&NP=OO|E@J-Kw{&@6MX6)mTpwm?*qw+D^s{fK~ct 
z4m=D=#|DAU!Om_bCDKz>a-Nv`97lwmJ%HLqTBn(bRVP{LW1JML9w&{|@O9*vJH(B~ z1dr=5xD%utWDWyEibb7($MThu2ev%-VH zT#3tF(Dc;AG-Dh{mP3F?}L(ZUKp#V+Xl0TFyEGxNoaA_+}Z^dD!e*a_}rU$~I?V zI(x`9ugsO?_F3IyVa)k!=Fzjx8q_|c-?T=S<;W$Pcz$wDfPBu*itiD9@b|~WDgR%U zqI+*u>*ed{351=wfYg|JS~A9Knxoh1_yn+c^3QL-E~aM(P-ncg2LqH4jFm%}Bb4IJ4MWe8 zYAqA6ON*HRr~zznu1jc6^C^mzOAJr|o$k_7d4N4D00r6q3LBUftuW+0k5)O71p8wMeO65` zR7`_#jnVttFa7=kwbTIDuJ1}uYy1VIaz9eG z*MmiJ;z4340BdI%PWa#0XQq#gm zsX>qnrmFij-*6+wR3b$T%An&t^Eig{X;Crn)<}t;lUWAQET*LcZ@$aE@IX=|KRNa0 z{Q8i|b#a21B3|xR8Yd^p!FFE0^*L=zmJaTkWmH;?fGt6I@cM{}GT)mCvD)b!2Z^xg z2kOP};N}eKH2r3SzP5>%$e>sK@&NaZJNaxAiFrbsxnxsf8m*`UuASR^@!A6hnNVw& z3?4b@0a@XBn>dT|%uOCGF}DT4C(*eGOZT(2H+hy=5^u){AguFliud)p&ZA%b8R)(q zNxA4RX%Iyp)Lq_(t^g!9(Gfxj_IRnSdO98F;&Kd?{gcJ!?!*xC?DC!qYIRW za3DKh2@r2uKDJgNt;hvo!C+b>Yp>1N<*s{Z(b|+y{Y1=>z4Q~tZys8&`awMZQSLz%dSR27|yA7rD3*-%iezwZz##bZm_p92H=U>1hm*l8f zxrtFZ6@=|5i262NJj1R!PytoVqj#hb#Q>QZ0!MtyCuu;4x`#z^E}dJA`8!dW`X-WE zf?T-z;+K8LvR1H^TFNhoTc0ip?eqPd#jn|W|jK2LgCZ9rCHqkzLXF9U79C&$w)Q5r~4Nq# z!U&RM^MQ7iG8g`7&tjXLmskAfivO;@jfJg=90t>uHKG`co}CS)TcQ88|9DIgy-xo= zJ`*)34gY!cST&vAbAOKF51g_fY1SK)ZcW@Re*OXk15m7L{?w&A z#terD9Q+6~-~A#^omukeq!@AJ*Op>}K13QU1|gDzrwGvP|4UIBiCJT#Q9 zFcl8PhwT*xj{~W>~a#i;7uEiGN9xx3t8vSGE|CN zh_HI@zO`WK8tAx2#R9jJmw6HVR5RIrQkuA^KT)6&r=)~GCLz&!&h}wz`G*ST;;J!t zMDXDa1%~Phhp5lmA>GIseXx_CAf@;ld3pzc^^e0-8aNkrsONef0nFti{F#F_N)&Tm zls5p{)|@2iKGA`Y%fq}P$7?FIdH|yYL)7hWGy0uEyFz~q)FGWh1&XFDSg#+zB^zPf zUBjfQL6IpYy+0$$R2CrtDy^WKix#y$M`lE;*J{NuN$Q^b?F5SU@M@@OLFq2gQ?{Zn z^QJJg?eBt&RLCbAKWfDverq8ES-pW?HDR|kxWUG_hGZYYy8t)oS^Tm-MG4!3KTAl@ zb0*xE1#~i-)sFZ`4*%vZHymG$vU23KplQO+)J)Q}_(@!(fG<`4zI0Ko^(qw;!v4+h zWug>h9Xj(?`$aMV(X39&pp3czj~x(R>Dw#}zgs`pCqU2$YEub7<6QzN0lZZGnL|p4 z-}-E|5G=Jl%Oq z3b*t8Z3{|WqRfcvg$RtaSio>>rk)~sQH)#kxB}){5SH2pt;Gm{P9G(sdbSlt+H{qX z12Yl;wtXN;d-R2CkX@NdYoMM6*oTz#3d@wtr*NRFxiR!hY--GL*v z6+D6h`dN%G4@-bM|H$8{O1M%^n16&8!tzgrF*tuD4`qm(DeMgX-b3jA)l@{68<|l* 
z5nE`|PSe(0&EM+hb{(V%r&rM>xyNxnBvmuri8LYaD?Jbk;e}VTJ|U|_mgsgj2UL9@_XY{lv%urA=VN@a+oXy`bRXZtR(?vEKg&Blzb@j z90xXJ4-_~ATV9^*I-jjM*zwdkDlPPkMHYcF^wG%GtF5!dyPr<(Cdo>RXWWOg$uZxT zErl#RDR0^bPRE#f)S@XHL`TfPyp`dlY#-3aP^Y<VW;E6z4klBZT4pfiU_Oa)4Csn?xrORh9{FTLJln zSK8@-0|%g^RKh21rF^pZCUFdr&bU5Z$uvTzgN=-~my=P869=b_8&ockluxwt?X~XV z!BtcmSW>V4V%si7On8`P@ip3Yg0L6Wi*%sF`7T7R*$A~X;Xt+G z>o650*F6L8V**J#GZ%y5r_bGyLMA-hrd|ydkDWts+{J+3QU{jA2CkQJWvi1u> z6tctfpLaN=2gjFxyT_pWz6>1Aj#Bl#bbm6;9T+K8=Ak%o?)RIo3h33hBVyI$ds0@{ z58Yh<68IaodlcI4j(PlJ^5>wZtK+JURTf6utc(1H4sHDdbF&!II8*`T#e@WF`M8@M z2W~8*>{T&wEI&V)u6>2;K7R)bUwClSr8CbAKx5*-w2ijk`MB~@MMh@Bg1FR*D-*S@ zGec%N_eZx}$@a7=uFo7A`>DSmUo+O9$7l0K``fZb*$;p;F6>9srd-3fZ^;$q*A`)k zn5ks4YP?gt;%yMiJUksDIHBjPn%mY$_rB%g2Ou;D25q4rrC6KU4)L~EHllOVB0(Y>dAIe9fuXT zhU>hUy>6tV6=~V^{!4&aA5?gh`=PIlA4?E3Ce&_5v%01j;+LuNv*NgA+|{klUy#SF z?!C^S+96WfpyibhDCM5gyj#88NT)@F(t(D}n#cy&tWX9c;qE&!aYQ=@Fd6C?F)2fm zw7X!4f5TDO&kBYkzVz<9>?jldMYhPjl(h(s^19{l-5oGkF}@e6;wvBPbZjiIN0>QQ zml@Fih4$1SxTEB+=`r7r4-@YTLL4rpWh}(d7kdr_P18$kHcs#pKcykkR4(*LUMLpvg)92vF3&l2+Wf*= zi-&`{ofh%%N~ExMk+c(^c&R`-ZeZMJffa~a@vW3U%p%-F5K4+*mA_zikfnBXBt_v^ ztDw~=5O$GCp}hOeT^B1ai*Q74;AACwdieY|u9YX-Ld`rF@xTm}tv=aca`ccY|BrNe z6ELPg+^$~pUw1|IWqAd#T9&b3Y*1i$%bdX*0J0N#cT(g;Ne>4T9OdbmY3>_kw+XS# z=j7x=%IOQ#Px~CQDA}|JPuYlNqhQW)f!uH6t+EQZq~3}%ID&Gprp_jdy%vP0d}(AL zYWEokh7dsO)+4lJfFm~I5CKKp;wt)5OsbWbHZFS3niL`Qi}ik>R5H8Gk@|!CH+`w6 zA99##>NTTUAO$ejUis8sL|sY7d>w@@jKY_Zfh-=C(Z0ZGMzP<`l+0B6y@l)X>|gC* z!}0~qsP0z@5-oP@yRd?!=S~4=woTC9d01{_1&qcR>BepCI;#$yvB+_; zcgB+miEe5T2ceo8D>UiA-Mqf4^8tM@9Y!H|m9M85C7TX|v9rbqBLZ4s%*ho}aJB4W zCHYX=^kUQdV8A%PN|Vdg6OMonA#R(3c8m!pROdPAV-E-Cr;8t49UM01eKMmHX_RyGvRB{;La;K^ zmO<-ag@Nri4`ORjAl((<=)mFur6{^!64 z>u9dkG^jNTW->C4xbYu*vg@A*m9~!G1KQ2iMlCJ%$(Gs#3TTDFi3u;7?+gm(MLg@2 z{6;HXU9QLjJ#p6Pss`~FA>*pdZtn)2OyQ+uJG^<*MmzB;5)@s^!J-91q7H6&zS(q4 z7sw(aJ>_iDa$jcjYlU}n20{xpE{hJK(vjhM0|x5)?_iHW2XLH4v?n?1PcNwO#zeHI z7e-E(8lKAXA`&FDtm>pDLMbf$a~GY!`it13ZcFwKuBXX^J 
zU;qkmwM(3`E_XzcDOGMBOO*AdKH--)oZ^QNVC6$;RDbM~97b;iwTZzEp%-lJG?!#F zrP61N8P|1k250E*;JNmT7DTI&n*9a|zLBxFO^fthsEOAKe0xWWFfV;1!waGzH;%j(YuQbC);kPrpA%oZ-(3eT z<~vPrfzuS%z%YsJoYMtOudi;hMpmfsyBPTI+V3!K<=AakuI)&Zp9tBPX3rf}&E}81 zE^6Z7&8dY3T3K}_XQ(SN@@YUlkYJP(zXJd_qHWcDAKyQratm#f0jSw0YC++eq(lrE zF*6>oo`YrJ?tBmfVc7x;mwC1Jy{(OLt@cYQi&L6npoCrs0ph7n;@N8cM)ykv< z@5cxOy-RwXHABp)b2StlJQS@zq1_(U_+-3jPal!fDpgoxzIuh*{N{$qjv7PRBpOF{ zIjW3mt7^KcEKA#hhWJ&?1!UqSw*Pb)O4O#kf>J85yz`ds#FsM59v?c(4yi_`kSnuq zct^`^^F)e_-r21zYHL}&5@pI%vGn}HT z#q_KVl-r#}`WmN9T$ClD9+qNpp?`$96>bq2;d)nz!T5uAvg6q1Z<@rD%%VnW_=Rzv z%u%B}O;$DmK~PiXvjfBbJ(R-J*n!OEpo5D%b(iODbrt0bmd%7CLZIuxt@O+~?u@M{ zOhB;b+q(6j3z#gX&@l>%>sz4wP;xcW;a5hV=Xi2O0p$%s>=>6Diu8MQrAi#R-36VG zj0H_f@Dwh(BXnMrAEMr1$D1$`?3a~Q$+Sw^W!MqnwX>tpT&53A%Ui;!1Q&wkP=l|j zpf$2$y5BZBmO&fXU10%zxCi9M_(F*jUK4@f!aM{6WGbOWs>F9uqy zxDXr*t;)SfGMFhqh!pA`_C-<&1eOtsjAn&dJ1nrINI<`DGGM^5!R59+N;pSEzPF#C z3Qi?eW$C3STeF2V7lZcaq|jCyvys=Ik)Cho%e_uzft*q;h}qd6rvP2mRWHtuk#0n{ z#;HJaEpA|ne2a4}akJgPnsW-~bIkt;ww>#K)$$(4gJzK^PLspi3V6U<*7Ba-O&=G? 
zI8j<|A^Y`cQqy1Jqh#4|!*lhLpbvR~;}W?PGyz!2&0%1)?%pXE^_E5^d zRB#zMKai-9Oo~WH!G~k0Kbm}_;S*n6LsAHAhnU=2o+^#BHZx-wRwrZ0BiquR&du&D z%E<|*#`}-c|0FCypgquU_q4x8H{I2QY8<+HL|#;SCM4DidZnZS-MGIzI(`SomdSi4 z8pc}K)>XuJ>NOgefH(cua^|0s#yWSwx0M*F;gl>uvH~;l5PGE)63s!jnE5wYI52et z)`eD&7**De8-9hy9s0gS@(ZPrb1gY@IH<#+PI6Ax1Ry$19K8WPAy=}@ufkymcZ5l* z%Kw|wyoXC*r0K+R@TJ_+9^Wn6rzjvIlf_gHpCz6Y+j7FH7ytpc5T{YN5WDvqaRoc&LDhMxx*Jqeq(=PUZhfUB2HX~LgO|}X585br zn|LImgDOI~e{UPBJ}jC9$v9ycpr;bq1oneq(tCz@Q#Ab__=D4b{)J~6!|OOcDyPXr zAXvdj`Lk$Wa>uJu7|+Gwg>?+O%QZ}Si)Oq@OKYNsj){s~Qcnd;I}w4tQltba<#XPfvAcw+Fs7?yPWLn5(;j4g&WZEYNxqq-E!#q15A z+5U~8C7#Rm{2#C9T)2?iglzF67B)IcYne~ehiZtUjVBzCKTUDtKJkqeqw1(h;AXt- zeTL2Y@`305s>8(=y<1ne8z?7G6|kZ2vxN;^E`gWZwc0how?nd%{rtm~kl|e{$erFT z%|Z1Xh19T<5fo9RV5eukAiX|s#gT)RPsLsdX`NeAuPV^lYFFPVK~E{d3oT98P^M2_evfd3M>KtBdQGoS@aUJQC1bc8^m z8YDq~&}+Q^*S@R(ml6>P-2TJUW)~7eSmvP)t2dA@SpWi2uR_ls;dNKc{djghM60+R)%6$t}b9up&oI+vY3*t!LOmNS-b8xLO zx#X9t7braNPS2S!Ak&L86IN@al;e3TLVzyRpa}Y91Q-lbOioJ~Xge}Y822#h%=47e z&)`7DR?Z_xTT2_>B96Ei@{;x|7>aUzFD5dNvYo5jwaKSF#aFwcfz*tJSobIy{%=Ca zV+PvR^8LKe8?vQ^aOo3$SKWLvJ$#DxDaY|_WaCjA!`Pky4YVw|*^x&iPVBz{Vm!-- zSk{-2eEv+2)jFj_>0Ngvp+xopM-O!Rpo>I$0F)64aeNhAoUhN-84&fOhrFiVZ>!q?38?bX zG@;+-UP~-1NTB~dZm)n8a8Le4!k-Zo@A8xrWUssToN`7wozoKY?jFPjYgM|a9?aD1_Ukdz0v%+~R z*?XBI9<^r_6tZ1QeF#pJ?sr83doqFHh6)JaK0pbMpoQ!2Dr?Hufo4-2Nr^mJ!jZi2 zdB>@`QY^Zfijx=siy};ORR*5~*J`fljPv)8@^7u&!k5*~#yvGs3SF=n&=eF~fy$ zFR*Ph_&%l=A!2``KJvYjk2SA>V_ktmD1KNwPaQT#$YqBiys`qHbUU+o69`Cl5Q|4ZVRnKoLx z8iyL|O5w%DJz@t4&sU)E7bSNFn5zY3pF1m!X)7U>;fg5x#C{F>dqu!FrOY%!hoe=0 zpQv)?GuMg#h%9Ko<;G62bH%I9-M}?}0=`MF#qDcL%1_najzfy%qrtRty zLVN_4m5u`<3BC`e5&L;PR+RZn;%u$}-?aL{WBd%Y7_uZp>Wn?i>(ZrQh(W-G=_MWt$xo|OFp9?R#sl-CDvGfvcl-#0Z?ljWC(huV%?nPu@4R+jFSIO%Ia5fV6Pa#-Dl9LE4T*Bzx_kZ zkO*lWaA}OVs-*@>9;!eyy5Nrpr{FB$12K4jb!R*EmVJSCv@NgynqjWRIgqdWxmOwLP_;zx$(Mpdb_pomj0?ij^T->G+ zRyCHny^ZCqR={niRFCcL;^_JGdZMfyeM*GIC8Vm3loD)H;N|gY{qfhzh&s~9&??0% z$)zDPawTjy8YZeEOfEB5oeW#qIn%4-E3zgvLboK+%euI%vQ$9serYrp{oWrmxdf0& 
zfTfkUwm#S@evhblGt0Cj#2`b(D$J@Zb(4B;Ybu*%7Auz5p16$V!dgyV=9lk33CB`8 zpqDXTs1~avi`;NVpzf4%rZp{?$kV@t_<{M)({xR~n_bwAe_NP2-7M;n!N2fFO^p8@ zFZDb_F5f${%$Ut4MDE1$(2QB!x!5{3np))(k*Xh7#nQl(XH1@RWpc$4o|He%W%rz# zGww~}XPzh!L1bXE_=heEdUx@ReIw*!^Q2|#568FhUqhA2Ie{aD%wmOyKc+qd*GZR@~xpF{_FFHS#3%0g$uicKu?jtPLPEW#cw zSSevz=%}b}7tKZ-d)}{%Jzp2n&~Iu?E{G@(iuh|o8?@7YLW3q%^VxHKF`2rz^&Pq0 zL~{+xEmkZIc=>gd!-We7IKw=-yitR$lUzj|tF|2yLzB2*^-yQS zi6f_v0F^oLCh{a&)DDg^*E<3-;mp;Q>%CT9sAhTM#hGt7S7}<5J%0Z^mEdS3-<;dh zv(d7JDk}4;@hA&)_BJV!%Ncn2%WIIxn<0$q@G#E-kpme}^K^>s8K;!6 z9hLN%jM|ZDckw4E9dCLnhKesV` zVXZ}!T507V&BWDW6Up@CH^X*~GIaghNQy8Z8sLLj~~136Y$h~9=Va7ikO-KYiyGlHs3*J>~+*D+;;M+8B< z9Fx_`R0LSZlq8fqueg>Rn{cYkR(D!cb=u(_EvU9Wo$bB^!#USj96%Ixf~#(2Nf%dz zS6s1hm-q~d{{B*obg4na-|xrAhl3nbA>#8nMhdjaxBImy z=9u^OMbG@Gt=-ZmDi;X_$t5c0%t+13%E^ryM6JVXZEkFVbTUn8N<7Ba%Nc1<-T-n* z<5uj%mciLEPyWo*Q}yeLzslqkFGIkJ@XgD$P}3z0)ir!~Y?^8k8C2kVaktYg+`iFbP)|d zUe(p$xdsRTXLMvOkr8?V9473YQk;>?Ylp14xKdXwT7lfGX-{kCIy{UjzDBtLGGwa( zEWE$o6}ILJ{%l}Ch`&}{Qu%KYG>`WS5_r_M`y)n#F`yxaM3HvCs`@D3n2G$7i1`j& zZDfw$M_}#}*XhgEeM6^K3&R@kiix(wNZ@2gqQU-t71gA@HsoSJb7*8{811oeA-aWg zRsPrlXdo^%b?E7fc6)L=$7I`6B0@U9IXt>}7Fm)*BvVO{m^V>V#ng#qTkk(4GXtO@ zIN9|RgX??R@Ur;)@yI4Yg=X59RLvJ2jWAfj7hy``72)`R-oBz6S&CH1HI%1#O0P@Z z+S3tBbo3FSx;GlsH*G}EYa#a^k=0;z1SLj5;DfIZUuj?}noz#Kofqn00z1DA(Xt97 zsBK)+N`!Xu{&RCRVlY*j-+&C364fFeO{zP~!l)b)ZIwJKnQ>TJx7|JrEy;wiAj;x05Kt=397el;OnQSNsQ8lWnlDvib zD2C}PmsJ&dOuL_6V>-ogp3~uqy@i7_g|B_0St$XczA{S|p`bxRK$eRe#3>eJo2Q9> zan?LqL4A54p#;1jP){%oFvI$nRH5xwgsNevK5z%78@Fz}@S|ZMDM!cF*7jC*asub~ z?M`^d1lpw!G2!w3eSIjKs?hAr;P^bicz4B+r+C4TG4OsSp1s!!Zk_;bY3YnY9FgW& zZ0I4|%Q(k*!lshvM=+`PU52h4dEBK1{MxE958gD(qMPSOO#1SsdoJd zBCNgJ9hcO&MGNjRW5qORC3terHR4;2u&SSpfv-C@6_ClEE%MiwB%*YvljfVDOAd|c z0VbfB@p>Cj#8mT|1@7BILIH=~^o2JNzt@uM?-^~+(ksbQV7I0^SLLQEvQJ&lx@5{MAo9hJG@-yzNJ`n< zLs-x2X#!73ND4CQg*N%aT8INk@DHMHgug&iq#s0u5px>OE~i+^h|u*TW@7)C@n<*H zT#U0I6kJtyzbX6wi~kMNg5NgW>zZ2NsDDa#j|TpBwMqyAc^AZJgkkFP(9f1S6g;_x 
z+egzw;d!8HG8BhLIL2qmRB(mEi)GyN3g9|ABnT5T^bBNJ1Slu_VoIou+tSPq;#u@> zqeA?X{4)ZRujpQ=x}=^spZ$n;fjP!PY2K)&?olx`dV-=mu<|b_9|h`R%}3y?=Z%p8 zIne-u&yyKyTPivLB+_qxovHwi-kqEcR4J}es#nY_I)@zcShYF?q8+ZbXK!b>f1Q-S-8zW z81S%bG~s3K!+kwK4A}QBAryPQ4%rTkxm|_iq=V_^2my7ha6XLtJg)jwgQK2Jf6So{ zkONhxf~F9qtg@bjMQZXmCIFGp1@DNjjVv&u+1PObNrM2OLBp(Tiwe{$S0lv56FSuO zA;uLj{}M>dxh8jfi0HKMN`k;E>(|ogpmBdXa&A1vjP~m5vuzFW5rXgkM=96*<3a zbBT0N?kD?}-Ba-kSWrG7zyc)UFt=JUrPqD-I#>`FGyYC$FK; zF>f4Jq$2cf#DY3}dArb2p+Ti7WxJ(__IN3Lzh{W5#}-ei>`CXB*=NeP6R zGA3Vx30F6$0-a7^D(cSL%tbk=b=hl84W!Lh*t&^xUa7~729;GysIXg?i_0PI)Fy0D zW2LSO)p=LeF?sfGvtEOcCha*sglFsbhX8#osSUF^Z*{M(`{gk*9=g89EL>L~n6kgi zQptUvx@HAQ$#p6YHG@EF`QB~>k7(vl8BFhPB|mi}9bEk@k~2j!fH}YBnn!2JU6bQj zg96RD5Wk(uoVJiB##hp$vB8nDo5AGlLrmj~v}p|?8k=+H8QLm{p)5%Vj{oquaFhCi z?Bv8r_uidQGas+{C4s=Qg;*TA_9VR{Vg_C@gnQy6CIm^;+}P~y!=^b3@sBLB-&$#8 zd2(>?)Qp?st5!1fZm@QYS@b}jFUv`J-q^xT#kUV2kyGF-}Y z3--7Bkf8+aN1>qfUPEBBVGnRrChc9<)BbRWsp(}WLKz_+9iAg3u_M$@2e$J(+xmIZ zZ1O;dNX%WCRi>3wST`%?YCAA|lMM}jL+BxOPkPN{ddTl3G4D!+@BK8_^ijFdcr>=7 z<5l^&LrUJSM3@8}*f)(!%HKb)=}f)6Er-|70VjCeAUH&jiyi7TnpWhT6ci z$qbr`smshri3YCra5VR0I-xhdQa5gOyHGo>pSLO>i+EE?Li{Pzzm!>EGG(toCb-AL zZJdgwq@SEEcC@urS_m&>U@k`~_9PC>K&bL-ilYw>W`Oc&3LK@F3)Qo!xTxUwALL3! 
z9Xn5#jNZ7Rq2_4IBSNYTKGDcST}?xO(Uq3+49$zAmrsqO0n9c9#MPe(k=Uq&>CGKed8dPNRgGik1&dy+=#gA#B3W zV;+Y^{K4c!7W%d&yDr*_h)xdf(@vt`nGDL`z-z4LzGIM`HjYl z>8YEBg8z{c@48V)_6f>?O-<9tyiL>-I=We4;;8*yDygTE+EWlsMfUt}#f>V)$j8CR zA8$r%eJl94w+b^dT{+w2Oio^zhr-79`-{Ffg();+38Itzd_FDAz2Nk1$>E~@p7_V+967Ft|mh0eZMRge` zky`x$HRJXC{81-*O?9bFQWi0h2$>I8c-JE2r^c#fnx4&Aw}8d#=%Irg;S17EeFCZC zdq7m8g>D^s!s)920)m+k(K!#hRD#LYhiGU0<@nR_kjCx{u=Zz|8tX&axSDiU*SjB3 zk^aq}|Dtx0SF1DtQ)o1gXYzOc6$?9ceG~h6Q;%4w8YV4m_P3CVrtb)ase|L`iwQzH zauuhGbeGhdSutNr4qmUzYYGbF=mtwDDuibdzJ{(PN!=JTIcarR@~Jf=i~2@wva|Y3 z&$!s25pwOWv}f`sc1vv4?khCa219SL5}5EMcLp+Xe2h++Rr1IJJ<|c6^vv4(%x5*F zvg?9v0rl_b*21xNfTBde|MgrDRC70!zj@LH_$ZXbK% zY@tnYgj73w5JS1@ZQyG|_@=%%Nk2LSNm*cTz)hFL)IgTzKod zJMi#^v4fIrQ~|k?T?VZD_gb3DZss5(JSDE#W`%Ra zQCOHTFkHoBVPYXOqdYJF-XS#BB;!y&?~5-7a+|?KQrQl)7jAG}Dpy9~UxOlv>4##F zLrz-99G@0cjyr&>=3#3ig|GYs+d8p*osp!^Ki}qtk)<3fP}nSIJW1%aS+^{!Jd-T4 zMq0wY2YP9?OCLC3H!frU&13<1S>PSOIbImI6Hpdl#W+FYe1Qb>LDA^;zF1T^oPVd#S5=9%v+H)aN5a6EC(RoXX-EmQoM_;(|9Ug`EjtbhJ;-F*{M6e|Bq}esfuhY*wlMx zF@^+)y>o4SUU_2eBxJPQrTok9;>eCvLjcD|ZKP97uhpJTcUT=qrrqxVR*H9C)H z0iKGQ5-O`+%I3(63`@pikqj*IVb1X@Sup0R`%r0U|fxg5!H@ux>@a{m2uCvKQdu^hb5UH@sQHTn5(YDNxXTzfQ=n zk-6-PY`~6`Jflr@PW@}hSG0bNASrGY$?G3_KPdkvNV1Q(_d7ZLgC7r3gWl?1vz=K6 z6BN!Gt10f-5Q@$}-kx*5d5$lT{QNNh^;$#V?{8(U6fCZ@x;IqGi0rC3bR$LbJiP>`SG)rM!V7!C^$qmoGT4i<$g82eV$m4}z=G{s zHbOR#7_cdz-ZtaQe!=mRnZSUH50I3hcH*g@#X`*OCNQM&pgcDg+hyt3@qZO@U`iM^ zXqU*Ej>>qkLKysRn1iT2tv5B~g!O_~_V1XJ&b>Jg#PR zS5!r{L5fF)j zObprWlX$2a$|07;&i|Rdty56rc4sgFn#eYsWA0qXPQ>OPbjc>T za!$$sg~k(?#c0&S^sPOJ@q?}6xR@!}EIdNR3r!YggymBeRfGx`{y`gqn{dl6-zwXe zG|enoQPmvy3V|S*rDzGm#82&M7?N?=ne-Gx3NCy6qDR!b#Y@?3h*#4+j$8+0Zn z>WVGo%`&|FO(^kV-48kJ0_JuNr{na-E%20=<<-JRLg2SrJ&xbW&Xc0_@ZD$RJeglYkI;GdaR0p{c>@K(NR`Ix;1lwblSwk^(%JV#L(R z4s81R3cpa>l%dlyhD{f<+J`>r&@ZQ`4ek%d$2z~^$sdCai|1Y#Wy z?`O!Ofc-Zw?kgt znu18r<}b7-B-t5d+M`lHYMFDPcS#3Nw*4#$Xj4)Oza!#9-nJdv)ie0-zPX1Yuzpou z;{bpztFF9dh=0;Q=1K!nU&{tls)}=H9zEzxR&+h05<^ 
z)Z?#Q>B;-895cWr#GX&gFKq!Fp>3*n#r@9fOsku@*tMBA*;rMsNmjD#7E`b|c@kX= zwI+`94#O!A32;1SGQGrP$4ISj20v!O*$F^tr7n@Ss+BC&$0ESvalae>!h3Csql2XEdhILsNdmGuvH@7;FW5I=qKb!;&$$1 zUp$i!Ljw*DL6K|f2nl?dq*+#|II@L%Y+~Ww4EtjNwL^!5FhCbBl-O$`&HRUWFcG0% z{syc2v?~cJcG!k~peI3T_>5!cCYUZhv2>8$>#Yl;NNfr7;ci{%wwF0~BMCKdxg`LT zjcjgSn&GKy7D8YF&P?xIOCHGzN@`VEo0xi?}TH_K|0a6 zmH!t%hx7l*&tYNzKlnM!jLgjc`}%+BD499g|9|;8wg2fTb$60zZwGYYZ~y1EY1U;OfxQ+~EFG@Q%-2rL6&=Q%ehwMxYHH(B2ePSChmNAf_dWOkithfAFms5|y1v zJC0KTz|_{@-dF&t!Nmb=^VbNF!MzSF=%0Ez_&h**3B&-}&Y_hN%(qR$Gee8>7lZ*2 zhc@d*fSw+24loXCaAIS1_vr{$2M3VQ(@~3kUk}1}od)Mmetdp?-#By&{6(Pb=-}we z=nlr&3B*@aB_Z)cEU#{@yw+#b6~Jc@5PUKwR}kE>rBC`3>pSmLP*+CajUG@PT7IS9 zI86f}kPuy5Te&8F4bLQQKEG$#-VyMh%rO-ZxPN1Cav9UY`rzwO3`Ff;e8y=Veq*h`E5x+_OLR?7DR4mQ3BFrF_7Oj$ zJM>u}e8g#eUg3|#Aj-Lc{S_#HmIe_6HD|kD2*~{RoQm1!9`c93;Nc%?;Gg>O&)@%2 zQl`KEz485Vx29y*)+SZ%hy#Bc&jRp)#a8m}8?OSO0QkdVYb%@g>jQ(E{Q7rfZ3f!( z{Boc9@73Bq#Mk@XWD4^`mA+6JbPwdsemyXPkB zT@cz#JHC4;eB3wpHWV3pvc~{xo$z7))wjI=4Q^rx|NVx==knDBWag;J3d>3Q`DeEI z+oO3>Os&J(jYA9YhbXDN0QjCC-B&R@jKc%S7gM9t4Isv5mli+u?uhHT!SVfzZ}>-M z^s}b5u)KnK3^{LZXk=&z>ht~g^j4pG(6=D5navIGcS-E3uI#-pe)Us+K)=2{v+*JP z&{sYne)6aD=L<6W7_+$B`#o_3)m0Y}qe>?E6^j5C5or2(*xcr7zWUJtEzibcQjE(9i z#jy(a#+kFe0km{%q^Pxzx30=eeKk9mtYP%Dz_yE26Z6rA~vr(L*0@S1TGi33w1vSf9DWH@M_- z+<0cFu3q8o^Ziai145bY;)I3Szx@himzA#E?=jV=%y5n;`K6m}W{qv-puz4WU8_`4 zn|M-4r4QX%*_IQk|G&3yT+;~`pC;orjVw(vXSEW9+31wxAW@PWHpKZfw+C#p|(Dfiif5r$S4$T3ArYl|fZQ>u=!@GQh7>p3X;qmC zHl8b>LBc+kOrA%M+~nhs4IEr@PDWKns$Xe2u!-@8x{*S4LrKeLHu=@J5< z?T;(D1M~Dvj(ku;gklG)DWVCw<{!PZRz<#Bs#uP-l=Pgkf(vt zxf4{6rtgQU`k6IRsi>)8y}I!1u?b`8VG_gK8XMr)~$C1^5rJT*7XjlWCFr*5QIZf`Ab<;l+8Sa5B(}YAw8_>a2cS{aQ7^ z>_qHAd)A%ph~uitK3;Fx=XUxB44px_5&%3i6iJMXGt-@CKJE&;!1eyHnva-tl3yy6i2SXkV+@ffOiAl0w z(0uabV5Y1pO|;eQwB3{IlJzaUr$P#4l&WMledSKs+9cCt2Hat(r8tibrTDo)a*5<<|5PHcizdl za4(Itg)uX{DX5o!ztEpa7|t8nPa4%J*g{;qBb||}YIj_o2Irkg=Ja>m`rGa;nqPS~ z|CtX}@KWcB7jI$SReIGQ=V^KQ=#TZ@JEi$~?`OnhA{fW7y+q*sv#Ni|xFJ(;WPm#b 
zWq3B|_GVc}uTC?G6@M!Y)@Z|z+G+8Q5ySl0ymnM=rXi#I!pW&$8DjhzwYHPE6rga8 zPKNi1VKA5Fsyw48mt}dA;OHoNkttZ3m~v;d>{a_`?QYR^GDli;<4%q1lE}zk1x@Wc zi7(Pbu@L6+I=twXCBZ^Ek?MlA~^4^do4?7%bg1ucOa^r{cs zJFj+;sU-3_3r}lep)X5Ii%miDHWGRj&9sUj8_@5$7AgjRAZm7Gy`(-$AF(V%V>B(_o5)tdQ(&?yq?j%5{UlmuNnu`F~q@en#e$ z{Qh~wv~oZPql=Am(D95m^+~zXHyBO?iodL5K?`wx6K(R)x?XP!3f3y57cuA;Z|H?JMGFc6s>w@x)04 zkXMjEs9H45{N4<_NvpUM)Z#4jeC7>W~IV^q?EhoFXEi3@k`CaMDDH6Q^>w(FI`!iPbDbrNP9OBQI1UMJGb}Ymlg#aT!pNd0h^y zr?+$e7MDBbOx9cb%fbnqFwt^Qdi+wpck5+c?MycwT%;%C7N_<2{_Vr)^i2C)lf9n14ZLNGM>5g1dizHn2OLbu< zV)Z>BTB;9JG^fI7jivXACkQDU`)$(&MfXC46+Jp5$)GQ*dGXthdl*Hw6`Ri<&!jd` zrH&#v@YDgic`_RnJeDy9s2w2|$)#>Xio{6oWOUU?x*RNweSIt26Rm3apy!IA-bG2R zuxPFfw9N%n-&M*S39*3uO#5;YQcXXEOyjV}c&fJwAl5FPt`(Jb%q(oWbUQ+WmMFZh zl^R&sG}jtDt19*0Uo`*m;nk!K)2&XOOW+O8mhwfa7&B-`eCS|9I*a*_5G=k6&eTel zbNH_0PD@llq{U z0(jy)K=VVtm7Qih;xquC4pAz#P6rzEDA;gh6&8ehAa;Ss@0Ntz^YbKihanG5=4(VZ zpLFqxjVPFNXrL0Q5Hjh0wfhFi+A}5~H-T9%MJxrGfJ`&{F*{v@L0Z$n$r&!@czSgn z)61cYls#ec_Vipp`; zaLLfezd$5vZ#jGjwA(LhFZuOsCYISj5@xLPhjJ6nFM_`M*A0zvhux3xUN(l^zvA@sH{9F z28af^3&S*Yms9Y@*|EYLZ0un0f|G=rh-^|bF!ib@YZ;?CDH{XFI=uv4ke9|LQ&{u7 z!T&U)w$f-quB;6?^R=ww*1q3h!<9(`aYHrnY#k%r+7qjno8&JPNiE!-f0HB}`TD=k z>%)f)v)hZoZgQtxd^=DK*d~oVrw$Vz)V9O&qjuMX17>=ckZP~A0uNa=|1WZ)X<9Y z{gONJZcoh~B}S25&%HSLh3C=e5P3tgf`Alm>eB_KL~*(nPdZMTAYm1luAU0XIth@U z5OkDgj}%y;QnsV_(f0+!qE?1Di%r4;h?DY~{#7#IxZDzo1v4$AmZ}Kd0jlb)3GQVG zi$-p|+Q#JwVP|GVnz>m)*{-i;EOqd?A?o7Pt}>S%M`oc6eRZPwlOw3p5O7A;o0H=!5&I~s%HaHRFK3T4d9rF@TBx4mg2baCGl#>0cR+#AL&`?8FcZhEOrwhO7|6ZF=SfM_hX zyX%qauFQwse#?vc>Ks%a%+f-nv_uvM9DTKjceWc(9-`A73_&;~^qcEY`6dyokfVC* zn`Ud=)83VZ%s;a+JI(AGgup`VTao8foimjKp2Wi_q$_=ro*B4;enMzZ)OG0l>!+a* zJ_TE^i*nu&UCMeBPZcrwYNq7&nh~f!H~K0PdBy5ryLZSLlxysBGu#Fed%w@-esFFP zfop+pmz-!Lc48-BiYQAw=s&x}KDpvXsv|bGs`U3-5yU8Ov_*-lu-c zRIftq^B?&9Gxm|h2tE$Zqa$NdciQ}Ppec_4*?O1q6k$BMWLwQf?CmC1RAasu5-8qp z>mS$x_&M3(w&*_rLDW7-QDlg6NWx(A@`W+VUP^ftgym8|FE%ziI_knmX&k9%1>ygo zhC{e#a2Vh6i!NvF);<|ap!`R^-45896-BTrYRYt>ii-rn9&)t$HJ=ni`i#WDs^GhB 
z2K5n*ev-g8hM9J_jSLUZ!&45GX?^Bz_oB6O@bikK5dW5_rm-H}8PK82lyRSHM!k5TYQXSzZVrm065|>mYy5+3W>f{T64~fk^@cb$JEj(J!0=A)`ue;D!G=$@OO%MGe~rCw7V0T~l<#b;Kr8$AFwqZmw7 zWcfB`WCixDI5=0Ht`=wiNw)D)TWc6Bq{ZoNM4}|hpTg2}*3|&k=#S+6owakxbTAmb zjMfg6Aiz6#6zK|LTaH4eC2&>s)O60yBZb@+`jnR3@7zROb@2_hrofoij^IRJ(^Yiq zcp>wR8{;(0k3-h*B-43CNN}v}lU4T&h8s)e`ift%^(k4Z9&2cjDb3^FLrtgNW4oiI z(8K|>cRMzp%|o=4gs{xXQ>Noyjej~f@3PjroEy?)NEJ)8|3tRB~x!3ImO|?;gMla>!YcmR* zk9B_(Qql?u*0HODcLSBzFKdQTCUdYqZOohnM<(eGV;nq%;IYBR_Ub#dfw$7<&79hiTIt4WCLqSavpKyf-{bS zYmeAwlfWnxBH;oioZr_*jQqnLil1<0qs=tr3rR4c=VIH4DN$SDe%@qppDjd0b(~c@ zBBs~20Ij~n~n>^5>T|>n5Z(e{EgUDm4X!#(cvhs z>!}F~{)}9xAbN*|(T?Meyn>&Z|0seyF&KB^-=ua&VE`4cjFkBH;ec=|LHWj0paRJ@ zJvxQD&8im*m~Iq!0UeC1cVe=+_t8n=>JNb3D$7KVLo5F}SPW7mXSK+=4wPc}=%S(y zS1@%^AtMnnzhmMJV^PIz>#*6O6$FC%chr7K}9TtYprN9aP7*fChfd~uoi;8 zsrsN)?FBcV!up<4dRqOQ*0ffbOaUtML+}Qyolw#;KU-bkfleL3Hh}&y7_>OEa(WG5 z__;Hc^CyZt!T2@MP+p>*i08+BE&&tRz74P5k^Mc#x$MA(pA|!8ER?S|N2ZY$lnVOI zTG^T3R#e7~`BVt4c81#-wkEB?^1kLT?-*qe7)YM@7~}rc({e`D1{Ef8oP1Fwa6bya zT6ehT#YX?9p+w>;?u(b-%+c;0>f@G-N&Aoghp0qlnN&5-2%&zRsrMS};ZYdT3G>WF zvOQHwV6_0Pkdc1r@k~`=DSMspSNo=ygTLf;wT{AsbjBA9tFuiEJ#)3lTicz#Gs>g2 zt3_EyyU?i{=4z4pT7ImFVZ6Bvv0~PenQC5o?ngV7^3pa|)gBevgM6C$*1_CDWbPNv zNF{+qa-osvpBQ~J;Pr%7MG8^)MFOTM;9k{+vZC)kX1B@!YL^0l7Z;{T!Ntz=E`Fod zk&K|aO_;*wP4BMH9QBZlBF0hGrMn56{4mYWvbRs{EfOTOGXfwwL^%zB7} zmB^3co1J`|Pjizmek_N7z`!0mThd0dXQ;Az^SOY~gcQA&JY_prc8>hM%i)0_0~wFA zLDq=-i;LHxZ9gq@Wv7=5*mFZjXoTYUSCSOcXYZiHWMjg0Hu|(s0uDY@tfbv6#J0?2 z@!0Uv2G8A(@lQ3?HJkB+IJ}`^>BGu(=1Z9SIlc5@N^fzt9%OMu$P(Alj(a`IN$Mb! 
zP7Xrq3s0~URa|bm6MY)4vAtcA7Vgbtw$O$Y@1TBm+}Rs+%vei_nq0<#uXs0yIe!85 z6*w4hq}p)yv2`&y0zQbhm=W4`7BSVD?n`QZ%g{OM1}xs>ipiurWbHNux5q#_ewff} z71zfxf?$3_;O*eBoOkB>c-+D|;f|Nl%Rec&2@>_KjpiR!Wn*QzJN`oZc4g8KzH}t4 zD95Xh8~|_@;4#7b;y7eQxFPft(?%=m(QsCZ6P1mP|Jsy79OA<`+08stb~uK0cC@n+RBH;99iLVpeH}HUHDkdI%5N(4r~VH2PSL1e2vP+cJ4vsM<2dXYN}f+8_^F< zoHaKk>c+4gf*H!!20`*m7z8c$CnbwF&x1nxvqso2j9Bb+lh$&30lysxag3!NcO zMgQJX&l9LFWXuNOc0ydaV_6i^eM0o_2~4F0GC`VKbTMnzs{Bn zn_j~j4X3)s9j6y+76K4s9ExrdqyFqM^I$J5zZl5l&4=06lKjRuR;jeBrlF7LDT1Lg zCo+pP-(1n@T6=Rs^iP7D)R|N>;yBba3OZ^pu8b>A*q+Kq5ZlHJ=D&7R=s(o1GI5+m zIGQ9p zyBUMBXBfgg?b!Y%{|64mxhPO>K%>Om6*a$;JEd@kD(9;HQe9*XxK)m4tgWl`kX*3x zd4Ai5@5#hkAadzC{3929nG=(#`i?>nT0LCmJKU); ze=oaS}LwMeXwUF@bNhqnzAu$4I+y{eAPjWNefyt6V zhEZGyf8V}^6*7A;;!Co#wus3SC!ibdNS3;7UXDsbX176BuO|*8kXocp)EE+yVD5gP zK_5L;1=tF&kwUa&4eh5-8d<4r+2?Vletm>;^_-NMK0B9@92x8fToE(+jmLycPFUre zKJcq`MXrK+4RiANti`p}l|0@HL1S+Eq$8X01Kh5+Or5!mgn8hn!Y8v-C)SeqUjpWI zak>}X+l$gE$!2U)hJcfW=DaH3S!6BiOq9m#Oqp<&S&O(A%41NOPkKZ-oC31(!|!X1 z*f*D4`7q4W2G^l1@jc~d7|*ytFJSHOCCw#>V~#N6OrtzMm3JY`S{H{Y>&U@|c;9^V zKQL#*fIi0OjL=_B^d3epnA-gZZn9OZ+2%ewH>Cg#ThN1So{i-$i%{Vn&{wuUvaK10 zswo|gy`@eA?S>xCjea7|rxreMqMyVkqz|k65WZZXG1#6IL;%zUB2+nCA#)^z$BbT+ zNRu`~n%ps+CG^U!ej^@B1V_X+vti%5-r~?XIGep3f?#QeTZSCbV+|e`TVsd@bML#q zQLNo!GAyq2)|Y5eodSN=4w_4qH7>j^(R%jgk}2MW6X`~)S{!pftGmc@K`NK&$i^iQ zaJR1Ksc*f7^`2M*iUAGf$w6|gdwwLTMU~I$0WHY^kkLEe)!TPX(P;M4taX5_N}PIU zyl^fOzpPH}k+@-Y`a*lZU_Z$vre8gK&wT>T`V~mS9+clyE!OFT621d1Sb_dwh^7^DoU!$8MB`r1grgtZut3G8s5<0c4?Kf27!+Re;#KC#G znb%47q&(xBD5vM8C!n3`B(=EoA-rSY)_k&6YmDz()AEW}NxShs4YmQ4AT z_@)&L@U?dECysSoyCT4oIajgNR2yVYaGO^7p-d`BD3?5uEW@j^?i|pGQ+XVj_`r`v zqX#!C{cW7h?gg!2-9vA(vX$9pek(l*fofG#M`u z{#G2*_G=!16=or`VWKHf1NQTF-ykU|mZMUfL`d%?fAp#tckuA`(H%!3>*%O6%N zYjg1?lh5R{d@$9{p9YukHQpk9q9P<0d1%|~#X_huDI0QI*X3=Y-=CtO&|k?E3!OWe zI624x`W2<*Cw$3%o=uzEe4h{KOfu9#KT^ws=bGQQRj8=!r&LV=AFlv_7WyAQQ^qD% zoX>K4&R$HVnBipWLsqh%jc6fT^T?Sj83(xip$vs+m_a1IG`Sd@w@rMXhS5j9J8ht$Gbj?$2V*u5(+5 
zXOv1?#KGgnw~s3gm)n9^Twg`Sw1*l!Q82KXJ566TAk7g754?)UEhe=QeF!IcMI83# zn0yQ%7u=$Oxhpbc-ZR%NkM^;3mY0kSM!@sb*BJ-6Jbiao0CM}M#M9(`i_Gzp!1C1J zp>01DS_jPeQPdU_gSZV+!ckgcTda*NHyeDRB$1F=s^qyEFIbP?)AO+GsF0mI`tde& zUaQmU(5*uMtsK;r!Z_aPgP^#2CqHCBhw0cg%tnRkczu?xjNmo$y%GB3_T`PSzK&Aq zNZhc$ffgz|ovs`QTAJne#j1`X@@W>9dtACz^&q0})j-ea;TLl-@>M-8ZAGr2hz;s%%iS=yjVR$lUD6&Vn8E!AdxC+j-4wXSufJ^+S95Sx(3%DW5gtGTJt#QeI z;JhwF4Z*ti$_3Wdj|R$*wEl%~xd7hD54oC*gbImwpWf;q8LVrwyqo-3`%FUY{5ZP? zIZ z{rsrX&}lEl7G|g>PSzKkj7yd6Iy;2m0pfYpK*iUzT$XmTNk5hn*cG2b5jMv$0_WQV z!cK{v@mm6=SwW^|3S$P^zsQtUcb)PY8XpapO_BnJfH7jEV}JvERXZ-~bi8N^25aO{ z0yzd2i~k%>qzCY3t=-yFXjHeG-yji4!GD?BecXZe$}e`4i|ol&w{y@1*a~vWZ2|We zlJAy5ql!{$8_5{uAt6@$UKR-8NKHr7(q7W}UkjJddFSP|8=kdIoU5nC)ihRWgGdN!OCCOUBmAT>}JVpPGMT=ou)U$2*2q5N`Lg0|2SIN?7 z-}DSAXhjUA8FhL&T-AlenZD-YiZ4!%g^d75J(qXFsS5}*+W(y3dT~b-duTPo zG7Xe)LT@RwbzSJV4B=xhDt0xPh4X_dv=^U(#v#AE^NpldM)hCNOaqHyJWr+XYTxJJ zXCh-l%kwQ*IyryWyOEclZ%a=j>j zIvPXO&1|S)r=Nb~GGA6fxz$^#oOz2;XXzb?x+xh`_@~{{U~unfxz$X`HdM zZ|+ivJzU8!zuEZxDVca325OOsHg`?atz4eENjadw5*^s{qBR^+0n$;G%R83&V7QOs zx{O%FXMY?br&|T`S2_(U%50Fpv#L3FOk9PH07%2=%Jh9h1Tf(erJtqaKeaR zyJR77YMAC7GL(+|@d1%Vp@{fAC9yojub}v=nQ1@Q^l-S$Ez}GtFaWvT%r3Q4ducVp zg3WVT819~IWb4c!*P;Ux#^!UNf#TIA11-oGR{f*;!qs58%oN?aI*DtXE+ivh++{&x(9%Q?)D`MFsg z$ILYmL7;yq`qqQ~R>QiyQe1Y#jd3}Q2-|?`y&~-2O5|D0pYL2SnuJQ59TzXVWU_hYqXY#q#n?X)w#tlUuY~-b>&|F?B9-=(Snr_pU#fso!V*W ztu!60prDr|e|>QN$W>oj%Ep9&r&}LY5BHypDXIwNQa(tV%ThS*#7pO@XptIXp^Q<) z2brz_ORyC0C}jcg1SamGJTG2| zQzfn&5_XZl*JY#d5%|M*a`8ZyhmmC`@rC%Ga0R0hPXirs*tc+RNnhVU?~J!TdDyUg zqh5xS{KgebupeHR+>8A{)!LEbAYmiNuiY~6uxGp`EhnRF|G{b@Yb>RAad3Xr0+f*~ zR?Xh^_W>VZ;az@+TAu!f`V+H$Kyro0U zcQZT(@*P28N|0J9zlCrcRC&WVn6-67T%!^IE&MIP0|T=AsSVw|ovtzZQW-{4zGjB0wb-!0q#6wo3E=9+o*xM1UN%C|_lneI0R%?DHFm6)4W4M(2S@tf#%u6buCEHqK zv}i6q3?GyOBSxzQxjf<2k#hOD!=uQ=6-J&qtzU?w2Z;Vpv5{au|dY=cfd8>T%~j>K`h$e})RRkY=FjNiaV z3(tEZ&U*jCiK;(kOV;YnoUqb%oKv;k5p4D%U0v?GFPxt8gKF)MmI9t=)&wspCO;2H z{z+cYQc8x+kJDTjCA5-5+kpCwe&kv?T0bKf=4M*562oIf?69ATa)^fdMzM?`EA` 
zKst!0T95yReh?5ov!h3CV-#X3l-YFL9h(=UN_@1{64kt zNZ979-b+^u%!RR$#9hge@o*HTJ9AO z`dj2Fi2oH3rFQzKQ?>J9fyAz)OFKt1Zr!Q~h6XY}ba8JJRwCl>;T%o6qAo6aW-W&H z{CuJJR%D{Lk~&BzdM45bT3r8YCgGl8^`jyIVOko7U9enZ2efa7cJEgi=vBCjDr+yKHT5mhKja#8x@nzu4Df)`Z$X;Vf@P?m4A`x#e$~l-khu(D}tJ- zfJVD<+(2D+!$sSeQ)OEu>$*2?5wfLAXpGJvlaiQ3-&Q?8Vu`rn#d(6QiROrB+guIQn&sK?7(9!)CF(Xwd!@5e#|BDKQ`i0J<2N~b1 z#A&uY_P+gn&x^NBQ>lR>JCl<1$x{lRaLG7Yb7eceCeb2T^Ez`IE13B?87N2PD^915&DNkwvI`<8O=|3JW8)Rv4t(O|`+kxi)&~SyVVe34;q`=lq%Q`w`PNk?y-Pq>c8pY`jyxAD3qT>X$J<#uGn5riTYhp7K8*hhe5k_!$t=;(WdW9J#z1FV7> zezaQ|AHKm;ge`8x^IHxR&Kte*HMRjGR+SzO0Hn? z(QDl`29I2tpNrAohSwVVOn_4V(oZR@BNHwAFq*t+CCT=0Ka#-`$gJ#KK1gil)(IQo zXG9ia{q4`H?{t|wvmGXKlPs!fgf{%SnifKn0mC!XR5Iafs~EZuHpxBx)=dq}N!HCc zg%8acOD=mnF+uyHuG~4sLM@vu#N<5>UMwb^ZrDpp$WtG_6DqEu12GOBY3=B%DGvgDlQi*H zw@iMj>|ss@+6Mh%%}M?Qbxz9TE;BXX^h~*JI5x60j=%Z16sdSxp|Tco(w6hmiz#7p zbpSbe`fw@oitKo3&+k_9%#{||>Y0Cae|P{}T@>|>{gwUtr9ELZgrO4PAkp_2tWCh= z@Wdvx@^Gidet)~drb33)V>k);A6??^_+Js?^B3d{xS}fQ`(|T_!D@)opz;XRf%pu~ z5soxdn-RsZA|X!f-aL#+6xfENQB|FrZ`hV1iy+{4P>3B~!!m4kXqyn)R=gPxkw<7j z0)>&P5HbyaYU#yBiW6ivQPDyw!g4-=!l09$hM&U*>1G4OKIXB^SqEnXcaT@2gH32j z`S1zpYtAn~XqGV|uI3E?gSL%ZYj|1(DxogdUJ$#k2Q;wL9E79jED8mm!*?EyqDjsy zi;Ya^mpY$JP+yuZ+@62esy?5JtE$2TrFA673|ow4MvLY=D%Bb$9+Ql-w1vY+t3{j2 zOkEv8kn^;ZqYD!(?R-XSL8LH)l#8r6<8O{?1mBJVC{DCAT~#81PGo#>fp{eopTa#mV#%iei>YASCx$~98r^^` zk*mX(3&S6|6e3LQR;PxDYFJJ*ie`O=TSg1PK1gH$hj8Oe& zNRp~w5M&xrzeq_!-J&?sCBTIseTKodpu)myY`Q=|h{Tf`1kO$42kC5t5ZdtBl!Q`g zHqy$PmTHp|E%Kb1#$pw=Ah2NXi_M!N;pkROB?5gk3N0q}u!(wyw@?zI$Rez zVL!-Nd*#h7X5p1yvSp#EPsd4d&?E@3$VF9q=f8*o7G2^tGkAf5_&!%9TPf(uSAqO}7`(yAw3_KH)W&+}xHL9Wbf8k-GH*C) zeK>OYERZsJX?izS_GBq6d)^;xBuQHRTJt3+Csm%ThZsBxqIMtKmQ!?n6C?vkJ^pY4 z+If8Hr)PRkmEUax{krwI!!sN2{ z1apnjC8UizBWagIw#bF?lp;drFG`Q#Vc|wk5rjetk%$X7u3YmmS?2uY$c4)QH?LT9* zKO=;;DMTY6#gh_07*s&ifUJ@;76`KV0SMw1k{pd?=a!OAk;pyzkjNOx( zhM420aTOM7y`B!#Y^ix(vNjg{HndlySyK?!pCe!NHU1XWs(@=QB{~&2{!$7h9m|S> zU%z-?2cGps)HM^_&;(4id+Pa@bgSGR@%vJ*Nr9U`zPYZo#hN>(*Y(3@(ym-EmNawL 
z#_a~-(zD0yb`RgrSKS8lUH?}Wsez1p=jDg2NGFTdPc~lwS1M7HcB1k6pI}z0NB02- z?iUrX)J6-R1)$XsC!~amLGXi--z=kl%xX)b{SvZr!>gKDie8P^qB2=ICv8!#)n>@h;Nfzr0LEltvv_r zq($-$8B4-qCx=60iBHyG%3qfIZnPyeF~=8Y^1h7Fwv4~{!0i2Q#LdMsVXZJlA#fX8d_U2HSij zh1`zwv$b9FA4N-j_CPLg`yA4ZCT9}AV@VhOWwFADs{t)RnSm>KFT`g$5y~dn(et*3@e)FzFk0qYydi!urLwp1QfUefO$;d^MC8iq?Ar$*d3$PE)7ShIoMdm6&d z73cdBi^0B6J5zCodar4hQzxzgZ8DVIJi5K`&1JR&sr}SGn`Fa|?F(o92=aqUTsA-d zG_7`Ha+399iCoaUePSIA-eG@adCPg9OnO9~ypDYDpE3RZhF%RiOJaK*nc8dHBHZ2f zi3oOfdK#+)OI3;-yi&anil7T;y)5_$Z|jV0^bG9qT3$oj@F-h$?6=%3v5zV1WbB>B ziadPqGG!Ks%^q)|7F_6kbOuy#{dj~!#6Fys8hj9Z#s0$nc5ig|ggEr4CrObd32c+hJY+e4gojVbii(*8vp60bJv2;~rhx$#vM_a&}p*GJH9q&+=DQUC3 zh9u&qNh8sF5B%ZgmG?dPb>z`ScO&YB<;4UJvPmI~{VNZ7_3-jQY@~w8subJhZPE#mUbUdw zXnn}txp4IOL^9htvh`%m8y z$xjmM{#Q84%K`cyC|blkJ#3ua7@62LZ9N>oA9oE;b33qw$A={G0WUp1+7!(+!MXr% zTMrw6%?EgOa|2reT+A%&&8)#p?5b|gmYx=1H%1prs}D6+5E~be4aCjFt^oG+c6PIL zXJkV8uaRt>og_c-D}Yf_fCC8P266(qIXJm_IJsGXAO;|i;X_Z!+4BFSqUmPl;sUn( z0K;Yu?qC#lRZS^fHfc`>2Xiwgrw@?KsA*&C4*0nK%brgFZLpjB$J78$HV_wxo1cS+ zo0Wr&=fB(eq5rWqMO!a0;KL{`CV;(*nTL%#*bLzH|Mufy<6r|a{ReM@oh+Tr?NE65 zJ~(Rt|B(a*QP}@yIM2)X-_bk|AL#!_^B?}`|64R)W3ZM;Hj)xpP**Oq5Hd75s$dBN z13l2+y~@JAe3wRrcEyaDi3gsZU44BPQZovy$gey-=uIvEH$EO6&*xfaXXnhoZCgk0 zmF7he>~d^F-QR?srqMe;rOqHZ3@WJ|Y2^iwearjNWooWj3O^PwBKljg9_FKAfn4ZD%${ zH+x6BSD;x6pR%4s;3KLvZR0f&A9vZD~8OQNZ!lo%S5)Ebig z0E6C%j47NGgFl7>Xxi9~%jNUayVR%`)7G7ZKP38Lqvs?ghPQ>nMREgViGW|XJ775a z{My?(AA1<6T@3@!jp<6yat`9b5JhmWA@?9)YB~TI7VA!iYR8?Sfx5TRuts@toG<=` zn!!1yBeHrdOH1%9?{FeCTW(b(EQt5*sD#9GPXp27>W7Kx|D%eUNZI}Vw*6{SU@Xj& z-QNd(f8KsQAHv5ld1ncqwD^58M8}qof;p;WjT?lK8?k9{3$TtBvZeJf9i$m(yH%udcAXX494bS!@b3a_+00uCKyAwS({22s6qv)1g^iLh)*X?-1ytKrA1@KeB_p z@#z=D03@*tE+On(!ugPGk+W?llG;S!u9}5-wnhjz0mzj_qoAj^aSrN3-S|Q$VUr(m z_Uc1H1;7L7;eZ&|)sSoztZOLqU}TW9=&1_UGxXpaj`>E&gCS7n)=i78rEjt zQqxIm+`WiwX-wK$V+MHD&|U2+kd)ib=kS z^3Ds>wm7B48)iXI*s1r?^D^SMi*Dvr1G+;)#Jce<{Tv0&Boemb3e>qo!Z^#Pzty-m zI-YJ?#4Xq*qnSj-dHtU6=QXDRv-S{H;$5twlNfZ|1~_w@GglxGfeP 
z7|VO5`(?%_ej=EwRrFUAURj1zOC7>gAk0^bb)=u_&R`lJ%KQ*w_~r?87~GUAcl??e z8_YCPdL8A+6r@-HxxTsIe7R_Q`Nz(FJTdWEls}yN^~Vd%>(hMq%iZUMAqgvf*6|#@ zy4la^cekm1>V}!ZKMoH^o*qP!W|H`Yc#|o~?ii3BZjO)laZ9^RcBeawX{Yq8dQ(T?+RW+>xtk@HWw^YN5A z;|8r5;4;%)@wKokrt0jSzS35D1_&V;npeFkGe`1NN81nepELh6M!3Zdd-LD*I3{jC zB6&izucf8>psu*9ULQT&S~J*yNXlyrBc59SVb}lI2DHdX)TwY%Q;Kik(Scc>c>M19?cpZa%B{y!BbyzjS{Oij@B1 z&RGBPHC=QVu7RWMa5}*X(lrkcDi>~ZPDk&UG`XKm2NG;Tzfk_P_6yLAD!TI^U)D}J zIL8>d4}V#1ci%v?mXLH}V%s8dCGZYk{IW+jP=&V@+YP8>fsHuIt(2Y^^IBOlObTTx zGu3oQDPB8>oi`mN26pNW;1pbjq z{4zO%n|WaOApM`y|M#q9EY#(u{VMs8eWx_*Wrw&8%s;{$`)gYmO18eqo=`#lsdc#c zXmk}OGxOCF?0M~e(sMZbgLh~zzadRqE*{F~#Xd4I)-gnzQA^y7k2N>SIX-M|Xy=mA zPo>Ae@GP{WSX3D1J;Z}xQZ9YMk_q)CYU;4M*&rr%S)j;i8HsL)JFVuY$4a^!+Dr;k zPf6tF@GGnJrkM+$;?e^t^Xd`{BXaWgDb-{(AN*9Fq2_o~zW@tu-%!WWYzfcM=z=!hjthbuA#q)>M3qCj>I7i+CO z`=_7LHc-Y{I}&%)=GclGW!G@a-R@l^1-GR~I}*P5iKb}RGd72A0yfT8ws`T-E7 zY9zH*=Jo(f?bar(rb2Y7u=6I%arfkZ`Rj8kGFcKXY(=%M@zJV_z4L|)fzb_tJOA=p zjIT6Yk?^OHI3<;(9B+_r{DU6n=NZ0cJT!|r@3s&5&zPQ>sDSh5wFLtG0{lLe zL9;n+iU-?hM3M=?*=sZ~9UrhlS5)&iNN~8ypWz@D8gdAr5>w4G%OE@$7JhYN{e`ta z@pG!2&t_LsjAEJ*5(=U$iZ6N_=r`XXx{J;e!!k$N(}sOs|LtvL#8K3^lOFB3n@>g1 zxzX=A0+%HSlVi6u=dyw$(ciceBI}lb%1r)-JvN!I&nO}#?`k>ug~(mdR(u3~(_7Yr zR>-umW3arieZYy)UYI$f^+)*}av4%m)jXF>kBp&(4quALr=u3^z)Ur;EPUKe%z;+) z9}3s-`{K4~WLOZw#Sg9=RB!`xy1i$^G%1Dd+vUx}%S_#ITRbcHOc(c}at~dg`Dh9djc^ zLkKPaY$kj@rz+Y4%dSeKKmx{BukJU;&nO6X>_Sl_t6WNl?|WNQV^rlS5&;5cun$1^P{z%1be52tf}s z9Ek4>dQCR}ZW5sa10>oAB>0{{Y`T3g`9~rF8KfELO<=(ZgNbnNH(d}ZeIuJGJgz}> zNW3ev2ybQ)Cr2-zMy;FZaBLxwaUvjzcKEX*UjNG*Mj!~X4r%-*QhZ28B5*$1y*XJYLat7OaXstB$%y7pp{Cjt{aoPz-kbQ_F)fr(I0wJ6Hc zDnky9G`?(BK$J8pA`LuZei*nj6d_CtqVnOUQ@nO<3GEWwzFz7ZSIKijT5*-%c*C-Z zJhoS+%StWdH~i&`A>9v*n8R)wlXx&!Ev!XPc(d#f2`UDWV#Kg+QzRLM$qk;rT4p3p zMP8(_UOTb_*C4*C$=`r$Idt}HI<+3v=G2!}sZ^9pqePcA9Ov{cVVUxB@6m{y2m+|zNX?&1u96@T7Dd#D1l{I2BBTVNrU&YvfCNcVw zzq!rm8p@X6=93hGc_qA>>7N?5rO-WX7NMD zH4VbiDH;iGTt!>|7i>nxpYsL;c=?9b2!4Zw#XT~1(0=8L*#nB9#5lv12cIB_W~RI7 
z!L4f)&KcXvQ8v;)naOWC(^g_RM@b`j(@McC4MJHS_x4Y1Q_gdVDqa|8>6ZpP1 z`#SzuVVTEnQY5wq&!X+DyD`TsHJH5LuZ4v@gsMk%@z?ib1-Ij?K3H~}=PjK4by(Em zUn{{Pd47;4)@;Ft^HP; z1MlSnKkRI56ZUELirrggRRGwVRv76Qq>H5i`}FAO;ZQ3?CZ|?MPy6-OF1h7OVZ-Mw zhaE46GkT+w24m+A4~WRn_QR)U$S;a7p1;6LN*ERQ-f4!1A7S^vQPLh5{#=Mj<7Y!= z($yJI(?@anYyId*7DBI1-1>?=%a>_T{clmQL;f5%=&1)bzPsX~ z)z5QSCrRqxuuqcQzP-qbjij`jzp90$UJ1LX%KFrgD1BP*5W?^_gI1F9`jSsmB|ed& zw7q?=x7f30;iA5iq=LT~-g|uw=kmS9rJOrSiPaF5{l{oE_*CUp1X@v7Ppd`)X%Q)l?4LdD*`BNof+`N#RhKN*y8+xt`4>~z_f zLK`nDKQRh|9`|0CpQO74^YI9j<6YdP z_?($`f7TcNPaIaz{z00uw+F9)X)3cH4_ANW5a z3cC)#00iI!fIceKot-^CDuEwEsQ;&jmGejGKdzVmlq3NLg4`Su{9=+~oP6RyULHvf zASb632ngbkl;YqP=i(8Q76$yERX*lX1Up%OxMn~+e7yg+E(Y>U1!rw7yvTduS=7>d z0u*?CzoJG}#&iWH)-Jaw_0>{Qg*Gp{;xXlT*PYR=K}p;MdJL}N4@xGo2UtZiKG&hW z!LPK5(J?)}+!{THd4fVGuTO#2=Up33cPLa5@3C01SQ0i>ugG_WNK*W<1Y)6gSZ9pe z81)Tz=eE2MBs21I8GGr^;;T?3bM3Q{$ljs!jY2jO5glTj5F!Y`4An+T!3H+Mv5?#m z{S0V^cqC~cbHZ>6^T!4vL%B^+Pk4q6q3Wlo$8^L!18fJ)#x;NO3Q^^YzQe1@QWcqv zx+7CES(vb|iq(=_lPUj#S$rnCz|wSV)+o@}6(LZM-H5=&suLv?+l*S1>w-omvGCiz zYNU@(A61`W8`%Y?23=`{3tG9zv#<#_A0f@gf6imjx>v6;(55Ehu-CWIn5)zp#ZdHf z^s*u!I$QB=5mOVVDZv>~A9tG$807`0)SQioY9lj8f=t+llhaA1z#2YbGcxxVxh|y2 zhGEun#C}Tv0tZpA%YNve$NZ2GY$x9f7BK>M62;Ig47iEOaQ{!KVph{Mi0^GXV_wbIC6b?Iy|)T zo^3|DEzYu8R9sg1DO-i_ze2ef(A0;RYjZkpx0Ym_G^`(w%Mtqrq!c7MdQ39(uf*nb zU^HIsGzjGkno5s*-tnPV-@M$v^i4czlqH0d+t9iOwD?}A0;<|JAJ~XIPY44t4>3}e zG{+ZnR?d~HmINiVb1`p}OP9zHaoc#{mt4gQTQ26#wZ$`Y`sJw^c%iAPNw*PsTAqQp zJnU?Dt3Q^l?1Q2z&V$TI_8FCt@G~YOP6retdIu6CQZ9UY=oIv%Gb+@?8cy$#7qO{x z2MVz1cwhIo6AYRIgN4nLVccMc7vhn}IZS{Z@|JQ3X8Ob$sB!wlUUv}~?Kn8w&8F1c zYZ!Tg7DX(~9RTKAYqkv|Vv-68oX036Eyg%9wt;F#_ktFPb{v|$Oz2)4{HkssmtbJT zK2-lAPf#lRt-uY^LkW-jQs5MxO*Ly1b8!t5%JW7-G_aF$ zwrc^Z(7KFDv`kqB>XrYvi$PQcEo$?K{HCgU=Q}*zlY22QSU@(_w)XP-k3x*Hm^s7zJS;xbk2d54hlMMr}!EBNxyLAL)(1+OZM){@^x& zA>E=nH(gj_t%q6cUuy7Vvb|;Zp$^Bl)=xQ>w;oTNr!Lee1B(VLJFpzf-oW1M3^wDC z^IbMeZYs%lPTL7bUh>o=fxnM*hSTb_R`ko_r-={QsNO>DUlIF=k2?DIlvNVw9k0R_z#G8nD>0ad?MyI{`Z^WhErP*`BCWl28M 
z(ExS6C{ref*x`Q~y0|Oknl4R+|2KTG;XL06_(N%kp^U6`0_uD(5!SNuAUy(jAlhX8 zeQ{`4@6oYZ+^>+XbKH(Id4w**3gl%pe{j*t$>Co*vRlzCaktxm;nN+t-yj>1NxDTJ zA0KOT*H-=_P_~_(lLlYbOUB6ue?U9L1RNPx4&|TrLYchGac@`Fdvpbo4UE*?@}M1B zggm*(PyH-O8o3xVc_LIlSq#%aRc7UD+FA@tqbVc!Hpy4vQhlns!30}?xVk)+Yhl13 zl;}vlCeZBZaI(7mv&O<;r1_&F?xTX^tabHgdYY;U?Dw^m&?^OTqd)QE?&>Gsm*r~G z^ck9z?eZ1{Dm>Ls)^zEhh9?Z6BF8&%f$~;HYiP-B*gzgDPwmK|YJ)6nPzwygz&eIU zgEtJq?K(Qq3@4}AKWM~|tIxOt)&+LNic!9-1YOVg9B>B`A#?2aun66PPzZfkL`@}3d`Dv-hMAG1jIWE`RPZy?wAO%4nItH`02-9R+-#1lI`XS z2-&uFU@lvL?s)Yp`Z63dNOXQ{S5W;&Q2G!;)4MWAB!0_OY^MdLpO#os(mgYNlt3N? zz<^Bmkdy{Ps5n#PPq3NVZi;jZR=(&!i%gIH)QKv~dZy#Vefdw)J(xeU--KQ;WoOt+ g{@4GnyN8*ZhtJ1$0}6 +#include +#include +#include "../DKSDefinitions.h" + +#define BLOCK_SIZE 128 + +#define FITTYPE_UNDEFINED 0 +#define FITTYPE_SINGLE_HISTO 1 +#define FITTYPE_ASYMMETRY 2 +#define FITTYPE_MU_MINUS 3 + +class DKSBaseMuSR; + +class ChiSquareRuntime { + friend class DKSBaseMuSR; + +protected: + // single histo fit parameter + double N0_m; + double tau_m; + double bkg_m; + // asymmetry fit parameter + double alpha_m; + double beta_m; + + bool initDone_m; + void *mem_chisq_m; + void *mem_param_m; + void *mem_func_m; + void *mem_map_m; + + int numBlocks_m; + int blockSize_m; + + char *ptx_m; + + void setN0(double value) { + N0_m = value; + } + + void setTau(double value) { + tau_m = value; + } + + void setBKG(double value) { + bkg_m = value; + } + + void setAlpha(double value) { + alpha_m = value; + } + + void setBeta(double value) { + beta_m = value; + } + +public: + + /** Default constructor */ + //ChiSquareRuntime(); + + /** Default destructor */ + virtual ~ChiSquareRuntime() { }; + + virtual int compileProgram(std::string function, bool mlh = false) = 0; + virtual int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result) = 0; + + virtual int writeParams(const double *params, int numparams) = 0; + virtual 
int writeFunc(const double *func, int numfunc) = 0; + virtual int writeMap(const int *map, int nummap) = 0; + virtual int initChiSquare(int size_data, int size_param, int size_func, int size_map) = 0; + virtual int freeChiSquare() = 0; + virtual int checkChiSquareKernels(int fitType, int &threadsPerBlock) = 0; + + /** Set N0, tau and bgk values to use for the kernel. + * If values changes between data sets this needs to be called before + * every kernel call. Returns DKS_SUCCESS. + */ + int setConsts(double N0, double tau, double bkg) { + setN0(N0); + setTau(tau); + setBKG(bkg); + + return DKS_SUCCESS; + } + + /** Set alpha and beta values to use for the kernel. + * If values changes between data sets this needs to be called before + * every kernel call. Returns DKS_SUCCESS. + */ + int setConsts(double alpha, double beta) { + setAlpha(alpha); + setBeta(beta); + return DKS_SUCCESS; + } + + /** Set number of blocks and threads. + * Used to set parameters obtained from auto-tuning + */ + int setKernelParams(int numBlocks, int blockSize) { + int ierr = DKS_ERROR; + if (numBlocks > 0) { + numBlocks_m = numBlocks; + ierr = DKS_SUCCESS; + } + if (blockSize > 0) { + blockSize_m = blockSize; + ierr = DKS_SUCCESS; + } + + return ierr; + } + + /** Get the number of operations in compiled kernel. + * Count the number of operation in the ptx file for the compiled program. 
+ */ + int getOperations(int &oper) { + + std::string ptx_str(ptx_m); + std::istringstream is(ptx_str); + + std::string line; + bool start = false; + int count = 0; + while(std::getline(is, line)) { + + //when fTheory start enable counting of operations + size_t f1 = line.find("fTheory"); + size_t f2 = line.find(".visible"); + size_t f3 = line.find(";"); + if (f1 != std::string::npos && f2 != std::string::npos) { + start = true; + continue; + } + + //exit when the new functions begins + if (start && f2 != std::string::npos) + break; + + //count opertations + if (start && f3 != std::string::npos) + count++; + } + + oper = count; + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/Algorithms/CollimatorPhysics.h b/src/Algorithms/CollimatorPhysics.h new file mode 100644 index 0000000..b7e8190 --- /dev/null +++ b/src/Algorithms/CollimatorPhysics.h @@ -0,0 +1,47 @@ +#ifndef H_COLLIMATOR_PHYSICS +#define H_COLLIMATOR_PHYSICS + +#include +#include +#include "../DKSDefinitions.h" + +class DKSBaseMuSR; + +class DKSCollimatorPhysics { + friend class DKSBaseMuSR; + +protected: + + int numBlocks_m; + int blockSize_m; + +public: + + virtual ~DKSCollimatorPhysics() { } + + virtual int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numpartices) = 0; + + virtual int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) = 0; + + virtual int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) = 0; + + virtual int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) = 0; + + virtual int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1) = 0; + + virtual int ParallelTTrackerPushTransform(void *x_ptr, 
void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1) = 0; + + +}; + +#endif diff --git a/src/Algorithms/FFT.h b/src/Algorithms/FFT.h new file mode 100644 index 0000000..b16e5f6 --- /dev/null +++ b/src/Algorithms/FFT.h @@ -0,0 +1,43 @@ +#ifndef H_DKS_FFT +#define H_DKS_FFT + +#include +#include + +#include "../DKSDefinitions.h" + +class DKSFFT { + +protected: + int defaultN[3]; + int defaultNdim; + + bool useDefaultPlan(int ndim, int N[3]) { + if (ndim != defaultNdim) + return false; + if (N[0] != defaultN[0] && N[1] != defaultN[1] && N[2] != defaultN[2]) + return false; + return true; + } + +public: + + virtual ~DKSFFT() { } + + virtual int setupFFT(int ndim, int N[3]) = 0; + virtual int setupFFTRC(int ndim, int N[3], double scale = 1.0) = 0; + virtual int setupFFTCR(int ndim, int N[3], double scale = 1.0) = 0; + virtual int destroyFFT() = 0; + virtual int executeFFT(void * mem_ptr, int ndim, int N[3], + int streamId = -1, bool forward = true) = 0; + virtual int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + virtual int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1) = 0; + virtual int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], + int streamId = -1) = 0; + virtual int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], + int streamId = -1) = 0; + virtual int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1) = 0; + +}; + +#endif diff --git a/src/Algorithms/ImageReconstruction.h b/src/Algorithms/ImageReconstruction.h new file mode 100644 index 0000000..3a6266e --- /dev/null +++ b/src/Algorithms/ImageReconstruction.h @@ -0,0 +1,117 @@ +#ifndef H_IMAGERECONSTRUCTION +#define H_IMAGERECONSTRUCTION + +#include "../DKSDefinitions.h" + +#define BLOCK_SIZE 128 + +struct VoxelPosition { + float x; + float y; + float z; +}; + +struct ListEvent { + unsigned detA : 16; + 
unsigned detB : 16; +}; + +class ImageReconstruction { + +protected: + void *m_event_branch; + +public: + + virtual ~ImageReconstruction() { } + + /** Caluclate source. + * Places a sphere at each voxel position and calculate the avg value and std value of pixels + * that are inside this sphere. All the sphere used have the same diameter. + */ + virtual int calculateSource(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Calculate background. + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * smaller speher is given by parameter diameter, diameter of the larger sphere is 2*diameter. + */ + virtual int calculateBackground(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Caluclate source using differente sources. + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * each sphere is given by *diameter array. + */ + virtual int calculateSources(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** + * Places two sphere at each voxel position, calculates the avg value and std value of pixels + * that are inside the larger sphere, but are outside of the smaller sphere. The diameter of the + * smaller sphere is given by *diameter array, diameter of the larger sphere is 2*diameter of the + * smaller sphere. 
+ */ + virtual int calculateBackgrounds(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0) = 0; + + /** Generate normalization. + * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel + * that updates voxel values in the image on the slope between these two detectors. + */ + virtual int generateNormalization(void *recon, void *image_position, + void *det_position, int total_det) = 0; + + + /** Calculate forward projection. + * For image reconstruction calculates forward projections. + * see recon.cpp for details + */ + virtual int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, + void *image_position, int num_events) = 0; + + /** Calculate backward projection. + * For image reconstruction calculates backward projections. + * see recon.cpp for details + */ + virtual int backwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels) = 0; + + /** Set the voxel dimensins on device. + * + */ + virtual int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) = 0; + + /** Set the image edge variables on the device. + * + */ + virtual int setEdge(float x_edge, float y_edge, float z_edge) = 0; + + /** Set the image edge1 on the device. + * + */ + virtual int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) = 0; + + /** Set the minimum crystan in one ring values on the device. + * + */ + virtual int setMinCrystalInRing(float min_CrystalDist_InOneRing, + float min_CrystalDist_InOneRing1) = 0; + + /** Set all other required parameters for reconstruction. 
+ * + */ + virtual int setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter) = 0; + + +}; + +#endif diff --git a/src/AutoTuning/CMakeLists.txt b/src/AutoTuning/CMakeLists.txt new file mode 100644 index 0000000..62cd2b6 --- /dev/null +++ b/src/AutoTuning/CMakeLists.txt @@ -0,0 +1,21 @@ +SET (_SRCS + DKSAutoTuning.cpp + DKSSearchStates.cpp + DKSConfig.cpp + ) + +SET (_HDRS + DKSAutoTuning.h + DKSSearchStates.h + DKSAutoTuningTester.h + DKSConfig.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/AutoTuning) \ No newline at end of file diff --git a/src/AutoTuning/DKSAutoTuning.cpp b/src/AutoTuning/DKSAutoTuning.cpp new file mode 100644 index 0000000..050d1a8 --- /dev/null +++ b/src/AutoTuning/DKSAutoTuning.cpp @@ -0,0 +1,302 @@ +#include "DKSAutoTuning.h" + +DKSAutoTuning::DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops) { + + base_m = base; + api_name_m = api; + device_name_m = device; + loops_m = loops; + + evaluate_time_m = true; +} + +DKSAutoTuning::~DKSAutoTuning() { + params_m.clear(); +} + +int DKSAutoTuning::setParameterValues(States state) { + + //if states and params don't match in size something has gone wrong + if (state.size() != params_m.size()) { + DEBUG_MSG("Parameters and states don't match!"); + return DKS_ERROR; + } + + //set the value pointed by params to value saved in state + for (unsigned int i = 0; i < params_m.size(); i++) + params_m[i].setValue(state[i].value); + + return DKS_SUCCESS; +} + +/** TODO: might need a better timing for GPU code */ +int DKSAutoTuning::evaluateFunction(double &value) { + + int ierr = DKS_ERROR; + DKSTimer t; + + t.init(function_name_m); + + if (evaluate_time_m) { + //run for "loop" times and return the average time. 
+ //syncDevice() is used to make sure that nothing is running on the device before the timer starts + // and to make sure the function has completed on the device before the time stops + for (int j = 0; j < loops_m; j++) { + base_m->syncDevice(); + t.start(); + ierr = f_m(); + base_m->syncDevice(); + t.stop(); + if (ierr != DKS_SUCCESS) //exit loop if kernel execution fials + break; + } + + //returns + value = t.gettime() / loops_m; + } else { + value = fd_m(); + ierr = DKS_SUCCESS; + } + + return ierr; +} + +void DKSAutoTuning::clearParameters() { + params_m.clear(); +} + +void DKSAutoTuning::exaustiveSearch() { + + DKSTimer t; + t.init("exaustive search"); + t.start(); + + if (params_m.size() < 2) + return; + + Parameter p1 = params_m[0]; + Parameter p2 = params_m[1]; + + double time; + double mint = 1000000.0; + int minv1 = 0; + int minv2 = 0; + + //std::ofstream myfile; + //std::string filename; + //filename = "search_" + api_name_m + "_" + device_name_m + ".dat"; + //myfile.open(filename); + + for (double v1 = p1.min; v1 <= p1.max; v1 += p1.step) { + for (double v2 = p2.min; v2 <= p2.max; v2 += p2.step) { + p1.setValue(v1); + p2.setValue(v2); + + int ierr = evaluateFunction(time); + + if (ierr == DKS_SUCCESS && time < mint) { + mint = time; + minv1 = v1; + minv2 = v2; + } + if (ierr == DKS_ERROR) + time = 1; + + //myfile << time << "\t"; + } + //myfile << "\n"; + } + //myfile.close(); + + //std::cout << "Optimal launch parameters:" << std::endl; + //std::cout << mint << "\t" << minv1 << "\t" << minv2 << std::endl; + p1.setValue(minv1); + p2.setValue(minv2); + + t.stop(); + //std::cout << "exaustive search: " << t.gettime() << std::endl; +} + +void DKSAutoTuning::lineSearch() { + DKSTimer t; + t.init("line search"); + t.start(); + + double time; + int ierr = DKS_ERROR; + + if (params_m.size() < 1) { + DEBUG_MSG("Need some parameters to autotune!"); + return; + } + + double mint = 1000000.0; + //loop trough parameters one parameter at a time + for (auto param : 
params_m) { + int minv = param.getValue(); + + //go trough all the values of the parameter, while keeping other parameters const + for (double i = param.min; i <= param.max; i += param.step) { + //adjust parameters + param.setValue(i); + + //run for "loop" times and get average + ierr = evaluateFunction(time); + + //if there was no error executing the function and time is better than previou + //min time, save the parameter configuration + if (ierr == DKS_SUCCESS && time < mint) { + mint = time; + minv = i; + } + + } //repeat + + param.setValue(minv); + } + + //DEBUG: print out the found best parameters + for (auto param : params_m) + std::cout << "Parameter " << param.name << " set to " << param.getValue() << std::endl; + + std::cout << "Best time: " << mint << std::endl; + + t.stop(); + std::cout << "Line search time: " << t.gettime() << std::endl; + +} + +void DKSAutoTuning::hillClimbing(int restart_loops) { + + DKSTimer t; + t.init("hill climbing"); + t.start(); + + std::cout << "hill climbing" << std::endl; + + int ierr; + double time_current; + double time_next; + DKSSearchStates search(params_m); + + std::cout << "start " << restart_loops << std::endl; + + for (int i = 0; i < restart_loops; i++) { + + + //init random current state + search.initCurrentState(); + + //evaluate current state + setParameterValues(search.getCurrentState()); + ierr = evaluateFunction(time_current); + + //std::cout << "Start iteration " << i+1 << std::endl; + //search.printCurrentState(time_current); + + if (ierr == DKS_ERROR) + continue; + + //statr the loop + bool topReached = false; + while(!topReached) { + + search.getNeighbours(); + + //get all the neighbors of the current state + bool neighbourFound = false; + while (!neighbourFound && search.nextNeighbourExists()) { + + //evaluate all the neighbors of the current state + setParameterValues(search.getNextNeighbour()); + ierr = evaluateFunction(time_next); + + //search.printNeighbour(time_next); + + if (ierr == DKS_ERROR) + 
std::cout << "Error evaluating function" << std::endl; + + //move to the first option that improives the solution + if (ierr == DKS_SUCCESS && time_next < time_current) { + time_current = time_next; + search.moveToNeighbour(); + neighbourFound = true; + } + + } + + //if no better option is found save the state and move to step 1 + if (!neighbourFound) { + search.saveCurrentState(time_current); + topReached = true; + } + + } + } + + std::cout << std::endl; + search.printBest(); + + t.stop(); + std::cout << "hill climbing: " << t.gettime() << std::endl; +} + +void DKSAutoTuning::simulatedAnnealing(double Tstart, double Tstep) { + + DKSTimer t; + t.init("simulated annealing"); + t.start(); + + int ierr; + double time_current; + double time_next; + + DKSSearchStates search(params_m); + + //make a random guess + search.initCurrentState(); + + //evaluate current state + setParameterValues(search.getCurrentState()); + ierr = evaluateFunction(time_current); + + if (ierr == DKS_ERROR) + return; + + for (double Temp = Tstart; Temp > 0; Temp -= Tstep) { + + search.printCurrentState(time_current); + + //calucate all the neighbours of current state + search.getNeighbours(10); + + //make a move to random neighbour and evaluate the runtime + setParameterValues(search.getRandomNeighbour()); + ierr = evaluateFunction(time_next); + + if (ierr == DKS_ERROR) + return; + + //if the solution improves move to this point else move to this point with probabily exp(-dE/T) + if (time_next < time_current) { + time_current = time_next; + search.moveToNeighbour(); + } else { + double p = (double)rand() / RAND_MAX; + double dE = time_next - time_current; + double P = exp(-dE/Temp); + + if (P > p) { + time_current = time_next; + search.moveToNeighbour(); + } + } + } + + search.printCurrentState(time_current); + + t.stop(); + std::cout << "Simulated annealing: " << t.gettime() << std::endl; + +} + diff --git a/src/AutoTuning/DKSAutoTuning.h b/src/AutoTuning/DKSAutoTuning.h new file mode 100644 
index 0000000..ca8f3a3 --- /dev/null +++ b/src/AutoTuning/DKSAutoTuning.h @@ -0,0 +1,103 @@ +#ifndef DKS_AUTOTUNIG +#define DKS_AUTOTUNIG + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "../DKSBase.h" +#include "../Utility/DKSTimer.h" +#include "DKSSearchStates.h" + +typedef std::vector Parameters; +typedef std::vector States; + +class DKSAutoTuning { + +private: + + bool evaluate_time_m; + + std::string api_name_m; + std::string device_name_m; + std::string function_name_m; + + std::function f_m; + std::function fd_m; + Parameters params_m; + + DKSBase *base_m; + + int loops_m; + + /** Update parameters from a state */ + int setParameterValues(States states); + + /** Evaluate the function and set execution time + * Returns DKS_ERROR if errors occured during function execution. + * Returns DKS_SUCCESS if function executed as planned. + */ + int evaluateFunction(double &value); + +public: + + /** Constructor */ + DKSAutoTuning(DKSBase *base, std::string api, std::string device, int loops = 100); + + /** Destructor */ + ~DKSAutoTuning(); + + /** Set function to auto tune. + * Caller of setFunction is responsible to bind the correct parameters + * to the function with std::bind. + */ + void setFunction(std::function f, std::string name, bool evaluate_time = true) { + f_m = f; + function_name_m = name; + evaluate_time_m = evaluate_time; + } + + void setFunction(std::function f, std::string name, bool evaluate_time = false) { + fd_m = f; + function_name_m = name; + evaluate_time_m = evaluate_time; + } + + /** Set parameter for auto tuning. 
/** Test target for the auto-tuning search algorithms.
 *  Holds a point (x, y) and evaluates a fixed two-variable test surface at that point.
 *  x and y are private; DKSBaseMuSR is a friend and can set them directly.
 */
class DKSAutoTuningTester {

  friend class DKSBaseMuSR;

private:

  double x;
  double y;

public:

  /** Start at the origin */
  DKSAutoTuningTester() : x(0.0), y(0.0) { }

  //defined in the header: the destructor was only declared, so creating an object
  //could not be linked
  ~DKSAutoTuningTester() = default;

  /** Evaluate the test surface at the current (x, y) */
  double peaksZ() const {

    const double x2 = x * x;
    const double y2 = y * y;

    double z = 3 * (1 - x) * (1 - x) * std::exp(-x2 - (y + 1) * (y + 1))
      - 10 * (x / 5 - std::pow(x, 3) - std::pow(y, 5)) * std::exp(-x2 - y2)
      - (1.0 / 3.0) * std::exp(-(x + 1) * (x + 1) - y2);
    return z;
  }

};
DKSConfig::loadConfigFile() { + + int ierr = DKS_ERROR; + /* + if (homeset_m) { + //check if $HOME/.config/DKS exists + std::string filename = homedir_m + config_dir + config_file; + std::cout << "Check for: " << filename << std::endl; + if (fs::exists(filename)) { + try { + pt::read_xml(filename, tree_m); + treeloaded_m = true; + ierr = DKS_SUCCESS; + } catch (std::exception &e) { + DEBUG_MSG("Error loading autotuning file!"); + treeloaded_m = false; + ierr = DKS_ERROR; + } + } + } + */ + return ierr; +} + + +int DKSConfig::saveConfigFile() { + + int ierr = DKS_ERROR; + /* + std::string savedir = homedir_m + config_dir; + std::string savefile = homedir_m + config_dir + config_file; + + std::cout << savedir << std::endl; + std::cout << savefile << std::endl; + + if (homeset_m) { + //check if $HOME/.config/DKS directory exists, if not create + bool homecreated = false; + fs::path p (savedir); + if (!fs::is_directory(p)) + homecreated = fs::create_directory(p); + + try { + if (homecreated) { + pt::write_xml(savefile, tree_m); + ierr = DKS_SUCCESS; + } + } catch(std::exception &e) { + ierr = DKS_ERROR; + } + + } + */ + return ierr; +} + + +int DKSConfig::addConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int value) { + + + //keys to acces data in the tree + std::string device_name = name; + device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end()); + std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func; + std::string parameter = key + ".parameter"; + std::string attr_size = ".size"; + std::string attr_param = "." 
+ param; + + //tmp node where new attributes are cteated in case the attribute doesn't exist in the tree + pt::ptree *tmp; + bool newNode = true; + + //loop trough all the items in the node and see if new param needs to be created + //or old one updated + boost::optional< pt::ptree& > child = tree_m.get_child_optional(key); + if (child) { + BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) { + int oldsize = v.second.get(attr_size,-1); + + //if param with the same size already exists in the tree save pointer to this + if (size == oldsize) { + tmp = &v.second; + newNode = false; + } + } + } + + //if parameter doesnt exist with this size, create a new parameter + if (newNode) { + tmp = new pt::ptree(); + tmp->add(attr_size, size); + tmp->add(attr_param, value); + tree_m.add_child(parameter, *tmp); + } else { + //if parameter exists update the parameter value + tmp->put(attr_param, value); + } + + return DKS_SUCCESS; +} + +int DKSConfig::getConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int &value) { + + //get the value of the tree, default to -1 if value doesn't exist + int ierr = DKS_SUCCESS; + + //define key and attribute values to find parameters in the tree + std::string device_name = name; + device_name.erase(std::remove_if(device_name.begin(), device_name.end(), ::isspace), device_name.end()); + std::string key = "DKS.autotune." + api + "." + device + "." + device_name + "." + func; + std::string attr_size = ".size"; + std::string attr_param = "." 
+ param; + + float maxDist = std::numeric_limits::max(); + + //check if the parameters exist + boost::optional< pt::ptree& > child = tree_m.get_child_optional(key); + if (child) { + //loop trough parameters and get the one that is closes to the size specified + BOOST_FOREACH(pt::ptree::value_type &v, tree_m.get_child(key)) { + int param_size = v.second.get(attr_size,-1); //get parameter size + if (param_size > 0) { // if param_size is -1 param is not defined correctly and not usable + float dist = abs(param_size - size); + if (dist < maxDist) { + value = v.second.get(attr_param,-1); + maxDist = dist; + } + } + } + } else { + value = -1; + ierr = DKS_ERROR; + } + + return ierr; +} + + + diff --git a/src/AutoTuning/DKSConfig.h b/src/AutoTuning/DKSConfig.h new file mode 100644 index 0000000..bf7255a --- /dev/null +++ b/src/AutoTuning/DKSConfig.h @@ -0,0 +1,69 @@ +/** Class to save and load DKS autotunning configs. + * Autotuning settings are saved and loaded from $HOME/.config/DKS/autotuning.xml. + * Uses boost xml_parser to read and write the xml file and boost property tree to store + * the xml content. + */ + +#ifndef DKS_CONFIG +#define DKS_CONFIG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../DKSDefinitions.h" + +namespace pt = boost::property_tree; +namespace fs = boost::filesystem; + +const std::string config_dir = "/.config/DKS"; +const std::string config_file = "/autotuning.xml"; + +class DKSConfig { + +private: + + pt::ptree tree_m; + const char *homedir_m; + bool homeset_m; + bool treeloaded_m; + +public: + + /** Constructor, set home variable. 
+ * If home directory is not set config file can not be read or saved + */ + DKSConfig(); + + ~DKSConfig(); + + /** Load autotuinig.xml into tree variable if this file exists */ + int loadConfigFile(); + + /** Save autotuning.xml file */ + int saveConfigFile(); + + /** Add config parameter to tree */ + int addConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int value); + + /** Get config parameter from the tree */ + int getConfigParameter(const std::string api, const std::string device, + const std::string name, const std::string func, + int size, std::string param, int &value); + + +}; + +#endif diff --git a/src/AutoTuning/DKSSearchStates.cpp b/src/AutoTuning/DKSSearchStates.cpp new file mode 100644 index 0000000..4bfcaba --- /dev/null +++ b/src/AutoTuning/DKSSearchStates.cpp @@ -0,0 +1,233 @@ +#include "DKSSearchStates.h" + +/** set the current state so that number of parameters and parameter bounds are known */ +DKSSearchStates::DKSSearchStates(Parameters params) { + + for (auto p : params) { + State s; + s.value = p.getValue(); + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } + + neighbour_state_m.resize(current_state_m.size()); + best_state_m.resize(current_state_m.size()); + + best_time_m = std::numeric_limits::max(); + + next_neighbour_m = -1; + + srand(time(NULL)); + +} + +DKSSearchStates::~DKSSearchStates() { + current_state_m.clear(); + neighbour_state_m.clear(); + best_state_m.clear(); + neighbours_m.clear(); +} + +/** Get all the possible neighbours of the current state */ +void DKSSearchStates::getNeighbours(int dist) { + + std::vector< std::vector > values; + + for (auto state : current_state_m) { + std::vector s; + + for (int d = dist; d > 0; d--) { + if (state.value - d*state.step >= state.min) + s.push_back(state.value - state.step); + } + + s.push_back(state.value); + + for (int d = 1; d < dist + 1; d++) { + if 
(state.value + d*state.step <= state.max) + s.push_back(state.value + state.step); + } + + values.push_back(s); + } + + + std::vector< std::vector > s {{}}; + for (auto& u : values) { + std::vector< std::vector > r; + for(auto& x : s) { + for( auto y : u) { + r.push_back(x); + r.back().push_back(y); + } + } + s.swap(r); + } + + //get current state values + std::vector current; + for (auto state : current_state_m) + current.push_back(state.value); + s.erase(std::remove(s.begin(), s.end(), current)); + + neighbours_m.clear(); + neighbours_m = s; + next_neighbour_m = 0; +} + +void DKSSearchStates::setCurrentState(std::vector current_state) { + + current_state_m.clear(); + for (auto& p : current_state) { + State s; + s.value = p.getValue(); + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } +} + +void DKSSearchStates::setCurrentState(std::vector current_state) { + + current_state_m.clear(); + for (auto& p : current_state) { + State s; + s.value = p.value; + s.min = p.min; + s.max = p.max; + s.step = p.step; + current_state_m.push_back(s); + } +} + +void DKSSearchStates::initCurrentState() { + + //go trough parameters in current state and generate a new random value + for (auto& s : current_state_m) { + //get number of total values + int values = (s.max - s.min) / s.step + 1; + + int r = rand() % values; + + s.value = s.min + r * s.step; + } + + getNeighbours(); +} + +States DKSSearchStates::getCurrentState() { + return current_state_m; +} + +States DKSSearchStates::getNextNeighbour() { + + //check if there are ant neighbours to move on + if (next_neighbour_m < (int)neighbours_m.size()) { + + //get the vector of values for each parameters in the neighbour cell + std::vector neighbour_values = neighbours_m[next_neighbour_m]; + + //set the values to neighbour_state_m + for (unsigned int n = 0; n < neighbour_state_m.size(); n++) + neighbour_state_m[n].value = neighbour_values[n]; + + } + + next_neighbour_m++; + return 
neighbour_state_m; + +} + +States DKSSearchStates::getRandomNeighbour() { + + int rand_neighbour = rand() % (int)neighbours_m.size(); + + //get the vector of values for each parameters in the neighbour cell + std::vector neighbour_values = neighbours_m[rand_neighbour]; + + //set the values to neighbour_state_m + for (unsigned int n = 0; n < neighbour_state_m.size(); n++) + neighbour_state_m[n].value = neighbour_values[n]; + + next_neighbour_m = rand_neighbour + 1; + return neighbour_state_m; + +} + +bool DKSSearchStates::nextNeighbourExists() { + bool neighbourExists = false; + if (next_neighbour_m < (int)neighbours_m.size()) + neighbourExists = true; + + return neighbourExists; +} + +void DKSSearchStates::moveToNeighbour() { + + for (unsigned int i = 0; i < current_state_m.size(); i++) + current_state_m[i].value = neighbour_state_m[i].value; + + //getNeighbours(); + +} + +void DKSSearchStates::saveCurrentState(double current_time) { + + if (current_time < best_time_m) { + for (unsigned int i = 0; i < current_state_m.size(); i++) { + best_state_m[i].value = current_state_m[i].value; + best_state_m[i].min = current_state_m[i].min; + best_state_m[i].max = current_state_m[i].max; + best_state_m[i].step = current_state_m[i].step; + } + + best_time_m = current_time; + } + +} + + +void DKSSearchStates::printCurrentState(double time) { + std::cout << "Current state: "; + for (auto s : current_state_m) + std::cout << s.value << "\t"; + std::cout << time << std::endl; + +} + +void DKSSearchStates::printInfo() { + + std::cout << "Current state: "; + for (auto s : current_state_m) + std::cout << s.value << "\t"; + std::cout << std::endl; + + std::cout << "Current neighbour (" << next_neighbour_m << " of " << neighbours_m.size() << "): "; + if (next_neighbour_m > 0) { + for (auto s : neighbour_state_m) + std::cout << s.value << "\t"; + } + std::cout << std::endl; + +} + +void DKSSearchStates::printNeighbour(double time) { + std::cout << "Current neighbour (" << 
enum VALUE_TYPE { DKS_INT, DKS_DOUBLE };

/** Tunable parameter.
 *  Wraps a pointer to an int or a double owned by the caller together with the
 *  search range [min, max], the step and a name. The pointed-to variable is read by
 *  getValue() and overwritten by setValue(); the Parameter never owns it.
 */
class Parameter {

private:
  //only the pointer matching type is used, the other one stays null
  int *ivalue = nullptr;
  double *dvalue = nullptr;
  VALUE_TYPE type;

public:
  double min;
  double max;
  double step;
  std::string name;

  /** Integer parameter, range and step are stored as double */
  Parameter(int *_value, int _min, int _max, int _step, std::string _name) {

    ivalue = _value;
    min = (double)_min;
    max = (double)_max;
    step = (double)_step;
    name = _name;
    type = DKS_INT;
  }

  /** Double parameter */
  Parameter(double *_value, double _min, double _max, double _step, std::string _name) {

    dvalue = _value;
    min = _min;
    max = _max;
    step = _step;
    name = _name;
    type = DKS_DOUBLE;
  }

  /** Write v to the wrapped variable, converted to the type of that variable */
  template <typename T>
  void setValue(T v) {
    switch (type) {
    case DKS_INT:
      *ivalue = (int)v;
      break;
    case DKS_DOUBLE:
      *dvalue = (double)v;
      break;
    }
  }

  /** Read the wrapped variable as double, -1.0 if the type is not known */
  double getValue() const {
    switch (type) {
    case DKS_INT:
      return (double)*ivalue;
    case DKS_DOUBLE:
      return *dvalue;
    };
    return -1.0;
  }

};
+public: + + /** Constructor alwats takes params array as variable. + * Params array is needed to know how many params will be searched and what are thou bounds + * of each parameter. + */ + DKSSearchStates(Parameters params); + + ~DKSSearchStates(); + + /** Set current state using parameter vector */ + void setCurrentState(Parameters current_state); + + /** set current state using the state vector */ + void setCurrentState(States current_state); + + /** init random current state */ + void initCurrentState(); + + /** get current state */ + States getCurrentState(); + + /** get next neighbour state. + * if there are no next neighbore stay at the curretn neighbour + */ + States getNextNeighbour(); + + /** get random neighbour state */ + States getRandomNeighbour(); + + /** calculate all the neighbour states */ + void getNeighbours(int dist = 1); + + /** Chech if there are more neighbours to evaluate + * Return true if more neighbors exist, false if we are at the last neighbour + */ + bool nextNeighbourExists(); + + /** move to next neighbour. + * set the current state as the next neighbour, + * calculate the neighbours of the new current state. + */ + void moveToNeighbour(); + + /** Save the current state and the evaluation time of the current state. + * If evaluation time of the current state is better than the evaluation time of the + * best state, save the current state as best. + */ + void saveCurrentState(double current_time); + + + //Print functions - mostly usefull for debugging purposes, or for benchmark runs to print the + //status of the search + + /** Print current state. + * cout the current state. Mostly used for debuging purposes + */ + void printCurrentState(double time = 0.0); + + /** Print current neighbour info */ + void printNeighbour(double time = 0.0); + + /** Print info. 
+ * Print the whole info about the search: current state, current neighbour, total neighbors + */ + void printInfo(); + + /** Print the best saved state */ + void printBest(); + +}; + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..df12a31 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,130 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 2.8) + +SET (DKS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +MACRO (ADD_SOURCES ) + FILE (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") + FOREACH (_src ${ARGN}) + IF (_relPath) + LIST (APPEND DKS_SRCS "${_relPath}/${_src}") + ELSE () + LIST (APPEND DKS_SRCS "${_src}") + ENDIF () + ENDFOREACH () + IF (_relPath) + # propagate SRCS to parent directory + SET (DKS_SRCS ${DKS_SRCS} PARENT_SCOPE) + ENDIF () +ENDMACRO () + +MACRO (ADD_HEADERS ) + FILE (RELATIVE_PATH _relPath "${DKS_SRC_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") + FOREACH (_hdr ${ARGN}) + IF (_relPath) + LIST (APPEND DKS_HDRS "${_relPath}/${_hdr}") + ELSE () + LIST (APPEND DKS_HDRS "${_hdr}") + ENDIF () + ENDFOREACH () + IF (_relPath) + # propagate HDRS to parent directory + SET (DKS_HDRS ${DKS_HDRS} PARENT_SCOPE) + ENDIF () +ENDMACRO () + + +SET (DKS_BASEDIR_HDRS + DKSBase.h + DKSDefinitions.h + ) + +SET (DKS_BASEDIR_SRCS + DKSBase.cpp + ) + +IF (USE_CUDA OR USE_OPENCL) + SET (DKS_BASEDIR_HDRS + ${DKS_BASEDIR_HDRS} + DKSBaseMuSR.h + ) + + SET (DKS_BASEDIR_SRCS + ${DKS_BASEDIR_SRCS} + DKSBaseMuSR.cpp + ) +ENDIF (USE_CUDA OR USE_OPENCL) + +IF (USE_CUDA) + SET (DKS_BASEDIR_HDRS + ${DKS_BASEDIR_HDRS} + DKSImageReconstruction.h + ) + + SET (DKS_BASEDIR_SRCS + ${DKS_BASEDIR_SRCS} + DKSImageReconstruction.cpp + ) +ENDIF (USE_CUDA) + +ADD_HEADERS (${DKS_BASEDIR_HDRS}) +ADD_SOURCES (${DKS_BASEDIR_SRCS}) + +MESSAGE (STATUS "HEADERS: ${DKS_BASEDIR_HDRS}") +MESSAGE (STATUS "SOURCES: ${DKS_BASEDIR_SRCS}") + +#add only those source files that will be used +IF (USE_OPENCL) + MESSAGE (STATUS "Add OpenCL source files") + ADD_SUBDIRECTORY 
(OpenCL) +ENDIF (USE_OPENCL) + +IF (USE_CUDA) + MESSAGE (STATUS "Add CUDA source files") + ADD_SUBDIRECTORY (CUDA) +ENDIF (USE_CUDA) + +IF (USE_MIC) + MESSAGE (STATUS "Add MIC source files") + ADD_SUBDIRECTORY (MIC) +ENDIF (USE_MIC) + +ADD_SUBDIRECTORY (Utility) +ADD_SUBDIRECTORY (AutoTuning) +ADD_SUBDIRECTORY (Algorithms) + +IF (USE_CUDA) + CUDA_ADD_LIBRARY(dks ${DKS_SRCS}) + CUDA_ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) + + IF (USE_UQTK) + TARGET_LINK_LIBRARIES(dks cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + TARGET_LINK_LIBRARIES(dksshared cudadevrt lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + ELSE (USE_UQTK) + TARGET_LINK_LIBRARIES(dks cudadevrt) + TARGET_LINK_LIBRARIES(dksshared cudadevrt) + ENDIF (USE_UQTK) + +ELSE (USE_CUDA) + MESSAGE (STATUS "DKS srcs: ${DKS_SRCS}") + ADD_LIBRARY(dks ${DKS_SRCS}) + ADD_LIBRARY(dksshared SHARED ${DKS_SRCS}) + + IF (USE_UQTK) + TARGET_LINK_LIBRARIES(dks lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + TARGET_LINK_LIBRARIES(dksshared lreg UQTk quad uqtktools cvode-2.6.0 dsfmt lbfgs uqtklapack uqtkslatec uqtkblas gfortran) + ELSE (USE_UQTK) + TARGET_LINK_LIBRARIES(dks) + TARGET_LINK_LIBRARIES(dksshared) + ENDIF(USE_UQTK) + +ENDIF (USE_CUDA) + +INSTALL(TARGETS dks DESTINATION lib) +INSTALL(TARGETS dksshared DESTINATION lib) +INSTALL(FILES ${DKS_BASEDIR_HDRS} DESTINATION include) + +#IF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc")) +# INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/libdksMIC.a DESTINATION build/lib) +#ENDIF (USE_MIC AND (COMPILER_NAME STREQUAL "icpc" OR COMPILER_NAME STREQUAL "mpiicpc")) + + + diff --git a/src/CUDA/CMakeLists.txt b/src/CUDA/CMakeLists.txt new file mode 100644 index 0000000..977d570 --- /dev/null +++ b/src/CUDA/CMakeLists.txt @@ -0,0 +1,35 @@ +SET (_HDRS + CudaBase.cuh + CudaFFT.cuh + CudaGreensFunction.cuh + 
CudaChiSquare.cuh + CudaCollimatorPhysics.cuh + CudaImageReconstruction.cuh + CudaChiSquareRuntime.cuh + ) + +SET (_SRCS + CudaBase.cu + CudaFFT.cu + CudaGreensFunction.cu + CudaChiSquare.cu + CudaCollimatorPhysics.cu + CudaImageReconstruction.cu + CudaChiSquareRuntime.cu +) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES(${_SRCS}) +ADD_HEADERS(${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/CUDA) + +SET (_KERNELS + NVRTCKernels/CudaChiSquareKernel.cu + ) + +INSTALL(FILES ${_KERNELS} DESTINATION include/CUDA/NVRTCKernels) + diff --git a/src/CUDA/CMakeListsLibcuda.txt b/src/CUDA/CMakeListsLibcuda.txt new file mode 100644 index 0000000..a94b877 --- /dev/null +++ b/src/CUDA/CMakeListsLibcuda.txt @@ -0,0 +1,25 @@ +CMAKE_MINIMUM_REQUIRED (VERSION 2.8) + +FIND_PACKAGE(CUDA REQUIRED) + +SET (CUDA_NVCC_FLAGS "-arch=sm_30") + +SET(LIB_TYPE STATIC) + +SET (DKS_CUDA_HDRS + CudaBase.cuh + CudaFFT.cuh + CudaGreensFunction.cuh + ) + +SET (DKS_CUDA_SRCS + CudaBase.cu + CudaFFT.cu + CudaGreensFunction.cu +) + +INCLUDE_DIRECTORIES ( + ${CMAKE_CURRENT_SOURCE_DIR} +) + +CUDA_ADD_LIBRARY(cudadks ${DKS_CUDA_SRCS}) \ No newline at end of file diff --git a/src/CUDA/CudaBase.cu b/src/CUDA/CudaBase.cu new file mode 100644 index 0000000..f352cf2 --- /dev/null +++ b/src/CUDA/CudaBase.cu @@ -0,0 +1,386 @@ +#include "CudaBase.cuh" + +//=====================================// +//============Cuda kernels=============// +//=====================================// + +__global__ void initcuRandState(curandState *state, int size, int seed = 0) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + curand_init(seed + idx, 0, 0, &state[idx]); + } + +} + + +//=====================================// +//==========Private functions==========// +//=====================================// + + +//====================================// +//==========Public functions==========// +//====================================// + +CudaBase::CudaBase() { + + 
/*
  create curandStates
  Allocates size curandState entries on the device and initializes entry idx with
  seed time(NULL) + idx. States created earlier are deleted first.
  Return: DKS_SUCCESS, or DKS_ERROR if size is negative, the allocation fails or
  the init kernel can not be launched
*/
int CudaBase::cuda_createCurandStates(int size) {

  if (size < 0) {
    DEBUG_MSG("Invalid number of curand states: " << size);
    return DKS_ERROR;
  }

  if (defaultRndSet == 1)
    cuda_deleteCurandStates();

  int threads = 128;
  int blocks = size / threads + 1;
  int seed = time(NULL);

  cudaError_t cerror = cudaMalloc(&defaultRndState, sizeof(curandState) * (size_t)size);
  if (cerror != cudaSuccess) {
    DEBUG_MSG("Failed to allocate curand states, cuda error: " << cerror);
    return DKS_ERROR;
  }

  initcuRandState<<<blocks, threads>>>(defaultRndState, size, seed);

  //a failed launch is only visible through cudaGetLastError
  cerror = cudaGetLastError();
  if (cerror != cudaSuccess) {
    DEBUG_MSG("Failed to init curand states, cuda error: " << cerror);
    cudaFree(defaultRndState);
    return DKS_ERROR;
  }

  //only mark the states as set when they exist, cuda_deleteCurandStates frees them
  defaultRndSet = 1;

  return DKS_SUCCESS;
}
/*
  Info: set CUDA device to use
  If device is not a valid index (negative or not smaller than the device count)
  device 0 is used instead.
  Return: DKS_SUCCESS if a device was selected, DKS_ERROR if there is no device
  or the device can not be set
*/
int CudaBase::cuda_setDevice(int device) {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);

  std::cout << "Init: " << device << "\t" << ndev << std::endl;

  if (ndev <= 0)
    return DKS_ERROR;

  //fall back to the default device if the requested one does not exist
  int target = 0;
  if (device >= 0 && device < ndev) {
    target = device;
    std::cout << "set device to: " << target << std::endl;
  }

  cudaError_t cerror = cudaSetDevice(target);
  if (cerror != cudaSuccess) {
    DEBUG_MSG("Failed to set device " << target << ", cuda error: " << cerror);
    return DKS_ERROR;
  }

  return DKS_SUCCESS;
}
write data (push) + Return: pointer to memory object +*/ +/* + void * CudaBase::cuda_pushData(const void * in_data, size_t size, int &ierr) { + + void * mem_ptr; + mem_ptr = cuda_allocateMemory(size, ierr); + + if (ierr == DKS_SUCCESS) + ierr = cuda_writeData(mem_ptr, in_data, size); + + return mem_ptr; + } +*/ + +/* + Info: read data and free memory (pull) + Return: success or error code +*/ +/* + int CudaBase::cuda_pullData(void * mem_ptr, void * out_data, size_t size, int &ierr) { + + ierr = cuda_readData(mem_ptr, out_data, size); + if (ierr == DKS_SUCCESS) + ierr = cuda_freeMemory(mem_ptr); + else + return DKS_ERROR; + + + if (ierr == DKS_SUCCESS) + return DKS_SUCCESS; + else + return DKS_ERROR; + } +*/ + +/* + Info: execute function + Return: success or error code +*/ +int CudaBase::cuda_executeFunction() { + + std::cout << "Execute function" << std::endl; + return DKS_SUCCESS; +} + +/* + Info: clean up + Return: success or error code +*/ +int CudaBase::cuda_cleanUp() { + + std::cout << "clean up" << std::endl; + return DKS_SUCCESS; + +} diff --git a/src/CUDA/CudaBase.cuh b/src/CUDA/CudaBase.cuh new file mode 100644 index 0000000..325016d --- /dev/null +++ b/src/CUDA/CudaBase.cuh @@ -0,0 +1,390 @@ +#ifndef H_CUDA_BASE +#define H_CUDA_BASE + +#include "../DKSDefinitions.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class CudaBase { + +private: + + int currentStream; + std::vector cudaStreams; + +protected: + + cublasHandle_t defaultCublas; + + curandState *defaultRndState; + int defaultRndSet; + +public: + + CudaBase(); + + ~CudaBase(); + + /** + * Init cuda random number (cuRand) states. + * Create an array of type curandState with "size" elements on the GPU + * and create a curandState with different seed for each array entry. + * Return success or error code + */ + int cuda_createCurandStates(int size); + + /** + * Delete curandState. 
+ * Delete curandState array on the GPU and free memory. + * Return success or error code + */ + int cuda_deleteCurandStates(); + + /** Get a pointer to curand states + * + */ + curandState* cuda_getCurandStates(); + + /** + * Create a cuda stream and set streamId to index refering to this stream. + * Return success or error code + */ + int cuda_createStream(int &streamId); + + /** + * add existing cuda stream to the list. + * Return: success or error code. + */ + int cuda_addStream(cudaStream_t tmpStream, int &streamId); + + /** + * delete cuda stream + * success or error code + */ + int cuda_deleteStream(int id); + + /** + * delete all streams + * success or error code + */ + int cuda_deleteStreams(); + + /** + * set stream to use + * success or error code + */ + int cuda_setStream(int id); + + /** + * Info: get stream that is used + * Return: return id of curretn stream + */ + int cuda_getStreamId(); + + /** + * Info: reset to default stream + * Return: success or error code + */ + int cuda_defaultStream(); + + /** + * Info: get number of streams + * Return: success or error code + */ + int cuda_numberOfStreams(); + + /** + * Info: get stream + * Return: stream + */ + cudaStream_t cuda_getStream(int id); + + /** + * Get default cublass handle + */ + cublasHandle_t cuda_getCublas(); + + /** + * Info: get information on cuda devices + * Return: success or error code + */ + int cuda_getDevices(); + + /** Get CUDA device count. + * Sets the number of devices on the platform that can use CUDA. + * Returns DKS_SUCCESS + */ + int cuda_getDeviceCount(int &ndev); + + /** Get the name of the device. + * QUery the device properties of the used device and set the string device_name + */ + int cuda_getDeviceName(std::string &device_name); + + /** Set CUDA device to use. 
+ * If device passed in is larger than the number of devices use the default:0 and return DKS_ERROR + */ + int cuda_setDevice(int device); + + /** Get unique devices + * Get array of indeces with the unique CUDA devices available on the paltform + */ + int cuda_getUniqueDevices(std::vector &devices); + + /** + * Info: init device + * Return: success or error code + */ + int cuda_setUp(); + + /** + * Info: allocate memory on cuda device + * Return: pointer to memory object + */ + void * cuda_allocateMemory(size_t size, int &ierr); + + /** + * Info: allocate host memory in pinned memory + * Return: success or error code + */ + template + int cuda_allocateHostMemory(T *&ptr, size_t size) { + cudaError cerror; + cerror = cudaMallocHost((void**)&ptr, sizeof(T)*size); + if (cerror != cudaSuccess) + return DKS_ERROR; + + return DKS_SUCCESS; + } + + /** + * Info: write data to memory + * Retrun: success or error code + */ + template + int cuda_writeData(T * mem_ptr, const void * in_data, size_t size, int offset = 0) { + cudaError cerror; + + cerror = cudaMemcpy(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error copying data to device, cuda error: " << cerror); + return DKS_ERROR; + } + + return DKS_SUCCESS; + } + + /** + * Info: write data assynchonuously + * Return: success or error code + */ + template + int cuda_writeDataAsync(T *mem_ptr, const void *in_data, size_t size, int streamId = -1, int offset = 0) { + cudaError cerror; + + //if default stream or no stream specified, use default write method + if (streamId == -1) { + cuda_writeData(mem_ptr, in_data, size, offset); + return DKS_SUCCESS; + } + + if (streamId < cuda_numberOfStreams()) { + //call async write + cerror = cudaMemcpyAsync(mem_ptr + offset, in_data, size, cudaMemcpyHostToDevice, + cuda_getStream(streamId)); + + if (cerror != cudaSuccess) { + DEBUG_MSG("Error async data copy, cuda error: " << cerror); + return DKS_ERROR; + } + } else { + 
DEBUG_MSG("Error invalid stream id: " << streamId); + return DKS_ERROR; + } + + + return DKS_SUCCESS; + } + + /** + * Info: read data from memory + * Return: success or error code + */ + template + int cuda_readData(const T * mem_ptr, void * out_data, size_t size, int offset = 0) { + cudaError cerror; + + cerror = cudaMemcpy(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error reading data from device"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + } + + /** + * Info: read data async from device memory + * Return: success or error code + */ + template + int cuda_readDataAsync(const T *mem_ptr, void *out_data, size_t size, int streamId = -1, int offset = 0) { + cudaError cerror; + + if (streamId == -1) { + cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, 0); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error async read from devie default stream"); + return DKS_ERROR; + } + return DKS_SUCCESS; + } + + if (streamId < cuda_numberOfStreams()) { + cerror = cudaMemcpyAsync(out_data, mem_ptr + offset, size, cudaMemcpyDeviceToHost, + cuda_getStream(streamId)); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error async read from device, cuda error: " << cerror); + return DKS_ERROR; + } + } else { + DEBUG_MSG("Error invalid stream id: " << streamId); + return DKS_ERROR; + } + + return DKS_SUCCESS; + } + + /** + * Info: free memory on device + * Return: success or error code + */ + int cuda_freeMemory(void * mem_ptr); + + /** + * Info: free page locked memory on host + * Return: success or erro code + */ + int cuda_freeHostMemory(void * mem_ptr); + + /** + * Info: allcate memory and write data (push) + * Return: pointer to memory object + */ + template + void * cuda_pushData(const void * in_data, size_t size, int &ierr) { + + void * mem_ptr; + mem_ptr = cuda_allocateMemory(size, ierr); + + if (ierr == DKS_SUCCESS) + ierr = cuda_writeData((T*)mem_ptr, in_data, size); + + return mem_ptr; + 
} + + /** + * Info: read data and free memory (pull) + * Return: success or error code + */ + template + int cuda_pullData(T * mem_ptr, void * out_data, size_t size, int &ierr) { + + ierr = cuda_readData(mem_ptr, out_data, size); + if (ierr == DKS_SUCCESS) + ierr = cuda_freeMemory(mem_ptr); + else + return DKS_ERROR; + + + if (ierr == DKS_SUCCESS) + return DKS_SUCCESS; + else + return DKS_ERROR; + } + + /** + * Info: execute function + * Return: success or error code + */ + int cuda_executeFunction(); + + /** + * Info: clean up + * Return: success or error code + */ + int cuda_cleanUp(); + + /** + * Info: sync cuda device + * Return: success or error code + */ + int cuda_syncDevice() { + cudaDeviceSynchronize(); + return DKS_SUCCESS; + } + + /** + * Page-lock host memory + */ + template + int cuda_hostRegister(T *ptr, int size) { + int cerr = cudaHostRegister(ptr, size*sizeof(T), cudaHostRegisterPortable); + if (cerr == cudaSuccess) { + return DKS_SUCCESS; + } else { + DEBUG_MSG("Host memroy was not page locked"); + return DKS_ERROR; + } + } + + /** + * Release page locked memory + */ + template + int cuda_hostUnregister(T *ptr) { + int cerr = cudaHostUnregister(ptr); + if (cerr == cudaSuccess) + return DKS_SUCCESS; + else + return DKS_ERROR; + + } + + /** + * Info: print device memory info (total, used, avail) + * Return: success or error code + */ + int cuda_memInfo() { + int ierr; + size_t avail; + size_t total; + double mb = 1000000.0; + + ierr = cudaMemGetInfo( &avail, &total); + + if (ierr != cudaSuccess) { + DEBUG_MSG("Device mem info could not be obtained!"); + return DKS_ERROR; + } + + std::cout << "Device memory info, total: " << total / mb << "MB,\t"; + std::cout << "used: " << (total - avail) / mb << "MB,\t"; + std::cout << "avail: " << avail / mb << "MB" << std::endl; + + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/CUDA/CudaChiSquare.cu b/src/CUDA/CudaChiSquare.cu new file mode 100644 index 0000000..db7f4f7 --- /dev/null +++ 
/**
 * Simple kernel version: one thread block per output element.
 *
 * blockIdx.x selects the column (bin) and blockIdx.y the row inside the
 * row-major arrays data/chisq, whose row length is n. No thread index is
 * used and there is no bounds check, so the launch grid has to match the
 * data layout exactly.
 *
 * par[0], par[1] are shared by all rows; each row reads four more values
 * starting at par[2 + 4*row].
 */
__global__ void kernelPHistoTFFcn(double *data, double *par, double *chisq,
                                  double fTimeResolution, double fRebin, int n) {

  const int bin = blockIdx.x;
  const int row = blockIdx.y;
  const int pos = row * n + bin;

  const double tau = 2.197019;

  // time at the centre of the rebinned bin
  const double dt0 = fTimeResolution * 0.5 * (fRebin - 1);
  const double time = dt0 + fTimeResolution * fRebin * bin;

  const double w = par[0]*0.08516155035269027;

  const double measured = data[pos];

  // 1.74532925199432955e-2 = pi/180
  const double theo = par[2 + row*4] * exp(-time/tau) * (1.0 + par[3 + row*4] * exp(-0.5 * pow(par[1]*time,2.0) ) * cos(w * time+par[4+row*4] * 1.74532925199432955e-2) ) + par[5+row*4];

  // empty bins fall back to theo^2 to avoid dividing by zero
  chisq[pos] = (measured != 0.0)
    ? (theo - measured) * (theo - measured) / measured
    : theo * theo;

}
/**
 * Per-bin term for the single-Gauss theory function.
 *
 * Each thread handles one bin index j (global thread id) and loops over all
 * sensors, writing one value per (sensor, bin) into result[i*length + j]:
 *   - bins before t0[i] + fGoodBinOffset/fRebin are written as 0
 *   - otherwise (theo - data) + data*log(data/theo) when both data and |theo|
 *     exceed 1e-9, else theo - data
 *
 * par holds numpar doubles: par[0], par[1] are common, followed by four
 * values per sensor starting at par[2 + 4*i]. They are staged in dynamic
 * shared memory, so the launch must provide at least numpar*sizeof(double)
 * bytes of dynamic shared memory.
 */
__global__ void kernelSingleGaussTF(double *data, unsigned int *t0, double *par, double *result,
                                    double fTimeResolution, double fRebin, double fGoodBinOffset,
                                    int length, int sensors, int numpar)
{

  //define shared variable for parameters
  extern __shared__ double p[];

  //get thread id and calc global id
  int tid = threadIdx.x;
  int j = blockIdx.x * blockDim.x + threadIdx.x;

  //load parameters from global to shared memory; stride by blockDim.x so that
  //all numpar values are copied even when numpar exceeds the block size
  for (int k = tid; k < numpar; k += blockDim.x)
    p[k] = par[k];

  //sync threads
  __syncthreads();

  if (j < length) {
    double dt0 = fTimeResolution*0.5*(fRebin - 1);
    //read from the shared copy, like the other kernels in this file
    double w1 = p[0]*0.08516155035269027;

    int idx;
    double ldata, lft0, theo, time;
    for (int i = 0; i < sensors; i++) {
      idx = i * length + j;
      lft0 = t0[i];
      if (j >= lft0 + fGoodBinOffset/fRebin) {
        ldata = data[idx];
        time = dt0 + fTimeResolution * fRebin* (j - lft0);
        theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0))
                                        *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4];
        // 1.74532925199432955e-2 = pi/180

        if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) )
          result[idx] = (theo - ldata) + ldata*log(ldata/theo);
        else
          result[idx] = theo - ldata;
      } else {
        result[idx] = 0;
      }
    }
  }

}
kernelDoubleLorentzTF(double *data, unsigned int *t0, double *par, double *result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int length, int sensors, int numpar) +{ + + //define shared variable for parameters + extern __shared__ double p[]; + + //get thread id and calc global id + int tid = threadIdx.x; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + //load parameters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync threads + __syncthreads(); + + if (j < length) { + double dt0 = fTimeResolution*0.5*(fRebin - 1); + double w1 = p[0]*0.08516155035269027; + double w2 = p[2]*0.08516155035269027; + + int idx; + double ldata, lft0, theo, time; + for (int i = 0; i < sensors; i++) { + + idx = i * length + j; + lft0 = t0[i]; + if (j >= lft0 + fGoodBinOffset/fRebin) { + ldata = data[idx]; + time = dt0+fTimeResolution*fRebin*(j-lft0); + + theo = p[4+i*5]*exp(-time/TAU)* + (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)* + cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+ + (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)* + cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5]; + // 1.74532925199432955e-2 = pi/180 + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + result[idx] = (theo - ldata) + ldata*log(ldata/theo); + else + result[idx] = theo - ldata; + } else { + result[idx] = 0; + } + } + } +} + + + +int CudaChiSquare::cuda_PHistoTFFcn(void *mem_data, void *mem_ptr, void *mem_chisq, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, + double &result) +{ + + int threads = 128; + int blocks = length / threads + 1; + + kernelPHistoTFFcn_3<<>>((double*)mem_data, + (double*)mem_ptr, + (double*)mem_chisq, + fTimeResolution, + fRebin, length, + sensors, numpar); + + + cublasStatus_t status; + status = cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_chisq, 1, &result); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("cublas asum failed"); + return DKS_ERROR; + } + + + return DKS_SUCCESS; 
+} + + +int CudaChiSquare::cuda_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + int threads = 128; + int blocks = length / threads + 1; + + kernelSingleGaussTF<<>>( (double*)mem_data, + (unsigned int*)mem_t0, + (double*)mem_par, + (double*)mem_result, + fTimeResolution, + fRebin, + fGoodBinOffset, + length, sensors, numpar); + + cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result); + result = 2.0 * result; + + + return DKS_SUCCESS; + +} + + +int CudaChiSquare::cuda_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + int threads = 128; + int blocks = length / threads + 1; + + kernelDoubleLorentzTF<<>>( (double*)mem_data, + (unsigned int*)mem_t0, + (double*)mem_par, + (double*)mem_result, + fTimeResolution, + fRebin, + fGoodBinOffset, + length, sensors, numpar); + + cublasDasum(m_base->cuda_getCublas(), sensors*length, (double*)mem_result, 1, &result); + result = 2.0 * result; + + + return DKS_SUCCESS; + +} diff --git a/src/CUDA/CudaChiSquare.cuh b/src/CUDA/CudaChiSquare.cuh new file mode 100644 index 0000000..588dec5 --- /dev/null +++ b/src/CUDA/CudaChiSquare.cuh @@ -0,0 +1,59 @@ +#ifndef H_CUDA_CHISQUARE +#define H_CUDA_CHISQUARE + +#include + +#include +#include + +#include "CudaBase.cuh" + +class CudaChiSquare { + +private: + + bool base_create; + CudaBase *m_base; + +public: + + /** + * Constructor which gets CudaBase as argument + */ + CudaChiSquare(CudaBase *base) { + m_base = base; + base_create = false; + } + + /* constructor */ + CudaChiSquare() { + m_base = new CudaBase(); + base_create = true; + } + + /* destructor */ + ~CudaChiSquare() { + if (base_create) + delete m_base; + } + + /* PHistoTFFcn calculation */ + 
/**
 * Build the program string handed to NVRTC.
 *
 * Reads the kernel source file OPENCL_KERNELS + "CUDA/NVRTCKernels/CudaChiSquareKernel.cu"
 * and appends the user supplied theory function wrapped as
 *   cudaFunctHeader + "return " + function + ";" + cudaFunctFooter
 *
 * If the kernel file cannot be opened or read, the problem is logged and the
 * returned string contains only the wrapped function (no kernel source).
 */
std::string CudaChiSquareRuntime::buildProgram(std::string function) {

  //get kernel source path; std::string avoids the fixed-size strcat buffer
  const std::string kernel_file =
    std::string(OPENCL_KERNELS) + "CUDA/NVRTCKernels/CudaChiSquareKernel.cu";

  std::string kernel_string;

  //read kernels from file
  FILE *fp = fopen(kernel_file.c_str(), "rb");
  if (!fp) {
    //do not touch a null FILE*; fall through with an empty kernel source
    DEBUG_MSG("Can't open kernel file" << kernel_file);
  } else {
    //get file size
    long fsize = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
      fsize = ftell(fp);

    //read file content straight into the string, no manual buffers to leak
    if (fsize > 0) {
      rewind(fp);
      kernel_string.resize(fsize);
      size_t nread = fread(&kernel_string[0], 1, kernel_string.size(), fp);
      //keep only what was actually read
      kernel_string.resize(nread);
    }

    fclose(fp);
  }

  return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter;
}
/**
 * Launch the chi-square kernel selected by fitType from the loaded module
 * and sum the per-bin results with cublasDasum.
 *
 * fitType             - FITTYPE_SINGLE_HISTO or FITTYPE_ASYMMETRY
 *                       (FITTYPE_MU_MINUS is not implemented)
 * mem_data, mem_err   - device buffers passed to the kernel
 * length              - number of elements summed from mem_chisq_m
 * numpar/numfunc/nummap - sizes used for the kernel arguments and for the
 *                       dynamic shared memory request
 * timeStart, timeStep - passed through to the kernel
 * result              - receives the cublasDasum of mem_chisq_m
 *
 * Return: DKS_SUCCESS, or DKS_ERROR if init was not done, the fit type is
 * unsupported, the kernel cannot be found/launched or the cublas sum fails.
 */
int CudaChiSquareRuntime::launchChiSquare(int fitType,
                                          void *mem_data, void *mem_err, int length,
                                          int numpar, int numfunc, int nummap,
                                          double timeStart, double timeStep, double &result)
{

  if (!initDone_m) {
    DEBUG_MSG("ChiSquare init needs to be called at some point!");
    return DKS_ERROR;
  }

  int threads = blockSize_m;
  //numBlocks_m < 0 means "derive the grid size from the data length"
  int blocks = (numBlocks_m < 0) ? length / threads + 1 : numBlocks_m;

  //kernel argument list lives on the stack, so no error path can leak it;
  //the first 12 arguments are common to both kernels
  void *args[15] = {
    &mem_data, &mem_err, &mem_param_m, &mem_chisq_m, &mem_map_m, &mem_func_m,
    &length, &numpar, &numfunc, &nummap, &timeStart, &timeStep,
    NULL, NULL, NULL
  };

  const char *kernelName = NULL;
  if (fitType == FITTYPE_SINGLE_HISTO) {
    kernelName = "kernelChiSquareSingleHisto";
    args[12] = &tau_m;
    args[13] = &N0_m;
    args[14] = &bkg_m;
  } else if (fitType == FITTYPE_ASYMMETRY) {
    kernelName = "kernelChiSquareAsymmetry";
    args[12] = &alpha_m;
    args[13] = &beta_m;
  } else if (fitType == FITTYPE_MU_MINUS) {
    DEBUG_MSG("Not Yet Implemented!");
    return DKS_ERROR;
  } else {
    DEBUG_MSG("Undefined Fit Type!");
    return DKS_ERROR;
  }

  CUresult cuStatus = cuModuleGetFunction(&kernel_m, module_m, kernelName);
  if (cuStatus != CUDA_SUCCESS) {
    DEBUG_MSG("Failed to get function from module!");
    return DKS_ERROR;
  }

  cuStatus = cuLaunchKernel(kernel_m,
                            blocks, 1, 1,
                            threads, 1, 1,
                            (numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL,
                            args, 0);

  if (cuStatus != CUDA_SUCCESS) {
    std::string msg;
    msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")";
    DEBUG_MSG(msg);
    //desc is only valid when the lookup itself succeeds
    const char *desc = NULL;
    if (cuGetErrorString(cuStatus, &desc) == CUDA_SUCCESS && desc != NULL)
      std::cout << desc << std::endl;
    return DKS_ERROR;
  }

  cublasStatus_t status;
  status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("cublas sum failed!");
    return DKS_ERROR;
  }

  return DKS_SUCCESS;

}
/**
 * Release everything initChiSquare() created: the cuBLAS handle and the
 * device buffers for chisq, params, functions and maps.
 *
 * All resources are released even if one of the steps fails, and
 * initDone_m is cleared afterwards so a later initChiSquare() starts clean.
 *
 * Return: DKS_SUCCESS if every release succeeded; DKS_ERROR if any release
 * failed or if nothing was initialized (unchanged from before).
 */
int CudaChiSquareRuntime::freeChiSquare() {
  if (!initDone_m)
    return DKS_ERROR;

  int ierr = DKS_SUCCESS;

  //delete cublas; keep going on failure so the device buffers are not leaked
  cublasStatus_t status = cublasDestroy(defaultCublasRT);
  if (status != CUBLAS_STATUS_SUCCESS) {
    DEBUG_MSG("CUBLAS delete default handle failed!");
    ierr = DKS_ERROR;
  }

  //free memory, remembering any failure rather than only the last status
  void *buffers[] = {mem_chisq_m, mem_param_m, mem_func_m, mem_map_m};
  for (int i = 0; i < 4; i++) {
    if (m_base->cuda_freeMemory(buffers[i]) != DKS_SUCCESS)
      ierr = DKS_ERROR;
  }

  initDone_m = false;

  return ierr;
}
+ * Add function string to the calcFunction kernel and compile the program + * Function must be valid C math expression. Parameters can be addressed in + * a form par[map[idx]] + */ + int compileProgram(std::string function, bool mlh = false); + + /** Launch selected kernel + * Launched the selected kernel from the compiled code. + * Result is put in &result variable + */ + int launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result); + + /** Write params to device. + * Write params from double array to mem_param_m memory on the device. + */ + int writeParams(const double *params, int numparams); + + /** Write functions to device. + * Write function values from double array to mem_func_m memory on the device. + */ + int writeFunc(const double *func, int numfunc); + + /** Write maps to device. + * Write map values from int array to mem_map_m memory on the device. + */ + int writeMap(const int *map, int nummap); + + /** Allocate temporary memory needed for chi square. + * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to + * the maximum number of elements in any datasets that will be used for calculations. Size_param, + * size_func and size_map are the maximum number of parameters, functions and maps used in + * calculations. + */ + int initChiSquare(int size_data, int size_param, int size_func, int size_map); + + + /** Free temporary memory allocated for chi square. + * Frees the chisq temporary memory and memory for params, functions and maps + */ + int freeChiSquare(); + + /** Check if CUDA device is able to run the chi square kernel. 
+ * Redundant - all new CUDA devices that support RT compilation will also support + * double precision, there are no other requirements to run chi square on GPU + */ + int checkChiSquareKernels(int fitType, int &threadsPerBlock) { + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/CUDA/CudaCollimatorPhysics.cu b/src/CUDA/CudaCollimatorPhysics.cu new file mode 100644 index 0000000..495ad59 --- /dev/null +++ b/src/CUDA/CudaCollimatorPhysics.cu @@ -0,0 +1,728 @@ +#include "CudaCollimatorPhysics.cuh" + +//#define M_P 0.93827231e+00 +#define M_P 0.93827204e+00 +#define C 299792458.0 +#define PI 3.14159265358979323846 +#define AVO 6.022e23 +#define R_E 2.81794092e-15 +//#define eM_E 0.51099906e-03 +#define eM_E 0.51099892e-03 +#define Z_P 1 +#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 + +#define POSITION 0 +#define ZSIZE 1 +#define RHO_M 2 +#define Z_M 3 +#define A_M 4 +#define A2_C 5 +#define A3_C 6 +#define A4_C 7 +#define A5_C 8 +#define X0_M 9 +#define I_M 10 +#define DT_M 11 + +#define BLOCK_SIZE 128 +#define NUMPAR 12 + +__device__ inline double dot(double3 &d1, double3 &d2) { + + return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); + +} + +__device__ inline bool checkHit(double &z, double *par) { + + /* check if particle is in the degrader material */ + return ( (z > par[POSITION]) && ( z <= par[POSITION] + par[ZSIZE]) ); + +} + + +__device__ inline void energyLoss(double &Eng, bool &pdead, curandState &state, double *par) +{ + + volatile double dEdx = 0.0; + + volatile double gamma = (Eng + M_P) / M_P; + volatile double gamma2 = gamma * gamma; + + double beta = sqrt(1.0 - 1.0 / gamma2); + volatile double beta2 = beta * beta; + + double deltas = par[DT_M] * beta * C; + volatile double deltasrho = deltas * 100 * par[RHO_M]; + volatile double sigma_E = sqrt(K * eM_E * par[RHO_M] * (par[Z_M] / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + double Ts = (Eng * 1E6) / 1.0073; + double epsilon_low = par[A2_C] * pow(Ts, 0.45); + double 
/**
 * Rotate the momentum components (px, pz) by the angle thetacou and, depending
 * on coord, shift the matching position components (x, z).
 *
 * coord == 1: x and z receive the xplane displacement, x also the drift
 *             deltas * px / normP
 * coord == 2: as coord == 1, and z additionally drifts by deltas * pz / normP
 * any other value: positions are left untouched, only the momentum is rotated
 *
 * The position update uses the incoming px/pz; they are overwritten last.
 * par is not used by this function.
 */
__device__ inline void Rot(double &px, double &pz, double &x, double &z, double &xplane,
                           double &normP, double &thetacou, double &deltas, int coord,
                           double *par)
{
  // direction angle of (px, pz) with the quadrant-dependent offset
  double psi = atan(px/pz);
  if (px>=0 && pz>=0) {
    // no offset in this quadrant
  } else if (px<0 && pz>0) {
    psi = psi + 2*PI;
  } else {
    psi = psi + PI;
  }

  const double sinPsi = sin(psi);
  const double cosPsi = cos(psi);
  const double pxz = sqrt(px*px + pz*pz);

  switch (coord) {
  case 1:
    x = x + deltas * px/normP + xplane*cosPsi;
    z = z - xplane * sinPsi;
    break;
  case 2:
    x = x + deltas * px/normP + xplane*cosPsi;
    z = z - xplane * sinPsi + deltas * pz / normP;
    break;
  default:
    break;
  }

  const double sinTheta = sin(thetacou);
  const double cosTheta = cos(thetacou);

  px = pxz*cosPsi*sinTheta + pxz*sinPsi*cosTheta;
  pz = -pxz*sinPsi*sinTheta + pxz*cosPsi*cosTheta;
}
log(deltas / par[X0_M])); + + // x-direction: See Physical Review, "Multiple Scattering" + double z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + //__syncthreads(); + + double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.x, P.z, R.x, R.z, xplane, normP, thetacou, deltas, 1, par); + + double P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + Rot(P.x,P.z,R.x,R.z, xplane, normP, thetaru, deltas, 0, par); + } + + // y-direction: See Physical Review, "Multiple Scattering" + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + //__syncthreads(); + + double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + Rot(P.y,P.z,R.y,R.z, yplane, normP, thetacou, deltas, 2, par); + + P2 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + 
Rot(P.y,P.z,R.y,R.z, yplane, normP, thetaru, deltas, 0, par); + } + +} + + +template +__global__ void kernelCollimatorPhysics(T *data, double *par, curandState *state, + int numparticles) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + //transfer params to shared memory + extern __shared__ double smem[]; + double *p = (double*)smem; + double3 *R = (double3*)&smem[NUMPAR]; + + curandState s; + double3 P; + + for (int tt = tid; tt < NUMPAR; tt += blockDim.x) + p[tt] = par[tt]; + + __syncthreads(); + + if (idx < numparticles) { + s = state[idx]; + R[tid] = data[idx].Rincol; + P = data[idx].Pincol; + + bool pdead = false; + volatile double sq = sqrt(1.0 + dot(P, P)); + + double Eng; + + if (checkHit(R[tid].z, p)) { + + Eng = (sq - 1) * M_P; + energyLoss(Eng, pdead, s, p); + + if (!pdead) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(P, P)); + + P.x = P.x * ptot / sq; + P.y = P.y * ptot / sq; + P.z = P.z * ptot / sq; + coulombScat(R[tid], P, s, p); + + data[idx].Pincol = P; + } else { + data[idx].label = -1; + } + + state[idx] = s; + } else { + + R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; + R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; + R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; + data[idx].label = -2; + + } + + data[idx].Rincol = R[tid]; + } + +} + +__global__ void kernelCollimatorPhysics2(CUDA_PART2_SMALL data, double *par, + curandState *state, int numparticles) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + //transfer params to shared memory + __shared__ double p[NUMPAR]; + __shared__ double3 R[BLOCK_SIZE]; + + if (tid < NUMPAR) + p[tid] = par[tid]; + + __syncthreads(); + + curandState s; + double3 P; + if (idx < numparticles) { + R[tid] = data.Rincol[idx]; + P = data.Pincol[idx]; + s = state[idx]; + + double sq = sqrt(1.0 + dot(P, P)); + bool pdead = false; + + 
if (checkHit(R[tid].z, p)) { + + double Eng = (sq - 1) * M_P; + energyLoss(Eng, pdead, s, p); + + if (!pdead) { + + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(P, P)); + P.x = P.x * ptot / sq; + P.y = P.y * ptot / sq; + P.z = P.z * ptot / sq; + coulombScat(R[tid], P, s, p); + + data.Pincol[idx] = P; + } else { + data.label[idx] = -1; + } + + } else { + R[tid].x = R[tid].x + p[DT_M] * C * P.x / sq; + R[tid].y = R[tid].y + p[DT_M] * C * P.y / sq; + R[tid].z = R[tid].z + p[DT_M] * C * P.z / sq; + + data.label[idx] = -2; + } + + data.Rincol[idx] = R[tid]; + state[idx] = s; + } + +} + + +inline __device__ void unitlessOff(double3 &a, const double &c) { + a.x *= c; + a.y *= c; + a.z *= c; +} + +inline __device__ void unitlessOn(double3 &a, const double &c) { + a.x /= c; + a.y /= c; + a.z /= c; +} + +//switch to unitless positions: divide R and X by the common factor dtc +__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + + unitlessOn(R, dtc); + unitlessOn(X, dtc); + + gR[idx] = R; + gX[idx] = X; + } + +} + +//switch to unitless positions: divide R and X by the per-particle factor gdt[idx]*c +__global__ void kernelSwitchToUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + double dt = gdt[idx]; + + unitlessOn(R, dt*c); + unitlessOn(X, dt*c); + + gR[idx] = R; + gX[idx] = X; + } +} + +//switch off unitless positions: multiply R and X by the common factor dtc +__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double dtc, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + + unitlessOff(R, dtc); + unitlessOff(X, dtc); + + gR[idx] = R; + gX[idx] = X; + } + +} + +//switch off unitless 
positions with dt*c +__global__ void kernelSwitchOffUnitlessPositions(double3 *gR, double3 *gX, double *gdt, double c, int npart) { + + volatile int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < npart) { + double3 R = gR[idx]; + double3 X = gX[idx]; + double dt = gdt[idx]; + + unitlessOff(R, dt*c); + unitlessOff(X, dt*c); + + gR[idx] = R; + gX[idx] = X; + } +} + + +__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double dtc) { + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + if (idx < npart) { + + double3 R = gR[idx]; + double3 P = gP[idx]; + + //switch to unitless positions + unitlessOn(R, dtc); + + //push + double tmp = sqrt(1.0 + dot(P, P)); + R.x += 0.5 * P.x / tmp; + R.y += 0.5 * P.y / tmp; + R.z += 0.5 * P.z / tmp; + + //switch off unitless positions with dt*c + unitlessOff(R, dtc); + + gR[idx] = R; + } +} + + +__global__ void kernelPush(double3 *gR, double3 *gP, int npart, double *gdt, double c) { + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + if (idx < npart) { + + double3 R = gR[idx]; + double3 P = gP[idx]; + double dt = gdt[idx]; + + //switch to unitless positions with dt*c + unitlessOn(R, dt*c); + + R.x += 0.5 * P.x / sqrt(1.0 + dot(P, P)); + R.y += 0.5 * P.y / sqrt(1.0 + dot(P, P)); + R.z += 0.5 * P.z / sqrt(1.0 + dot(P, P)); + + //switch off unitless positions with dt*c + unitlessOff(R, dt*c); + + gR[idx] = R; + } +} + +//TODO: kernel for push with switch off unitless positions with dt[i]*c + +__device__ double3 deviceTransformTo(const double3 &vec, const double3 &ori) { + + const double sina = sin(ori.x); + const double cosa = cos(ori.x); + const double sinb = sin(ori.y); + const double cosb = cos(ori.y); + const double sinc = sin(ori.z); + const double cosc = cos(ori.z); + + double3 temp; + temp.x = 0.0; + temp.y = 0.0; + temp.z = 0.0; + + temp.x = (cosa * cosc) * vec.x + 
(cosa * sinc) * vec.y - sina * vec.z; + temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x + + (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z; + temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x + + (sinb * cosc + sina * cosb * sinc) * vec.y + cosa * cosb * vec.z; + + return temp; + +} + +__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient, + int npart, int nsect, double dtc) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + + if (idx < npart) { + + double3 X = gX[idx]; + double3 P = gP[idx]; + long lLastSection = gLastSection[idx]; + + double3 ori; + if (lLastSection > -1 && lLastSection < nsect) { + ori = gOrient[lLastSection]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + double3 tmp = deviceTransformTo(P, ori); + + unitlessOn(X, dtc); + + X.x += 0.5 * tmp.x / sqrt(1.0 + dot(tmp, tmp)); + X.y += 0.5 * tmp.y / sqrt(1.0 + dot(tmp, tmp)); + X.z += 0.5 * tmp.z / sqrt(1.0 + dot(tmp, tmp)); + + unitlessOff(X, dtc); + + gX[idx] = X; + } + +} + +__global__ void kernelPushTransform(double3 *gX, double3 *gP, long *gLastSection, double3* gOrient, + int npart, int nsect, double *gdt, double c) +{ + + //get global id and thread id + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + + + if (idx < npart) { + + double3 X = gX[idx]; + double3 P = gP[idx]; + long lLastSection = gLastSection[idx]; + double dt = gdt[idx]; + + double3 ori; + if (lLastSection > -1 && lLastSection < nsect) { + ori = gOrient[lLastSection]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + double3 tmp = deviceTransformTo(P, ori); + + unitlessOn(X, dt*c); + + X.x += 0.5 * tmp.x / sqrt(1.0 + dot(tmp, tmp)); + X.y += 0.5 * tmp.y / sqrt(1.0 + dot(tmp, tmp)); + X.z += 0.5 * tmp.z / sqrt(1.0 + dot(tmp, tmp)); + + unitlessOff(X, dt*c); + + gX[idx] = X; + } + +} + +struct compare_particle +{ + int 
threshold; + + compare_particle() { + threshold = 0; + } + + void set_threshold(int t) { + threshold = t; + } + + __host__ __device__ + bool operator()(CUDA_PART p1, CUDA_PART p2) { + return p1.label > p2.label; + } + + __host__ __device__ + bool operator()(CUDA_PART p1) { + return p1.label < threshold; + } +}; + + +struct compare_particle_small +{ + int threshold; + + compare_particle_small() { + threshold = 0; + } + + void set_threshold(int t) { + threshold = t; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1, CUDA_PART_SMALL p2) { + return p1.label > p2.label; + } + + __host__ __device__ + bool operator()(CUDA_PART_SMALL p1) { + return p1.label < threshold; + } +}; + + +struct less_then +{ + __host__ __device__ + bool operator()(int x) + { + return x < 0; + } +}; + +int CudaCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) +{ + + int threads = BLOCK_SIZE; + int blocks = numparticles / threads + 1; + + //calc shared memory size + int smem_size = sizeof(double)*NUMPAR + sizeof(double3)*BLOCK_SIZE; + + //call kernel + kernelCollimatorPhysics<<>>((CUDA_PART_SMALL*)mem_ptr, + (double*)par_ptr, + m_base->cuda_getCurandStates(), + numparticles); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + std::cout << "Err2: " << cudaGetErrorString(err) << std::endl; + + return DKS_SUCCESS; + +} + +int CudaCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles, + int &numaddback) +{ + + //wrap mem_ptr with thrust device ptr + thrust::device_ptr dev_ptr( (CUDA_PART_SMALL*)mem_ptr); + + //count -2 and -1 particles + compare_particle_small comp; + comp.set_threshold(0); + numaddback = thrust::count_if(dev_ptr, dev_ptr + numparticles, comp); + + //sort particles + if (numaddback > 0) + thrust::sort(dev_ptr, dev_ptr + numparticles, comp); + + return DKS_SUCCESS; +} + +int CudaCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, bool 
usedt, + int streamId) +{ + + int threads = BLOCK_SIZE; + int blocks = npart / threads + 1; + + //call kernel + if (!usedt) { + if (streamId == -1) { + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, dt*c); + } + } else { + if (streamId == -1) { + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, + (double*)dt_ptr, c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPush<<>>((double3*)r_ptr, (double3*)p_ptr, npart, + (double*)dt_ptr, c); + } + } + + + return DKS_SUCCESS; +} + +int CudaCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, + void *dt_ptr, double dt, + double c, bool usedt, + int streamId) +{ + + int threads = BLOCK_SIZE; + int blocks = npart / threads + 1; + int smem = sizeof(double3) * nsec; + + //call kernel + if (!usedt) { + if (streamId == -1) { + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, dt*c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, dt*c); + } + } else { + if (streamId == -1) { + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, (double*)dt_ptr, c); + } else { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelPushTransform<<>>((double3*)x_ptr, (double3*)p_ptr, + (long*)lastSec_ptr, (double3*)orient_ptr, + npart, nsec, (double*)dt_ptr, c); + } + } + + return DKS_SUCCESS; +} + + + diff --git a/src/CUDA/CudaCollimatorPhysics.cuh b/src/CUDA/CudaCollimatorPhysics.cuh new file mode 100644 index 0000000..9808f33 --- /dev/null +++ b/src/CUDA/CudaCollimatorPhysics.cuh @@ -0,0 +1,155 @@ +#ifndef 
H_CUDA_COLLIMATORPHYSICS +#define H_CUDA_COLLIMATORPHYSICS + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include "../Algorithms/CollimatorPhysics.h" +#include "CudaBase.cuh" + +/** + * Structure for storing particle on GPU + */ +typedef struct __align__(16) { + int label; + unsigned localID; + double3 Rincol; + double3 Pincol; + long IDincol; + int Binincol; + double DTincol; + double Qincol; + long LastSecincol; + double3 Bfincol; + double3 Efincol; +} CUDA_PART; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int label; + unsigned localID; + double3 Rincol; + double3 Pincol; +} CUDA_PART_SMALL; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int *label; + unsigned *localID; + double3 *Rincol; + double3 *Pincol; + long *IDincol; + int *Binincol; + double *DTincol; + double *Qincol; + long *LastSecincol; + double3 *Bfincol; + double3 *Efincol; +} CUDA_PART2; + +/** + * Structure for storing particle on GPU + */ +typedef struct { + int *label; + unsigned *localID; + double3 *Rincol; + double3 *Pincol; +} CUDA_PART2_SMALL; + +/** CudaCollimatorPhysics class. + * Contains kerenls that execute CollimatorPhysics functions form OPAL. + * For detailed documentation on CollimatorPhysics functions see OPAL documentation + */ +class CudaCollimatorPhysics : public DKSCollimatorPhysics{ + +private: + + bool base_create; + CudaBase *m_base; + +public: + + /** Constructor with CudaBase argument + * + */ + CudaCollimatorPhysics(CudaBase *base) { + m_base = base; + base_create = false; + } + + /** Constructor - empty. */ + CudaCollimatorPhysics() { + m_base = new CudaBase(); + base_create = true; + } + + /** Destructor - empty */ + ~CudaCollimatorPhysics() { + if (base_create) + delete m_base; + }; + + /** Execute collimator physics kernel. 
+ * + */ + int CollimatorPhysics(void *mem_ptr, void *par_ptr, + int numpartices); + + int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) + { + return DKS_ERROR; + } + + /** Sort particle array on GPU. + * Count particles that are dead (label -1) or leaving material (label -2) and sort particle + * array so these particles are at the end of array + */ + int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) + { + return DKS_ERROR; + } + + /** BorisPusher push function for integration from OPAL. + * ParallelTTracker integration from OPAL implemented in cuda. + * For more details see ParallelTTracler docomentation in opal + */ + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1); + + /** BorisPusher push function with transformto function form OPAL + * ParallelTTracker integration from OPAL implemented in cuda. 
+ * For more details see ParallelTTracler docomentation in opal + */ + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + +}; + +#endif diff --git a/src/CUDA/CudaFFT.cu b/src/CUDA/CudaFFT.cu new file mode 100644 index 0000000..88e45ca --- /dev/null +++ b/src/CUDA/CudaFFT.cu @@ -0,0 +1,376 @@ +#include "CudaFFT.cuh" + +__global__ void normalize(cufftDoubleComplex *in, int N) { + + int id = blockIdx.x; //*blockDim.x + threadIdx.x; + if (id < N) { + in[id].x = in[id].x / N; + in[id].y = in[id].y / N; + } + +} + +CudaFFT::CudaFFT(CudaBase *base) { + m_base = base; + base_create = false; +} + +/* constructor */ +CudaFFT::CudaFFT() { + m_base = new CudaBase(); + base_create = true; +} + +/* destructor */ +CudaFFT::~CudaFFT() { + if (base_create) + delete m_base; +} + +/* + Info: execute fft using cufft library + Return: success or error code +*/ +int CudaFFT::executeFFT(void * mem_ptr, int ndim, int N[3], int streamId, bool forward) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanZ2Z; + } else { + switch (ndim) { + case 1: + cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2Z, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2Z); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2Z); + break; + default: + cresult = CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + if (cresult == CUFFT_SETUP_FAILED) + DEBUG_MSG("Setup failed"); + + if (cresult == CUFFT_INVALID_SIZE) + DEBUG_MSG("Invalid size"); + + if (cresult == CUFFT_INVALID_TYPE) + DEBUG_MSG("Invalid type"); + + if (cresult == CUFFT_ALLOC_FAILED) + DEBUG_MSG("Alloc failed"); + + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + 
cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute in place FFT on created plan + if (forward) { + cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr, + (cufftDoubleComplex*)mem_ptr, CUFFT_FORWARD); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + if (!useDefaultPlan(ndim, N)) cufftDestroy(plan); //the shared default plan must stay valid for later calls + return DKS_ERROR; + } + } else { + cresult = cufftExecZ2Z(plan, (cufftDoubleComplex*)mem_ptr, + (cufftDoubleComplex*)mem_ptr, CUFFT_INVERSE); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing ifft, cuda error: " << cresult); + if (!useDefaultPlan(ndim, N)) cufftDestroy(plan); //the shared default plan must stay valid for later calls + return DKS_ERROR; + } + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) + cufftDestroy(plan); + return DKS_SUCCESS; +} + +/* + Info: execute ifft + Return: success or error code +*/ +int CudaFFT::executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId) { + return executeFFT(mem_ptr, ndim, N, streamId, false); +} + +/* + Info: execute normalize (scale by 1/(N[0]*N[1]*N[2])) using cublasZscal + Return: success or error code +*/ +int CudaFFT::normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId) { + + cublasStatus_t status; + unsigned int size = N[0]*N[1]*N[2]; + cuDoubleComplex alpha = make_cuDoubleComplex(1.0/size, 0); + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId)); + + status = cublasZscal(defaultCublasFFT, size, &alpha, (cuDoubleComplex*)mem_ptr, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS exec Zscal failed!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +/* + Info: execute real to complex double precision FFT + Return: success or error code +*/ +int CudaFFT::executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanD2Z; + } else { + switch (ndim) { + case 1: + cresult = 
cufftPlan1d(&plan, N[0], CUFFT_D2Z, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_D2Z); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_D2Z); + break; + default: + cresult = CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cresult = cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute perform in place FFT on created plan + cresult = cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr, (cufftDoubleComplex*)comp_ptr); + + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + if (cresult == CUFFT_INVALID_PLAN) + DEBUG_MSG("invalid plan"); + if (cresult == CUFFT_INVALID_VALUE) + DEBUG_MSG("invalid value"); + if (cresult == CUFFT_INTERNAL_ERROR) + DEBUG_MSG("internal error"); + if (cresult == CUFFT_EXEC_FAILED) + DEBUG_MSG("exec failed"); + if (cresult == CUFFT_SETUP_FAILED) + DEBUG_MSG("setup failed"); + + return DKS_ERROR; + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) { + cresult = cufftDestroy(plan); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + return DKS_SUCCESS; +} + +/* + Info: exectue complex to real double precision FFT + Return: success or error code +*/ +int CudaFFT::executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId) { + + //create fft plan + cufftResult cresult; + cufftHandle plan; + + if (useDefaultPlan(ndim, N)) { + plan = defaultPlanZ2D; + } else { + switch (ndim) { + case 1: + cresult = cufftPlan1d(&plan, N[0], CUFFT_Z2D, 1); + break; + case 2: + cresult = cufftPlan2d(&plan, N[1], N[0], CUFFT_Z2D); + break; + case 3: + cresult = cufftPlan3d(&plan, N[2], N[1], N[0], CUFFT_Z2D); + break; + default: + cresult = 
CUFFT_SUCCESS; + break; + } + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cufftSetStream(plan, m_base->cuda_getStream(streamId)); + else + cufftSetStream(plan, 0); + + //execute complex to real FFT on created plan + cresult = cufftExecZ2D(plan, (cufftDoubleComplex*)comp_ptr, (cufftDoubleReal*)real_ptr); + + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error executing fft, cuda error: " << cresult); + if (!useDefaultPlan(ndim, N)) cufftDestroy(plan); //the shared default plan must stay valid for later calls + return DKS_ERROR; + } + + //clean up resources + if (!useDefaultPlan(ndim, N)) { + cresult = cufftDestroy(plan); + if (cresult != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying cufft plan, cuda error: " << cresult); + return DKS_ERROR; + } + } + return DKS_SUCCESS; +} + +/* + Info: execute normalize (scale by 1/(N[0]*N[1]*N[2])) for complex to real iFFT using cublasDscal + Return: success or error code +*/ +int CudaFFT::normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId) { + cublasStatus_t status; + unsigned int size = N[0]*N[1]*N[2]; + double alpha = 1.0/size; + + if (streamId != -1 && streamId < m_base->cuda_numberOfStreams()) + cublasSetStream(defaultCublasFFT, m_base->cuda_getStream(streamId)); + + status = cublasDscal(defaultCublasFFT, size, &alpha, (double*)real_ptr, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS exec Dscal failed!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +} + +/* + Info: init cufftPlans which can be reused for all FFTs of the same size and type + Return: success or error code +*/ +int CudaFFT::setupFFT(int ndim, int N[3]) { + + cufftResult cr1 = CUFFT_SUCCESS; + cufftResult cr2 = CUFFT_SUCCESS; + cufftResult cr3 = CUFFT_SUCCESS; + + //create default fft plans + if (ndim == 1) { + cr1 = cufftPlan1d(&defaultPlanZ2Z, N[0], CUFFT_Z2Z, 1); + cr2 = cufftPlan1d(&defaultPlanD2Z, N[0], CUFFT_D2Z, 1); + cr3 = cufftPlan1d(&defaultPlanZ2D, N[0], CUFFT_Z2D, 1); + } + + if (ndim == 2) { + cr1 = 
cufftPlan2d(&defaultPlanZ2Z, N[1], N[0], CUFFT_Z2Z); + cr2 = cufftPlan2d(&defaultPlanD2Z, N[1], N[0], CUFFT_D2Z); + cr3 = cufftPlan2d(&defaultPlanZ2D, N[1], N[0], CUFFT_Z2D); + } + + if (ndim == 3) { + cr1 = cufftPlan3d(&defaultPlanZ2Z, N[2], N[1], N[0], CUFFT_Z2Z); + cr2 = cufftPlan3d(&defaultPlanD2Z, N[2], N[1], N[0], CUFFT_D2Z); + cr3 = cufftPlan3d(&defaultPlanZ2D, N[2], N[1], N[0], CUFFT_Z2D); + } + + if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) { + DEBUG_MSG("Error creating default plan"); + return DKS_ERROR; + } + + //create cublas plan + cublasStatus_t status; + status = cublasCreate(&defaultCublasFFT); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS create default handle failed!"); + return DKS_ERROR; + } + //std::cout << "cublas created" << std::endl; + + defaultNdim = ndim; + if (ndim > 0) { + defaultN[0] = N[0]; + defaultN[1] = N[1]; + defaultN[2] = N[2]; + } + + return DKS_SUCCESS; + +} + +/* + Info: destroy default FFT plans + Return: success or error code +*/ +int CudaFFT::destroyFFT() { + + cufftResult cr1 = CUFFT_SUCCESS; + cufftResult cr2 = CUFFT_SUCCESS; + cufftResult cr3 = CUFFT_SUCCESS; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + if (defaultNdim > 0) { + //clean up resources + cr1 = cufftDestroy(defaultPlanZ2Z); + cr2 = cufftDestroy(defaultPlanD2Z); + cr3 = cufftDestroy(defaultPlanZ2D); + + if (cr1 != CUFFT_SUCCESS || cr2 != CUFFT_SUCCESS || cr3 != CUFFT_SUCCESS) { + DEBUG_MSG("Error destroying default cufft plans"); + return DKS_ERROR; + } + + } + + if (defaultNdim > -1) { + status = cublasDestroy(defaultCublasFFT); + if (status != CUBLAS_STATUS_SUCCESS) { + DEBUG_MSG("CUBLAS delete default handle failed!"); + return DKS_ERROR; + } + } + + defaultN[0] = -1; + defaultN[1] = -1; + defaultN[2] = -1; + defaultNdim = -1; + return DKS_SUCCESS; + +} + + + diff --git a/src/CUDA/CudaFFT.cuh b/src/CUDA/CudaFFT.cuh new file mode 100644 index 0000000..0c22f2c --- /dev/null +++ b/src/CUDA/CudaFFT.cuh @@ 
-0,0 +1,88 @@ +#ifndef H_CUDA_FFT +#define H_CUDA_FFT + +#include +#include +#include +#include +#include "cublas_v2.h" + +#include "../Algorithms/FFT.h" +#include "CudaBase.cuh" + +class CudaFFT : public DKSFFT{ + +private: + + bool base_create; + CudaBase *m_base; + + cufftHandle defaultPlanZ2Z; + cufftHandle defaultPlanD2Z; + cufftHandle defaultPlanZ2D; + cublasHandle_t defaultCublasFFT; + +public: + + /** Constructor with CudaBase as argument */ + CudaFFT(CudaBase *base); + + /** constructor */ + CudaFFT(); + + /** destructor */ + ~CudaFFT(); + + /** + * Info: init cufftPlans witch can be reused for all FFTs of the same size and type + * Return: success or error code + */ + int setupFFT(int ndim, int N[3]); + int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; } + + /** + * Info: destroy default FFT plans + * Return: success or error code + */ + int destroyFFT(); + + /* + Info: execute complex to complex double precision fft using cufft library + Return: success or error code + */ + int executeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); + + /* + Info: execute ifft + Return: success or error code + */ + int executeIFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute normalize using cuda kernel for complex to complex iFFT + Return: success or error code + */ + int normalizeFFT(void * mem_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute real to complex double precision FFT + Return: success or error code + */ + int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: exectue complex to real double precision FFT + Return: success or error code + */ + int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3], int streamId = -1); + + /* + Info: execute normalize for complex to real iFFT + Return: success or error code 
+ */ + int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1); + +}; + +#endif diff --git a/src/CUDA/CudaGreensFunction.cu b/src/CUDA/CudaGreensFunction.cu new file mode 100644 index 0000000..140954b --- /dev/null +++ b/src/CUDA/CudaGreensFunction.cu @@ -0,0 +1,469 @@ +#include "CudaGreensFunction.cuh" + +__global__ void kernelTmpgreen(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ) { + + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgreen[i + j * NI + k * NI * NJ] = tmpgrn / cellVolume; + +} + +__global__ void kernelTmpgreen_2(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, int NI, int NJ, int NK) { + + int tid = threadIdx.x; + int id = blockIdx.x * blockDim.x + tid; + + if (id < NI * NJ * NK) { + int i = id % NI; + int k = id / (NI * NJ); + int j = (id - k * NI * NJ) / NI; + + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgreen[id] = tmpgrn / cellVolume; + + } + +} + +//calculate 
greens integral on cpu and transfer to gpu +void kernelTmpgreenCPU(double *tmpgreen, double hr_m0, double hr_m1, double hr_m2, + int NI, int NJ, int NK) +{ + + double cellVolume = hr_m0 * hr_m1 * hr_m2; + + for (int k = 0; k < NK; k++) { + for (int j = 0; j < NJ; j++) { + for (int i = 0; i < NI; i++) { + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = 0; + tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgrn = tmpgrn / cellVolume; + + tmpgreen[k*NJ*NI + j*NI + i] = tmpgrn; //i is the fastest index, so the row stride is NI (same layout as the GPU kernels) + } + } + } + +} + + +__global__ void kernelIngration(double *rho2_m, double *tmpgreen, int NI, int NJ, int NI_tmp, int NJ_tmp, int NK_tmp) { + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + int ni = NI; + int nj = NJ; + + double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0; + tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0; + + + if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp) + tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp) + tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp) + tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (k+1 < NK_tmp) + tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && j+1 < NJ_tmp) + tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && k+1 < NK_tmp) + tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp && k+1 < NK_tmp) + tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * 
NJ_tmp]; + + + double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; + + rho2_m[i + j*ni + k*ni*nj] = tmp_rho; + +} + +__global__ void kernelIngration_2(double *rho2_m, double *tmpgreen, + int NI, int NJ, + int NI_tmp, int NJ_tmp, int NK_tmp) { + + int tid = threadIdx.x; + int id = blockIdx.x * blockDim.x + tid; + + int ni = NI; + int nj = NJ; + + double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + if (id < NI_tmp * NJ_tmp * NK_tmp) { + int i = id % NI_tmp; + int k = id / (NI_tmp * NJ_tmp); + int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp; + + tmp0 = 0; tmp1 = 0; tmp2 = 0; tmp3 = 0; + tmp4 = 0; tmp5 = 0; tmp6 = 0; tmp7 = 0; + + if (i+1 < NI_tmp && j+1 < NJ_tmp && k+1 < NK_tmp) + tmp0 = tmpgreen[(i+1) + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp) + tmp1 = tmpgreen[(i+1) + j * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp) + tmp2 = tmpgreen[ i + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (k+1 < NK_tmp) + tmp3 = tmpgreen[ i + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && j+1 < NJ_tmp) + tmp4 = tmpgreen[(i+1) + (j+1) * NI_tmp + k * NI_tmp * NJ_tmp]; + + if (i+1 < NI_tmp && k+1 < NK_tmp) + tmp5 = tmpgreen[(i+1) + j * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + if (j+1 < NJ_tmp && k+1 < NK_tmp) + tmp6 = tmpgreen[ i + (j+1) * NI_tmp + (k+1) * NI_tmp * NJ_tmp]; + + tmp7 = tmpgreen[ i + j * NI_tmp + k * NI_tmp * NJ_tmp]; + + double tmp_rho = tmp0 + tmp1 + tmp2 + tmp3 - tmp4 - tmp5 - tmp6 - tmp7; + + rho2_m[i + j*ni + k*ni*nj] = tmp_rho; + } + +} + + +//just one kernel will be executed +__global__ void mirroredRhoField0(double *rho2_m, int NI, int NJ) { + rho2_m[0] = rho2_m[NI*NJ]; +} + +__global__ void mirroredRhoFieldI(double *rho2_m, int NI, int NJ) { + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + int idx1 = i + j*NI + k*NI*NJ; + int idx2 = (NI-i) + j*NI + k*NI*NJ; + + if (NI-i < NI) + rho2_m[idx2] = rho2_m[idx1]; + +} + +__global__ void mirroredRhoFieldJ(double *rho2_m, int NI, int 
NJ) { + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + int idx1 = i + j*NI + k*NI*NJ; + int idx2 = i + (NJ-j)*NI + k*NI*NJ; + + if (NJ-j < NJ) + rho2_m[idx2] = rho2_m[idx1]; + +} + +__global__ void mirroredRhoFieldK(double *rho2_m, int NI, int NJ, int NK) { + + int i = blockIdx.x; + int j = blockIdx.y; + int k = blockIdx.z; + + int idx1 = i + j*NI + k*NI*NJ; + int idx2 = i + j*NI + (NK-k)*NI*NJ; + + if (NK-k < NK) + rho2_m[idx2] = rho2_m[idx1]; + +} + +__global__ void mirroredRhoField(double *rho2_m, + int NI, int NJ, int NK, + int NI_tmp, int NJ_tmp, int NK_tmp) { + + int tid = threadIdx.x; + int id = blockIdx.x * blockDim.x + tid; + + int id1, id2, id3, id4, id5, id6, id7, id8; + + if (id < NI_tmp * NJ_tmp * NK_tmp) { + int i = id % NI_tmp; + int k = id / (NI_tmp * NJ_tmp); + int j = (id - k * NI_tmp * NJ_tmp) / NI_tmp; + + int ri = NI - i; + int rj = NJ - j; + int rk = NK - k; + + id1 = k * NI * NJ + j * NI + i; + id2 = k * NI * NJ + j * NI + ri; + id3 = k * NI * NJ + rj * NI + i; + id4 = k * NI * NJ + rj * NI + ri; + + id5 = rk * NI * NJ + j * NI + i; + id6 = rk * NI * NJ + j * NI + ri; + id7 = rk * NI * NJ + rj * NI + i; + id8 = rk * NI * NJ + rj * NI + ri; + + + double data = rho2_m[id1]; + if (i != 0) + rho2_m[id2] = data; + + if (j != 0) + rho2_m[id3] = data; + + if (i != 0 && j != 0) + rho2_m[id4] = data; + + if (k != 0) + rho2_m[id5] = data; + + if (k != 0 && i != 0) + rho2_m[id6] = data; + + if (k != 0 && j != 0) + rho2_m[id7] = data; + + if (k != 0 && j != 0 && i != 0) + rho2_m[id8] = data; + + } + +} + +__device__ inline cuDoubleComplex ComplexMul(cuDoubleComplex a, cuDoubleComplex b) { + + cuDoubleComplex c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + + return c; + +} + +__global__ void multiplyComplexFields(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2) { + + int idx = blockIdx.x; + + ptr1[idx] = ComplexMul(ptr1[idx], ptr2[idx]); +} + + +/* +copy data in shared memory first to improve memory access (few global 
memory accesses, maybo no improvements) +use more threads per block to improve occupancy of hardware (test for best block and thread sizes) +*/ +__global__ void multiplyComplexFields_2(cuDoubleComplex *ptr1, cuDoubleComplex *ptr2, + int size) +{ + + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + + extern __shared__ cuDoubleComplex data[]; + + if (idx < size) { + data[2*tid] = ptr1[idx]; + data[2*tid + 1] = ptr2[idx]; + } + + __syncthreads(); + + if (idx < size) + ptr1[idx] = ComplexMul(data[2*tid], data[2*tid+1]); + + +} + + +CudaGreensFunction::CudaGreensFunction(CudaBase *base) { + m_base = base; + base_create = false; +} + +/* constructor */ +CudaGreensFunction::CudaGreensFunction() { + m_base = new CudaBase(); + base_create = true; +} + +/* destructor */ +CudaGreensFunction::~CudaGreensFunction() { + if (base_create) + delete m_base; +} + +int CudaGreensFunction::cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId) +{ + + int thread = 128; + int block = (I * J * K / thread) + 1; + + //if no stream specified use default stream + if (streamId == -1) { + kernelTmpgreen_2<<< block, thread >>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); + + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + kernelTmpgreen_2<<< block, thread, 0, cs>>>((double*)tmpptr, hr_m0, hr_m1, hr_m2, I, J, K); + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} + +int CudaGreensFunction::cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, + int I, int J, int K, + int streamId) +{ + + int thread = 128; + int block = (I * J * K / thread) + 1; + + if (streamId == -1) { + kernelIngration_2<<< block, thread >>>( (double*)rho2_m, (double*)tmpgreen, + 2*(I - 1), 2*(J - 1), I, J, K); + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = 
m_base->cuda_getStream(streamId); + kernelIngration_2<<< block, thread, 0, cs>>>( (double*)rho2_m, (double*)tmpgreen, + 2*(I - 1), 2*(J - 1), I, J, K); + return DKS_SUCCESS; + } + + + return DKS_ERROR; +} + +int CudaGreensFunction::cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { + + int thread = 128; + int block = ( (I + 1) * (J + 1) * (K + 1) / thread) + 1; + + if (streamId == -1) { + mirroredRhoField0<<< 1, 1>>>( (double *)mem_ptr, 2*I, 2*J); + mirroredRhoField<<< block, thread >>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I + 1, J + 1, K + 1); + return DKS_SUCCESS; + } + + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + mirroredRhoField0<<< 1, 1, 0, cs>>>( (double *)mem_ptr, 2*I, 2*J); + mirroredRhoField<<< block, thread, 0, cs>>>( (double *) mem_ptr, 2*I, 2*J, 2*K, I+1, J+1, K+1); + + return DKS_SUCCESS; + } + + + + return DKS_ERROR; +} + +int CudaGreensFunction::cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, + int size, int streamId) { + + int threads = 128; + int blocks = size / threads + 1; + int datasize = 2 * threads * sizeof(cuDoubleComplex); + + if (streamId == -1) { + multiplyComplexFields_2<<>> ( (cuDoubleComplex*)ptr1, + (cuDoubleComplex*)ptr2, + size); + return DKS_SUCCESS; + } + + if (streamId < m_base->cuda_numberOfStreams()) { + cudaStream_t cs = m_base->cuda_getStream(streamId); + multiplyComplexFields_2<<>> ( (cuDoubleComplex*)ptr1, + (cuDoubleComplex*) ptr2, size); + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} + + + diff --git a/src/CUDA/CudaGreensFunction.cuh b/src/CUDA/CudaGreensFunction.cuh new file mode 100644 index 0000000..5095e7a --- /dev/null +++ b/src/CUDA/CudaGreensFunction.cuh @@ -0,0 +1,63 @@ +#ifndef H_CUDA_GREENSFUNCTION +#define H_CUDA_GREENSFUNCTION + +#include +#include + +#include +#include +#include +#include "cublas_v2.h" + + +#include "CudaBase.cuh" + +class CudaGreensFunction { + +private: + + bool base_create; + CudaBase *m_base; + 
+public: + + /** Constructor with CudaBase argument */ + CudaGreensFunction(CudaBase *base); + + /* constructor */ + CudaGreensFunction(); + + /* destructor */ + ~CudaGreensFunction(); + + /* + Info: calc itegral on device memory (taken from OPAL src code) + Return: success or error code + */ + int cuda_GreensIntegral(void *tmpptr, int I, int J, int K, int NI, int NJ, + double hr_m0, double hr_m1, double hr_m2, + int streamId = -1); + + /* + Info: integration of rho2_m field (taken from OPAL src code) + Return: success or error code + */ + int cuda_IntegrationGreensFunction(void *rho2_m, void *tmpgreen, int I, int J, int K, + int streamId = -1); + + /* + Info: mirror rho field (taken from OPAL src code) + Return: succes or error code + */ + int cuda_MirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); + + /* + Info: multiply complex fields already on the GPU memory, result will be put in ptr1 + Return: success or error code + */ + int cuda_MultiplyCompelxFields(void *ptr1, void *ptr2, int size, int streamId = -1); + + +}; + +#endif diff --git a/src/CUDA/CudaImageReconstruction.cu b/src/CUDA/CudaImageReconstruction.cu new file mode 100644 index 0000000..14ab4ee --- /dev/null +++ b/src/CUDA/CudaImageReconstruction.cu @@ -0,0 +1,1221 @@ +#include "CudaImageReconstruction.cuh" + +//x_edge, y_edge, z_edge and matrix_distance_factor need to be set as const for the run +//voxel_x, voxel_y and voxel_z also need to be set as const for the run +__device__ float d_x_edge = 30.8; +__device__ float d_y_edge = 30.8; +__device__ float d_z_edge = 16.8; + +__device__ float d_matrix_distance_factor = 1.2; + +__device__ int d_voxel_x = 90; +__device__ int d_voxel_y = 90; +__device__ int d_voxel_z = 50; + +__device__ float d_voxel_size = 0.7; + + +//phantom_diameter needs to be defined, atten_per_mm as well +__device__ float d_phantom_diameter = 51; +__device__ float d_atten_per_mm = 0.0095; +__device__ float d_ring_diameter = 138; +__device__ float 
d_minimum_CrystalDistance_InOneRing = 123.489; + + +__device__ float d_x_edge1 = 29.26; +__device__ float d_y_edge1 = 29.26; +__device__ float d_z_edge1 = 15.96; +__device__ float d_z_edge2 = 14.28; +__device__ float d_minimum_CrystalDistance_InOneRing1 = 127.681; + + +__device__ inline float distance(VoxelPosition &a, VoxelPosition &b) { + float dist_x = pow(a.x - b.x, 2); + float dist_y = pow(a.y - b.y, 2); + float dist_z = pow(a.z - b.z, 2); + return sqrt(dist_x + dist_y + dist_z); +} + +__global__ void kernelCalculateSource(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - diameter) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - diameter) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - diameter) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + diameter) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + diameter) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + diameter) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? 
d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + if (dist < diameter * 0.5 ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + } +} + +__global__ void kernelCalculateBackground(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - (diameter + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? 
d_voxel_x : ex; + + int ey = floor( ((source.y + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + (diameter + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + //if ( dist > diameter * 0.5 && dist < (diameter * 0.5 + 1) ) { + if ( dist > diameter * 0.5 && dist < (diameter) ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + + +__global__ void kernelCalculateSources(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float *diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + float diam = diameter[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - diam) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - diam) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 
0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - diam) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + diam) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + diam) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + diam) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + if (dist < diam * 0.5 ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + +__global__ void kernelCalculateBackgrounds(float *image_space, VoxelPosition *image_position, + VoxelPosition *source_position, float *average, + float *stdev, float *diameter, int total_voxels, + int total_sources, int start) +{ + + volatile int tid = threadIdx.x; + volatile int idx = blockIdx.x * blockDim.x + tid; + volatile int voxel_id = idx + start; + + if (voxel_id < total_voxels && idx < total_sources) { + //read source position + VoxelPosition source = source_position[voxel_id]; + float diam = diameter[voxel_id]; + + int count = 0; + float sum = 0; + float sqsum = 0; + + int sx = floor( ((source.x - (diam + 1.0)) / 
d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + sx = (sx < 0) ? 0 : sx; + sx = (sx > d_voxel_x - 1) ? d_voxel_x : sx; + + int sy = floor( ((source.y - (diam + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + sy = (sy < 0) ? 0 : sy; + sy = (sy > d_voxel_y - 1) ? d_voxel_y : sy; + + int sz = floor( ((source.z - (diam + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + sz = (sz < 0) ? 0 : sz; + sz = (sz > d_voxel_z - 1) ? d_voxel_z : sz; + + int ex = floor( ((source.x + (diam + 1.0)) / d_voxel_size) + ((d_voxel_x - 1.0) / 2.0) ); + ex = (ex < 0) ? 0 : ex; + ex = (ex > d_voxel_x - 1) ? d_voxel_x : ex; + + int ey = floor( ((source.y + (diam + 1.0)) / d_voxel_size) + ((d_voxel_y - 1.0) / 2.0) ); + ey = (ey < 0) ? 0 : ey; + ey = (ey > d_voxel_y - 1) ? d_voxel_y : ey; + + int ez = floor( ((source.z + (diam + 1.0)) / d_voxel_size) + ((d_voxel_z - 1.0) / 2.0) ); + ez = (ez < 0) ? 0 : ez; + ez = (ez > d_voxel_z - 1) ? d_voxel_z : ez; + + VoxelPosition voxel; + for (int z = sz; z < ez; z++) { + voxel.z = (z-(d_voxel_z - 1.0) / 2.0) * d_voxel_size; + for (int y = sy; y < ey; y++) { + voxel.y = (y-(d_voxel_y - 1.0) / 2.0) * d_voxel_size; + for (int x = sx; x < ex; x++) { + voxel.x = (x-(d_voxel_x - 1.0) / 2.0) * d_voxel_size; + + float dist = distance(voxel, source); + + //if ( dist > diam * 0.5 && dist < (diam * 0.5 + 1) ) { + if ( dist > diam * 0.5 && dist < diam ) { + //read voxel value + int i = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + float v = image_space[i]; + sum += v; + sqsum += v*v; + count++; + } + } + } + } + + float avg = sum / count; + average[idx] = avg; + stdev[idx] = sqrt( (sqsum + count * avg * avg - 2 * avg * sum) / count / (count - 1) ); + + } +} + +__device__ void localRaytracingX(float *recon, VoxelPosition *image_position, + float &atten_factor, float &slope_y, float &slope_z, + float &a_x, float &a_y, float &a_z) +{ + + for (int x = 0; x < d_voxel_x; x++) { + float lor_x = image_position[x].x; + float lor_y = slope_y * ( lor_x - a_x ) + 
a_y; + float lor_z = slope_z * ( lor_x - a_x ) + a_z; + + if ( pow(lor_x / d_x_edge,2) + pow( lor_y/d_y_edge, 2) < 1.0 && abs(lor_z) < d_z_edge ) { + + int y = floor( (lor_y+d_y_edge) / d_voxel_size); + int z = floor( (lor_z+d_z_edge) / d_voxel_size); + + int voxel_id = z * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = z * d_voxel_y * d_voxel_x + (y+1) * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = (z+1) * d_voxel_y * d_voxel_x + y * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + + voxel_id = (z+1) * d_voxel_y * d_voxel_x + (y+1) * d_voxel_x + x; + atomicAdd(&recon[voxel_id], ( d_matrix_distance_factor - + sqrt( pow(lor_y-image_position[voxel_id].y,2) + + pow(lor_z-image_position[voxel_id].z,2) ) + ) * atten_factor); + } + } + +} + + +__device__ void localRaytracingY(float *recon, VoxelPosition *image_position, + float &atten_factor, float &slope_x, float &slope_z, + float &a_x, float &a_y, float &a_z) +{ + + for (int y=0;y0.001) { + distance_xy = 2.0 * sqrt( distance_tocenter ) ; + } + else + distance_xy = 0.0; + + float distance_z = abs( a_z - b_z ) * distance_xy / distance_lor_xy; + float distance = sqrt( pow(distance_xy,2) + pow(distance_z,2) ); + + return exp(-distance*d_atten_per_mm); + +} + +__global__ void kernelNormalization(float *recon, VoxelPosition *image_position, + VoxelPosition *det_position, int total_det) +{ + + int tidx = threadIdx.x; + int tidy = threadIdx.y; + + int detA = blockIdx.x * blockDim.x + tidx; + int detB = blockIdx.y * blockDim.y + tidy; + + if (detA != 
detB && detA < total_det && detB < total_det) { + + VoxelPosition pA = det_position[detA]; + VoxelPosition pB = det_position[detB]; + + float distance_x = abs( pA.x - pB.x); + float distance_y = abs( pA.y - pB.y); + float distance_z = abs( pA.z - pB.z); + + if( sqrt(pow(distance_x,2) + pow(distance_y,2)) > d_minimum_CrystalDistance_InOneRing) { + float atten_factor; + atten_factor = atten_factor_calcu(pA.x,pA.y,pA.z,pB.x,pB.y,pB.z); + + if (distance_x > distance_y && distance_x > distance_z) { + + float slope_y = ( pB.y - pA.y ) / ( pB.x - pA.x ); + float slope_z = ( pB.z - pA.z ) / ( pB.x - pA.x ); + + localRaytracingX(recon, image_position, atten_factor, slope_y, slope_z, pA.x, pA.y, pA.z); + + } + else if (distance_y > distance_z) { + + float slope_x = ( pB.x - pA.x ) / ( pB.y - pA.y ); + float slope_z = ( pB.z - pA.z ) / ( pB.y - pA.y ); + + localRaytracingY(recon, image_position, atten_factor, slope_x, slope_z, pA.x, pA.y, pA.z); + } + else { + + float slope_x = ( pB.x - pA.x ) / ( pB.z - pA.z ); + float slope_y = ( pB.y - pA.y ) / ( pB.z - pA.z ); + + localRaytracingZ(recon, image_position, atten_factor, slope_x, slope_y, pA.x, pA.y, pA.z); + } + + } + } +} + +__device__ float localRaytracingForwardX(float*recon, VoxelPosition &pos, + float &a_x, float &a_y, float &a_z, + float &b_x, float &b_y, float &b_z) +{ + + float result = 0.000001; + float slope_y = ( b_y - a_y ) / ( b_x - a_x); + float slope_z = ( b_z - a_z ) / ( b_x - a_x); + + for (int x=0; x d_minimum_CrystalDistance_InOneRing1 && + (abs(distance_z1) distance_y && distance_x > distance_z) + branch = 1; + else if (distance_y > distance_z) + branch = 2; + else + branch = 3; + + } + + event_branch[idx] = branch; + } + +} + +__global__ void kernelZeroBackward(float *recon_corrector, int size) { + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + recon_corrector[idx] = 0; + +} + +__device__ void localRaytracingBackwardX(float &correction, float *recon_corrector, + VoxelPosition 
&pos, + float &a_x, float &a_y, float &a_z, + float &b_x, float &b_y, float &b_z) +{ + + float slope_y = ( b_y - a_y ) / ( b_x - a_x); + float slope_z = ( b_z - a_z ) / ( b_x - a_x); + + for (int x=0;x 0) { + corr = correction[idx]; + pos = image_position[0]; + } + + if (branch == 1) + localRaytracingBackwardX(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + else if (branch == 2) + localRaytracingBackwardY(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + else if (branch == 3) + localRaytracingBackwardZ(corr, recon_corrector, pos, pA.x, pA.y, pA.z, pB.x, pB.y, pB.z); + + } + +} + + +int CudaImageReconstruction::calculateSource(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + //call kernel + kernelCalculateSource<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateBackground(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + + //call kernel + kernelCalculateBackground<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateSources(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + //call 
kernel + kernelCalculateSources<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + (float*) diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::calculateBackgrounds(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + + int threads = BLOCK_SIZE; + int blocks = total_sources / threads + 1; + + + //call kernel + kernelCalculateBackgrounds<<>>( (float*) image_space, + (VoxelPosition*) image_position, + (VoxelPosition*) source_position, + (float*) avg, + (float*) std, + (float*) diameter, + total_voxels, + total_sources, + start); + + return DKS_SUCCESS; +} + +int CudaImageReconstruction::generateNormalization(void *recon, void *image_position, + void *det_position, int total_det) +{ + + int blocksize = 32; + dim3 threads(blocksize, blocksize, 1); + + dim3 blocks(total_det / blocksize + 1, total_det / blocksize + 1); + + kernelNormalization<<>>( (float*) recon, + (VoxelPosition*) image_position, + (VoxelPosition*) det_position, + total_det); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error launching normalization kernel!"); + std::cout << cudaGetErrorString(err); + return DKS_ERROR; + } + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::forwardProjection(void *correction, void *recon, + void *list_data, void *det_position, + void *image_position, int num_events) +{ + + int threads = BLOCK_SIZE; + int blocks = num_events / threads + 1; + + int ierr; + m_event_branch = m_base->cuda_allocateMemory(sizeof(int)*num_events, ierr); + + kernelCheckEvents<<>>((ListEvent*)list_data, + (VoxelPosition*)det_position, + (int*)m_event_branch, + num_events); + + //warp mem pointers with thrust device ptr + thrust::device_ptr t_event_branch( (int*)m_event_branch ); + thrust::device_ptr 
t_list_data( (ListEvent*)list_data ); + + thrust::sort_by_key( t_event_branch, t_event_branch + num_events, t_list_data ); + + kernelForwardProjection<<>>( (float*)correction, + (float*)recon, + (ListEvent*)list_data, + (VoxelPosition*)det_position, + (VoxelPosition*)image_position, + (int*)m_event_branch, + num_events); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cout << "Error launching kernel!" << std::endl; + std::cout << cudaGetErrorString(err) << std::endl; + + } + + return DKS_SUCCESS; + +} + + +int CudaImageReconstruction::backwardProjection(void *correction, void *recon_corrector, + void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels) +{ + + int threads = BLOCK_SIZE; + int blocks1 = num_voxels / threads + 1; + int blocks2 = num_events / threads + 1; + + kernelZeroBackward<<>>((float*)recon_corrector, num_voxels); + + + kernelBackwardProjection<<>>( (float*)correction, + (float*)recon_corrector, + (ListEvent*)list_data, + (VoxelPosition*)det_position, + (VoxelPosition*)image_position, + (int*)m_event_branch, + num_events); + + m_base->cuda_freeMemory( m_event_branch ); + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setDimensions(int voxel_x, int voxel_y, int voxel_z, + float voxel_size) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_voxel_x, &voxel_x, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_y, &voxel_y, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_z, &voxel_z, sizeof(int)); + cudaMemcpyToSymbol(d_voxel_size, &voxel_size, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setEdge(float x_edge, float y_edge, float z_edge) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_x_edge, &x_edge, sizeof(float)); + cudaMemcpyToSymbol(d_y_edge, &y_edge, 
sizeof(float)); + cudaMemcpyToSymbol(d_z_edge, &z_edge, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_x_edge1, &x_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_y_edge1, &y_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_z_edge1, &z_edge1, sizeof(float)); + cudaMemcpyToSymbol(d_z_edge2, &z_edge2, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setMinCrystalInRing(float min_CrystalDist_InOneRing, + float min_CrystalDist_InOneRing1) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing, + &min_CrystalDist_InOneRing, sizeof(float)); + + cudaMemcpyToSymbol(d_minimum_CrystalDistance_InOneRing1, + &min_CrystalDist_InOneRing1, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} + +int CudaImageReconstruction::setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter) +{ + + //copy from host to __device__ variables + cudaMemcpyToSymbol(d_matrix_distance_factor, &matrix_distance_factor, sizeof(float)); + cudaMemcpyToSymbol(d_phantom_diameter, &phantom_diameter, sizeof(float)); + cudaMemcpyToSymbol(d_atten_per_mm, &atten_per_mm, sizeof(float)); + cudaMemcpyToSymbol(d_ring_diameter, &ring_diameter, sizeof(float)); + + //check for error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + 
DEBUG_MSG("Error copying to device memory!"); + return DKS_ERROR; + } + + return DKS_SUCCESS; + +} diff --git a/src/CUDA/CudaImageReconstruction.cuh b/src/CUDA/CudaImageReconstruction.cuh new file mode 100644 index 0000000..4cf532c --- /dev/null +++ b/src/CUDA/CudaImageReconstruction.cuh @@ -0,0 +1,118 @@ +#ifndef H_CUDA_IMAGERECONSTRUCTION +#define H_CUDA_IMAGERECONSTRUCTION + +#include +#include +#include +#include +#include + +#include "../Algorithms/ImageReconstruction.h" +#include "CudaBase.cuh" + +class CudaImageReconstruction : public ImageReconstruction { + +private: + + bool base_create; + CudaBase *m_base; + +public: + + /** Constructor */ + CudaImageReconstruction() { + m_base = new CudaBase(); + base_create = true; + }; + + /** Constructor with base **/ + CudaImageReconstruction(CudaBase *base) { + m_base = base; + base_create = false; + } + + /** Destructor */ + ~CudaImageReconstruction() { + if (base_create) + delete m_base; + }; + + /** CUDA implementation of caluclate source + */ + int calculateSource(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + /** Cuda implementation of calculate background + */ + int calculateBackground(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + /** + * Caluclate source for differente sources + */ + int calculateSources(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** + * Calculate background for differente sources + */ + int calculateBackgrounds(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** Generate normalization. 
+ * Goes trough detectors pairs and if detector pair crosses image launches seperate kernel + * that updates voxel values in the image on the slope between these two detectors. + */ + int generateNormalization(void *recon, void *image_position, + void *det_position, int total_det); + + + /** Calculate forward projection. + * For image reconstruction calculates forward projections. + * see recon.cpp for details + */ + int forwardProjection(void *correction, void *recon, void *list_data, void *det_position, + void *image_position, int num_events); + + /** Calculate backward projection. + * For image reconstruction calculates backward projections. + * see recon.cpp for details + */ + int backwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels); + + /** Set the voxel dimensins on device. + * + */ + int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); + + /** Set the image edge. + * + */ + int setEdge(float x_edge, float y_edge, float z_edge); + + /** Set the image edge1. + * + */ + int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); + + /** Set the minimum crystan in one ring values. + * + */ + int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); + + /** Set all other required parameters for reconstruction. + * + */ + int setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter); + + +}; + +#endif diff --git a/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu new file mode 100644 index 0000000..b22cab7 --- /dev/null +++ b/src/CUDA/NVRTCKernels/CudaChiSquareKernel.cu @@ -0,0 +1,316 @@ +#define PI 3.141592653589793115998 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +/** Theory function declaration. 
+ * Definition of the theory function will be build during runtime before compilation. + */ +__device__ double fTheory(double t, double *p, double *f, int *m); + +/** MusrFit predefined functions. + * Predefined functions from MusrFit that can be used to define the theory function. + * First parameter in all the functions is alwats time - t, rest of the parameters depend + * on the function. + */ +__device__ double se(double t, double lamda) { + return exp( -lamda*t ); +} + +__device__ double ge(double t, double lamda, double beta) { + return exp( -pow(lamda*t, beta) ); +} + +__device__ double sg(double t, double sigma) { + return exp( -0.5*pow(sigma*t, 2.0) ); +} + +__device__ double stg(double t, double sigma) { + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5*sigmatsq); +} + +__device__ double sekt(double t, double lambda) { + double lambdat = lambda*t; + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat); +} + +__device__ double lgkt(double t, double lambda, double sigma) { + double lambdat = lambda*t; + double sigmatsq = pow(sigma*t, 2.0); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq); +} + +__device__ double skt(double t, double sigma, double beta) { + if (beta < 1.0e-3) + return 0.0; + double sigmatb = pow(sigma*t, beta); + + return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta); +} + +__device__ double spg(double t, double lambda, double gamma, double q) { + double lam2 = lambda*lambda; + double lamt2q = t*t*lam2*q; + double rate2 = 4.0*lam2*(1.0-q)*t/gamma; + double rateL = sqrt(fabs(rate2)); + double rateT = sqrt(fabs(rate2)+lamt2q); + + return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT); +} + +__device__ double rahf(double t, double nu, double lambda) { + double nut = nu*t; + double nuth = nu*t/2.0; + double lamt = lambda*t; + + return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + 
/** Cosine precession signal: cos(2*pi*nu*t + phi).
 * phi is multiplied by DEG_TO_RAD, i.e. it is expected in degrees.
 */
__device__ double tf(double t, double phi, double nu) {
  double tmp_nu = TWO_PI*nu*t;
  double tmp_phi = DEG_TO_RAD*phi;

  return cos(tmp_nu + tmp_phi);
}

/** Two-component signal: an oscillating part weighted by alpha and damped with
 * lambdaT, plus a non-oscillating part weighted by (1 - alpha) and damped with
 * lambdaL. phi is converted from degrees to radians.
 */
__device__ double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  double wt = TWO_PI*nu*t;
  double ph = DEG_TO_RAD*phi;

  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
}

/** Bessel signal: j0(2*pi*nu*t + phi), phi converted from degrees to radians. */
__device__ double b(double t, double phi, double nu) {
  return j0(TWO_PI*nu*t + DEG_TO_RAD*phi);
}

/** Same two-component form as ifld(), with j0() in place of cos(). */
__device__ double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
  double wt = TWO_PI * nu * t;
  double ph = DEG_TO_RAD * phi;

  return alpha*j0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
}

/** exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t)).
 * NOTE(review): divides by gamma; gamma == 0 is not guarded here - confirm
 * callers never pass it.
 */
__device__ double ab(double t, double sigma, double gamma) {
  double gt = gamma*t;

  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
}

/** Zero-field relaxation built from aa = 1/(1 + Rb^2 * (Delta0*t)^2):
 * 1/3 + 2/3 * aa^1.5 * (1 - D0t2*aa) * exp(-0.5*D0t2*aa).
 */
__device__ double snkzf(double t, double Delta0, double Rb) {
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);

  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
}

/** Oscillating counterpart of snkzf(): sqrt(aa) * exp(-0.5*D0t2*aa) * cos(2*pi*nu*t + phi),
 * with phi converted from degrees to radians.
 */
__device__ double snktf(double t, double phi, double nu, double Delta0, double Rb) {
  double wt = TWO_PI*nu*t;
  double ph = DEG_TO_RAD*phi;
  double D0t2 = pow(Delta0*t, 2.0);
  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);

  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
}

/** Zero-field relaxation using theta = (exp(-nuc*t) - 1 - nuc*t) / nuc^2.
 * NOTE(review): for nuc*t > 0 this theta is always negative, so the denominator
 * of aa can reach zero or go negative and sqrt(aa) becomes NaN. ab() above uses
 * the "+ gt" form of the same bracket - confirm the intended sign against the
 * MusrFit reference formula.
 * NOTE(review): nuc == 0 divides by zero; no guard here.
 */
__device__ double dnkzf(double t, double Delta0, double Rb, double nuc) {
  double nuct = nuc*t;
  double theta = (exp(-nuct) - 1.0 -nuct)/pow(nuc, 2.0);
  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);

  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
}
/** Theory and chisquare helper functions.
 * The compile-time flag ASYMETRY selects between the single-histogram and the
 * asymmetry form of the theory; the flag MLE selects between chi-square and
 * the likelihood term.
 */

/** Single-histogram theory: N0 * exp(-t/tau) * (1 + f) + bkg. */
__device__ inline double singleHist(double &N0, double &tau, double &bkg, double &f, double &t) {
  return N0 * exp (-t/tau ) * (1.0 + f) + bkg;
}

/** Asymmetry theory for theory value f with parameters a and b.
 * NOTE(review): kernelChiSquareAsymmetry in this file uses (a*b + 1) * f in the
 * numerator where this helper uses f * (a*b); confirm which form is intended.
 */
__device__ inline double asymetry(double &a, double &b, double &f) {
  return (f * (a * b) - (a - 1.0)) / ((a + 1.0) - f * (a * b - 1.0));
}

/** Dispatch to singleHist() or asymetry() depending on ASYMETRY.
 * In asymmetry mode c1 and c2 are passed as a and b; c3 and t are unused.
 */
__device__ inline double getTheory(double &c1, double &c2, double &c3, double &f, double &t) {
#ifndef ASYMETRY
  return singleHist(c1, c2, c3, f, t);
#else  // was a bare "#elif", which is ill-formed as soon as ASYMETRY is defined
  return asymetry(c1, c2, f);
#endif
}

/** Squared residual (theo - data)^2, divided by err when err is non-zero.
 * NOTE(review): the kernels in this file divide by err*err; confirm whether
 * err is meant to be a variance here.
 */
__device__ inline double chiSq(double &data, double &theo, double &err) {
  double res = (theo - data) * (theo - data);
  if (err != 0.0)
    res /= err;

  return res;
}

/** Likelihood term (theo - data) + data * log(data / theo).
 * The log term is only added when data and |theo| both exceed 1e-9; err is unused.
 */
__device__ inline double mle(double &data, double &theo, double &err) {
  double res = (theo - data);
  if ( data > 1.0e-9 && fabs(theo) > 1.0e-9 )
    res += data * log(data / theo);

  return res;
}

/** Dispatch to chiSq() or mle() depending on MLE. */
__device__ inline double getChiSq(double &data, double &theo, double &err) {
#ifndef MLE
  return chiSq(data, theo, err);
#else  // was a bare "#elif", which is ill-formed as soon as MLE is defined
  return mle(data, theo, err);
#endif
}
+ */ +extern "C" __global__ void kernelChiSquareSingleHisto(double *data, double *err, double *par, + double *chisq, int *map, double *funcv, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double tau, double N0, double bkg) { + //define shared variable for parameters + extern __shared__ double smem[]; + double *p = (double*)smem; + double *f = (double*)&smem[numpar]; + int *m = (int*)&smem[numpar + numfunc]; + + //get thread id and calc global id + int tid; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + //load parameters from global to shared memory + tid = threadIdx.x; + while (tid < numpar) { + p[tid] = par[tid]; + tid += blockDim.x; + } + + //load functions from global to shared memory + tid = threadIdx.x; + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += blockDim.x; + } + + //load maps from global memory + tid = threadIdx.x; + while (tid < nummap) { + m[tid] = map[tid]; + tid += blockDim.x; + } + + //sync threads + __syncthreads(); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg; + + #ifdef MLH + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo)); + else + chisq[j] = 2.0 * (theo - ldata); + #else + if (lerr != 0.0) + chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr); + else + chisq[j] = theo * theo; + #endif + + j += gridDim.x * blockDim.x; + + } +} + +//----------------------------------------------------------------------------------------------- +/** + * Kernel to calculate theory function and chisquare/mle values for asymmetry fits. 
+ */ +extern "C" __global__ void kernelChiSquareAsymmetry(double *data, double *err, double *par, + double *chisq, int *map, double *funcv, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double alpha, double beta) { + //define shared variable for parameters + extern __shared__ double smem[]; + double *p = (double*)smem; + double *f = (double*)&smem[numpar]; + int *m = (int*)&smem[numpar + numfunc]; + + //get thread id and calc global id + int tid; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + //load parameters from global to shared memory + tid = threadIdx.x; + while (tid < numpar) { + p[tid] = par[tid]; + tid += blockDim.x; + } + + //load functions from global to shared memory + tid = threadIdx.x; + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += blockDim.x; + } + + //load maps from global memory + tid = threadIdx.x; + while (tid < nummap) { + m[tid] = map[tid]; + tid += blockDim.x; + } + + //sync threads + __syncthreads(); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double theoVal = fTheory(t, p, f, m); + double ab = alpha*beta; + + double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0) - (ab-1.0)*theoVal); + + #ifdef MLH + chisq[j] = 0.0; // log max likelihood not defined here + #else + if (lerr != 0.0) + chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr); + else + chisq[j] = theo * theo; + #endif + + j += gridDim.x * blockDim.x; + } +} + diff --git a/src/DKSBase.cpp b/src/DKSBase.cpp new file mode 100644 index 0000000..96e9b19 --- /dev/null +++ b/src/DKSBase.cpp @@ -0,0 +1,861 @@ +#include "DKSBase.h" + +#define API_OPENCL "OpenCL" +#define API_CUDA "Cuda" +#define API_OPENMP "OpenMP" + +#define DEVICE_GPU "-gpu" +#define DEVICE_CPU "-cpu" +#define DEVICE_MIC "-mic" + +//=====================================// +//==========Private functions==========// +//=====================================// + +bool 
DKSBase::apiOpenCL() { + + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_OPENCL) != 0) + return false; + + return true; +} + +bool DKSBase::apiCuda() { + + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_CUDA) != 0) + return false; + + return true; +} + +bool DKSBase::apiOpenMP() { + if (!m_api_set) + return false; + + if (strcmp(m_api_name, API_OPENMP) != 0) + return false; + + return true; +} + +bool DKSBase::deviceGPU() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_GPU) != 0) + return false; + + return true; +} + +bool DKSBase::deviceCPU() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_CPU) != 0) + return false; + + return true; +} + +bool DKSBase::deviceMIC() { + if (!m_device_set) + return false; + if (strcmp(m_device_name, DEVICE_MIC) != 0) + return false; + + return true; +} + + +int DKSBase::loadOpenCLKernel(const char *kernel_name) { + //load kernel + char * kernel_file = new char[500]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, kernel_name); + int ierr = OPENCL_SAFECALL( oclbase->ocl_loadKernel(kernel_file) ); + delete[] kernel_file; + + return ierr; +} + +//=====================================// +//==========Public functions===========// +//=====================================// + +DKSBase::DKSBase() { + + m_device_name = NULL; + m_api_name = NULL; + m_function_name = NULL; + + m_device_set = false; + m_api_set = false; + m_function_set = false; + + m_auto_tuning = false; + m_use_config = false; + +#ifdef DKS_CUDA + cbase = new CudaBase(); + cfft = new CudaFFT(cbase); + cgreens = new CudaGreensFunction(cbase); + cchi = new CudaChiSquare(cbase); + ccol = new CudaCollimatorPhysics(cbase); +#endif + +#ifdef DKS_OPENCL + oclbase = new OpenCLBase(); + oclfft = new OpenCLFFT(oclbase); + oclchi = new OpenCLChiSquare(oclbase); + oclcol = new OpenCLCollimatorPhysics(oclbase); +#endif + +#ifdef DKS_MIC + micbase = new MICBase(); + 
micfft = new MICFFT(micbase); + miccol = new MICCollimatorPhysics(micbase); + micgreens = new MICGreensFunction(micbase); + micchi = new MICChiSquare(micbase); +#endif + +} + +DKSBase::DKSBase(const char* api_name, const char* device_name) { + + setAPI(api_name, strlen(api_name)); + setDevice(device_name, strlen(device_name)); + m_function_name = NULL; + m_function_set = false; + + m_auto_tuning = false; + m_use_config = false; + +#ifdef DKS_CUDA + cbase = new CudaBase(); + cfft = new CudaFFT(cbase); + cgreens = new CudaGreensFunction(cbase); + cchi = new CudaChiSquare(cbase); + ccol = new CudaCollimatorPhysics(cbase); +#endif + +#ifdef DKS_OPENCL + oclbase = new OpenCLBase(); + oclfft = new OpenCLFFT(oclbase); + oclchi = new OpenCLChiSquare(oclbase); + oclcol = new OpenCLCollimatorPhysics(oclbase); +#endif + +#ifdef DKS_MIC + micbase = new MICBase(); + micfft = new MICFFT(micbase); + miccol = new MICCollimatorPhysics(micbase); + micgreens = new MICGreensFunction(micbase); + micchi = new MICChiSquare(micbase); +#endif + +} + + +DKSBase::~DKSBase() { + + if (m_device_name != NULL) + delete[] m_device_name; + + if (m_api_name != NULL) + delete[] m_api_name; + + if (m_function_name != NULL) + delete[] m_function_name; + + +#ifdef DKS_CUDA + delete cfft; + delete cgreens; + delete cchi; + delete ccol; + delete cbase; +#endif + +#ifdef DKS_OPENCL + delete oclfft; + delete oclchi; + delete oclcol; + delete oclbase; +#endif + +#ifdef DKS_MIC + delete micfft; + delete miccol; + delete micgreens; + delete micchi; + delete micbase; +#endif + +} + +/* + Name: setDevice + Info: sets specific device to use. 
length specifies device_name string length (deprecated) + Return: success or error code +*/ +int DKSBase::setDevice(const char* device_name, int length) { + + if (m_device_set) + delete[] m_device_name; + + int l = strlen(device_name); + m_device_name = new char[l+1]; + + for (int i = 0; i < l; i++) + m_device_name[i] = device_name[i]; + m_device_name[l] = '\0'; + + m_device_set = true; + + return DKS_SUCCESS; + +} + +/* + Name: setAPI + Info: sets specific api (OpenCL, CUDA, OpenACC, OpenMP) to use + Return: success or error code +*/ +int DKSBase::setAPI(const char* api_name, int length) { + + if (m_api_set) + delete[] m_api_name; + + int l = strlen(api_name); + m_api_name = new char[l+1]; + + for (int i = 0; i < l; i++) + m_api_name[i] = api_name[i]; + m_api_name[l] = '\0'; + + m_api_set = true; + + return DKS_SUCCESS; +} + +/* + Name: getDevices + Info: get all available devices + Return: success or error code +*/ +int DKSBase::getDevices() { + + int ierr1 = OPENCL_SAFECALL( oclbase->ocl_getAllDevices() ); + int ierr2 = CUDA_SAFECALL( cbase->cuda_getDevices() ); + int ierr3 = MIC_SAFECALL( micbase->mic_getDevices() ); + + if (ierr1 + ierr2 + ierr3 != DKS_SUCCESS) + return DKS_ERROR; + + return DKS_SUCCESS; +} + +int DKSBase::getDeviceCount(int &ndev) { + ndev = 0; + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getDeviceCount(ndev) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getDeviceCount(ndev) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::getDeviceName(std::string &device_name) { + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getDeviceName(device_name) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getDeviceName(device_name) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::setDefaultDevice(int device) { + std::cout << "Set device " << device << std::endl; + if (apiOpenCL()) + return OPENCL_SAFECALL( 
oclbase->ocl_setDevice(device) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_setDevice(device) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +int DKSBase::getDeviceList(std::vector &devices) { + if (apiOpenCL()) + return OPENCL_SAFECALL( oclbase->ocl_getUniqueDevices(devices) ); + else if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_getUniqueDevices(devices) ); + else if (apiOpenMP()) + return DKS_ERROR; + else + return DKS_ERROR; +} + +/* + init device +*/ +int DKSBase::initDevice() { + + //if api is not set default is OpenCL + if (!m_api_set) { + setDevice("-gpu", 4); + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); + } else { + if (apiOpenCL()) { + if (!m_device_set) { + setDevice("-gpu", 4); + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp("-gpu") ); + } else { + setAPI(API_OPENCL, 6); + return OPENCL_SAFECALL( oclbase->ocl_setUp(m_device_name) ); + } + } else if (apiCuda()) { + setDevice("-gpu", 4); + setAPI(API_CUDA, 4); + return CUDA_SAFECALL(DKS_SUCCESS); + } else if (apiOpenMP()) { + setDevice("-mic", 4); + setAPI(API_OPENMP, 6); + return MIC_SAFECALL(DKS_SUCCESS); + } + } + + return DKS_ERROR; +} + +/* + set up cuda, opencl and mic to allow async data transfer and kernel execution. + name stream 'stolen' from cuda. opencl context ~ cuda stream. 
+ TODO: implementations for OpenCL and MIC still needed +*/ +int DKSBase::createStream(int &streamId) { + + if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_createStream(streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micbase->mic_createStream(streamId) ); + + DEBUG_MSG("Streams not enbled for this platforms jet"); + return DKS_ERROR; +} + +/* send device pointer to other processes */ +#ifdef DKS_MPI +int DKSBase::sendPointer(void *mem_ptr, int dest, MPI_Comm comm) { + + if ( apiCuda() ) { +#ifdef DKS_CUDA + cudaError cerror; + cudaIpcMemHandle_t shandle; + cerror = cudaIpcGetMemHandle(&shandle, mem_ptr); + MPI_Send(&shandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, dest, 100, comm); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error geting mem handle"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +#endif + } + else if (apiOpenMP()) { +#ifdef DKS_MIC + //BENI: + DEBUG_MSG("No SendPointer for MIC is implemented"); + return DKS_ERROR; +#endif + } + else { + DEBUG_MSG("Send device pointer not implemented on selected platform"); + return DKS_ERROR; + } + return DKS_ERROR; +} +#endif + +/* receive device pointer */ +#ifdef DKS_MPI +void * DKSBase::receivePointer(int hostproc, MPI_Comm comm, int &ierr) { + + void *mem_ptr; + if (apiCuda()) { +#ifdef DKS_CUDA + cudaError cerror; + cudaIpcMemHandle_t rhandle; + MPI_Recv(&rhandle, sizeof(cudaIpcMemHandle_t), MPI_BYTE, hostproc, 100, comm, NULL); + cerror = cudaIpcOpenMemHandle(&mem_ptr, rhandle, cudaIpcMemLazyEnablePeerAccess); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error opening received handle"); + ierr = DKS_ERROR; + } +#endif + return mem_ptr; + } + else if (apiOpenMP()) { +#ifdef DKS_MIC + //BENI: + DEBUG_MSG("No ReceivePointer for MIC is implemented"); + return DKS_SUCCESS; +#endif + return mem_ptr; + } + else { + ierr = DKS_ERROR; + DEBUG_MSG("Receive device pointer not implemented for selected platform"); + return mem_ptr; + } +} +#endif + +/* close received handle */ +int 
DKSBase::closeHandle(void *mem_ptr) { + + if (apiCuda()) { +#ifdef DKS_CUDA + cudaError cerror; + cerror = cudaIpcCloseMemHandle(mem_ptr); + if (cerror != cudaSuccess) { + DEBUG_MSG("Error closing memory handle"); + return DKS_ERROR; + } + + return DKS_SUCCESS; +#endif + } + + DEBUG_MSG("Memory handles not implemented for selected platform"); + return DKS_ERROR; + +} + +/* sync device calls */ +int DKSBase::syncDevice() { + + if (apiCuda()) + return CUDA_SAFECALL( cbase->cuda_syncDevice() ); + else if (apiOpenMP()) + return MIC_SAFECALL( micbase->mic_syncDevice() ); + + return DKS_ERROR; +} + +/* setup fft plans to reuse if multiple ffts of same size are needed */ +int DKSBase::setupFFT(int ndim, int N[3]) { + + if (apiCuda()) { + return CUDA_SAFECALL( cfft->setupFFT(ndim, N) ); + } else if (apiOpenMP()) { + //micbase.mic_setupFFT(ndim, N); + //BENI: setting up RC and CR transformations on MIC + int ierr1 = MIC_SAFECALL( micfft->setupFFTRC(ndim, N, 1.) ); + int ierr2 = MIC_SAFECALL( micfft->setupFFTCR(ndim, N, 1./(N[0]*N[1]*N[2])) ); + if (ierr1 != DKS_SUCCESS) + return ierr1; + if (ierr2 != DKS_SUCCESS) + return ierr2; + return DKS_SUCCESS; + } + + return DKS_ERROR; + +} +//BENI: +int DKSBase::setupFFTRC(int ndim, int N[3], double scale) { + + if (apiCuda()) + return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + else if (apiOpenMP()) + return MIC_SAFECALL(micfft->setupFFTRC(ndim, N, scale)); + + return DKS_ERROR; + +} + +//BENI: +int DKSBase::setupFFTCR(int ndim, int N[3], double scale) { + + if (apiCuda()) + return CUDA_SAFECALL(cfft->setupFFT(ndim, N)); + else if (apiOpenMP()) + return MIC_SAFECALL(micfft->setupFFTCR(ndim, N, scale)); + + return DKS_ERROR; + +} + +/* call OpenCL FFT function for selected platform */ +int DKSBase::callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL()) { + //load kernel and execute + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( 
oclfft->executeFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL(cfft->executeFFT(data_ptr, ndim, dimsize, streamId)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micfft->executeFFT(data_ptr, ndim, dimsize)); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call OpenCL IFFT function for selected platform */ +int DKSBase::callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + if (apiOpenCL()) { + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( oclfft->executeIFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL( cfft->executeIFFT(data_ptr, ndim, dimsize, streamId) ); + } else if (apiOpenMP()) { + return MIC_SAFECALL( micfft->executeIFFT(data_ptr, ndim, dimsize) ); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call normalize FFT function for selected platform */ +int DKSBase::callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiOpenCL()) { + if ( loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLFFT.cl") == DKS_SUCCESS ) + return OPENCL_SAFECALL( oclfft->normalizeFFT(data_ptr, ndim, dimsize) ); + else + return DKS_ERROR; + } else if (apiCuda()) { + return CUDA_SAFECALL( cfft->normalizeFFT(data_ptr, ndim, dimsize, streamId) ); + } else if (apiOpenMP()) { + return MIC_SAFECALL( micfft->normalizeFFT(data_ptr, ndim, dimsize) ); + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* call real to complex FFT */ +int DKSBase::callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL( cfft->executeRCFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micfft->executeRCFFT(real_ptr,comp_ptr, ndim, dimsize) ); + + DEBUG_MSG("No 
implementation for selected platform"); + return DKS_ERROR; +} + +/* call complex to real FFT */ +int DKSBase::callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return CUDA_SAFECALL( cfft->executeCRFFT(real_ptr, comp_ptr, ndim, dimsize, streamId) ); + else if (apiOpenMP()) + return MIC_SAFECALL( micfft->executeCRFFT(comp_ptr,real_ptr, ndim, dimsize) ); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; +} + +/* normalize complex to real iFFT */ +int DKSBase::callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId) { + if (apiCuda()) + return CUDA_SAFECALL( cfft->normalizeCRFFT(real_ptr, ndim, dimsize, streamId) ); + + DEBUG_MSG("No implementation for selected platform"); + return DKS_SUCCESS; +} + +/* normalize complex to real iFFT */ +int DKSBase::callTranspose(void *mem_ptr, int N[3], int ndim, int dim) { + if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLTranspose.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclfft->ocl_executeTranspose(mem_ptr, N, ndim, dim)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selected platform"); + return DKS_ERROR; + +} + +int DKSBase::callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, + double hz_m0, double hz_m1, double hz_m2, int streamId) { + + if (apiCuda()) { + return CUDA_SAFECALL(cgreens->cuda_GreensIntegral(tmp_ptr, I, J, K, NI, NJ, + hz_m0, hz_m1, hz_m2, streamId) ); + } else if (apiOpenMP()) { + //BENI: + return MIC_SAFECALL(micgreens->mic_GreensIntegral(tmp_ptr, I, J, K, hz_m0, hz_m1, hz_m2)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callGreensIntegration(void *mem_ptr, void *tmp_ptr, + int I, int J, int K, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K, streamId)); + else if (apiOpenMP()) + return 
MIC_SAFECALL(micgreens->mic_IntegrationGreensFunction(mem_ptr, tmp_ptr, I, J, K)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_MirrorRhoField(mem_ptr, I, J, K, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(micgreens->mic_MirrorRhoField(mem_ptr, I, J, K)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId) { + + if (apiCuda()) + return CUDA_SAFECALL(cgreens->cuda_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(micgreens->mic_MultiplyCompelxFields(mem_ptr1, mem_ptr2, size)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + + +int DKSBase::callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, double &result) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_PHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_PHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, 
fRebin, fGoodBinOffset, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_singleGaussTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + if (apiCuda()) { + return CUDA_SAFECALL(cchi->cuda_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, + result)); + } else if (apiOpenCL()) { + + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLChiSquare.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclchi->ocl_doubleLorentzTF(mem_data, mem_t0, mem_par, mem_result, + fTimeResolution, fRebin, fGoodBinOffset, + sensors, length, numpar, result)); + else + return DKS_ERROR; + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callCollimatorPhysics(void *mem_ptr, void *par_ptr, + int numparticles, int numparams, + int &numaddback, int &numdead) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + } else if (apiOpenCL()) { + if (loadOpenCLKernel("OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl") == DKS_SUCCESS) + return OPENCL_SAFECALL(oclcol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + else + return DKS_ERROR; + + } else if (apiOpenMP()) { + return MIC_SAFECALL(miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles)); + } + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + + +int DKSBase::callCollimatorPhysics2(void *mem_ptr, void 
*par_ptr, int numparticles) +{ + + if (apiCuda()) + return CUDA_SAFECALL( ccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) ); + else if (apiOpenMP()) + return MIC_SAFECALL( miccol->CollimatorPhysics(mem_ptr, par_ptr, numparticles) ); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) +{ + + if (apiOpenMP()) { + return MIC_SAFECALL( miccol->CollimatorPhysicsSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr, pz_ptr, + par_ptr, numparticles) ); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + + +int DKSBase::callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) +{ + + if (apiCuda()) + return CUDA_SAFECALL(ccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback)); + else if (apiOpenMP()) + return MIC_SAFECALL(miccol->CollimatorPhysicsSort(mem_ptr, numparticles, numaddback)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; +} + +int DKSBase::callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) +{ + + if (apiOpenMP()) { + return MIC_SAFECALL(miccol->CollimatorPhysicsSortSoA(label_ptr, localID_ptr, + rx_ptr, ry_ptr, rz_ptr, + px_ptr, py_ptr, pz_ptr, + par_ptr, numparticles, numaddback)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + + +int DKSBase::callInitRandoms(int size) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_createCurandStates(size)); + else if (apiOpenCL()) + return OPENCL_SAFECALL(oclbase->ocl_createRndStates(size)); + else if (apiOpenMP()) + return MIC_SAFECALL(micbase->mic_createRandStreams(size)); + + 
DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, + bool usedt, int streamId) +{ + + if (apiCuda()) + return CUDA_SAFECALL(ccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, c, + usedt, streamId)); + else if (apiOpenMP()) + return MIC_SAFECALL(miccol->ParallelTTrackerPush(r_ptr, p_ptr, npart, dt_ptr, dt, + c, usedt, streamId)); + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} + +int DKSBase::callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, void *dt_ptr, double dt, + double c, bool usedt, int streamId) +{ + + if (apiCuda()) { + return CUDA_SAFECALL(ccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, + lastSec_ptr, orient_ptr, + npart, nsec, dt_ptr, dt, + c, usedt, streamId)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(miccol->ParallelTTrackerPushTransform(x_ptr, p_ptr, + lastSec_ptr, orient_ptr, + npart, nsec, dt_ptr, dt, + c, usedt, streamId)); + } + + DEBUG_MSG("No implementation for selceted platform"); + return DKS_ERROR; + +} diff --git a/src/DKSBase.h b/src/DKSBase.h new file mode 100644 index 0000000..ea8bc39 --- /dev/null +++ b/src/DKSBase.h @@ -0,0 +1,1133 @@ +/** DKSBase class. 
+ * DKSBase.h + * Author: Uldis Locans + * Date: 15.09.2014 + * Base class of Dynamic Kernel Scheduler that handles the function calls + * from host application to DKS + */ + +#ifndef H_DKS_BASE +#define H_DKS_BASE + +#include +#include +#include +#include + +#include "DKSDefinitions.h" + +#ifdef DKS_MPI +#include +#endif + +#ifdef DKS_OPENCL + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "OpenCL/OpenCLBase.h" +#include "OpenCL/OpenCLFFT.h" +#include "OpenCL/OpenCLChiSquare.h" +#include "OpenCL/OpenCLCollimatorPhysics.h" +#endif + +#ifdef DKS_CUDA +#include "CUDA/CudaBase.cuh" +#include "CUDA/CudaFFT.cuh" +#include "CUDA/CudaGreensFunction.cuh" +#include "CUDA/CudaChiSquare.cuh" +#include "CUDA/CudaCollimatorPhysics.cuh" +#include "nvToolsExt.h" +#endif + +#ifdef DKS_MIC +#include "MIC/MICBase.h" +#include "MIC/MICChiSquare.h" +#include "MIC/MICFFT.h" +#include "MIC/MICCollimatorPhysics.h" +#include "MIC/MICGreensFunction.hpp" +#endif + +#include "Algorithms/CollimatorPhysics.h" +#include "Algorithms/FFT.h" + +#include "AutoTuning/DKSConfig.h" + +/** DKSBase class for handling function calls to DKS library */ +class DKSBase { + +private: + char *m_device_name; + char *m_api_name; + char *m_function_name; + + bool m_device_set; + bool m_api_set; + bool m_function_set; + + bool m_auto_tuning; + bool m_use_config; + +#ifdef DKS_OPENCL + OpenCLBase *oclbase; + OpenCLFFT *oclfft; + OpenCLChiSquare *oclchi; + OpenCLCollimatorPhysics *oclcol; +#endif + +#ifdef DKS_CUDA + CudaBase *cbase; + CudaFFT *cfft; + CudaGreensFunction *cgreens; + CudaChiSquare *cchi; + CudaCollimatorPhysics *ccol; +#endif + +#ifdef DKS_MIC + MICBase *micbase; + MICFFT *micfft; + MICCollimatorPhysics *miccol; + MICGreensFunction *micgreens; + MICChiSquare *micchi; +#endif + +protected: + + //gives access to dks autotuning config file + DKSConfig dksconfig; + + /** + * Check if current API is set to OpenCL + * Return true/false wether current api is opencl + */ + bool apiOpenCL(); 
+ + /** + * Check if current API is set to CUDA. + * Return true/false wether curretn api is cuda + */ + bool apiCuda(); + + /** + * Check if current API is set to OpenMP. + * Return true/false whether current api is OpenMP + */ + bool apiOpenMP(); + + /** Check if device is GPU */ + bool deviceGPU(); + /** Check if device is CPU */ + bool deviceCPU(); + /** Check if device is MIC */ + bool deviceMIC(); + + /** + * Get cbase pointer + */ +#ifdef DKS_CUDA + CudaBase *getCudaBase() { + return cbase; + } +#endif + +#ifdef DKS_OPENCL + OpenCLBase *getOpenCLBase() { + return oclbase; + } +#endif + + /** Call OpenCL base to load specified kenrel file. + * + */ + int loadOpenCLKernel(const char *kernel_name); + + std::string getAPI() { + std::string api_name(m_api_name); + return api_name; + } + + std::string getDevice() { + std::string device_name(&m_device_name[1]); + return device_name; + } + +public: + + /** + * Default constructor. + */ + DKSBase(); + + /** + * Constructor that sets api and devcie to use with DKS. + */ + DKSBase(const char* api_name, const char* device_name); + + + /** + * Destructor. + * Free DKS resources. + */ + ~DKSBase(); + + /** Turn on auto tuning */ + void setAutoTuningOn() { m_auto_tuning = true; } + + /** Turn of auto tuning */ + void setAutoTuningOff() { m_auto_tuning = false; } + + /** Get status of auto tuning */ + bool isAutoTuningOn() { return m_auto_tuning; } + + /** Turn on use of config file */ + void setUseConfigOn() { m_use_config = true; } + + /** Turn off use of config file */ + void setUseConfigOff() { m_use_config = false; } + + /** Check if using config file */ + bool isUseConfigOn() { return m_use_config; } + + /** + * Set device to use with DKS. + * Sets specific device to use with DKS. Supported devices are -gpu and -mic. + * Length specifies the number of characters in device_name array (length - deprecated). + * Return success or error code. 
+ */ + int setDevice(const char* device_name, int length = -1); + + /** + * Set framework to use with DKS. + * Sets framework and API that DKS uses to execute code on device. Supported API's + * are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies + * the number of characters in api_name array (length - deprecated). + */ + int setAPI(const char* api_name, int length = -1); + + /** + * Prints information about all available devices. + * Calls CUDA, OpenCL and MIC functions to query for available devices + * for each framework and pirnts information about each device. Length specifies + * the number of characters in api_name array + * Returns success or error code + */ + int getDevices(); + + /** + * Returns device count. + * Saves the number of the devices available on the platform to ndev. + */ + int getDeviceCount(int &ndev); + + /** Get the name of the device in use. + * Query the device that is used and get the naem of the device. The name is saved in the + * device_name string. Returns DKS_SUCCESS + */ + int getDeviceName(std::string &device_name); + + /** Set the device to use. + * Pass the index of the device to use by dks. + */ + int setDefaultDevice(int device); + + /** Get unique devices. + * Get a list of all the unique devices available on the platform. + * When API and device type for DKS is set, getDeviceList can get all the unique devices + * available for this API and device type. Used for autotuning if multiple different GPUs are + * installed on the system. + */ + int getDeviceList(std::vector &devices); + + /** + * Inititialize DKS. + * Set framework and device to use. If OpenCL is used create context with device. + * Return success or error code. + */ + int initDevice(); + + /** + * Create stream for async execution. + * Function to create different streams with device to allow assync kernel execution and data + * transfer. Currently implemented for CUDA with cuda streams. 
streamId will be can be used later + * use the created stream. Returns success or error code. + * TODO: for opencl use different + * contexts similar as cuda streams to achieve async execution. TODO: for intel mic look at + * library (libxstream) from Hans Pabst. + */ + int createStream(int &streamId); + + /** + * Send pointer to device memory from one MPI process to another. + * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses + * cuda icp. Gets icp handle of memory allocated on device pointed by mem_ptr does MPI_Send to + * dest process where matching receivePointer should be called. Returns success or error code. + * TODO: opencl and mic cases still need implementations + */ +#ifdef DKS_MPI + int sendPointer(void *mem_ptr, int dest, MPI_Comm comm); +#endif + + /** + * Receive pointer to device memory from another MPI process. + * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses + * cuda icp. Uses MPI_Recv to get icp handle from another MPI process and opens a reference + * to this memory. Togeter with sendPointer function allows multiple MPI processes to share + * one memory region of the device. Returns success or error code. + * TODO: opencl and mic cases still need implementations + */ +#ifdef DKS_MPI + void * receivePointer(int hostproc, MPI_Comm comm, int &ierr); +#endif + + /** + * Close handle to device memory. + * If receivePointer is used to open memory handle allocated by another MPI process closeHandle + * should be called to free resources instead of freeMemory. Returns success or error code. + * TODO: opencl and mic cases still need implementations. + */ + int closeHandle(void *mem_ptr); + + /** + * Wait till all tasks running on device are completed. + * Forces a device synchronization - waits till all tasks on the device are complete. + * Implemented for cuda. 
Forces sync only in context in witch it is called - only waits + * for tasks launched by process calling syncDevice. If multiple processes launch different + * tasks each process is responsible for its own synchronization. Returns success or error code. + * TODO: opencl and mic implementations still necessary + */ + int syncDevice(); + + /** + * Allocate memory and transfer data to device. + * Returns a void pointer which can be used in later kernels to reference + * allocated device memory. data_in pointer to data to be transfered to device, + * elements is the number of data elements to transfer, T - type of data to transfer. + * If memory allocation or data transfer fails ierr will be set to error code. + */ + template + void * pushData(const void *data_in, int elements, int &ierr) { + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem mem_ptr; + size_t size = sizeof(T)*elements; + mem_ptr = oclbase->ocl_allocateMemory(size, ierr); + oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE); + + ierr = DKS_SUCCESS; + return mem_ptr; +#endif + } else if (apiCuda()){ +#ifdef DKS_CUDA + //cuda version + void * mem_ptr = NULL; + size_t size = sizeof(T)*elements; + mem_ptr = cbase->cuda_allocateMemory(size, ierr); + cbase->cuda_writeData((T*)mem_ptr, data_in, size); + + ierr = DKS_SUCCESS; + return mem_ptr; +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + void * mem_ptr = NULL; + mem_ptr = micbase.mic_pushData(data_in, elements); + + return mem_ptr; +#endif + } + + ierr = DKS_ERROR; + return NULL; + } + + /** + * Read data from device and free device memory. + * Reads data from device pointed by mem_ptr into data_out pointer. Elements + * specifies the number of data elements to read, T specifies the datatype of + * elements to copy. Returns error code if read data or free memory fails. 
+ */ + template + int pullData(void *mem_ptr, void* data_out, int elements) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + oclbase->ocl_readData(clmem_ptr, data_out, size); + oclbase->ocl_freeMemory(clmem_ptr); +#endif + } else if (apiCuda()) { +#ifdef DKS_CUDA + //cuda version + size_t size = sizeof(T)*elements; + cbase->cuda_readData((T*)mem_ptr, data_out, size); + cbase->cuda_freeMemory(mem_ptr); +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + micbase.mic_pullData(mem_ptr, data_out, elements); +#endif + } + + return DKS_SUCCESS; + } + + /** + * Allocate memory on device and return pointer to device memory. + * Allocates memory of type T, elements specifies the number of + * elements for which memory should be allocated. If memory allocation + * fails ierr is set to error code. Returns void pointer to device memory. + */ + template + void * allocateMemory(int elements, int &ierr) { + ierr = DKS_SUCCESS; + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem mem_ptr; + size_t size = sizeof(T)*elements; + mem_ptr = oclbase->ocl_allocateMemory(size, ierr); + return mem_ptr; +#endif + } else if (apiCuda()) { +#ifdef DKS_CUDA + //cuda version + void * mem_ptr = NULL; + size_t size = sizeof(T)*elements; + mem_ptr = cbase->cuda_allocateMemory(size, ierr); + return mem_ptr; +#endif + } else if (apiOpenMP()) { +#ifdef DKS_MIC + void * mem_ptr = NULL; + mem_ptr = micbase.mic_allocateMemory(elements); + return mem_ptr; +#endif + } + + ierr = DKS_ERROR; + return NULL; + } + + /** + * Allocates host memory as page-locked. + * Used for memroy allocation on the host side for pointer ptr for size elements. + * Page locked memory improves + * data transfer rates between host and device and allows async data transfer + * and kernel execution. Reurns succes or error code. + * TODO: opencl and mic implementations needed. 
+ */ + template + int allocateHostMemory(T *&ptr, int size) + { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size)); + + DEBUG_MSG("Pinned memory allocation not implemented for this platform"); + return DKS_ERROR; + } + + /** + * Free host page-locked memory. + * Used to free page-locked memory on the host that was allocated using + * allocateHostMemory. ptr is the host pointer where page-locked memory was allocated, + * size - number of elements held by the memroy. + */ + template + int freeHostMemory(T* &ptr, int size) + { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr)); + + return DKS_ERROR; + } + + /** + * Page lock allocated host memory. + * Page locked memory improves data transfer between host and device (true for cuda and + * opencl, maybe also mic). ptr - pointer to memory that needs to be page locked, + * size - number of elements in array. + * TODO: mic and opencl implementations needed + */ + template + int registerHostMemory(T *ptr, int size) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size)); + + return DKS_ERROR; + } + + /** + * Unregister page locked memory. + * TODO: opencl and mic implementations needed· + */ + template + int unregisterHostMemory(T *ptr) { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr)); + return DKS_ERROR; + } + + /** + * Write data from host to device. + * Write data from data to device memory referenced by mem_ptr. Elements spicify the + * number of elements to write, offset specifies the offset from the first element. + * Returns success or error code. Performs a blocking write - control to the host + * is returned only when data transfer is complete. 
+ */ + template + int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + size_t offset_bytes = sizeof(T)*offset; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE); +#endif + + } else if (apiCuda()){ + //cuda version + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset)); + + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_writeData(mem_ptr, data, elements, offset)); + + } + + return DKS_ERROR; + + } + + /** + * Write data to device using async write. + * Queue a async data write and return control to host imediately. + * mem_ptr - device memory pointer, data - host memory pointer, + * elements - number of data elements to write + * stremaId - stream id to use, offset - offset on device from first element + * For trully async execution on cuda stream other than default needs to be created + * and device memory must be page-locked. Otherwise functions just asynchronosly with + * respect to host. + * TODO: mic and opencl implementations needed (goes to blocking writes) + */ + template + int writeDataAsync(void *mem_ptr, const void *data, int elements, + int streamId = -1, int offset = 0) { + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + size_t size = sizeof(T)*elements; + cl_mem clmem_ptr = (cl_mem)mem_ptr; + oclbase->ocl_writeData(clmem_ptr, data, size, 0, CL_FALSE); +#endif + } else if (apiCuda()){ + //cuda version + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_writeDataAsync(mem_ptr, data, elements, streamId, offset)); + } + + return DKS_ERROR; + + } + + /** + * Gather 3D data from multiple mpi processes to one memory region. 
+ * When multiple processes share the same device memory using sendPointer and receivePointer + * gather3DDataAsync allows each process to write data to its memory region. Uses async writes. + * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local + * data dimensions, id - starting indexes in global domain for each process + * streamId - stream to use for data transfers. + * Returns success or error code. + */ +#ifdef DKS_MPI + template + int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3], + int id[3], int streamId = -1 ) { + + + //int p = 1; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int hoffset, doffset, ierr; + + //number of continuous memory elements + int elements = Nl[0]; + if (Nl[0] == Ng[0]) { + elements *= Nl[1]; + if (Nl[1] == Ng[1]) + elements *= Nl[2]; + } + + //starting index + int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0]; + + //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split + if (Nl[0] != Ng[0]) { + for (int i = 0; i < Nl[2]; i++) { + for (int j = 0; j < Nl[1]; j++) { + doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid; + hoffset = (i * Nl[1] + j) * elements; + ierr = writeDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + } + return DKS_SUCCESS; + } + + //copy piece by piece 3rd dim if 2nd dim is split + if (Nl[1] != Ng[1]) { + for (int i = 0; i < Nl[2]; i++) { + doffset = i* Ng[1] * Ng[0] + sid; + ierr = writeDataAsync(mem_ptr, data + i*elements, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + return DKS_SUCCESS; + } + + //if only 3rd dim is split all elements are continuous so write one chunk + doffset = sid; + return writeDataAsync(mem_ptr, data, elements, streamId, doffset); + + } +#endif + + /** + * Scatter 3D data to multiple MPI processes from one device memory region. 
+ * When multiple processes share the same device memory using sendPointer and receivePointer + * scatter3DDataAsync allows each process to read data from its memory region. Uses async reads. + * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local + * data dimensions, id - starting indexes in global domain for each process + * streamId - stream to use for data transfers. + * Returns success or error code. + */ +#ifdef DKS_MPI + template + int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3], + int id[3], int streamId = -1) { + + //int p = 1; + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int hoffset, doffset, ierr; + + //number of continuous memory elements + int elements = Nl[0]; + if (Nl[0] == Ng[0]) { + elements *= Nl[1]; + if (Nl[1] == Ng[1]) + elements *= Nl[2]; + } + + //starting index + int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0]; + + //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split + if (Nl[0] != Ng[0]) { + for (int i = 0; i < Nl[2]; i++) { + for (int j = 0; j < Nl[1]; j++) { + doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid; + hoffset = (i * Nl[1] + j) * elements; + ierr = readDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + } + return DKS_SUCCESS; + } + + //copy piece by piece 3rd dim if 2nd dim is split + if (Nl[1] != Ng[1]) { + for (int i = 0; i < Nl[2]; i++) { + doffset = i* Ng[1] * Ng[0] + sid; + hoffset = i * elements; + ierr = readDataAsync(mem_ptr, data + hoffset, elements, streamId, doffset); + if (ierr == DKS_ERROR) return DKS_ERROR; + } + return DKS_SUCCESS; + } + + //if only 3rd dim is split all elements are continuous so write one chunk + doffset = sid; + return readDataAsync(mem_ptr, data, elements, streamId, doffset); + + } +#endif + + /** + * Create MPI subarray for 3D data gather and scatter using cuda aware MPI. 
+ * If multiple MPI processes share device and cuda aware MPI is used for data transfer + * creates a MPI subarray so each MPI process can write and read to its own memory region. + * N_global - global domain dimensions, N_local - local domain dimensions, datatype - MPI datatype + */ +#ifdef DKS_MPI + template + MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) { + //create MPI datatypes to transfer decomposed domain from GPU memory + int sizes[3] = {N_global[2], N_global[1], N_global[0]}; + int subsizes[3] = {N_local[2], N_local[1], N_local[0]}; + int starts[3] = {0, 0, 0}; + + MPI_Datatype stype, rtype; + MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype); + MPI_Type_create_resized(stype, 0, sizeof(T), &rtype); + MPI_Type_commit(&rtype); + + return rtype; + } +#endif + + /** + * Gather 3D data from multiple MPI processes to device using cuda aware MPI. + * Using cuda aware mpi allows to gather data to one device memory region allocated + * by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer, + * size - number of elements to transfer, stype - data type of elements, N_global - + * global dimensions of the domain, N_local - local domain dimensions, + * idx,idy,idz - starting indexes in global domain for each process, numNodes - number + * of processes, myNode - current node, rootNode - node that allocated device memory, + * comm - MPI communicator + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, MPI_Comm comm) + { + + MPI_Datatype rtype = create3DMPISubarray(N_global, N_local, stype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } else if (apiCuda()) { + MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm ); + } else if (apiOpenMP()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather. + * For detailed parameter description see gather3DData docs. + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, + MPI_Comm comm, MPI_Request &request) + { + + MPI_Datatype rtype = create3DMPISubarray(N_global, N_local, stype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } else if (apiCuda()) { + MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm, &request ); + + } else if (apiOpenMP()) { + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Scatter 3D data from device to multiple MPI processes using cuda aware MPI. + * If multiple MPI prcesses share one device allows to scatter 3D data regions + * from device memory allocated by one of the processes to all other MPI processes. + * For detailed parameter description see gather3DData docs. + * TODO: opencl and mic implementations (solution other than cuda aware mpi needed). 
+ */ +#ifdef DKS_MPI + template + int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3], + int N_local[3], int * idx, int * idy, int * idz, + int numNodes, int myNode, int rootNode, MPI_Comm comm) + { + + MPI_Datatype stype = create3DMPISubarray(N_global, N_local, rtype); + + //calculate displacements from global domain size and local domain starting index + int *counts = new int[numNodes]; + int *displs = new int[numNodes]; + for (int i = 0; i < numNodes; i++) { + counts[i] = 1; + displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1]; + } + + if (apiOpenCL()) { + //TODO: gather all the date in root node, transfer to device from root node + } else if (apiCuda()) { + + //async scatter + //use cuda aware mpi + MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, rootNode, comm ); + return DKS_ERROR; + } else if (apiOpenMP()) { + + //TODO: gather all the date in root node, transfer to device from root node + return DKS_ERROR; + } + + return DKS_SUCCESS; + + } +#endif + + /** + * Read data from device memory. + * Read data referenced by mem_ptr int out_data. Elements indicates the number of data + * elements to read and offset is the offset on the device from start of the memroy. + * Data type to read is specified by T. Performs a blocking read. 
+ */ + template + int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem clmem_ptr = (cl_mem)mem_ptr; + size_t size = sizeof(T)*elements; + size_t offset_bytes = sizeof(T)*offset; + return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes); +#endif + } else if (apiCuda()){ + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_readData(mem_ptr, out_data, elements, offset)); + } + + return DKS_ERROR; + } + + /** + * Performs an async data read from device. + * Queues data read from device and returns control to host. stream id specifies stream to use for + * the read. Device async read can be performed if host memroy is page-locked and strema other than + * default -1 is used. For other parameter detailed description see readData function. + * TODO: opencl and mic implementations (currently reverts to blocking reads). + */ + template + int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) { + + if (apiOpenCL()) { +#ifdef DKS_OPENCL + //OpenCL version + cl_mem clmem_ptr = (cl_mem)mem_ptr; + size_t size = sizeof(T)*elements; + return oclbase->ocl_readData(clmem_ptr, out_data, size, 0); +#endif + } else if (apiCuda()){ + //cuda version + size_t size = sizeof(T)*elements; + return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset)); + } else if (apiOpenMP()) { + return MIC_SAFECALL(micbase.mic_readDataAsync(mem_ptr, out_data, elements, + streamId, offset)); + } + + return DKS_ERROR; + } + + + /** + * Free memory allocated on device. + * Free memory referenced by mem_ptr, elements - number of elements in memory, + * T - data type. 
+ */ + template + int freeMemory(void *mem_ptr, int elements) { + if (apiOpenCL()) + return OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr)); + else if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr)); + else if (apiOpenMP()) + return MIC_SAFECALL(micbase.mic_freeMemory(mem_ptr, elements)); + + return DKS_ERROR; + } + + + /////////////////////////////////////////////// + ///////Function library part of dksbase//////// + /////////////////////////////////////////////// + + /** + * Setup FFT function. + * Initializes parameters for fft executuin. If ndim > 0 initializes handles for fft calls. + * If ffts of various sizes are needed setupFFT should be called with ndim 0, in this case + * each fft will do its own setup according to fft size and dimensions. + * TODO: opencl and mic implementations + */ + int setupFFT(int ndim, int N[3]); + //BENI: + int setupFFTRC(int ndim, int N[3], double scale = 1.0); + //BENI: + int setupFFTCR(int ndim, int N[3], double scale = 1.0); + + /** + * Call complex-to-complex fft. + * Executes in place complex to compelx fft on the device on data pointed by data_ptr. + * stream id can be specified to use other streams than default. + * TODO: mic implementation + */ + int callFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call complex-to-complex ifft. + * Executes in place complex to compelx ifft on the device on data pointed by data_ptr. + * stream id can be specified to use other streams than default. + * TODO: mic implementation. + */ + int callIFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Normalize complex to complex ifft. + * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by + * fft size + * TODO: mic implementation. + */ + int callNormalizeFFT(void * data_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call real to complex FFT. 
+ * Executes out of place real to complex fft, real_ptr points to real data, comp_pt - points + * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size + * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast + * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] + * TODO: opencl and mic implementations + */ + int callR2CFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Call complex to real iFFT. + * Executes out of place complex to real ifft, real_ptr points to real data, comp_pt - points + * to complex data, ndim - dimension of data, dimsize size of each dimension. real_ptr size + * should be dimsize[0]*dimsize[1]*disize[2], comp_ptr size should be atleast + * (dimsize[0]/2+1)*dimsize[1]*dimsize[2] + * TODO: opencl and mic implementations. + */ + int callC2RFFT(void * real_ptr, void * comp_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Normalize compelx to real ifft. + * Cuda, mic and OpenCL implementations return ifft unscaled, this function divides each element by + * fft size. + * TODO: opencl and mic implementations. + */ + int callNormalizeC2RFFT(void * real_ptr, int ndim, int dimsize[3], int streamId = -1); + + /** + * Transpose 2D and 3D arrays, OpenCL implementation + * N - size of dimensions, ndim - number of dimensions, dim - dim to transpose + */ + int callTranspose(void *mem_ptr, int N[3], int ndim, int dim); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. + * TODO: opencl and mic implementations. + */ + int callGreensIntegral(void *tmp_ptr, int I, int J, int K, int NI, int NJ, + double hz_m0, double hz_m1, double hz_m2, int streamId = -1); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. + * TODO: opencl and mic implementations. 
+ */ + int callGreensIntegration(void *mem_ptr, void *tmp_ptr, + int I, int J, int K, int streamId = -1); + + /** + * Integrated greens function from OPAL FFTPoissonsolver.cpp put on device. + * For specifics check OPAL docs. + * TODO: opencl and mic implementations. + */ + int callMirrorRhoField(void *mem_ptr, int I, int J, int K, int streamId = -1); + + /** + * Element by element multiplication. + * Multiplies each element of mem_ptr1 with corresponding element of mem_ptr2, size specifies + * the number of elements in mem_ptr1 and mem_ptr2 to use. Results are put in mem_ptr1. + * TODO: opencl and mic implementations. + */ + int callMultiplyComplexFields(void *mem_ptr1, void *mem_ptr2, int size, int streamId = -1); + + /** + * Chi square for parameter fitting on device. + * mem_data - measurement data, mem_par - pointer to parameter set, mem_chisq - pointer for + * intermediate results. Chi square results are put in &results + */ + int callPHistoTFFcn(void *mem_data, void *mem_par, void *mem_chisq, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, double &result); + + /** + * max-log-likelihood for parameter fitting on device. + * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, + * mem_par - pointer to parameter set, mem_results - pointer for + * intermediate results. Chi square results are put in &results. + * TODO: opencl and mic implementations. + */ + int callSingleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffser, + int sensors, int length, int numpar, + double &result); + + /** + * max-log-likelihood for parameter fitting on device. + * mem_data - measurement data, mem_t0 - pointer to time 0 for each sensor, + * mem_par - pointer to parameter set, mem_results - pointer for + * intermediate results. Chi square results are put in &results. + * TODO: opencl and mic implementations. 
+ */ + int callDoubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffser, + int sensors, int length, int numpar, + double &result); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. + */ + int callCollimatorPhysics(void *mem_ptr, void *par_ptr, + int numparticles, int numparams, + int &numaddback, int &numdead); + + + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. + */ + int callCollimatorPhysics2(void *mem_ptr, void *par_ptr, int numparticles); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * Test function for the MIC to test SoA layout vs AoS layout used in previous versions + */ + int callCollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. + */ + int callCollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + /** + * Monte carlo code for the degrader from OPAL classic/5.0/src/Solvers/CollimatorPhysics.cpp on device. + * For specifics check OPAL docs and CudaCollimatorPhysics class documentation. + * TODO: opencl and mic implementations. 
+ */ + int callCollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback); + + /** + * Init random number states and save for reuse on device. + * TODO: opencl and mic implementations. + */ + int callInitRandoms(int size); + + /** + * Integration code from ParallelTTracker from OPAL. + * For specifics check OPAL docs and CudaCollimatorPhysics class docs + */ + int callParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + + /** + * Integration code from ParallelTTracker from OPAL. + * For specifics check OPAL docs and CudaCollimatorPhysics class docs + */ + int callParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, void *orient_ptr, + int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1); + + /** + * Print memory information on device (total, used, available) + * TODO: opencl and mic imlementation + */ + int callMemInfo() { + if (apiCuda()) + return CUDA_SAFECALL(cbase->cuda_memInfo()); + + return DKS_ERROR; + } + + /** + * Test function to profile opencl kernel calls. + * Used for debuging and timing purposes only. + */ + void oclEventInfo() { + if (apiOpenCL()) + return OPENCL_SAFECALL(oclbase->ocl_eventInfo()); + + } + + /** + * Test function to profile opencl kernel calls. + * Used for debuging and timing purposes only. 
+ */ + void oclClearEvents() { + if (apiOpenCL()) { +#ifdef DKS_OPENCL + oclbase->ocl_clearEvents(); +#endif + } + } + + +}; + +#endif diff --git a/src/DKSBaseMuSR.cpp b/src/DKSBaseMuSR.cpp new file mode 100644 index 0000000..3df59e9 --- /dev/null +++ b/src/DKSBaseMuSR.cpp @@ -0,0 +1,196 @@ +#include "DKSBaseMuSR.h" + +DKSBaseMuSR::DKSBaseMuSR() { + chiSq = nullptr; + chiSquareSize_m = -1; +} + +DKSBaseMuSR::~DKSBaseMuSR() { + freeChiSquare(); +} + +int DKSBaseMuSR::callCompileProgram(std::string function, bool mlh) { + return chiSq->compileProgram(function, mlh); +} + +int DKSBaseMuSR::callLaunchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result) +{ + + + //if we are not auto tuning and the size of the problem has changed find the new parameters + //from autotuning config file + if (!isAutoTuningOn() && length != chiSquareSize_m) { + int numBlocks, blockSize; + std::string device_name; + getDeviceName(device_name); + dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", + length, "NumBlocks", numBlocks); + dksconfig.getConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", + length, "BlockSize", blockSize); + chiSq->setKernelParams(numBlocks, blockSize); + + //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl; + + chiSquareSize_m = length; + } + + int ierr = chiSq->launchChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, + nummap, timeStart, timeStep, result); + + if ( isAutoTuningOn() ) { + std::vector config; + callAutoTuningChiSquare(fitType, mem_data, mem_err, length, numpar, numfunc, nummap, timeStart, + timeStep, result, config); + } + + return ierr; +} + +int DKSBaseMuSR::callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result, std::vector &config) +{ + + 
int loops = 100; + DKSAutoTuning *autoTuning; + if (apiCuda()) + autoTuning = new DKSAutoTuning(this, API_CUDA, DEVICE_GPU_NEW, loops); + else if (apiOpenCL() && deviceGPU()) + autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_GPU_NEW, loops); + else if (apiOpenCL() && deviceCPU()) + autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_CPU_NEW, loops); + else if (apiOpenCL() && deviceMIC()) + autoTuning = new DKSAutoTuning(this, API_OPENCL, DEVICE_MIC_NEW, loops); + else + autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW, loops); + + + int maxThreadsPerBlock = 1024; + checkMuSRKernels(fitType, maxThreadsPerBlock); + std::cout << "Max threads for autotune " << maxThreadsPerBlock << std::endl; + + //create the function to be timed + std::function f = std::bind(&ChiSquareRuntime::launchChiSquare, chiSq, + fitType, mem_data, mem_err, length, numpar, numfunc, nummap, + timeStart, timeStep, result); + autoTuning->setFunction(f, "launchChiSquare"); + + //create the parameters for auto-tuning + autoTuning->addParameter(&chiSq->blockSize_m, 32, maxThreadsPerBlock, 32, "BlockSize"); + autoTuning->addParameter(&chiSq->numBlocks_m, 100, 5000, 100, "NumBlocks"); + + autoTuning->lineSearch(); + + //autoTuning->hillClimbing(100); + + //autoTuning->simulatedAnnealing(1e-3, 1e-6); + + //autoTuning->exaustiveSearch(); + + std::string device_name; + getDeviceName(device_name); + dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length, + "NumBlocks", chiSq->numBlocks_m); + dksconfig.addConfigParameter(getAPI(), getDevice(), device_name, "ChiSquare", length, + "BlockSize", chiSq->blockSize_m); + + + config.push_back(chiSq->blockSize_m); + config.push_back(chiSq->numBlocks_m); + + delete autoTuning; + + return DKS_SUCCESS; + +} + +int DKSBaseMuSR::testAutoTuning() { + + DKSAutoTuning *autoTuning; + DKSAutoTuningTester *tester; + + autoTuning = new DKSAutoTuning(this, API_UNKNOWN, DEVICE_UNKNOWN_NEW); + tester = new 
DKSAutoTuningTester(); + + std::function f = std::bind(&DKSAutoTuningTester::peaksZ, tester); + autoTuning->setFunction(f, "testAutoTuner", false); + + autoTuning->addParameter(&tester->x, -3.0, 3.0, 0.5, "x"); + autoTuning->addParameter(&tester->y, -3.0, 3.0, 0.5, "y"); + + autoTuning->exaustiveSearch(); + + autoTuning->hillClimbing(10); + + autoTuning->simulatedAnnealing(10, 0.0005); + + return DKS_SUCCESS; +} + +int DKSBaseMuSR::callSetConsts(double N0, double tau, double bkg) { + return chiSq->setConsts(N0, tau, bkg); +} + +int DKSBaseMuSR::callSetConsts(double alpha, double beta) { + return chiSq->setConsts(alpha, beta); +} + +int DKSBaseMuSR::initChiSquare(int size_data, int size_param, int size_func, int size_map) { + int ierr; + + if (apiCuda()) { + ierr = CUDA_SAFECALL( DKS_SUCCESS ); + chiSq = CUDA_SAFEINIT(new CudaChiSquareRuntime(getCudaBase())); + } else { + ierr = OPENCL_SAFECALL( DKS_SUCCESS ); + chiSq = OPENCL_SAFEINIT(new OpenCLChiSquareRuntime(getOpenCLBase())); // SAFEINIT yields NULL, not an int, when OpenCL is disabled + } + + if (ierr == DKS_SUCCESS) { + return chiSq->initChiSquare(size_data, size_param, size_func, size_map); + } else { + DEBUG_MSG("DKS API not set, or DKS compiled without sellected API support"); + return DKS_ERROR; + } +} + +int DKSBaseMuSR::freeChiSquare() { + int ierr = DKS_SUCCESS; + if (chiSq != NULL) { + ierr = chiSq->freeChiSquare(); + delete chiSq; + chiSq = NULL; + } + return ierr; +} + +int DKSBaseMuSR::writeParams(const double *params, int numparams) { + return chiSq->writeParams(params, numparams); +} + +int DKSBaseMuSR::writeFunctions(const double *func, int numfunc) { + return chiSq->writeFunc(func, numfunc); +} + +int DKSBaseMuSR::writeMaps(const int *map, int numfunc) { + return chiSq->writeMap(map, numfunc); + +} + +int DKSBaseMuSR::checkMuSRKernels(int fitType) { + int threadsPerBlock = 1; + return chiSq->checkChiSquareKernels(fitType, threadsPerBlock); +} + +int DKSBaseMuSR::checkMuSRKernels(int fitType, int &threadsPerBlock) { + return 
chiSq->checkChiSquareKernels(fitType, threadsPerBlock); +} + +int DKSBaseMuSR::getOperations(int &oper) { + return chiSq->getOperations(oper); +} diff --git a/src/DKSBaseMuSR.h b/src/DKSBaseMuSR.h new file mode 100644 index 0000000..30f2d89 --- /dev/null +++ b/src/DKSBaseMuSR.h @@ -0,0 +1,137 @@ +#ifndef H_DKS_BASEMUSR +#define H_DKS_BASEMUSR + +#include +#include + +#include "AutoTuning/DKSAutoTuning.h" +#include "AutoTuning/DKSAutoTuningTester.h" + +#include "DKSBase.h" + +#include "Algorithms/ChiSquareRuntime.h" + +#ifdef DKS_CUDA +#include "CUDA/CudaChiSquareRuntime.cuh" +#endif + +#ifdef DKS_OPENCL +#include "OpenCL/OpenCLChiSquareRuntime.h" +#endif + +class DKSBaseMuSR : public DKSBase { + +private: + + ChiSquareRuntime *chiSq; + + int chiSquareSize_m; + +public: + + DKSBaseMuSR(); + + ~DKSBaseMuSR(); + + /** Compile the program with kernels to be run. + * String function contains the string that will be added to the code to compile in the + * function: __device__ double fTheory(double t, double *p, double *f, int *m); + * Function string must be a valid C math expression. It can contain operators, math functions + * and predefined functions listed in: + * http://lmu.web.psi.ch/musrfit/user/MUSR/MusrFit.html#A_4.3_The_THEORY_Block + * Predifined functions can be accessed by the abbreviation given in the table + * Parameters can be accesed in form p[idx] or p[m[idx]] - where p represents parameter array + * m represents map array and idx is the index to use from the maps. Precalculated function + * values can be accessed the same way - f[idx] or f[m[idx]]. Returns DKS_SUCCESS if everythin + * runs successfully, otherwise returns DKS_ERROR. If DKS is compiled with debug flag enabled + * prints DKS error message in case something fails + */ + int callCompileProgram(std::string function, bool mlh = false); + + /** Launch chi square calculation on data set writen in mem_data memory on device. 
+ * mem_par, mem_map and mem_func hold pointers to parameter, function and map values + * for this data set (parameter array is one for all the data sets, maps and functions + * change between data sets). Resulting chi square value for this dataset will be put in + * result variable. Returns DKS_SUCCESS if everythin runs successfully, otherwise returns + * DKS_ERROR. If DKS is compiled with debug flag enabled prints DKS error message in case + * something fails + */ + int callLaunchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result); + + /** Launch auto-tuning of chisquare function for the selected device. + * Creates a function pointer to callLaunchChiSquare with necessary arguments bind to + * function call. CUDA and OpenCL version - gives AutoTuning class access to numThreads + * parameter which is varied to find the optimal value by AutoTuning class. Uses brute force + * method to test all the values. + */ + int callAutoTuningChiSquare(int fitType, void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result, std::vector &config); + + /** Set N0, tau and BKG values for the run. + * Needs to be called before kernel launch if these values are changing + */ + int callSetConsts(double N0, double tau, double bkg); + + /** Set alpha and beta values for the run. + * Needs to be called before kernel launch if these values are changing + */ + int callSetConsts(double alpha, double beta); + + /** Init chisquare calculations. + * Size is the maximum number of elements in any of the data sets used. + */ + int initChiSquare(int size_data, int size_param, int size_func, int size_map); + + /** Free temporary device storage allocated for chi^2 kernel. + * Return error code if freeing the device fails. + */ + int freeChiSquare(); + + /** Write params to device. 
+ * Write pramas from double array to device, params device memory is managed by DKS. + */ + int writeParams(const double *params, int numparams); + + /** Write function values to device. + * Write precalculated function values to device, memory for functions on device is handled + * by DKS. + */ + int writeFunctions(const double *func, int numfunc); + + /** Write map indexes to device. + * Write map indexes to use in defined theory function to devive. Memory for map indexes is + * handeld by DKS. + */ + int writeMaps(const int *map, int numfunc); + + /** Check if device can run necessary kernels. + * Check selected device properties to see if device + * suports double precision and if device can run the + * necessary number of work_items / work_groups to successfully + * execute CUDA/OpenCL kernels. + */ + int checkMuSRKernels(int fitType); + + /** Perform the same check as checkMuSRKernels(int fitType) and return max threads per block. + * Used for autotuning to check what is the device limit for threads per block to correctly + * set the upper bound when searching the parameter space. + */ + int checkMuSRKernels(int fitType, int &threadsPerBlock); + + /** Debug function to test auto-tuning search functions + */ + int testAutoTuning(); + + /** Get the number of operations in compiled kernel. 
+ */ + int getOperations(int &oper); + +}; + +#endif diff --git a/src/DKSDefinitions.h b/src/DKSDefinitions.h new file mode 100644 index 0000000..63fba34 --- /dev/null +++ b/src/DKSDefinitions.h @@ -0,0 +1,71 @@ +#ifndef H_DKS_DEFINITIONS +#define H_DKS_DEFINITIONS + +#define API_OPENCL "OpenCL" +#define API_CUDA "Cuda" +#define API_OPENMP "OpenMP" +#define API_UNKNOWN "Unknown" + +#define DEVICE_GPU_NEW "GPU" +#define DEVICE_CPU_NEW "CPU" +#define DEVICE_MIC_NEW "MIC" +#define DEVICE_UNKNOWN_NEW "Unknown" + +#define DEVICE_GPU "-gpu" +#define DEVICE_CPU "-cpu" +#define DEVICE_MIC "-mic" + +//define macro for printing debug messages if debug flag is set +#ifdef DEBUG +#define DEBUG_MSG(x) (std::cout << x << std::endl) +#else +#define DEBUG_MSG(x) +#endif + +//define DKS error codes +#define DKS_SUCCESS 0 +#define DKS_ERROR 1 +#define DKS_API_NOT_ENABLED 100 + +#define OCL_SUCCESS 0 +#define OCL_ERROR 1 + +//define macros to enable or disable calls to specific frameworks +//if framework specific flag is set execute the satement, of not give DKS_API_NOT_ENABLED error +#ifdef DKS_CUDA +#define CUDA_SAFECALL(...) ( __VA_ARGS__ ) +#else +#define CUDA_SAFECALL(...) ( DKS_API_NOT_ENABLED ) +#endif + +#ifdef DKS_OPENCL +#define OPENCL_SAFECALL(...) ( __VA_ARGS__ ) +#else +#define OPENCL_SAFECALL(...) ( DKS_API_NOT_ENABLED ) +#endif + +#ifdef DKS_MIC +#define MIC_SAFECALL(...) ( __VA_ARGS__ ) +#else +#define MIC_SAFECALL(...) 
( DKS_API_NOT_ENABLED ) +#endif + +#ifdef DKS_CUDA +#define CUDA_SAFEINIT(x) ( x ) +#else +#define CUDA_SAFEINIT(x) ( NULL ) +#endif + +#ifdef DKS_OPENCL +#define OPENCL_SAFEINIT(x) ( x ) +#else +#define OPENCL_SAFEINIT(x) ( NULL ) +#endif + +#ifdef DKS_MIC +#define MIC_SAFEINIT(x) ( x ) +#else +#define MIC_SAFEINIT(x) ( NULL ) +#endif + +#endif diff --git a/src/DKSDevice.cpp b/src/DKSDevice.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/DKSDevice.h b/src/DKSDevice.h new file mode 100644 index 0000000..79a69fe --- /dev/null +++ b/src/DKSDevice.h @@ -0,0 +1,37 @@ +/* + +Author: Uldis Locans + +Info: class that holds information about the compute device + +Data: 25.09.2014 + +*/ + +#define DKS_DEVICE_TYPE_GPU 1 +#define DKS_DEVICE_TYPE_MIC 2 +#define DKS_DEVICE_TYPE_CPU 3 + +class Device { + + private: + int m_device_id; + int m_device_type; + char *m_device_name; + char *m_device_vendor; + + bool m_sup_opencl; + bool m_sup_cuda; + bool m_sup_openmp; + bool m_sup_openacc; + + int m_pci_bus_id; + + public: + + Device(); + ~Device(); + + + +}; \ No newline at end of file diff --git a/src/DKSImageReconstruction.cpp b/src/DKSImageReconstruction.cpp new file mode 100644 index 0000000..5f2222a --- /dev/null +++ b/src/DKSImageReconstruction.cpp @@ -0,0 +1,130 @@ +#include "DKSImageReconstruction.h" + +DKSImageRecon::DKSImageRecon() { + + //set up base. 
since reconstruction is always using cuda, set up base to CUDA + setAPI("Cuda"); + setDevice("-gpu"); + initDevice(); + + imageRecon = CUDA_SAFEINIT( new CudaImageReconstruction(getCudaBase()) ); +} + +DKSImageRecon::~DKSImageRecon() { + delete imageRecon; // allocated with scalar new in the constructor, so scalar delete; safe on NULL +} + +int DKSImageRecon::callCalculateSource(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + int ierr; + ierr = imageRecon->calculateSource(image_space, image_position, source_position, + avg, std, diameter, total_voxels, + total_sources, start); + return ierr; +} + +int DKSImageRecon::callCalculateBackground(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + float diameter, int total_voxels, + int total_sources, int start) +{ + + int ierr; + ierr = imageRecon->calculateBackground(image_space, image_position, + source_position, avg, std, diameter, + total_voxels, total_sources, start); + return ierr; +} + +int DKSImageRecon::callCalculateSources(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + int ierr; + ierr = imageRecon->calculateSources(image_space, image_position, + source_position, avg, std, diameter, + total_voxels, total_sources, start); + return ierr; +} + +int DKSImageRecon::callCalculateBackgrounds(void *image_space, void *image_position, + void *source_position, void *avg, void *std, + void *diameter, int total_voxels, + int total_sources, int start) +{ + + int ierr; + ierr = imageRecon->calculateBackgrounds(image_space, image_position, + source_position, avg, std, diameter, + total_voxels, total_sources, start); + +return ierr; +} + + +int DKSImageRecon::callGenerateNormalization(void *recon, void *image_position, + void *det_position, int total_det) +{ + + int ierr = imageRecon->generateNormalization(recon, image_position, + det_position, 
total_det); + return ierr; +} + + +int DKSImageRecon::callForwardProjection(void *correction, void *recon, void *list_data, + void *det_position, void *image_position, int num_events) +{ + + int ierr; + ierr = imageRecon->forwardProjection(correction, recon, list_data, det_position, + image_position, num_events); + return ierr; +} + +int DKSImageRecon::callBackwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels) +{ + + int ierr; + ierr = imageRecon->backwardProjection(correction, recon_corrector, list_data, + det_position, image_position, num_events, + num_voxels); + return ierr; +} + +int DKSImageRecon::setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size) { + int ierr = imageRecon->setDimensions(voxel_x, voxel_y, voxel_z, voxel_size); + return ierr; +} + +int DKSImageRecon::setEdge(float x_edge, float y_edge, float z_edge) { + int ierr = imageRecon->setEdge(x_edge, y_edge, z_edge); + return ierr; +} + +int DKSImageRecon::setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2) { + int ierr = imageRecon->setEdge1(x_edge1, y_edge1, z_edge1, z_edge2); + return ierr; +} + +int DKSImageRecon::setMinCrystalInRing(float min_CrystalDist_InOneRing, + float min_CrystalDist_InOneRing1) +{ + int ierr = imageRecon->setMinCrystalInRing(min_CrystalDist_InOneRing, + min_CrystalDist_InOneRing1); + return ierr; +} + +int DKSImageRecon::setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter) +{ + int ierr = imageRecon->setParams(matrix_distance_factor, phantom_diameter, + atten_per_mm, ring_diameter); + return ierr; +} diff --git a/src/DKSImageReconstruction.h b/src/DKSImageReconstruction.h new file mode 100644 index 0000000..32f67ef --- /dev/null +++ b/src/DKSImageReconstruction.h @@ -0,0 +1,120 @@ +#ifndef H_DKS_IMAGERECONSTRUCTION +#define H_DKS_IMAGERECONSTRUCTION + +#include +#include "DKSBase.h" + 
+#include "Algorithms/ImageReconstruction.h" + +#ifdef DKS_CUDA +#include "CUDA/CudaImageReconstruction.cuh" +#endif + +class DKSImageRecon : public DKSBase { + +private: + + ImageReconstruction *imageRecon; + +public: + + DKSImageRecon(); + + ~DKSImageRecon(); + + /** Image reconstruction analaysis calculate source. + * + * + */ + int callCalculateSource(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + /** Image reconstruction analaysis calculate source. + * + * + */ + int callCalculateBackground(void *image_space, void *image_position, void *source_position, + void *avg, void *std, float diameter, int total_voxels, + int total_sources, int start = 0); + + + /** Image reconstruction analaysis calculate source. + * + * + */ + int callCalculateSources(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** Image reconstruction analaysis calculate source. + * + * + */ + int callCalculateBackgrounds(void *image_space, void *image_position, void *source_position, + void *avg, void *std, void *diameter, int total_voxels, + int total_sources, int start = 0); + + /** Image reconstruction - generate normalization. + * + */ + int callGenerateNormalization(void *recon, void *image_position, + void *det_position, int total_det); + + /** Image reconstruction - forward correction. + * + */ + int callForwardProjection(void *correction, void *recon, void *list_data, void *det_position, + void *image_position, int num_events); + + /** Image reconstruction - backward projection. + * + */ + int callBackwardProjection(void *correction, void *recon_corrector, void *list_data, + void *det_position, void *image_position, + int num_events, int num_voxels); + + /** Set the voxel dimensins on device. 
+ * Values are stored in GPU memory and used in forward and backward projection calculations. + * Call set function once to transfer the values from host side to GPU. + * If value changes on the host side set functions needs to be called again to update GPU values. + */ + int setDimensions(int voxel_x, int voxel_y, int voxel_z, float voxel_size); + + /** Set the image edge. + * Values are stored in GPU memory and used in forward and backward projection calculations. + * Call set function once to transfer the values from host side to GPU. + * If value changes on the host side set functions needs to be called again to update GPU values. + */ + int setEdge(float x_edge, float y_edge, float z_edge); + + /** Set the image edge1. + * Values are stored in GPU memory and used in forward and backward projection calculations. + * Call set function once to transfer the values from host side to GPU. + * If value changes on the host side set functions needs to be called again to update GPU values. + */ + int setEdge1(float x_edge1, float y_edge1, float z_edge1, float z_edge2); + + /** Set the minimum crystan in one ring values. + * Values are stored in GPU memory and used in forward and backward projection calculations. + * Call set function once to transfer the values from host side to GPU. + * If value changes on the host side set functions needs to be called again to update GPU values. + */ + int setMinCrystalInRing(float min_CrystalDist_InOneRing, float min_CrystalDist_InOneRing1); + + /** Set all other required parameters for reconstruction. + * Values are stored in GPU memory and used in forward and backward projection calculations. + * Call set function once to transfer the values from host side to GPU. + * If value changes on the host side set functions needs to be called again to update GPU values. 
+ */ + int setParams(float matrix_distance_factor, float phantom_diameter, + float atten_per_mm, float ring_diameter); + + + + + +}; + + +#endif diff --git a/src/DKSStream.h b/src/DKSStream.h new file mode 100644 index 0000000..17e1089 --- /dev/null +++ b/src/DKSStream.h @@ -0,0 +1,24 @@ +/* + Author: Uldis Locans + + Date: 12.12.2014 + + Comment: based on device used create different cuda streams, opencl contexts, (mic - dont know yet) + that allow handling of asynchronous data transfer and kernel execution on the device + +*/ + +#ifndef H_DKSSTREAM +#define H_DKSSTREAM + +#define DKS_SUCCESS 0 +#define DKS_ERROR 1 + +#include +#include + +class DKSStream { + +}; + +#endif diff --git a/src/MIC/CMakeLists.txt b/src/MIC/CMakeLists.txt new file mode 100644 index 0000000..d9b8dcd --- /dev/null +++ b/src/MIC/CMakeLists.txt @@ -0,0 +1,25 @@ +SET (_SRCS + MICBase.cpp + MICChiSquare.cpp + MICFFT.cpp + MICGreensFunction.cpp + MICCollimatorPhysics.cpp + ) + +SET (_HDRS + MICBase.h + MICChiSquare.h + MICFFT.h + MICCollimatorPhysics.h + MICGreensFunction.hpp + MICMergeSort.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/MIC) diff --git a/src/MIC/MICBase.cpp b/src/MIC/MICBase.cpp new file mode 100644 index 0000000..43c15c4 --- /dev/null +++ b/src/MIC/MICBase.cpp @@ -0,0 +1,124 @@ +#include "MICBase.h" + +//constructor, sets default device id equal to 0 +MICBase::MICBase() { + m_device_id = 0; + defaultRndSet = -1; + +} + +//destructor, delete defaultrnd streams if they are set +MICBase::~MICBase() { + mic_deleteRandStreams(); +} + + +//create default rand streams +int MICBase::mic_createRandStreams(int size) { + + int seed = time(NULL); + +#pragma offload target(mic:m_device_id) inout(defaultRndSet) in(seed) + { + + //get the number of threads + int numThreads; + +#pragma omp parallel + numThreads = omp_get_num_threads(); + + //if default rnd stream already allocated 
delete the array + if (defaultRndSet == 1) + delete[] defaultRndStream; + + //allocate defaultRndStream array + defaultRndStream = new VSLStreamStatePtr[numThreads]; + + //create stream states for each thread (bound is numThreads, the size the array was allocated with) +#pragma omp parallel for + for (int i = 0; i < numThreads; i++) + vslNewStream(&defaultRndStream[i], VSL_BRNG_MT2203, seed + i); + + defaultRndSet = 1; + } + + return DKS_SUCCESS; + +} + +//delete default rand streams +int MICBase::mic_deleteRandStreams() { + +#pragma offload target(mic:m_device_id) inout(defaultRndSet) + { + if (defaultRndSet == 1) { + delete[] defaultRndStream; + defaultRndSet = -1; + } + } + + return DKS_SUCCESS; // no step above reports a failure, so do not signal an error +} + +//create a new signal for the mic +int MICBase::mic_createStream(int & streamId) { + + //use int as signal, create a new int in micStreams vector, return the id + int tmpStream = micStreams.size(); + micStreams.push_back(tmpStream); + streamId = micStreams.size() - 1; + + //empty offload to create the signal on the mic + /* +#pragma offload target(mic:m_device_id) signal(mic_getStream(streamId)) + { + } + */ + return DKS_SUCCESS; +} + +//get the signal from the vector +int& MICBase::mic_getStream(int id) { + return micStreams[id]; +} + +//delete streams +int MICBase::mic_deleteStreams() { + micStreams.clear(); + + return DKS_SUCCESS; +} + + +//sets device id +int MICBase::mic_setDeviceId(int id) { + m_device_id = id; + + return DKS_SUCCESS; +} + +//get information about all available mic devices +//TODO: find a way to check system for available mic devices + +int MICBase::mic_getDevices() { + + int devices = _Offload_number_of_devices(); + int thread_count = 0; + + std::cout << "==============================" << std::endl; + std::cout << "==========Intel MICs==========" << std::endl; + std::cout << "==============================" << std::endl; + + std::cout << "Total mic devices: " << devices << std::endl; + //std::cout << "Total mic devices: currently cant be found, but it's 1 on kraftwerk" << std::endl; + 
+#pragma offload target(mic:m_device_id) inout(thread_count) + { + thread_count = omp_get_max_threads(); + } + + std::cout << "Max threads: " << thread_count << std::endl; + + + return DKS_SUCCESS; +} diff --git a/src/MIC/MICBase.h b/src/MIC/MICBase.h new file mode 100644 index 0000000..92b4fe9 --- /dev/null +++ b/src/MIC/MICBase.h @@ -0,0 +1,244 @@ +/* + + Name: MIC Base + Author: Uldis Locans + Info: class to handle set up and data transfer from host to Intel MIC devices + Date: 29.09.2014 + +*/ +#ifndef H_MIC_BASE +#define H_MIC_BASE + +#include +#include +#include +#include +#include +#include +#include + +#include "../DKSDefinitions.h" + +#define DKS_ALLOC alloc_if(1) +#define DKS_FREE free_if(1) +#define DKS_RETAIN free_if(0) +#define DKS_REUSE alloc_if(0) + +#define MIC_WIDTH 128 + +class MICBase { + +private: + std::vector micStreams; + +protected: + + + int defaultRndSet; + +public: + VSLStreamStatePtr *defaultRndStream; + int m_device_id; + + /* constructor */ + MICBase(); + + /* destructor */ + ~MICBase(); + + /* + Info: create MKL rand streams for each thread + Return: success or error code + */ + int mic_createRandStreams(int size); + + /* + Info: delete MKL rand streams + Return: succes or error code + */ + int mic_deleteRandStreams(); + + /* + Info: create a new signal for the mic + Return: success or error code + */ + int mic_createStream(int & streamId); + + /* + Info: get the signal from the vector + Return: mic signal + */ + int& mic_getStream(int id); + + /* + Info: delete streams + Return: success or error code + */ + int mic_deleteStreams(); + + /* + Info: set device id + Return: success or error code + */ + int mic_setDeviceId(int id); + + /* + Info: get mic devices + Return: success or error code + */ + int mic_getDevices(); + + /* + Info: allocate memory on MIC device + Return: success or error code + */ + template + void * mic_allocateMemory(int size) { + + int padding = size % MIC_WIDTH; + int totalsize = size + padding; + + T *tmp = 
(T*)_mm_malloc(sizeof(T)*totalsize, 64); // = new T[size]; +#pragma offload_transfer target(mic:m_device_id) nocopy(tmp:length(totalsize) DKS_ALLOC DKS_RETAIN) + + return tmp; + } + + /* + Info: transfer data to device + Return: success or error code + */ + template + int mic_writeData(void * data_ptr, const void * data, int size, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) ) + + return DKS_SUCCESS; + } + + /* + Info: write data to device, non-blocking + Return: success or error code + */ + template + int mic_writeDataAsync(void * data_ptr, const void * data, int size, int streamId = -1, int offset = 0) + { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_REUSE DKS_RETAIN into(tmp_ptr[offset:size]) ) + + return DKS_SUCCESS; + } + + + /* + Info: read data from device + Return: success or error code + */ + template + int mic_readData(const void * data_ptr, void * result, int size, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_result = (T*)result; + + //std::cout << "try to read data with size = " << size << " adn offset = " << offset << std::endl; +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) ) + + return DKS_SUCCESS; + } + + /* + Info: read data from device waiting for signal + Return: success or error code + */ + template + int mic_readDataAsync(const void * data_ptr, void * result, int size, + int streamId = -1, int offset = 0) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_result = (T*)result; + +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[offset:size] : DKS_REUSE DKS_RETAIN into(tmp_result[0:size]) ) + { + } + + return DKS_SUCCESS; + + } + + /* + Info: wait till all the signals are complete + Return siccess or error code + */ + int 
mic_syncDevice() { + + //empty offload to wait for all the signals to finish and launch a new empy signal + /* + for (int i = 0; i < micStreams.size(); i++) { +#pragma offload target(mic:m_device_id) wait(mic_getStream(i)) signal(mic_getStream(i)) + { + } + } + */ + + //std::cout << "done read data" << std::endl; + + return DKS_SUCCESS; + + } + + /* + Info: free memory on device + Return: success or error code + */ + template + int mic_freeMemory(void * data_ptr, int size) { + + int padding = size % MIC_WIDTH; + int totalsize = size + padding; + + T* tmp_ptr = (T*)data_ptr; +#pragma offload_transfer target(mic:m_device_id) nocopy(tmp_ptr:length(totalsize) DKS_REUSE DKS_FREE) + { + } + + return DKS_SUCCESS; + } + + /* + Info: allocate memory and write data to device + Return: success or error code + */ + template + void * mic_pushData(const void * data, int size) { + T* tmp_ptr = new T[size]; + T* tmp_data = (T*)data; + +#pragma offload_transfer target(mic:m_device_id) in(tmp_data[0:size] : DKS_ALLOC DKS_RETAIN + into(tmp_ptr[0:size]) ) + { + } + + return tmp_ptr; +} + +/* + Info: read data and free memory on device + Return: success or erro code +*/ + template + int mic_pullData(void * data_ptr, void * result, int size) { + T* tmp_ptr = (T*)data_ptr; + T* tmp_data = (T*)result; + +#pragma offload_transfer target(mic:m_device_id) out(tmp_ptr[0:size] : DKS_REUSE DKS_FREE into(tmp_data[0:size]) ) + { + } + + return DKS_SUCCESS; + } + +}; + +#endif diff --git a/src/MIC/MICChiSquare.cpp b/src/MIC/MICChiSquare.cpp new file mode 100644 index 0000000..35b6d77 --- /dev/null +++ b/src/MIC/MICChiSquare.cpp @@ -0,0 +1,93 @@ +#include "MICChiSquare.h" + +/* + calculate chi^2 on intel mic, use data already loaded on device +*/ +int MICChiSquare::mic_chi2(double *O, double *E, double *result, int size) { + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(O:length(0) DKS_RETAIN DKS_REUSE) \ + in(E:length(0) DKS_RETAIN DKS_REUSE) \ + in(result:length(0) DKS_RETAIN 
DKS_REUSE) \ + in(size) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) { + result[i] = pow(O[i] - E[i], 2) / E[i]; + } + } + + return DKS_SUCCESS; +} + + +/* + calculate function N(t), use data already loaded on device +*/ +int MICChiSquare::mic_Nt(double *nt, double *p, int psize, int nsize, int jsize, double deltaT) { + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(nt:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) \ + in(psize) in(nsize) in(jsize) in(deltaT) + { + + double gamma = 0.01; //??? + double tau = 0.01; //??? + + for (int j = 0; j < jsize; j++) { + + int pid = j*psize; + double N0 = p[pid]; + double Nbkg = p[pid+1]; + double A0 = p[pid+2]; + double phi = p[pid+3]; + double sigma = p[pid+4]; + double B = p[pid+5]; + + int idj = j*nsize; + + double a1 = -0.5*sigma*sigma; + double b1 = gamma*B; + +#pragma omp parallel for + for (int n = 0; n < nsize; n++) { + + int id = idj + n; + double t = n*deltaT; + + double a = a1*t*t; + double b = b1*t + phi; + double At = A0 * exp2(a) * cos(b); + + double c = -t/tau; + double Nt = N0 * exp2(c) * (1 + At) + Nbkg; + + nt[id] = Nt; + } + } + + } + + return DKS_SUCCESS; +} + +/* + calculate sum of array +*/ +int MICChiSquare::mic_sum(double *data, double *result, int size) { + double sum = 0; +#pragma offload target(mic:m_micbase->m_device_id) \ + in(data:length(0) DKS_REUSE DKS_RETAIN) \ + in(result:length(0) DKS_REUSE DKS_RETAIN) \ + in(size) in(sum) + { +#pragma omp parallel for reduction(+:sum) + for (int i = 0; i < size; i++) { + sum += data[i]; + } + result[0] = sum; + } + return DKS_SUCCESS; +} + diff --git a/src/MIC/MICChiSquare.h b/src/MIC/MICChiSquare.h new file mode 100644 index 0000000..c62de0b --- /dev/null +++ b/src/MIC/MICChiSquare.h @@ -0,0 +1,51 @@ +/* + + Name: MICChiSquare + Info: calculate chi^2 using intel mic coporcessor + Author: Uldis Locans + Date: 29.09.2014 + +*/ +#ifndef H_MIC_CHI_SQUARE +#define H_MIC_CHI_SQUARE + +#include +#include 
/*
  Rotate the momentum components (px, pz) by the scattering angle thetacou in
  the plane they span and, depending on coord, displace the matching position
  components (x, z).

  px, pz   : in/out momentum components of the plane being processed
  x, z     : in/out position components of the same plane
  xplane   : transverse offset applied to the position (coord 1 and 2 only)
  normP    : norm of the momentum, divisor of the drift term (coord 1 and 2 only)
  thetacou : rotation angle
  deltas   : path length of the step (coord 1 and 2 only)
  coord    : 1 - x gets drift + offset, z gets the offset only
             2 - as 1, and z additionally gets its drift deltas*pz/normP
             anything else - position untouched, momentum rotated only
               (normP and deltas are never read, so 0 may be passed)

  The direction of (px, pz) is obtained with atan2(px, pz). The previous
  atan(px/pz) with manual quadrant corrections divided by zero for pz == 0,
  which selected the wrong half-plane for px < 0 and produced NaN for
  px == pz == 0. For all other inputs both forms give the same direction
  (the angle is only used through sin and cos, so no wrap to [0, 2*pi) is
  needed).
*/
__declspec(target(mic))
void Rot(double &px, double &pz, double &x, double &z, double xplane,
         double normP, double thetacou, double deltas, int coord)
{
  const double Psixz = atan2(px, pz);
  const double sinPsi = sin(Psixz);
  const double cosPsi = cos(Psixz);
  const double pxz = sqrt(px*px + pz*pz);

  if (coord == 1) {
    x = x + deltas * px / normP + xplane * cosPsi;
    z = z - xplane * sinPsi;
  }

  if (coord == 2) {
    x = x + deltas * px / normP + xplane * cosPsi;
    z = z - xplane * sinPsi + deltas * pz / normP;
  }

  //rotate by thetacou: (px, pz) = pxz * (sin(Psi + theta), cos(Psi + theta))
  const double sinTheta = sin(thetacou);
  const double cosTheta = cos(thetacou);
  px = pxz * cosPsi * sinTheta + pxz * sinPsi * cosTheta;
  pz = -pxz * sinPsi * sinTheta + pxz * cosPsi * cosTheta;
}
/*
  Coulomb scattering for one block of MIC_WIDTH particles stored as separate
  coordinate arrays (SoA). Particles rx[ii] .. rx[ii + MIC_WIDTH - 1] (and the
  matching entries of the other arrays) are handled; only lanes whose label
  is 0 are modified.

  rx, ry, rz : particle positions, updated in place
  px, py, pz : particle momenta, updated in place
  label      : per particle state, lanes with label != 0 are skipped
  par        : material / step parameters, read at DT_M and X0_M
  stream     : VSL random stream; the order of draws from it determines the
               result, so the statement order below must not be changed
  ii         : index of the first particle of the block
  size       : upper bound used by two of the loops below; the only caller
               visible in this file passes MIC_WIDTH
  NOTE(review): the loops mix "ii + size" and "ii + MIC_WIDTH" as bounds while
  all scratch arrays hold MIC_WIDTH entries - presumably size == MIC_WIDTH is
  required; confirm before calling with any other value.
  NOTE(review): "#pragma vector aligned" presumably requires the arrays at
  offset ii to be suitably aligned - confirm against the allocation code.
*/
__declspec(target(mic))
void coulombScat(double *rx, double *ry, double *rz, double *px, double *py, double *pz, int *label,
                 double *par, VSLStreamStatePtr &stream, int ii, int size)
{

  //per lane scratch values, indexed by idx = i - ii
  double normP[MIC_WIDTH] __attribute__((aligned(64)));
  double deltas[MIC_WIDTH] __attribute__((aligned(64)));
  double theta0[MIC_WIDTH] __attribute__((aligned(64)));
  double P1[MIC_WIDTH] __attribute__((aligned(64)));
  double P2[MIC_WIDTH] __attribute__((aligned(64)));
  double P3[MIC_WIDTH] __attribute__((aligned(64)));

  double z1[MIC_WIDTH] __attribute__((aligned(64)));
  double z2[MIC_WIDTH] __attribute__((aligned(64)));
  double thetacou[MIC_WIDTH] __attribute__((aligned(64)));

  //momentum norm, step length and scattering angle width theta0 per lane;
  //these are only written for lanes with label == 0
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double dotp = dot(px[i], py[i], pz[i]);
      double Eng = sqrt(dotp + 1.0) * M_P - M_P;
      double gamma = (Eng + M_P) / M_P;
      double beta = sqrt(1.0 - 1.0 / (gamma * gamma));

      normP[idx] = sqrt(dotp);
      deltas[idx] = par[DT_M] * beta * C;
      theta0[idx] = 13.6e6 / (beta * normP[idx] * M_P * 1e9) *
        Z_P * sqrt(deltas[idx] / par[X0_M]) * (1.0 + 0.038 * log(deltas[idx] / par[X0_M]));
    }
  }

  // x-direction: See Physical Review, "Multiple Scattering"
  //one pair of standard normal numbers per lane
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);
  //NOTE(review): not guarded by label, so theta0 is read for lanes where it
  //was never written; the result is only used for label == 0 lanes below
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
    int idx = i - ii;
    thetacou[idx] = z2[idx] * theta0[idx];
  }

  //unknown number of iterations, cannot vectorize
  //redraw z1 and z2 for a lane until |thetacou| <= 3.5 * theta0
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
        thetacou[idx] = z2[idx] * theta0[idx];
      }
    }
  }

  //apply offset and rotation in the x-z plane (Rot with coord == 1)
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + size; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double xplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0)
        + z2[idx] * deltas[idx] * theta0[idx] / 2.0;
      Rot(px[i], pz[i], rx[i], rz[i], xplane, normP[idx], thetacou[idx], deltas[idx], 1);
    }
  }


  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

  //P1 decides whether the additional rotation happens (P1 < 0.0047),
  //P2 sets its magnitude and P3 its sign; Rot is called with coord == 0,
  //so only the momentum is rotated
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      if(P1[idx] < 0.0047) {
        double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];

        if(P3[idx] > 0.5)
          thetaru = -thetaru;

        Rot(px[i] ,pz[i], rx[i], rz[i], 0, 0, thetaru, 0, 0);
      }
    }
  }

  // y-direction: See Physical Review, "Multiple Scattering"
  //same sequence as above for the y-z plane, with fresh random numbers
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z1, 0.0, 1.0);
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, MIC_WIDTH, z2, 0.0, 1.0);

  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    thetacou[idx] = z2[idx] * theta0[idx];
  }

  //unknown number of iterations, cannot vectorize
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      while(fabs(thetacou[idx]) > 3.5 * theta0[idx]) {
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z1[idx], 0.0, 1.0 );
        vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &z2[idx], 0.0, 1.0 );
        thetacou[idx] = z2[idx] * theta0[idx];
      }
    }
  }

  //apply offset, drift and rotation in the y-z plane (Rot with coord == 2)
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      double yplane = z1[idx] * deltas[idx] * theta0[idx] / sqrt(12.0)
        + z2[idx] * deltas[idx] * theta0[idx] / 2.0;
      Rot(py[i], pz[i], ry[i], rz[i], yplane, normP[idx], thetacou[idx], deltas[idx], 2);
    }
  }

  //generate array of random numbers
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P1, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P2, 0, 1);
  vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, MIC_WIDTH, P3, 0, 1);

  //P1 decides whether the additional rotation happens (P1 < 0.0047),
  //P2 sets its magnitude and P3 its sign
  #pragma vector aligned
  #pragma simd
  for (int i = ii; i < ii + MIC_WIDTH; i++) {
    int idx = i - ii;
    if (label[i] == 0) {
      if(P1[idx] < 0.0047) {
        double thetaru = 2.5 * sqrt(1 / P2[idx]) * sqrt(2.0) * theta0[idx];
        if(P3[idx] > 0.5)
          thetaru = -thetaru;
        Rot(py[i], pz[i], ry[i], rz[i], 0, 0, thetaru, 0, 0);
      }
    }
  }

}
const double gamma = (Eng + M_P) / M_P; + const double gamma2 = gamma * gamma; + const double beta = sqrt(1.0 - 1.0 / gamma2); + const double beta2 = beta * beta; + + const double deltas = par[DT_M] * beta * C; + const double deltasrho = deltas * 100 * par[RHO_M]; + const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + const double Ts = (Eng * 1E6) / 1.0073; + const double epsilon_low = par[A2_C] * pow(Ts, 0.45); + const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) ); + const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high); + + dEdx = -epsilon / (1E21 * (par[A_M] / AVO) ); + + double tmprnd; + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E ); + const double delta_E = deltasrho * dEdx + tmprnd; + Eng = Eng + delta_E / 1E3; + } + + if (Eng >= 0.0006) { + const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P)); + + dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + double tmprnd; + vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 1, &tmprnd, 0.0, sigma_E ); + const double delta_E = deltasrho * dEdx + tmprnd; + + Eng = Eng + delta_E / 1E3; + } + + + if ((Eng<1E-4) || (dEdx>0)) + pdead = 1; +} + +__declspec(target(mic)) +void energyLoss(double &Eng, double &dEdx, double *par, double *randv, int ri) { + + const double gamma = (Eng + M_P) / M_P; + const double gamma2 = gamma * gamma; + const double beta = sqrt(1.0 - 1.0 / gamma2); + const double beta2 = beta * beta; + + const double deltas = par[DT_M] * beta * C; + const double deltasrho = deltas * 100 * par[RHO_M]; + const double sigma_E = sqrt(K * eM_E * par[RHO_M] * (Z_M / par[A_M]) * deltas * 1E5); + + if ( (Eng > 0.00001) && (Eng < 0.0006) ) { + const double Ts = (Eng * 
1E6) / 1.0073; + const double epsilon_low = par[A2_C] * pow(Ts, 0.45); + const double epsilon_high = (par[A3_C] / Ts) * log( 1 + ( par[A4_C] / Ts) + (par[A5_C] *Ts) ); + const double epsilon = (epsilon_low * epsilon_high) / (epsilon_low + epsilon_high); + + dEdx = -epsilon / (1E21 * (par[A_M] / AVO) ); + + const double delta_E = deltasrho * dEdx + sigma_E * randv[ri]; + + Eng = Eng + delta_E / 1E3; + } + + if (Eng >= 0.0006) { + const double Tmax = 2.0 * eM_E * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * eM_E / M_P + (eM_E / M_P) * (eM_E / M_P)); + + dEdx = -K * Z_P * Z_P * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * eM_E * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + const double delta_E = deltasrho * dEdx + sigma_E * randv[ri + MIC_WIDTH]; + + Eng = Eng + delta_E / 1E3; + } + +} + +int MICCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles) { + + //cast device memory pointers to appropriate types + MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; + double *par = (double*) par_ptr; + +#pragma offload target(mic:m_micbase->m_device_id) \ + inout(data:length(0) DKS_RETAIN DKS_REUSE) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) + { + +#pragma omp parallel + { + VSLStreamStatePtr stream = m_micbase->defaultRndStream[omp_get_thread_num()]; + + //for loop trough particles if not checkhit set label to -2 and update R.x + +#pragma omp for simd + for (int i = 0; i < numparticles; i++) { + if ( !checkHit(data[i].Rincol.z, par) ) { + double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol)); + data[i].Rincol.x = data[i].Rincol.x + par[DT_M] * C * data[i].Pincol.x / sq; + data[i].Rincol.y = data[i].Rincol.y + par[DT_M] * C * data[i].Pincol.y / sq; + data[i].Rincol.z = data[i].Rincol.z + par[DT_M] * C * data[i].Pincol.z / sq; + data[i].label = -2; + } + } + + //for loop trough particles if label == 0 eneregy loss and if pdead update label to -1 +#pragma omp for simd + for (int i = 0; i < 
numparticles; i++) { + + int pdead = -1; + double sq = sqrt(1.0 + dot(data[i].Pincol, data[i].Pincol)); + double Eng = (sq - 1) * M_P; + + if (data[i].label == 0) { + energyLoss(Eng, pdead, par, stream); + } + + if (pdead == -1) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(data[i].Pincol, data[i].Pincol)); + data[i].Pincol.x = data[i].Pincol.x * ptot / sq; + data[i].Pincol.y = data[i].Pincol.y * ptot / sq; + data[i].Pincol.z = data[i].Pincol.z * ptot / sq; + } + + if (pdead == 1) + data[i].label = -1; + } + + //for loop trough particles if label == 0 coulomb scat +#pragma omp for + for (int i = 0; i < numparticles; i++) { + if (data[i].label == 0) { + coulombScat(data[i].Rincol, data[i].Pincol, par, stream); + } + } + + } //end omp parallel + + } //end offload + return DKS_SUCCESS; + +} + + + +int MICCollimatorPhysics::CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles) +{ + + + + int *label = (int*)label_ptr; + unsigned *localID = (unsigned*)localID_ptr; + double *rx = (double*)rx_ptr; + double *ry = (double*)ry_ptr; + double *rz = (double*)rz_ptr; + double *px = (double*)px_ptr; + double *py = (double*)py_ptr; + double *pz = (double*)pz_ptr; + double *par = (double*)par_ptr; + + int padding = numparticles % MIC_WIDTH; + int totalpart = numparticles + padding; + +#pragma offload target (mic:0) \ + in(label:length(0) DKS_REUSE DKS_RETAIN) \ + in(localID:length(0) DKS_REUSE DKS_RETAIN) \ + in(rx:length(0) DKS_REUSE DKS_RETAIN) \ + in(ry:length(0) DKS_REUSE DKS_RETAIN) \ + in(rz:length(0) DKS_REUSE DKS_RETAIN) \ + in(px:length(0) DKS_REUSE DKS_RETAIN) \ + in(py:length(0) DKS_REUSE DKS_RETAIN) \ + in(pz:length(0) DKS_REUSE DKS_RETAIN) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(totalpart) + { + +#pragma omp parallel + { + //every thread gets its own rnd stream state + VSLStreamStatePtr stream = 
m_micbase->defaultRndStream[omp_get_thread_num()]; + + + #pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + //vectorize main loop + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + if ( !checkHit(rz[i], par) ) { + double sq = sqrt(1.0 + dot(px[i], py[i], pz[i])); + rx[i] = rx[i] + par[DT_M] * C * px[i] / sq; + ry[i] = ry[i] + par[DT_M] * C * py[i] / sq; + rz[i] = rz[i] + par[DT_M] * C * pz[i] / sq; + label[i] = -2; + } + } + } + + + //array of size 2*WIDTH for storing random values for the energyloss function + double randv[2*MIC_WIDTH] __attribute__((aligned(64))); + + //for loop trough particles if label == 0 eneregy loss and if pdead update label to -1 + #pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + //create array of rand values (2 per thread) + vdRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, 2*MIC_WIDTH, randv, 0.0, 1.0); + + #pragma vector aligned + #pragma simd + for (int i = ii; i < ii + MIC_WIDTH; i++) { + + double sq = sqrt(1.0 + dot(px[i], py[i], pz[i])); + double Eng = (sq - 1) * M_P; + double dEdx = 0; + + if (label[i] == 0) { + energyLoss(Eng, dEdx, par, randv, i - ii); + } + + if (Eng > 1e-4 && dEdx < 0) { + double ptot = sqrt((M_P + Eng) * (M_P + Eng) - (M_P * M_P)) / M_P; + sq = sqrt(dot(px[i], py[i], pz[i])); + px[i] = px[i] * ptot / sq; + py[i] = py[i] * ptot / sq; + pz[i] = pz[i] * ptot / sq; + } + + if (Eng < 1e-4 || dEdx > 0) + label[i] = -1; + + } //end inner energy loss loop + + } //end outer energy loss loop + + //vectorize coulomb scattering as much as possible +#pragma omp for nowait + for (int ii = 0; ii < totalpart; ii += MIC_WIDTH) { + coulombScat(rx, ry, rz, px, py, pz, label, par, stream, ii, MIC_WIDTH); + } //end coulomb scattering + + } //end omp parallel + + } //end offload + + return DKS_SUCCESS; +} + +int MICCollimatorPhysics::CollimatorPhysicsSort(void *mem_ptr, int numparticles, + int &numaddback) +{ + + //cast device memory 
pointers to appropriate types + MIC_PART_SMALL *data = (MIC_PART_SMALL*) mem_ptr; + int privateback; + +#pragma offload target(mic:m_micbase->m_device_id) \ + in(data:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) \ + out(privateback) + { + //count dead and addback particles + int privateback = 0; +#pragma omp parallel for reduction(+:privateback) + for (int i = 0; i < numparticles; i++) { + if (data[i].label < 0) + privateback++; + } + //move particles with label < 0 to the end of the array (serial. can we do this parallel?) + if (privateback > 0) { + + int moved = 0; + for (int i = numparticles - 1; i > 0; i--) { + if (data[i].label < 0) { + int idx = numparticles - 1 - moved; + if (i != idx) { + MIC_PART_SMALL tmp = data[i]; + data[i] = data[idx]; + data[idx] = tmp; + } + moved++; + } + } + } + numaddback = privateback; + } + return DKS_SUCCESS; +} + +__declspec(target(mic)) +void micmove(double &a, double &b) { + double tmp = a; + a = b; + b = tmp; +} + +__declspec(target(mic)) +void micmove(int &a, int &b) { + int tmp = a; + a = b; + b = tmp; +} + +__declspec(target(mic)) +void micmove(unsigned &a, unsigned &b) { + unsigned tmp = a; + a = b; + b = tmp; +} + + +int MICCollimatorPhysics::CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, + int &numaddback) +{ + + int *label = (int*)label_ptr; + unsigned *localID = (unsigned*)localID_ptr; + double *rx = (double*)rx_ptr; + double *ry = (double*)ry_ptr; + double *rz = (double*)rz_ptr; + double *px = (double*)px_ptr; + double *py = (double*)py_ptr; + double *pz = (double*)pz_ptr; + double *par = (double*)par_ptr; + + //int padding = numparticles % WIDTH; + //int totalpart = numparticles + padding; + + int privateback; + +#pragma offload target (mic:0) \ + in(label:length(0) DKS_REUSE DKS_RETAIN) \ + in(localID:length(0) DKS_REUSE DKS_RETAIN) \ + in(rx:length(0) DKS_REUSE 
DKS_RETAIN) \ + in(ry:length(0) DKS_REUSE DKS_RETAIN) \ + in(rz:length(0) DKS_REUSE DKS_RETAIN) \ + in(px:length(0) DKS_REUSE DKS_RETAIN) \ + in(py:length(0) DKS_REUSE DKS_RETAIN) \ + in(pz:length(0) DKS_REUSE DKS_RETAIN) \ + in(par:length(0) DKS_RETAIN DKS_REUSE) \ + in(numparticles) \ + out(privateback) + { + + //count dead and addback particles + int privateback = 0; +#pragma omp parallel for reduction(+:privateback) + for (int i = 0; i < numparticles; i++) { + if (label[i] < 0) + privateback++; + } + + //move particles with label < 0 to the end of the array (serial. can we do this parallel?) + if (privateback > 0) { + int moved = 0; + for (int i = numparticles - 1; i >= 0; i--) { + if (label[i] < 0) { + int idx = numparticles - 1 - moved; + if (i != idx) { + micmove(rx[i], rx[idx]); + micmove(ry[i], ry[idx]); + micmove(rz[i], rz[idx]); + micmove(px[i], px[idx]); + micmove(py[i], py[idx]); + micmove(pz[i], pz[idx]); + micmove(label[i], label[idx]); + micmove(localID[i], localID[idx]); + } + moved++; + } + } + } + numaddback = privateback; + } + + return DKS_SUCCESS; +} + +__declspec(target(mic)) +inline void unitlessOff(mic_double3 &a, const double c) { + a.x *= c; + a.y *= c; + a.z *= c; +} + +__declspec(target(mic)) +inline void unitlessOn(mic_double3 &a, const double c) { + a.x /= c; + a.y /= c; + a.z /= c; +} + +__declspec(target(mic)) +mic_double3 deviceTransformTo(const mic_double3 &vec, const mic_double3 &ori) { + const double sina = sin(ori.x); + const double cosa = cos(ori.x); + const double sinb = sin(ori.y); + const double cosb = cos(ori.y); + const double sinc = sin(ori.z); + const double cosc = cos(ori.z); + + mic_double3 temp; + temp.x = 0.0; + temp.y = 0.0; + temp.z = 0.0; + + temp.x = (cosa * cosc) * vec.x + (cosa * sinc) * vec.y - sina * vec.z; + temp.y = (-cosb * sinc - sina * sinb * cosc) * vec.x + + (cosb * cosc - sina * sinb * sinc) * vec.y - cosa * sinb * vec.z; + temp.z = (-sinb * sinc + sina * cosb * cosc) * vec.x + + (sinb * cosc + sina 
* cosb * sinc) * vec.y + cosa * cosb * vec.z; + + return temp; +} + +__declspec(target(mic)) +inline void updateR(mic_double3 &R, mic_double3 &P, double dotp, double dtc) { + R.x /= dtc; + R.x += 0.5 * P.x / dotp; + R.x *= dtc; + + R.y /= dtc; + R.y += 0.5 * P.y / dotp; + R.y *= dtc; + + R.z /= dtc; + R.z += 0.5 * P.z / dotp; + R.z *= dtc; +} + +__declspec(target(mic)) +inline void push(mic_double3 *r, mic_double3 *p, double dtc, int npart) { +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 R = r[i]; + mic_double3 P = p[i]; + double dotp = sqrt(1.0 + dot(P, P)); + updateR(R, P, dotp, dtc); + r[i] = R; + } +} + +__declspec(target(mic)) +inline void push(mic_double3 *r, mic_double3 *p, double *gdt, double c, int npart) { +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 R = r[i]; + mic_double3 P = p[i]; + double dtc = gdt[i] * c; + double dotp = sqrt(1.0 + dot(P, P)); + updateR(R, P, dotp, dtc); + r[i] = R; + } +} + + +int MICCollimatorPhysics::ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt, int streamId) +{ + + mic_double3 *r = (mic_double3*)r_ptr; + mic_double3 *p = (mic_double3*)p_ptr; + double *gdt = (double*)dt_ptr; + double dtc = dt * c; + + if (!usedt) { +#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(npart, dtc) + { + push(r, p, dtc, npart); + } + + } else { + +#pragma offload target(mic:m_micbase->m_device_id) in(r:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) in(npart, c) + { + push(r, p, gdt, c, npart); + } + } + + return DKS_SUCCESS; +} + +__declspec(target(mic)) +inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect, + double dtc, int npart, int nsec) +{ + +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 ori; + if 
(gLastSect[i] > -1 && gLastSect[i] < nsec) { + ori = gOrient[gLastSect[i]]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + mic_double3 tmp = deviceTransformTo(p[i], ori); + mic_double3 X = x[i]; + double dotp = sqrt(1.0 + dot(tmp, tmp)); + updateR(X, tmp, dotp, dtc); + x[i] = X; + } + +} + +__declspec(target(mic)) +inline void pushTransform(mic_double3 *x, mic_double3 *p, mic_double3 *gOrient, long *gLastSect, + double *gdt, double c, int npart, int nsec) +{ + +#pragma omp parallel for simd + for (int i = 0; i < npart; i++) { + mic_double3 ori; + if (gLastSect[i] > -1 && gLastSect[i] < nsec) { + ori = gOrient[gLastSect[i]]; + } else { + ori.x = 0.0; + ori.y = 0.0; + ori.z = 0.0; + } + + mic_double3 tmp = deviceTransformTo(p[i], ori); + mic_double3 X = x[i]; + double dotp = sqrt(1.0 + dot(tmp, tmp)); + double dtc = gdt[i] * c; + + updateR(X, tmp, dotp, dtc); + x[i] = X; + } + +} + +int MICCollimatorPhysics::ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, + void *lastSec_ptr, + void *orient_ptr, int npart, + int nsec, void *dt_ptr, double dt, + double c, bool usedt, int streamId) +{ + + mic_double3 *x = (mic_double3*)x_ptr; + mic_double3 *p = (mic_double3*)p_ptr; + mic_double3 *gOrient = (mic_double3*)orient_ptr; + double *gdt = (double*)dt_ptr; + long *gLastSect = (long*)lastSec_ptr; + double dtc = dt * c; + + if (!usedt) { + +#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gOrient:length(0) DKS_RETAIN DKS_REUSE) \ + in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) in(npart, nsec, dtc) + { + pushTransform(x, p, gOrient, gLastSect, dtc, npart, nsec); + } + + } else { + +#pragma offload target(mic:m_micbase->m_device_id) in(x:length(0) DKS_RETAIN DKS_REUSE) \ + in(p:length(0) DKS_RETAIN DKS_REUSE) in(gdt:length(0) DKS_RETAIN DKS_REUSE) \ + in(gOrient:length(0) DKS_RETAIN DKS_REUSE) in(gLastSect:length(0) DKS_RETAIN DKS_REUSE) \ + in(npart, nsec, c) + { + 
pushTransform(x, p, gOrient, gLastSect, gdt, c, npart, nsec); + } + } + + return DKS_SUCCESS; + +} + + diff --git a/src/MIC/MICCollimatorPhysics.h b/src/MIC/MICCollimatorPhysics.h new file mode 100644 index 0000000..0795779 --- /dev/null +++ b/src/MIC/MICCollimatorPhysics.h @@ -0,0 +1,68 @@ +#ifndef H_MIC_COLLIMATORPHYSICS +#define H_MIC_COLLIMATORPHYSICS + +#include +#include +#include +#include +#include + +#include "../Algorithms/CollimatorPhysics.h" +#include "MICBase.h" + +__declspec(target(mic)) +typedef struct { + double x; + double y; + double z; +} mic_double3; + +__declspec(target(mic)) +typedef struct { + int label; + unsigned localID; + mic_double3 Rincol; + mic_double3 Pincol; +} MIC_PART_SMALL; + + +class MICCollimatorPhysics : DKSAlogorithms{ + +private: + + MICBase *m_micbase; + +public: + + MICCollimatorPhysics(MICBase *base) { + m_micbase = base; + }; + + ~MICCollimatorPhysics() { }; + + int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles); + + int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles); + + int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback); + + int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr, + void *rx_ptr, void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback); + + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1); + + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, + void *dt_ptr, double dt, double c, + bool usedt = false, int streamId = -1); + +}; + + +#endif diff --git a/src/MIC/MICFFT.cpp b/src/MIC/MICFFT.cpp new file mode 100644 index 0000000..ab82c83 --- /dev/null +++ b/src/MIC/MICFFT.cpp @@ -0,0 +1,210 @@ +#include 
"MICFFT.h" +#include +#include +#include +#include + +MICFFT::MICFFT(MICBase *base) { + m_micbase = base; +} + +MICFFT::~MICFFT() { +#pragma offload target(mic:0) + { + DftiFreeDescriptor(&FFTHandle_m); + DftiFreeDescriptor(&handle); + } +} + +//setup fft +int MICFFT::setupFFT(int ndim, int N[3]) { + //set up FFT engine +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + + MKL_LONG sizes[3], strides[4]; + sizes[0] = N[0]; sizes[1] = N[1]; sizes[2] = N[2]; + //strides[0] = 0; strides[1] = sizes[1]; strides[2] = 1; strides[3] = sizes[0]*sizes[1]; + strides[0] = 0; strides[1] = sizes[0]*sizes[1]; strides[2] = sizes[0]; strides[3] = 1; + + MKL_LONG dims = 3; + DftiCreateDescriptor(&(this->getHandle()), DFTI_DOUBLE, DFTI_COMPLEX, dims, sizes); + DftiSetValue(this->getHandle(), DFTI_INPUT_STRIDES, strides); + DftiSetValue(this->getHandle(), DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiCommitDescriptor(this->getHandle()); + + } + + + return DKS_SUCCESS; +} +//BENI: +//setup fft +int MICFFT::setupFFTRC(int ndim, int N[3], double scale) { + + //set up FFT engine for REAL->COMPLEX + +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + + MKL_LONG sizes[3], real_strides[4], complex_strides[4]; + sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0]; + //real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1; + real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1; + //real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1]; + //complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1; + complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1; + //complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 
sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1; + + MKL_LONG dims = 3; + DftiCreateDescriptor(&(this->getHandleRC()), DFTI_DOUBLE, DFTI_REAL, dims, sizes); + DftiSetValue(this->getHandleRC(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiSetValue(this->getHandleRC(), DFTI_PACKED_FORMAT, DFTI_CCE_FORMAT); + DftiSetValue(this->getHandleRC(), DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DftiSetValue(this->getHandleRC(), DFTI_INPUT_STRIDES, real_strides); + DftiSetValue(this->getHandleRC(), DFTI_OUTPUT_STRIDES, complex_strides); + DftiSetValue(this->getHandleRC(), DFTI_FORWARD_SCALE, scale); + DftiCommitDescriptor(this->getHandleRC()); + + } + + return DKS_SUCCESS; +} + +//BENI: +//setup fft +int MICFFT::setupFFTCR(int ndim, int N[3], double scale) { + + //set up FFT engine for COMPLEX->REAL + +#pragma offload target(mic:0) in(N:length(3) DKS_ALLOC DKS_FREE) + { + MKL_LONG sizes[3], real_strides[4], complex_strides[4]; + sizes[0] = N[2]; sizes[1] = N[1]; sizes[2] = N[0]; + //real_strides[0] = 0; real_strides[1] = 2*sizes[1]*(sizes[0]/2+1); real_strides[2] = 2*(sizes[0]/2+1); real_strides[3] = 1; + real_strides[0] = 0; real_strides[1] = sizes[2]*sizes[1]; real_strides[2] = sizes[2]; real_strides[3] = 1; + //real_strides[0] = 0; real_strides[1] = 1; real_strides[2] = sizes[0]; real_strides[3] = sizes[0]*sizes[1]; + //complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[0]/2+1); complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = 1; + complex_strides[0] = 0; complex_strides[1] = sizes[1]*(sizes[2]/2+1); complex_strides[2] = (sizes[2]/2+1); complex_strides[3] = 1; + //complex_strides[0] = 0; complex_strides[2] = (sizes[0]/2+1); complex_strides[3] = sizes[1]*(sizes[0]/2+1); complex_strides[1] = 1; + + MKL_LONG dims = 3; + DftiCreateDescriptor(&(this->getHandleCR()), DFTI_DOUBLE, DFTI_REAL, dims, sizes); + DftiSetValue(this->getHandleCR(),DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DftiSetValue(this->getHandleCR(), DFTI_PACKED_FORMAT, 
DFTI_CCE_FORMAT); + DftiSetValue(this->getHandleCR(), DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DftiSetValue(this->getHandleCR(), DFTI_INPUT_STRIDES, complex_strides); + DftiSetValue(this->getHandleCR(), DFTI_OUTPUT_STRIDES, real_strides); + DftiSetValue(this->getHandleCR(), DFTI_BACKWARD_SCALE, scale); + DftiCommitDescriptor(this->getHandleCR()); + + + + } + + return DKS_SUCCESS; +} + +//execute COMPLEX->COMPLEX FFT +int MICFFT::executeFFT(void *mem_ptr, int ndim, int N[3], int streamId, bool forward) { + + _Complex double *ptr = (_Complex double*) mem_ptr; + +#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(forward) + { + if (forward) + DftiComputeForward(this->getHandle(), ptr); + else + DftiComputeBackward(this->getHandle(), ptr); + } + + return DKS_SUCCESS; +} + +//execute iFFT +int MICFFT::executeIFFT(void *mem_ptr, int ndim, int N[3]) { + return mic_executeFFT(mem_ptr, ndim, N, -1, false); +} + +//execute REAL->COMPLEX FFT +int MICFFT::executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) { + + double *real_ptr = (double*) in_ptr; + //std::complex *compl_ptr = (std::complex *) out_ptr; + _Complex double *compl_ptr = (_Complex double *) out_ptr; + int sizereal = N[0]*N[1]*N[2]; + int sizecompl = (N[0]/2+1)*N[1]*N[2]; + +//std::cout << "start real-compl fft on mic " << std::endl; + + //std::cout << "real_ptr = " << real_ptr << std::endl; + //std::cout << "compl_ptr = " << compl_ptr << std::endl; + //std::cout << "EXECUTE AVERAGING OVER 10 LOOPS OF FFT" << std::endl; + +#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) + //#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE) + { + //for (int i=0;i<10;++i){ //loop 10 times for benchmarking + DftiComputeForward(this->getHandleRC(), real_ptr, compl_ptr); + //} + } + +//std::cout << "end real-compl fft on mic " << std::endl; + + + 
return DKS_SUCCESS; +} + +//execute COMPLEX->REAL FFT +int MICFFT::executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId) { + + //_Complex double *ptr = (_Complex double*) mem_ptr; + + double *real_ptr = (double*) out_ptr; + _Complex double *compl_ptr = (_Complex double *) in_ptr; + + //std::cout << "real_ptr = " << real_ptr << std::endl; + //std::cout << "compl_ptr = " << compl_ptr << std::endl; + int sizereal = N[0]*N[1]*N[2]; + int sizecompl = (N[0]/2+1)*N[1]*N[2]; + + //std::cout << "offload to perform backward fft ... " << std::endl; +//struct timeval start, end; +//gettimeofday(&start,NULL); +#pragma offload target(mic:0) in(real_ptr:length(0) DKS_RETAIN DKS_REUSE) in(compl_ptr:length(0) DKS_RETAIN DKS_REUSE) + //#pragma offload target(mic:0) nocopy(real_ptr:length(sizereal) RETAIN REUSE) nocopy(compl_ptr:length(sizecompl) RETAIN REUSE) + { + //for (int i=0;i<10;++i){ //loop 10 times for benchmarking + DftiComputeBackward(this->getHandleCR(), compl_ptr, real_ptr); + //} + } + +// End timing offloaded FFT. +//gettimeofday(&end,NULL); +// Print execution time of offloaded computational loop. +//printf ("Total time for IFFT spent = %f seconds\n", +//(double) (end.tv_usec-start.tv_usec) /1000000+(double) (end.tv_sec-start.tv_sec)); + //std::cout << "IFFT DONE!" 
<< std::endl; + return DKS_SUCCESS; +} + + +//normalize IFFT +int MICFFT::normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId) { + + int size = N[0] * N[1] * N[2]; + + _Complex double *ptr = (_Complex double*) mem_ptr; +#pragma offload target(mic:0) in(ptr:length(0) DKS_RETAIN DKS_REUSE) in(size) + { +#pragma omp parallel for + for (int i = 0; i < size; i++) { + __real__ ptr[i] = __real__ ptr[i] / size; + __imag__ ptr[i] = __imag__ ptr[i] / size; + } + } + + return DKS_SUCCESS; + +} + diff --git a/src/MIC/MICFFT.h b/src/MIC/MICFFT.h new file mode 100644 index 0000000..626fc19 --- /dev/null +++ b/src/MIC/MICFFT.h @@ -0,0 +1,79 @@ +#ifndef H_MIC_FFT +#define H_MIC_FFT + +#include +#include + +#include +#include + +#include "../Algorithm/DKSFFT.h" +#include "MICBase.h" + +class MICFFT : public DKSFFT { + +private: + + MICBase *m_micbase; + + /// Internal FFT object for performing serial FFTs. +#pragma offload_attribute(push,target(mic)) + DFTI_DESCRIPTOR_HANDLE FFTHandle_m; //declspec only works for global variables + DFTI_DESCRIPTOR_HANDLE handle; + DFTI_DESCRIPTOR_HANDLE rc_handle; //handle for REAL->COMPLEX + DFTI_DESCRIPTOR_HANDLE cr_handle; //handle for COMPLEX->REAL + +#pragma offload_attribute(pop) + + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle(void) { + return FFTHandle_m; + } + + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandle1(void) { + return handle; + } + + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleRC(void) { + return rc_handle; + } + + __attribute__((target(mic:0))) DFTI_DESCRIPTOR_HANDLE& getHandleCR(void) { + return cr_handle; + } + +public: + + /* constructor */ + MICFFT(MICBase *base); + + /* destructir */ + ~MICFFT(); + + /* + Info: setup mkl fft + Return: success or error code + */ + int setupFFT(int ndim, int N[3]); + //BENI: + int setupFFTRC(int ndim, int N[3], double scale = 1.0); + //BENI: + int setupFFTCR(int ndim, int N[3], double scale = 1.0); + + /* execute FFT on MIC */ + 
int executeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1, bool forward = true); + + /* execute IFFT on MIC */ + int executeIFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); + + /* execute REAL->COMPLEX FFT on MIC */ + int executeRCFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1); + + /* execute COMPLEX->REAL FFT on MIC */ + int executeCRFFT(void *in_ptr, void *out_ptr, int ndim, int N[3], int streamId = -1); + + /* normalize IFFT on MIC */ + int normalizeFFT(void *mem_ptr, int ndim, int N[3], int streamId = -1); + +}; + +#endif diff --git a/src/MIC/MICGreensFunction.cpp b/src/MIC/MICGreensFunction.cpp new file mode 100644 index 0000000..6725a1e --- /dev/null +++ b/src/MIC/MICGreensFunction.cpp @@ -0,0 +1,307 @@ +#include "MICGreensFunction.hpp" +#include +#include +#include + +/* constructor */ +MICGreensFunction::MICGreensFunction(MICBase *base) { + m_micbase = base; +} + +/* destructor */ +MICGreensFunction::~MICGreensFunction() { +} + + +/* compute greens integral analytically */ +// Version with extended domain +/* + int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0,double hr_m1, double hr_m2) { + double *tmp_ptr = (double*) tmp_ptr_; + #pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) + { + std::memset(tmp_ptr,0,(I+1)*(J+1)*(K+1)); + double cellVolume = hr_m0 * hr_m1 * hr_m2; + #pragma omp parallel for collapse(3) schedule(dynamic) + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + for (int i = 0; i < I; i++) { + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = 0; + tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 
* log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgrn = tmpgrn / cellVolume; + + tmp_ptr[k*(J+1)*(I+1) + j*(I+1) + i] = tmpgrn; + } + } + } + } + return 0; + } +*/ + +int MICGreensFunction::mic_GreensIntegral(void * tmp_ptr_, int I,int J, int K, double hr_m0, + double hr_m1, double hr_m2) +{ + + double *tmp_ptr = (double*) tmp_ptr_; +#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I, J,K, hr_m0, hr_m1, hr_m2) + { + std::memset(tmp_ptr,0,I*J*K); + double cellVolume = hr_m0 * hr_m1 * hr_m2; +#pragma omp parallel for collapse(3) schedule(dynamic) + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + for (int i = 0; i < I; i++) { + + double vv0 = i * hr_m0 - hr_m0 / 2; + double vv1 = j * hr_m1 - hr_m1 / 2; + double vv2 = k * hr_m2 - hr_m2 / 2; + + double r = sqrt(vv0 * vv0 + vv1 * vv1 + vv2 * vv2); + + double tmpgrn = 0; + tmpgrn += -vv2*vv2 * atan(vv0 * vv1 / (vv2 * r) ); + tmpgrn += -vv1*vv1 * atan(vv0 * vv2 / (vv1 * r) ); + tmpgrn += -vv0*vv0 * atan(vv1 * vv2 / (vv0 * r) ); + + tmpgrn = tmpgrn / 2; + + tmpgrn += vv1 * vv2 * log(vv0 + r); + tmpgrn += vv0 * vv2 * log(vv1 + r); + tmpgrn += vv0 * vv1 * log(vv2 + r); + + tmpgrn = tmpgrn / cellVolume; + + tmp_ptr[k*(J)*(I) + j*(I) + i] = tmpgrn; + } + } + } + } + return 0; +} + + + +/* perform the actual integration */ +// version with extended domain +/* + int MICGreensFunction::mic_IntegrationGreensFunction(void * mem_ptr_, void * tmp_ptr_,int I,int J, int K) { + double *tmp_ptr = (double*) tmp_ptr_; + double *mem_ptr = (double*) mem_ptr_; + +// the actual integration +#pragma offload target(mic:0) in(tmp_ptr:length(0) DKS_RETAIN DKS_REUSE) in(mem_ptr:length(0) DKS_RETAIN DKS_REUSE) in(I,J,K) +{ +int Ii = I; +int Jj = J; +int Kk = K; +int II = 2*(I-1); int JJ=2*(J-1); int KK=2*(K-1); +std::memset(mem_ptr,0,II*JJ*KK); +I=I+1; J=J+1; K=K+1; + +#pragma omp parallel for collapse(3) +for (int i=0; i +#include + +#include +#include + 
/*
  Comparator convention used by every routine in this header:
  comp(x, y) returns true when x has to be placed AFTER y.
  With the default greaterThan the result is therefore ascending.
*/

/* default comparison function */
template <typename T>
inline bool greaterThan(T x, T y) {
  return x > y;
}

/* swap a and b */
template <typename T>
void mergeswap(T &a, T &b) {
  T tmp = a;
  a = b;
  b = tmp;
}

/*
  Forward declarations. split_merge and quick_sort call helpers that are
  defined further down; without these declarations the calls are not found
  by two-phase name lookup when T is a fundamental type.
*/
template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T));

template <typename T>
void quick_sort(T *list, int start, int end, bool (*comp)(T, T));

template <typename T>
void insertion_sort(T *list, int start, int end, bool (*comp)(T, T));

/*
  Recursively sort a[ibegin, iend) using b as scratch space of the same size.
  Ranges shorter than 500 elements are handed to quick_sort; larger ranges are
  split in two halves, the first half being sorted in an OpenMP task.
*/
template <typename T>
void split_merge(T *a, int ibegin, int iend, T *b, bool (*comp)(T, T) ) {

  if (iend - ibegin < 500) {
    quick_sort(a + ibegin, 0, iend - ibegin - 1, comp);
    return;
  }

  //written as offset from ibegin so that the sum can not overflow
  int imiddle = ibegin + (iend - ibegin) / 2;

#pragma omp task
  split_merge(a, ibegin, imiddle, b, comp);
  split_merge(a, imiddle, iend, b, comp);
#pragma omp taskwait

  merge(a, ibegin, imiddle, iend, b, comp);

}

/*
  Merge the sorted ranges a[ibegin, imiddle) and a[imiddle, iend) through the
  scratch array b and copy the merged result back into a.
*/
template <typename T>
void merge(T *a, int ibegin, int imiddle, int iend, T *b, bool (*comp)(T, T)) {

  int i0 = ibegin;
  int i1 = imiddle;

  //merge two halfs of array a to tmp array b
  int i = ibegin;
  while (i < iend) {
    if (i0 < imiddle && ( i1 >= iend || comp(a[i1], a[i0]) ) )
      b[i++] = a[i0++];
    else
      b[i++] = a[i1++];
  }

  //copy b back to a
  for (int k = ibegin; k < iend; k++)
    a[k] = b[k];

}

/*
  Partition a[start, end] (inclusive) around the pivot a[start].
  Elements that the pivot has to follow are moved in front of it.
  Return: final index of the pivot
*/
template <typename T>
int partition(T *a, int start, int end, bool (*comp)(T, T) ) {
  int p = start;
  T x = a[start];

  for (int i = start + 1; i <= end; i++) {
    if ( comp(x, a[i]) ) {
      p++;
      mergeswap(a[i], a[p]);
    }
  }
  mergeswap(a[p], a[start]);
  return p;
}

/*
  Sort the first n elements of list.
  A scratch buffer of n elements is allocated for the merge steps and is
  released again before returning.
*/
template <typename T>
void merge_sort( T *list, int n, bool (*comp)(T, T) = greaterThan<T>) {

  //nothing to do for empty or single element input
  if (n < 2)
    return;

  T *b = new T[n];

#pragma omp parallel
  {
#pragma omp single
    {
      split_merge(list, 0, n, b, comp);
    }
  }

  //the scratch buffer is only needed while merging
  delete[] b;
}

/* Sort list[start, end] (inclusive bounds) */
template <typename T>
void quick_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  if (start < end) {
    //for small elements move to insertion sort
    if ( (end - start) < 9 ) {
      insertion_sort(list, start, end + 1, comp);
    } else {
      int part = partition(list, start, end, comp);
      quick_sort(list, start, part - 1, comp);
      quick_sort(list, part + 1, end, comp);
    }
  }

}

/* Sort list[start, end) (end is exclusive); elements outside the range are not touched */
template <typename T>
void insertion_sort( T *list, int start, int end, bool (*comp)(T, T) ) {

  for (int i = start + 1; i < end; i++) {
    T key = list[i];
    int j = i - 1;
    //stop at start, not at 0, so elements in front of the range stay in place
    while ( j >= start && comp(list[j], key) ) {
      list[j + 1] = list[j];
      j--;
    }
    list[j + 1] = key;
  }

}
a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp new file mode 100644 index 0000000..b40fd64 --- /dev/null +++ b/src/OpenCL/OpenCLBase.cpp @@ -0,0 +1,1132 @@ +#include "OpenCLBase.h" + +cl_context OpenCLBase::m_context = NULL; +cl_command_queue OpenCLBase::m_command_queue = NULL; +cl_platform_id OpenCLBase::m_platform_id = NULL; +cl_device_id OpenCLBase::m_device_id = NULL; +cl_event OpenCLBase::m_last_event = NULL; + +OpenCLBase::OpenCLBase() { + //m_context = NULL; + //m_command_queue = NULL; + m_program = NULL; + m_kernel = NULL; + //m_device_id = NULL; + //m_platform_id = NULL; + m_kernel_file = NULL; + + m_last_event = NULL; + + //m_events = new cl_event[500]; + //m_num_events = 0; + + defaultRndSet = 0; + +} + +OpenCLBase::~OpenCLBase() { + ocl_cleanUp(); + m_last_event = NULL; + + if (defaultRndSet == 1) + ocl_deleteRndStates(); +} + +/* create random states */ +int OpenCLBase::ocl_createRndStates(int size) { + //load kernel + char * kernel_file = new char[500]; + kernel_file[0] = '\0'; + strcat(kernel_file, OPENCL_KERNELS); + strcat(kernel_file, "OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl"); + ocl_loadKernel(kernel_file); + delete[] kernel_file; + + //allocate memory for rand states + int ierr; + defaultRndState = ocl_allocateMemory(sizeof(RNDState)*size, ierr); + + //exec kernel + int seed = 0; + ocl_createKernel("initRand"); + ocl_setKernelArg(0, sizeof(cl_mem), &defaultRndState); + ocl_setKernelArg(1, sizeof(unsigned int), &seed); + ocl_setKernelArg(2, sizeof(int), &size); + + size_t work_items = size; + size_t work_group_size = 1; + + ocl_executeKernel(1, &work_items, &work_group_size); + + defaultRndSet = 1; + + return OCL_SUCCESS; + +} + +/* destroy rnd states */ +int OpenCLBase::ocl_deleteRndStates() { + + ocl_freeMemory(defaultRndState); + defaultRndSet = 0; + + return OCL_SUCCESS; + +} + + +/* + get platform id and device id of device specified by device_name (device name can be -mic, -cpu, -gpu, -all) + finds the first device of the 
specified type and saves device id and platform id +*/ +int OpenCLBase::ocl_getDevice(const char* device_name) { + + int ierr = 0; + + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices; + + //get device type from name, return with error on failure + ierr = ocl_getDeviceType(device_name, m_device_type); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Can't find device, OpenCL error: " << ierr << ", " << device_name); + return ierr; + } + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return ierr; + } + + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return ierr; + } + + //search each platform for specified device + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get number of devices and first avaialble device id + ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 1, &m_device_id, &num_of_devices); + + if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) { + DEBUG_MSG("Can't find device id's, OpenCL error: " << ierr); + return ierr; + } + + //if device exists in current platform + if (num_of_devices > 0) { + //save platform id + m_platform_id = tmp_platform_ids[i]; + + //get the name of device that will be used and print its name + size_t size; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size); + + char* info = new char[size]; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, info, NULL); + + DEBUG_MSG("Accelerator device: " << info); + delete[] info; + + //get the name of the platform + clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, 0, NULL, &size); + info = new char[size]; + clGetPlatformInfo(m_platform_id, CL_PLATFORM_NAME, size, info, NULL); + + DEBUG_MSG("Accelerator platform: " << info); + + 
return OCL_SUCCESS; + } + } + + return OCL_ERROR; +} + +int OpenCLBase::ocl_getDeviceCount(int &ndev) { + int ierr = DKS_SUCCESS; + + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices, total_devices; + + //find platform count + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //find all platform IDs + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return ierr; + } + + //for each platform find number of devices + total_devices = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device count for platform + ierr = clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + if (ierr != CL_SUCCESS && ierr != CL_DEVICE_NOT_FOUND) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + total_devices += num_of_devices; + num_of_devices = 0; + } + + ndev = total_devices; + return DKS_SUCCESS; + +} + +int OpenCLBase::ocl_getDeviceName(std::string &device_name) { + + int ierr = DKS_SUCCESS; + size_t size; + + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, 0, NULL, &size); + char* name = new char[size]; + clGetDeviceInfo(m_device_id, CL_DEVICE_NAME, size, name, NULL); + + device_name = name; + delete[] name; + return ierr; +} + +int OpenCLBase::ocl_setDevice(int device) { + + int ierr; + + cl_device_id *tmp_device_ids; + cl_platform_id *tmp_platform_ids; + cl_int *tmp_device_counts; + cl_uint num_of_platforms, num_of_devices; + cl_uint total_devices = 0; + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + tmp_platform_ids = 
new cl_platform_id[num_of_platforms]; + tmp_device_counts = new cl_int[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //search each platform for specified device + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get the number of devices in the platform + num_of_devices = 0; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + tmp_device_counts[i] = num_of_devices; + total_devices += num_of_devices; + } + + //check in which platform the selected device is located + int tmp_count = 0; + int checked_count = 0; + int id = -1; + int platform = -1; + for (unsigned int i = 0; i < num_of_platforms; i++) { + tmp_count += tmp_device_counts[i]; + if (device < tmp_count) { + id = device - checked_count; + platform = i; + break; + } + checked_count += tmp_device_counts[i]; + } + + ierr = DKS_ERROR; + if (id > 0) { + num_of_devices = tmp_device_counts[platform]; + tmp_device_ids = new cl_device_id[num_of_devices]; + clGetDeviceIDs(tmp_platform_ids[platform], m_device_type, num_of_devices, tmp_device_ids, NULL); + + m_device_id = tmp_device_ids[id]; + m_platform_id = tmp_platform_ids[platform]; + ierr = ocl_createContext(); + + delete[] tmp_device_ids; + } + + delete[] tmp_platform_ids; + delete[] tmp_device_counts; + + return ierr; +} + +int OpenCLBase::ocl_getUniqueDevices(std::vector &devices) { + + int ierr; + + size_t size; + cl_device_id *tmp_device_ids; + cl_platform_id *tmp_platform_ids; + cl_uint num_of_platforms, num_of_devices; + + //find all available platforms + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num platforms, OpenCL error: " << ierr); + return DKS_ERROR; + } + + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr 
!= CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL error: " << ierr); + return DKS_ERROR; + } + + std::vector< std::string > names; + int checked_count = 0; + int id = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + + //get the number of devices in the platform + num_of_devices = 0; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, 0, NULL, &num_of_devices); + tmp_device_ids = new cl_device_id[num_of_devices]; + clGetDeviceIDs(tmp_platform_ids[i], m_device_type, num_of_devices, tmp_device_ids, NULL); + + for (unsigned int j = 0; j < num_of_devices; j++) { + id = checked_count + j; + clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, 0, NULL, &size); + char* name = new char[size]; + clGetDeviceInfo(tmp_device_ids[j], CL_DEVICE_NAME, size, name, NULL); + std::string target = name; + if (id == 0) { + devices.push_back(id); + names.push_back(target); + } else { + bool isPresent = (std::find(names.begin(), names.end(), target) != names.end()); + if (!isPresent) { + devices.push_back(id); + names.push_back(target); + } + } + delete[] name; + } + + checked_count += num_of_devices; + delete[] tmp_device_ids; + } + + delete[] tmp_platform_ids; + + return DKS_SUCCESS; +} + +/* + checks wether device name is specified and sets device type to search for + if invalid device name is specified set device type to default +*/ +int OpenCLBase::ocl_getDeviceType(const char* device_name, cl_device_type &device_type) { + + device_type = CL_DEVICE_TYPE_DEFAULT; + + if (strcmp(device_name, "-mic") == 0) + device_type = CL_DEVICE_TYPE_ACCELERATOR; + + if (strcmp(device_name, "-cpu") == 0) + device_type = CL_DEVICE_TYPE_CPU; + + if (strcmp(device_name, "-gpu") == 0) + device_type = CL_DEVICE_TYPE_GPU; + + if (strcmp(device_name, "-all") == 0) + device_type = CL_DEVICE_TYPE_ALL; + + return OCL_SUCCESS; +} + +/* + creates a context and command queue between host and device +*/ +int OpenCLBase::ocl_createContext() { + int ierr; + + //context properties list + 
m_context_properties[0] = CL_CONTEXT_PLATFORM; + m_context_properties[1] = (cl_context_properties) m_platform_id; + m_context_properties[2] = 0; + + //create a context with specified device + m_context = clCreateContext(m_context_properties, 1, &m_device_id, NULL, NULL, &ierr); + + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't create context, OpenCL error: " << ierr); + return ierr; + } + + //create command queue using context and device + //m_command_queue = clCreateCommandQueue(m_context, m_device_id, CL_QUEUE_PROFILING_ENABLE, &ierr); + m_command_queue = clCreateCommandQueue(m_context, m_device_id, 0, &ierr); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't create command queue, OpenCL error: " << ierr); + return ierr; + } + + return OCL_SUCCESS; +} + +/* + read file specified by kernel_file and compile the kernel code contained in kernel_file + save reference to the built program to m_program, from witch individual kernels can be extracted +*/ +int OpenCLBase::ocl_buildProgram(const char *kernel_file) { + + cl_int ierr; + long fsize; + char *kernel_source; + + //open file + FILE *fp = fopen(kernel_file, "rb"); + if (!fp) { + DEBUG_MSG("Can't open kernel file: " << kernel_file); + return OCL_ERROR; + } + + //get file size and allocate memory + fseek(fp, 0, SEEK_END); + fsize = ftell(fp); + kernel_source = new char[fsize+1]; + + //read file and content in kernel source + rewind(fp); + fread(kernel_source, 1, sizeof(char)*fsize, fp); + kernel_source[fsize] = '\0'; + fclose(fp); + + ierr = ocl_compileProgram(kernel_source); + + //save currently loaded kernel file + m_kernel_file = new char[strlen(kernel_file) + 1]; + strcpy(m_kernel_file, kernel_file); + + return ierr; + +} + +//given kernel source compile the OpenCL programm +int OpenCLBase::ocl_compileProgram(const char* kernel_source, const char* opts) { + + int ierr; + + //create program from kernel + m_program = clCreateProgramWithSource(m_context, 1, (const char **)&kernel_source, NULL, &ierr); + if (ierr != 
CL_SUCCESS) { + DEBUG_MSG("Error creating program from source, OpenCL error: " << ierr); + return DKS_ERROR; + } + + //compile the program, if compilation + ierr = clBuildProgram(m_program, 0, NULL, opts, NULL, NULL); + + /* + check if compileng kernel source succeded, if failed return error code + if in debug mode get compilation info and print program build log witch + will give indication what made the compilation fail + */ +#ifdef DEBUG + if (ierr != CL_SUCCESS) { + + //get build status + cl_build_status status; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); + + //get log size + size_t log_size; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + + //get log message + char *log = new char[log_size]; + clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, log_size+1, log, NULL); + + //print log messsage + DEBUG_MSG("Build failed! Status:" << status); + DEBUG_MSG("LOG: " << log); + + delete[] log; + + return DKS_ERROR; + } +#else + if (ierr != CL_SUCCESS) + return DKS_ERROR; +#endif + + return DKS_SUCCESS; + +} + + + +//=========================================// +//===============public functions==========// +//=========================================// + +/* + get all device from all platforms +*/ +int OpenCLBase::ocl_getAllDevices() { + + int ierr = DKS_SUCCESS; + + cl_platform_id *tmp_platform_ids, *platform_ids; + cl_uint num_of_platforms, num_of_devices, total_devices; + cl_device_id *tmp_device_ids, *device_ids; + + //find platform count + ierr = clGetPlatformIDs(0, NULL, &num_of_platforms); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of platforms, OpenCL error: " << ierr); + return OCL_ERROR; + } + + //find all platform IDs + tmp_platform_ids = new cl_platform_id[num_of_platforms]; + ierr = clGetPlatformIDs(num_of_platforms, tmp_platform_ids, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find platform id's, OpenCL 
error: " << ierr); + return ierr; + } + + //for each platform find number of devices + total_devices = 0; + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device count for platform + ierr = clGetDeviceIDs(tmp_platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_of_devices); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + total_devices += num_of_devices; + } + + //get all device ids + int idx = 0; + platform_ids = new cl_platform_id[total_devices]; + device_ids = new cl_device_id[total_devices]; + tmp_device_ids = new cl_device_id[total_devices]; + + for (unsigned int i = 0; i < num_of_platforms; i++) { + //get device ids + ierr = clGetDeviceIDs(tmp_platform_ids[i], CL_DEVICE_TYPE_ALL, total_devices, tmp_device_ids, &num_of_devices); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Can't find num of devices, OpenCL error: " << ierr); + return OCL_ERROR; + } + + for (unsigned j = 0; j < num_of_devices; j++) { + platform_ids[idx] = tmp_platform_ids[i]; + device_ids[idx] = tmp_device_ids[j]; + idx++; + } + } + + std::cout << std::endl; + std::cout << "==============================" << std::endl; + std::cout << "============OpenCL============" << std::endl; + std::cout << "==============================" << std::endl; + + for (unsigned int i = 0; i < total_devices; i++) { + + //get the name of device that will be used and print its name + size_t size; + + DEBUG_MSG("Device " << i+1 << ":"); + + clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 0, NULL, &size); + char *device_name = new char[size]; + clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, size, device_name, NULL); + DEBUG_MSG("Name: \"" << device_name << "\""); + + clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, 0, NULL, &size); + char *device_vendor = new char[size]; + clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, size, device_vendor, NULL); + DEBUG_MSG("Vendor: \"" << device_vendor << "\""); + + cl_device_type device_type; + 
clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + + if (device_type == CL_DEVICE_TYPE_GPU) { + DEBUG_MSG("Device type: GPU"); + } else if (device_type == CL_DEVICE_TYPE_CPU) { + DEBUG_MSG("Device type: CPU"); + } else if (device_type == CL_DEVICE_TYPE_ACCELERATOR) { + DEBUG_MSG("Device type: Accelerator"); + } + + std::cout << "==============================" << std::endl; + + } + + return OCL_SUCCESS; +} + + +/* + find available device, create context and command queue, load kernel file and kompile kernel code +*/ +int OpenCLBase::ocl_setUp(const char *device_name) { + cl_int ierr; + ierr = ocl_getDevice(device_name); + if (ierr != CL_SUCCESS) + return ierr; + + ocl_deviceInfo(false); + + ierr = ocl_createContext(); + if (ierr != CL_SUCCESS) + return ierr; + + return DKS_SUCCESS; +} + +/* + load and compile kernel file if it has changed +*/ +int OpenCLBase::ocl_loadKernel(const char * kernel_file) { + int ierr = OCL_SUCCESS; + + //kernel file has changed + if (m_kernel_file == NULL) { + ierr = ocl_buildProgram(kernel_file); + } else { + if (strcmp(m_kernel_file, kernel_file) != 0) { + ierr = ocl_buildProgram(kernel_file); + } + } + + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Failed to build kernel file " << kernel_file); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + +//compile kernel form source code provided +int OpenCLBase::ocl_loadKernelFromSource(const char *kernel_source, const char *opts) { + + int ierr = ocl_compileProgram(kernel_source, opts); + + return ierr; +} + +/* + Allocate memory buffer of specified size and type, + available types (read only, write only, read/write) + return memory object +*/ +cl_mem OpenCLBase::ocl_allocateMemory(size_t size, int type, cl_int &ierr) { + cl_mem mem; + mem = clCreateBuffer(m_context, type, size, NULL, &ierr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error allocating memory, OpenCL error: " << ierr); + + return mem; +} + +/* + Allocate memory buffer of specified size, 
type is set to read/write + return memory object +*/ +cl_mem OpenCLBase::ocl_allocateMemory(size_t size, cl_int &ierr) { + cl_mem mem; + + mem = clCreateBuffer(m_context, CL_MEM_READ_WRITE, size, NULL, &ierr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error allocating memory, OpenCL error: " << ierr); + + return mem; +} + +/* + write data specified by in_data to device memory, device memory space defined by cl_mem +*/ +int OpenCLBase::ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset, int blocking) { + + cl_int ierr; + + + //std::cout << "Write: " << size*1e-9 << " gb of data" << std::endl; + ierr = clEnqueueWriteBuffer(m_command_queue, mem_ptr, blocking, offset, size, in_data, 0, NULL, &m_last_event); + + //m_events[m_num_events] = m_last_event; + m_events.push_back(m_last_event); + + + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error writing data to device, OpenCL error: " << ierr); + return ierr; + } + + return OCL_SUCCESS; +} + +/* + copy src buffer into dst buffer +*/ +int OpenCLBase::ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size) { + + int ierr; + ierr = clEnqueueCopyBuffer(m_command_queue, src_ptr, dst_ptr, 0, 0, size, 0, NULL, NULL); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error copying buffers, OpenCL error: " << ierr); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + + +/* + create kernel specified by kernel_name from compiled program +*/ +int OpenCLBase::ocl_createKernel(const char* kernel_name) { + cl_int ierr; + m_kernel = clCreateKernel(m_program, kernel_name, &ierr); + if (ierr != CL_SUCCESS) { + DEBUG_MSG("Error creating kernel, OpenCL error: " << ierr); + return ierr; + } + return OCL_SUCCESS; +} + +/* + set kernel argument, idx is the index of arument, size specifies data size, arg_value value of the argument +*/ +int OpenCLBase::ocl_setKernelArg(int idx, size_t size, const void *arg_value) { + cl_int ierr; + ierr = clSetKernelArg(m_kernel, idx, size, arg_value); + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error 
setting kernel arg, OpenCL error: " << ierr); + + return ierr; +} + +/* + executes set kernel, must provide dimensions ndim (1, 2 or 3) and total number of work items + work_items should be an arry of size ndim + optional: work_group_size - can specify how work items are divided in work groups, + if left NULL OpenCL implementation handles this part. +*/ +int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const size_t *work_group_size) { + cl_int ierr; + + cl_event tmp_event; + if (m_last_event == NULL) { + ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, + 0, NULL, &tmp_event); + } else { + ierr = clEnqueueNDRangeKernel(m_command_queue, m_kernel, ndim, NULL, work_items, work_group_size, + 1, &m_last_event, &tmp_event); + } + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr); + + m_last_event = tmp_event; + m_events.push_back(m_last_event); + + return ierr; +} + +/* + read data from device, mem_ptr points to data on device out_data points to memory in host + blocking specifies wether the read operation is blocking (default CL_TRUE) or non blocking (CL_FALSE) +*/ +int OpenCLBase::ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset, int blocking) { + cl_int ierr; + + ierr = clEnqueueReadBuffer(m_command_queue, mem_ptr, blocking, offset, size, out_data, 0, NULL, &m_last_event); + + m_events.push_back(m_last_event); + + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error reading data from device, OpenCL error: " << ierr); + + return ierr; +} + +/* + free device memory specified by mem_ptr +*/ +int OpenCLBase::ocl_freeMemory(cl_mem mem_ptr) { + cl_int ierr; + ierr = clReleaseMemObject(mem_ptr); + if (ierr != CL_SUCCESS) + DEBUG_MSG("Error freeing memory on device, OpenCL error: " << ierr); + + return ierr; +} + +/* + delete created OpenCL resources +*/ +int OpenCLBase::ocl_cleanUp() { + + if (m_kernel != NULL) { + clReleaseKernel(m_kernel); + m_kernel 
= NULL; + } + + if (m_program != NULL) { + clReleaseProgram(m_program); + m_program = NULL; + } + + if (m_command_queue != NULL) { + clReleaseCommandQueue(m_command_queue); + m_command_queue = NULL; + } + + if (m_context != NULL) { + clReleaseContext(m_context); + m_context = NULL; + } + + return OCL_SUCCESS; +} + +int OpenCLBase::ocl_deviceInfo(bool verbose) { + + + if (m_device_id == NULL) { + std::cout << "Device not set" << std::endl; + return OCL_ERROR; + } + + + char *info; + cl_bool b_info; + cl_ulong ul_info; + cl_uint ui_info; + size_t info_size; + //size_t *wi_info; + cl_device_type device_type; + + const int count = 12; + const char *info_type[count] = {"char", "cl_device_type", "cl_bool", + "cl_bool", "cl_ulong", "cl_uint", + "cl_uint", "cl_ulong", "size_t", + "size_t[]", "cl_ulong", "char"}; + const char* info_name[count] = {"Name", "Device type","Device available", + "Compiler available", "Global mem size (gb)", "Max clock freq (MHz)", + "Max compute units", "Max buffer size (bytes)", "Max work group size", + "Max work item sizes", "Local mem size (bytes)", "Extensions"}; + const cl_device_info info_value[count] = {CL_DEVICE_NAME, CL_DEVICE_TYPE, CL_DEVICE_AVAILABLE, + CL_DEVICE_COMPILER_AVAILABLE, CL_DEVICE_GLOBAL_MEM_SIZE, CL_DEVICE_MAX_CLOCK_FREQUENCY, + CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, CL_DEVICE_MAX_WORK_GROUP_SIZE, + CL_DEVICE_MAX_WORK_ITEM_SIZES, CL_DEVICE_LOCAL_MEM_SIZE, CL_DEVICE_EXTENSIONS}; + + int print_count; + if (verbose) + print_count = count; + else + print_count = 3; + + + std::cout << "--------------------" << std::endl; + std::cout << "OpenCL device information" << std::endl; + std::cout << "--------------------" << std::endl; + + for (int k = 0; k < print_count; k++) { + if (strcmp(info_type[k], "char") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], 0, NULL, &info_size); + info = new char[info_size]; + clGetDeviceInfo(m_device_id, info_value[k], info_size, info, NULL); + std::cout << 
info_name[k] << ": " << info << std::endl; + delete[] info; + + } else if (strcmp(info_type[k], "cl_bool") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_bool), &b_info, NULL); + std::cout << info_name[k] << ": " << b_info << std::endl; + + } else if (strcmp(info_type[k], "cl_ulong") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_ulong), &ul_info, NULL); + + if (info_value[k] == CL_DEVICE_GLOBAL_MEM_SIZE) { + double gb = (double)ul_info*1e-9; + std::cout << info_name[k] << ": " << gb << std::endl; + } else if (info_value[k] == CL_DEVICE_LOCAL_MEM_SIZE) { + std::cout << info_name[k] << ": " << ul_info << std::endl; + std::cout << "512^2 bytes: " << sizeof(cl_double2)*512*5 << std::endl; + } else { + std::cout << info_name[k] << ": " << ul_info << std::endl; + } + } else if (strcmp(info_type[k], "cl_uint") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_uint), &ui_info, NULL); + std::cout << info_name[k] << ": " << ui_info << std::endl; + + } else if (strcmp(info_type[k], "size_t") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(size_t), &info_size, NULL); + std::cout << info_name[k] << ": " << info_size << std::endl; + + } else if (strcmp(info_type[k], "size_t[]") == 0 ){ + clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &ui_info, NULL); + size_t wi_info[3];// = new size_t[ui_info]; + clGetDeviceInfo(m_device_id, info_value[k], 3 * sizeof(size_t), &wi_info, NULL); + std::cout << info_name[k] << ": "; + for (unsigned int m = 0; m < ui_info; m++) + std::cout << wi_info[m] << " "; + std::cout << std::endl; + + } else if (strcmp(info_type[k], "cl_device_type") == 0) { + clGetDeviceInfo(m_device_id, info_value[k], sizeof(cl_device_type), &device_type, NULL); + switch (device_type) { + case CL_DEVICE_TYPE_CPU: + std::cout << info_name[k] << ": CPU" << std::endl; + break; + case CL_DEVICE_TYPE_GPU: + std::cout << info_name[k] << ": GPU" << std::endl; + break; + case 
CL_DEVICE_TYPE_ACCELERATOR: + std::cout << info_name[k] << ": Accelerator" << std::endl; + break; + case CL_DEVICE_TYPE_DEFAULT: + std::cout << info_name[k] << ": Default" << std::endl; + break; + default: + std::cout << info_name[k] << ": Unknown" << std::endl; + break; + } + } + } + return OCL_SUCCESS; +} + +//check kernel: create it, query device and kernel limits and print them +//threadsPerBlock receives the work group size reported for the kernel +int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size, + bool double_precision, int &threadsPerBlock) +{ + + //build kernel + int ierr = ocl_createKernel(kernel_name); + if (ierr != DKS_SUCCESS) + return ierr; + + //get device properties + size_t max_group_size; + clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0); + cl_ulong local_mem_size; + clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0); + //extension string: size stays 0 and the zero filled buffer stays terminated if a query fails + size_t ext_size = 0; + clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size); + char *ext = new char[ext_size + 1](); + clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0); + + //get kernel properties + size_t kernel_group_size; + clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &kernel_group_size, 0); + threadsPerBlock = kernel_group_size; + + cl_ulong kernel_local_mem; + clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(cl_ulong), &kernel_local_mem, 0); + + + std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl; + + + std::cout << "Work groups: device limit " << max_group_size << ", " + << "kernel limit " << kernel_group_size << ", " + << "required " << work_group_size << std::endl; + + + std::cout << "Local memory: device limit " << local_mem_size << std::endl; + + + + std::cout << "Available extensions: " << ext << std::endl; + + //extension string is only needed for the printout above, release it + delete[] ext; + + std::cout << "End " << kernel_name << " check..."
<< std::endl << std::endl; + + return DKS_SUCCESS; +} + +void OpenCLBase::ocl_clearEvents() { + + m_events.clear(); + + //delete[] m_events; + //m_num_events = 0; + //m_events = new cl_event[500]; + +} + + + +void OpenCLBase::ocl_eventInfo() { + + std::cout << "Number of events launched: " << m_events.size() << std::endl; + + if (m_events.size() > 0) { + + cl_ulong twrite = 0; + cl_ulong texec = 0; + cl_ulong tread = 0; + int cw = 0; + int ce = 0; + int cr = 0; + + for (unsigned i = 0; i < m_events.size(); i++) { + + cl_ulong tqueue, tsubmit, tstart, tend; + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &tqueue, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_SUBMIT, + sizeof(cl_ulong), &tsubmit, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &tstart, NULL); + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &tend, NULL); + + cl_command_type type; + clGetEventInfo(m_events[i], CL_EVENT_COMMAND_TYPE, sizeof(cl_int), &type, NULL); + + if (type == CL_COMMAND_WRITE_BUFFER) { + twrite += (tend - tstart); + cw++; + } + + if (type == CL_COMMAND_READ_BUFFER) { + tread += (tend - tstart); + cr++; + } + + if (type == CL_COMMAND_NDRANGE_KERNEL) { + texec += (tend - tstart); + ce++; + } + } + + std::cout << "OpenCL write: " << (twrite * 1e-9) << " in: " << cw << std::endl; + std::cout << "OpenCL exec: " << (texec * 1e-9) << " in: " << ce << std::endl; + std::cout << "OpenCL read: " << (tread * 1e-9) << " in: " << cr << std::endl; + + } + + /* + cl_ulong tqueue, tsubmit, tstart, tend, tref; + + int *list_bad_events = new int[m_num_events]; + int num_bad_events = 0; + + if (m_num_events > 0) { + + double *list_ended = new double[m_num_events]; + + clGetEventProfilingInfo(m_events[0], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &tref, NULL); + + std::cout << std::endl; + std::cout << setw(10) << left << "Event\t| "; + std::cout << 
setw(10) << left << "queued\t| "; + std::cout << setw(10) << left << "submited\t| "; + std::cout << setw(10) << left << "started\t| "; + std::cout << setw(10) << left << "ended \t| "; + + std::cout << setw(10) << left << "in queue" << std::endl; + std::cout << setw(10) << "-----------------------------------------------------------------------------------" << std::endl; + for (unsigned int i = 0; i < m_num_events; i++) { + + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &tqueue, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &tsubmit, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &tstart, NULL); + clGetEventProfilingInfo(m_events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tend, NULL); + + cl_command_type type; + clGetEventInfo(m_events[i], CL_EVENT_COMMAND_TYPE, sizeof(cl_int), &type, NULL); + + tqueue = (tqueue >= tref) ? tqueue - tref : tqueue; + tsubmit = (tsubmit > tref) ? tsubmit - tref : tsubmit; + tstart = (tstart > tref) ? tstart - tref : tstart; + tend = (tend > tref) ? 
tend - tref : tend; + + if (type == CL_COMMAND_READ_BUFFER || type == CL_COMMAND_WRITE_BUFFER) + std::cout << left << i << "*\t| "; + else + std::cout << left << i << "\t| "; + std::cout << setw(7) << left << tqueue << "\t| "; + std::cout << setw(7) << left << tsubmit << "\t| "; + std::cout << setw(7) << left << tstart << "\t| "; + std::cout << setw(7) << left << tend << "\t| "; + + int count = 0; + if (i > 0) { + for (unsigned int j = 0; j < i; j++) { + if (list_ended[j] > tqueue) + count++; + } + } + list_ended[i] = tend; + + std::cout << setw(7) << left << count << std::endl; + + //this seems to be a problem on MIC sometimes + if (tstart == 0) { + list_bad_events[num_bad_events] = i; + num_bad_events++; + } + } + std::cout << setw(10) << "-----------------------------------------------------------------------------------" << std::endl << std::endl; + + //print info about failed events + for (int i = 0; i < num_bad_events; i++) { + cl_int event_status; + int id = list_bad_events[i]; + clGetEventInfo(m_events[id], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &event_status, NULL); + std::cout << "Event " << id << " : "; + switch(event_status) { + case CL_QUEUED: + std::cout << "queued" << std::endl; + break; + case CL_SUBMITTED: + std::cout << "submited" << std::endl; + break; + case CL_RUNNING: + std::cout << "running" << std::endl; + break; + case CL_COMPLETE: + std::cout << "complete" << std::endl; + break; + default: + std::cout << "error" << std::endl; + break; + } + } + } + */ + +} + + + + + + + + + + diff --git a/src/OpenCL/OpenCLBase.h b/src/OpenCL/OpenCLBase.h new file mode 100644 index 0000000..ae0a15c --- /dev/null +++ b/src/OpenCL/OpenCLBase.h @@ -0,0 +1,303 @@ +/* + + Name: OpenCLBase + + Author: Uldis Locans + + Info: OpenCL base class to handle all the common details associated + with kernel launch on OpenCL device + + Date: 2014.09.18 + +*/ + +#ifndef H_OPENCL_BASE +#define H_OPENCL_BASE + +#include +#include +#include +#include +#include 
+#include + + +#ifdef __APPLE__ +#include +#include +#else +#include +#include +#endif + + + +#include "../DKSDefinitions.h" + +/* struct for random number state */ +typedef struct { + + double s10; + double s11; + double s12; + double s20; + double s21; + double s22; + double z; + bool gen; + +} RNDState; + +class OpenCLBase { + +private: + + static cl_context m_context; + static cl_command_queue m_command_queue; + + static cl_platform_id m_platform_id; + static cl_device_id m_device_id; + + cl_context_properties m_context_properties[3]; + cl_program m_program; + cl_kernel m_kernel; + + static cl_event m_last_event; + cl_int m_num_events; + std::vector m_events; + + char * m_kernel_file; + + cl_device_type m_device_type; + + /* + Name: getPlatforms + Info: get all avaialble platforms and save in m_platform_ids, save number of platforms + Return: success or error code + */ + int ocl_getPlatforms(); + + + /* + Name: getDevice + Info: get first avaialble devices and save device id and platform id for this device, device name: (-gpu, -mic, -cpu) + ReturnL success or error code + */ + int ocl_getDevice(const char* device_name); + + /* + Name getDeviceType + Info: get device type from device name (-gpu, -cpu, -mic) + Return: success or error code + */ + int ocl_getDeviceType(const char* device_name, cl_device_type &device_type); + + /* + Name: createContext + Info: create context with specified device + Return: success or error code + */ + int ocl_createContext(); + + /* + Name: buildProgram + Info: build program from specified kernel file + Return: success or error code + */ + int ocl_buildProgram(const char* kernel_file); + + /** Compile program from kernel source string + * + */ + int ocl_compileProgram(const char* kernel_source, const char* opts = NULL); + +protected: + + int defaultRndSet; + cl_mem defaultRndState; + + +public: + + /* + constructor + */ + OpenCLBase(); + + /* + destructor + */ + ~OpenCLBase(); + + /* + Create RND states + Return: success or error 
code + */ + int ocl_createRndStates(int size); + + /* + Destroy rnd states + Return: success or error code + */ + int ocl_deleteRndStates(); + + + /* + Name: getAllDevices + Info: get all available devices + ReturnL success or error code + */ + int ocl_getAllDevices(); + + /** Get the OpenCL device count for the set type of device + * + */ + int ocl_getDeviceCount(int &ndev); + + /** Get the name of the device used + */ + int ocl_getDeviceName(std::string &device_name); + + /** Set the device to use for OpenCL kernels. + * device id to use is passed as integer. + */ + int ocl_setDevice(int device); + + /** Get a list of all the unique devices of the same type that can run OpenCL kernels + * Used when GPUs of different types might be pressent on the system. + */ + int ocl_getUniqueDevices(std::vector &devices); + + /* + Name: setUp + Info: set up opencl resources + Return: success or error code + */ + int ocl_setUp(const char* device_name); + + /* + Name: loadKernel + Info: load and compile opencl kernel file if it has changed + Return: success or error code + */ + int ocl_loadKernel(const char* kernel_file); + + + /** Build program from kernel source. + * Builds a program from source code provided in kernel_source. 
+ * If compilation fails will return DKS_ERROR + */ + int ocl_loadKernelFromSource(const char* kernel_source, const char* opts = NULL); + + /* + Name: allocateMemory + Info: allocate memory on device + Return: return pointer to memory + */ + cl_mem ocl_allocateMemory(size_t size, int &ierr); + + /* + Name: allocateMemory + Info: allocate memory on device + Return: return pointer to memory + */ + cl_mem ocl_allocateMemory(size_t size, int type, int &ierr); + + /* + Name: writeData + Info: write data to device memory (needs ptr to mem object) + Return: success or error code + */ + int ocl_writeData(cl_mem mem_ptr, const void * in_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); + + /* + Name: copyData + Info: copy data from one buffer on the device to another + Return: success or error code + */ + int ocl_copyData(cl_mem src_ptr, cl_mem dst_ptr, size_t size); + + /* + Name: createKernel + Info: create kernel from program + Return: success or error code + */ + int ocl_createKernel(const char* kernel_name); + + /* + Name: setKernelArgs + Info: set opencl kernel arguments + Return: success or error code + */ + int ocl_setKernelArg(int idx, size_t size, const void *arg_value); + + /* + Name: executeKernel + Info: execute selected kernel (needs kernel parameters) + Return: success or error code + */ + int ocl_executeKernel(cl_uint, const size_t *work_items, const size_t *work_grou_size = NULL); + + /* + Name: readData + Info: read data from device (needs pointer to mem object) + Return: success or error code + */ + int ocl_readData(cl_mem mem_ptr, void * out_data, size_t size, size_t offset = 0, int blocking = CL_TRUE); + + /* + Name: freeMemory + Info: free device memory (needs ptr to mem object) + Return: success or error code + */ + int ocl_freeMemory(cl_mem mem_ptr); + + /* + Name: cleanUp + Info: free opencl resources + Return: success or error code + */ + int ocl_cleanUp(); + + /* + Name: deviceInfo + Info: print device info (mostly for debugging 
purposes) + Return: success or error code + */ + int ocl_deviceInfo(bool verbose = true); + + /* Check OpenCL kernel. + * Query device and check if it can run the kernel with required parameters + */ + int ocl_checkKernel(const char* kernel_name, int work_group_size, + bool double_precision, int &threadsPerBlock); + + /* + Name: clearEvents + Info: clear saved events (for debuging purposes) + Return: nothing + */ + void ocl_clearEvents(); + + /* + Name: eventInfo + Info: print information about kernel timings (for debuging purposes) + Return: nothing + */ + void ocl_eventInfo(); + + /* + Return current command queue + */ + cl_command_queue ocl_getQueue() { return m_command_queue; } +}; + +#endif + + + + + + + + diff --git a/src/OpenCL/OpenCLChiSquare.cpp b/src/OpenCL/OpenCLChiSquare.cpp new file mode 100644 index 0000000..7de4a62 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquare.cpp @@ -0,0 +1,157 @@ +#include "OpenCLChiSquare.h" + +double OpenCLChiSquare::ocl_sum(cl_mem data, int length) { + + + int ierr; + //calc number of thread sper workgroup and nr of work groups + size_t work_size_sum = 128; + size_t work_items = (size_t)length; + if (length % work_size_sum > 0) + work_items = (length / work_size_sum + 1) * work_size_sum; + + int work_groups = length / work_size_sum + 1; + + //create tmp array for partial sums + cl_mem tmp_ptr; + + double *partial_sums = new double[work_groups]; + tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr); + + //execute sum kernel + m_oclbase->ocl_createKernel("parallelReductionSum"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr); + m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum); + + //read partial sums and free temp mempry + m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups); + 
m_oclbase->ocl_freeMemory(tmp_ptr); + + //sumup partial sums on the host + double result = 0; + for (int i = 0; i < work_groups; i++) + result += partial_sums[i]; + + delete[] partial_sums; + + return result; + +} + +int OpenCLChiSquare::ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelPHistoTFFcn"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(3, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fRebin); + m_oclbase->ocl_setKernelArg(5, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, sensors*length); + + return DKS_SUCCESS; +} + +int OpenCLChiSquare::ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem t0 = 
(cl_mem)mem_t0; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelSingleGaussTF"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin); + m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, length); + + return DKS_SUCCESS; + +} + + +int OpenCLChiSquare::ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result) +{ + + //set number of work items and work group sizes for kernel execution + size_t work_size = 128; + size_t work_items = (size_t)length * sensors; + if (length % work_size > 0) + work_items = (length / work_size + 1) * work_size; + + cl_mem data = (cl_mem)mem_data; + cl_mem t0 = (cl_mem)mem_t0; + cl_mem par = (cl_mem)mem_par; + cl_mem chi = (cl_mem)mem_result; + + //load and execute PHistotFFcn kernel + m_oclbase->ocl_createKernel("kernelDoubleLorentzTF"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &t0); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &par); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &chi); + m_oclbase->ocl_setKernelArg(4, sizeof(double), &fTimeResolution); + m_oclbase->ocl_setKernelArg(5, sizeof(double), &fRebin); 
+ m_oclbase->ocl_setKernelArg(6, sizeof(double), &fGoodBinOffset); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &sensors); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(10, sizeof(double)*numpar, NULL); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + result = ocl_sum(chi, length); + + return DKS_SUCCESS; + +} + + + diff --git a/src/OpenCL/OpenCLChiSquare.h b/src/OpenCL/OpenCLChiSquare.h new file mode 100644 index 0000000..bbc5da6 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquare.h @@ -0,0 +1,53 @@ +#ifndef H_OPENCL_CHI_SQUARE +#define H_OPENCL_CHI_SQUARE + +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "OpenCLBase.h" + +#define DKS_SUCCESS 0 +#define DKS_ERROR 1 + + +class OpenCLChiSquare { + +private: + + OpenCLBase *m_oclbase; + + double ocl_sum(cl_mem data, int length); + +public: + + OpenCLChiSquare(OpenCLBase *base) { + m_oclbase = base; + } + + ~OpenCLChiSquare() { } + + int ocl_PHistoTFFcn(void *mem_data, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, + int sensors, int length, int numpar, + double &result); + + int ocl_singleGaussTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result); + + int ocl_doubleLorentzTF(void *mem_data, void *mem_t0, void *mem_par, void *mem_result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int sensors, int length, int numpar, + double &result); + + + +}; + +#endif diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp new file mode 100644 index 0000000..f8e21a6 --- /dev/null +++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp @@ -0,0 +1,316 @@ +#include "OpenCLChiSquareRuntime.h" + +OpenCLChiSquareRuntime::OpenCLChiSquareRuntime(OpenCLBase *base) { + + blockSize_m = BLOCK_SIZE; + 
numBlocks_m = -1; + + m_oclbase = base; + + N0_m = 1.0; + tau_m = 1.0; + bkg_m = 1.0; + alpha_m = 1.0; + beta_m = 1.0; + + ptx_m = NULL; + + initDone_m = false; + +} + +//free temporary resources +OpenCLChiSquareRuntime::~OpenCLChiSquareRuntime() { + delete[] ptx_m; + freeChiSquare(); +} + +//build program string: content of the kernel file followed by the user function +//wrapped between openclFunctHeader and openclFunctFooter +//returns an empty string if the kernel file can not be read +std::string OpenCLChiSquareRuntime::buildProgram(std::string function) { + + //get kernel source file name + std::string kernel_file(OPENCL_KERNELS); + kernel_file += "OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl"; + + //read kernels from file, do not touch the stream if it could not be opened + FILE *fp = fopen(kernel_file.c_str(), "rb"); + if (!fp) { + DEBUG_MSG("Can't open kernel file" << kernel_file); + return std::string(); + } + + //get file size + fseek(fp, 0, SEEK_END); + long fsize = ftell(fp); + if (fsize < 0) { + fclose(fp); + return std::string(); + } + + //read file content in kernel string, keep only the bytes that were actually read + std::string kernel_string((size_t)fsize, '\0'); + rewind(fp); + size_t nread = 0; + if (fsize > 0) + nread = fread(&kernel_string[0], 1, (size_t)fsize, fp); + fclose(fp); + kernel_string.resize(nread); + + return kernel_string + openclFunctHeader + "return " + function + ";" + openclFunctFooter; + +} + +//compile the kernel file together with the user function, mlh adds the -DMLH compile flag +int OpenCLChiSquareRuntime::compileProgram(std::string function, bool mlh) { + + //build program string, an empty string means the kernel file could not be read + std::string openclProg = buildProgram(function); + if (openclProg.empty()) + return DKS_ERROR; + + //compile flags + std::string opts(""); + if (mlh) + opts = "-DMLH"; + + //compile opencl program from source string + int ierr = m_oclbase->ocl_loadKernelFromSource(openclProg.c_str(), opts.c_str()); + + return ierr; +} + +double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) { + + + int ierr; + //calc number of thread sper workgroup and nr of work groups + size_t work_size_sum = 128; + + /* + size_t work_items = (size_t)length; + if (length % work_size_sum > 0) + work_items = (length / work_size_sum + 1) * work_size_sum; + int work_groups = length / work_size_sum + 1; + */ + + size_t work_items = 80 * work_size_sum; + int
work_groups = 80; + + //create tmp array for partial sums + cl_mem tmp_ptr; + + double *partial_sums = new double[work_groups]; + tmp_ptr = m_oclbase->ocl_allocateMemory(work_groups * sizeof(double), ierr); + + //execute sum kernel + //ocl_createKernel("parallelReductionSum"); + m_oclbase->ocl_createKernel("parallelReductionTwoPhase"); + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &tmp_ptr); + m_oclbase->ocl_setKernelArg(2, work_size_sum*sizeof(double), NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &length); + m_oclbase->ocl_executeKernel(1, &work_items, &work_size_sum); + + //read partial sums and free temp mempry + m_oclbase->ocl_readData(tmp_ptr, partial_sums, sizeof(double)*work_groups); + m_oclbase->ocl_freeMemory(tmp_ptr); + + //sumup partial sums on the host + double result = 0; + for (int i = 0; i < work_groups; i++) + result += partial_sums[i]; + + delete[] partial_sums; + + return result; + +} + +int OpenCLChiSquareRuntime::launchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, double &result) +{ + + int ierr; + + //convert memory to cl_mem + cl_mem cl_mem_data = (cl_mem)mem_data; + cl_mem cl_mem_err = (cl_mem)mem_err; + + cl_mem cl_param = (cl_mem)mem_param_m; + cl_mem cl_chisq = (cl_mem)mem_chisq_m; + cl_mem cl_map = (cl_mem)mem_map_m; + cl_mem cl_func = (cl_mem)mem_func_m; + + //set work item size + size_t work_items; + size_t work_size = (size_t)blockSize_m; + if (numBlocks_m < 0) + work_items = (size_t)length; + else + work_items = (size_t)numBlocks_m * (size_t)blockSize_m; + + if (work_items % work_size > 0) + work_items = (work_items / work_size + 1) * work_size; + + if (fitType == FITTYPE_SINGLE_HISTO) { + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelChiSquareSingleHisto"); + + if (ierr != DKS_SUCCESS) + return ierr; + + //set kernel args + m_oclbase->ocl_setKernelArg(0, 
sizeof(cl_mem), &cl_mem_data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq); + m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map); + m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap); + m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart); + m_oclbase->ocl_setKernelArg(11, sizeof(double), &timeStep); + m_oclbase->ocl_setKernelArg(12, sizeof(double), &tau_m); + m_oclbase->ocl_setKernelArg(13, sizeof(double), &N0_m); + m_oclbase->ocl_setKernelArg(14, sizeof(double), &bkg_m); + m_oclbase->ocl_setKernelArg(15, sizeof(double)*numpar, NULL); + m_oclbase->ocl_setKernelArg(16, sizeof(double)*numfunc, NULL); + m_oclbase->ocl_setKernelArg(17, sizeof(int)*nummap, NULL); + + if (ierr != DKS_SUCCESS) + return ierr; + } else if (fitType == FITTYPE_ASYMMETRY) { + //create kernel + ierr = m_oclbase->ocl_createKernel("kernelChiSquareAsymmetry"); + + if (ierr != DKS_SUCCESS) + return ierr; + + //set kernel args + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &cl_mem_data); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &cl_mem_err); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_mem), &cl_param); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_mem), &cl_chisq); + m_oclbase->ocl_setKernelArg(4, sizeof(cl_mem), &cl_map); + m_oclbase->ocl_setKernelArg(5, sizeof(cl_mem), &cl_func); + m_oclbase->ocl_setKernelArg(6, sizeof(int), &length); + m_oclbase->ocl_setKernelArg(7, sizeof(int), &numpar); + m_oclbase->ocl_setKernelArg(8, sizeof(int), &numfunc); + m_oclbase->ocl_setKernelArg(9, sizeof(int), &nummap); + m_oclbase->ocl_setKernelArg(10, sizeof(double), &timeStart); + m_oclbase->ocl_setKernelArg(11, sizeof(double), 
&timeStep); + m_oclbase->ocl_setKernelArg(12, sizeof(double), &alpha_m); + m_oclbase->ocl_setKernelArg(13, sizeof(double), &beta_m); + m_oclbase->ocl_setKernelArg(14, sizeof(double)*numpar, NULL); + m_oclbase->ocl_setKernelArg(15, sizeof(double)*numfunc, NULL); + m_oclbase->ocl_setKernelArg(16, sizeof(int)*nummap, NULL); + + if (ierr != DKS_SUCCESS) + return ierr; + } else if (fitType == FITTYPE_MU_MINUS) { + // not yet implemented + } else { + return DKS_ERROR; + } + + //execute kernel + ierr = m_oclbase->ocl_executeKernel(1, &work_items, &work_size); + + if (ierr != DKS_SUCCESS) + return ierr; + + //execute sum kernel + result = calculateSum((cl_mem)mem_chisq_m, length); + + return ierr; + +} + +int OpenCLChiSquareRuntime::writeParams(const double *params, int numparams) { + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_param_m, params, sizeof(double)*numparams); + return ierr; +} + + +int OpenCLChiSquareRuntime::writeFunc(const double *func, int numfunc) { + if (numfunc == 0) + return DKS_SUCCESS; + + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_func_m, func, sizeof(double)*numfunc); + return ierr; +} + +int OpenCLChiSquareRuntime::writeMap(const int *map, int nummap) { + if (nummap == 0) + return DKS_SUCCESS; + + int ierr = m_oclbase->ocl_writeData( (cl_mem)mem_map_m, map, sizeof(int)*nummap); + return ierr; +} + +int OpenCLChiSquareRuntime::initChiSquare(int size_data, int size_param, + int size_func, int size_map) +{ + + int ierr = DKS_ERROR; + if (initDone_m) { + DEBUG_MSG("Reinitializing ChiSquare"); + freeChiSquare(); + } + + //allocate temporary memory + mem_chisq_m = m_oclbase->ocl_allocateMemory(size_data*sizeof(double), ierr); + mem_param_m = m_oclbase->ocl_allocateMemory(size_param*sizeof(double), ierr); + if (size_func == 0) + size_func = 1; + mem_func_m = m_oclbase->ocl_allocateMemory(size_func*sizeof(double), ierr); + if (size_map == 0) + size_map = 1; + mem_map_m = m_oclbase->ocl_allocateMemory(size_map*sizeof(int), ierr); + initDone_m = 
true; + + return ierr; + +} + +int OpenCLChiSquareRuntime::freeChiSquare() { + + int ierr = DKS_ERROR; + if (initDone_m) { + + //free memory + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_chisq_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_param_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_func_m); + ierr = m_oclbase->ocl_freeMemory((cl_mem)mem_map_m); + + initDone_m = false; + } + + return ierr; + +} + +int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBlock) { + + int ierr; + char kernel[64]; + + switch (fitType) { + case FITTYPE_SINGLE_HISTO: + strncpy(kernel, "kernelChiSquareSingleHisto", sizeof(kernel)); + break; + case FITTYPE_ASYMMETRY: + strncpy(kernel, "kernelChiSquareAsymmetry", sizeof(kernel)); + break; + case FITTYPE_MU_MINUS: + // not yet implemented + default: + return DKS_ERROR; + } + + ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); + + return ierr; + +} + diff --git a/src/OpenCL/OpenCLChiSquareRuntime.h b/src/OpenCL/OpenCLChiSquareRuntime.h new file mode 100644 index 0000000..90b5c7c --- /dev/null +++ b/src/OpenCL/OpenCLChiSquareRuntime.h @@ -0,0 +1,103 @@ +#ifndef H_OPENCL_CHISQUARE_RUNTIME +#define H_OPENCL_CHISQUARE_RUNTIME + +#include +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "../Algorithms/ChiSquareRuntime.h" +#include "OpenCLBase.h" + +const std::string openclFunctHeader = "double fTheory(double t, __local double *p, __local double *f, __local int *m) {"; + +const std::string openclFunctFooter = "}\n"; + +class OpenCLChiSquareRuntime : public ChiSquareRuntime { + +private: + + OpenCLBase *m_oclbase; + + /** Private function to add user defined function to kernel string + * + */ + std::string buildProgram(std::string function); + + double calculateSum(cl_mem data, int length); + +public: + + /** Constructor wiht openclbase argument + * + */ + OpenCLChiSquareRuntime(OpenCLBase *base); + + /** Default constructor + * + */ + OpenCLChiSquareRuntime(); 
+ + /** Default destructor + * + */ + ~OpenCLChiSquareRuntime(); + + /** Compile program and save ptx. + * Add function string to the calcFunction kernel and compile the program + * Function must be valid C math expression. Parameters can be addressed in + * a form par[map[idx]] + */ + int compileProgram(std::string function, bool mlh = false); + + /** Launch selected kernel + * Launched the selected kernel from the compiled code. + * Result is put in &result variable + */ + int launchChiSquare(int fitType, + void *mem_data, void *mem_err, int length, + int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double &result); + + /** Write params to device. + * Write params from double array to mem_param_m memory on the device. + */ + int writeParams(const double *params, int numparams); + + /** Write functions to device. + * Write function values from double array to mem_func_m memory on the device. + */ + int writeFunc(const double *func, int numfunc); + + /** Write maps to device. + * Write map values from int array to mem_map_m memory on the device. + */ + int writeMap(const int *map, int nummap); + + /** Allocate temporary memory needed for chi square. + * Initializes the necessary temporary memory for the chi square calculations. Size_data needs to + * the maximum number of elements in any datasets that will be used for calculations. Size_param, + * size_func and size_map are the maximum number of parameters, functions and maps used in + * calculations. + */ + int initChiSquare(int size_data, int size_param, int size_func, int size_map); + + /** Free temporary memory allocated for chi square. + * Frees the chisq temporary memory and memory for params, functions and maps + */ + int freeChiSquare(); + + /** Check MuSR kernels for necessary resources. 
+ * Query device properties to get if sufficient resources are + * available to run the kernels + */ + int checkChiSquareKernels(int fitType, int &threadsPerBlock); + +}; + +#endif diff --git a/src/OpenCL/OpenCLCollimatorPhysics.cpp b/src/OpenCL/OpenCLCollimatorPhysics.cpp new file mode 100644 index 0000000..46d8b24 --- /dev/null +++ b/src/OpenCL/OpenCLCollimatorPhysics.cpp @@ -0,0 +1,107 @@ +#include "OpenCLCollimatorPhysics.h" + +#define M_P 0.93827231e+00 +#define C 299792458.0 +#define PI 3.14159265358979323846 +#define AVO 6.022e23 +#define R_E 2.81794092e-15 +#define eM_E 0.51099906e-03 +#define Z_P 1 +#define K 4.0*PI*AVO*R_E*R_E*eM_E*1e7 + +#define POSITION 0 +#define ZSIZE 1 +#define RHO_M 2 +#define Z_M 3 +#define A_M 4 +#define A2_C 5 +#define A3_C 6 +#define A4_C 7 +#define A5_C 8 +#define X0_M 9 +#define I_M 10 +#define DT_M 11 + +#define BLOCK_SIZE 128 +#define NUMPAR 12 + +/* +TODO: +1. test OpenCL kernel + - is it launched for all particles + - does the random number generatror function properly + - is particle structure updated correctly in memory +2. 
boost.compute sort for user defined structure crashes
+*/
+// NOTE: the OpenCL implementation is currently disabled. The code between the
+// comment markers inside the body is dead reference code; at runtime the
+// function only prints a notice and returns DKS_ERROR.
+int OpenCLCollimatorPhysics::CollimatorPhysics(void *mem_ptr, void *par_ptr,
+                                               int numparticles)
+{
+  /*
+  //set number of total threads, and number threads per block
+  size_t threads = 1;
+  size_t blocks = numparticles;
+
+  //cast void ptrs to cl_mem ptrs
+  cl_mem data = (cl_mem)mem_ptr;
+  cl_mem params = (cl_mem)par_ptr;
+
+  int numparams = 19;
+
+  //set kernel to execute and kernel arguments
+  ocl_createKernel("kernelCollimatorPhysics");
+  ocl_setKernelArg(0, sizeof(cl_mem), &data);
+  ocl_setKernelArg(1, sizeof(cl_mem), &params);
+  ocl_setKernelArg(2, sizeof(cl_mem), &defaultRndState);
+  ocl_setKernelArg(3, sizeof(int), &numparticles);
+  ocl_setKernelArg(4, sizeof(double)*numparams, NULL);
+
+  std::cout << "blocks: " << blocks << ", threads: " << threads << std::endl;
+
+  //execute kernel on device
+  ocl_executeKernel(1, &blocks, &threads);
+
+  //create functions for comparing two particles and counting particles with labels < 0
+
+  BOOST_COMPUTE_FUNCTION(bool, sort_by_label, (PART_OPENCL a, PART_OPENCL b),
+  {
+    return a.label < b.label;
+  });
+
+
+
+  BOOST_COMPUTE_FUNCTION(bool, count_by_label, (PART_OPENCL a),
+  {
+    return a.label < 0;
+  });
+
+
+  //wrap cl_mem memory object in Boost.Compute buffer
+  std::cout << "wrap buffer" << std::endl;
+  boost::compute::buffer buf(data);
+
+  //count particles with labels < 0
+  std::cout << "wrap command queue" << std::endl;
+  boost::compute::command_queue queue(ocl_getQueue());
+
+  std::cout << "count if" << std::endl;
+
+
+  numaddback = boost::compute::count_if(boost::compute::make_buffer_iterator(buf,0),
+                                        boost::compute::make_buffer_iterator(buf,numparticles),
+                                        count_by_label, queue);
+
+  //sort particles with dead and leaving particles at the end using boost::compute
+  numaddback = 0;
+  if (numaddback > 0) {
+    std::cout << "sort" << std::endl;
+    boost::compute::sort(boost::compute::make_buffer_iterator(buf,0),
+                         boost::compute::make_buffer_iterator(buf, 
numparticles),
+                         sort_by_label, queue);
+  }
+
+
+  return DKS_SUCCESS;
+*/
+  //everything above is commented out: report that the OpenCL path is unavailable
+  std::cout << "OpenCL implementation disabled" << std::endl;
+  return DKS_ERROR;
+}
diff --git a/src/OpenCL/OpenCLCollimatorPhysics.h b/src/OpenCL/OpenCLCollimatorPhysics.h
new file mode 100644
index 0000000..7b532ff
--- /dev/null
+++ b/src/OpenCL/OpenCLCollimatorPhysics.h
@@ -0,0 +1,85 @@
+#ifndef H_OPENCL_DEGRADER
+#define H_OPENCL_DEGRADER
+
+// NOTE(review): the two #include directives below have lost their header names
+// (the angle-bracket part is missing) - restore them from the repository.
+#include
+#include
+
+#include "../Algorithms/CollimatorPhysics.h"
+#include "OpenCLBase.h"
+
+/*
+#include "boost/compute/types/struct.hpp"
+#include "boost/compute/type_traits/type_name.hpp"
+#include "boost/compute/algorithm/count_if.hpp"
+#include "boost/compute/algorithm/sort.hpp"
+#include "boost/compute/container/vector.hpp"
+#include "boost/compute/iterator/buffer_iterator.hpp"
+#include "boost/compute/core.hpp"
+*/
+
+/* three double components x, y, z */
+typedef struct {
+  double x;
+  double y;
+  double z;
+} Double3;
+
+/* host-side particle record: label, local id and two Double3 vectors */
+typedef struct {
+  int label;
+  unsigned localID;
+
+  Double3 Rincol;
+  Double3 Pincol;
+} PART_OPENCL;
+
+//adapt struct PART for use in Boost.Compute
+//BOOST_COMPUTE_ADAPT_STRUCT(Double3, Double3, (x, y, z));
+//BOOST_COMPUTE_ADAPT_STRUCT(PART_OPENCL, PART_OPENCL, (label, localID, Rincol, Pincol));
+
+/* OpenCL backend of DKSCollimatorPhysics.
+   Every member defined inline in this header is a stub that returns DKS_ERROR. */
+class OpenCLCollimatorPhysics : public DKSCollimatorPhysics {
+
+private:
+  OpenCLBase *m_oclbase; // stored as passed to the constructor; never deleted by this class
+
+public:
+
+  /* constructor - stores the OpenCLBase pointer */
+  OpenCLCollimatorPhysics(OpenCLBase *base) {
+    m_oclbase = base;
+  }
+
+  /* destructor - does nothing */
+  ~OpenCLCollimatorPhysics() {
+  }
+
+  /* execute degrader code on device (defined in OpenCLCollimatorPhysics.cpp) */
+  int CollimatorPhysics(void *mem_ptr, void *par_ptr, int numparticles);
+
+  /* stub: always returns DKS_ERROR */
+  int CollimatorPhysicsSoA(void *label_ptr, void *localID_ptr,
+                           void *rx_ptr, void *ry_ptr, void *rz_ptr,
+                           void *px_ptr, void *py_ptr, void *pz_ptr,
+                           void *par_ptr, int numparticles) { return DKS_ERROR; }
+
+  /* stub: always returns DKS_ERROR, numaddback is not written */
+  int CollimatorPhysicsSort(void *mem_ptr, int numparticles, int &numaddback) { return DKS_ERROR; }
+
+  /* stub: always returns DKS_ERROR, numaddback is not written */
+  int CollimatorPhysicsSortSoA(void *label_ptr, void *localID_ptr,
+                               void *rx_ptr, 
void *ry_ptr, void *rz_ptr, + void *px_ptr, void *py_ptr, void *pz_ptr, + void *par_ptr, int numparticles, int &numaddback) { return DKS_ERROR; } + + int ParallelTTrackerPush(void *r_ptr, void *p_ptr, int npart, void *dt_ptr, + double dt, double c, bool usedt = false, int streamId = -1) + { + return DKS_ERROR; + } + + int ParallelTTrackerPushTransform(void *x_ptr, void *p_ptr, void *lastSec_ptr, + void *orient_ptr, int npart, int nsec, void *dt_ptr, + double dt, double c, bool usedt = false, + int streamId = -1) + { + return DKS_ERROR; + } + +}; + +#endif diff --git a/src/OpenCL/OpenCLFFT.cpp b/src/OpenCL/OpenCLFFT.cpp new file mode 100644 index 0000000..5cbe9e9 --- /dev/null +++ b/src/OpenCL/OpenCLFFT.cpp @@ -0,0 +1,303 @@ +#include "OpenCLFFT.h" + +//=====================================// +//==========Private functions==========// +//=====================================// + +/* + call fft kernels to execute FFT of the given domain, data - devevice memory ptr, cdim - current dim to transform, + ndim - totla number of dimmensions, N - size of dimension +*/ +int OpenCLFFT::ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward) { + + //set the number of work items in each dimension + size_t work_items[3]; + work_items[0] = N; + work_items[1] = (ndim > 1) ? N : 1; + work_items[2] = (ndim > 1) ? N : 1; + work_items[cdim] = N / 2; + + int f = (forward) ? 
1 : 0;
+
+  //create kernel and set kernel arguments
+  if (m_oclbase->ocl_createKernel("FFT3D") != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+  if (m_oclbase->ocl_setKernelArg(3, sizeof(int), &f) != OCL_SUCCESS)
+    return OCL_ERROR;
+
+
+  //execute kernel once per step; argument 1 (step) doubles each pass: 1, 2, 4, ... < N
+  for (int step = 1; step < N; step <<= 1) {
+    if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &step) != OCL_SUCCESS)
+      return OCL_ERROR;
+
+    if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS)
+      return OCL_ERROR;
+  }
+
+  return OCL_SUCCESS;
+}
+
+/*
+  Run the "BitReverseSort3D" kernel along one dimension of the data.
+  data - device memory ptr, cdim - current dim to transform,
+  ndim - total number of dimensions, N - size of dimension
+  Returns OCL_SUCCESS, or OCL_ERROR as soon as any OpenCL call fails.
+*/
+int OpenCLFFT::ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N) {
+  //set work item size
+  size_t work_items[3];
+  work_items[0] = N;
+  work_items[1] = (ndim > 1) ? N : 1;
+  work_items[2] = (ndim > 2) ? 
N : 1; + + //create kernel and set kernel arguments + if (m_oclbase->ocl_createKernel("BitReverseSort3D") != OCL_SUCCESS) + return OCL_ERROR; + + int bits = log2(N); + if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &data) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &bits) != OCL_SUCCESS) + return OCL_ERROR; + + if (m_oclbase->ocl_setKernelArg(2, sizeof(int), &cdim) != OCL_SUCCESS) + return OCL_ERROR; + + //execute kernel + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { + DEBUG_MSG("Error executing kernel"); + return OCL_ERROR; + } + + return OCL_SUCCESS; + +} + + +//=====================================// +//==========Public functions==========// +//=====================================// + +/* + call fft execution on device for every dimension +*/ +int OpenCLFFT::executeFFT(void *data, int ndim, int N[3], int streamId, bool forward) { + int ierr; + + cl_mem inout = (cl_mem)data; + int n = N[0]; + + for (int dim = 0; dim < ndim; dim++) { + ierr = ocl_callBitReverseKernel(inout, dim, ndim, n); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Error executing bit reverse"); + return OCL_ERROR; + } + + ierr = ocl_callFFTKernel(inout, dim, ndim, n, forward); + if (ierr != OCL_SUCCESS) { + DEBUG_MSG("Error executing fft reverse"); + return OCL_ERROR; + } + } + + return OCL_SUCCESS; +} + +/* + execute ifft +*/ +int OpenCLFFT::executeIFFT(void *data, int ndim, int N[3], int streamId) { + executeFFT(data, ndim, N, streamId, false); + return OCL_SUCCESS; +} + +/* + call kernel to normalize fft +*/ +int OpenCLFFT::normalizeFFT(void *data, int ndim, int N[3], int streamId) { + + cl_mem inout = (cl_mem)data; + + int n = N[0]; + + //set work item size + size_t work_items[3]; + work_items[0] = n; + work_items[1] = (ndim > 1) ? n : 1; + work_items[2] = (ndim > 2) ? 
n : 1; + + //create kernel + if (m_oclbase->ocl_createKernel("normalizeFFT") != OCL_SUCCESS) + return OCL_ERROR; + + //set kernel args + unsigned int elements = pow(n, ndim); + if (m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &inout) != OCL_SUCCESS) + return OCL_ERROR; + if (m_oclbase->ocl_setKernelArg(1, sizeof(int), &elements) != OCL_SUCCESS) + return OCL_ERROR; + + //execute kernel + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) { + DEBUG_MSG("Error executing kernel"); + return OCL_ERROR; + } + + return OCL_SUCCESS; +} + +int OpenCLFFT::ocl_executeFFTStockham(void* &src, int ndim, int N, bool forward) { + + int ierr; + int size = sizeof(cl_double2)*pow(N,ndim); + + cl_mem mem_tmp; + cl_mem mem_src = (cl_mem)src; + cl_mem mem_dst = (cl_mem)m_oclbase->ocl_allocateMemory(size, ierr); + + //set the number of work items in each dimension + size_t work_items[3]; + int p = 1; + int threads = N / 2; + int f = (forward) ? -1 : 1; + + //execute kernel + int n = (int)log2(N); + for (int i = 0; i < ndim; i++) { + + int dim = i+1; + p = 1; + work_items[0] = (dim == 1) ? N/2 : N; + work_items[1] = (dim == 2) ? N/2 : N; + work_items[2] = (dim == 3) ? 
N/2 : N; + + //transpose array if calculating dimension larger than 1 + //if (dim > 1) + // ocl_executeTranspose(mem_src, N, ndim, dim); + + //create kernel and set kernel arguments + if (m_oclbase->ocl_createKernel("fft3d_radix2") != OCL_SUCCESS) + return OCL_ERROR; + + for (int t = 1; t <= log2(N); t++) { + + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_dst); + m_oclbase->ocl_setKernelArg(2, sizeof(int), &p); + m_oclbase->ocl_setKernelArg(3, sizeof(int), &threads); + m_oclbase->ocl_setKernelArg(4, sizeof(int), &dim); + m_oclbase->ocl_setKernelArg(5, sizeof(int), &f); + + if (m_oclbase->ocl_executeKernel(ndim, work_items) != OCL_SUCCESS) + return OCL_ERROR; + + mem_tmp = mem_src; + mem_src = mem_dst; + mem_dst = mem_tmp; + + p = 2*p; + } + + //transpose array back if calculating dimension larger than 1 + //if (dim > 1) + // ocl_executeTranspose(mem_src, N, ndim, dim); + } + + if (ndim*n % 2 == 1) { + m_oclbase->ocl_copyData(mem_src, mem_dst, size); + mem_tmp = mem_src; + mem_src = mem_dst; + mem_dst = mem_tmp; + } + + m_oclbase->ocl_freeMemory(mem_dst); + + return OCL_SUCCESS; + +} + +int OpenCLFFT::ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward) { + + cl_mem mem_src = (cl_mem)src; + + size_t work_items[3] = { (size_t)N/2, (size_t)N, (size_t)N}; + size_t work_group_size[3] = {(size_t)N/2, 1, 1}; + + m_oclbase->ocl_createKernel("fft_batch3D"); + + m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src); + m_oclbase->ocl_setKernelArg(1, sizeof(cl_double2)*N, NULL); + m_oclbase->ocl_setKernelArg(2, sizeof(cl_double2)*N, NULL); + m_oclbase->ocl_setKernelArg(3, sizeof(cl_double2), NULL); + m_oclbase->ocl_setKernelArg(4, sizeof(int), &N); + + + for (int dim = 1; dim < ndim+1; dim++) { + m_oclbase->ocl_setKernelArg(5, sizeof(int), &dim); + m_oclbase->ocl_executeKernel(3, work_items, work_group_size); + } + + return OCL_SUCCESS; +} + +int OpenCLFFT::ocl_executeTranspose(void *src, int 
N[3], int ndim, int dim) {
+
+  //Launches the "transpose" kernel over an N[0] x N[1] range using a single
+  //work group of the same size. Nothing is done for ndim == 1. The dim
+  //argument is not used, return values of the OpenCL calls are not checked
+  //and the function always returns OCL_SUCCESS.
+
+  cl_mem mem_src = (cl_mem)src;
+
+  if (ndim == 1)
+    return OCL_SUCCESS;
+
+  size_t work_items[3];
+  work_items[0] = N[0];
+  work_items[1] = N[1];
+  work_items[2] = 1;
+
+  size_t work_group_size[3];
+  work_group_size[0] = N[0];
+  work_group_size[1] = N[1];
+  work_group_size[2] = 1;
+
+  //number of cl_double2 elements of local memory requested for argument 4
+  size_t local_size = work_group_size[0] * work_group_size[1] * work_group_size[2];
+
+  m_oclbase->ocl_createKernel("transpose");
+  //the same buffer is passed as kernel argument 0 and 1
+  m_oclbase->ocl_setKernelArg(0, sizeof(cl_mem), &mem_src);
+  m_oclbase->ocl_setKernelArg(1, sizeof(cl_mem), &mem_src);
+  m_oclbase->ocl_setKernelArg(2, sizeof(int), &N[0]);
+  m_oclbase->ocl_setKernelArg(3, sizeof(int), &N[1]);
+  m_oclbase->ocl_setKernelArg(4, sizeof(cl_double2)*local_size, NULL);
+  m_oclbase->ocl_executeKernel(ndim, work_items, work_group_size);
+
+  return OCL_SUCCESS;
+}
+
+/*
+void OpenCLFFT::printData3DN4(cl_double2* &data, int N) {
+
+  for (int j = 0; j < N; j++) {
+    for (int i = 0; i < N; i++) {
+      for (int k = 0; k < N; k++) {
+        double d = data[i*N*N + j*N + k].x;
+        if (d > 10e-5 || d < -10e-5)
+          std::cout << d << "\t";
+        else
+          std::cout << 0 << "\t";
+      }
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+}
+*/
+
+
+
+
diff --git a/src/OpenCL/OpenCLFFT.h b/src/OpenCL/OpenCLFFT.h
new file mode 100644
index 0000000..31816f9
--- /dev/null
+++ b/src/OpenCL/OpenCLFFT.h
@@ -0,0 +1,113 @@
+/*
+
+  Name: OpenCLFFT
+
+  Author: Uldis Locans
+
+  Info:Extend OpenCLBase class to implement fft and ifft functions using OpenCL
+
+  Data: 19.09.2014
+
+*/
+#ifndef H_OPENCL_FFT
+#define H_OPENCL_FFT
+
+
+// NOTE(review): the three #include directives below have lost their header
+// names (the angle-bracket part is missing) - restore them from the repository.
+#include
+#include
+#include
+
+#include "../Algorithms/FFT.h"
+#include "OpenCLBase.h"
+
+class OpenCLFFT : public DKSFFT {
+
+private:
+
+  OpenCLBase *m_oclbase; // set in the constructor; never deleted by this class
+
+  /*
+    Info: call fft kernels to execute FFT of the given domain,
+    data - device memory ptr, cdim - current dim to transform,
+    ndim - total number of dimensions, N - size of dimension
+    Return: success or error code
+  */
+  int 
ocl_callFFTKernel(cl_mem &data, int cdim, int ndim, int N, bool forward = true);
+
+  /*
+    Info: call the bit reverse sort kernel
+    data - device memory ptr, cdim - current dim to transform,
+    ndim - total number of dimensions, N - size of dimension
+    Return: success or error code
+  */
+  int ocl_callBitReverseKernel(cl_mem &data, int cdim, int ndim, int N);
+
+public:
+
+  /* constructor - stores the OpenCLBase pointer */
+  OpenCLFFT(OpenCLBase *base) {
+    m_oclbase = base;
+  }
+
+  /* destructor - currently does nothing*/
+  ~OpenCLFFT() { }
+
+  /*
+    Info: execute forward fft function with data set on device
+    Return: success or error code
+  */
+  //int ocl_executeFFT(cl_mem &data, int ndim, int N, bool forward = true);
+  int executeFFT(void *data, int ndim, int N[3], int streamId = -1, bool forward = true);
+
+  /*
+    Info: execute inverse fft with data set on device
+    Return: success or error code
+  */
+  //int ocl_executeIFFT(cl_mem &data, int ndim, int N);
+  int executeIFFT(void *data, int ndim, int N[3], int streamId = -1);
+
+  /*
+    Info: execute normalize kernel
+    Return: success or error code
+  */
+  //int ocl_normalizeFFT(cl_mem &data, int ndim, int N);
+  int normalizeFFT(void *data, int ndim, int N[3], int streamId = -1);
+
+  /*
+    Info: set FFT size - no-op in this backend, the arguments are ignored
+    Return: always DKS_SUCCESS
+  */
+  int setupFFT(int ndim, int N[3]) { return DKS_SUCCESS; }
+
+  /* no-op, always returns DKS_SUCCESS */
+  int setupFFTRC(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+
+  /* no-op, always returns DKS_SUCCESS */
+  int setupFFTCR(int ndim, int N[3], double scale = 1.0) { return DKS_SUCCESS; }
+
+  /* no-op, always returns DKS_SUCCESS */
+  int destroyFFT() { return DKS_SUCCESS; }
+
+  /* real-to-complex FFT: stub, always returns DKS_ERROR */
+  int executeRCFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
+                   int streamId = -1)
+  {
+    return DKS_ERROR;
+  }
+  /* complex-to-real FFT: stub, always returns DKS_ERROR */
+  int executeCRFFT(void * real_ptr, void * comp_ptr, int ndim, int N[3],
+                   int streamId = -1)
+  {
+    return DKS_ERROR;
+  }
+  /* normalization for the complex-to-real FFT: stub, always returns DKS_ERROR */
+  int normalizeCRFFT(void *real_ptr, int ndim, int N[3], int streamId = -1)
+  {
+    return DKS_ERROR;
+  }
+
+  int ocl_executeFFTStockham(void* &src, int ndim, int N, 
bool forward = true); + + int ocl_executeFFTStockham2(void* &src, int ndim, int N, bool forward = true); + + int ocl_executeTranspose(void *src, int N[3], int ndim, int dim); + + //void printData3DN4(cl_double2* &data, int N); + +}; + +#endif diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl new file mode 100644 index 0000000..f08f268 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquare.cl @@ -0,0 +1,175 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define TAU 2.197019 + + +__kernel void parallelReductionSum(__global double *data_in, __global double *data_out, + __local double *data_local, int size) +{ + + //get local and global ids, and work group size + int local_id = get_local_id(0); + int global_id = get_global_id(0); + int group_size = get_local_size(0); + + //copy from global memory to local, if global id out of bounds fill with 0s + if (global_id < size) + data_local[local_id] = data_in[global_id]; + else + data_local[local_id] = 0; + + //loop trough reduction steps + for (uint stride = group_size / 2; stride > 0; stride /= 2) { + + //synch all work items in work group + barrier(CLK_LOCAL_MEM_FENCE); + + //create partials summs each step + if (local_id < stride) + data_local[local_id] += data_local[local_id + stride]; + } + + //local thread 0 writes final partial sum to global memory + if (local_id == 0) + data_out[get_group_id(0)] = data_local[0]; + +} + +__kernel void kernelPHistoTFFcn(__global double *data, __global double *par, __global double *chisq, + double fTimeResolution, double fRebin, + int length, int sensors, int numpar, + __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load parameters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + + double dt0 = 
fTimeResolution * 0.5 * (fRebin - 1); + double time = dt0 + fTimeResolution * fRebin * j; + double w = p[0]*0.08516155035269027; + double tt = exp(-time/TAU); + double pp = exp(-0.5 * pow(p[1]*time, 2.0)); + double wt = w * time; + + + int idx; + double ldata, theo; + for (int i = 0; i < sensors; i++) { + idx = i * length + j; + ldata = data[idx]; + + theo = p[2+i*4]*tt*(1.0+p[3+i*4]*pp*cos(wt+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; + + if (ldata != 0.0) + chisq[idx] = (theo - ldata) * (theo - ldata) / ldata; + else + chisq[idx] = theo * theo; + } + } +} + +__kernel void kernelSingleGaussTF(__global double *data, __global unsigned int *t0, + __global double *par, __global double *result, + double fTimeResolution, double fRebin, double fGoodBinOffset, + int length, int sensors, int numpar, __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load para,eters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + double dt0 = fTimeResolution*0.5*(fRebin - 1); + double w1 = par[0]*0.08516155035269027; + + int idx; + double ldata, lft0, theo, time; + for (int i = 0; i < sensors; i++) { + idx = i * length + j; + lft0 = t0[i]; + if (j >= lft0 + fGoodBinOffset/fRebin) { + ldata = data[idx]; + time = dt0 + fTimeResolution * fRebin* (j - lft0); + theo = p[2+i*4]*exp(-time/TAU)*(1.0+p[3+i*4]*exp(-0.5*pow(p[1]*time,2.0)) + *cos(w1*time+p[4+i*4]*1.74532925199432955e-2))+p[5+i*4]; + // 1.74532925199432955e-2 = pi/180 + + if ( (ldata > 1.0e-9) && (fabs(theo) > 1.0e-9) ) + result[idx] = (theo - ldata) + ldata*log(ldata/theo); + else + result[idx] = theo - ldata; + } else { + result[idx] = 0; + } + } + } + +} + +__kernel void kernelDoubleLorentzTF(__global double *data, __global unsigned int *t0, + __global double *par, __global double *result, + double fTimeResolution, 
double fRebin, double fGoodBinOffset, + int length, int sensors, int numpar, __local double *p) +{ + + //get work item id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + + //load para,eters from global to shared memory + if (tid < numpar) + p[tid] = par[tid]; + + //sync work items inside work group + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + + if (j < length) { + double dt0 = fTimeResolution*0.5*(fRebin - 1); + double w1 = p[0]*0.08516155035269027; + double w2 = p[2]*0.08516155035269027; + + int idx; + double ldata, lft0, theo, time; + for (int i = 0; i < sensors; i++) { + + idx = i * length + j; + lft0 = t0[i]; + if (j >= lft0 + fGoodBinOffset/fRebin) { + ldata = data[idx]; + time = dt0+fTimeResolution*fRebin*(j-lft0); + + theo = p[4+i*5]*exp(-time/TAU)* + (1.0+p[8+i*5]*p[5+i*5]*exp(-p[1]*time)* + cos(w1*time+p[6+i*5]*1.74532925199432955e-2)+ + (1.0-p[8+i*5])*p[5+i*5]*exp(-p[3]*time)* + cos(w2*time+p[6+i*5]*1.74532925199432955e-2))+p[7+i*5]; + // 1.74532925199432955e-2 = pi/180 + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + result[idx] = (theo - ldata) + ldata*log(ldata/theo); + else + result[idx] = theo - ldata; + } else { + result[idx] = 0; + } + } + } + +} + diff --git a/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl new file mode 100644 index 0000000..bdc9374 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLChiSquareRuntime.cl @@ -0,0 +1,344 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define PI 3.141592653589793115998 +#define TWO_PI 6.283185307179586231996 +#define DEG_TO_RAD 1.7453292519943295474371681e-2 + +/** From 'Numerical Recipes in C' by Press et.al, 1992. */ +//Returns the Bessel function J0(x) for any real x. +double bessj0(double x) { + double ax,z; + double xx,y,ans,ans1,ans2; //Accumulate polynomials in double precision. + + if ((ax=fabs(x)) < 8.0) { //Direct rational function fit. 
+    y=x*x;
+    ans1=57568490574.0+y*(-13362590354.0+y*(651619640.7+y*(-11214424.18+y*(77392.33017+y*(-184.9052456)))));
+    ans2=57568490411.0+y*(1029532985.0+y*(9494680.718+y*(59272.64853+y*(267.8532712+y*1.0))));
+    ans=ans1/ans2;
+  } else { //Fitting function (6.5.9).
+    z=8.0/ax;
+    y=z*z;
+    xx=ax-0.785398164; //0.785398164 ~ pi/4
+    ans1=1.0+y*(-0.1098628627e-2+y*(0.2734510407e-4+y*(-0.2073370639e-5+y*0.2093887211e-6)));
+    ans2 = -0.1562499995e-1+y*(0.1430488765e-3+y*(-0.6911147651e-5+y*(0.7621095161e-6-y*0.934945152e-7)));
+    ans=sqrt(0.636619772/ax)*(cos(xx)*ans1-z*sin(xx)*ans2); //0.636619772 ~ 2/pi
+  }
+  return ans;
+}
+
+/** Theory function declaration.
+ * Definition of the theory function will be built during runtime before compilation.
+ * It receives the time t and the local copies of the parameters (p), the
+ * function values (f) and the maps (m) that the kernels below fill in.
+ */
+double fTheory(double t, __local double *p, __local double *f, __local int *m);
+
+/** MusrFit predefined functions.
+ * Predefined functions from MusrFit that can be used to define the theory function.
+ * First parameter in all the functions is always time - t, rest of the parameters depend
+ * on the function. 
+ */
+// exp(-lamda*t)
+double se(double t, double lamda) {
+  return exp( -lamda*t );
+}
+
+// exp(-(lamda*t)^beta)
+double ge(double t, double lamda, double beta) {
+  return exp( -pow(lamda*t, beta) );
+}
+
+// exp(-(sigma*t)^2 / 2)
+double sg(double t, double sigma) {
+  return exp( -0.5 * pow(sigma*t, 2) );
+}
+
+// 1/3 + 2/3 * (1 - (sigma*t)^2) * exp(-(sigma*t)^2 / 2)
+double stg(double t, double sigma) {
+  double sigmatsq = pow(sigma*t,2);
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatsq) * exp(-0.5 * sigmatsq);
+}
+
+// 1/3 + 2/3 * (1 - lambda*t) * exp(-lambda*t)
+double sekt(double t, double lambda) {
+  double lambdat = lambda*t;
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat) * exp(-lambdat);
+}
+
+// combination of the two forms above: linear term lambda*t plus quadratic term (sigma*t)^2
+double lgkt(double t, double lambda, double sigma) {
+  double lambdat = lambda*t;
+  double sigmatsq = pow(sigma*t, 2.0);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - lambdat - sigmatsq) * exp(-lambdat - 0.5*sigmatsq);
+}
+
+// 1/3 + 2/3 * (1 - (sigma*t)^beta) * exp(-(sigma*t)^beta / beta); returns 0 for beta < 1e-3
+double skt(double t, double sigma, double beta) {
+  if (beta < 1.0e-3)
+    return 0.0;
+  double sigmatb = pow(sigma*t, beta);
+
+  return (1.0/3.0) + (2.0/3.0)*(1.0 - sigmatb) * exp(-sigmatb/beta);
+}
+
+// NOTE(review): rateT is 0 when t == 0, so lamt2q / rateT evaluates 0/0 there;
+// gamma == 0 divides by zero as well. Confirm callers never pass these values.
+double spg(double t, double lambda, double gamma, double q) {
+  double lam2 = lambda*lambda;
+  double lamt2q = t*t*lam2*q;
+  double rate2 = 4.0*lam2*(1.0-q)*t/gamma;
+  double rateL = sqrt(fabs(rate2));
+  double rateT = sqrt(fabs(rate2)+lamt2q);
+
+  return (1.0/3.0)*exp(-rateL) + (2.0/3.0)*(1.0 - lamt2q / rateT)*exp(-rateT);
+}
+
+// 2.44949 ~ sqrt(6)
+double rahf(double t, double nu, double lambda) {
+  double nut = nu*t;
+  double nuth = nu*t/2.0;
+  double lamt = lambda*t;
+
+  return (1.0/6.0)*(1.0-nuth)*exp(-nuth) + (1.0/3.0)*(1.0-nut/4.0)*exp(-0.25*(nut+2.44949*lamt));
+}
+
+// cos(2*pi*nu*t + phi), phi given in degrees
+double tf(double t, double phi, double nu) {
+  double tmp_nu = TWO_PI*nu*t;
+  double tmp_phi = DEG_TO_RAD * phi;
+
+  return cos(tmp_nu + tmp_phi);
+}
+
+// alpha * cos(2*pi*nu*t + phi) * exp(-lambdaT*t) + (1 - alpha) * exp(-lambdaL*t), phi in degrees
+double ifld(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+
+  return alpha*cos(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+// J0(2*pi*nu*t + phi), phi given in degrees
+double b(double t, double phi, double nu) {
+  return bessj0(TWO_PI*nu*t + DEG_TO_RAD*phi);
+}
+
+double ib(double t, double alpha, double phi, double nu, double lambdaT, double lambdaL) {
+  double wt = TWO_PI * nu * t;
+  double ph = DEG_TO_RAD * phi;
+
+  return alpha*bessj0(wt+ph)*exp(-lambdaT*t) + (1.0-alpha)*exp(-lambdaL*t);
+}
+
+// exp(-(sigma/gamma)^2 * (exp(-gamma*t) - 1 + gamma*t))
+double ab(double t, double sigma, double gamma) {
+  double gt = gamma*t;
+
+  return exp(-pow(sigma/gamma,2.0)*(exp(-gt) - 1.0 + gt));
+}
+
+double snkzf(double t, double Delta0, double Rb) {
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return (1.0/3.0) + (2.0/3.0)*pow(aa,1.5)*(1.0-D0t2*aa)*exp(-0.5*D0t2*aa);
+}
+
+double snktf(double t, double phi, double nu, double Delta0, double Rb) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double D0t2 = pow(Delta0*t, 2.0);
+  double aa = 1.0/(1.0+pow(Rb,2.0)*D0t2);
+
+  return sqrt(aa)*exp(-0.5*D0t2*aa)*cos(wt+ph);
+}
+
+// theta = (exp(-nuc*t) - 1 + nuc*t) / nuc^2, the same form ab() uses; it is
+// non-negative and tends to t^2/2 for small nuc. The previous "- nuct" made
+// theta negative for every t > 0, so aa could turn negative and sqrt(aa)
+// become NaN.
+// NOTE(review): nuc == 0 still divides by zero - confirm callers exclude it.
+double dnkzf(double t, double Delta0, double Rb, double nuc) {
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+4.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-2.0*Delta0*Delta0*theta*aa);
+}
+
+// oscillating counterpart of dnkzf: same theta (sign corrected as above),
+// multiplied by cos(2*pi*nu*t + phi) with phi in degrees
+double dnktf(double t, double phi, double nu, double Delta0, double Rb, double nuc) {
+  double wt = TWO_PI*nu*t;
+  double ph = DEG_TO_RAD*phi;
+  double nuct = nuc*t;
+  double theta = (exp(-nuct) - 1.0 + nuct)/pow(nuc, 2.0);
+  double aa = 1.0/(1.0+2.0*pow(Rb*Delta0,2.0)*theta);
+
+  return sqrt(aa)*exp(-Delta0*Delta0*theta*aa)*cos(wt+ph);
+}
+
+__kernel void kernelChiSquareSingleHisto(__global double *data, __global double *err,
+  __global double *par, __global double *chisq, __global int *map, __global double *funcv,
+  int length, int numpar, int numfunc, int nummap,
+  double timeStart, double timeStep, 
+  double tau, double N0, double bkg,
+  __local double *p, __local double *f, __local int *m)
+{
+
+  //get thread id and calc global id
+  int tid = get_local_id(0);
+  int j = get_global_id(0);
+  int lsize = get_local_size(0);
+
+  //load parameters from global to shared memory
+ while (tid < numpar) { + p[tid] = par[tid]; + tid += lsize; + } + + //load functions from global to shared memory + tid = get_local_id(0); + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += lsize; + } + + //load maps from global memory + tid = get_local_id(0); + while (tid < nummap) { + m[tid] = map[tid]; + tid += lsize; + } + + //sync threads + barrier(CLK_LOCAL_MEM_FENCE); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double theo = N0 * exp (-t/tau ) * (1.0 + fTheory(t, p, f, m)) + bkg; + + #ifdef MLH + if ((ldata > 1.0e-9) && (fabs(theo) > 1.0e-9)) + chisq[j] = 2.0 * ((theo - ldata) + ldata * log(ldata / theo)); + else + chisq[j] = 2.0 * (theo - ldata); + #else + if (lerr != 0.0) + chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr); + else + chisq[j] = theo * theo; + #endif + + j += get_global_size(0); + } + +} + +__kernel void kernelChiSquareAsymmetry(__global double *data, __global double *err, + __global double *par, __global double *chisq, __global int *map, __global double *funcv, + int length, int numpar, int numfunc, int nummap, + double timeStart, double timeStep, + double alpha, double beta, + __local double *p, __local double *f, __local int *m) +{ + + //get thread id and calc global id + int tid = get_local_id(0); + int j = get_global_id(0); + int lsize = get_local_size(0); + + //load parameters from global to shared memory + while (tid < numpar) { + p[tid] = par[tid]; + tid += lsize; + } + + //load functions from global to shared memory + tid = get_local_id(0); + while (tid < numfunc) { + f[tid] = funcv[tid]; + tid += lsize; + } + + //load maps from global memory + tid = get_local_id(0); + if (tid < nummap) { + m[tid] = map[tid]; + tid += lsize; + } + + //sync threads + barrier(CLK_LOCAL_MEM_FENCE); + + while (j < length) { + + double t = timeStart + j*timeStep; + double ldata = data[j]; + double lerr = err[j]; + + double ab = alpha*beta; + double theoVal = 
fTheory(t, p, f, m);
+    double theo = ((ab+1.0)*theoVal - (alpha-1.0))/((alpha+1.0)-(ab-1.0)*theoVal);
+
+    #ifdef MLH
+    chisq[j] = 0.0; // max log likelihood not defined for asymmetry fit
+    #else
+    if (lerr != 0.0)
+      chisq[j] = (theo - ldata) * (theo - ldata) / (lerr * lerr);
+    else
+      chisq[j] = theo * theo;
+    #endif
+
+    //advance by the total number of work items
+    j += get_global_size(0);
+  }
+
+}
+
+/* Per-work-group sum: every work item copies one element of data_in (0 when
+ * its global id is >= size) into data_local, the group then adds the upper
+ * half onto the lower half with a halving stride, and work item 0 writes the
+ * group total to data_out[group id].
+ * NOTE(review): the stride halving only covers every element when the
+ * work-group size is a power of two - confirm the host launches it that way.
+ */
+__kernel void parallelReductionSum(__global double *data_in, __global double *data_out, 
+                                   __local double *data_local, int size) 
+{
+
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int group_size = get_local_size(0);
+
+  //copy from global memory to local, if global id out of bounds fill with 0s
+  if (global_id < size)
+    data_local[local_id] = data_in[global_id];
+  else
+    data_local[local_id] = 0;
+
+  //loop through reduction steps
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if (local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
+
+/* Two-phase sum: each work item first accumulates the elements
+ * global_id, global_id + global_size, ... of data_in into a private value,
+ * then the work group reduces these values in data_local as above and
+ * work item 0 writes the group total to data_out[group id].
+ */
+__kernel void parallelReductionTwoPhase(__global double *data_in, __global double *data_out, 
+                                        __local double *data_local, int size) 
+{
+  //get local and global ids, and work group size
+  int local_id = get_local_id(0);
+  int global_id = get_global_id(0);
+  int global_size = get_global_size(0);
+  int group_size = get_local_size(0);
+
+  //phase one: private partial sum over a strided slice of the input
+  double acc = 0;
+  while (global_id < size) {
+    acc += data_in[global_id];
+    global_id += global_size;
+  }
+
+  //parallel reduction on local work group
+  data_local[local_id] = acc;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (uint stride = group_size / 2; stride > 0; stride /= 2) {
+    //synch all work items in work group
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //create partial sums each step
+    if 
(local_id < stride)
+      data_local[local_id] += data_local[local_id + stride];
+  }
+
+  //local thread 0 writes final partial sum to global memory
+  if (local_id == 0)
+    data_out[get_group_id(0)] = data_local[0];
+
+}
diff --git a/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
new file mode 100644
index 0000000..34b08bd
--- /dev/null
+++ b/src/OpenCL/OpenCLKernels/OpenCLCollimatorPhysics.cl
@@ -0,0 +1,362 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#pragma OPENCL EXTENSION
+// NOTE(review): the pragma on the previous line names no extension - either complete it or remove it.
+
+/******Random numbers********/
+
+/* struct for random number state:
+   s10..s12 and s20..s22 are the two three-value recurrences used by
+   rand_uniform; z and gen are the cache used by rand_normal */
+typedef struct {
+
+  double s10;
+  double s11;
+  double s12;
+  double s20;
+  double s21;
+  double s22;
+  double z;
+  bool gen;
+
+} RNDState;
+
+#define NORM 2.328306549295728e-10
+#define M1   4294967087.0
+#define M2   4294944443.0
+#define A12  1403580.0
+#define A13N 810728.0
+#define A21  527612.0
+#define A23N 1370589.0
+
+/* MRG32k3a uniform random number generator
+   advances both recurrences of *s by one step and returns the combined value
+   scaled by NORM */
+double rand_uniform(RNDState *s) {
+  long k;
+  double p1, p2;
+
+  /* Component 1 */
+  p1 = A12 * (*s).s11 - A13N * (*s).s10;
+  k = p1 / M1; /* truncating conversion: p1 -= k * M1 reduces p1 modulo M1 */
+  p1 -= k * M1;
+  if (p1 < 0.0)
+    p1 += M1;
+  (*s).s10 = (*s).s11;
+  (*s).s11 = (*s).s12;
+  (*s).s12 = p1;
+
+  /* Component 2 */
+  p2 = A21 * (*s).s22 - A23N * (*s).s20;
+  k = p2 / M2;
+  p2 -= k * M2;
+  if (p2 < 0.0)
+    p2 += M2;
+  (*s).s20 = (*s).s21;
+  (*s).s21 = (*s).s22;
+  (*s).s22 = p2;
+
+  /* Combination */
+  if (p1 <= p2)
+    return ((p1 - p2 + M1) * NORM);
+  else
+    return ((p1 - p2) * NORM);
+}
+
+/* get random variable with gaussian distribution
+   two values are produced per pair of uniform draws: one is returned, the
+   other is stored in s->z and handed out by the next call */
+double rand_normal(RNDState *s, double mu, double sigma) {
+
+  const double two_pi = 2.0 * 3.141592653589793223846;
+  double z0;
+
+  /* gen == false: a cached value is waiting in s->z */
+  if (!(*s).gen) {
+    (*s).gen = true;
+    return (*s).z * sigma + mu;
+  }
+
+  double u1, u2;
+  u1 = rand_uniform(s);
+  u2 = rand_uniform(s);
+
+  z0 = sqrt(-2.0 * log(u1)) * cos(two_pi * u2);
+  (*s).z = sqrt(-2.0 * log(u1)) * sin(two_pi * u2);
+  (*s).gen = false;
+
+  return z0 * 
sigma + mu; + + +} + +/* initialize random states */ +__kernel void initRand(__global RNDState *s, unsigned int seed, int N) { + + int id = get_global_id(0); + + if (id < N) { + RNDState tmp; + int tmp_seed = id;// * 0x100000000ULL; + tmp.s10 = 12345 + tmp_seed; + tmp.s11 = 12345 + tmp_seed; + tmp.s12 = 123 + tmp_seed; + tmp.s20 = 12345 + tmp_seed; + tmp.s21 = 12345 + tmp_seed; + tmp.s22 = 123 + tmp_seed; + + tmp.z = 0; + tmp.gen = true; + + s[id] = tmp; + } + +} + + +/**********Degrader**********/ +enum PARAMS { POSITION, + ZSIZE, + M_P, + C, + RHO_M, + PI, + AVO, + R_E, + eM_E, + Z_M, + A_M, + A2_C, + A3_C, + A4_C, + A5_C, + Z_P, + X0_M, + I_M, + DT_M}; + + +typedef struct { + int label; + unsigned localID; + double3 Rincol; + double3 Pincol; +} PART; + +double Dot(double3 d1, double3 d2) { + return (d1.x * d2.x + d1.y * d2.y + d1.z * d2.z); +} + +/* check if particle is in degrader material */ +bool checkHit(double z, double position, double zsize) { + return ( ( z > position) && ( z <= position + zsize) ); +} + +/* calculate particles energy loss */ +void energyLoss(double *Eng, bool *pdead, double deltat, RNDState *s, __local double *par) { + + double dEdx = 0.0; + double gamma = ( (*Eng) + par[M_P]) / par[M_P]; + + double gamma2 = gamma * gamma; + + double beta = sqrt(1.0 - 1.0 / gamma2); + double beta2 = beta * beta; + double deltas = deltat * beta * par[C]; + double deltasrho = deltas * 100 * par[RHO_M]; + double K = 4.0 * par[PI] * par[AVO] * par[R_E] * par[R_E] * par[eM_E] * 1E7; + double sigma_E = sqrt(K * par[eM_E] * par[RHO_M] * (par[Z_M]/par[A_M])* deltas * 1E5); + + if (((*Eng) > 0.00001) && ((*Eng) < 0.0006)) { + double Ts = ((*Eng)*1E6)/1.0073; + double epsilon_low = par[A2_C]*pow(Ts,0.45); + double epsilon_high = (par[A3_C]/Ts)*log(1+(par[A4_C]/Ts)+(par[A5_C]*Ts)); + double epsilon = (epsilon_low*epsilon_high)/(epsilon_low + epsilon_high); + dEdx = - epsilon /(1E21*(par[A_M]/par[AVO])); + double delta_Eave = deltasrho * dEdx; + double delta_E = 
delta_Eave + rand_normal(s, 0, sigma_E); + + (*Eng) = (*Eng) + delta_E / 1E3; + } + + if ((*Eng) >= 0.0006) { + double Tmax = 2.0 * par[eM_E] * 1e9 * beta2 * gamma2 / + (1.0 + 2.0 * gamma * par[eM_E] / par[M_P] + + (par[eM_E] / par[M_P]) * (par[eM_E] / par[M_P])); + dEdx = -K * par[Z_P] * par[Z_P] * par[Z_M] / (par[A_M] * beta2) * + (1.0 / 2.0 * log(2 * par[eM_E] * 1e9 * beta2 * gamma2 * + Tmax / par[I_M] / par[I_M]) - beta2); + + double delta_Eave = deltasrho * dEdx; + double delta_E = delta_Eave + rand_normal(s, 0, sigma_E); + + (*Eng) = (*Eng)+delta_E / 1E3; + } + + (*pdead) = (((*Eng)<1E-4) || (dEdx>0)); + +} + +/* rotate partocle */ +void Rot(double3 *P, double3 *R, double xplane, + double normP, double thetacou, double deltas, int coord, + __local double *par) +{ + double Psixz; + double pxz; + + double px = (*P).x; + double pz = (*P).z; + double x = (*R).x; + double z = (*R).z; + + if (px>=0 && pz>=0) Psixz = atan(px/pz); + else if (px>0 && pz<0) + Psixz = atan(px/pz) + par[PI]; + else if (px<0 && pz>0) + Psixz = atan(px/pz) + 2*par[PI]; + else + Psixz = atan(px/pz) + par[PI]; + + pxz = sqrt(px*px + pz*pz); + if(coord==1) { + (*R).x = x + deltas * px/normP + xplane*cos(Psixz); + (*R).z = z - xplane * sin(Psixz); + } + if(coord==2) { + (*R).x = x + deltas * px/normP + xplane*cos(Psixz); + (*R).z = z - xplane * sin(Psixz) + deltas * pz / normP; + } + (*P).x = pxz*cos(Psixz)*sin(thetacou) + pxz*sin(Psixz)*cos(thetacou); + (*P).z = -pxz*sin(Psixz)*sin(thetacou) + pxz*cos(Psixz)*cos(thetacou); +} + + +void coulombScat(double3 *R, double3 *P, double deltat, + RNDState *s, __local double* par) { + + double dotP = Dot((*P), (*P)); + + double Eng = sqrt(dotP + 1.0) * par[M_P] - par[M_P]; + double gamma = (Eng + par[M_P]) / par[M_P]; + double beta = sqrt(1.0 - 1.0 / (gamma * gamma)); + double normP = sqrt(dotP); + double deltas = deltat * beta * par[C]; + double theta0 = 13.6e6 / (beta * sqrt(dotP) * par[M_P] * 1e9) * + par[Z_P] * sqrt(deltas / par[X0_M]) * (1.0 + 
0.038 * log(deltas / par[X0_M])); + + // x-direction: See Physical Review, "Multiple Scattering" + double z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + double thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + double xplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + int coord = 1; + Rot(P, R, xplane, normP, thetacou, deltas, coord, par); + + double P2 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + coord = 0; // no change in coordinates but one in momenta-direction + Rot(P, R, xplane, normP, thetaru, deltas, coord, par); + } + + // y-direction: See Physical Review, "Multiple Scattering" + z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + + while(fabs(thetacou) > 3.5 * theta0) { + z1 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + z2 = rand_normal(s, 0, 1);//curand_normal_double(&state);//gsl_ran_gaussian(rGen_m,1.0); + thetacou = z2 * theta0; + } + + double yplane = z1 * deltas * theta0 / sqrt(12.0) + z2 * deltas * theta0 / 2.0; + coord = 2; + Rot(P, R, yplane, normP, thetacou, deltas, coord, par); + + P2 = 
rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P2 < 0.0047) { + double P3 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + double thetaru = 2.5 * sqrt(1 / P3) * sqrt(2.0) * theta0; + double P4 = rand_uniform(s);//curand_uniform_double(&state);//gsl_rng_uniform(rGen_m); + if(P4 > 0.5) + thetaru = -thetaru; + coord = 0; // no change in coordinates but one in momenta-direction + Rot(P, R, yplane, normP, thetaru, deltas, coord, par); + } + +} + +#define NUMPARAMS 19 +__kernel void kernelCollimatorPhysics(__global PART *data, __global double *par, + __global RNDState *state, int numparticles, + __local double *p) +{ + + //get global id + int tid = get_local_id(0); + int idx = get_global_id(0); + + printf("idx:\n");//, idx); + + //transfer params to local memory + if (tid < NUMPARAMS) + p[tid] = par[tid]; + + barrier(CLK_LOCAL_MEM_FENCE); + + RNDState s; + double3 R, P; + int l = 0; + if (idx < numparticles) { + R = data[idx].Rincol; + P = data[idx].Pincol; + s = state[idx]; + } + + double sq = sqrt(1.0 + Dot(P, P)); + bool pdead = false; + bool hit = checkHit(R.z, p[POSITION], p[ZSIZE]); + double Eng; + + if (hit) { + Eng = (sq - 1) * p[M_P]; + energyLoss(&Eng, &pdead, p[DT_M], &s, p); + } else { + R.x = R.x + p[DT_M] * p[C] * P.x / sq; + R.y = R.y + p[DT_M] * p[C] * P.y / sq; + R.z = R.z + p[DT_M] * p[C] * P.z / sq; + l = -2; + } + + if (hit && !pdead) { + double ptot = sqrt((p[M_P] + Eng) * (p[M_P] + Eng) - (p[M_P] * p[M_P])) / p[M_P]; + sq = sqrt(Dot(P, P)); + P.x = P.x * ptot / sq; + P.y = P.y * ptot / sq; + P.z = P.z * ptot / sq; + coulombScat(&R, &P, p[DT_M], &s, p); + } + + if (hit && pdead) + l = -1; + + if (idx < numparticles) { + data[idx].Rincol = R; + data[idx].Pincol = P; + data[idx].label = l; + state[idx] = s; + } + +} + + +/* count dead particles and particles leaving material - boost compute? 
*/ + +/* sort particles so dead and leaving particles are at the end of PART array - boost compute */ + + diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFT.cl b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl new file mode 100644 index 0000000..1d4763b --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLFFT.cl @@ -0,0 +1,181 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/* 3D normalize FFT kernel */ +__kernel void normalizeFFT(__global double2 *input, int N) { + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + int n1 = get_global_size(0); + int n2 = get_global_size(1); + int n3 = get_global_size(2); + + int id = i1; + if (n2 > 1) + id += i2*n2; + if (n3 > 1) + id += i3*n2*n2; + + input[id].x = input[id].x / N; + input[id].y = input[id].y / N; +} + +/* 3D radix 2 FFT kernel */ +__kernel void FFT3D(__global double2 *input, int step, int dim, int forward) { + + int n1 = get_global_size(0); + int n2 = get_global_size(1); + int n3 = get_global_size(2); + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + + int jump = step << 1; + + int d, idGroup, idLoc, idTwidle, id, match; + if (dim == 0) { + + d = n1 / step; // n1 >> log2(step) + idLoc = i1 / d; + idGroup = i1 & (d-1); //modulo + + idTwidle = idGroup * jump + idLoc; + id = i3*n3*n3 + i2*n2 + idTwidle; + match = id + step; + } else if (dim == 1) { + + d = n2 / step; + idLoc = i2 / d; + idGroup = i2 & (d-1); + + idTwidle = idGroup * jump + idLoc; + id = i3*n3*n3 + idTwidle*n1 + i1; + match = id + step*n1; + } else if (dim == 2) { + + d = n3 / step; + idLoc = i3 / d; + idGroup = i3 & (d-1); + + idTwidle = idGroup * jump + idLoc; + id = idTwidle*n1*n1 + i2*n2 + i1; + match = id + step*n1*n1; + } + + double alpha; + if (forward == 1) + alpha = -( 2 * M_PI / jump ) * idTwidle; + else + alpha = ( 2 * M_PI / jump ) * idTwidle; + + double wr, wi; + wi = sincos(alpha, &wr); + + double2 cTemp; + double2 cTempId = input[id]; + double2 cTempMatch = input[match]; 
+ + cTemp.x = wr*cTempMatch.x - wi*cTempMatch.y; + cTemp.y = wr*cTempMatch.y + wi*cTempMatch.x; + + input[match] = cTempId - cTemp; + input[id] = cTempId + cTemp; + +} + +/* 3D bit reversal sort */ +__kernel void BitReverseSort3D(__global double2 *input, int bits, int dim) { + + int n = get_global_size(0); + int i1 = get_global_id(0); + int i2 = get_global_id(1); + int i3 = get_global_id(2); + + int irev, itmp, istart; + if (dim == 0) { + istart = i1; + irev = i1; + itmp = i1; + } else if (dim == 1) { + irev = i2; + itmp = i2; + istart = i2; + } else if (dim == 2) { + irev = i3; + itmp = i3; + istart = i3; + } + + for (int j = 1; j < bits; j++) { + itmp >>= 1; + irev <<= 1; + irev |= itmp & 1; + } + irev &= n - 1; + + int id1, id2; + if (istart < irev) { + double2 tmp; + id1 = i3*n*n + i2*n + i1; + if (dim == 0) { //i1, irev - w, i2 - h, i3 - d + id2 = i3*n*n + i2*n + irev; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } else if (dim == 1) { // i1 - w, i2, irev - h, i3 - d + id2 = i3*n*n + irev*n + i1; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } else if (dim == 2) { // i1 - w, i2 - h, i3, irev - d + id2 = irev*n*n + i2*n + i1; + tmp = input[id1]; + input[id1] = input[id2]; + input[id2] = tmp; + } + } +} + + +/* 3D FFT kernel based on Stockham's out-of-place algorithm */ +__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim, const int forward) { + + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + int t2 = 2*t; + int k, m, in1, in2, out1, out2; + in1 = gid3*t2*t2 + gid2*t2 + gid1; + if (ndim == 1) { + k = gid1 & (p - 1); + m = (gid1 << 1) - k; + in2 = in1 + t; + out1 = gid3*t2*t2 + gid2*t2 + m; + out2 = out1 + p; + } else if (ndim == 2) { + k = gid2 & (p - 1); + m = (gid2 << 1) - k; + in2 = in1 + t2*t; + out1 = gid3*t2*t2 + m*t2 + gid1; + out2 = out1 + t2*p; + } else if (ndim == 3) { + k = gid3 
& (p - 1); + m = (gid3 << 1) - k; + in2 = in1 + t2*t2*t; + out1 = m*t2*t2 + gid2*t2 + gid1; + out2 = out1 + p*t2*t2; + } + + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = (forward*2*M_PI*k) / (p << 1); + + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} diff --git a/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl new file mode 100644 index 0000000..b5d9e51 --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLFFTStockham.cl @@ -0,0 +1,214 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define TWOPI 6.28318530718 + +__kernel void fft_radix2(__global double2* src, __global double2* dst, const int p, const int t) { + + const int gid = get_global_id(0); + const int k = gid & (p - 1); + const int m = (gid << 1) - k; + + //src += gid; + //dst += (gid << 1) - k; + + //const double2 in1 = src[0]; + //const double2 in2 = src[t]; + + const double2 in1 = src[gid]; + const double2 in2 = src[gid+t]; + + const double theta = (-2*M_PI*k) / (p << 1); + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (in2.x * cs - in2.y * sn, in2.y * cs + in2.x * sn); + + //dst[0] = in1 + temp; + //dst[p] = in1 - temp; + + dst[m] = in1 + temp; + dst[m+p] = in1 - temp; + +} + +__kernel void fft3d_radix2_transpose(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) { + + /* get ids */ + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + /* calc indexes */ + int t2 = 2*t; + + int k = gid1 & (p - 1); + int m = (gid1 << 1) - k; + + int tmp = gid3*t2*t2 + gid2*t2; + + int in1 = tmp + gid1; + int in2 = in1 + t; + + int out1 = tmp + m; + int out2 = out1 + p; + + /* calc FFT */ + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = 
(-2*M_PI*k) / (p << 1); + double cs; + + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} + +__kernel void fft3d_radix2(__global double2* src, __global double2* dst, const int p, const int t, const int ndim) { + + const int gid1 = get_global_id(0); + const int gid2 = get_global_id(1); + const int gid3 = get_global_id(2); + + int t2 = 2*t; + int k, m, in1, in2, out1, out2; + in1 = gid3*t2*t2 + gid2*t2 + gid1; + if (ndim == 1) { + k = gid1 & (p - 1); + m = (gid1 << 1) - k; + in2 = in1 + t; + out1 = gid3*t2*t2 + gid2*t2 + m; + out2 = out1 + p; + } else if (ndim == 2) { + k = gid2 & (p - 1); + m = (gid2 << 1) - k; + in2 = in1 + t2*t; + out1 = gid3*t2*t2 + m*t2 + gid1; + out2 = out1 + t2*p; + } else if (ndim == 3) { + k = gid3 & (p - 1); + m = (gid3 << 1) - k; + in2 = in1 + t2*t2*t; + out1 = m*t2*t2 + gid2*t2 + gid1; + out2 = out1 + p*t2*t2; + } + + const double2 d1 = src[in1]; + const double2 d2 = src[in2]; + + const double theta = (-2*M_PI*k) / (p << 1); + + double cs; + double sn = sincos(theta, &cs); + const double2 temp = (double2) (d2.x * cs - d2.y * sn, d2.y * cs + d2.x * sn); + + dst[out1] = d1 + temp; + dst[out2] = d1 - temp; +} + + +__kernel void transpose(__global double2 *data, int ndim, int dim) { + + int k = get_global_id(0); + int j = get_global_id(1); + int i = get_global_id(2); + int nk = get_global_size(0); + int nj = get_global_size(1); + int ni = get_global_size(2); + + int n, m; + n = i*ni*ni + j*nj + k; + if (dim == 2) + m = i*ni*ni + k*nj + j; + else + m = k*ni*ni + j*nj + i; + + if (n < m) { + double2 tmp = data[m]; + data[m] = data[n]; + data[n] = tmp; + } +} + +#define PI2 6.28318530718 + +__kernel void fft_batch3D(__global double2 *data_in, __local double2 *d, __local double2 *r, __local double2 *tmp, int N, int dim) { + + int id1 = get_global_id(0); + int id2 = get_global_id(1); + int id3 = get_global_id(2); + + //calc 
indexes + int sid, offset; + if (dim == 1) { + sid = id3*N*N + id2*N; + offset = 1; + } else if (dim == 2) { + sid = id3*N*N + id2; + offset = N; + } else if (dim == 3) { + sid = id3*N + id2; + offset = N*N; + } + + //copy data from global memory to local + int i1 = id1; + int i2 = id1+N/2; + d[i1] = data_in[sid + i1*offset]; + d[i2] = data_in[sid + i2*offset]; + + barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_GLOBAL_MEM_FENCE); + + //exec fft + int p1, p2, j, k, out1, step, jump, t; + double theta, cs, sn; + + t = 1; + step = 1; + while (step < N) { + jump = step << 1; + + j = i1 >> (t - 1); // same as i1 / step, because t-1 = log2(step) + k = i2 & (step - 1); // same as i2 % step + + out1 = j * jump + k; + + theta = -PI2 * k / jump; + sn = sincos(theta, &cs); + + double2 temp = (double2) (d[i2].x*cs - d[i2].y*sn, d[i2].y*cs + d[i2].x * sn); + + + r[out1] = d[i1] + temp; + r[out1+step] = d[i1] - temp; + + t++; + step = jump; + + //swap local arrays + tmp = r; + r = d; + d = tmp; + + //wait for all threads to finish this iteration + barrier(CLK_LOCAL_MEM_FENCE); + } + + tmp = r; + r = d; + d = tmp; + + //copy data from local memory to global + data_in[sid + i1*offset] = r[i1]; + data_in[sid + i2*offset] = r[i2]; + +} + + + + + + + diff --git a/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl new file mode 100644 index 0000000..ffbd0ba --- /dev/null +++ b/src/OpenCL/OpenCLKernels/OpenCLTranspose.cl @@ -0,0 +1,41 @@ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/* transpose matrix */ +__kernel void transpose(__global double2 *input, __global double2 *output, + int width, int height, __local double2 *block) +{ + + //transfer row in shared memory + unsigned int xIdx = get_global_id(0); + unsigned int yIdx = get_global_id(1); + int block_dim = get_local_size(0); + + if ( (xIdx < width) && (yIdx < height) ) { + unsigned int idx_in = yIdx * width + xIdx; + block[get_local_id(1)*(block_dim+1)+get_local_id(0)] = input[idx_in]; + 
} + + barrier(CLK_LOCAL_MEM_FENCE); + + xIdx = get_group_id(1) * block_dim + get_local_id(0); + yIdx = get_group_id(0) * block_dim + get_local_id(1); + + if ( (xIdx < height) && (yIdx < width) ) { + unsigned int idx_out = yIdx * height + xIdx; + output[idx_out] = block[get_local_id(0)*(block_dim+1)+get_local_id(1)]; + } + +} + +/* naive transpose matrix kernel */ +__kernel void transpose_naive(__global double2 *input, __global double2 *output, int width, int height) +{ + unsigned int xIdx = get_global_id(0); + unsigned int yIdx = get_global_id(1); + + if (xIdx < width && yIdx < height) { + unsigned int idx_in = xIdx + width * yIdx; + unsigned int idx_out = yIdx + height * xIdx; + output[idx_out] = input[idx_in]; + } +} diff --git a/src/Utility/CMakeLists.txt b/src/Utility/CMakeLists.txt new file mode 100644 index 0000000..8a95785 --- /dev/null +++ b/src/Utility/CMakeLists.txt @@ -0,0 +1,18 @@ +SET (_SRCS + TimeStamp.cpp + DKSTimer.cpp + ) + +SET (_HDRS + TimeStamp.h + DKSTimer.h + ) + +#INCLUDE_DIRECTORIES ( +# ${CMAKE_CURRENT_SOURCE_DIR} +#) + +ADD_SOURCES (${_SRCS}) +ADD_HEADERS (${_HDRS}) + +INSTALL(FILES ${_HDRS} DESTINATION include/Utility) diff --git a/src/Utility/DKSTimer.cpp b/src/Utility/DKSTimer.cpp new file mode 100644 index 0000000..5f495d3 --- /dev/null +++ b/src/Utility/DKSTimer.cpp @@ -0,0 +1,53 @@ +#include "DKSTimer.h" + +//set initial values - running to false, timervalue to zero and name to empty string +DKSTimer::DKSTimer() { + running = false; + timervalue = 0.0; + name = ""; +} + +//destructor does nothing +DKSTimer::~DKSTimer() { + +} + +//init the timer by setting name and clearing timervalue, also sets running to false +void DKSTimer::init(std::string n) { + running = false; + timervalue = 0.0; + name = n; +} + +//if timer is not running get the current time and save to timeStart, set the timer as running +void DKSTimer::start() { + if (!running) { + gettimeofday(&timeStart, NULL); + running = true; + } +} + +//if the timer is running get 
the current time to timeEnd, calculate the elapsed time befor start +//and end, add elapsed time to timervalue, set the timer as not running +void DKSTimer::stop() { + if (running) { + gettimeofday(&timeEnd, NULL); + timervalue += ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6; + running = false; + } +} + +void DKSTimer::reset() { + running = false; + timervalue = 0.0; +} + +//return the accumulated value of timervalue +double DKSTimer::gettime() { + return timervalue; +} + +void DKSTimer::print() { + std::cout << "DKSTimer " << name << " elapsed time\t" << timervalue << "s" << std::endl; +} diff --git a/src/Utility/DKSTimer.h b/src/Utility/DKSTimer.h new file mode 100644 index 0000000..80025c0 --- /dev/null +++ b/src/Utility/DKSTimer.h @@ -0,0 +1,59 @@ +#ifndef H_DKSTIMER +#define H_DKSTIMER + +#include +#include +#include + +class DKSTimer { + +private: + + bool running; + double timervalue; + struct timeval timeStart; + struct timeval timeEnd; + std::string name; + +public: + + /** Init DKSTimer by seting timer to zero */ + DKSTimer(); + + ~DKSTimer(); + + /** Init the timer + * Set the name for timer and clear all values + */ + void init(std::string n); + + /** Start the timer. + * Get the curret time with gettimeofday and save in timeStart + */ + void start(); + + /** Stop the timer + * Get the curretn time with gettimeofday and save in timeEnd + * Calculate elapsed time by timeEnd - timeStart and add to timervalue + */ + void stop(); + + /** Reset timervalue to zero. + * Set timervalue, timeStart and timeEnd to zero + */ + void reset(); + + /** Return elapsed time in seconds. + * Return the value of timervalue + */ + double gettime(); + + /** Print timer. 
+ * Print the elapsed time of the timer + */ + void print(); + + +}; + +#endif diff --git a/src/Utility/TimeStamp.cpp b/src/Utility/TimeStamp.cpp new file mode 100644 index 0000000..1e239fc --- /dev/null +++ b/src/Utility/TimeStamp.cpp @@ -0,0 +1,11 @@ +#include "TimeStamp.h" + +timestamp_t get_timestamp() { + struct timeval now; + gettimeofday (&now, NULL); + return now.tv_usec + (timestamp_t)now.tv_sec * 1000000; +} + +double get_secs(timestamp_t t_start, timestamp_t t_end) { + return (t_end - t_start) / 1000000.0L; +} \ No newline at end of file diff --git a/src/Utility/TimeStamp.h b/src/Utility/TimeStamp.h new file mode 100644 index 0000000..d53104d --- /dev/null +++ b/src/Utility/TimeStamp.h @@ -0,0 +1,14 @@ +#ifndef H_TIMESTAMPE +#define H_TIMESTAMPE + +#include +#include +#include + +typedef unsigned long long timestamp_t; + +timestamp_t get_timestamp(); +double get_secs(timestamp_t t_start, timestamp_t t_end); + + +#endif \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..01f33fb --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,84 @@ +INCLUDE_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +LINK_DIRECTORIES( ${CMAKE_SOURCE_DIR}/src ) + +#ADD_EXECUTABLE(testDKS testDKS.cpp) +#ADD_EXECUTABLE(testChi testChi.cpp) +#ADD_EXECUTABLE(testFFT testFFT.cpp) +#ADD_EXECUTABLE(testMIC testMIC.cpp) +#ADD_EXECUTABLE(testMICOpenCL testMICOpenCL.cpp) +#ADD_EXECUTABLE(testFFT3D testFFT3D.cpp) +#ADD_EXECUTABLE(testFFT3DRC testFFT3DRC.cpp) +#ADD_EXECUTABLE(testFFT3DRC_MIC testFFT3DRC_MIC.cpp) +#ADD_EXECUTABLE(testFFT3DTiming testFFT3DTiming.cpp) +#ADD_EXECUTABLE(testStockhamFFT testStockhamFFT.cpp) +#ADD_EXECUTABLE(testStockFFT3D testStockFFT3D.cpp) +#ADD_EXECUTABLE(testMemObjects testMemObjects.cpp) +#ADD_EXECUTABLE(testRCFFT testRCFFT.cpp) +#ADD_EXECUTABLE(testOffset testOffset.cpp) +#ADD_EXECUTABLE(testOffsetMPI testOffsetMPI.cpp) +#ADD_EXECUTABLE(testMPI testMPI.cpp) +#ADD_EXECUTABLE(testMPIFFT testMPIFFT.cpp) 
+#ADD_EXECUTABLE(testGather testGather.cpp) +#ADD_EXECUTABLE(testGatherAsync testGatherAsync.cpp) +#ADD_EXECUTABLE(testTranspose testTranspose.cpp) +ADD_EXECUTABLE(testCollimatorPhysics testCollimatorPhysics.cpp) +#ADD_EXECUTABLE(testCollimatorPhysicsSoA testCollimatorPhysicsSoA.cpp) +#ADD_EXECUTABLE(testPush testPush.cpp) +#ADD_EXECUTABLE(testFFTSolverMIC testFFTSolver_MIC.cpp) +#ADD_EXECUTABLE(testIntegration testTimeIntegration.cpp) +#ADD_EXECUTABLE(testImageReconstruction testImageReconstruction.cpp) + +#shared library +#ADD_EXECUTABLE(testFFT3DSO testFFT3DSO.cpp) + + +#TARGET_LINK_LIBRARIES(testDKS dks) +#TARGET_LINK_LIBRARIES(testChi dks) +#TARGET_LINK_LIBRARIES(testFFT dks) +#TARGET_LINK_LIBRARIES(testMIC dks) +#TARGET_LINK_LIBRARIES(testMICOpenCL dks) +#TARGET_LINK_LIBRARIES(testFFT3D dks) +#TARGET_LINK_LIBRARIES(testFFT3DRC dks) +#TARGET_LINK_LIBRARIES(testFFT3DRC_MIC dks) +#TARGET_LINK_LIBRARIES(testFFT3DTiming dks) +#TARGET_LINK_LIBRARIES(testStockhamFFT dks) +#TARGET_LINK_LIBRARIES(testStockFFT3D dks) +#TARGET_LINK_LIBRARIES(testMemObjects dks) +#TARGET_LINK_LIBRARIES(testRCFFT dks) +#TARGET_LINK_LIBRARIES(testOffset dks) +#TARGET_LINK_LIBRARIES(testOffsetMPI dks) +#TARGET_LINK_LIBRARIES(testMPI dks) +#TARGET_LINK_LIBRARIES(testMPIFFT dks) +#TARGET_LINK_LIBRARIES(testGather dks) +#TARGET_LINK_LIBRARIES(testGatherAsync dks) +#TARGET_LINK_LIBRARIES(testTranspose dks) +TARGET_LINK_LIBRARIES(testCollimatorPhysics dks) +#TARGET_LINK_LIBRARIES(testCollimatorPhysicsSoA dks) +#TARGET_LINK_LIBRARIES(testPush dks) +#TARGET_LINK_LIBRARIES(testFFTSolverMIC dks) +#TARGET_LINK_LIBRARIES(testIntegration dks) +#TARGET_LINK_LIBRARIES(testImageReconstruction dks) + + +#TARGET_LINK_LIBRARIES(testFFT3DSO dksshared) + + +#IF (${COMPILER_NAME} STREQUAL "mpicxx") + #ADD_EXECUTABLE(testGatherAsync2 testGatherAsync2.cpp) + #ADD_EXECUTABLE(testGreens testGreens.cpp) + #ADD_EXECUTABLE(testFFTSolver testFFTSolver.cpp) + #ADD_EXECUTABLE(testCollimatorPhysicsMPI 
testCollimatorPhysicsMPI.cpp) + #TARGET_LINK_LIBRARIES(testGatherAsync2 dks) + #TARGET_LINK_LIBRARIES(testGreens dks) + #TARGET_LINK_LIBRARIES(testFFTSolver dks) + #TARGET_LINK_LIBRARIES(testCollimatorPhysicsMPI dks) +#ENDIF (${COMPILER_NAME} STREQUAL "mpicxx") + +#ADD_EXECUTABLE(testChiSquare testChiSquare.cpp) +#TARGET_LINK_LIBRARIES(testChiSquare dks) + +#IF (NOT CUDA_VERSION VERSION_LESS "7.0") + #ADD_EXECUTABLE(testChiSquareRT testChiSquareRT.cpp) + #TARGET_LINK_LIBRARIES(testChiSquareRT dks) +#ENDIF (NOT CUDA_VERSION VERSION_LESS "7.0") \ No newline at end of file diff --git a/test/testChi.cpp b/test/testChi.cpp new file mode 100644 index 0000000..0181144 --- /dev/null +++ b/test/testChi.cpp @@ -0,0 +1,141 @@ +#include +#include +#include + +#include "DKSBase.h" +#include "Utility/TimeStamp.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[4]; + + + if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 2){ + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests" << endl; + + /* inti data */ + int ierr; + int nsize = 4000000; + int jsize = 16; + int psize = 6; + double *data = new double[nsize*jsize]; + double *p = new double[psize*jsize]; + double data_out = 0; + + srand(time(NULL)); + for (int i = 0; i < nsize*jsize; i++) { + //int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1; + //data[i] = sign*(double)rand()/RAND_MAX; + data[i] = (double)i / (nsize*jsize); + //data[i] = 1; + } + for (int i = 0; i < psize*jsize; i++) { + //int sign = ((double)rand()/RAND_MAX > 0.5) ? 
1 : -1; + //p[i] = sign*(double)rand()/RAND_MAX; + p[i] = (double)i / (nsize*jsize); + //p[i] = 1; + } + /* end init */ + + timestamp_t tstart, tend; + //timestamp_t t0, t1; + + tstart = get_timestamp(); + + //init dks base class, set API to opencl and init connection with OpenCL device + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + + //ptrs to hold reference to device memory + void *dptr, *ntptr, *pptr; + + //allocate memory on device + //t0 = get_timestamp(); + dptr = base.allocateMemory(nsize*jsize, ierr); + ntptr = base.allocateMemory(nsize*jsize, ierr); + pptr = base.allocateMemory(psize*jsize, ierr); + //t1 = get_timestamp(); + //cout << "Allocate memory: " << get_secs(t0, t1) << endl; + + //write data to device + //t0 = get_timestamp(); + base.writeData(dptr, data, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Write data set: " << get_secs(t0, t1) << endl << endl; + + for (int i = 0; i < 5; i++) { + //write parameters to device + //t0 = get_timestamp(); + base.writeData(pptr, p, psize*jsize); + //t1 = get_timestamp(); + //cout << "Write parameters: " << get_secs(t0, t1) << endl; + + //set function to calcNt and execute it with necessary parameters + //t0 = get_timestamp(); + base.callNt(ntptr, pptr, psize, nsize, jsize, 0.025); + //t1 = get_timestamp(); + + //cout << "Calc N(t): " << get_secs(t0, t1) << endl; + + //set function to chi2 and execute it with necessary parameters + //t0 = get_timestamp(); + base.callChi2(ntptr, dptr, ntptr, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Calc chi^2: " << get_secs(t0, t1) << endl; + + //set function so sum and execute it with necessary parameters + //t0 = get_timestamp(); + base.callSum(ntptr, ntptr, nsize*jsize); + //t1 = get_timestamp(); + //cout << "Calc sum: " << get_secs(t0, t1) << endl; + + //read calculated sum (one value) + //t0 = get_timestamp(); + base.readData(ntptr, &data_out, 1); + //t1 = 
get_timestamp(); + //cout << "Read sum: " << get_secs(t0, t1) << endl; + cout << "Sum nt: " << data_out << endl; + + /* + for (int i = 0; i < psize*jsize; i++) { + int sign = ((double)rand()/RAND_MAX > 0.5) ? 1 : -1; + p[i] = sign*(double)rand()/RAND_MAX; + } + */ + + + //cout << endl; + } + + //free device memory + //t0 = get_timestamp(); + base.freeMemory(dptr, nsize*jsize); + base.freeMemory(ntptr, nsize*jsize); + base.freeMemory(pptr, psize*jsize); + //t1 = get_timestamp(); + //cout << "Free memory: " << get_secs(t0, t1) << endl; + + tend = get_timestamp(); + + cout << endl << "time: " << get_secs(tstart, tend) << endl; + + return 0; +} + diff --git a/test/testChiSquare.cpp b/test/testChiSquare.cpp new file mode 100644 index 0000000..550f3a4 --- /dev/null +++ b/test/testChiSquare.cpp @@ -0,0 +1,168 @@ +#include +#include +#include "DKSBase.h" + +using namespace std; + +void initData(vector< vector > &v, int length) { + + for (unsigned int i = 0; i < v.size(); i++) { + for (int j = 0; j < length; j++) { + v[i].push_back(j); + } + } + +} + + +void printData(vector< vector > &v) { + for (unsigned int i = 0; i < v.size(); i++) { + for (unsigned int j = 0; j < v[i].size(); j++) { + cout << v[i][j] << "\t"; + } + cout << endl; + } +} + +void initData(double *data, int sensors, int length) { + + for (int i = 0; i < sensors; i++) { + for (int j = 0; j < length; j++) { + data[i*length + j] = j; + } + } + +} + + +void printData(double *data, int sensors, int length) { + for (int i = 0; i < sensors; i++) { + for (int j = 0; j < length; j++) { + cout << data[i*length + j] << "\t"; + } + cout << endl; + } +} + +void initPar(double *par, int npar) { + + for (int i = 0; i < npar; i++) + par[i] = (double)i / npar; + +} + +void printDiv(int size) { + for (int i = 0; i < size; i++) + cout << "="; + cout << endl; +} + +void calcChisq(vector< vector > fData, double * par, double fTimeResolution, double fRebin) +{ + + double chisq = 0.0; + double theo, data; + const double 
tau=2.197019; + const double dt0 = fTimeResolution*0.5*(fRebin-1); + double time; + double w = par[0]*0.08516155035269027; + + unsigned int i, j; + + for (i=0; i > fData; + fData.resize(sensors); + initData(fData, length); + printData(fData); + printDiv(75); + + DKSBase dksbase; + if (useCuda) + dksbase.setAPI("Cuda", 4); + else + dksbase.setAPI("OpenCL", 6); + dksbase.setDevice("-gpu", 4); + dksbase.initDevice(); + dksbase.setupFFT(0, NULL); + + + void *mem_data, *mem_par, *mem_chisq; + cout << "Allocate memory" << endl; + mem_par = dksbase.allocateMemory(npar, ierr); + mem_data = dksbase.allocateMemory(fData.size() * fData[0].size(), ierr); + mem_chisq = dksbase.allocateMemory(fData.size() * fData[0].size(), ierr); + + + cout << "Write data" << endl; + dksbase.writeData(mem_par, par, npar); + for (int i = 0; i < sensors; i++) + dksbase.writeData(mem_data, &fData[i][0], length, i*length); + + + + cout << "Call PHistoTFFcn" << endl; + dksbase.callPHistoTFFcn(mem_data, mem_par, mem_chisq, + fTimeResolution, fRebin, + sensors, length, npar, result); + cout << "Result: " << result << endl; + + + double *out_data = new double[ndata]; + dksbase.readData(mem_chisq, out_data, ndata); + printDiv(75); + printData(out_data, sensors, length); + printDiv(75); + + calcChisq(fData, par, fTimeResolution, fRebin); + printDiv(75); + + cout << "Free memory" << endl; + dksbase.freeMemory(mem_par, npar); + dksbase.freeMemory(mem_data, ndata); + dksbase.freeMemory(mem_chisq, ndata); + + + return 0; + +} diff --git a/test/testChiSquareRT.cpp b/test/testChiSquareRT.cpp new file mode 100644 index 0000000..fcd0b50 --- /dev/null +++ b/test/testChiSquareRT.cpp @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include + +#include "DKSBaseMuSR.h" +#include "Utility/DKSTimer.h" + +void initData(double *data, int N, bool ones = false) { + for (int i = 0; i < N; i++) { + if (ones) + data[i] = 1.0; + else + data[i] = (double)rand() / RAND_MAX; + } +} + +template +void printData(T *data, 
int N) { + for (int i = 0; i < N; i++) + std::cout << data[i] << "\t"; + std::cout << std::endl; +} + + +const std::string funct = "cos(t*p[0]) - exp(-t*p[m[0]])"; +//std::string funct = "p[m[0]] * se(t, p[m[1]]) * tf(t, f[m[2]], p[m[3]])"; +//const std::string funct = "p[m[0]] * se(t, p[m[1]])"; +//const std::string funct = "p[m[1]] + p[m[0]]"; + +double fTheory(double time, double *par, double *func, int *map) { + return cos(time*par[0]) - exp(-time*par[map[0]]); +} + +double testFunctionSerial(double *data, double *par, double *func, int *map, + double N0, double tau, double bkg, double timeStep, + int startTimeBin, int endTimeBin) +{ + double time, diff, theo; + double chisq = 0; + for (int i = startTimeBin; i < endTimeBin; ++i) { + time = i * timeStep; + theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; + diff = data[i] - theo; + + chisq += diff * diff / data[i]; + } + + return chisq; +} + +double testFunctionParallel(double *data, double *par, double *func, int *map, + double N0, double tau, double bkg, double timeStep, + int startTimeBin, int endTimeBin) +{ + int i, chunk; + double time, diff, theo; + double chisq = 0; + + chunk = (endTimeBin - startTimeBin) / omp_get_num_procs(); + if (chunk < 10) + chunk = 10; +#pragma omp parallel for default(shared) private (i,time,diff) firstprivate(N0,tau,bkg,timeStep) schedule(dynamic,chunk) reduction(+:chisq) + for (i = startTimeBin; i < endTimeBin; ++i) { + time = i * timeStep; + theo = N0 * exp(-time/tau) * (1.0 + fTheory(time, par, func, map)) + bkg; + diff = data[i] - theo; + + chisq += diff * diff / data[i]; + } + + return chisq; +} + +int main(int argc, char *argv[]) { + + int Loop = 100; + + //init test data on the host + int Ndata = 8; + if (argc > 1) + Ndata = atoi(argv[1]); + + int api = 1; + if (argc > 2) + api = atoi(argv[2]); + + int Npar = 66; + int Nfunc = 1; + int Nmap = 4; + + double *data = new double[Ndata]; + double *par = new double[Npar]; + double *func = new 
double[Nfunc]; + int *map = new int[Nmap]; + + initData(data, Ndata); + initData(par, Npar); + initData(func, Nfunc); + map[0] = 1; + map[1] = 2; + map[2] = 3; + map[3] = 4; + + //create timers + DKSTimer serialTimer; + DKSTimer cudaTimer; + DKSTimer ompTimer; + DKSTimer gpuOverhead; + serialTimer.init("Serial timer"); + cudaTimer.init("Cuda timer"); + ompTimer.init("OpenMP timer"); + gpuOverhead.init("Overhead for gpu"); + + + //serial version + double resultSerial; + + serialTimer.start(); + for (int i = 0; i < Loop; i++) + resultSerial = testFunctionSerial(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); + serialTimer.stop(); + + //openmp version + double resultOMP = 0.0; + + ompTimer.start(); + //for (int i = 0; i < Loop; i++) + // resultOMP = testFunctionParallel(data, par, func, map, 1.0, 1.0, 1.0, 0.1, 0, Ndata); + ompTimer.stop(); + + + //create and init dkabase + gpuOverhead.start(); + + DKSBaseMuSR dksbase; + if (api == 1) + dksbase.setAPI("Cuda"); + else + dksbase.setAPI("OpenCL"); + + dksbase.setDevice("-gpu"); + dksbase.initDevice(); + dksbase.initChiSquare(Ndata, Npar, Nfunc, Nmap); + + //allocate memory on the device + int ierr; + void *data_ptr; + + data_ptr = dksbase.allocateMemory(Ndata, ierr); + + dksbase.writeData(data_ptr, data, Ndata); + dksbase.writeFunctions(func, Nfunc); + dksbase.writeMaps(map, Nmap); + + dksbase.callCompileProgram(funct); + gpuOverhead.stop(); + + double resultCuda; + + cudaTimer.start(); + for (int i = 0; i < Loop; i++) { + dksbase.writeParams(par, Npar); + int ierr = dksbase.callLaunchChiSquare(data_ptr, data_ptr, Ndata, Npar, Nfunc, Nmap, + 0.0, 0.1, 0, resultCuda); + + if (ierr != 0) + exit (EXIT_FAILURE); + + } + cudaTimer.stop(); + + std::cout << std::endl; + std::cout << "=======================Results=======================" << std::endl; + std::cout << "Result serial = " << resultSerial << std::endl; + std::cout << "Result prallel = " << resultOMP << std::endl; + std::cout << "Result cuda = " << resultCuda << 
std::endl; + + std::cout << std::endl; + std::cout << "=======================Timings=======================" << std::endl; + serialTimer.print(); + ompTimer.print(); + cudaTimer.print(); + gpuOverhead.print(); + std::cout << std::endl; + + dksbase.freeMemory(data_ptr, Ndata); + + return 0; + + +} diff --git a/test/testCollimatorPhysics.cpp b/test/testCollimatorPhysics.cpp new file mode 100644 index 0000000..bb5d9b5 --- /dev/null +++ b/test/testCollimatorPhysics.cpp @@ -0,0 +1,248 @@ +#include + +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + + +using namespace std; + +typedef struct { + int label; + unsigned localID; + double Rincol[3]; + double Pincol[3]; +} PART_SMALL; + +typedef struct { + double x; + double y; + double z; +} Vector; + +PART_SMALL initPartSmall(int d) { + + PART_SMALL p; + p.label = 0; + p.localID = d; + + p.Rincol[0] = 0.0; + p.Rincol[1] = 0.0; + p.Rincol[2] = 0.02; + + p.Pincol[0] = 0.0; + p.Pincol[1] = 0.0; + p.Pincol[2] = 3.9920183237269791e-01; + + return p; +} + +Vector initVector() { + Vector tmp; + tmp.x = 0.5; + tmp.y = 0.5; + tmp.z = 0.5; + + return tmp; +} + +void printPart(PART_SMALL p) { + cout << "label: " << p.label << ", "; + cout << "localid: " << p.localID << ","; + cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "; + cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2]; + cout << endl; +} + +void printVector(Vector v) { + cout << v.x << "\t" << v.y << "\t" << v.z << endl; + } + +void initParts(PART_SMALL *p, int N) { + for (int i = 0; i < N; i++) + p[i] = initPartSmall(i); +} + +void printParts(PART_SMALL *p, int N) { + for (int i = 0; i < N; i++) + printPart(p[i]); + cout << endl; +} + +void initVectors(Vector *v, int N) { + for (int i = 0; i < N; i++) + v[i] = initVector(); +} + +void printVectors(Vector *v, int N) { + for (int i = 0; i < N; i++) + printVector(v[i]); + cout << endl; +} + + +void initParams(double *data) { + 
data[0] = 0.0;//2.0000000000000000e-02; + data[1] = 1.0;//1.0000000000000000e-02; + data[2] = 2.2100000000000000e+00; + data[3] = 6.0000000000000000e+00; + data[4] = 1.2010700000000000e+01; + data[5] = 2.6010000000000000e+00; + data[6] = 1.7010000000000000e+03; + data[7] = 1.2790000000000000e+03; + data[8] = 1.6379999999999999e-02; + data[9] = 1.9321266968325795e-01; + data[10] = 7.9000000000000000e+01; + data[11] = 1.0000000000000002e-12; + +} + +void printDouble(double *data, int N) { + for (int i = 0; i < N; i++) + std::cout << data[i] << "\t"; + std::cout << std::endl; +} + +int main(int argc, char *argv[]) { + + int loop = 10; + int numpart = 1e5; + char *api_name = new char[10]; + char *device_name = new char[10]; + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-npart")) { + numpart = atoi(argv[i+1]); + i++; + } + + if (argv[i] == string("-loop")) { + loop = atoi(argv[i+1]); + i++; + } + + } + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of particles: " << numpart << endl; + cout << "Number of loops: " << loop << endl; + cout << "------------------------------------------------------------" << endl; + + //init part vector to test mc + PART_SMALL *parts = new PART_SMALL[numpart]; + initParts(parts, numpart); + + double *params = new double[12]; + initParams(params); + + //init dks + int ierr; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + //init random + base.callInitRandoms(numpart); + + //**test collimator physics and sort***// + void *part_ptr, *param_ptr; + + //allocate memory for particles + part_ptr = base.allocateMemory(numpart, ierr); + param_ptr = base.allocateMemory(12, 
ierr); + + //transfer data to device + base.writeData(part_ptr, parts, numpart); + base.writeData(param_ptr, params, 12); + + int numaddback; + //test calls to do some first executions + base.callCollimatorPhysics2(part_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback); + base.syncDevice(); + //std::cout << "particles to add back: " << numaddback << std::endl; + + struct timeval timeStart, timeEnd; + std::cout << "Start MC" << std::endl; + + gettimeofday(&timeStart, NULL); + for (int i = 0; i < loop; i++) { + base.callCollimatorPhysics2(part_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSort(part_ptr, numpart, numaddback); + base.syncDevice(); + } + gettimeofday(&timeEnd, NULL); + + std::cout << "addback: " << numaddback << std::endl; + + std::cout << "End MC" << std::endl; + double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)); + + std::cout << "Time for " << loop << " MC runs: " << t * 1e-6 << "s" << std::endl; + std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl; + + //read data from device + base.readData(part_ptr, parts, numpart); + + //free memory + base.freeMemory(part_ptr, numpart); + base.freeMemory(param_ptr, 12); + + + std::cout << std::fixed << std::setprecision(4); + for (int i = 0; i < 10; i++) { + std::cout << parts[i].label << "\t" + << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" + << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t" + << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t" + << std::endl; + } + + std:: cout << "..." 
<< std::endl; + + for (int i = numpart - 10; i < numpart; i++) { + std::cout << parts[i].label << "\t" + << parts[i].Rincol[0] << "\t" << parts[i].Rincol[1] << "\t" + << parts[i].Rincol[2] << "\t" << parts[i].Pincol[0] << "\t" + << parts[i].Pincol[1] << "\t" << parts[i].Pincol[2] << "\t" + << std::endl; + } + + double arx = 0, ary = 0, arz = 0; + double apx = 0, apy = 0, apz = 0; + for (int i = 0; i < numpart; i++) { + + arx += sqrt(parts[i].Rincol[0] * parts[i].Rincol[0]) / numpart; + ary += sqrt(parts[i].Rincol[1] * parts[i].Rincol[1]) / numpart; + arz += sqrt(parts[i].Rincol[2] * parts[i].Rincol[2]) / numpart; + + apx += sqrt(parts[i].Pincol[0] * parts[i].Pincol[0]) / numpart; + apy += sqrt(parts[i].Pincol[1] * parts[i].Pincol[1]) / numpart; + apz += sqrt(parts[i].Pincol[2] * parts[i].Pincol[2]) / numpart; + + } + + std::cout << std::fixed << std::setprecision(10); + std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl + << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl; + + + cout << "==========================END TEST==========================" << endl; + return 0; + +} diff --git a/test/testCollimatorPhysicsMPI.cpp b/test/testCollimatorPhysicsMPI.cpp new file mode 100644 index 0000000..22e8a84 --- /dev/null +++ b/test/testCollimatorPhysicsMPI.cpp @@ -0,0 +1,126 @@ +#include + +#include + +#include "DKSBase.h" +#include "cuda_runtime.h" + +#include + +using namespace std; + +typedef struct { + int label; + unsigned localID; + double Rincol[3]; + double Pincol[3]; + long IDincol; + int Binincol; + double DTincol; + double Qincol; + long LastSecincol; + double Bfincol[3]; + double Efincol[3]; +} PART; + +PART initPart(int d) { + + PART p; + p.label = d; + p.localID = d; + for (int i = 0; i < 3; i++) { + p.Rincol[i] = 0.5;// / (d+1); + p.Pincol[i] = 0.5;// / (d+1); + p.Bfincol[i] = 1.0 / (d+1); + p.Efincol[i] = 1.0 / (d+1); + } + p.IDincol = d; + p.Binincol = d; + p.DTincol = d; + p.Qincol = d; + p.LastSecincol = d; + + 
return p; + +} + +void printPart(PART p) { + + cout << "label: " << p.label << ", "; + //cout << "localID: " << p.localID << ", "; + cout << "Rincol: " << p.Rincol[0] << ", " << p.Rincol[1] << ", " << p.Rincol[2] << ", "; + cout << "Pincol: " << p.Pincol[0] << ", " << p.Pincol[1] << ", " << p.Pincol[2] << ", "; + //cout << "IDincol: " << p.IDincol << ", Binincol: " << p.Binincol << ", "; + //cout << "DTincol: " << p.DTincol << ", Qincol: " << p.Qincol << ", LastSecincol: " << p.LastSecincol << ", "; + //cout << "Bfincol: " << p.Bfincol[0] << ", " << p.Bfincol[1] << ", " << p.Bfincol[2] << ", "; + //cout << "Efincol: " << p.Efincol[0] << ", " << p.Efincol[1] << ", " << p.Efincol[2] << endl; + cout << endl; + + +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + int numpart = 500501; + + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.callInitRandoms(numpart); + + PART tmp; + vector p; + vector p_out; + p_out.resize(numpart); + + for (int i = 0; i < numpart; i++) { + tmp = initPart(i + 1); + p.push_back(tmp); + } + + if (numpart <= 20) { + for (int i = 0; i < 10; i++) + printPart(p[i]); + cout << endl; + } + + double params[19]; + for (int i = 0; i < 19; i++) + params[i] = 0.05; + params[0] = 0; + params[1] = 1; + + void *mem_ptr, *par_ptr; + + par_ptr = base.allocateMemory(19, ierr); + base.writeData(par_ptr, params, 19); + + mem_ptr = base.allocateMemory(numpart, ierr); + base.writeData(mem_ptr, &p[0], numpart); + + int addback, dead; + for (int i = 0; i < 100; i++) + base.callCollimatorPhysics(mem_ptr, par_ptr, numpart, 19, addback, dead); + cout << "Add back: " << addback << ", dead: " << dead << endl; + + base.readData(mem_ptr, &p_out[0], numpart); + base.freeMemory(mem_ptr, ierr); + base.freeMemory(par_ptr, ierr); + + if (numpart <= 20) { + for (int i = 0; i < numpart; i++) + 
printPart(p_out[i]); + } + + MPI_Finalize(); + return 0; + +} diff --git a/test/testCollimatorPhysicsSoA.cpp b/test/testCollimatorPhysicsSoA.cpp new file mode 100644 index 0000000..bc4bf0b --- /dev/null +++ b/test/testCollimatorPhysicsSoA.cpp @@ -0,0 +1,250 @@ +#include +#include + +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" +#include + +using namespace std; + +typedef struct { + int *label; + unsigned *localID; + double *rx; + double *ry; + double *rz; + double *px; + double *py; + double *pz; +} PART; + + +void initParts(int *label, unsigned *localID, double *rx, double *ry, double *rz, + double *px, double *py, double *pz, int npart) { + + for (int i = 0; i < npart; i++) { + label[i] = 0; + localID[i] = i; + rx[i] = 0.0; + ry[i] = 0.0; + rz[i] = 0.02; + px[i] = 0.0; + py[i] = 0.0; + pz[i] = 3.9920183237269791e-01; + } +} + +void initParams(double *data) { + data[0] = 0.0;//2.0000000000000000e-02; + data[1] = 1.0;//1.0000000000000000e-02; + data[2] = 2.2100000000000000e+00; + data[3] = 6.0000000000000000e+00; + data[4] = 1.2010700000000000e+01; + data[5] = 2.6010000000000000e+00; + data[6] = 1.7010000000000000e+03; + data[7] = 1.2790000000000000e+03; + data[8] = 1.6379999999999999e-02; + data[9] = 1.9321266968325795e-01; + data[10] = 7.9000000000000000e+01; + data[11] = 1.0000000000000002e-12; + +} + +int main(int argc, char *argv[]) { + + int loop = 10; + int numpart = 1e5; + char *api_name = new char[10]; + char *device_name = new char[10]; + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-npart")) { + numpart = atoi(argv[i+1]); + i++; + } + + if (argv[i] == string("-loop")) { + loop = atoi(argv[i+1]); + i++; + } + + } + + int threads = 0; + /* +#pragma offload target(mic:0) out(threads) + { + #pragma omp parallel + { + threads = 
omp_get_num_threads(); + } + } + */ + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of particles: " << numpart << endl; + cout << "Number of loops: " << loop << endl; + cout << "Number of threads: " << threads << endl; + cout << "------------------------------------------------------------" << endl; + + //init part vector to test mc + //int *label; + //unsigned *localID; + //double *rx, *ry, *rz, *px, *py, *pz; + PART p; + p.label = (int*) _mm_malloc(sizeof(int)*numpart, 64); + p.localID = (unsigned*) _mm_malloc(sizeof(int)*numpart, 64); + p.rx = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.ry = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.rz = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.px = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.py = (double*) _mm_malloc(sizeof(double)*numpart, 64); + p.pz = (double*) _mm_malloc(sizeof(double)*numpart, 64); + initParts(p.label, p.localID, p.rx, p.ry, p.rz, p.px, p.py, p.pz, numpart); + + double *params = new double[12]; + initParams(params); + + //init dks + int ierr; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + //init random + base.callInitRandoms(numpart); + + //**test collimator physics and sort***// + void *label_ptr, *localID_ptr, *rx_ptr, *ry_ptr, *rz_ptr, *px_ptr, *py_ptr, *pz_ptr, *param_ptr; + + //allocate memory for particles + label_ptr = base.allocateMemory(numpart, ierr); + localID_ptr = base.allocateMemory(numpart, ierr); + rx_ptr = base.allocateMemory(numpart, ierr); + ry_ptr = base.allocateMemory(numpart, ierr); + rz_ptr = base.allocateMemory(numpart, ierr); + px_ptr = base.allocateMemory(numpart, ierr); + py_ptr = base.allocateMemory(numpart, ierr); + pz_ptr = base.allocateMemory(numpart, ierr); + + param_ptr = base.allocateMemory(12, ierr); + + //transfer data to 
device + base.writeData(label_ptr, p.label, numpart); + base.writeData(localID_ptr, p.localID, numpart); + base.writeData(rx_ptr, p.rx, numpart); + base.writeData(ry_ptr, p.ry, numpart); + base.writeData(rz_ptr, p.rz, numpart); + base.writeData(px_ptr, p.px, numpart); + base.writeData(py_ptr, p.py, numpart); + base.writeData(pz_ptr, p.pz, numpart); + + //transfer params to device + base.writeData(param_ptr, params, 12); + + std::cout << "test runs" << std::endl; + + int numaddback; + //test calls to do some first executions + base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart, numaddback); + base.syncDevice(); + + struct timeval timeStart, timeEnd; + std::cout << "Start MC" << std::endl; + + gettimeofday(&timeStart, NULL); + for (int i = 0; i < loop; i++) { + base.callCollimatorPhysicsSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart); + base.callCollimatorPhysicsSortSoA(label_ptr, localID_ptr, rx_ptr, ry_ptr, rz_ptr, px_ptr, + py_ptr, pz_ptr, param_ptr, numpart, numaddback); + base.syncDevice(); + } + gettimeofday(&timeEnd, NULL); + + std::cout << "addback: " << numaddback << std::endl; + + std::cout << "End MC" << std::endl; + double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)); + + std::cout << "Time for " << numpart << " MC runs: " << t * 1e-6 << "s" << std::endl; + std::cout << "Average time for MC run: " << t * 1e-6 / loop << std::endl; + + //read data from device + base.readData(label_ptr, p.label, numpart); + base.readData(localID_ptr, p.localID, numpart); + base.readData(rx_ptr, p.rx, numpart); + base.readData(ry_ptr, p.ry, numpart); + base.readData(rz_ptr, p.rz, numpart); + base.readData(px_ptr, p.px, numpart); + base.readData(py_ptr, p.py, numpart); + 
base.readData(pz_ptr, p.pz, numpart); + + //free memory + base.freeMemory(label_ptr, numpart); + base.freeMemory(localID_ptr, numpart); + base.freeMemory(rx_ptr, numpart); + base.freeMemory(ry_ptr, numpart); + base.freeMemory(rz_ptr, numpart); + base.freeMemory(px_ptr, numpart); + base.freeMemory(py_ptr, numpart); + base.freeMemory(pz_ptr, numpart); + + base.freeMemory(param_ptr, 12); + + /* + std::cout << std::fixed << std::setprecision(4); + for (int i = 0; i < 10; i++) { + std::cout << p.label[i] << "\t" << p.rx[i] + << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] + << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; + } + std:: cout << "..." << std::endl; + + for (int i = numpart - 10; i < numpart; i++) { + std::cout << p.label[i] << "\t" << p.rx[i] + << "\t" << p.ry[i] << "\t" << p.rz[i] << "\t" << p.px[i] + << "\t" << p.py[i] << "\t" << p.pz[i] << std::endl; + } + + double arx = 0, ary = 0, arz = 0; + double apx = 0, apy = 0, apz = 0; + for (int i = 0; i < numpart; i++) { + + arx += sqrt(p.rx[i] * p.rx[i]) / numpart; + ary += sqrt(p.ry[i] * p.ry[i]) / numpart; + arz += sqrt(p.rz[i] * p.rz[i]) / numpart; + + apx += sqrt(p.px[i] * p.px[i]) / numpart; + apy += sqrt(p.py[i] * p.py[i]) / numpart; + apz += sqrt(p.pz[i] * p.pz[i]) / numpart; + + } + + std::cout << std::fixed << std::setprecision(10); + std::cout << "R (" << arx << ", " << ary << ", " << arz << ") " << std::endl + << "P (" << apx << ", " << apy << ", " << apz << ") " << std::endl; + */ + cout << "==========================END TEST==========================" << endl; + return 0; + +} diff --git a/test/testDKS.cpp b/test/testDKS.cpp new file mode 100644 index 0000000..4b66732 --- /dev/null +++ b/test/testDKS.cpp @@ -0,0 +1,15 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + DKSBase base = DKSBase(); + base.getDevices(); + + return 0; +} + diff --git a/test/testFFT.cpp b/test/testFFT.cpp new file mode 100644 index 0000000..c3fec1b 
--- /dev/null +++ b/test/testFFT.cpp @@ -0,0 +1,83 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << "\t" << device_name << endl; + + cout << "Begin DKS Base tests" << endl; + + int N = 2; + int dimsize[3] = {N, N, N}; + + complex *cdata = new complex[N]; + complex *cfft = new complex[N]; + for (int i = 0; i < N; i++) { + cdata[i] = complex(0, 0); + cfft[i] = complex(0, 0); + } + + cdata[0] = complex(1.73205, 1.73205); + + timestamp_t t0, t1; + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + void *mem_ptr; + int ierr; + + /* write data to device */ + mem_ptr = base.pushData< complex >( (const void*)cdata, N, ierr); + + /* execute fft */ + base.callFFT(mem_ptr, 1, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 1, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 1, dimsize); + + /* read data from device */ + base.pullData< complex >(mem_ptr, cfft, N); + + /* print results */ + + cout << "Data" << endl; + for (int i = 0; i < N; i++) + cout << cdata[i] << "\t"; + cout << endl; + + cout << "FFT" << endl; + for (int i = 0; i < N; i++) + cout << cfft[i] << "\t"; + cout << endl; + + + return 0; +} + diff --git a/test/testFFT3D.cpp b/test/testFFT3D.cpp new file mode 100644 index 0000000..ff14242 --- /dev/null +++ b/test/testFFT3D.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using 
namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void printData3DN4(complex* &data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); + +/* usage - ./testFFT3D */ +int main(int argc, char *argv[]) { + + int N = 16; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + N = atoi(argv[1]); + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, "-gpu"); + } else if (argc == 4) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, argv[3]); + } else { + N = 16; + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << ", " << device_name << endl; + + int dimsize[3] = {N, N, N}; + + cout << "Begin DKS Base tests, N = " << N << endl; + + int dim = 3; + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)k / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + base.setupFFT(3, dimsize); + + void *mem_ptr; + int ierr; + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex 
>(mem_ptr, cifft, N*N*N); + + /* free device memory */ + base.freeMemory< complex >(mem_ptr, N*N*N); + + /* compare results */ + compareData(cdata, cifft, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) { + cout << data[i*ni*ni + j*nj + k].real() << " "; + cout << data[i*ni*ni + j*nj + k].imag() << "\t"; + } else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFT3DRC.cpp b/test/testFFT3DRC.cpp new file mode 100644 index 0000000..b0a0625 --- /dev/null +++ b/test/testFFT3DRC.cpp @@ -0,0 +1,199 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim); +void initData(double *data, int dimsize[3]); +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop); +void printHelp(); + +int main(int argc, char *argv[]) { + + int N1 = 8; + int N2 = 8; + int N3 = 8; + int dim = 3; + int loop = 10; + + if ( readParams(argc, argv, N1, N2, N3, loop) ) + return 0; + + int dimsize[3] = {N3, N2, N1}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = (dimsize[0]/2+1) * dimsize[1] *dimsize[2]; + + double *rdata = new double[sizereal]; + double *outdata = new double[sizereal]; + complex *cfft = new complex[sizecomp]; + + for (int i=0; iREAL) */ + base.setupFFTCR(dim, dimsize,1./(N1*N2*N3)); +#endif + +#ifdef DKS_CUDA + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.setupFFT(dim, dimsize); +#endif + + // allocate memory on device + int ierr; + void *real_ptr, *comp_ptr, *real_res_ptr; + real_ptr = base.allocateMemory(sizereal, ierr); + real_res_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< std::complex >(sizecomp, ierr); + + // execute one run before starting the timers + base.writeData(real_ptr, rdata, sizereal); + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); + 
base.readData(real_res_ptr, outdata, sizereal); + + //timer for total loop time, FFT and IFFT calls + struct timeval timeStart, timeEnd; + struct timeval timeFFTStart[loop], timeFFTEnd[loop]; + struct timeval timeIFFTStart[loop], timeIFFTEnd[loop]; + + gettimeofday(&timeStart, NULL); + for (int i=0; i(real_ptr, rdata, sizereal); + + // execute rcfft + gettimeofday(&timeFFTStart[i], NULL); + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + gettimeofday(&timeFFTEnd[i], NULL); + + // execute crfft + gettimeofday(&timeIFFTStart[i], NULL); + base.callC2RFFT(real_res_ptr, comp_ptr, dim, dimsize); + gettimeofday(&timeIFFTEnd[i], NULL); + + //normalize +#ifdef DKS_CUDA + base.callNormalizeC2RFFT(real_res_ptr, dim, dimsize); +#endif + + // read IFFT data from device + base.readData(real_res_ptr, outdata, sizereal); + + } + gettimeofday(&timeEnd, NULL); + + // free device memory + base.freeMemory< std::complex >(comp_ptr, sizecomp); + base.freeMemory(real_ptr, sizereal); + base.freeMemory(real_res_ptr, sizereal); + + // compare in and out data to see if we get back the same results + compareData(rdata, outdata, N1, N2, N3, dim); + + //calculate seconds for total time and fft times + double tfft = 0; + double tifft = 0; + double ttot = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1e6 + + (timeEnd.tv_usec - timeStart.tv_usec) ) * 1e-6; + + for (int i = 0; i < loop; i++) { + tfft += ( (timeFFTEnd[i].tv_sec - timeFFTStart[i].tv_sec) * 1e6 + + (timeFFTEnd[i].tv_usec - timeFFTStart[i].tv_usec) ) * 1e-6; + + tifft += ( (timeIFFTEnd[i].tv_sec - timeIFFTStart[i].tv_sec) * 1e6 + + (timeIFFTEnd[i].tv_usec - timeIFFTStart[i].tv_usec) ) * 1e-6; + } + + //print timing results + std::cout << std::fixed << std::setprecision(5) << "\nTiming results" + << "\nTotal time\t" << ttot << "s\tavg time\t" << ttot / loop << "s" + << "\nFFT total\t" << tfft << "s\tFFT avg \t" << tfft / loop << "s" + << "\nIFFT total\t" << tifft << "s\tIFFT avg\t" << tifft / loop << "s" + << "\n\n"; + + return 0; +} + 
+void compareData(double* data1, double* data2, int NI, int NJ, int NK, int dim) { + int id; + double sum = 0; + for (int i = 0; i < NI; i++) { + for (int j = 0; j < NJ; j++) { + for (int k = 0; k < NK; k++) { + id = k*NI*NJ + j*NI + i; + sum += fabs(data1[id] - data2[id]); + } + } + } + std::cout << "RC <--> CR diff: " << sum << std::endl; +} + +void initData(double *data, int dimsize[3]) { + for (int i = 0; i < dimsize[2]; i++) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; + } + } + } +} + +void printHelp() { + std::cout << std::endl; + + std::cout << "testFFT3DRC executes 3D real complex and 3D complex real" + << "function on the Intel MIC.\n"; + std::cout << "Operations performed by testRC are: " + << "write data to MIC -> FFT -> IFFT -> read data from MIC.\n"; + std::cout << "To run testFFT3DRC execute: ./testFFT3DRC -grid $x $y $z " + << "-loop $l\n"; + std::cout << "where $x $y $z are number of elements in each dimension and " + << "$l is the number of times all the operations will be performed.\n"; + + std::cout << std::endl; +} + +bool readParams(int argc, char *argv[], int &N1, int &N2, int &N3, int &loop) { + + for (int i = 1; i < argc; i++) { + + if ( argv[i] == std::string("-grid") ) { + N1 = atoi(argv[i + 1]); + N2 = atoi(argv[i + 2]); + N3 = atoi(argv[i + 3]); + i += 3; + } + + if ( argv[i] == std::string("-loop") ) { + loop = atoi(argv[i + 1]); + i += 1; + } + + if ( argv[i] == std::string("-h") || argv[i] == std::string("-help") ) { + printHelp(); + return true; + } + } + + return false; +} diff --git a/test/testFFT3DRC_MIC.cpp b/test/testFFT3DRC_MIC.cpp new file mode 100644 index 0000000..9eafe04 --- /dev/null +++ b/test/testFFT3DRC_MIC.cpp @@ -0,0 +1,220 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void 
printData3DN4(complex* &data, int N, int dim); +void printData3DN4(double* data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); +void compareData(double* data1, double* data2, int N, int dim); + +/* Compute (K*L)%M accurately */ +static double moda(int K, int L, int M) +{ + return (double)(((long long)K * L) % M); +} +/* Initialize array x(N) to produce unit peaks at x(H) and x(N-H) */ +static void init_r(double *x, int N1, int N2, int N3, int H1=-1, int H2=2, int H3=4) +{ + double TWOPI = 6.2831853071795864769, phase, factor; + int n1, n2, n3, S1, S2, S3, index; + + /* Generalized strides for row-major addressing of x */ + S3 = 1; + S2 = (N3/2+1)*2; + S1 = N2*(N3/2+1)*2; + + factor = ((N1-H1%N1)==0 && (N2-H2%N2)==0 && (N3-H3%N3)==0) ? 1.0 : 2.0; + for (n1 = 0; n1 < N1; n1++) + { + for (n2 = 0; n2 < N2; n2++) + { + for (n3 = 0; n3 < N3; n3++) + { + phase = moda(n1,H1,N1) / N1; + phase += moda(n2,H2,N2) / N2; + phase += moda(n3,H3,N3) / N3; + index = n1*S1 + n2*S2 + n3*S3; + //cout << "index = " << index << endl; + x[index] = factor * cos( TWOPI * phase ) / (N1*N2*N3); + } + } + } +} + + +int main(int argc, char *argv[]) { + + int N = atoi(argv[1]); + int dim = 3; + int dimsize[3] = {N, N, N}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = (dimsize[0]/2 + 1) * dimsize[1] * dimsize[2]; + + //double *rdata = new double[sizereal]; + //double *outdata = new double[sizereal]; + //complex *cfft = new complex[sizecomp]; + double *rdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double)); + double *outdata =(double *)malloc(N*N*(N/2+1)*2*sizeof(double)); + complex *cfft = (complex *)malloc(sizecomp*sizeof(complex)); + + init_r(rdata, N,N,N); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI("OpenMP", 6); + base.setDevice("-mic", 4); + base.initDevice(); + + /* setup forward fft (REAL->COMPLEX) */ + base.setupFFTRC(dim, dimsize); + + int ierr; + void 
*real_ptr, *comp_ptr; + + /* allocate memory on device */; + real_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + /* write data to device */ + base.writeData(real_ptr, rdata, sizereal); + + //printData3DN4(rdata,N,3); + + /* execute rcfft */ + base.callR2CFFT(real_ptr, comp_ptr, dim, dimsize); + + /* read FFT data from device */ + base.readData< complex >(comp_ptr, cfft, sizecomp); + base.writeData(comp_ptr, cfft, sizereal); + + + /* setup backward fft (COMPLEX->REAL) */ + base.setupFFTCR(dim, dimsize,1./(N*N*N)); + /* execute crfft */ + base.callC2RFFT(real_ptr, comp_ptr, dim, dimsize); + + /* normalize */ + //base.callNormalizeC2RFFT(real_ptr, dim, dimsize); + + /* read FFT data from device */ + //base.readData< complex >(comp_ptr, cfft, sizecomp); + + /* read IFFT data from device */ + base.readData(real_ptr, outdata, sizereal); + + /* free device memory */ + base.freeMemory< complex >(comp_ptr, sizecomp); + base.freeMemory(real_ptr, sizereal); + + /* compare data */ + compareData(rdata, outdata, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) + cout << data[i*ni*ni + j*nj + k].real() << "\t"; + else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} +void printData3DN4(double* data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k]; + //double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + //if (a < 10e-5 && a > -10e-5) + // a = 0; + + cout << d << "\t"; + } + } + cout << endl; + } + cout << endl; + +} +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + +void compareData(double* data1, double* data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + //sum += fabs(data1[id] - data2[id]/(N*N*N)); + sum += fabs(data1[id] - data2[id]); + } + } + } + cout << "Size " << N << " RC <--> CR diff: " << sum << endl; +} diff --git a/test/testFFT3DSO.cpp b/test/testFFT3DSO.cpp new file mode 100644 index 0000000..ff14242 --- /dev/null +++ b/test/testFFT3DSO.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(complex* &data, int N, int dim, bool normalize = false); +void printData3DN4(complex* &data, int N, int dim); + +void compareData(complex* &data1, complex* &data2, int N, int dim); + +/* usage - ./testFFT3D */ +int main(int argc, char *argv[]) { + + int N = 16; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + N = atoi(argv[1]); + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, "-gpu"); + } else if (argc == 4) { + N = atoi(argv[1]); + strcpy(api_name, argv[2]); + strcpy(device_name, argv[3]); + } else { + N = 16; + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << ", " << device_name << endl; + + int dimsize[3] = {N, N, N}; + + cout << "Begin DKS Base tests, N = " << N << endl; + + int dim = 3; + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)k / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI(api_name, 
strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + base.setupFFT(3, dimsize); + + void *mem_ptr; + int ierr; + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex >(mem_ptr, cifft, N*N*N); + + /* free device memory */ + base.freeMemory< complex >(mem_ptr, N*N*N); + + /* compare results */ + compareData(cdata, cifft, N, dim); + + return 0; +} + +void printData(complex* &data, int N, int dim, bool normalize) { + int ni, nj, nk; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + if (!normalize) { + cout << data[i*ni*ni + j*nj + k].real() << " "; + cout << data[i*ni*ni + j*nj + k].imag() << "\t"; + } else + cout << data[i*ni*ni + j*nj + k].real() / N << "\t"; + } + cout << endl; + } + cout << endl; + } +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? 
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFT3DTiming.cpp b/test/testFFT3DTiming.cpp new file mode 100644 index 0000000..27ef7cf --- /dev/null +++ b/test/testFFT3DTiming.cpp @@ -0,0 +1,130 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void compareData(complex* &data1, complex* &data2, int N, int dim); + + +int main(int argc, char *argv[]) { /* usage: ./testFFT3DTiming [api [device [N]]]; times alloc -> write -> FFT -> IFFT -> normalize -> read -> free */ + + int N = 4; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc > 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + if (argc > 3) N = atoi(argv[3]); /* argv[3] is NULL when only api and device are given; keep the default N then */ + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + int dimsize[3] = {N, N, N}; + + + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests, N = " << N << endl; + + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cifft = new complex[N*N*N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cdata[i*N*N + j*N + k] = complex((double)i / N, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cifft[i*N*N + j*N + k] = complex(0, 0); + } + } + } + + timestamp_t t0, t1; + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); /* the length must describe device_name, not api_name */ + base.initDevice(); + + void *mem_ptr; + int ierr; + + /* run a test pass first to initialize the device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); +
base.callFFT(mem_ptr, 3, dimsize); + base.callIFFT(mem_ptr, 3, dimsize); + base.callNormalizeFFT(mem_ptr, 3, dimsize); + base.readData< complex >(mem_ptr, cifft, N*N*N); + base.freeMemory< complex >(mem_ptr, N*N*N); + /* end test */ + + int steps = 10; + base.oclClearEvents(); + t0 = get_timestamp(); + for (int i = 0; i < steps; i++) { + + /* allocate memory on device */ + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + + /* write data to device */ + ierr = base.writeData< complex >(mem_ptr, cdata, N*N*N); + + /* execute fft */ + base.callFFT(mem_ptr, 3, dimsize); + + /* execute ifft */ + base.callIFFT(mem_ptr, 3, dimsize); + + /* execute normalize */ + base.callNormalizeFFT(mem_ptr, 3, dimsize); + + /* read data from device */ + base.readData< complex >(mem_ptr, cifft, N*N*N); + + /* free device memory; element count matches the N*N*N allocation above */ + base.freeMemory< complex >(mem_ptr, N*N*N); + + //compareData(cdata, cifft, N, 3); + } + t1 = get_timestamp(); + + cout << "=========================" << endl; + //base.oclEventInfo(); + cout << "Average total: " << get_secs(t0, t1) / steps << endl; + cout << "=========================" << endl; + + + + + return 0; +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ?
N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testFFTAsync.cpp b/test/testFFTAsync.cpp new file mode 100644 index 0000000..89550a9 --- /dev/null +++ b/test/testFFTAsync.cpp @@ -0,0 +1,117 @@ +#include +#include +#include + +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + + + +using namespace std; + +void initData(double *data, int dimsize[3]) { + for (int i = 0; i < dimsize[2]; i++) { + for (int j = 0; j < dimsize[1]; j++) { + for (int k = 0; k < dimsize[0]; k++) { + data[i*dimsize[1]*dimsize[0] + j*dimsize[0] + k] = k; + } + } + } +} + +int main(int argc, char *argv[]) { + + int N = 8; + if (argc == 2) + N = atoi(argv[1]); + + int N1 = N; + int N2 = N; + int N3 = N; + int dim = 3; + + int dimsize[3] = {N3, N2, N1}; + int sizereal = dimsize[0] * dimsize[1] * dimsize[2]; + int sizecomp = dimsize[0] * dimsize[1] * (dimsize[2]/2+1); + + double *data1 = new double[sizereal]; + double *data2 = new double[sizereal]; + + initData(data1, dimsize); + initData(data2, dimsize); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.setupFFT(3, dimsize); + + /* pagelock data */ + base.allocateHostMemory(data1, sizereal); + base.allocateHostMemory(data2, sizereal); + + /* create streams */ + int fft1, fft2; + base.createStream(fft1); + base.createStream(fft2); + + int ierr; + void *real_ptr1, *real_ptr2, *comp_ptr1, *comp_ptr2; + + cout << "allocating memory ..." 
<< endl; + /* allocate memory on device */; + real_ptr1 = base.allocateMemory(sizereal, ierr); + real_ptr2 = base.allocateMemory(sizereal, ierr); + comp_ptr1 = base.allocateMemory< complex >(sizecomp*2, ierr); + comp_ptr2 = base.allocateMemory< complex >(sizecomp*2, ierr); + + cufftHandle defaultPlan; + cudaStream_t cfft1, cfft2; + cufftPlan3d(&defaultPlan, N1, N2, N3, CUFFT_D2Z); + cudaStreamCreate(&cfft1); + cudaStreamCreate(&cfft2); + + + for (int i = 0; i < 5; i++) { + + cufftHandle plan = defaultPlan; + + cout << "Iteration: " << i << endl; + /* write data to device */ + base.writeDataAsync(real_ptr1, data1, sizereal, fft1); + //cudaMemcpyAsync( (double*)real_ptr1,data1,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft1); + + /* execute rcfft */ + base.callR2CFFT(real_ptr1, comp_ptr1, dim, dimsize, fft1); + //cufftSetStream(plan, cfft1); + //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr1, (cufftDoubleComplex*)comp_ptr2); + + /* write data to device */ + base.writeDataAsync(real_ptr2, data2, sizereal, fft2); + //cudaMemcpyAsync( (double*)real_ptr2,data2,sizeof(double)*sizereal,cudaMemcpyHostToDevice,cfft2); + + /* execute rcfft */ + base.callR2CFFT(real_ptr2, comp_ptr2, dim, dimsize, fft2); + //cufftSetStream(plan, cfft2); + //cufftExecD2Z(plan, (cufftDoubleReal*)real_ptr2, (cufftDoubleComplex*)comp_ptr2); + + } + + base.freeMemory(real_ptr1, sizereal); + base.freeMemory(real_ptr2, sizereal); + base.freeMemory< complex >(comp_ptr1, sizecomp*2); /* element count must match the sizecomp*2 allocation */ + base.freeMemory< complex >(comp_ptr2, sizecomp*2); + + /* free pagelock data */ + base.freeHostMemory(data1, sizereal); + base.freeHostMemory(data2, sizereal); + + return 0; + +} diff --git a/test/testFFTSolver.cpp b/test/testFFTSolver.cpp new file mode 100644 index 0000000..4f01bdc --- /dev/null +++ b/test/testFFTSolver.cpp @@ -0,0 +1,301 @@ +#include +#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void
printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + } +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + +int main(int argc, char *argv[]) { + + /* mpi init */ + int rank, nprocs; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + if (nprocs != 8) { + cout << "example was set to run with 8 processes" << endl; + cout << "exit..." 
<< endl; + return 0; + } + + /* set domain size */ + int NG[3] = {64, 64, 32}; + int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; + int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; + int sizerho = NG[0] * NG[1] * NG[2]; + int sizegreen = ng[0] * ng[1] * ng[2]; + int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; + int id[3]; + + id[0] = 0; + id[1] = NL[1] * (rank % 4); + id[2] = NL[2] * (rank / 4); + + /* print some messages bout the example in the begginig */ + if (rank == 0) { + cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; + cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; + cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; + cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; + int tmp[3]; + for (int p = 1; p < nprocs; p++) { + MPI_Status mpistatus; + MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus); + cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl; + } + } else { + MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD); + } + + /* dks init and create 2 streams */ + int dkserr; + int streamGreens, streamFFT; + DKSBase base;// = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + base.createStream(streamFFT); + if (rank == 0) { + base.createStream(streamGreens); + base.setupFFT(3, NG); + } + + /* allocate memory and init rho field */ + double *rho = new double[sizerho]; + double *rho_out = new double[sizerho]; + //double *green_out = new double[sizegreen]; + initMirror(rho, NL[0], NL[1], NL[2]); + + /* + allocate memory on device for + - rho field + - rho FFT + - tmpgreen + - greens integral + - greens integral FFT + */ + void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; + if (rank == 0) { + tmpgreen_ptr = base.allocateMemory(sizegreen, dkserr); + rho2_ptr = base.allocateMemory(sizerho, dkserr); + grn_ptr = base.allocateMemory(sizerho, dkserr); + rho2tr_ptr = 
base.allocateMemory< complex >(sizecomp, dkserr); + grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + } else { + grntr_ptr = NULL; + rho2_ptr = NULL; + grn_ptr = NULL; + rho2tr_ptr = NULL; + tmpgreen_ptr = NULL; + } + + /* send and receive pointer to allocated memory on device */ + if (rank == 0) { + for (int p = 1; p < nprocs; p++) + base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); + } else { + rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr); + } + MPI_Barrier(MPI_COMM_WORLD); + + /* =================================================*/ + /* =================================================*/ + /* =====loop trough fftpoison solver iterations=====*/ + /* =================================================*/ + /* =================================================*/ + + double old_sum = 0; + double tmp_sum = 0; + for (int l = 0; l < 10000; l++) { + MPI_Barrier(MPI_COMM_WORLD); + /* on node 0, calculate tmpgreen on gpu */ + int hr_m[3] = {1, 1, 1}; + if (rank == 0) + base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], + hr_m[0], hr_m[1], hr_m[2], streamGreens); + + /* calculate greens integral on gpu */ + if (rank == 0) + base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2], streamGreens); + + /* mirror the field */ + if (rank == 0) + base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2], streamGreens); + + + /* get FFT of mirrored greens integral */ + if (rank == 0) + base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG, streamGreens); + + /* transfer rho field to device */ + base.gather3DDataAsync ( rho2_ptr, rho, NG, NL, id, streamFFT); + MPI_Barrier(MPI_COMM_WORLD); + + /* get FFT of rho field */ + if (rank == 0) { + base.syncDevice(); + base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); + } + + /* multiply both FFTs */ + if (rank == 0) + base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); + MPI_Barrier(MPI_COMM_WORLD); + + /* inverse fft and transfer data back */ + /* + multiple device syncs and mpi barriers are 
used to make sure data + transfer is started when results are ready and progam moves on + only when data transfer is finished + */ + if (rank == 0) { + base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + //cout << "result: " << sumData(rho_out, sizerho) << endl; + if (l == 0) { + old_sum = sumData(rho_out, sizerho); + } else { + tmp_sum = sumData(rho_out, sizerho); + if (old_sum != tmp_sum) { + cout << "diff in iteration: " << l << endl; + } + } + } else { + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } + + + } + /* =================================================*/ + /* =================================================*/ + /* ==========end fftpoison solver test run==========*/ + /* =================================================*/ + /* =================================================*/ + + + + /* free memory on device */ + if (rank == 0) { + base.freeMemory(tmpgreen_ptr, sizegreen); + base.freeMemory(grn_ptr, sizerho); + base.freeMemory< complex >(rho2tr_ptr, sizecomp); + base.freeMemory< complex >(grntr_ptr, sizecomp); + MPI_Barrier(MPI_COMM_WORLD); + base.freeMemory(rho2_ptr, sizerho); + cout << "Final sum: " << old_sum << endl; + } else { + base.closeHandle(rho2_ptr); + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + + +} diff --git a/test/testFFTSolver_MIC.cpp b/test/testFFTSolver_MIC.cpp new file mode 100644 index 0000000..29f84f0 --- /dev/null +++ b/test/testFFTSolver_MIC.cpp @@ -0,0 +1,319 @@ +#include +//#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, 
"") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + } +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + +int main(int argc, char *argv[]) { + + /* mpi init */ + //int rank, nprocs; + //MPI_Init(&argc, &argv); + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + /* + if (nprocs != 8) { + cout << "example was set to run with 8 processes" << endl; + cout << "exit..." 
<< endl; + return 0; + } + */ + + /* set domain size */ + int NG[3] = {64, 64, 32}; + int NL[3] = {NG[0], NG[1] / 4, NG[2] / 2}; + int ng[3] = {NG[0]/2 + 1, NG[1]/2 + 1, NG[2]/2 + 1}; + int sizerho = NG[0] * NG[1] * NG[2]; + int sizegreen = ng[0] * ng[1] * ng[2]; + int sizecomp = NG[0] * NG[1] * NG[2] / 2 + 1; + int id[3]; + + //id[0] = 0; + //id[1] = NL[1] * (rank % 4); + //id[2] = NL[2] * (rank / 4); + + /* print some messages bout the example in the begginig */ + cout << "Global domain: " << NG[0] << ", " << NG[1] << ", " << NG[2] << endl; + //cout << "Local domain: " << NL[0] << ", " << NL[1] << ", " << NL[2] << endl; + cout << "Greens domain: " << ng[0] << ", " << ng[1] << ", " << ng[2] << endl; + //cout << "Start idx0: " << id[0] << ", " << id[1] << ", " << id[2] << endl; + int tmp[3]; + /* for (int p = 1; p < nprocs; p++) { + MPI_Status mpistatus; + MPI_Recv(tmp, 3, MPI_INT, p, 1001, MPI_COMM_WORLD, &mpistatus); + cout << "Start idx" << p << ": " << tmp[0] << ", " << tmp[1] << ", " << tmp[2] << endl; + }*/ + // } else { + // MPI_Send(id, 3, MPI_INT, 0, 1001, MPI_COMM_WORLD); + // } + + /* dks init and create 2 streams */ + int dkserr; + //int streamGreens, streamFFT; +#ifdef DKS_MIC + DKSBase base; + base.setAPI("OpenMP", 6); + base.setDevice("-mic", 4); + base.initDevice(); +#endif + +#ifdef DKS_CUDA + DKSBase base; + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); +#endif + + //base.createStream(streamFFT); + //if (rank == 0) { + // base.createStream(streamGreens); + base.setupFFT(3, NG); + //} + + /* allocate memory and init rho field */ + double *rho = new double[sizerho]; + double *rho_out = new double[sizerho]; + //double *green_out = new double[sizegreen]; + initMirror(rho, NL[0], NL[1], NL[2]); + + /* + allocate memory on device for + - rho field + - rho FFT + - tmpgreen + - greens integral + - greens integral FFT + */ + void *tmpgreen_ptr, *rho2_ptr, *grn_ptr, *rho2tr_ptr, *grntr_ptr; + // if (rank == 0) { + tmpgreen_ptr = 
base.allocateMemory(sizegreen, dkserr); + rho2_ptr = base.allocateMemory(sizerho, dkserr); + grn_ptr = base.allocateMemory(sizerho, dkserr); + rho2tr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + grntr_ptr = base.allocateMemory< complex >(sizecomp, dkserr); + /* } else { + grntr_ptr = NULL; + rho2_ptr = NULL; + grn_ptr = NULL; + rho2tr_ptr = NULL; + tmpgreen_ptr = NULL; + }*/ + + + /* send and receive pointer to allocated memory on device */ + /* + if (rank == 0) { + for (int p = 1; p < nprocs; p++) + base.sendPointer( rho2_ptr, p, MPI_COMM_WORLD); + } else { + rho2_ptr = base.receivePointer(0, MPI_COMM_WORLD, dkserr); + } + MPI_Barrier(MPI_COMM_WORLD); + */ + + + /* =================================================*/ + /* =================================================*/ + /* =====loop trough fftpoison solver iterations=====*/ + /* =================================================*/ + /* =================================================*/ + + double old_sum = 0; + double tmp_sum = 0; + for (int l = 0; l < 100; l++) { + //MPI_Barrier(MPI_COMM_WORLD); + /* on node 0, calculate tmpgreen on gpu */ + int hr_m[3] = {1, 1, 1}; + //if (rank == 0) + base.callGreensIntegral(tmpgreen_ptr, ng[0], ng[1], ng[2], ng[0], ng[1], + hr_m[0], hr_m[1], hr_m[2]); + + /* calculate greens integral on gpu */ + //if (rank == 0) + base.callGreensIntegration(grn_ptr, tmpgreen_ptr, ng[0], ng[1], ng[2]); + + /* mirror the field */ + //if (rank == 0) + base.callMirrorRhoField(grn_ptr, ng[0], ng[1], ng[2]); + + + /* get FFT of mirrored greens integral */ + //if (rank == 0) + base.callR2CFFT(grn_ptr, grntr_ptr, 3, NG); + + /* transfer rho field to device */ + //base.gather3DDataAsync ( rho2_ptr, rho, NG, NL, id, streamFFT); + base.writeData(rho2_ptr, rho,NG[0]*NG[1]*NG[2]); + //MPI_Barrier(MPI_COMM_WORLD); + + /* get FFT of rho field */ + //if (rank == 0) { + //base.syncDevice(); + base.callR2CFFT(rho2_ptr, rho2tr_ptr, 3, NG); + //} + + /* multiply both FFTs */ + //if (rank == 0) + 
base.callMultiplyComplexFields(rho2tr_ptr, grntr_ptr, sizecomp); + //MPI_Barrier(MPI_COMM_WORLD); + + /* inverse fft and transfer data back */ + /* + multiple device syncs and mpi barriers are used to make sure data + transfer is started when results are ready and progam moves on + only when data transfer is finished + */ + //if (rank == 0) { + base.callC2RFFT(rho2tr_ptr, rho2_ptr, 3, NG); + //base.syncDevice(); + //MPI_Barrier(MPI_COMM_WORLD); + //base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + base.readData (rho2_ptr, rho_out, NG[0]*NG[1]*NG[2]); + //MPI_Barrier(MPI_COMM_WORLD); + //base.syncDevice(); + //MPI_Barrier(MPI_COMM_WORLD); + //cout << "result: " << sumData(rho_out, sizerho) << endl; + if (l == 0) { + old_sum = sumData(rho_out, sizerho); + } else { + tmp_sum = sumData(rho_out, sizerho); + if (old_sum != tmp_sum) { + cout << "diff in iteration: " << l << endl; + } + } + /*} else { + MPI_Barrier(MPI_COMM_WORLD); + base.scatter3DDataAsync (rho2_ptr, rho_out, NG, NL, id); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } + */ + + + } +/* =================================================*/ +/* =================================================*/ +/* ==========end fftpoison solver test run==========*/ +/* =================================================*/ +/* =================================================*/ + + + +/* free memory on device */ +//if (rank == 0) { +base.freeMemory(tmpgreen_ptr, sizegreen); +base.freeMemory(grn_ptr, sizerho); +base.freeMemory< complex >(rho2tr_ptr, sizecomp); +base.freeMemory< complex >(grntr_ptr, sizecomp); +//MPI_Barrier(MPI_COMM_WORLD); +base.freeMemory(rho2_ptr, sizerho); +cout << "Final sum: " << old_sum << endl; +/*} else { + base.closeHandle(rho2_ptr); + MPI_Barrier(MPI_COMM_WORLD); + }*/ + +//MPI_Finalize(); + + +} diff --git a/test/testGather.cpp b/test/testGather.cpp new file mode 100644 index 0000000..e0f8eaf --- /dev/null +++ b/test/testGather.cpp @@ -0,0 +1,172 @@ +#include +#include 
+#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + + +void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { + + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nz; i++) { + for (int j = 0; j < ny; j++) { + for (int k = 0; k < nx; k++) { + cout << data[i*ny*nx + j*nx + k] << "\t"; + } + cout << endl; + } + cout << endl; + } +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int N_global[3] = {64, 64, 32}; + int N_local[3] = {64, 32, 16}; + int n = N_local[0] * N_local[1] * N_local[2]; + + int idx[4] = {0, 0, 0, 0}; + int idy[4] = {0, 32, 0, 32}; + int idz[4] = {0, 0, 16, 16}; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { + hdata_in = new int[n]; + cout << "pinned allocation failed!" 
<< endl; + } + initData(hdata_in, n, rank); + + + for (int i = 0; i < 2; i++) { + + MPI_Barrier(MPI_COMM_WORLD); + if (i == 1) + nvtxMarkA("start gather"); + + if (rank == 0) { + + void *mem_ptr, *tmpgreen_ptr; + + mem_ptr = base.allocateMemory(nprocs*n, ierr); + + //call another kernel + int sizegreen = 33 * 33 * 17; + tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); + nvtxMarkA("call green"); + base.callGreensIntegral(tmpgreen_ptr, 33, 33, 17, 33, 33, 0.001, 0.001, 0.00007); + + nvtxMarkA("call gather"); + base.gather3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + //read and print data once for debug only + /* + if (i == 0 && nprocs*n < 257) { + int *hdata_out_all = new int[nprocs*n]; + base.readData(mem_ptr, hdata_out_all, n*nprocs); + printData3D2(hdata_out_all, N_global[0], N_global[1], N_global[2]); + } + + else { + int *hout_data = new int[nprocs*n]; + base.readData(mem_ptr, hout_data, nprocs*n); + int sum = 0; + for (int s = 0; s < nprocs*n; s++) + sum += hout_data[s]; + + cout << "Sum: " << sum << endl; + } + */ + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("call scatter"); + base.scatter3DData(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + base.freeMemory(mem_ptr, n*nprocs); + base.freeMemory(tmpgreen_ptr, sizegreen); + + } else { + + nvtxMarkA("call gather"); + base.gather3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("call scatter"); + base.scatter3DData(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD); + } + + if (i == 1) + nvtxMarkA("end gather"); + + } + + MPI_Barrier(MPI_COMM_WORLD); + base.freeHostMemory(hdata_in, n); + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGatherAsync.cpp b/test/testGatherAsync.cpp new file mode 100644 index 0000000..4fe35b5 --- 
/dev/null +++ b/test/testGatherAsync.cpp @@ -0,0 +1,144 @@ +#include +#include +#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + //mpi copy + int n = 32*16*16; + int N_global[3] = {32, 32, 32}; + int N_local[3] = {32, 16, 16}; + int idx[4] = {0, 0, 0, 0}; + int idy[4] = {0, 0, 16, 16}; + int idz[4] = {0, 16, 0, 16}; + + //greens kernel + int n1 = 33; + int n2 = 33; + int n3 = 17; + int sizegreen = n1*n2*n3; + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, n) != DKS_SUCCESS) { + hdata_in = new int[n]; + cout << "pinned allocation failed!" 
<< endl; + } + initData(hdata_in, n, rank); + + int stream2; + for (int i = 0; i < 2; i++) { + + if (rank == 0) { + if (i == 0) { + cudaProfilerStart(); + base.createStream(stream2); + } + + nvtxMarkA("start gather"); + + void *mem_ptr, *green_ptr; + + mem_ptr = base.allocateMemory(nprocs*n, ierr); + green_ptr = base.allocateMemory(sizegreen, ierr); + + nvtxMarkA("call gather"); + MPI_Request request; + MPI_Status status; + + base.gather3DDataAsync(mem_ptr, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, + request); + + + nvtxMarkA("call kernel"); + base.callGreensIntegral(green_ptr, n1, n2, n3, n1-1, n2-1, + 4.160715e-03, 4.474911e-03, 1.247311e-02, stream2); + + MPI_Wait(&request, &status); + + + base.freeMemory(mem_ptr, n*nprocs); + base.freeMemory(green_ptr, sizegreen); + + MPI_Barrier(MPI_COMM_WORLD); + + nvtxMarkA("end gather"); + + if (i == 1) cudaProfilerStop(); + } else { + + MPI_Request request; + base.gather3DDataAsync(NULL, hdata_in, n, MPI_INT, N_global, N_local, + idx, idy, idz, nprocs, rank, 0, MPI_COMM_WORLD, + request); + + MPI_Barrier(MPI_COMM_WORLD); + } + + } + + base.freeHostMemory(hdata_in, n); + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGatherAsync2.cpp b/test/testGatherAsync2.cpp new file mode 100644 index 0000000..a2ab21f --- /dev/null +++ b/test/testGatherAsync2.cpp @@ -0,0 +1,205 @@ +#include +#include +#include + +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "DKSBase.h" + +using namespace std; + + +void printData3D(int* data, int N, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void printData3D2(int* data, int nx, int ny, int nz, const char *message = "") { + + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < 
nz; i++) { + for (int j = 0; j < ny; j++) { + for (int k = 0; k < nx; k++) { + cout << data[i*ny*nx + j*nx + k] << "\t"; + } + cout << endl; + } + cout << endl; + } +} + + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs*N; i++) + cout << data[i] << "\t"; + cout << endl << endl; + +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + //cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int Ng[3] = {128, 128, 64}; + int Nl[3] = {128, 64, 32}; + int nglobal = Ng[0] * Ng[1] * Ng[2]; + int nlocal = Nl[0] * Nl[1] * Nl[2]; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + int *hdata_in; + if (base.allocateHostMemory(hdata_in, nlocal) != DKS_SUCCESS) { + hdata_in = new int[nlocal]; + cout << "pinned allocation failed!" << endl; + } + initData(hdata_in, nlocal, rank); + + int *hdata_out; + if (base.allocateHostMemory(hdata_out, nlocal) != DKS_SUCCESS) { + hdata_out = new int[nlocal]; + cout << "pinned allocation failed!" 
<< endl; + } + + //create streams for async execution + int stream1, stream2; + base.createStream(stream1); + base.createStream(stream2); + + if (rank == 0) + base.setupFFT(3, Ng); + + for (int i = 0; i < 1; i++) { + + MPI_Barrier(MPI_COMM_WORLD); + if (i == 1) + nvtxMarkA("start gather"); + + if (rank == 0) { + + int id[3] = {0, 0, 0}; + + void *mem_ptr, *tmpgreen_ptr, *comp_ptr; + + //allocate memory on device + int sizegreen = 65 * 65 * 33; + int sizecomp = 65 * 128 * 64; + mem_ptr = base.allocateMemory(nglobal, ierr); + tmpgreen_ptr = base.allocateMemory(sizegreen, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + //send pointer to other processes + nvtxMarkA("call gather"); + for (int j = 1; j < nprocs; j++) + base.sendPointer(mem_ptr, j, MPI_COMM_WORLD); + + //call another kernel while data transfer is processing + nvtxMarkA("call green"); + base.callGreensIntegral(tmpgreen_ptr, 65, 65, 33, 65, 65, 0.001, 0.001, 0.00007, stream2); + + //write data to device + base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); + + /* execute rcfft */ + //base.callR2CFFT(mem_ptr, comp_ptr, 3, Ng); + + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + + //read data from device + base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); + + MPI_Barrier(MPI_COMM_WORLD); + base.syncDevice(); + MPI_Barrier(MPI_COMM_WORLD); + + + base.freeMemory(mem_ptr, nglobal); + base.freeMemory(tmpgreen_ptr, sizegreen); + base.freeMemory< complex >(comp_ptr, sizecomp); + + } else { + + + void *mem_ptr; + int idy = 0; + int idz = 0;//Nl[2]*rank; + if (rank / 2 == 1) idy = Ng[1] / 2; + if (rank % 2 == 1) idz = Ng[2] / 2; + int id[3] = {0, idy, idz}; + + nvtxMarkA("call gather"); + mem_ptr = base.receivePointer(0, MPI_COMM_WORLD, ierr); + base.gather3DDataAsync(mem_ptr, hdata_in, Ng, Nl, id, stream1); + + MPI_Barrier(MPI_COMM_WORLD); + + base.scatter3DDataAsync(mem_ptr, hdata_out, Ng, Nl, id); + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + 
base.closeHandle(mem_ptr); + + } + + int sum1 = 0; + for (int c = 0; c < nlocal; c++) + sum1 += hdata_in[c]; + + int sum2 = 0; + for (int c = 0; c < nlocal; c++) + sum2 += hdata_out[c]; + + cout << "Test gather and scatter for rank " << rank << ": " << sum1 << " == " << sum2 << endl; + + + if (i == 1) + nvtxMarkA("end gather"); + + } + + //printData(hdata_in, nlocal, 1); + MPI_Barrier(MPI_COMM_WORLD); + base.freeHostMemory(hdata_in, nlocal); + //delete[] hdata_in; + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testGreens.cpp b/test/testGreens.cpp new file mode 100644 index 0000000..8b554eb --- /dev/null +++ b/test/testGreens.cpp @@ -0,0 +1,239 @@ +#include +#include +#include +#include + +#include "DKSBase.h" +#include "nvToolsExt.h" +#include "cuda_profiler_api.h" +#include "cuda_runtime.h" + +using namespace std; + + +void printData3D(double* data, int N, int NI, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < NI; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + cout << data[i*N*N + j*N + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + +} + +void initData(double *data, int N) { + + for (int i = 0; i < N/4 + 1; i++) { + for (int j = 0; j < N/2 + 1; j++) { + for (int k = 0; k < N/2 + 1; k++) { + data[i*N*N + j*N + k] = k+1; + } + } + } +} + +void initData2(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = i; +} + +void initComplex( complex *d, int N) { + + for (int i = 0; i < N; i++) { + d[i] = complex(2, 0); + } + +} + +void printComplex(complex *d, int N) { + + for (int i = 0; i < N; i++) + cout << d[i] << "\t"; + cout << endl; + +} + +void initMirror(double *data, int n1, int n2, int n3) { + int d = 1; + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + if (i < n3/2 + 1 && j < n2/2 + 1 && k < n1/2 + 1) + data[i * n2 * n1 + j * n1 + k] = d++; + else + data[i * n2 * n1 + j * n1 + k] = 0; + } + } + 
} +} + +void printDiv(int c) { + for (int i = 0; i < c; i++) + cout << "-"; + cout << endl; + +} + +void printMirror(double *data, int n1, int n2, int n3) { + + printDiv(75); + for (int i = 0; i < n3; i++) { + for (int j = 0; j < n2; j++) { + for (int k = 0; k < n1; k++) { + cout << data[i * n2 * n1 + j * n1 + k] << "\t"; + } + cout << endl; + } + cout << endl; + } + cout << endl; +} + +double sumData(double *data, int datasize) { + + double sum = 0; + for (int i = 0; i < datasize; i++) + sum += data[i]; + + return sum; +} + + + +int main(int argc, char *argv[]) { + + int ierr; + + int N1 = 8; + int N2 = 8; + int N3 = 4; + + int n1 = N1 / 2; + int n2 = N2 / 2; + int n3 = N3 / 2; + + int sizegreen = (n1 + 1) * (n2 + 1) * (n3 + 1); + int sizerho = N1 * N2 * N3; + + double *data_green; //= new double[sizegreen]; + double *data_rho; //= new double[sizerho]; + + double hr_m0 = +4.0264984513873269e-04; + double hr_m1 = +4.3305596731911289e-04; + double hr_m2 = +8.3154085085560838e-04; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + int stream1, stream2; + base.createStream(stream1); + base.createStream(stream2); + cout << "ID stream1: " << stream1 << endl; + cout << "ID stream2: " << stream2 << endl; + + void *mem_green1, *mem_green2, *mem_rho1, *mem_rho2; + + mem_green1 = base.allocateMemory(sizegreen, ierr); + mem_green2 = base.allocateMemory(sizegreen, ierr); + mem_rho1 = base.allocateMemory(sizerho, ierr); + mem_rho2 = base.allocateMemory(sizerho, ierr); + + printDiv(50); + + data_green = new double[sizegreen]; + data_rho = new double[sizerho]; + + base.callGreensIntegral(mem_green1, n1+1, n2+1, n3+1, n1+1, n2+1, + hr_m0, hr_m1, hr_m2, stream1); + base.readData(mem_green1, data_green, sizegreen); + cout << "Sum green: " << sumData(data_green, sizegreen) << endl; + cout << scientific << setprecision(16); + for (int p = 0; p < 7; p++) + cout << data_green[p] << "\t"; + cout << endl; + 
//printMirror(data_green, n1 + 1, n2 + 1, n3 + 1); + + base.callGreensIntegration(mem_rho1, mem_green1, n1 + 1, n2 + 1, n3 + 1, -1); + base.readData(mem_rho1, data_rho, sizerho); + cout << "Sum integral: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + base.callMirrorRhoField(mem_rho1, n1, n2, n3, -1); + base.readData(mem_rho1, data_rho, sizerho); + cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + printDiv(50); + + /* + base.callGreensIntegral(mem_green2, n1+1, n2+1, n3+1, n1+1, n2+1, + 1, 1, 1, -2); + base.readData(mem_green2, data_green, sizegreen); + cout << "Sum green: " << sumData(data_green, sizegreen) << endl; + //printMirror(data_green, n1 + 1, n2 + 1, n3 + 1); + + base.callGreensIntegration(mem_rho2, mem_green2, n1 + 1, n2 + 1, n3 + 1, -2); + base.readData(mem_rho2, data_rho, sizerho); + cout << "Sum integral: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + + base.callMirrorRhoField(mem_rho2, n1, n2, n3, -2); + base.readData(mem_rho2, data_rho, sizerho); + cout << "Sum mirror: " << sumData(data_rho, sizerho) << endl; + //printMirror(data_rho, N1, N2, N3); + */ + printDiv(50); + + base.freeMemory(mem_green1, sizegreen); + base.freeMemory(mem_green2, sizegreen); + base.freeMemory(mem_rho1, sizerho); + base.freeMemory(mem_rho2, sizerho); + + delete [] data_green; + delete [] data_rho; + + //test complex multiplication + int compsize = 300; + complex *data1 = new complex[compsize]; + complex *data2 = new complex[compsize]; + for (int i = 0; i < compsize; i++) { + data1[i] = complex(i+1, i+2); + data2[i] = complex(i+3, i+4); + } + + for (int i = 0; i < 3; i++) + cout << data1[i] << "\t"; + cout << endl; + for (int i = 0; i < 3; i++) + cout << data2[i] << "\t"; + cout << endl; + + void *ptr1, *ptr2; + ptr1 = base.allocateMemory< complex >(compsize, ierr); + ptr2 = base.allocateMemory< complex >(compsize, ierr); + + base.writeData< complex 
>(ptr1, data1, compsize); + base.writeData< complex >(ptr2, data2, compsize); + + base.callMultiplyComplexFields(ptr1, ptr2, compsize); + + base.readData< complex >(ptr1, data1, compsize); + + for (int i = 0; i < 3; i++) + cout << data1[i] << "\t"; + cout << endl; + + base.freeMemory< complex >(ptr1, compsize); + base.freeMemory< complex >(ptr2, compsize); + + return 0; +} diff --git a/test/testImageReconstruction.cpp b/test/testImageReconstruction.cpp new file mode 100644 index 0000000..2dbb27d --- /dev/null +++ b/test/testImageReconstruction.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include "DKSImageReconstruction.h" + +struct voxelPosition { + float x; + float y; + float z; +}; + +void initImage(float *image, int size) { + for (int i = 0; i < size; i++) + image[i] = (float)rand() / RAND_MAX; +} + +void initPosition(voxelPosition *voxel, int N) { + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + int idx = i * N * N + j * N + k; + if (k == 0) + voxel[idx].x = 0.0; + else + voxel[idx].x = voxel[idx - 1].x + 0.1; + + if (j == 0) + voxel[idx].y = 0.0; + else + voxel[idx].y = voxel[idx - N].y + 0.1; + + if (i == 0) + voxel[idx].z = 0.0; + else + voxel[idx].z = voxel[idx - N * N].z + 0.1; + } + } + } +} + +void printPosition(voxelPosition *voxel, int size) { + for (int i = 0; i < size; i++) + std::cout << voxel[i].x << "\t"; + std::cout << std::endl; + for (int i = 0; i < size; i++) + std::cout << voxel[i].y << "\t"; + std::cout << std::endl; + for (int i = 0; i < size; i++) + std::cout << voxel[i].z << "\t"; + std::cout << std::endl; +} + +#define DIAMETER 2.0 +bool select_source(voxelPosition *image_tmp, voxelPosition source_temp, int id) +{ + float distance_x = pow(image_tmp[id].x-source_temp.x,2); + float distance_y = pow(image_tmp[id].y-source_temp.y,2); + float distance_z = pow(image_tmp[id].z-source_temp.z,2); + float distance = sqrt(distance_x + distance_y + distance_z); + + if ( distance < 
DIAMETER*0.5 ) { + return true; + } + else + return false; +} + +void calculate_source(float *image_space , voxelPosition *image_geometry, + voxelPosition source, int total_voxels, + float *average, float *std) +{ + + int number_selected_maximum = 10000; + float *select; + select = new float[number_selected_maximum]; + for (int j=0;j(total, ierr); + image_position = base.allocateMemory(total, ierr); + source_position = base.allocateMemory(total, ierr); + davg = base.allocateMemory(total, ierr); + dstd = base.allocateMemory(total, ierr); + + base.writeData(image_space, image, total); + base.writeData(image_position, geometry, total); + base.writeData(source_position, geometry, total); + + + gettimeofday(&timeStart, NULL); + base.callCalculateSource(image_space, image_position, source_position, + davg, dstd, DIAMETER, total, total); + + + base.readData(davg, avg, total); + base.readData(dstd, stdev, total); + gettimeofday(&timeEnd, NULL); + ttotal = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)) * 1e-6; + + base.freeMemory(image_space, total); + base.freeMemory(image_position, total); + base.freeMemory(source_position, total); + base.freeMemory(dstd, total); + base.freeMemory(davg, total); + + avgavg = 0; + avgstdev = 0; + for (int i = 0; i < total; i++) { + avgavg += avg[i] / total; + avgstdev += stdev[i] / total; + } + std::cout << "Average: " << avgavg << ", stddev: " << avgstdev << ", time : " << ttotal<< std::endl; + + return N; + +} diff --git a/test/testMIC.cpp b/test/testMIC.cpp new file mode 100644 index 0000000..354e9e4 --- /dev/null +++ b/test/testMIC.cpp @@ -0,0 +1,51 @@ +#include +#include "DKSBase.h" + +using namespace std; + +int main() { + + DKSBase base; + + base.setAPI("OpenMP", 6); + base.initDevice(); + + //init data + int ierr; + int N = 8; + double *in_data = new double[N]; + double *in_data2 = new double[N]; + double *out_data = new double[N]; + double *out_data2 = new double[N]; + + for (int i = 0; i 
< N; i++) { + in_data[i] = i; + in_data2[i] = i*i; + } + + //test memory allocation, write and read operations + void *d_ptr, *d2_ptr; + + d_ptr = base.allocateMemory(N, ierr); + d2_ptr = base.allocateMemory(N, ierr); + + base.writeData(d_ptr, in_data, N); + base.writeData(d2_ptr, in_data2, N); + + base.readData(d_ptr, out_data, N); + base.readData(d2_ptr, out_data2, N); + base.freeMemory(d_ptr, N); + base.freeMemory(d2_ptr, N); + + //print results + for (int i = 0; i < N; i++) + cout << out_data[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << out_data2[i] << "\t"; + cout << endl; + + return 0; + +} diff --git a/test/testMICOpenCL.cpp b/test/testMICOpenCL.cpp new file mode 100644 index 0000000..110d797 --- /dev/null +++ b/test/testMICOpenCL.cpp @@ -0,0 +1,94 @@ +#include +#include +#include "DKSBase.h" +#include "Utility/TimeStamp.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[10]; // "-gpu" needs 5 bytes with its NUL terminator; new char[4] overflowed the heap buffer + + if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 2){ + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + cout << "Use api: " << api_name << endl; + cout << "Use device: " << device_name << endl; + + + int ierr; + int N = 10000; + double *data = new double[N]; + double *data_out = new double[N]; + double *data_out2 = new double[N]; + + for (int i = 0; i < N; i++) { + data[i] = i; + } + + //init dks base class, set API to opencl and init connection with OpenCL device + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); + base.initDevice(); + + //data ptr + void *data_ptr, *data_ptr2; + + //allocate memory + data_ptr = base.allocateMemory(N, ierr); + data_ptr2 = base.allocateMemory(N, ierr); + + //write data to memory and fill data on device + base.writeData(data_ptr, data, N); +
base.writeData(data_ptr2, data, N); + //base.callNt(data_ptr2, data_ptr, 6, N, 1, 0); + + //calc sum + base.callSum(data_ptr2, data_ptr2, N); + + //base.callSum(data_ptr, data_ptr, N); + + //chi^2 + //base.callChi2(data_ptr, data_ptr, data_ptr, N); + //base.callChi2(data_ptr2, data_ptr2, data_ptr2, N); + + //read data + base.readData(data_ptr, data_out, N); + base.readData(data_ptr2, data_out2, N); + + //base.oclEventInfo(); + + //free memory + base.freeMemory(data_ptr, N); + base.freeMemory(data_ptr2, N); + + + /* + for (int i = 0; i < N; i++) { + cout << data[i] << "\t"; + } + cout << endl << endl; + for (int i = 0; i < N; i++) { + cout << data_out[i] << "\t"; + } + cout << endl << endl; + for (int i = 0; i < N; i++) { + cout << data_out2[i] << "\t"; + } + cout << endl; + */ + + + + return 0; +} \ No newline at end of file diff --git a/test/testMICPush.cpp b/test/testMICPush.cpp new file mode 100644 index 0000000..a2f7d2a --- /dev/null +++ b/test/testMICPush.cpp @@ -0,0 +1,68 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +typedef struct { + double x; + double y; + double z; +} Part; + +void initData(Part *data, int N) { + for (int i = 0; i < N; i++) { + data[i].x = (double)rand() / RAND_MAX; // cast first: int / int truncated every coordinate to 0 + data[i].y = (double)rand() / RAND_MAX; + data[i].z = (double)rand() / RAND_MAX; + } +} + +int main() { + + int ierr; + int N = 100000; + + //__declspec(align(64)) Part *R = new Part[N]; + //__declspec(align(64)) Part *P = new Part[N]; + Part *R = new Part[N]; + Part *P = new Part[N]; + + initData(R, N); + initData(P, N); + + DKSBase dksbase; + dksbase.setAPI("OpenMP", 6); + dksbase.setDevice("-mic", 4); + dksbase.initDevice(); + + void *r_ptr, *p_ptr, *dt_ptr; + r_ptr = dksbase.allocateMemory(N, ierr); + p_ptr = dksbase.allocateMemory(N, ierr); + dt_ptr = dksbase.allocateMemory(N, ierr); + + dksbase.writeData(r_ptr, R, N); + + cout << "====================START PUSH====================" << endl; + + for (int i = 0; i < 5; i++) { + //write r to device +
dksbase.writeData(r_ptr, R, N); + //calc push + dksbase.callParallelTTrackerPush (r_ptr, p_ptr, N, dt_ptr, + 0.001, 1, false, NULL); + //read R from device + dksbase.readDataAsync (r_ptr, R, N, NULL); + } + + cout << "====================END PUSH====================" << endl; + + + + dksbase.freeMemory(r_ptr, N); + dksbase.freeMemory(p_ptr, N); + dksbase.freeMemory(dt_ptr, N); + + return 0; +} diff --git a/test/testMPI.cpp b/test/testMPI.cpp new file mode 100644 index 0000000..aef3cd6 --- /dev/null +++ b/test/testMPI.cpp @@ -0,0 +1,89 @@ +#include +#include +#include + +#include "DKSBase.h" + +using namespace std; + +void printData(int *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(int *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = (rank+1); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int n = 8; + int sizen = sizeof(int)*n; + int sizeall = sizeof(int)*n*nprocs; + + int *hdata_in = new int[n]; + int *hdata_out = new int[n]; + initData(hdata_in, n, rank); + cout << "In data for process " << rank+1 << ":\t"; + printData(hdata_in, n, 1); + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + if (rank == 0) { + + int *hdata_out_all = new int[nprocs*n]; + void* mem_ptr; + mem_ptr = base.allocateMemory(nprocs*n, ierr); + + MPI_Gather(hdata_in, n, MPI_INT, mem_ptr, n, MPI_INT, 0, MPI_COMM_WORLD); + + base.readData(mem_ptr, hdata_out_all, n*nprocs); + + MPI_Scatter(mem_ptr, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD); + + base.freeMemory(mem_ptr, n*nprocs); + + printData(hdata_out_all, 
n, nprocs, "Out data 1:\n"); + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); // show the scatter target, not the untouched input buffer + } else { + + MPI_Gather(hdata_in, n, MPI_INT, NULL, n, MPI_INT, 0, MPI_COMM_WORLD); // recv args are ignored off-root, but NULL is not a valid int count / MPI_Datatype + + MPI_Scatter(NULL, n, MPI_INT, hdata_out, n, MPI_INT, 0, MPI_COMM_WORLD); + + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); + + } + + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testMPIFFT.cpp b/test/testMPIFFT.cpp new file mode 100644 index 0000000..69512ff --- /dev/null +++ b/test/testMPIFFT.cpp @@ -0,0 +1,91 @@ +#include +#include +#include + +#include "DKSBase.h" + +using namespace std; + +void printData(complex *data, int N, int nprocs, const char *message = "") { + if (strcmp(message, "") != 0) + cout << message; + + for (int i = 0; i < nprocs; i++) { + for (int j = 0; j < N; j++) + cout << data[i*N + j] << "\t"; + cout << endl; + } +} + +void initData(complex *data, int N, int rank) { + for (int i = 0; i < N; i++) + data[i] = complex((double)rank+1.0, 0.0); +} + +int main(int argc, char *argv[]) { + + int ierr; + int rank, nprocs; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + cout << "Rank " << (rank+1) << " from " << nprocs << endl; + + int n = 8; + + complex *hdata_in = new complex[n]; + complex *hdata_out = new complex[n]; + initData(hdata_in, n, rank); + cout << "In data for process " << rank+1 << ":\t"; + printData(hdata_in, n, 1); + + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + + if (rank == 0) { + + complex *hdata_out_all = new complex[nprocs*n]; + void* mem_ptr; + mem_ptr = base.allocateMemory< complex >(nprocs*n, ierr); + + + MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, mem_ptr, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + + int dimsize[3] = {n*nprocs, 1, 1}; + base.callFFT(mem_ptr, 1, dimsize); + base.readData< complex >(mem_ptr, hdata_out_all, n*nprocs);
+ + MPI_Scatter(mem_ptr, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + base.freeMemory< complex >(mem_ptr, n*nprocs); + + printData(hdata_out_all, n, nprocs, "Out data 1:\n"); + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); + } else { + + MPI_Gather(hdata_in, n, MPI_DOUBLE_COMPLEX, NULL, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); // recv args are ignored off-root, but NULL is not a valid int count / MPI_Datatype + + MPI_Scatter(NULL, n, MPI_DOUBLE_COMPLEX, hdata_out, n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + + cout << "Scatter data for proces: " << rank + 1 << ": \t"; + printData(hdata_out, n, 1); + + } + + + MPI_Finalize(); + return 0; +} + + + + + diff --git a/test/testMemObjects.cpp b/test/testMemObjects.cpp new file mode 100644 index 0000000..5a5eaf0 --- /dev/null +++ b/test/testMemObjects.cpp @@ -0,0 +1,75 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int ierr,n, N; + + if (argc > 1) + n = atoi(argv[1]); + else + n = 10; + + N = 2 << n; + cout << "Elements: " << N << endl; + + double *data = new double[N]; + for (int i = 0; i < N; i++) + data[i] = (double)i / N; + + + DKSBase base = DKSBase(); + base.setAPI("OpenCL", 6); + base.setDevice("-gpu", 4); + base.initDevice(); + + void *ptr1; + ptr1 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr1, data, N); + + void *ptr2; + ptr2 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr2, data, N); + + void *ptr3; + ptr3 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr3, data, N); + + void *ptr4; + ptr4 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr4, data, N); + + void *ptr5; + ptr5 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr5, data, N); + + void *ptr6; + ptr6 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr6, data, N); + + void *ptr7; + ptr7 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr7, data, N); + + void *ptr8; + ptr8 = base.allocateMemory(N, ierr); + ierr = base.writeData(ptr8, data, N); + +
base.freeMemory(ptr1, N); + base.freeMemory(ptr2, N); + base.freeMemory(ptr3, N); + base.freeMemory(ptr4, N); + base.freeMemory(ptr5, N); + base.freeMemory(ptr6, N); + base.freeMemory(ptr7, N); + base.freeMemory(ptr8, N); + + + + return 0; +} + diff --git a/test/testOffset.cpp b/test/testOffset.cpp new file mode 100644 index 0000000..cf7e6ec --- /dev/null +++ b/test/testOffset.cpp @@ -0,0 +1,73 @@ +#include +#include + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + + int ierr,n, N; + + N = 8; + n = 4; + + double *data_in = new double[N]; + double *data_out_1 = new double[N]; + double *data_out_2 = new double[N]; + for (int i = 0; i < N; i++) { + data_in[i] = (double)i / N; + data_out_1[i] = 0.0; + data_out_2[i] = 0.0; + } + + cout << "Run example on: " << api_name << " using " << device_name << endl; + + DKSBase base = DKSBase(); + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(device_name)); // length must describe device_name, not api_name + base.initDevice(); + + void *ptr1; + ptr1 = base.allocateMemory(N, ierr); + + ierr = base.writeData(ptr1, data_in, n, 0); + ierr = base.writeData(ptr1, data_in, n, 4); + + ierr = base.readData(ptr1, data_out_1, N); + ierr = base.readData(ptr1, data_out_2, n, 2); + + base.freeMemory(ptr1, N); + + for (int i = 0; i < N; i++) + cout << data_in[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out_1[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out_2[i] << "\t"; + cout << endl; + + + + + return 0; +} + diff --git a/test/testOffsetMPI.cpp b/test/testOffsetMPI.cpp new file mode 100644 index 0000000..066cf63 --- /dev/null +++ b/test/testOffsetMPI.cpp @@
-0,0 +1,81 @@ +#include +#include +#include + + +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int rank, size; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + cout << "Rank " << rank << " from " << size << endl; + + + int ierr, N, n; + + N = 8; + n = N / 2; + + double *data_in = new double[n]; + + for (int i = 0; i < n; i++) + data_in[i] = (double)rank + 1.0 + (double)i / n; + + DKSBase base = DKSBase(); + base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + if (rank == 0) { + //allocate device memory: N elements for every process + void *ptr1; + ptr1 = base.allocateMemory(size*N, ierr); + cout << "Sent pointer: " << ptr1 << endl; + + //send ptr to every other process: each non-root rank blocks in MPI_Recv until it arrives + for (int dst = 1; dst < size; dst++) MPI_Send(&ptr1, sizeof(void*), MPI_BYTE, dst, 123, MPI_COMM_WORLD); + + //write n data with no offset to device and wait for other processes + ierr = base.writeData(ptr1, data_in, n, rank*n); + MPI_Barrier(MPI_COMM_WORLD); + + //read memory of size N from device + double *data_out = new double[N]; + ierr = base.readData(ptr1, data_out, N); + + //free device memory + base.freeMemory(ptr1, size*N); + + //print results + for (int i = 0; i < n; i++) + cout << data_in[i] << "\t"; + cout << endl; + + for (int i = 0; i < N; i++) + cout << data_out[i] << "\t"; + cout << endl; + + } else { + //receive device memory pointer + void *ptr2; + MPI_Recv(&ptr2, sizeof(void*), MPI_BYTE, 0, 123, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // NULL is not a portable way to discard the status + cout << "Received pointer: " << ptr2 << endl; + //write data with an offset + base.writeData(ptr2, data_in, n, rank*n); + + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + + + return 0; +} + diff --git a/test/testPush.cpp b/test/testPush.cpp new file mode 100644 index 0000000..d2f13b0 --- /dev/null +++ b/test/testPush.cpp @@ -0,0 +1,57 @@ +#include +#include +#include + +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + +using namespace std; + + +void initData(double3 *data, int N) { +
for (int i = 0; i < N; i++) { + data[i].x = rand() / RAND_MAX; + data[i].y = rand() / RAND_MAX; + data[i].z = rand() / RAND_MAX; + } +} + + +int main() { + + int ierr; + int N = 1000000; + double3 *R = new double3[N]; + double3 *P = new double3[N]; + + initData(R, N); + initData(P, N); + + DKSBase dksbase; + dksbase.setAPI("Cuda", 4); + dksbase.setDevice("-gpu", 4); + dksbase.initDevice(); + + void *r_ptr, *p_ptr; + + r_ptr = dksbase.allocateMemory(N, ierr); + p_ptr = dksbase.allocateMemory(N, ierr); + + dksbase.writeData(r_ptr, R, N); + dksbase.writeData(p_ptr, P, N); + + for (int i = 0; i < 100; i++) + dksbase.callParallelTTrackerPush(r_ptr, p_ptr, N, NULL, 0.5, 1, false); + + + dksbase.readData(r_ptr, R, N); + dksbase.readData(p_ptr, P, N); + + dksbase.freeMemory(r_ptr, N); + dksbase.freeMemory(p_ptr, N); + + + return 0; +} diff --git a/test/testRCFFT.cpp b/test/testRCFFT.cpp new file mode 100644 index 0000000..841c04a --- /dev/null +++ b/test/testRCFFT.cpp @@ -0,0 +1,168 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData(double* &data, int N1, int N2); +void printData(complex* &data, int N1, int N2); +void printData3DN4(complex* &data, int N, int dim); +void printData3DN4(double* &data, int N, int dim); + + +void compareData(double* &data1, double* &data2, int N, int dim); + + + +int main(int argc, char *argv[]) { + + int N1 = 4; + int N2 = 4; + + if (argc == 3) { + N1 = atoi(argv[1]); + N2 = atoi(argv[2]); + } + + int dimsize[3] = {N1, N2, 1}; + + cout << "Begin RC 3D FFT tests, grid = " << N1 << "\t" << N2 << endl; + int sizereal = N1*N2; + int sizecomp = N1*(N2/2+1); + + int dim = 3; + double *cdata = new double[sizereal]; + complex *cfft = new complex[sizecomp]; + + for (int i = 0; i < N2; i++) { + for (int j = 0; j < N1; j++) { + cdata[i*N1 + j] = (double)(j) / N1; + } + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + 
base.setAPI("Cuda", 4); + base.setDevice("-gpu", 4); + base.initDevice(); + + void *real_ptr, *comp_ptr; + int ierr; + /* allocate memory on device */ + real_ptr = base.allocateMemory(sizereal, ierr); + comp_ptr = base.allocateMemory< complex >(sizecomp, ierr); + + /* write data to device */ + ierr = base.writeData(real_ptr, cdata, sizereal); + + /* execute fft */ + base.callR2CFFT(real_ptr, comp_ptr, 2, dimsize); + + /* read data from device */ + base.readData< complex >(comp_ptr, cfft, sizecomp); + + /* free device memory */ + base.freeMemory(real_ptr, sizereal); + base.freeMemory< complex >(comp_ptr, sizecomp); + + cout << "FFT complete" << endl; + + + /* print results */ + printData(cdata, N1, N2); + printData(cfft, N1, N2); + + + + return 0; +} + +void printData(double* &data, int N1, int N2) { + + for (int i = 0; i < N2; i++) { + for (int j = 0; j < N1; j++) { + cout << data[i*N1 + j] << " "; + } + cout << endl; + } + cout << endl; +} + +void printData(complex* &data, int N1, int N2) { + + complex tmp(0.0, 0.0); + for (int i = 0; i < N2/2+1; i++) { + for (int j = 0; j < N1; j++) { + tmp = data[i*N1 + j]; + if (tmp.real() < 0.00001 && tmp.real() > -0.00001) tmp = complex(0.0, tmp.imag()); + if (tmp.imag() < 0.00001 && tmp.imag() > -0.00001) tmp = complex(tmp.real(), 0.0); + + cout << tmp << " "; + } + cout << endl; + } + cout << endl; +} + +void printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + + double d = data[i*N*N + j*N + k].real(); + double a = data[i*N*N + j*N + k].imag(); + + if (d < 10e-5 && d > -10e-5) + d = 0; + if (a < 10e-5 && a > -10e-5) + a = 0; + + cout << d << "; " << a << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void printData3DN4(double* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k]; + if (d > 10e-5 || d < -10e-5) + 
cout << d << "\t"; + else + cout << 0 << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(double* &data1, double* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id] - data2[id]); + } + } + } + cout << "Size " << N << " CC <--> CC diff: " << sum << endl; +} + diff --git a/test/testStockFFT3D.cpp b/test/testStockFFT3D.cpp new file mode 100644 index 0000000..036a7e2 --- /dev/null +++ b/test/testStockFFT3D.cpp @@ -0,0 +1,181 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void printData3DN4(complex* &data, int N, int dim); +void compareData(complex* &data1, complex* &data2, int N, int dim); + +int main(int argc, char *argv[]) { + + int n = 2; + if (argc == 2) + n = atoi(argv[1]); + + int N = pow(2,n); + + cout << "Begin DKS Base tests" << endl; + + cout << "FFT size: " << N << endl; + + int dimsize[3] = {N, N, N}; + + + complex *cdata = new complex[N*N*N]; + complex *cfft = new complex[N*N*N]; + complex *cfft2 = new complex[N*N*N]; + complex *cfft3 = new complex[N*N*N]; + + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + //cdata[i*N*N + j*N + k] = complex((double)k/(N*N*N), 0); + cdata[i*N*N + j*N + k] = complex(k, 0); + cfft[i*N*N + j*N + k] = complex(0, 0); + cfft2[i*N*N + j*N + k] = complex(0, 0); + cfft3[i*N + j*N + k] = complex(0, 0); + } + } + } + + if (N == 4) + printData3DN4(cdata, N, 3); + + /* init DKSBase */ + cout << "Init device and set function" << endl; + int ierr; + + + timestamp_t t0, t1; + + /* stockham radix-2 out-of-place fft */ + DKSBase base2; + base2.setAPI("OpenCL", 6); + base2.setDevice("-gpu", 4); + base2.initDevice(); + + cout << endl; + void *src_ptr; + for (int i = 0; i < 5; 
i++) { + t0 = get_timestamp(); + src_ptr = base2.allocateMemory< complex >(N*N*N, ierr); + base2.writeData< complex >(src_ptr, cdata, N*N*N); + base2.callFFTStockham(src_ptr, 3, dimsize); + base2.readData< complex >(src_ptr, cfft2, N*N*N); + base2.freeMemory< complex >(src_ptr, N*N*N); + t1 = get_timestamp(); + cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft2, N, 3); + + //delete base2; + cout << endl; + + /* CUDA cufft */ + DKSBase base3; + base3.setAPI("Cuda", 4); + base3.setDevice("-gpu", 4); + base3.initDevice(); + + cout << endl; + void *cuda_ptr; + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + cuda_ptr = base3.allocateMemory< complex >(N*N*N, ierr); + base3.writeData< complex >(cuda_ptr, cdata, N*N*N); + base3.callFFT(cuda_ptr, 3, dimsize); + base3.readData< complex >(cuda_ptr, cfft3, N*N*N); + base3.freeMemory< complex >(cuda_ptr, N*N*N); + t1 = get_timestamp(); + cout << "Cuda FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft3, N, 3); + + //delete base3; + cout << endl; + + + /* radix-2 in place fft */ + DKSBase base; + base.setAPI("OpenCL", 6); + base.setDevice("-gpu", 4); + base.initDevice(); + + cout << endl; + void *mem_ptr; + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + mem_ptr = base.allocateMemory< complex >(N*N*N, ierr); + base.writeData< complex >(mem_ptr, cdata, N*N*N); + base.callFFT(mem_ptr, 3, dimsize); + base.readData< complex >(mem_ptr, cfft, N*N*N); + base.freeMemory< complex >(mem_ptr, N*N*N); + t1 = get_timestamp(); + cout << "in-place FFT time: " << get_secs(t0, t1) << endl; + } + + if (N == 4) + printData3DN4(cfft, N, 3); + + //delete base; + cout << endl; + + /* compare results */ + cout << endl; + + cout << "Radix 2 vs Stockham: "; + compareData(cfft, cfft2, N, 3); + + cout << "Radix 2 vs Cufft: "; + compareData(cfft, cfft3, N, 3); + + cout << "Stockham vs Cufft: "; + compareData(cfft2, cfft3, N, 3); + + return 0; +} + +void 
printData3DN4(complex* &data, int N, int dim) { + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + for (int k = 0; k < N; k++) { + double d = data[i*N*N + j*N + k].real(); + if (d > 10e-5 || d < -10e-5) + cout << d << "\t"; + else + cout << 0 << "\t"; + } + } + cout << endl; + } + cout << endl; + +} + +void compareData(complex* &data1, complex* &data2, int N, int dim) { + int ni, nj, nk, id; + ni = (dim > 2) ? N : 1; + nj = (dim > 1) ? N : 1; + nk = N; + double sum = 0; + for (int i = 0; i < ni; i++) { + for (int j = 0; j < nj; j++) { + for (int k = 0; k < nk; k++) { + id = i*ni*ni + j*nj + k; + sum += fabs(data1[id].real() - data2[id].real()); + sum += fabs(data1[id].imag() - data2[id].imag()); + } + } + } + cout << "CC <--> CC diff: " << sum << endl; +} \ No newline at end of file diff --git a/test/testStockhamFFT.cpp b/test/testStockhamFFT.cpp new file mode 100644 index 0000000..fdc1656 --- /dev/null +++ b/test/testStockhamFFT.cpp @@ -0,0 +1,107 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +int main(int argc, char *argv[]) { + + int n = 2; + char *api_name = new char[10]; + char *device_name = new char[10]; + if (argc == 2) { + strcpy(api_name, argv[1]); + strcpy(device_name, "-gpu"); + } else if (argc == 3) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + } else if (argc == 4) { + strcpy(api_name, argv[1]); + strcpy(device_name, argv[2]); + n = atoi(argv[3]); + } else { + strcpy(api_name, "OpenCL"); + strcpy(device_name, "-gpu"); + } + + int N = pow(2,n); + cout << "Use api: " << api_name << endl; + + cout << "Begin DKS Base tests" << endl; + + cout << "FFT size: " << N << endl; + + int dimsize[3] = {N, N, N}; + + complex *cdata = new complex[N]; + complex *cfft = new complex[N]; + complex *cfft2 = new complex[N]; + complex *cfftsrc = new complex[N]; + for (int i = 0; i < N; i++) { + cdata[i] = complex((double)i / N, 0); + cfft[i] = complex(0, 0); + cfft2[i] = 
complex(0, 0); + cfftsrc[i] = complex(0, 0); + } + + /* init DKSBase */ + cout << "Init device and set function" << endl; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + + timestamp_t t0, t1; + + /* radix-2 in place fft */ + void *mem_ptr; + int ierr; + + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + mem_ptr = base.allocateMemory< complex >(N, ierr); + base.writeData< complex >(mem_ptr, cdata, N); + base.callFFT(mem_ptr, 1, dimsize); + base.readData< complex >(mem_ptr, cfft, N); + base.freeMemory< complex >(mem_ptr, N); + t1 = get_timestamp(); + cout << "in-place FFT time: " << get_secs(t0, t1) << endl; + } + + cout << endl; + + /* stockham radix-2 out-of-place fft */ + void *src_ptr; + + for (int i = 0; i < 5; i++) { + t0 = get_timestamp(); + src_ptr = base.allocateMemory< complex >(N, ierr); + base.writeData< complex >(src_ptr, cdata, N); + base.callFFTStockham(src_ptr, 1, dimsize); + base.readData< complex >(src_ptr, cfft2, N); + base.freeMemory< complex >(src_ptr, N); + t1 = get_timestamp(); + cout << "out-of-place FFT time: " << get_secs(t0, t1) << endl; + } + + double diff = 0; + for (int i = 0; i < N; i++) { + diff += fabs(cfft[i].real() - cfft2[i].real()); + diff += fabs(cfft[i].imag() - cfft2[i].imag()); + } + + cout << endl << "Difference: " << diff << endl; + + if (diff > 0.00001) { + for (int i = 0; i < 10; i++) { + cout << cfft[i] << "\t" << cfft2[i] << endl; + } + } + + return 0; +} + diff --git a/test/testTimeIntegration.cpp b/test/testTimeIntegration.cpp new file mode 100644 index 0000000..80fec6b --- /dev/null +++ b/test/testTimeIntegration.cpp @@ -0,0 +1,227 @@ +#include +#include +#include +#include +#include "DKSBase.h" + +#include +#include "cuda_runtime.h" + +using namespace std; + +typedef struct { + double x; + double y; + double z; +} Vector; + +Vector initVector() { + Vector tmp; + tmp.x = 0.5; + tmp.y = 0.5; + tmp.z = 0.5; + + return tmp; +} + 
+void initVectors(Vector *v, int N) { + for (int i = 0; i < N; i++) + v[i] = initVector(); +} + +void initDouble(double *data, int N) { + for (int i = 0; i < N; i++) + data[i] = 0.005; +} + +void initLastSect(long *data, int N) { + for (int i = 0; i < N; i++) + data[i] = -1; +} + +void checkSum(Vector *v, int N) { + double sum = 0; + for (int i = 0; i < N; i++) + sum += v[i].x + v[i].y + v[i].z; + + std::cout << "checksum: " << sum << std::endl; +} + +int main(int argc, char *argv[]) { + + int loop = 10; + int numpart = 10; + char *api_name = new char[10]; + char *device_name = new char[10]; + strcpy(api_name, "Cuda"); + strcpy(device_name, "-gpu"); + + for (int i = 1; i < argc; i++) { + + if (argv[i] == string("-mic")) { + strcpy(api_name, "OpenMP"); + strcpy(device_name, "-mic"); + } + + if (argv[i] == string("-npart")) { + numpart = atoi(argv[i+1]); + i++; + } + + if (argv[i] == string("-loop")) { + loop = atoi(argv[i+1]); + i++; + } + + } + + cout << "=========================BEGIN TEST=========================" << endl; + cout << "Use api: " << api_name << "\t" << device_name << endl; + cout << "Number of particles: " << numpart << endl; + cout << "------------------------------------------------------------" << endl; + + //init p,r and dt arrays to test time integration + Vector *r = new Vector[numpart]; + Vector *p = new Vector[numpart]; + Vector *x = new Vector[numpart]; + Vector *ori = new Vector[5]; + initVectors(r, numpart); + initVectors(p, numpart); + initVectors(x, numpart); + initVectors(ori, 5); + + double *dt = new double[numpart]; + initDouble(dt, numpart); + + long *ls = new long[numpart]; + initLastSect(ls, numpart); + + //init dks + int ierr; + DKSBase base; + base.setAPI(api_name, strlen(api_name)); + base.setDevice(device_name, strlen(api_name)); + base.initDevice(); + + int stream1, stream2; + base.createStream(stream1); + base.createStream(stream2); + + base.registerHostMemory(r, numpart); + base.registerHostMemory(p, numpart); + 
base.registerHostMemory(x, numpart); + base.registerHostMemory(dt, numpart); + base.registerHostMemory(ls, numpart); + + //***test parallelttrackerpush***// + void *r_ptr, *p_ptr, *x_ptr, *dt_ptr, *ls_ptr, *ori_ptr; + + //allocate memory on the device + r_ptr = base.allocateMemory(numpart, ierr); + p_ptr = base.allocateMemory(numpart, ierr); + x_ptr = base.allocateMemory(numpart, ierr); + dt_ptr = base.allocateMemory(numpart, ierr); + ls_ptr = base.allocateMemory(numpart, ierr); + ori_ptr = base.allocateMemory(5, ierr); + + //transfer data to device + base.writeData(r_ptr, r, numpart); + base.writeData(p_ptr, p, numpart); + base.writeData(x_ptr, x, numpart); + base.writeData(ori_ptr, ori, 5); + + + //do some couple of integration loops before the timer is started + for (int i = 0; i < 5; i++) { + //calc push + base.callParallelTTrackerPush (r_ptr, p_ptr, numpart, dt_ptr, + 0.05, 1, false, stream1); + + //read R from device + base.readDataAsync (r_ptr, r, numpart, stream1); + + //write LastSection to device + base.writeDataAsync (ls_ptr, ls, numpart, stream2); + + //calc push + base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5, + dt_ptr, 0.05, 1, false, stream2); + //read x from device + base.readDataAsync(x_ptr, x, numpart, stream2); + + //sync and wait till all tasks and reads are complete + base.syncDevice(); + } + + checkSum(r, numpart); + checkSum(x, numpart); + + + + //start the timing of integration + struct timeval timeStart, timeEnd; + std::cout << "start integration" << std::endl; + + gettimeofday(&timeStart, NULL); + for (int i = 0; i < loop; i++) { + + //calc push + base.callParallelTTrackerPush(r_ptr, p_ptr, numpart, dt_ptr, 0.05, 1, false, stream1); + + //read R from device + base.readDataAsync (r_ptr, r, numpart, stream1); + + //write LastSection to device + base.writeDataAsync (ls_ptr, ls, numpart, stream2); + + //calc push transform + base.callParallelTTrackerPushTransform(x_ptr, p_ptr, ls_ptr, ori_ptr, numpart, 5, + 
dt_ptr, 0.05, 1, false, stream2); + + //read R from device + base.readDataAsync(x_ptr, x, numpart, stream2); + + //sync and wait till all tasks and reads are complete + base.syncDevice(); + } + gettimeofday(&timeEnd, NULL); + + std::cout << "end integration" << std::endl; + double t = ( (timeEnd.tv_sec - timeStart.tv_sec) * 1000000 + + (timeEnd.tv_usec - timeStart.tv_usec)); + + std::cout << "Time for " << numpart << " integrations: " << t * 1e-6 << "s" << std::endl; + std::cout << "Average time for integration: " << t * 1e-6 / loop << std::endl; + + checkSum(r, numpart); + checkSum(x, numpart); + + + + //free memory + base.freeMemory(r_ptr, numpart); + base.freeMemory(p_ptr, numpart); + base.freeMemory(x_ptr, numpart); + base.freeMemory(ori_ptr, 5); + base.freeMemory(dt_ptr, numpart); + base.freeMemory(ls_ptr, numpart); + + //unregister host memory + base.unregisterHostMemory(r); + base.unregisterHostMemory(p); + base.unregisterHostMemory(x); + base.unregisterHostMemory(dt); + base.unregisterHostMemory(ls); + + //free host memory + delete[] r; + delete[] x; + delete[] p; + delete[] dt; + delete[] ls; + delete[] ori; + + cout << "==========================END TEST==========================" << endl; + return 0; + +} diff --git a/test/testTranspose.cpp b/test/testTranspose.cpp new file mode 100644 index 0000000..7d7b34c --- /dev/null +++ b/test/testTranspose.cpp @@ -0,0 +1,76 @@ +#include +#include +#include + +#include "Utility/TimeStamp.h" +#include "DKSBase.h" + +using namespace std; + +void initData(complex *d, int N, int dim) { + + int size = N; + if (dim == 2) size = N*N; + if (dim == 3) size = N*N*N; + + for (int i = 0; i < size; i++) + d[i] = complex(i, 0); + +} + +void printData(complex *d, int N, int dim) { + + int NZ = N; + int NY = (dim > 1) ? N : 1; + int NX = (dim > 2) ? 
N : 1; + + for (int i = 0; i < NX; i++) { + for (int j = 0; j < NY; j++) { + for (int k = 0; k < NZ; k++) { + std::cout << d[i*N*N + j*N + k].real() << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + std::cout << std::endl; + +} + +int main(int argc, char *argv[]) { + + int N = (argc > 1) ? atoi(argv[1]) : 4; + int dimN[3] = {N, N, 1}; + int dim = 2; + int ndim = 1; + int size = dimN[0] * dimN[1] * dimN[2]; + + std::complex *hd_in = new std::complex[size]; + std::complex *hd_out = new std::complex[size]; + initData(hd_in, N, dim); + printData(hd_in, N, dim); + + DKSBase base; + base.setAPI("OpenCL", 6); + base.setDevice("-gpu", 4); + base.initDevice(); + + int ierr; + void *mem_ptr; + + mem_ptr = base.allocateMemory< std::complex >(size, ierr); + base.writeData< std::complex >(mem_ptr, hd_in, size); + + base.callTranspose(mem_ptr, dimN, dim, ndim); + + base.readData< std::complex >(mem_ptr, hd_out, size); + base.freeMemory< std::complex >(mem_ptr, size); + + printData(hd_out, N, 2); + + delete[] hd_in; + delete[] hd_out; + + return 0; + +}