945 lines
28 KiB
C++
945 lines
28 KiB
C++
/** DKSBase class.
|
|
* DKSBase.h
|
|
* Author: Uldis Locans
|
|
* Date: 15.09.2014
|
|
* Base class of Dynamic Kernel Scheduler that handles the function calls
|
|
* from host application to DKS
|
|
*/
|
|
|
|
#ifndef H_DKS_BASE
|
|
#define H_DKS_BASE
|
|
|
|
#include <iostream>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
#include <sys/time.h>
|
|
|
|
#include "DKSDefinitions.h"
|
|
|
|
#ifdef DKS_MPI
|
|
#include <mpi.h>
|
|
#endif
|
|
|
|
#ifdef DKS_OPENCL
|
|
|
|
#ifdef __APPLE__
|
|
#include <OpenCL/opencl.h>
|
|
#else
|
|
#include <CL/cl.h>
|
|
#endif
|
|
|
|
#include "OpenCL/OpenCLBase.h"
|
|
#include "OpenCL/OpenCLChiSquare.h"
|
|
#endif
|
|
|
|
#ifdef DKS_AMD
|
|
#include "OpenCL/OpenCLFFT.h"
|
|
#include "OpenCL/OpenCLCollimatorPhysics.h"
|
|
#include "OpenCL/OpenCLGreensFunction.h"
|
|
#endif
|
|
|
|
#ifdef DKS_CUDA
|
|
#include "CUDA/CudaBase.cuh"
|
|
#include "CUDA/CudaFFT.cuh"
|
|
#include "CUDA/CudaGreensFunction.cuh"
|
|
#include "CUDA/CudaChiSquare.cuh"
|
|
#include "CUDA/CudaCollimatorPhysics.cuh"
|
|
#include "nvToolsExt.h"
|
|
#endif
|
|
|
|
#ifdef DKS_MIC
|
|
#include "MIC/MICBase.h"
|
|
#include "MIC/MICChiSquare.h"
|
|
#include "MIC/MICFFT.h"
|
|
#include "MIC/MICCollimatorPhysics.h"
|
|
#include "MIC/MICGreensFunction.hpp"
|
|
#endif
|
|
|
|
#include "Algorithms/GreensFunction.h"
|
|
#include "Algorithms/CollimatorPhysics.h"
|
|
#include "Algorithms/FFT.h"
|
|
|
|
#include "AutoTuning/DKSConfig.h"
|
|
|
|
/** DKSBase class for handling function calls to DKS library */
|
|
class DKSBase {
|
|
|
|
private:
|
|
char *m_device_name;
|
|
char *m_api_name;
|
|
char *m_function_name;
|
|
|
|
bool m_device_set;
|
|
bool m_api_set;
|
|
bool m_function_set;
|
|
|
|
bool m_auto_tuning;
|
|
bool m_use_config;
|
|
|
|
#ifdef DKS_OPENCL
|
|
OpenCLBase *oclbase;
|
|
OpenCLChiSquare *oclchi;
|
|
#endif
|
|
|
|
#ifdef DKS_CUDA
|
|
CudaBase *cbase;
|
|
CudaChiSquare *cchi;
|
|
#endif
|
|
|
|
#ifdef DKS_MIC
|
|
MICBase *micbase;
|
|
MICChiSquare *micchi;
|
|
#endif
|
|
|
|
protected:
|
|
|
|
//gives access to dks autotuning config file
|
|
DKSConfig dksconfig;
|
|
|
|
/**
|
|
* Check if current API is set to OpenCL
|
|
* Return true/false wether current api is opencl
|
|
*/
|
|
bool apiOpenCL();
|
|
|
|
/**
|
|
* Check if current API is set to CUDA.
|
|
* Return true/false wether curretn api is cuda
|
|
*/
|
|
bool apiCuda();
|
|
|
|
/**
|
|
* Check if current API is set to OpenMP.
|
|
* Return true/false whether current api is OpenMP
|
|
*/
|
|
bool apiOpenMP();
|
|
|
|
/** Check if device is GPU */
|
|
bool deviceGPU();
|
|
/** Check if device is CPU */
|
|
bool deviceCPU();
|
|
/** Check if device is MIC */
|
|
bool deviceMIC();
|
|
|
|
/**
|
|
* Get cbase pointer
|
|
*/
|
|
#ifdef DKS_CUDA
|
|
CudaBase *getCudaBase() {
|
|
return cbase;
|
|
}
|
|
#endif
|
|
|
|
#ifdef DKS_OPENCL
|
|
OpenCLBase *getOpenCLBase() {
|
|
return oclbase;
|
|
}
|
|
#endif
|
|
|
|
#ifdef DKS_MIC
|
|
MICBase *getMICBase() {
|
|
return micbase;
|
|
}
|
|
#endif
|
|
|
|
/** Call OpenCL base to load specified kenrel file.
|
|
*
|
|
*/
|
|
int loadOpenCLKernel(const char *kernel_name);
|
|
|
|
std::string getAPI() {
|
|
std::string api_name(m_api_name);
|
|
return api_name;
|
|
}
|
|
|
|
std::string getDevice() {
|
|
std::string device_name(&m_device_name[1]);
|
|
return device_name;
|
|
}
|
|
|
|
|
|
public:
|
|
|
|
/**
|
|
* Default constructor.
|
|
*/
|
|
DKSBase();
|
|
|
|
/**
|
|
* Constructor that sets api and devcie to use with DKS.
|
|
*/
|
|
DKSBase(const char* api_name, const char* device_name);
|
|
|
|
|
|
/**
|
|
* Destructor.
|
|
* Free DKS resources.
|
|
*/
|
|
~DKSBase();
|
|
|
|
/** Function to initialize objects based on the device used.
|
|
*
|
|
*/
|
|
int setupDevice();
|
|
|
|
/** Turn on auto tuning */
|
|
void setAutoTuningOn() { m_auto_tuning = true; }
|
|
|
|
/** Turn of auto tuning */
|
|
void setAutoTuningOff() { m_auto_tuning = false; }
|
|
|
|
/** Get status of auto tuning */
|
|
bool isAutoTuningOn() { return m_auto_tuning; }
|
|
|
|
/** Turn on use of config file */
|
|
void setUseConfigOn() { m_use_config = true; }
|
|
|
|
/** Turn off use of config file */
|
|
void setUseConfigOff() { m_use_config = false; }
|
|
|
|
/** Check if using config file */
|
|
bool isUseConfigOn() { return m_use_config; }
|
|
|
|
/**
|
|
* Set device to use with DKS.
|
|
* Sets specific device to use with DKS. Supported devices are -gpu and -mic.
|
|
* Length specifies the number of characters in device_name array (length - deprecated).
|
|
* Return success or error code.
|
|
*/
|
|
int setDevice(const char* device_name, int length = -1);
|
|
|
|
/**
|
|
* Set framework to use with DKS.
|
|
* Sets framework and API that DKS uses to execute code on device. Supported API's
|
|
* are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies
|
|
* the number of characters in api_name array (length - deprecated).
|
|
*/
|
|
int setAPI(const char* api_name, int length = -1);
|
|
|
|
/**
|
|
* Prints information about all available devices.
|
|
* Calls CUDA, OpenCL and MIC functions to query for available devices
|
|
* for each framework and pirnts information about each device. Length specifies
|
|
* the number of characters in api_name array
|
|
* Returns success or error code
|
|
*/
|
|
int getDevices();
|
|
|
|
/**
|
|
* Returns device count.
|
|
* Saves the number of the devices available on the platform to ndev.
|
|
*/
|
|
int getDeviceCount(int &ndev);
|
|
|
|
/** Get the name of the device in use.
|
|
* Query the device that is used and get the naem of the device. The name is saved in the
|
|
* device_name string. Returns DKS_SUCCESS
|
|
*/
|
|
int getDeviceName(std::string &device_name);
|
|
|
|
/** Set the device to use.
|
|
* Pass the index of the device to use by dks.
|
|
*/
|
|
int setDefaultDevice(int device);
|
|
|
|
/** Get unique devices.
|
|
* Get a list of all the unique devices available on the platform.
|
|
* When API and device type for DKS is set, getDeviceList can get all the unique devices
|
|
* available for this API and device type. Used for autotuning if multiple different GPUs are
|
|
* installed on the system.
|
|
*/
|
|
int getDeviceList(std::vector<int> &devices);
|
|
|
|
/**
|
|
* Inititialize DKS.
|
|
* Set framework and device to use. If OpenCL is used create context with device.
|
|
* Return success or error code.
|
|
*/
|
|
int initDevice();
|
|
|
|
/**
|
|
* Create stream for async execution.
|
|
* Function to create different streams with device to allow assync kernel execution and data
|
|
* transfer. Currently implemented for CUDA with cuda streams. streamId will be can be used later
|
|
* use the created stream. Returns success or error code.
|
|
* TODO: for opencl use different
|
|
* contexts similar as cuda streams to achieve async execution. TODO: for intel mic look at
|
|
* library (libxstream) from Hans Pabst.
|
|
*/
|
|
int createStream(int &streamId);
|
|
|
|
/**
|
|
* Send pointer to device memory from one MPI process to another.
|
|
* Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
|
|
* cuda icp. Gets icp handle of memory allocated on device pointed by mem_ptr does MPI_Send to
|
|
* dest process where matching receivePointer should be called. Returns success or error code.
|
|
* TODO: opencl and mic cases still need implementations
|
|
*/
|
|
#ifdef DKS_MPI
|
|
int sendPointer(void *mem_ptr, int dest, MPI_Comm comm);
|
|
#endif
|
|
|
|
/**
|
|
* Receive pointer to device memory from another MPI process.
|
|
* Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
|
|
* cuda icp. Uses MPI_Recv to get icp handle from another MPI process and opens a reference
|
|
* to this memory. Togeter with sendPointer function allows multiple MPI processes to share
|
|
* one memory region of the device. Returns success or error code.
|
|
* TODO: opencl and mic cases still need implementations
|
|
*/
|
|
#ifdef DKS_MPI
|
|
void * receivePointer(int hostproc, MPI_Comm comm, int &ierr);
|
|
#endif
|
|
|
|
/**
|
|
* Close handle to device memory.
|
|
* If receivePointer is used to open memory handle allocated by another MPI process closeHandle
|
|
* should be called to free resources instead of freeMemory. Returns success or error code.
|
|
* TODO: opencl and mic cases still need implementations.
|
|
*/
|
|
int closeHandle(void *mem_ptr);
|
|
|
|
/**
|
|
* Wait till all tasks running on device are completed.
|
|
* Forces a device synchronization - waits till all tasks on the device are complete.
|
|
* Implemented for cuda. Forces sync only in context in witch it is called - only waits
|
|
* for tasks launched by process calling syncDevice. If multiple processes launch different
|
|
* tasks each process is responsible for its own synchronization. Returns success or error code.
|
|
* TODO: opencl and mic implementations still necessary
|
|
*/
|
|
int syncDevice();
|
|
|
|
/**
|
|
* Allocate memory and transfer data to device.
|
|
* Returns a void pointer which can be used in later kernels to reference
|
|
* allocated device memory. data_in pointer to data to be transfered to device,
|
|
* elements is the number of data elements to transfer, T - type of data to transfer.
|
|
* If memory allocation or data transfer fails ierr will be set to error code.
|
|
*/
|
|
template <typename T>
|
|
void * pushData(const void *data_in, int elements, int &ierr) {
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
cl_mem mem_ptr;
|
|
size_t size = sizeof(T)*elements;
|
|
mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
|
|
oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE);
|
|
|
|
ierr = DKS_SUCCESS;
|
|
return mem_ptr;
|
|
#endif
|
|
} else if (apiCuda()){
|
|
#ifdef DKS_CUDA
|
|
//cuda version
|
|
void * mem_ptr = NULL;
|
|
size_t size = sizeof(T)*elements;
|
|
mem_ptr = cbase->cuda_allocateMemory(size, ierr);
|
|
cbase->cuda_writeData((T*)mem_ptr, data_in, size);
|
|
|
|
ierr = DKS_SUCCESS;
|
|
return mem_ptr;
|
|
#endif
|
|
} else if (apiOpenMP()) {
|
|
#ifdef DKS_MIC
|
|
void * mem_ptr = NULL;
|
|
mem_ptr = micbase.mic_pushData<T>(data_in, elements);
|
|
|
|
return mem_ptr;
|
|
#endif
|
|
}
|
|
|
|
ierr = DKS_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* Read data from device and free device memory.
|
|
* Reads data from device pointed by mem_ptr into data_out pointer. Elements
|
|
* specifies the number of data elements to read, T specifies the datatype of
|
|
* elements to copy. Returns error code if read data or free memory fails.
|
|
*/
|
|
template<typename T>
|
|
int pullData(void *mem_ptr, void* data_out, int elements) {
|
|
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
size_t size = sizeof(T)*elements;
|
|
cl_mem clmem_ptr = (cl_mem)mem_ptr;
|
|
oclbase->ocl_readData(clmem_ptr, data_out, size);
|
|
oclbase->ocl_freeMemory(clmem_ptr);
|
|
#endif
|
|
} else if (apiCuda()) {
|
|
#ifdef DKS_CUDA
|
|
//cuda version
|
|
size_t size = sizeof(T)*elements;
|
|
cbase->cuda_readData((T*)mem_ptr, data_out, size);
|
|
cbase->cuda_freeMemory(mem_ptr);
|
|
#endif
|
|
} else if (apiOpenMP()) {
|
|
#ifdef DKS_MIC
|
|
micbase.mic_pullData<T>(mem_ptr, data_out, elements);
|
|
#endif
|
|
}
|
|
|
|
return DKS_SUCCESS;
|
|
}
|
|
|
|
/**
|
|
* Allocate memory on device and return pointer to device memory.
|
|
* Allocates memory of type T, elements specifies the number of
|
|
* elements for which memory should be allocated. If memory allocation
|
|
* fails ierr is set to error code. Returns void pointer to device memory.
|
|
*/
|
|
template<typename T>
|
|
void * allocateMemory(int elements, int &ierr) {
|
|
ierr = DKS_SUCCESS;
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
cl_mem mem_ptr;
|
|
size_t size = sizeof(T)*elements;
|
|
mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
|
|
return mem_ptr;
|
|
#endif
|
|
} else if (apiCuda()) {
|
|
#ifdef DKS_CUDA
|
|
//cuda version
|
|
void * mem_ptr = NULL;
|
|
size_t size = sizeof(T)*elements;
|
|
mem_ptr = cbase->cuda_allocateMemory(size, ierr);
|
|
return mem_ptr;
|
|
#endif
|
|
} else if (apiOpenMP()) {
|
|
#ifdef DKS_MIC
|
|
void * mem_ptr = NULL;
|
|
mem_ptr = micbase->mic_allocateMemory<T>(elements);
|
|
return mem_ptr;
|
|
#endif
|
|
}
|
|
|
|
ierr = DKS_ERROR;
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* Allocates host memory as page-locked.
|
|
* Used for memroy allocation on the host side for pointer ptr for size elements.
|
|
* Page locked memory improves
|
|
* data transfer rates between host and device and allows async data transfer
|
|
* and kernel execution. Reurns succes or error code.
|
|
* TODO: opencl and mic implementations needed.
|
|
*/
|
|
template<typename T>
|
|
int allocateHostMemory(T *&ptr, int size)
|
|
{
|
|
if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size));
|
|
|
|
DEBUG_MSG("Pinned memory allocation not implemented for this platform");
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Free host page-locked memory.
|
|
* Used to free page-locked memory on the host that was allocated using
|
|
* allocateHostMemory. ptr is the host pointer where page-locked memory was allocated,
|
|
* size - number of elements held by the memroy.
|
|
*/
|
|
template<typename T>
|
|
int freeHostMemory(T* &ptr, int size)
|
|
{
|
|
if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr));
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Page lock allocated host memory.
|
|
* Page locked memory improves data transfer between host and device (true for cuda and
|
|
* opencl, maybe also mic). ptr - pointer to memory that needs to be page locked,
|
|
* size - number of elements in array.
|
|
* TODO: mic and opencl implementations needed
|
|
*/
|
|
template <typename T>
|
|
int registerHostMemory(T *ptr, int size) {
|
|
if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size));
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Unregister page locked memory.
|
|
* TODO: opencl and mic implementations needed·
|
|
*/
|
|
template <typename T>
|
|
int unregisterHostMemory(T *ptr) {
|
|
if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr));
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Write data from host to device.
|
|
* Write data from data to device memory referenced by mem_ptr. Elements spicify the
|
|
* number of elements to write, offset specifies the offset from the first element.
|
|
* Returns success or error code. Performs a blocking write - control to the host
|
|
* is returned only when data transfer is complete.
|
|
*/
|
|
template<typename T>
|
|
int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) {
|
|
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
size_t size = sizeof(T)*elements;
|
|
size_t offset_bytes = sizeof(T)*offset;
|
|
cl_mem clmem_ptr = (cl_mem)mem_ptr;
|
|
return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE);
|
|
#endif
|
|
|
|
} else if (apiCuda()){
|
|
//cuda version
|
|
size_t size = sizeof(T)*elements;
|
|
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
|
|
|
|
} else if (apiOpenMP()) {
|
|
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
|
|
|
|
}
|
|
|
|
return DKS_ERROR;
|
|
|
|
}
|
|
|
|
/**
|
|
* Write data to device using async write.
|
|
* Queue a async data write and return control to host imediately.
|
|
* mem_ptr - device memory pointer, data - host memory pointer,
|
|
* elements - number of data elements to write
|
|
* stremaId - stream id to use, offset - offset on device from first element
|
|
* For trully async execution on cuda stream other than default needs to be created
|
|
* and device memory must be page-locked. Otherwise functions just asynchronosly with
|
|
* respect to host.
|
|
* TODO: mic and opencl implementations needed (goes to blocking writes)
|
|
*/
|
|
template<typename T>
|
|
int writeDataAsync(void *mem_ptr, const void *data, int elements,
|
|
int streamId = -1, int offset = 0) {
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
size_t size = sizeof(T)*elements;
|
|
cl_mem clmem_ptr = (cl_mem)mem_ptr;
|
|
oclbase->ocl_writeData(clmem_ptr, data, size, 0, CL_FALSE);
|
|
#endif
|
|
} else if (apiCuda()){
|
|
//cuda version
|
|
size_t size = sizeof(T)*elements;
|
|
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
|
|
} else if (apiOpenMP()) {
|
|
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
|
|
}
|
|
|
|
return DKS_ERROR;
|
|
|
|
}
|
|
|
|
/**
|
|
* Gather 3D data from multiple mpi processes to one memory region.
|
|
* When multiple processes share the same device memory using sendPointer and receivePointer
|
|
* gather3DDataAsync allows each process to write data to its memory region. Uses async writes.
|
|
* mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
|
|
* data dimensions, id - starting indexes in global domain for each process
|
|
* streamId - stream to use for data transfers.
|
|
* Returns success or error code.
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3],
|
|
int id[3], int streamId = -1 ) {
|
|
|
|
|
|
//int p = 1;
|
|
int rank;
|
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
|
|
int hoffset, doffset, ierr;
|
|
|
|
//number of continuous memory elements
|
|
int elements = Nl[0];
|
|
if (Nl[0] == Ng[0]) {
|
|
elements *= Nl[1];
|
|
if (Nl[1] == Ng[1])
|
|
elements *= Nl[2];
|
|
}
|
|
|
|
//starting index
|
|
int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
|
|
|
|
//copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
|
|
if (Nl[0] != Ng[0]) {
|
|
for (int i = 0; i < Nl[2]; i++) {
|
|
for (int j = 0; j < Nl[1]; j++) {
|
|
doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
|
|
hoffset = (i * Nl[1] + j) * elements;
|
|
ierr = writeDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
|
|
if (ierr == DKS_ERROR) return DKS_ERROR;
|
|
}
|
|
}
|
|
return DKS_SUCCESS;
|
|
}
|
|
|
|
//copy piece by piece 3rd dim if 2nd dim is split
|
|
if (Nl[1] != Ng[1]) {
|
|
for (int i = 0; i < Nl[2]; i++) {
|
|
doffset = i* Ng[1] * Ng[0] + sid;
|
|
ierr = writeDataAsync<T>(mem_ptr, data + i*elements, elements, streamId, doffset);
|
|
if (ierr == DKS_ERROR) return DKS_ERROR;
|
|
}
|
|
return DKS_SUCCESS;
|
|
}
|
|
|
|
//if only 3rd dim is split all elements are continuous so write one chunk
|
|
doffset = sid;
|
|
return writeDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
|
|
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Scatter 3D data to multiple MPI processes from one device memory region.
|
|
* When multiple processes share the same device memory using sendPointer and receivePointer
|
|
* scatter3DDataAsync allows each process to read data from its memory region. Uses async reads.
|
|
* mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
|
|
* data dimensions, id - starting indexes in global domain for each process
|
|
* streamId - stream to use for data transfers.
|
|
* Returns success or error code.
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3],
|
|
int id[3], int streamId = -1) {
|
|
|
|
//int p = 1;
|
|
//int rank;
|
|
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
|
|
int hoffset, doffset, ierr;
|
|
|
|
//number of continuous memory elements
|
|
int elements = Nl[0];
|
|
if (Nl[0] == Ng[0]) {
|
|
elements *= Nl[1];
|
|
if (Nl[1] == Ng[1])
|
|
elements *= Nl[2];
|
|
}
|
|
|
|
//starting index
|
|
int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
|
|
|
|
//copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
|
|
if (Nl[0] != Ng[0]) {
|
|
for (int i = 0; i < Nl[2]; i++) {
|
|
for (int j = 0; j < Nl[1]; j++) {
|
|
doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
|
|
hoffset = (i * Nl[1] + j) * elements;
|
|
ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
|
|
if (ierr == DKS_ERROR) return DKS_ERROR;
|
|
}
|
|
}
|
|
return DKS_SUCCESS;
|
|
}
|
|
|
|
//copy piece by piece 3rd dim if 2nd dim is split
|
|
if (Nl[1] != Ng[1]) {
|
|
for (int i = 0; i < Nl[2]; i++) {
|
|
doffset = i* Ng[1] * Ng[0] + sid;
|
|
hoffset = i * elements;
|
|
ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
|
|
if (ierr == DKS_ERROR) return DKS_ERROR;
|
|
}
|
|
return DKS_SUCCESS;
|
|
}
|
|
|
|
//if only 3rd dim is split all elements are continuous so write one chunk
|
|
doffset = sid;
|
|
return readDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
|
|
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Create MPI subarray for 3D data gather and scatter using cuda aware MPI.
|
|
* If multiple MPI processes share device and cuda aware MPI is used for data transfer
|
|
* creates a MPI subarray so each MPI process can write and read to its own memory region.
|
|
* N_global - global domain dimensions, N_local - local domain dimensions, datatype - MPI datatype
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) {
|
|
//create MPI datatypes to transfer decomposed domain from GPU memory
|
|
int sizes[3] = {N_global[2], N_global[1], N_global[0]};
|
|
int subsizes[3] = {N_local[2], N_local[1], N_local[0]};
|
|
int starts[3] = {0, 0, 0};
|
|
|
|
MPI_Datatype stype, rtype;
|
|
MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype);
|
|
MPI_Type_create_resized(stype, 0, sizeof(T), &rtype);
|
|
MPI_Type_commit(&rtype);
|
|
|
|
return rtype;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Gather 3D data from multiple MPI processes to device using cuda aware MPI.
|
|
* Using cuda aware mpi allows to gather data to one device memory region allocated
|
|
* by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer,
|
|
* size - number of elements to transfer, stype - data type of elements, N_global -
|
|
* global dimensions of the domain, N_local - local domain dimensions,
|
|
* idx,idy,idz - starting indexes in global domain for each process, numNodes - number
|
|
* of processes, myNode - current node, rootNode - node that allocated device memory,
|
|
* comm - MPI communicator
|
|
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
|
|
int N_local[3], int * idx, int * idy, int * idz,
|
|
int numNodes, int myNode, int rootNode, MPI_Comm comm)
|
|
{
|
|
|
|
MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
|
|
|
|
//calculate displacements from global domain size and local domain starting index
|
|
int *counts = new int[numNodes];
|
|
int *displs = new int[numNodes];
|
|
for (int i = 0; i < numNodes; i++) {
|
|
counts[i] = 1;
|
|
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
|
|
}
|
|
|
|
if (apiOpenCL()) {
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
return DKS_ERROR;
|
|
} else if (apiCuda()) {
|
|
MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm );
|
|
} else if (apiOpenMP()) {
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
return DKS_SUCCESS;
|
|
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather.
|
|
* For detailed parameter description see gather3DData docs.
|
|
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
|
|
int N_local[3], int * idx, int * idy, int * idz,
|
|
int numNodes, int myNode, int rootNode,
|
|
MPI_Comm comm, MPI_Request &request)
|
|
{
|
|
|
|
MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
|
|
|
|
//calculate displacements from global domain size and local domain starting index
|
|
int *counts = new int[numNodes];
|
|
int *displs = new int[numNodes];
|
|
for (int i = 0; i < numNodes; i++) {
|
|
counts[i] = 1;
|
|
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
|
|
}
|
|
|
|
if (apiOpenCL()) {
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
return DKS_ERROR;
|
|
} else if (apiCuda()) {
|
|
MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm, &request );
|
|
|
|
} else if (apiOpenMP()) {
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
return DKS_SUCCESS;
|
|
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Scatter 3D data from device to multiple MPI processes using cuda aware MPI.
|
|
* If multiple MPI prcesses share one device allows to scatter 3D data regions
|
|
* from device memory allocated by one of the processes to all other MPI processes.
|
|
* For detailed parameter description see gather3DData docs.
|
|
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
|
|
*/
|
|
#ifdef DKS_MPI
|
|
template<typename T>
|
|
int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3],
|
|
int N_local[3], int * idx, int * idy, int * idz,
|
|
int numNodes, int myNode, int rootNode, MPI_Comm comm)
|
|
{
|
|
|
|
MPI_Datatype stype = create3DMPISubarray<T>(N_global, N_local, rtype);
|
|
|
|
//calculate displacements from global domain size and local domain starting index
|
|
int *counts = new int[numNodes];
|
|
int *displs = new int[numNodes];
|
|
for (int i = 0; i < numNodes; i++) {
|
|
counts[i] = 1;
|
|
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
|
|
}
|
|
|
|
if (apiOpenCL()) {
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
} else if (apiCuda()) {
|
|
|
|
//async scatter
|
|
//use cuda aware mpi
|
|
MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, rootNode, comm );
|
|
return DKS_ERROR;
|
|
} else if (apiOpenMP()) {
|
|
|
|
//TODO: gather all the date in root node, transfer to device from root node
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
return DKS_SUCCESS;
|
|
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Read data from device memory.
|
|
* Read data referenced by mem_ptr int out_data. Elements indicates the number of data
|
|
* elements to read and offset is the offset on the device from start of the memroy.
|
|
* Data type to read is specified by T. Performs a blocking read.
|
|
*/
|
|
template<typename T>
|
|
int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) {
|
|
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
cl_mem clmem_ptr = (cl_mem)mem_ptr;
|
|
size_t size = sizeof(T)*elements;
|
|
size_t offset_bytes = sizeof(T)*offset;
|
|
return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
|
|
#endif
|
|
} else if (apiCuda()){
|
|
size_t size = sizeof(T)*elements;
|
|
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
|
|
} else if (apiOpenMP()) {
|
|
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
|
|
}
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Performs an async data read from device.
|
|
* Queues data read from device and returns control to host. stream id specifies stream to use for
|
|
* the read. Device async read can be performed if host memroy is page-locked and strema other than
|
|
* default -1 is used. For other parameter detailed description see readData function.
|
|
* TODO: opencl and mic implementations (currently reverts to blocking reads).
|
|
*/
|
|
template<typename T>
|
|
int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) {
|
|
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
//OpenCL version
|
|
cl_mem clmem_ptr = (cl_mem)mem_ptr;
|
|
size_t size = sizeof(T)*elements;
|
|
return oclbase->ocl_readData(clmem_ptr, out_data, size, 0);
|
|
#endif
|
|
} else if (apiCuda()){
|
|
//cuda version
|
|
size_t size = sizeof(T)*elements;
|
|
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
|
|
} else if (apiOpenMP()) {
|
|
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
|
|
streamId, offset));
|
|
}
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
|
|
/**
|
|
* Free memory allocated on device.
|
|
* Free memory referenced by mem_ptr, elements - number of elements in memory,
|
|
* T - data type.
|
|
*/
|
|
template<typename T>
|
|
int freeMemory(void *mem_ptr, int elements) {
|
|
if (apiOpenCL())
|
|
return OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr));
|
|
else if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
|
|
else if (apiOpenMP())
|
|
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Create random numbers on the device and fille mem_data array
|
|
*/
|
|
int callCreateRandomNumbers(void *mem_ptr, int size);
|
|
|
|
/**
|
|
* Init random number states and save for reuse on device.
|
|
* TODO: opencl and mic implementations.
|
|
*/
|
|
int callInitRandoms(int size);
|
|
|
|
/**
|
|
* Print memory information on device (total, used, available)
|
|
* TODO: opencl and mic imlementation
|
|
*/
|
|
int callMemInfo() {
|
|
if (apiCuda())
|
|
return CUDA_SAFECALL(cbase->cuda_memInfo());
|
|
|
|
return DKS_ERROR;
|
|
}
|
|
|
|
/**
|
|
* Test function to profile opencl kernel calls.
|
|
* Used for debuging and timing purposes only.
|
|
*/
|
|
void oclEventInfo() {
|
|
if (apiOpenCL())
|
|
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
|
|
|
|
}
|
|
|
|
/**
|
|
* Test function to profile opencl kernel calls.
|
|
* Used for debuging and timing purposes only.
|
|
*/
|
|
void oclClearEvents() {
|
|
if (apiOpenCL()) {
|
|
#ifdef DKS_OPENCL
|
|
oclbase->ocl_clearEvents();
|
|
#endif
|
|
}
|
|
}
|
|
|
|
|
|
};
|
|
|
|
#endif
|