Files
DKS/src/DKSBase.h
2017-02-28 15:06:45 +01:00

945 lines
28 KiB
C++

/** DKSBase class.
* DKSBase.h
* Author: Uldis Locans
* Date: 15.09.2014
* Base class of Dynamic Kernel Scheduler that handles the function calls
* from host application to DKS
*/
#ifndef H_DKS_BASE
#define H_DKS_BASE
#include <iostream>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include "DKSDefinitions.h"
#ifdef DKS_MPI
#include <mpi.h>
#endif
#ifdef DKS_OPENCL
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "OpenCL/OpenCLBase.h"
#include "OpenCL/OpenCLChiSquare.h"
#endif
#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#include "OpenCL/OpenCLGreensFunction.h"
#endif
#ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh"
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaChiSquare.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#include "nvToolsExt.h"
#endif
#ifdef DKS_MIC
#include "MIC/MICBase.h"
#include "MIC/MICChiSquare.h"
#include "MIC/MICFFT.h"
#include "MIC/MICCollimatorPhysics.h"
#include "MIC/MICGreensFunction.hpp"
#endif
#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"
#include "AutoTuning/DKSConfig.h"
/** DKSBase class for handling function calls to DKS library */
class DKSBase {
private:
char *m_device_name;
char *m_api_name;
char *m_function_name;
bool m_device_set;
bool m_api_set;
bool m_function_set;
bool m_auto_tuning;
bool m_use_config;
#ifdef DKS_OPENCL
OpenCLBase *oclbase;
OpenCLChiSquare *oclchi;
#endif
#ifdef DKS_CUDA
CudaBase *cbase;
CudaChiSquare *cchi;
#endif
#ifdef DKS_MIC
MICBase *micbase;
MICChiSquare *micchi;
#endif
protected:
//gives access to dks autotuning config file
DKSConfig dksconfig;
/**
* Check if current API is set to OpenCL
* Return true/false wether current api is opencl
*/
bool apiOpenCL();
/**
* Check if current API is set to CUDA.
* Return true/false wether curretn api is cuda
*/
bool apiCuda();
/**
* Check if current API is set to OpenMP.
* Return true/false whether current api is OpenMP
*/
bool apiOpenMP();
/** Check if device is GPU */
bool deviceGPU();
/** Check if device is CPU */
bool deviceCPU();
/** Check if device is MIC */
bool deviceMIC();
/**
* Get cbase pointer
*/
#ifdef DKS_CUDA
CudaBase *getCudaBase() {
return cbase;
}
#endif
#ifdef DKS_OPENCL
OpenCLBase *getOpenCLBase() {
return oclbase;
}
#endif
#ifdef DKS_MIC
MICBase *getMICBase() {
return micbase;
}
#endif
/** Call OpenCL base to load specified kenrel file.
*
*/
int loadOpenCLKernel(const char *kernel_name);
std::string getAPI() {
std::string api_name(m_api_name);
return api_name;
}
std::string getDevice() {
std::string device_name(&m_device_name[1]);
return device_name;
}
public:
/**
* Default constructor.
*/
DKSBase();
/**
* Constructor that sets api and devcie to use with DKS.
*/
DKSBase(const char* api_name, const char* device_name);
/**
* Destructor.
* Free DKS resources.
*/
~DKSBase();
/** Function to initialize objects based on the device used.
*
*/
int setupDevice();
/** Turn on auto tuning */
void setAutoTuningOn() { m_auto_tuning = true; }
/** Turn of auto tuning */
void setAutoTuningOff() { m_auto_tuning = false; }
/** Get status of auto tuning */
bool isAutoTuningOn() { return m_auto_tuning; }
/** Turn on use of config file */
void setUseConfigOn() { m_use_config = true; }
/** Turn off use of config file */
void setUseConfigOff() { m_use_config = false; }
/** Check if using config file */
bool isUseConfigOn() { return m_use_config; }
/**
* Set device to use with DKS.
* Sets specific device to use with DKS. Supported devices are -gpu and -mic.
* Length specifies the number of characters in device_name array (length - deprecated).
* Return success or error code.
*/
int setDevice(const char* device_name, int length = -1);
/**
* Set framework to use with DKS.
* Sets framework and API that DKS uses to execute code on device. Supported API's
* are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies
* the number of characters in api_name array (length - deprecated).
*/
int setAPI(const char* api_name, int length = -1);
/**
* Prints information about all available devices.
* Calls CUDA, OpenCL and MIC functions to query for available devices
* for each framework and pirnts information about each device. Length specifies
* the number of characters in api_name array
* Returns success or error code
*/
int getDevices();
/**
* Returns device count.
* Saves the number of the devices available on the platform to ndev.
*/
int getDeviceCount(int &ndev);
/** Get the name of the device in use.
* Query the device that is used and get the naem of the device. The name is saved in the
* device_name string. Returns DKS_SUCCESS
*/
int getDeviceName(std::string &device_name);
/** Set the device to use.
* Pass the index of the device to use by dks.
*/
int setDefaultDevice(int device);
/** Get unique devices.
* Get a list of all the unique devices available on the platform.
* When API and device type for DKS is set, getDeviceList can get all the unique devices
* available for this API and device type. Used for autotuning if multiple different GPUs are
* installed on the system.
*/
int getDeviceList(std::vector<int> &devices);
/**
* Inititialize DKS.
* Set framework and device to use. If OpenCL is used create context with device.
* Return success or error code.
*/
int initDevice();
/**
* Create stream for async execution.
* Function to create different streams with device to allow assync kernel execution and data
* transfer. Currently implemented for CUDA with cuda streams. streamId will be can be used later
* use the created stream. Returns success or error code.
* TODO: for opencl use different
* contexts similar as cuda streams to achieve async execution. TODO: for intel mic look at
* library (libxstream) from Hans Pabst.
*/
int createStream(int &streamId);
/**
* Send pointer to device memory from one MPI process to another.
* Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
* cuda icp. Gets icp handle of memory allocated on device pointed by mem_ptr does MPI_Send to
* dest process where matching receivePointer should be called. Returns success or error code.
* TODO: opencl and mic cases still need implementations
*/
#ifdef DKS_MPI
int sendPointer(void *mem_ptr, int dest, MPI_Comm comm);
#endif
/**
* Receive pointer to device memory from another MPI process.
* Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
* cuda icp. Uses MPI_Recv to get icp handle from another MPI process and opens a reference
* to this memory. Togeter with sendPointer function allows multiple MPI processes to share
* one memory region of the device. Returns success or error code.
* TODO: opencl and mic cases still need implementations
*/
#ifdef DKS_MPI
void * receivePointer(int hostproc, MPI_Comm comm, int &ierr);
#endif
/**
* Close handle to device memory.
* If receivePointer is used to open memory handle allocated by another MPI process closeHandle
* should be called to free resources instead of freeMemory. Returns success or error code.
* TODO: opencl and mic cases still need implementations.
*/
int closeHandle(void *mem_ptr);
/**
* Wait till all tasks running on device are completed.
* Forces a device synchronization - waits till all tasks on the device are complete.
* Implemented for cuda. Forces sync only in context in witch it is called - only waits
* for tasks launched by process calling syncDevice. If multiple processes launch different
* tasks each process is responsible for its own synchronization. Returns success or error code.
* TODO: opencl and mic implementations still necessary
*/
int syncDevice();
/**
* Allocate memory and transfer data to device.
* Returns a void pointer which can be used in later kernels to reference
* allocated device memory. data_in pointer to data to be transfered to device,
* elements is the number of data elements to transfer, T - type of data to transfer.
* If memory allocation or data transfer fails ierr will be set to error code.
*/
template <typename T>
void * pushData(const void *data_in, int elements, int &ierr) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
cl_mem mem_ptr;
size_t size = sizeof(T)*elements;
mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE);
ierr = DKS_SUCCESS;
return mem_ptr;
#endif
} else if (apiCuda()){
#ifdef DKS_CUDA
//cuda version
void * mem_ptr = NULL;
size_t size = sizeof(T)*elements;
mem_ptr = cbase->cuda_allocateMemory(size, ierr);
cbase->cuda_writeData((T*)mem_ptr, data_in, size);
ierr = DKS_SUCCESS;
return mem_ptr;
#endif
} else if (apiOpenMP()) {
#ifdef DKS_MIC
void * mem_ptr = NULL;
mem_ptr = micbase.mic_pushData<T>(data_in, elements);
return mem_ptr;
#endif
}
ierr = DKS_ERROR;
return NULL;
}
/**
* Read data from device and free device memory.
* Reads data from device pointed by mem_ptr into data_out pointer. Elements
* specifies the number of data elements to read, T specifies the datatype of
* elements to copy. Returns error code if read data or free memory fails.
*/
template<typename T>
int pullData(void *mem_ptr, void* data_out, int elements) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
size_t size = sizeof(T)*elements;
cl_mem clmem_ptr = (cl_mem)mem_ptr;
oclbase->ocl_readData(clmem_ptr, data_out, size);
oclbase->ocl_freeMemory(clmem_ptr);
#endif
} else if (apiCuda()) {
#ifdef DKS_CUDA
//cuda version
size_t size = sizeof(T)*elements;
cbase->cuda_readData((T*)mem_ptr, data_out, size);
cbase->cuda_freeMemory(mem_ptr);
#endif
} else if (apiOpenMP()) {
#ifdef DKS_MIC
micbase.mic_pullData<T>(mem_ptr, data_out, elements);
#endif
}
return DKS_SUCCESS;
}
/**
* Allocate memory on device and return pointer to device memory.
* Allocates memory of type T, elements specifies the number of
* elements for which memory should be allocated. If memory allocation
* fails ierr is set to error code. Returns void pointer to device memory.
*/
template<typename T>
void * allocateMemory(int elements, int &ierr) {
ierr = DKS_SUCCESS;
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
cl_mem mem_ptr;
size_t size = sizeof(T)*elements;
mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
return mem_ptr;
#endif
} else if (apiCuda()) {
#ifdef DKS_CUDA
//cuda version
void * mem_ptr = NULL;
size_t size = sizeof(T)*elements;
mem_ptr = cbase->cuda_allocateMemory(size, ierr);
return mem_ptr;
#endif
} else if (apiOpenMP()) {
#ifdef DKS_MIC
void * mem_ptr = NULL;
mem_ptr = micbase->mic_allocateMemory<T>(elements);
return mem_ptr;
#endif
}
ierr = DKS_ERROR;
return NULL;
}
/**
* Allocates host memory as page-locked.
* Used for memroy allocation on the host side for pointer ptr for size elements.
* Page locked memory improves
* data transfer rates between host and device and allows async data transfer
* and kernel execution. Reurns succes or error code.
* TODO: opencl and mic implementations needed.
*/
template<typename T>
int allocateHostMemory(T *&ptr, int size)
{
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size));
DEBUG_MSG("Pinned memory allocation not implemented for this platform");
return DKS_ERROR;
}
/**
* Free host page-locked memory.
* Used to free page-locked memory on the host that was allocated using
* allocateHostMemory. ptr is the host pointer where page-locked memory was allocated,
* size - number of elements held by the memroy.
*/
template<typename T>
int freeHostMemory(T* &ptr, int size)
{
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr));
return DKS_ERROR;
}
/**
* Page lock allocated host memory.
* Page locked memory improves data transfer between host and device (true for cuda and
* opencl, maybe also mic). ptr - pointer to memory that needs to be page locked,
* size - number of elements in array.
* TODO: mic and opencl implementations needed
*/
template <typename T>
int registerHostMemory(T *ptr, int size) {
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size));
return DKS_ERROR;
}
/**
* Unregister page locked memory.
* TODO: opencl and mic implementations needed·
*/
template <typename T>
int unregisterHostMemory(T *ptr) {
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr));
return DKS_ERROR;
}
/**
* Write data from host to device.
* Write data from data to device memory referenced by mem_ptr. Elements spicify the
* number of elements to write, offset specifies the offset from the first element.
* Returns success or error code. Performs a blocking write - control to the host
* is returned only when data transfer is complete.
*/
template<typename T>
int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
size_t size = sizeof(T)*elements;
size_t offset_bytes = sizeof(T)*offset;
cl_mem clmem_ptr = (cl_mem)mem_ptr;
return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE);
#endif
} else if (apiCuda()){
//cuda version
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));
}
return DKS_ERROR;
}
/**
* Write data to device using async write.
* Queue a async data write and return control to host imediately.
* mem_ptr - device memory pointer, data - host memory pointer,
* elements - number of data elements to write
* stremaId - stream id to use, offset - offset on device from first element
* For trully async execution on cuda stream other than default needs to be created
* and device memory must be page-locked. Otherwise functions just asynchronosly with
* respect to host.
* TODO: mic and opencl implementations needed (goes to blocking writes)
*/
template<typename T>
int writeDataAsync(void *mem_ptr, const void *data, int elements,
int streamId = -1, int offset = 0) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
size_t size = sizeof(T)*elements;
cl_mem clmem_ptr = (cl_mem)mem_ptr;
oclbase->ocl_writeData(clmem_ptr, data, size, 0, CL_FALSE);
#endif
} else if (apiCuda()){
//cuda version
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
}
return DKS_ERROR;
}
/**
* Gather 3D data from multiple mpi processes to one memory region.
* When multiple processes share the same device memory using sendPointer and receivePointer
* gather3DDataAsync allows each process to write data to its memory region. Uses async writes.
* mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
* data dimensions, id - starting indexes in global domain for each process
* streamId - stream to use for data transfers.
* Returns success or error code.
*/
#ifdef DKS_MPI
template<typename T>
int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3],
int id[3], int streamId = -1 ) {
//int p = 1;
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int hoffset, doffset, ierr;
//number of continuous memory elements
int elements = Nl[0];
if (Nl[0] == Ng[0]) {
elements *= Nl[1];
if (Nl[1] == Ng[1])
elements *= Nl[2];
}
//starting index
int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
//copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
if (Nl[0] != Ng[0]) {
for (int i = 0; i < Nl[2]; i++) {
for (int j = 0; j < Nl[1]; j++) {
doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
hoffset = (i * Nl[1] + j) * elements;
ierr = writeDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
if (ierr == DKS_ERROR) return DKS_ERROR;
}
}
return DKS_SUCCESS;
}
//copy piece by piece 3rd dim if 2nd dim is split
if (Nl[1] != Ng[1]) {
for (int i = 0; i < Nl[2]; i++) {
doffset = i* Ng[1] * Ng[0] + sid;
ierr = writeDataAsync<T>(mem_ptr, data + i*elements, elements, streamId, doffset);
if (ierr == DKS_ERROR) return DKS_ERROR;
}
return DKS_SUCCESS;
}
//if only 3rd dim is split all elements are continuous so write one chunk
doffset = sid;
return writeDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
}
#endif
/**
* Scatter 3D data to multiple MPI processes from one device memory region.
* When multiple processes share the same device memory using sendPointer and receivePointer
* scatter3DDataAsync allows each process to read data from its memory region. Uses async reads.
* mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
* data dimensions, id - starting indexes in global domain for each process
* streamId - stream to use for data transfers.
* Returns success or error code.
*/
#ifdef DKS_MPI
template<typename T>
int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3],
int id[3], int streamId = -1) {
//int p = 1;
//int rank;
//MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int hoffset, doffset, ierr;
//number of continuous memory elements
int elements = Nl[0];
if (Nl[0] == Ng[0]) {
elements *= Nl[1];
if (Nl[1] == Ng[1])
elements *= Nl[2];
}
//starting index
int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];
//copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
if (Nl[0] != Ng[0]) {
for (int i = 0; i < Nl[2]; i++) {
for (int j = 0; j < Nl[1]; j++) {
doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
hoffset = (i * Nl[1] + j) * elements;
ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
if (ierr == DKS_ERROR) return DKS_ERROR;
}
}
return DKS_SUCCESS;
}
//copy piece by piece 3rd dim if 2nd dim is split
if (Nl[1] != Ng[1]) {
for (int i = 0; i < Nl[2]; i++) {
doffset = i* Ng[1] * Ng[0] + sid;
hoffset = i * elements;
ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
if (ierr == DKS_ERROR) return DKS_ERROR;
}
return DKS_SUCCESS;
}
//if only 3rd dim is split all elements are continuous so write one chunk
doffset = sid;
return readDataAsync<T>(mem_ptr, data, elements, streamId, doffset);
}
#endif
/**
* Create MPI subarray for 3D data gather and scatter using cuda aware MPI.
* If multiple MPI processes share device and cuda aware MPI is used for data transfer
* creates a MPI subarray so each MPI process can write and read to its own memory region.
* N_global - global domain dimensions, N_local - local domain dimensions, datatype - MPI datatype
*/
#ifdef DKS_MPI
template<typename T>
MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) {
//create MPI datatypes to transfer decomposed domain from GPU memory
int sizes[3] = {N_global[2], N_global[1], N_global[0]};
int subsizes[3] = {N_local[2], N_local[1], N_local[0]};
int starts[3] = {0, 0, 0};
MPI_Datatype stype, rtype;
MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype);
MPI_Type_create_resized(stype, 0, sizeof(T), &rtype);
MPI_Type_commit(&rtype);
return rtype;
}
#endif
/**
* Gather 3D data from multiple MPI processes to device using cuda aware MPI.
* Using cuda aware mpi allows to gather data to one device memory region allocated
* by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer,
* size - number of elements to transfer, stype - data type of elements, N_global -
* global dimensions of the domain, N_local - local domain dimensions,
* idx,idy,idz - starting indexes in global domain for each process, numNodes - number
* of processes, myNode - current node, rootNode - node that allocated device memory,
* comm - MPI communicator
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
*/
#ifdef DKS_MPI
template<typename T>
int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
int N_local[3], int * idx, int * idy, int * idz,
int numNodes, int myNode, int rootNode, MPI_Comm comm)
{
MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
//calculate displacements from global domain size and local domain starting index
int *counts = new int[numNodes];
int *displs = new int[numNodes];
for (int i = 0; i < numNodes; i++) {
counts[i] = 1;
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
}
if (apiOpenCL()) {
//TODO: gather all the date in root node, transfer to device from root node
return DKS_ERROR;
} else if (apiCuda()) {
MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm );
} else if (apiOpenMP()) {
//TODO: gather all the date in root node, transfer to device from root node
return DKS_ERROR;
}
return DKS_SUCCESS;
}
#endif
/**
* Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather.
* For detailed parameter description see gather3DData docs.
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
*/
#ifdef DKS_MPI
template<typename T>
int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
int N_local[3], int * idx, int * idy, int * idz,
int numNodes, int myNode, int rootNode,
MPI_Comm comm, MPI_Request &request)
{
MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);
//calculate displacements from global domain size and local domain starting index
int *counts = new int[numNodes];
int *displs = new int[numNodes];
for (int i = 0; i < numNodes; i++) {
counts[i] = 1;
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
}
if (apiOpenCL()) {
//TODO: gather all the date in root node, transfer to device from root node
return DKS_ERROR;
} else if (apiCuda()) {
MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm, &request );
} else if (apiOpenMP()) {
//TODO: gather all the date in root node, transfer to device from root node
return DKS_ERROR;
}
return DKS_SUCCESS;
}
#endif
/**
* Scatter 3D data from device to multiple MPI processes using cuda aware MPI.
* If multiple MPI prcesses share one device allows to scatter 3D data regions
* from device memory allocated by one of the processes to all other MPI processes.
* For detailed parameter description see gather3DData docs.
* TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
*/
#ifdef DKS_MPI
template<typename T>
int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3],
int N_local[3], int * idx, int * idy, int * idz,
int numNodes, int myNode, int rootNode, MPI_Comm comm)
{
MPI_Datatype stype = create3DMPISubarray<T>(N_global, N_local, rtype);
//calculate displacements from global domain size and local domain starting index
int *counts = new int[numNodes];
int *displs = new int[numNodes];
for (int i = 0; i < numNodes; i++) {
counts[i] = 1;
displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
}
if (apiOpenCL()) {
//TODO: gather all the date in root node, transfer to device from root node
} else if (apiCuda()) {
//async scatter
//use cuda aware mpi
MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, rootNode, comm );
return DKS_ERROR;
} else if (apiOpenMP()) {
//TODO: gather all the date in root node, transfer to device from root node
return DKS_ERROR;
}
return DKS_SUCCESS;
}
#endif
/**
* Read data from device memory.
* Read data referenced by mem_ptr int out_data. Elements indicates the number of data
* elements to read and offset is the offset on the device from start of the memroy.
* Data type to read is specified by T. Performs a blocking read.
*/
template<typename T>
int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
cl_mem clmem_ptr = (cl_mem)mem_ptr;
size_t size = sizeof(T)*elements;
size_t offset_bytes = sizeof(T)*offset;
return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
#endif
} else if (apiCuda()){
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
}
return DKS_ERROR;
}
/**
* Performs an async data read from device.
* Queues data read from device and returns control to host. stream id specifies stream to use for
* the read. Device async read can be performed if host memroy is page-locked and strema other than
* default -1 is used. For other parameter detailed description see readData function.
* TODO: opencl and mic implementations (currently reverts to blocking reads).
*/
template<typename T>
int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
//OpenCL version
cl_mem clmem_ptr = (cl_mem)mem_ptr;
size_t size = sizeof(T)*elements;
return oclbase->ocl_readData(clmem_ptr, out_data, size, 0);
#endif
} else if (apiCuda()){
//cuda version
size_t size = sizeof(T)*elements;
return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
} else if (apiOpenMP()) {
return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
streamId, offset));
}
return DKS_ERROR;
}
/**
* Free memory allocated on device.
* Free memory referenced by mem_ptr, elements - number of elements in memory,
* T - data type.
*/
template<typename T>
int freeMemory(void *mem_ptr, int elements) {
if (apiOpenCL())
return OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr));
else if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
else if (apiOpenMP())
return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));
return DKS_ERROR;
}
/**
* Create random numbers on the device and fille mem_data array
*/
int callCreateRandomNumbers(void *mem_ptr, int size);
/**
* Init random number states and save for reuse on device.
* TODO: opencl and mic implementations.
*/
int callInitRandoms(int size);
/**
* Print memory information on device (total, used, available)
* TODO: opencl and mic imlementation
*/
int callMemInfo() {
if (apiCuda())
return CUDA_SAFECALL(cbase->cuda_memInfo());
return DKS_ERROR;
}
/**
* Test function to profile opencl kernel calls.
* Used for debuging and timing purposes only.
*/
void oclEventInfo() {
if (apiOpenCL())
return OPENCL_SAFECALL(oclbase->ocl_eventInfo());
}
/**
* Test function to profile opencl kernel calls.
* Used for debuging and timing purposes only.
*/
void oclClearEvents() {
if (apiOpenCL()) {
#ifdef DKS_OPENCL
oclbase->ocl_clearEvents();
#endif
}
}
};
#endif