DKS/src/DKSBase.h

/** DKSBase class.
 * DKSBase.h
 * Author: Uldis Locans
 * Date: 15.09.2014
 * Base class of Dynamic Kernel Scheduler that handles the function calls
 * from host application to DKS
 */

#ifndef H_DKS_BASE
#define H_DKS_BASE

#include <iostream>
#include <string.h>
#include <time.h>
#include <sys/time.h>

#include "DKSDefinitions.h"

#ifdef DKS_MPI
#include <mpi.h>
#endif

#ifdef DKS_OPENCL

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include "OpenCL/OpenCLBase.h"
#include "OpenCL/OpenCLChiSquare.h"
#endif

#ifdef DKS_AMD
#include "OpenCL/OpenCLFFT.h"
#include "OpenCL/OpenCLCollimatorPhysics.h"
#include "OpenCL/OpenCLGreensFunction.h"
#endif

#ifdef DKS_CUDA
#include "CUDA/CudaBase.cuh"
#include "CUDA/CudaFFT.cuh"
#include "CUDA/CudaGreensFunction.cuh"
#include "CUDA/CudaChiSquare.cuh"
#include "CUDA/CudaCollimatorPhysics.cuh"
#include "nvToolsExt.h"
#endif

#ifdef DKS_MIC
#include "MIC/MICBase.h"
#include "MIC/MICChiSquare.h"
#include "MIC/MICFFT.h"
#include "MIC/MICCollimatorPhysics.h"
#include "MIC/MICGreensFunction.hpp"
#endif

#include "Algorithms/GreensFunction.h"
#include "Algorithms/CollimatorPhysics.h"
#include "Algorithms/FFT.h"

#include "AutoTuning/DKSConfig.h"

/** DKSBase class for handling function calls to DKS library */
class DKSBase {

private:
  char *m_device_name;
  char *m_api_name;
  char *m_function_name;

  bool m_device_set;
  bool m_api_set;
  bool m_function_set;

  bool m_auto_tuning;
  bool m_use_config;

#ifdef DKS_OPENCL
  OpenCLBase *oclbase;
  OpenCLChiSquare *oclchi;
#endif

#ifdef DKS_CUDA
  CudaBase *cbase;
  CudaChiSquare *cchi;
#endif

#ifdef DKS_MIC
  MICBase *micbase;
  MICChiSquare *micchi;
#endif

protected:

  //gives access to dks autotuning config file
  DKSConfig dksconfig;

  /**
   * Check if current API is set to OpenCL
   * Return true/false wether current api is opencl
   */
  bool apiOpenCL();

  /**
   * Check if current API is set to CUDA.
   * Return true/false wether curretn api is cuda
   */
  bool apiCuda();

  /**
   * Check if current API is set to OpenMP.
   * Return true/false whether current api is OpenMP
   */
  bool apiOpenMP();

  /** Check if device is GPU */
  bool deviceGPU();
  /** Check if device is CPU */
  bool deviceCPU();
  /** Check if device is MIC */
  bool deviceMIC();

  /**
   * Get cbase pointer
   */
#ifdef DKS_CUDA
  CudaBase *getCudaBase() {
    return cbase;
  }
#endif

#ifdef DKS_OPENCL
  OpenCLBase *getOpenCLBase() {
    return oclbase;
  }
#endif

#ifdef DKS_MIC
  MICBase *getMICBase() {
    return micbase;
  }
#endif

  /** Call OpenCL base to load specified kenrel file.
   *
   */
  int loadOpenCLKernel(const char *kernel_name);

  std::string getAPI() {
    std::string api_name(m_api_name);
    return api_name;
  }

  std::string getDevice() {
    std::string device_name(&m_device_name[1]);
    return device_name;
  }


public:

  /**
   * Default constructor.
   */
  DKSBase();

  /**
   * Constructor that sets api and devcie to use with DKS.
   */
  DKSBase(const char* api_name, const char* device_name);


  /**
   * Destructor.
   * Free DKS resources.
   */
  ~DKSBase();

  /** Function to initialize objects based on the device used.
   *
   */
  int setupDevice();

  /** Turn on auto tuning */
  void setAutoTuningOn() { m_auto_tuning = true; }

  /** Turn of auto tuning */
  void setAutoTuningOff() { m_auto_tuning = false; }

  /** Get status of auto tuning */
  bool isAutoTuningOn() { return m_auto_tuning; }

  /** Turn on use of config file */
  void setUseConfigOn() { m_use_config = true; }

  /** Turn off use of config file */
  void setUseConfigOff() { m_use_config = false; }

  /** Check if using config file */
  bool isUseConfigOn() { return m_use_config; }

  /**
   * Set device to use with DKS.
   * Sets specific device to use with DKS. Supported devices are -gpu and -mic.
   * Length specifies the number of characters in device_name array (length - deprecated).
   * Return success or error code.
   */
  int setDevice(const char* device_name, int length = -1);

  /**
   * Set framework to use with DKS.
   * Sets framework and API that DKS uses to execute code on device. Supported API's
   * are OpenCL, CUDA and OpenMP. Returns success or error code. Length specifies
   * the number of characters in api_name array (length - deprecated).
   */
  int setAPI(const char* api_name, int length = -1);

  /**
   * Prints information about all available devices.
   * Calls CUDA, OpenCL and MIC functions to query for available devices
   * for each framework and pirnts information about each device. Length specifies
   * the number of characters in api_name array
   * Returns success or error code
   */
  int getDevices();

  /**
   * Returns device count.
   * Saves the number of the devices available on the platform to ndev.
   */
  int getDeviceCount(int &ndev);

  /** Get the name of the device in use.
   *  Query the device that is used and get the naem of the device. The name is saved in the
   *  device_name string. Returns DKS_SUCCESS
   */
  int getDeviceName(std::string &device_name);

  /** Set the device to use.
   *  Pass the index of the device to use by dks.
   */
  int setDefaultDevice(int device);

  /** Get unique devices.
   *  Get a list of all the unique devices available on the platform.
   *  When API and device type for DKS is set, getDeviceList can get all the unique devices
   *  available for this API and device type. Used for autotuning if multiple different GPUs are
   *  installed on the system.
   */
  int getDeviceList(std::vector<int> &devices);

  /**
   * Inititialize DKS.
   * Set framework and device to use. If OpenCL is used create context with device.
   * Return success or error code.
   */
  int initDevice();

  /**
   * Create stream for async execution.
   * Function to create different streams with device to allow assync kernel execution and data
   * transfer. Currently implemented for CUDA with cuda streams. streamId will be can be used later
   * use the created stream. Returns success or error code.
   * TODO: for opencl use different
   * contexts similar as cuda streams to achieve async execution. TODO: for intel mic look at
   * library (libxstream) from Hans Pabst.
   */
  int createStream(int &streamId);

  /**
   * Send pointer to device memory from one MPI process to another.
   * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
   * cuda icp. Gets icp handle of memory allocated on device pointed by mem_ptr does MPI_Send to
   * dest process where matching receivePointer should be called. Returns success or error code.
   * TODO: opencl and mic cases still need implementations
   */
#ifdef DKS_MPI
  int sendPointer(void *mem_ptr, int dest, MPI_Comm comm);
#endif

  /**
   * Receive pointer to device memory from another MPI process.
   * Implemented only if mpi compiler is used to build DKS. Implemented only for cuda. Uses
   * cuda icp. Uses MPI_Recv to get icp handle from another MPI process and opens a reference
   * to this memory. Togeter with sendPointer function allows multiple MPI processes to share
   * one memory region of the device. Returns success or error code.
   * TODO: opencl and mic cases still need implementations
   */
#ifdef DKS_MPI
  void * receivePointer(int hostproc, MPI_Comm comm, int &ierr);
#endif

  /**
   * Close handle to device memory.
   * If receivePointer is used to open memory handle allocated by another MPI process closeHandle
   * should be called to free resources instead of freeMemory. Returns success or error code.
   * TODO: opencl and mic cases still need implementations.
   */
  int closeHandle(void *mem_ptr);

  /**
   * Wait till all tasks running on device are completed.
   * Forces a device synchronization - waits till all tasks on the device are complete.
   * Implemented for cuda. Forces sync only in context in witch it is called - only waits
   * for tasks launched by process calling syncDevice. If multiple processes launch different
   * tasks each process is responsible for its own synchronization. Returns success or error code.
   * TODO: opencl and mic implementations still necessary
   */
  int syncDevice();

  /**
   * Allocate memory and transfer data to device.
   * Returns a void pointer which can be used in later kernels to reference
   * allocated device memory. data_in pointer to data to be transfered to device,
   * elements is the number of data elements to transfer, T - type of data to transfer.
   * If memory allocation or data transfer fails ierr will be set to error code.
   */
  template <typename T>
  void * pushData(const void *data_in, int elements, int &ierr) {
    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      cl_mem mem_ptr;
      size_t size = sizeof(T)*elements;
      mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
      oclbase->ocl_writeData(mem_ptr, data_in, size, CL_FALSE);

      ierr = DKS_SUCCESS;
      return mem_ptr;
#endif
    } else if (apiCuda()){
#ifdef DKS_CUDA
      //cuda version
      void * mem_ptr = NULL;
      size_t size = sizeof(T)*elements;
      mem_ptr = cbase->cuda_allocateMemory(size, ierr);
      cbase->cuda_writeData((T*)mem_ptr, data_in, size);

      ierr = DKS_SUCCESS;
      return mem_ptr;
#endif
    } else if (apiOpenMP()) {
#ifdef DKS_MIC
      void * mem_ptr = NULL;
      mem_ptr = micbase.mic_pushData<T>(data_in, elements);

      return mem_ptr;
#endif
    }

    ierr = DKS_ERROR;
    return NULL;
  }

  /**
   * Read data from device and free device memory.
   * Reads data from device pointed by mem_ptr into data_out pointer. Elements
   * specifies the number of data elements to read, T specifies the datatype of
   * elements to copy. Returns error code if read data or free memory fails.
   */
  template<typename T>
  int pullData(void *mem_ptr, void* data_out, int elements) {

    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      size_t size = sizeof(T)*elements;
      cl_mem clmem_ptr = (cl_mem)mem_ptr;
      oclbase->ocl_readData(clmem_ptr, data_out, size);
      oclbase->ocl_freeMemory(clmem_ptr);
#endif
    } else if (apiCuda()) {
#ifdef DKS_CUDA
      //cuda version
      size_t size = sizeof(T)*elements;
      cbase->cuda_readData((T*)mem_ptr, data_out, size);
      cbase->cuda_freeMemory(mem_ptr);
#endif
    } else if (apiOpenMP()) {
#ifdef DKS_MIC
      micbase.mic_pullData<T>(mem_ptr, data_out, elements);
#endif
    }

    return DKS_SUCCESS;
  }

  /**
   * Allocate memory on device and return pointer to device memory.
   * Allocates memory of type T, elements specifies the number of
   * elements for which memory should be allocated. If memory allocation
   * fails ierr is set to error code. Returns void pointer to device memory.
   */
  template<typename T>
  void * allocateMemory(int elements, int &ierr) {
    ierr = DKS_SUCCESS;
    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      cl_mem mem_ptr;
      size_t size = sizeof(T)*elements;
      mem_ptr = oclbase->ocl_allocateMemory(size, ierr);
      return mem_ptr;
#endif
    } else if (apiCuda()) {
#ifdef DKS_CUDA
      //cuda version
      void * mem_ptr = NULL;
      size_t size = sizeof(T)*elements;
      mem_ptr = cbase->cuda_allocateMemory(size, ierr);
      return mem_ptr;
#endif
    } else if (apiOpenMP()) {
#ifdef DKS_MIC
      void * mem_ptr = NULL;
      mem_ptr = micbase->mic_allocateMemory<T>(elements);
      return mem_ptr;
#endif
    }

    ierr = DKS_ERROR;
    return NULL;
  }

  /**
   * Allocates host memory as page-locked.
   * Used for memroy allocation on the host side for pointer ptr for size elements.
   * Page locked memory improves
   * data transfer rates between host and device and allows async data transfer
   * and kernel execution. Reurns succes or error code.
   * TODO: opencl and mic implementations needed.
   */
  template<typename T>
  int allocateHostMemory(T *&ptr, int size)
  {
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_allocateHostMemory(ptr, size));

    DEBUG_MSG("Pinned memory allocation not implemented for this platform");
    return DKS_ERROR;
  }

  /**
   * Free host page-locked memory.
   * Used to free page-locked memory on the host that was allocated using
   * allocateHostMemory. ptr is the host pointer where page-locked memory was allocated,
   * size - number of elements held by the memroy.
   */
  template<typename T>
  int freeHostMemory(T* &ptr, int size)
  {
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_freeHostMemory(ptr));

    return DKS_ERROR;
  }

  /**
   * Page lock allocated host memory.
   * Page locked memory improves data transfer between host and device (true for cuda and
   * opencl, maybe also mic). ptr - pointer to memory that needs to be page locked,
   * size - number of elements in array.
   * TODO: mic and opencl implementations needed
   */
  template <typename T>
  int registerHostMemory(T *ptr, int size) {
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_hostRegister(ptr, size));

    return DKS_ERROR;
  }

  /**
   * Unregister page locked memory.
   * TODO: opencl and mic implementations needed·
   */
  template <typename T>
  int unregisterHostMemory(T *ptr) {
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_hostUnregister(ptr));
    return DKS_ERROR;
  }

  /**
   * Write data from host to device.
   * Write data from data to device memory referenced by mem_ptr. Elements spicify the
   * number of elements to write, offset specifies the offset from the first element.
   * Returns success or error code. Performs a blocking write - control to the host
   * is returned only when data transfer is complete.
   */
  template<typename T>
  int writeData(void *mem_ptr, const void *data, int elements, int offset = 0) {

    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      size_t size = sizeof(T)*elements;
      size_t offset_bytes = sizeof(T)*offset;
      cl_mem clmem_ptr = (cl_mem)mem_ptr;
      return oclbase->ocl_writeData(clmem_ptr, data, size, offset_bytes, CL_FALSE);
#endif

    } else if (apiCuda()){
      //cuda version
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_writeData((T*)mem_ptr, data, size, offset));

    } else if (apiOpenMP()) {
      return MIC_SAFECALL(micbase->mic_writeData<T>(mem_ptr, data, elements, offset));

    }

    return DKS_ERROR;

  }

  /**
   * Write data to device using async write.
   * Queue a async data write and return control to host imediately.
   * mem_ptr - device memory pointer, data - host memory pointer,
   * elements - number of data elements to write
   * stremaId - stream id to use, offset - offset on device from first element
   * For trully async execution on cuda stream other than default needs to be created
   * and device memory must be page-locked. Otherwise functions just asynchronosly with
   * respect to host.
   * TODO: mic and opencl implementations needed (goes to blocking writes)
   */
  template<typename T>
  int writeDataAsync(void *mem_ptr, const void *data, int elements,
		     int streamId = -1, int offset = 0) {
    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      size_t size = sizeof(T)*elements;
      cl_mem clmem_ptr = (cl_mem)mem_ptr;
      oclbase->ocl_writeData(clmem_ptr, data, size, 0, CL_FALSE);
#endif
    } else if (apiCuda()){
      //cuda version
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_writeDataAsync((T*)mem_ptr, data, size, streamId, offset));
    } else if (apiOpenMP()) {
      return MIC_SAFECALL(micbase->mic_writeDataAsync<T>(mem_ptr, data, elements, streamId, offset));
    }

    return DKS_ERROR;

  }

  /**
   * Gather 3D data from multiple mpi processes to one memory region.
   * When multiple processes share the same device memory using sendPointer and receivePointer
   * gather3DDataAsync allows each process to write data to its memory region. Uses async writes.
   * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
   * data dimensions, id - starting indexes in global domain for each process
   * streamId - stream to use for data transfers.
   * Returns success or error code.
   */
#ifdef DKS_MPI
  template<typename T>
  int gather3DDataAsync(void *mem_ptr, const T *data, int Ng[3], int Nl[3],
			int id[3], int streamId = -1 ) {


    //int p = 1;
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int hoffset, doffset, ierr;

    //number of continuous memory elements
    int elements = Nl[0];
    if (Nl[0] == Ng[0]) {
      elements *= Nl[1];
      if (Nl[1] == Ng[1])
	elements *= Nl[2];
    }

    //starting index
    int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];

    //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
    if (Nl[0] != Ng[0]) {
      for (int i = 0; i < Nl[2]; i++) {
	for (int j = 0; j < Nl[1]; j++) {
	  doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
	  hoffset = (i * Nl[1] + j) * elements;
	  ierr = writeDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
	  if (ierr == DKS_ERROR) return DKS_ERROR;
	}
      }
      return DKS_SUCCESS;
    }

    //copy piece by piece 3rd dim if 2nd dim is split
    if (Nl[1] != Ng[1]) {
      for (int i = 0; i < Nl[2]; i++) {
	doffset = i* Ng[1] * Ng[0] + sid;
	ierr = writeDataAsync<T>(mem_ptr, data + i*elements, elements, streamId, doffset);
	if (ierr == DKS_ERROR) return DKS_ERROR;
      }
      return DKS_SUCCESS;
    }

    //if only 3rd dim is split all elements are continuous so write one chunk
    doffset = sid;
    return writeDataAsync<T>(mem_ptr, data, elements, streamId, doffset);

  }
#endif

  /**
   * Scatter 3D data to multiple MPI processes from one device memory region.
   * When multiple processes share the same device memory using sendPointer and receivePointer
   * scatter3DDataAsync allows each process to read data from its memory region. Uses async reads.
   * mem_ptr - device pointer, data - host pointer, Ng - global dimensions of data, Nl - local
   * data dimensions, id - starting indexes in global domain for each process
   * streamId - stream to use for data transfers.
   * Returns success or error code.
   */
#ifdef DKS_MPI
  template<typename T>
  int scatter3DDataAsync(const void *mem_ptr, T *data, int Ng[3], int Nl[3],
			 int id[3], int streamId = -1) {

    //int p = 1;
    //int rank;
    //MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int hoffset, doffset, ierr;

    //number of continuous memory elements
    int elements = Nl[0];
    if (Nl[0] == Ng[0]) {
      elements *= Nl[1];
      if (Nl[1] == Ng[1])
	elements *= Nl[2];
    }

    //starting index
    int sid = id[2] * Ng[1] * Ng[0] + id[1] * Ng[0] + id[0];

    //copy piece-by-piece 2nd and 3rd dim if 1st dimension is split
    if (Nl[0] != Ng[0]) {
      for (int i = 0; i < Nl[2]; i++) {
	for (int j = 0; j < Nl[1]; j++) {
	  doffset = i * Ng[1] * Ng[0] + j * Ng[0] + sid;
	  hoffset = (i * Nl[1] + j) * elements;
	  ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
	  if (ierr == DKS_ERROR) return DKS_ERROR;
	}
      }
      return DKS_SUCCESS;
    }

    //copy piece by piece 3rd dim if 2nd dim is split
    if (Nl[1] != Ng[1]) {
      for (int i = 0; i < Nl[2]; i++) {
	doffset = i* Ng[1] * Ng[0] + sid;
	hoffset = i * elements;
	ierr = readDataAsync<T>(mem_ptr, data + hoffset, elements, streamId, doffset);
	if (ierr == DKS_ERROR) return DKS_ERROR;
      }
      return DKS_SUCCESS;
    }

    //if only 3rd dim is split all elements are continuous so write one chunk
    doffset = sid;
    return readDataAsync<T>(mem_ptr, data, elements, streamId, doffset);

  }
#endif

  /**
   * Create MPI subarray for 3D data gather and scatter using cuda aware MPI.
   * If multiple MPI processes share device and cuda aware MPI is used for data transfer
   * creates a MPI subarray so each MPI process can write and read to its own memory region.
   * N_global - global domain dimensions, N_local - local domain dimensions, datatype - MPI datatype
   */
#ifdef DKS_MPI
  template<typename T>
  MPI_Datatype create3DMPISubarray(int N_global[3], int N_local[3], MPI_Datatype datatype) {
    //create MPI datatypes to transfer decomposed domain from GPU memory
    int sizes[3] = {N_global[2], N_global[1], N_global[0]};
    int subsizes[3] = {N_local[2], N_local[1], N_local[0]};
    int starts[3] = {0, 0, 0};

    MPI_Datatype stype, rtype;
    MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C, datatype, &stype);
    MPI_Type_create_resized(stype, 0, sizeof(T), &rtype);
    MPI_Type_commit(&rtype);

    return rtype;
  }
#endif

  /**
   * Gather 3D data from multiple MPI processes to device using cuda aware MPI.
   * Using cuda aware mpi allows to gather data to one device memory region allocated
   * by one of the mpi processes. mem_ptr - device pointer, data - host memory pointer,
   * size - number of elements to transfer, stype - data type of elements, N_global -
   * global dimensions of the domain, N_local - local domain dimensions,
   * idx,idy,idz - starting indexes in global domain for each process, numNodes - number
   * of processes, myNode - current node, rootNode - node that allocated device memory,
   * comm - MPI communicator
   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
   */
#ifdef DKS_MPI
  template<typename T>
  int gather3DData(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
		   int N_local[3], int * idx, int * idy, int * idz,
		   int numNodes, int myNode, int rootNode, MPI_Comm comm)
  {

    MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);

    //calculate displacements from global domain size and local domain starting index
    int *counts = new int[numNodes];
    int *displs = new int[numNodes];
    for (int i = 0; i < numNodes; i++) {
      counts[i] = 1;
      displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
    }

    if (apiOpenCL()) {
      //TODO: gather all the date in root node, transfer to device from root node
      return DKS_ERROR;
    } else if (apiCuda()) {
      MPI_Gatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm );
    } else if (apiOpenMP()) {
      //TODO: gather all the date in root node, transfer to device from root node
      return DKS_ERROR;
    }

    return DKS_SUCCESS;

  }
#endif

  /**
   * Gather 3D data from multiple MPI processes to device using cuda aware MPI and non blocking gather.
   * For detailed parameter description see gather3DData docs.
   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
   */
#ifdef DKS_MPI
  template<typename T>
  int gather3DDataAsync(void *mem_ptr, T *data, int size, MPI_Datatype stype, int N_global[3],
			int N_local[3], int * idx, int * idy, int * idz,
			int numNodes, int myNode, int rootNode,
			MPI_Comm comm, MPI_Request &request)
  {

    MPI_Datatype rtype = create3DMPISubarray<T>(N_global, N_local, stype);

    //calculate displacements from global domain size and local domain starting index
    int *counts = new int[numNodes];
    int *displs = new int[numNodes];
    for (int i = 0; i < numNodes; i++) {
      counts[i] = 1;
      displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
    }

    if (apiOpenCL()) {
      //TODO: gather all the date in root node, transfer to device from root node
      return DKS_ERROR;
    } else if (apiCuda()) {
      MPI_Igatherv( data, size, stype, mem_ptr, counts, displs, rtype, rootNode, comm, &request );

    } else if (apiOpenMP()) {
      //TODO: gather all the date in root node, transfer to device from root node
      return DKS_ERROR;
    }

    return DKS_SUCCESS;

  }
#endif

  /**
   * Scatter 3D data from device to multiple MPI processes using cuda aware MPI.
   * If multiple MPI prcesses share one device allows to scatter 3D data regions
   * from device memory allocated by one of the processes to all other MPI processes.
   * For detailed parameter description see gather3DData docs.
   * TODO: opencl and mic implementations (solution other than cuda aware mpi needed).
   */
#ifdef DKS_MPI
  template<typename T>
  int scatter3DData(void *mem_ptr, T *data, int size, MPI_Datatype rtype, int N_global[3],
		    int N_local[3], int * idx, int * idy, int * idz,
		    int numNodes, int myNode, int rootNode, MPI_Comm comm)
  {

    MPI_Datatype stype = create3DMPISubarray<T>(N_global, N_local, rtype);

    //calculate displacements from global domain size and local domain starting index
    int *counts = new int[numNodes];
    int *displs = new int[numNodes];
    for (int i = 0; i < numNodes; i++) {
      counts[i] = 1;
      displs[i] = idx[i] + idy[i] * N_global[0] + idz[i] * N_global[0] * N_global[1];
    }

    if (apiOpenCL()) {
      //TODO: gather all the date in root node, transfer to device from root node
    } else if (apiCuda()) {

      //async scatter
      //use cuda aware mpi
      MPI_Scatterv( mem_ptr, counts, displs, stype, data, size, rtype, rootNode, comm );
      return DKS_ERROR;
    } else if (apiOpenMP()) {

      //TODO: gather all the date in root node, transfer to device from root node
      return DKS_ERROR;
    }

    return DKS_SUCCESS;

  }
#endif

  /**
   * Read data from device memory.
   * Read data referenced by mem_ptr int out_data. Elements indicates the number of data
   * elements to read and offset is the offset on the device from start of the memroy.
   * Data type to read is specified by T. Performs a blocking read.
   */
  template<typename T>
  int readData(const void *mem_ptr, void *out_data, int elements, int offset = 0) {

    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      cl_mem clmem_ptr = (cl_mem)mem_ptr;
      size_t size = sizeof(T)*elements;
      size_t offset_bytes = sizeof(T)*offset;
      return oclbase->ocl_readData(clmem_ptr, out_data, size, offset_bytes);
#endif
    } else if (apiCuda()){
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readData((T*)mem_ptr, out_data, size, offset));
    } else if (apiOpenMP()) {
      return MIC_SAFECALL(micbase->mic_readData<T>(mem_ptr, out_data, elements, offset));
    }

    return DKS_ERROR;
  }

  /**
   * Performs an async data read from device.
   * Queues data read from device and returns control to host. stream id specifies stream to use for
   * the read. Device async read can be performed if host memroy is page-locked and strema other than
   * default -1 is used. For other parameter detailed description see readData function.
   * TODO: opencl and mic implementations (currently reverts to blocking reads).
   */
  template<typename T>
  int readDataAsync(const void *mem_ptr, void *out_data, int elements, int streamId = -1, int offset = 0) {

    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      //OpenCL version
      cl_mem clmem_ptr = (cl_mem)mem_ptr;
      size_t size = sizeof(T)*elements;
      return oclbase->ocl_readData(clmem_ptr, out_data, size, 0);
#endif
    } else if (apiCuda()){
      //cuda version
      size_t size = sizeof(T)*elements;
      return CUDA_SAFECALL(cbase->cuda_readDataAsync((T*)mem_ptr, out_data, size, streamId, offset));
    } else if (apiOpenMP()) {
      return MIC_SAFECALL(micbase->mic_readDataAsync<T>(mem_ptr, out_data, elements,
						       streamId, offset));
    }

    return DKS_ERROR;
  }


  /**
   * Free memory allocated on device.
   * Free memory referenced by mem_ptr, elements - number of elements in memory,
   * T - data type.
   */
  template<typename T>
  int freeMemory(void *mem_ptr, int elements) {
    if (apiOpenCL())
      return OPENCL_SAFECALL(oclbase->ocl_freeMemory((cl_mem)mem_ptr));
    else if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_freeMemory(mem_ptr));
    else if (apiOpenMP())
      return MIC_SAFECALL(micbase->mic_freeMemory<T>(mem_ptr, elements));

    return DKS_ERROR;
  }

  /**
   * Create random numbers on the device and fille mem_data array
   */
  int callCreateRandomNumbers(void *mem_ptr, int size);

  /**
   * Init random number states and save for reuse on device.
   * TODO: opencl and mic implementations.
   */
  int callInitRandoms(int size);

  /**
   * Print memory information on device (total, used, available)
   * TODO: opencl and mic imlementation
   */
  int callMemInfo() {
    if (apiCuda())
      return CUDA_SAFECALL(cbase->cuda_memInfo());

    return DKS_ERROR;
  }

  /**
   * Test function to profile opencl kernel calls.
   * Used for debuging and timing purposes only.
   */
  void oclEventInfo() {
    if (apiOpenCL())
      return OPENCL_SAFECALL(oclbase->ocl_eventInfo());

  }

  /**
   * Test function to profile opencl kernel calls.
   * Used for debuging and timing purposes only.
   */
  void oclClearEvents() {
    if (apiOpenCL()) {
#ifdef DKS_OPENCL
      oclbase->ocl_clearEvents();
#endif
    }
  }


};

#endif