From 79833cf7f51a86d0568142ab8db1fc7a79b917c2 Mon Sep 17 00:00:00 2001
From: Uldis Locans <uldis_l@pc8372-mac-mini.psi.ch>
Date: Thu, 17 Aug 2017 16:56:57 +0200
Subject: [PATCH] update work item size correctly for devices where supported
 size is smaller than DKS default

---
 src/DKSBaseMuSR.cpp                   |  5 +++--
 src/OpenCL/OpenCLBase.cpp             | 23 +++++++++++++++--------
 src/OpenCL/OpenCLChiSquareRuntime.cpp | 11 ++++++++---
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/DKSBaseMuSR.cpp b/src/DKSBaseMuSR.cpp
index 3df59e9..6e7d06f 100644
--- a/src/DKSBaseMuSR.cpp
+++ b/src/DKSBaseMuSR.cpp
@@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
   //if we are not auto tuning and the size of the problem has changed find the new parameters
   //from autotuning config file
   if (!isAutoTuningOn() && length != chiSquareSize_m) {
+    /*
     int numBlocks, blockSize;
     std::string device_name;
     getDeviceName(device_name);
@@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
 				 length, "BlockSize", blockSize);
     chiSq->setKernelParams(numBlocks, blockSize);
     
-    //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
-
+    std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
+    */
     chiSquareSize_m = length;
   } 
 
diff --git a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp
index 4dad528..40677b9 100644
--- a/src/OpenCL/OpenCLBase.cpp
+++ b/src/OpenCL/OpenCLBase.cpp
@@ -756,7 +756,9 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const
   }
 	
   if (ierr != CL_SUCCESS)
-    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
+    DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr 
+	      << " work items: " << *work_items << ", " 
+	      << " work group: " << *work_group_size);
 		
   m_last_event = tmp_event;
   m_events.push_back(m_last_event);
@@ -937,22 +939,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
   if (ierr != DKS_SUCCESS)
     return ierr;
 
-  //get device properties
+  /* get device properties */
+  //maximum number of work-items in a work group supported by device
   size_t max_group_size;
   clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
+  //maximum local memory size per work group
   cl_ulong local_mem_size;
   clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
+  //get the supported extensions
   size_t ext_size;
   clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
   char *ext = new char[ext_size];
   clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
 
-  //get kernel properties
+  /* get kernel properties */
+  //get max work group size that can be used for this kernel
   size_t kernel_group_size;
   clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, 
 			   sizeof(size_t), &kernel_group_size, 0);
   threadsPerBlock = kernel_group_size;
 
+  //get the amount of local memory used by this kernel
   cl_ulong kernel_local_mem;
   clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
 			   sizeof(cl_ulong), &kernel_local_mem, 0);
@@ -961,18 +968,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
   std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
 
 
-  std::cout << "Work groups: device limit " << max_group_size << ", "
-	    << "kernel limit " << kernel_group_size << ", "
+  std::cout << "Work group size: max for device " << max_group_size << " > "
+	    << "max for kernel " << kernel_group_size << " > "
 	    << "required " << work_group_size << std::endl;
   
 
   std::cout << "Local memory: device limit " << local_mem_size << std::endl;
-  
+  std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
   
 
-  std::cout << "Available extensions: " << ext << std::endl;
+  std::cout << std::endl << "Available extensions: " << ext << std::endl;
 
-  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
+  std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;   
 
   return DKS_SUCCESS;
 }
diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp
index 2b97b87..3cd71f2 100644
--- a/src/OpenCL/OpenCLChiSquareRuntime.cpp
+++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp
@@ -78,8 +78,8 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
 
   
   int ierr;
-  //calc number of thread sper workgroup and nr of work groups
-  size_t work_size_sum = 128;
+  //calc number of threads per workgroup and nr of work groups
+  size_t work_size_sum = (size_t)blockSize_m;
 
   /*
   size_t work_items = (size_t)length;
@@ -141,6 +141,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
   //set work item size
   size_t work_items;
   size_t work_size = (size_t)blockSize_m;
+
   if (numBlocks_m < 0)
     work_items = (size_t)length;
   else
@@ -312,7 +313,11 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
   }
 
   //check the GPU kernel
-  ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
+  ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
+  if (threadsPerBlock < blockSize_m) {
+    std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
+    blockSize_m = threadsPerBlock;
+  }
 
   return ierr;