From 79833cf7f51a86d0568142ab8db1fc7a79b917c2 Mon Sep 17 00:00:00 2001 From: Uldis Locans Date: Thu, 17 Aug 2017 16:56:57 +0200 Subject: [PATCH] update work item size correctly for devices where supported size is smaller than DKS default --- src/DKSBaseMuSR.cpp | 5 +++-- src/OpenCL/OpenCLBase.cpp | 23 +++++++++++++++-------- src/OpenCL/OpenCLChiSquareRuntime.cpp | 11 ++++++++--- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/DKSBaseMuSR.cpp b/src/DKSBaseMuSR.cpp index 3df59e9..6e7d06f 100644 --- a/src/DKSBaseMuSR.cpp +++ b/src/DKSBaseMuSR.cpp @@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType, //if we are not auto tuning and the size of the problem has changed find the new parameters //from autotuning config file if (!isAutoTuningOn() && length != chiSquareSize_m) { + /* int numBlocks, blockSize; std::string device_name; getDeviceName(device_name); @@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType, length, "BlockSize", blockSize); chiSq->setKernelParams(numBlocks, blockSize); - //std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl; - + std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl; + */ chiSquareSize_m = length; } diff --git a/src/OpenCL/OpenCLBase.cpp b/src/OpenCL/OpenCLBase.cpp index 4dad528..40677b9 100644 --- a/src/OpenCL/OpenCLBase.cpp +++ b/src/OpenCL/OpenCLBase.cpp @@ -756,7 +756,9 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const } if (ierr != CL_SUCCESS) - DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr); + DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr + << " work items: " << *work_items << ", " + << " work group: " << *work_group_size); m_last_event = tmp_event; m_events.push_back(m_last_event); @@ -937,22 +939,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size, if (ierr != DKS_SUCCESS) return ierr; - //get device properties + /* get device 
properties */ + //maximum number of work-items in a work group supported by device size_t max_group_size; clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0); + //maximum local memory size per work group cl_ulong local_mem_size; clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0); + //get the supported extensions size_t ext_size; clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size); char *ext = new char[ext_size]; clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0); - //get kernel properties + /* get kernel properties */ + //get max work group size that can be used for this kernel size_t kernel_group_size; clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_group_size, 0); threadsPerBlock = kernel_group_size; + //get max local memory size that can be used for this kernel cl_ulong kernel_local_mem; clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &kernel_local_mem, 0); @@ -961,18 +968,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size, std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl; - std::cout << "Work groups: device limit " << max_group_size << ", " - << "kernel limit " << kernel_group_size << ", " + std::cout << "Work group size: max for device " << max_group_size << " > " + << "max for kernel " << kernel_group_size << " > " << "required " << work_group_size << std::endl; std::cout << "Local memory: device limit " << local_mem_size << std::endl; - + std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl; - std::cout << "Available extensions: " << ext << std::endl; + std::cout << std::endl << "Available extensions: " << ext << std::endl; - std::cout << "End " << kernel_name << " check..." << std::endl << std::endl; + std::cout << "End " << kernel_name << " check..."
<< std::endl << std::endl; return DKS_SUCCESS; } diff --git a/src/OpenCL/OpenCLChiSquareRuntime.cpp b/src/OpenCL/OpenCLChiSquareRuntime.cpp index 2b97b87..3cd71f2 100644 --- a/src/OpenCL/OpenCLChiSquareRuntime.cpp +++ b/src/OpenCL/OpenCLChiSquareRuntime.cpp @@ -78,8 +78,8 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) { int ierr; - //calc number of thread sper workgroup and nr of work groups - size_t work_size_sum = 128; + //calc number of threads per workgroup and nr of work groups + size_t work_size_sum = (size_t)blockSize_m; /* size_t work_items = (size_t)length; @@ -141,6 +141,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType, //set work item size size_t work_items; size_t work_size = (size_t)blockSize_m; + if (numBlocks_m < 0) work_items = (size_t)length; else @@ -312,7 +313,11 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl } //check the GPU kernel - ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock); + ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock); + if (threadsPerBlock < blockSize_m) { + std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl; + blockSize_m = threadsPerBlock; + } return ierr;