Update the work-item size correctly for devices whose supported size is smaller than the DKS default
This commit is contained in:
@ -24,6 +24,7 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
|
|||||||
//if we are not auto tuning and the size of the problem has changed find the new parameters
|
//if we are not auto tuning and the size of the problem has changed find the new parameters
|
||||||
//from autotuning config file
|
//from autotuning config file
|
||||||
if (!isAutoTuningOn() && length != chiSquareSize_m) {
|
if (!isAutoTuningOn() && length != chiSquareSize_m) {
|
||||||
|
/*
|
||||||
int numBlocks, blockSize;
|
int numBlocks, blockSize;
|
||||||
std::string device_name;
|
std::string device_name;
|
||||||
getDeviceName(device_name);
|
getDeviceName(device_name);
|
||||||
@ -33,8 +34,8 @@ int DKSBaseMuSR::callLaunchChiSquare(int fitType,
|
|||||||
length, "BlockSize", blockSize);
|
length, "BlockSize", blockSize);
|
||||||
chiSq->setKernelParams(numBlocks, blockSize);
|
chiSq->setKernelParams(numBlocks, blockSize);
|
||||||
|
|
||||||
//std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
|
std::cout << "Parameters set to: " << numBlocks << ", " << blockSize << std::endl;
|
||||||
|
*/
|
||||||
chiSquareSize_m = length;
|
chiSquareSize_m = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -756,7 +756,9 @@ int OpenCLBase::ocl_executeKernel(cl_uint ndim, const size_t *work_items, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ierr != CL_SUCCESS)
|
if (ierr != CL_SUCCESS)
|
||||||
DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr);
|
DEBUG_MSG("Error executing kernel, OpenCL error: " << ierr
|
||||||
|
<< " work items: " << *work_items << ", "
|
||||||
|
<< " work group: " << *work_group_size);
|
||||||
|
|
||||||
m_last_event = tmp_event;
|
m_last_event = tmp_event;
|
||||||
m_events.push_back(m_last_event);
|
m_events.push_back(m_last_event);
|
||||||
@ -937,22 +939,27 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
|
|||||||
if (ierr != DKS_SUCCESS)
|
if (ierr != DKS_SUCCESS)
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
//get device properties
|
/* get device properties */
|
||||||
|
//maximum number of work-items in a work group supported by device
|
||||||
size_t max_group_size;
|
size_t max_group_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_group_size, 0);
|
||||||
|
//maximum local memory size per work group
|
||||||
cl_ulong local_mem_size;
|
cl_ulong local_mem_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
|
||||||
|
//get the supported extensions
|
||||||
size_t ext_size;
|
size_t ext_size;
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, 0, 0, &ext_size);
|
||||||
char *ext = new char[ext_size];
|
char *ext = new char[ext_size];
|
||||||
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
|
clGetDeviceInfo(m_device_id, CL_DEVICE_EXTENSIONS, ext_size, ext, 0);
|
||||||
|
|
||||||
//get kernel properties
|
/* get kernel properties */
|
||||||
|
//get max work group size that can be used for this kernel
|
||||||
size_t kernel_group_size;
|
size_t kernel_group_size;
|
||||||
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
|
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_WORK_GROUP_SIZE,
|
||||||
sizeof(size_t), &kernel_group_size, 0);
|
sizeof(size_t), &kernel_group_size, 0);
|
||||||
threadsPerBlock = kernel_group_size;
|
threadsPerBlock = kernel_group_size;
|
||||||
|
|
||||||
|
//get max local memory size that can be used for this kernel
|
||||||
cl_ulong kernel_local_mem;
|
cl_ulong kernel_local_mem;
|
||||||
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
|
clGetKernelWorkGroupInfo(m_kernel, m_device_id, CL_KERNEL_LOCAL_MEM_SIZE,
|
||||||
sizeof(cl_ulong), &kernel_local_mem, 0);
|
sizeof(cl_ulong), &kernel_local_mem, 0);
|
||||||
@ -961,18 +968,18 @@ int OpenCLBase::ocl_checkKernel(const char* kernel_name, int work_group_size,
|
|||||||
std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
|
std::cout << std::endl << "Begin " << kernel_name << " check..." << std::endl;
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Work groups: device limit " << max_group_size << ", "
|
std::cout << "Work group size: max for device " << max_group_size << " > "
|
||||||
<< "kernel limit " << kernel_group_size << ", "
|
<< "max for kernel " << kernel_group_size << " > "
|
||||||
<< "required " << work_group_size << std::endl;
|
<< "required " << work_group_size << std::endl;
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Local memory: device limit " << local_mem_size << std::endl;
|
std::cout << "Local memory: device limit " << local_mem_size << std::endl;
|
||||||
|
std::cout << "Local memory: kernel needs " << kernel_local_mem << std::endl;
|
||||||
|
|
||||||
|
|
||||||
std::cout << "Available extensions: " << ext << std::endl;
|
std::cout << std::endl << "Available extensions: " << ext << std::endl;
|
||||||
|
|
||||||
std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
|
std::cout << "End " << kernel_name << " check..." << std::endl << std::endl;
|
||||||
|
|
||||||
return DKS_SUCCESS;
|
return DKS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -78,8 +78,8 @@ double OpenCLChiSquareRuntime::calculateSum(cl_mem data, int length) {
|
|||||||
|
|
||||||
|
|
||||||
int ierr;
|
int ierr;
|
||||||
//calc number of thread sper workgroup and nr of work groups
|
//calc number of threads per workgroup and nr of work groups
|
||||||
size_t work_size_sum = 128;
|
size_t work_size_sum = (size_t)blockSize_m;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
size_t work_items = (size_t)length;
|
size_t work_items = (size_t)length;
|
||||||
@ -141,6 +141,7 @@ int OpenCLChiSquareRuntime::launchChiSquare(int fitType,
|
|||||||
//set work item size
|
//set work item size
|
||||||
size_t work_items;
|
size_t work_items;
|
||||||
size_t work_size = (size_t)blockSize_m;
|
size_t work_size = (size_t)blockSize_m;
|
||||||
|
|
||||||
if (numBlocks_m < 0)
|
if (numBlocks_m < 0)
|
||||||
work_items = (size_t)length;
|
work_items = (size_t)length;
|
||||||
else
|
else
|
||||||
@ -312,7 +313,11 @@ int OpenCLChiSquareRuntime::checkChiSquareKernels(int fitType, int &threadsPerBl
|
|||||||
}
|
}
|
||||||
|
|
||||||
//check the GPU kernel
|
//check the GPU kernel
|
||||||
ierr = m_oclbase->ocl_checkKernel(kernel, 128, true, threadsPerBlock);
|
ierr = m_oclbase->ocl_checkKernel(kernel, blockSize_m, true, threadsPerBlock);
|
||||||
|
if (threadsPerBlock < blockSize_m) {
|
||||||
|
std::cout << "Default OpenCL blocksize changed in DKS to: " << threadsPerBlock << std::endl;
|
||||||
|
blockSize_m = threadsPerBlock;
|
||||||
|
}
|
||||||
|
|
||||||
return ierr;
|
return ierr;
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user