#include "CudaChiSquareRuntime.cuh" CudaChiSquareRuntime::CudaChiSquareRuntime(CudaBase *base) { blockSize_m = BLOCK_SIZE; numBlocks_m = -1; ptx_m = NULL; m_base = base; base_create = false; setUpContext(); } //constructor, init cuda device and create context CudaChiSquareRuntime::CudaChiSquareRuntime() { blockSize_m = BLOCK_SIZE; numBlocks_m = -1; ptx_m = NULL; m_base = new CudaBase(); base_create = true; setUpContext(); } //free resources CudaChiSquareRuntime::~CudaChiSquareRuntime() { delete[] ptx_m; cuCtxDestroy(context_m); freeChiSquare(); if (base_create) delete m_base; } void CudaChiSquareRuntime::setUpContext() { cuInit(0); cuDeviceGet(&cuDevice_m, 0); cuCtxCreate(&context_m, 0, cuDevice_m); N0_m = 1.0; tau_m = 1.0; bkg_m = 1.0; initDone_m = false; } //build program string std::string CudaChiSquareRuntime::buildProgram(std::string function) { long fsize; char *kernel_source; //get kernel source char * kernel_file = new char[500]; kernel_file[0] = '\0'; strcat(kernel_file, OPENCL_KERNELS); strcat(kernel_file, "CUDA/NVRTCKernels/CudaChiSquareKernel.cu"); //read kernels from file FILE *fp = fopen(kernel_file, "rb"); if (!fp) DEBUG_MSG("Can't open kernel file" << kernel_file); //get file size and allocate memory fseek(fp, 0, SEEK_END); fsize = ftell(fp); kernel_source = new char[fsize+1]; //read file and content in kernel source rewind(fp); fread(kernel_source, 1, sizeof(char)*fsize, fp); kernel_source[fsize] = '\0'; fclose(fp); std::string kernel_string (kernel_source); return kernel_string + cudaFunctHeader + "return " + function + ";" + cudaFunctFooter; } // int CudaChiSquareRuntime::compileProgram(std::string function, bool mlh) { //build program string std::string cudaProg = buildProgram(function); //create program nvrtcProgram prog; //std::cout << cudaProg.c_str() << std::endl; nvrtcCreateProgram(&prog, cudaProg.c_str(), "chiSquareRuntime.cu", 0, NULL, NULL); //compile program const char *opts[] = {"-fmad=false", ""}; int numopts = 1; if (mlh) { opts[1] = "-DMLH"; numopts = 2; } nvrtcResult compileResults = nvrtcCompileProgram(prog, numopts, opts); if (compileResults != NVRTC_SUCCESS) { //obtain compilation log size_t logSize; nvrtcGetProgramLogSize(prog, &logSize); char *log = new char[logSize]; nvrtcGetProgramLog(prog, log); DEBUG_MSG("Compilation failed!"); DEBUG_MSG(log); delete[] log; return DKS_ERROR; } else { DEBUG_MSG("Compilation successfull!"); } //obtain PTX from program if (ptx_m != NULL) delete[] ptx_m; size_t ptxSize; nvrtcGetPTXSize(prog, &ptxSize); ptx_m = new char[ptxSize]; nvrtcResult nvrtcPTXResult = nvrtcGetPTX(prog, ptx_m); if (nvrtcPTXResult != NVRTC_SUCCESS) { DEBUG_MSG("Get PTX failed!"); return DKS_ERROR; } //load module from ptx CUresult loadResult = cuModuleLoadDataEx(&module_m, ptx_m, 0, 0, 0); if (loadResult != CUDA_SUCCESS) { DEBUG_MSG("Load module from ptx failed!"); return DKS_ERROR; } // Destroy the program nvrtcDestroyProgram(&prog); return DKS_SUCCESS; } int CudaChiSquareRuntime::launchChiSquare(int fitType, void *mem_data, void *mem_err, int length, int numpar, int numfunc, int nummap, double timeStart, double timeStep, double &result) { if (!initDone_m) { DEBUG_MSG("ChiSquare init needs to be called at some point!"); return DKS_ERROR; } int blocks; int threads = blockSize_m; if (numBlocks_m < 0) blocks = length / threads + 1; else blocks = numBlocks_m; CUresult cuStatus; void **args = 0; if (fitType == FITTYPE_SINGLE_HISTO) { cuStatus = cuModuleGetFunction(&kernel_m, module_m, "kernelChiSquareSingleHisto"); if (cuStatus != CUDA_SUCCESS) { DEBUG_MSG("Failed to get function from module!"); return DKS_ERROR; } args = (void**) malloc(15 * sizeof(void*)); args[0] = &mem_data; args[1] = &mem_err; args[2] = &mem_param_m; args[3] = &mem_chisq_m; args[4] = &mem_map_m; args[5] = &mem_func_m; args[6] = &length; args[7] = &numpar; args[8] = &numfunc; args[9] = &nummap; args[10] = &timeStart; args[11] = &timeStep; args[12] = &tau_m; args[13] = &N0_m; args[14] = &bkg_m; } else if (fitType == FITTYPE_ASYMMETRY) { cuStatus = cuModuleGetFunction(&kernel_m, module_m, "kernelChiSquareAsymmetry"); if (cuStatus != CUDA_SUCCESS) { DEBUG_MSG("Failed to get function from module!"); return DKS_ERROR; } args = (void**) malloc(14 * sizeof(void*)); args[0] = &mem_data; args[1] = &mem_err; args[2] = &mem_param_m; args[3] = &mem_chisq_m; args[4] = &mem_map_m; args[5] = &mem_func_m; args[6] = &length; args[7] = &numpar; args[8] = &numfunc; args[9] = &nummap; args[10] = &timeStart; args[11] = &timeStep; args[12] = &alpha_m; args[13] = &beta_m; } else if (fitType == FITTYPE_MU_MINUS) { DEBUG_MSG("Not Yet Implemented!"); return DKS_ERROR; } else { DEBUG_MSG("Undefined Fit Type!"); return DKS_ERROR; } cuStatus = cuLaunchKernel(kernel_m, blocks, 1, 1, threads, 1, 1, (numpar + numfunc)*sizeof(double) + nummap*sizeof(int), NULL, args, 0); if (cuStatus != CUDA_SUCCESS) { std::string msg; msg = "Failed to run kernel! (" + std::to_string(blocks) + ", " + std::to_string(threads) + ")"; DEBUG_MSG(msg); const char *desc; cuGetErrorString(cuStatus, &desc); std::cout << desc << std::endl; return DKS_ERROR; } cublasStatus_t status; status = cublasDasum(defaultCublasRT, length, (double*)mem_chisq_m, 1, &result); if (status != CUBLAS_STATUS_SUCCESS) { DEBUG_MSG("cublas sum failed!"); return DKS_ERROR; } // cleanup if (args) free(args); return DKS_SUCCESS; } int CudaChiSquareRuntime::writeParams(const double *params, int numparams) { int ierr = m_base->cuda_writeData( (double*)mem_param_m, params, sizeof(double)*numparams); return ierr; } int CudaChiSquareRuntime::writeFunc(const double *func, int numfunc) { int ierr = m_base->cuda_writeData( (double*)mem_func_m, func, sizeof(double)*numfunc); return ierr; } int CudaChiSquareRuntime::writeMap(const int *map, int nummap) { int ierr = m_base->cuda_writeData( (int*)mem_map_m, map, sizeof(int)*nummap); return ierr; } int CudaChiSquareRuntime::initChiSquare(int size_data, int size_param, int size_func, int size_map) { int ierr = DKS_ERROR; if (initDone_m) { DEBUG_MSG("Reinitializing ChiSquare"); freeChiSquare(); } //init cublas cublasStatus_t status = CUBLAS_STATUS_SUCCESS; status = cublasCreate(&defaultCublasRT); if (status != CUBLAS_STATUS_SUCCESS) DEBUG_MSG("CUBLAS create default handle failed!"); //allocate temporary memory mem_chisq_m = m_base->cuda_allocateMemory(size_data*sizeof(double), ierr); mem_param_m = m_base->cuda_allocateMemory(size_param*sizeof(double), ierr); mem_func_m = m_base->cuda_allocateMemory(size_func*sizeof(double), ierr); mem_map_m = m_base->cuda_allocateMemory(size_map*sizeof(int), ierr); initDone_m = true; return ierr; } int CudaChiSquareRuntime::freeChiSquare() { int ierr = DKS_ERROR; if (initDone_m) { //delete cublas cublasStatus_t status = CUBLAS_STATUS_SUCCESS; status = cublasDestroy(defaultCublasRT); if (status != CUBLAS_STATUS_SUCCESS) { DEBUG_MSG("CUBLAS delete default handle failed!"); return DKS_ERROR; } //free memory ierr = m_base->cuda_freeMemory(mem_chisq_m); ierr = m_base->cuda_freeMemory(mem_param_m); ierr = m_base->cuda_freeMemory(mem_func_m); ierr = m_base->cuda_freeMemory(mem_map_m); initDone_m = false; } return ierr; }