diff --git a/image_analysis/GPUImageAnalysis.cpp b/image_analysis/GPUImageAnalysis.cpp index ae4487a9..07e0d335 100644 --- a/image_analysis/GPUImageAnalysis.cpp +++ b/image_analysis/GPUImageAnalysis.cpp @@ -8,8 +8,8 @@ struct CudaStreamWrapper { int32_t just_anything_as_this_wont_be_used; }; -GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const std::vector &mask) : - xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr), rad_integration_nbins(0) { +GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels) : + xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr) { } GPUImageAnalysis::~GPUImageAnalysis() {} @@ -32,6 +32,6 @@ void GPUImageAnalysis::RegisterBuffer() {} void GPUImageAnalysis::UnregisterBuffer() {} -void GPUImageAnalysis::LoadDataToGPU(bool apply_pixel_mask_on_gpu) {} +void GPUImageAnalysis::LoadDataToGPU() {} #endif diff --git a/image_analysis/GPUImageAnalysis.cu b/image_analysis/GPUImageAnalysis.cu index 0157b90e..37c3b71a 100644 --- a/image_analysis/GPUImageAnalysis.cu +++ b/image_analysis/GPUImageAnalysis.cu @@ -1,14 +1,12 @@ // Copyright (2019-2023) Paul Scherrer Institute - #include "GPUImageAnalysis.h" -#include "../common/JFJochException.h" -#include "../common/DiffractionGeometry.h" -#include -#include "../common/CUDAWrapper.h" +#include "../common/JFJochException.h" // input X x Y pixels array -// output X x Y byte array +// output X x Y bit array + +static constexpr int WARP_SIZE = 32; // assume warp size of 32 cuda threads per warp struct CudaStreamWrapper { cudaStream_t v; @@ -19,6 +17,60 @@ inline void cuda_err(cudaError_t val) { throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val)); } +// Calculate byte size of output +// output_type: type of output array elements +// ptr: dummy pointer to make template type inference possible +// xpixels: number of bits (=pixel results) in x direction +// ypixels: number of bits (=pixel results) in y direction +// returns the size of the output (xpixels X ypixels bits) in bytes for an array with elements of type output_type +template +static std::size_t output_byte_size(const output_type* ptr, int32_t xpixels, int32_t ypixels) noexcept { + constexpr auto bit_group_size = sizeof(output_type) * 8; // number of bits per output array element + return ((xpixels * ypixels + bit_group_size - 1) / bit_group_size) * sizeof(output_type); +} + +// Get the bit ptr[line, col] of a bit array with lines of size xpixels bits +// output_type: type of bit array elements +// ptr: pointer to array with bits stored in element type output_type +// xpixels: size in bits (=pixel results) of one line (=row of array) +// line: bit line (=row) number +// col: bit column number +// returns the bit ptr[line, col] +template +static bool get_bit(const output_type* ptr, int32_t xpixels, int32_t line, int32_t col) noexcept { + constexpr auto bit_group_size = sizeof(output_type) * 8; // number of bits per output array element + const auto global_bit_idx = line * xpixels + col; + const auto element_idx = global_bit_idx / bit_group_size; + const auto local_bit_idx = global_bit_idx % bit_group_size; + return (ptr[element_idx] & (1 << local_bit_idx)) != 0; +} + +// Write pixel results to bit array +// params: spot finding parameters +// out: pixel result bit array +// pixel: flat pixel index = bit index into bit array +// val: pixel result +// **NOTE**: assumes sizeof(*out) * 8 == WARP_SIZE +__device__ __forceinline__ void write_result(const spot_parameters& params, uint32_t* out, int32_t pixel, uint8_t val) +{ + static_assert(sizeof(*out) * 8 == WARP_SIZE, "Violation of essential implementation assumtion: WARP_SIZE must match output array element type bit size!"); + static constexpr unsigned ALL_THREADS = unsigned{-1}; + const int32_t laneid = threadIdx.x & (WARP_SIZE - 1); + unsigned result = __ballot_sync(ALL_THREADS, val); + const int32_t idx = pixel / WARP_SIZE; // global uint32_t index + const int32_t bit = pixel % WARP_SIZE; // local bit index + + if ((bit >= laneid) & (laneid == 0)) { // write to upper part of uint32_t + result <<= bit; + if (result) + atomicOr(&out[idx], result); + } else if ((bit < laneid) & (bit == 0)) { // write to lower part of uint32_t + result >>= laneid; + if (result) + atomicOr(&out[idx], result); + } +} + // Determine if pixel could be a spot // params: spot finding parameters // val: pixel value @@ -44,7 +96,7 @@ __device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, c // Find pixels that could be spots // in: image input values -// out: pixel result byte array, 1 byte per pixel (0:no/1:candidate spot) +// out: pixel result bit array, 1 bit per pixel (0:no/1:candidate spot) // params: spot finding parameters // // The algorithm uses multiple waves (blockDim.y) that run over sections of rows. @@ -55,7 +107,7 @@ __device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, c // Every cuda thread is associated with a particular column. The thread maintains // the sum/sum2/count values in shared memory for it's column. To do this, the input // pixel values for the hight of the aggregation window are saved in shared memory. -__global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parameters params) +__global__ void analyze_pixel(const int16_t *in, uint32_t *out, const spot_parameters params) { // assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.cols const int32_t window = 2 * (int)params.nby + 1; // vertical window @@ -111,6 +163,7 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame // wave front up to rmax do { __syncthreads(); // make others see the shared values + uint8_t val = 0; if (result_col) { // write at the back end of the wave total_sum = total_sum2 = total_count = 0; for (auto j = left; j < right; j++) { @@ -118,8 +171,9 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame total_sum2 += shared_sum2[j]; total_count += shared_count[j]; } - out[back * params.cols + col] = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); + val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); } + write_result(params, out, back * params.cols + col, val); back++; __syncthreads(); // keep shared values until others have seen them if (data_col) { // read at the front end of the wave @@ -144,6 +198,7 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame // wave back up to rmax do { __syncthreads(); // make others see the shared values + uint8_t val = 0; if (result_col) { // write at the back end of the wave total_sum = total_sum2 = total_count = 0; for (auto j = left; j < right; j++) { @@ -151,8 +206,9 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame total_sum2 += shared_sum2[j]; total_count += shared_count[j]; } - out[back * params.cols + col] = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); + val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); } + write_result(params, out, back * params.cols + col, val); back++; __syncthreads(); // keep shared values until others have seen them if (data_col) { // read at the front end of the wave if possible @@ -178,30 +234,13 @@ __global__ void analyze_pixel(const int16_t *in, uint8_t *out, const spot_parame } while (back < rmax); } -__global__ void spot_finder_reduce(uint32_t *output, uint32_t* counter, const uint8_t* input, - uint32_t npixel, uint32_t output_size) { - uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x; - if (idx < npixel) { - if (input[idx]) { - auto old_counter = atomicAdd(counter, 1); - if (old_counter < output_size) - output[old_counter] = idx; - } - } -} +GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels) : + xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr) { -__global__ void apply_pixel_mask(int16_t *image, const uint8_t *mask, uint32_t npixel) { - uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x; - if (idx < npixel) { - if (mask[idx] == 0) - image[idx] = INT16_MIN; - } -} + int device_count; + cuda_err(cudaGetDeviceCount(&device_count)); -GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const std::vector &mask) : - xpixels(in_xpixels), ypixels(in_ypixels), gpu_out(nullptr), numberOfSMs(1) { - - if (get_gpu_count() == 0) + if (device_count == 0) throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "No CUDA devices found"); int deviceId; @@ -215,32 +254,18 @@ GPUImageAnalysis::GPUImageAnalysis(int32_t in_xpixels, int32_t in_ypixels, const cudastream = new(CudaStreamWrapper); cuda_err(cudaStreamCreate(&cudastream->v)); - cuda_err(cudaMalloc(&gpu_mask, xpixels * ypixels * sizeof(int8_t))); cuda_err(cudaMalloc(&gpu_in, xpixels * ypixels * sizeof(int16_t))); - cuda_err(cudaMalloc(&gpu_out, xpixels * ypixels * sizeof(int8_t))); - cuda_err(cudaMalloc(&gpu_out_reduced, maxStrongPixel * sizeof(uint32_t))); - cuda_err(cudaMalloc(&gpu_out_reduced_counter, sizeof(uint32_t))); - cuda_err(cudaHostAlloc(&host_out_reduced, maxStrongPixel * sizeof(uint32_t), cudaHostAllocPortable)); - cuda_err(cudaHostAlloc(&host_out_reduced_counter, sizeof(uint32_t), cudaHostAllocPortable)); - - cuda_err(cudaMemsetAsync(gpu_mask, 1, xpixels*ypixels, cudastream->v)); - - if (mask.size() != xpixels * ypixels) - throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Mismatch in mask size"); - cudaMemcpy(gpu_mask, mask.data(), xpixels*ypixels, cudaMemcpyHostToDevice); + cuda_err(cudaMalloc(&gpu_out, output_byte_size(gpu_out, xpixels, ypixels))); + cuda_err(cudaHostAlloc(&host_out, output_byte_size(host_out, xpixels, ypixels), cudaHostAllocPortable)); } GPUImageAnalysis::~GPUImageAnalysis() { cudaStreamDestroy(cudastream->v); delete(cudastream); - cudaFreeHost(host_out_reduced); - cudaFreeHost(host_out_reduced_counter); - cudaFree(gpu_mask); + cudaFreeHost(host_out); cudaFree(gpu_in); cudaFree(gpu_out); - cudaFree(gpu_out_reduced); - cudaFree(gpu_out_reduced_counter); } void GPUImageAnalysis::SetInputBuffer(void *ptr) { @@ -287,23 +312,12 @@ void GPUImageAnalysis::RunSpotFinder(const JFJochProtoBuf::DataProcessingSetting ) * numberOfCudaThreads; const dim3 blocks(nBlocks, numberOfWaves); - cuda_err(cudaMemsetAsync(gpu_out, 0, xpixels * ypixels * sizeof(uint8_t), cudastream->v)); + cuda_err(cudaMemsetAsync(gpu_out, 0, output_byte_size(gpu_out, xpixels, ypixels), cudastream->v)); analyze_pixel<<v>>> (gpu_in, gpu_out, spot_params); } - { - cuda_err(cudaMemsetAsync(gpu_out_reduced_counter, 0, sizeof(uint32_t), cudastream->v)); - const auto nblocks = - xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1); - - spot_finder_reduce<<v>>>(gpu_out_reduced, gpu_out_reduced_counter, - gpu_out, xpixels*ypixels, maxStrongPixel); - } - cuda_err(cudaMemcpyAsync(host_out_reduced, gpu_out_reduced, maxStrongPixel * sizeof(uint32_t), - cudaMemcpyDeviceToHost,cudastream->v)); - - cuda_err(cudaMemcpyAsync(host_out_reduced_counter, gpu_out_reduced_counter, sizeof(uint32_t), + cuda_err(cudaMemcpyAsync(host_out, gpu_out, output_byte_size(gpu_out, xpixels, ypixels), cudaMemcpyDeviceToHost,cudastream->v)); } @@ -313,17 +327,26 @@ void GPUImageAnalysis::GetSpotFinderResults(StrongPixelSet &pixel_set) { cuda_err(cudaStreamSynchronize(cudastream->v)); - for (int i = 0; i < std::min(maxStrongPixel, *host_out_reduced_counter); i++) { - size_t npixel = host_out_reduced[i]; - size_t line = npixel / xpixels; - size_t col = npixel % xpixels; - pixel_set.AddStrongPixel(col, line, host_in[npixel]); + auto out_size = output_byte_size(host_out, xpixels, ypixels); + auto out_ptr = (uint32_t *) host_out; + + for (int i = 0; i < out_size / sizeof(out_ptr[0]); i++) { + if (out_ptr[i]) { + for (int j = 0; j < 8 * sizeof(out_ptr[0]); j++) { + if (out_ptr[i] & (1 << j)) { + size_t npixel = i * 8 * sizeof(out_ptr[0])| j; + size_t line = npixel / xpixels; + size_t col = npixel % xpixels; + pixel_set.AddStrongPixel(col, line, host_in[npixel]); + } + } + } } } void GPUImageAnalysis::GetSpotFinderResults(const DiffractionExperiment &experiment, - const JFJochProtoBuf::DataProcessingSettings &settings, - std::vector &vec) { + const JFJochProtoBuf::DataProcessingSettings &settings, + std::vector &vec) { StrongPixelSet pixel_set(experiment); GetSpotFinderResults(pixel_set); pixel_set.FindSpots(experiment, settings, vec); @@ -337,16 +360,9 @@ void GPUImageAnalysis::UnregisterBuffer() { cudaHostUnregister(host_in); } -void GPUImageAnalysis::LoadDataToGPU(bool apply_pixel_mask_on_gpu) { +void GPUImageAnalysis::LoadDataToGPU() { if (host_in == nullptr) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "Host/GPU buffer not defined"); cuda_err(cudaMemcpy(gpu_in, host_in, xpixels * ypixels * sizeof(int16_t), cudaMemcpyHostToDevice)); - - if (apply_pixel_mask_on_gpu) { - const auto nblocks = - xpixels * ypixels / numberOfCudaThreads + ((xpixels * ypixels % numberOfCudaThreads == 0) ? 0 : 1); - apply_pixel_mask<<v>>>(gpu_in, gpu_mask, xpixels * ypixels); - } } - diff --git a/image_analysis/GPUImageAnalysis.h b/image_analysis/GPUImageAnalysis.h index 5b4d0bb1..8e20818e 100644 --- a/image_analysis/GPUImageAnalysis.h +++ b/image_analysis/GPUImageAnalysis.h @@ -27,26 +27,21 @@ class GPUImageAnalysis { const int32_t ypixels; int16_t *host_in = nullptr; - uint8_t *gpu_mask = nullptr; int16_t *gpu_in = nullptr; - uint8_t *gpu_out = nullptr; - uint32_t *gpu_out_reduced = nullptr; - uint32_t *gpu_out_reduced_counter = nullptr; - uint32_t *host_out_reduced = nullptr; - uint32_t *host_out_reduced_counter = nullptr; + uint32_t *gpu_out = nullptr; + uint32_t *host_out = nullptr; int numberOfSMs; const int numberOfCudaThreads = 128; // #threads per block that works well for Nvidia T4 const int numberOfWaves = 40; // #waves that works well for Nvidia T4 const int windowSizeLimit = 21; // limit on the window size (2nby+1, 2nbx+1) to prevent shared memory problems - const int maxStrongPixel = 65536; public: - GPUImageAnalysis(int32_t xpixels, int32_t ypixels, const std::vector &mask); + GPUImageAnalysis(int32_t xpixels, int32_t ypixels); ~GPUImageAnalysis(); void SetInputBuffer(void *host_in); - void LoadDataToGPU(bool apply_pixel_mask_on_gpu = true); + void LoadDataToGPU(); void RunSpotFinder(const JFJochProtoBuf::DataProcessingSettings &settings); void GetSpotFinderResults(StrongPixelSet &pixel_set); diff --git a/receiver/JFJochReceiver.cpp b/receiver/JFJochReceiver.cpp index 460429b2..14d7278f 100644 --- a/receiver/JFJochReceiver.cpp +++ b/receiver/JFJochReceiver.cpp @@ -406,9 +406,7 @@ void JFJochReceiver::FrameTransformationThread() { std::unique_ptr spot_finder; try { - spot_finder = std::make_unique(experiment.GetXPixelsNum(), - experiment.GetYPixelsNum(), - one_byte_mask); + spot_finder = std::make_unique(experiment.GetXPixelsNum(), experiment.GetYPixelsNum()); spot_finder->SetInputBuffer(transformation.GetPreview16BitImage()); spot_finder->RegisterBuffer(); } catch (const JFJochException& e) { @@ -519,7 +517,7 @@ void JFJochReceiver::FrameTransformationThread() { auto local_data_processing_settings = GetDataProcessingSettings(); if (calculate_spots) { - spot_finder->LoadDataToGPU(!experiment.GetApplyPixelMaskInFPGA()); + spot_finder->LoadDataToGPU(); spot_finder->RunSpotFinder(local_data_processing_settings); spot_finder->GetSpotFinderResults(experiment, GetDataProcessingSettings(), spots); for (const auto &spot: spots) diff --git a/tests/SpotFinderIntegration.cpp b/tests/SpotFinderIntegration.cpp index 354dfe73..8eb52d4b 100644 --- a/tests/SpotFinderIntegration.cpp +++ b/tests/SpotFinderIntegration.cpp @@ -73,10 +73,7 @@ TEST_CASE("SpotFinder_GPU", "[GPUImageAnalysis]") { image[60000] = static_cast(mean + 15 * sqrt(mean)); std::vector threshold_vals = {3,4,6,10}; - std::vector one_byte_mask(experiment.GetPixelsNum(), 1); - - GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), - one_byte_mask); + GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum()); spot_finder.SetInputBuffer(image.data()); @@ -112,63 +109,3 @@ TEST_CASE("SpotFinder_GPU", "[GPUImageAnalysis]") { std::cout << "Common: " << naive_cpu.Common(colspot_gpu) << " fast impl: " << colspot_gpu.Count() << " ref. impl: " << naive_cpu.Count() << std::endl; } } - -TEST_CASE("SpotFinder_GPU_Mask", "[GPUImageAnalysis]") { - DiffractionExperiment experiment(DetectorGeometry(1)); - experiment.Mode(DetectorMode::Raw); - - const uint16_t nbx = 5; - - const double mean = 30; - - std::vector image(512*1024); - - // Predictable random number generator, so test always gives the same result - std::mt19937 g1(2023); - // Poissonian distribution, with mean == variance - std::normal_distribution distribution(mean, sqrt(mean)); - - for (auto &i: image) - i = static_cast(distribution(g1)); - - image[2000] = static_cast(mean + 15 * sqrt(mean)); - image[8000] = static_cast(mean + 15 * sqrt(mean)); - image[14000] = static_cast(mean + 15 * sqrt(mean)); - image[20000] = static_cast(mean + 15 * sqrt(mean)); - image[25000] = static_cast(mean + 15 * sqrt(mean)); - image[60000] = static_cast(mean + 15 * sqrt(mean)); - - std::vector one_byte_mask(experiment.GetPixelsNum(), 1); - - JFJochProtoBuf::DataProcessingSettings settings; - settings.set_local_bkg_size(nbx); - settings.set_photon_count_threshold(1); - settings.set_signal_to_noise_threshold(10); - - { - GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), one_byte_mask); - spot_finder.SetInputBuffer(image.data()); - - // Without mask - StrongPixelSet colspot_gpu(experiment); - spot_finder.LoadDataToGPU(); - spot_finder.RunSpotFinder(settings); - spot_finder.GetSpotFinderResults(colspot_gpu); - - REQUIRE(colspot_gpu.Count() == 6); - } - { - one_byte_mask[2000] = 0; - one_byte_mask[25000] = 0; - one_byte_mask[60000] = 0; - GPUImageAnalysis spot_finder(experiment.GetXPixelsNum(), experiment.GetYPixelsNum(), one_byte_mask); - - StrongPixelSet colspot_gpu(experiment); - spot_finder.SetInputBuffer(image.data()); - spot_finder.LoadDataToGPU(); - spot_finder.RunSpotFinder(settings); - spot_finder.GetSpotFinderResults(colspot_gpu); - - REQUIRE(colspot_gpu.Count() == 3); - } -} diff --git a/tools/DataAnalysisPerfTest.cpp b/tools/DataAnalysisPerfTest.cpp index 6c1ea348..8dc39689 100644 --- a/tools/DataAnalysisPerfTest.cpp +++ b/tools/DataAnalysisPerfTest.cpp @@ -230,19 +230,19 @@ int main(int argc, char **argv) { logger.Info("COLSPOT NBX=NBY=5 (GPU)"); if (GPUImageAnalysis::GPUPresent()) { - GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask); + GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum()); TestSpotFinder(x, settings, local_peakfinder_gpu,image_conv.data(), nimages); } logger.Info("COLSPOT NBX=NBY=5 (GPU/no copy to device)"); if (GPUImageAnalysis::GPUPresent()) { - GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask); + GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum()); TestSpotFinderWithoutCopyToDevice(x, settings, local_peakfinder_gpu,image_conv.data(), nimages); } settings.set_local_bkg_size(3); logger.Info("COLSPOT NBX=NBY=3 (GPU)"); if (GPUImageAnalysis::GPUPresent()) { - GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask); + GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum()); TestSpotFinder(x, settings, local_peakfinder_gpu,image_conv.data(), nimages); } @@ -250,7 +250,7 @@ int main(int argc, char **argv) { logger.Info("Full package"); if (GPUImageAnalysis::GPUPresent()) { - GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum(), one_byte_mask); + GPUImageAnalysis local_peakfinder_gpu(x.GetXPixelsNum(), x.GetYPixelsNum()); TestAll(x, settings, local_peakfinder_gpu,image_conv.data(), nimages); TestAllWithROI(x, settings, local_peakfinder_gpu,image_conv.data(), nimages); }