// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only // GPU Spot finding developed by Hans-Christian Stadler (PSI) // Copyright (2019-2023) Paul Scherrer Institute #include "ImageSpotFinderGPU.h" #include "../../common/JFJochException.h" struct spot_parameters { int32_t width; int32_t height; float strong_pixel_threshold2; int32_t count_threshold; }; // input X x Y pixels array // output X x Y bit array static constexpr int WARP_SIZE = 32; // assume warp size of 32 cuda threads per warp inline void cuda_err(cudaError_t val) { if (val != cudaSuccess) throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val)); } // Write pixel results to bit array // params: spot finding parameters // out: pixel result bit array // pixel: flat pixel index = bit index into bit array // val: pixel result // **NOTE**: assumes sizeof(*out) * 8 == WARP_SIZE __device__ __forceinline__ void write_result(const spot_parameters& params, uint32_t* out, int32_t pixel, uint8_t val) { static_assert(sizeof(*out) * 8 == WARP_SIZE, "Violation of essential implementation assumption: WARP_SIZE must match output array element type bit size!"); static constexpr unsigned ALL_THREADS = UINT32_MAX; const int32_t laneid = threadIdx.x & (WARP_SIZE - 1); unsigned result = __ballot_sync(ALL_THREADS, val); const int32_t idx = pixel / WARP_SIZE; // global uint32_t index const int32_t bit = pixel % WARP_SIZE; // local bit index if ((bit >= laneid) && (laneid == 0)) { // write to upper part of uint32_t result <<= bit; if (result) atomicOr(&out[idx], result); } else if ((bit < laneid) && (bit == 0)) { // write to lower part of uint32_t result >>= laneid; if (result) atomicOr(&out[idx], result); } } // Determine if pixel could be a spot // params: spot finding parameters // val: pixel value // sum: window sum // sum2: window sum of squares // count: window valid pixels count // return the pixel result: 0-no spot / 1-spot candidate __device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, const int64_t val, int64_t sum, int64_t sum2, int64_t count) { sum -= val; sum2 -= val * val; count -= 1; const int64_t var = count * sum2 - (sum * sum); // This should be divided by ((2*NBX+1) * (2*NBY+1)-1)*((2*NBX+1) * (2*NBY+1)) const int64_t in_minus_mean = val * count - sum; // Should be divided by ((2*NBX+1) * (2*NBY+1)); const int64_t tmp1 = in_minus_mean * in_minus_mean; const float tmp2 = var * params.strong_pixel_threshold2; bool snr_criterion; if (params.strong_pixel_threshold2 == 0.0f) snr_criterion = true; else snr_criterion = (count > ImageSpotFinder::MIN_VALID_PIXELS) && (in_minus_mean > 0) && (tmp1 > tmp2); bool count_criterion = (params.count_threshold == 0.0f) || (val > params.count_threshold); bool strong_pixel = snr_criterion && count_criterion; if (val == INT32_MAX) strong_pixel = true; else if (val == INT32_MIN) strong_pixel = false; return strong_pixel ? 1 : 0; } // Find pixels that could be spots // in: image input values // out: pixel result bit array, 1 bit per pixel (0:no/1:candidate spot) // params: spot finding parameters // // The algorithm uses multiple waves (blockDim.y) that run over sections of rows. // Each wave will write output at the back row and read input at the front row. // Each wave is split into column output sections (blockDim.x) // A wave section (block) is responsible for a particular row/column section and // maintains sum/sum2/count values per column for the output row. // Every cuda thread is associated with a particular column. The thread maintains // the sum/sum2/count values in shared memory for it's column. To do this, the input // pixel values for the hight of the aggregation window are saved in shared memory. __global__ void analyze_pixel(const int32_t *in, uint32_t *prev_out, uint32_t *out, const spot_parameters params) { // assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.width const int32_t window = 2 * (int)ImageSpotFinder::NBX + 1; // vertical window const int32_t writeSize = blockDim.x - 2 * ImageSpotFinder::NBX; // output columns per block const int32_t cmin = blockIdx.x * writeSize; // lowest output column const int32_t cmax = min(cmin + writeSize, static_cast(params.width)); // past highest output column const int32_t col = cmin + threadIdx.x - ImageSpotFinder::NBX; // thread -> column mapping const bool data_col = (col >= 0) && (col < static_cast(params.width)); // read global mem const bool result_col = (col >= cmin) && (col < cmax); // write result const int32_t nWaves = gridDim.y; // number of waves const int32_t rowsPerWave = (params.height + nWaves - 1) / nWaves; // rows per wave const int32_t rmin = blockIdx.y * rowsPerWave; // lowest result row for this wave const int32_t rmax = min(rmin + rowsPerWave, static_cast(params.height)); // past highest result row for this wave const int32_t left = max(static_cast(threadIdx.x) - static_cast(ImageSpotFinder::NBX), 0); // leftmost column touched by this thread const int32_t right = min(static_cast(threadIdx.x) + static_cast(ImageSpotFinder::NBX) + 1, static_cast(params.width)); // past rightmost column touched by this thread int32_t back = rmin; // back of wave for writing int32_t front = max(back - static_cast(ImageSpotFinder::NBX), 0); // front of wave for reading (needs to overtake back initially) extern __shared__ int64_t shared_mem[]; int64_t* shared_sum = shared_mem; // shared buffer [blockDim.x] int64_t* shared_sum2 = &shared_sum[blockDim.x]; // shared buffer [blockDim.x] auto shared_count = reinterpret_cast(&shared_sum2[blockDim.x]); // shared buffer [blockDim.x] auto shared_val = reinterpret_cast(&shared_count[blockDim.x]); // shared cyclic buffer [window, blockDim.x] int64_t total_sum; // totals int64_t total_sum2; int32_t total_count; // initialize sum, sum2, count, val buffers shared_sum[threadIdx.x] = 0; // shared values without effect on totals shared_sum2[threadIdx.x] = 0; shared_count[threadIdx.x] = 0; for (int i=0; i(ImageSpotFinder::NBX) + 1); // wave front up to rmax do { __syncthreads(); // make others see the shared values uint8_t val = 0; if (result_col) { // write at the back end of the wave total_sum = total_sum2 = total_count = 0; for (auto j = left; j < right; j++) { total_sum += shared_sum[j]; total_sum2 += shared_sum2[j]; total_count += shared_count[j]; } val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); } write_result(params, out, back * params.width + col, val); back++; __syncthreads(); // keep shared values until others have seen them if (data_col) { // read at the front end of the wave int16_t cnt = 0; int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x]; if (old == INT32_MAX || old == INT32_MIN) { old = 0; // no effect value cnt = 1; // bring count to normal } int32_t val = in[front * params.width + col]; shared_val[(front % window) * blockDim.x + threadIdx.x] = val; if (val == INT32_MAX || val == INT32_MIN) { val = 0; // no effect value cnt -= 1; // count diff from normal } shared_sum[threadIdx.x] += val - old; shared_sum2[threadIdx.x] += val * val - old * old; shared_count[threadIdx.x] += cnt; } front++; } while (front < rmax); // wave back up to rmax do { __syncthreads(); // make others see the shared values uint8_t val = 0; if (result_col) { // write at the back end of the wave total_sum = total_sum2 = total_count = 0; for (auto j = left; j < right; j++) { total_sum += shared_sum[j]; total_sum2 += shared_sum2[j]; total_count += shared_count[j]; } val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count); } write_result(params, out, back * params.width + col, val); back++; __syncthreads(); // keep shared values until others have seen them if (data_col) { // read at the front end of the wave if possible int16_t cnt = -1; // normal count diff int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x]; if (old == INT32_MAX || old == INT32_MIN) { old = 0; // no effect value cnt += 1; // bring count to normal } int32_t val = 0; if (front < params.height) { const int32_t npixel = front * params.width + col; const bool sat = ((prev_out[npixel / 32] & (1U << (npixel % 32))) != 0); val = sat ? INT32_MAX : in[npixel]; if (val == INT32_MAX || val == INT32_MIN) val = 0; // no effect value else cnt += 1; // count diff from normal } shared_sum[threadIdx.x] += val - old; shared_sum2[threadIdx.x] += val * val - old * old; shared_count[threadIdx.x] += cnt; } front++; } while (back < rmax); } ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height) : ImageSpotFinder(in_width, in_height), input_buffer_reg(input_buffer), output_buffer_reg(output_buffer) { gpu_in = CudaDevicePtr(width * height); gpu_out_0 = CudaDevicePtr(OutputSize()); gpu_out_1 = CudaDevicePtr(OutputSize()); } std::vector ImageSpotFinderGPU::Run(const SpotFindingSettings &settings, const std::vector &res_mask) { spot_parameters spot_params{}; spot_params.height = height; spot_params.width = width; spot_params.strong_pixel_threshold2 = settings.signal_to_noise_threshold * settings.signal_to_noise_threshold; spot_params.count_threshold = settings.photon_count_threshold; if (2 * NBX + 1 > windowSizeLimit) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nbx exceeds window size limit"); if (2 * NBX + 1 > windowSizeLimit) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nby exceeds window size limit"); if (windowSizeLimit > numberOfCudaThreads) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of cuda threads"); if (windowSizeLimit > spot_params.width) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of columns"); if (windowSizeLimit > spot_params.height) throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of height"); const auto nWriters = numberOfCudaThreads - 2 * NBX; const auto nBlocks = (spot_params.width + nWriters - 1) / nWriters; const auto window = 2 * NBX + 1; const auto sharedSize = (2 * sizeof(int64_t) + // sum, sum2 window * sizeof(int32_t) + // val 1 * sizeof(int16_t) // count ) * numberOfCudaThreads; const dim3 blocks(nBlocks, numberOfWaves); cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice, stream)); cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), stream)); cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), stream)); analyze_pixel<<>> (gpu_in, gpu_out_1, gpu_out_0, spot_params); analyze_pixel<<>> (gpu_in, gpu_out_0, gpu_out_1, spot_params); cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); cuda_err(cudaStreamSynchronize(stream)); return ExtractSpots(settings, res_mask); }