// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only #include "AzIntEngineGPU.h" inline void cuda_err(cudaError_t val) { if (val != cudaSuccess) throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val)); } __global__ void gpu_azim_shared( const uint16_t *__restrict__ pixel_to_bin, const float *__restrict__ corrections, const int32_t *__restrict__ input_buffer, float *__restrict__ azint_sum, float *__restrict__ azint_sum2, uint32_t *__restrict__ azint_count, size_t num_pixels, int azint_bins) { extern __shared__ float shared[]; float *s_sum = shared; float *s_sum2 = &s_sum[azint_bins]; uint32_t *s_count = (uint32_t *) &s_sum2[azint_bins]; // Initialize shared memory for (int i = threadIdx.x; i < azint_bins; i += blockDim.x) { s_sum[i] = 0.0f; s_sum2[i] = 0.0f; s_count[i] = 0; } __syncthreads(); for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_pixels; idx += blockDim.x * gridDim.x) { uint16_t bin = pixel_to_bin[idx]; int32_t v = input_buffer[idx]; bool valid = (v != INT32_MIN) & (v != INT32_MAX); if (bin < azint_bins && valid) { const float val = static_cast(v) * corrections[idx]; const float val2 = val * val; atomicAdd(&s_sum[bin], val); atomicAdd(&s_sum2[bin], val2); atomicAdd(&s_count[bin], 1); } } __syncthreads(); // Merge to global memory for (unsigned int i = threadIdx.x; i < azint_bins; i += blockDim.x) { atomicAdd(&azint_sum[i], s_sum[i]); atomicAdd(&azint_sum2[i], s_sum2[i]); atomicAdd(&azint_count[i], s_count[i]); } } __global__ void gpu_azim( const uint16_t *__restrict__ pixel_to_bin, const float *__restrict__ corrections, const int32_t *__restrict__ input_buffer, float *__restrict__ azint_sum, float *__restrict__ azint_sum2, uint32_t *__restrict__ azint_count, size_t num_pixels, int azint_bins) { for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_pixels; idx += blockDim.x * gridDim.x) { uint16_t bin = pixel_to_bin[idx]; int32_t v = input_buffer[idx]; bool valid = (v != INT32_MIN) & (v != INT32_MAX); if (bin < azint_bins && valid) { const float val = static_cast(v) * corrections[idx]; const float val2 = val * val; atomicAdd(&azint_sum[bin], val); atomicAdd(&azint_sum2[bin], val2); atomicAdd(&azint_count[bin], 1); } } } AzIntEngineGPU::AzIntEngineGPU(const AzimuthalIntegrationMapping &integration, std::shared_ptr stream) : AzIntEngine(integration), stream(stream), gpu_azint_correction(npixel), gpu_pixel_to_bin(npixel), gpu_sum(azint_bins), gpu_sum2(azint_bins), gpu_count(azint_bins), cpu_sum_reg(azint_sum), cpu_sum2_reg(azint_sum2), cpu_count_reg(azint_count) { cudaDeviceProp prop{}; cudaGetDeviceProperties(&prop, 0); threads = 128; blocks = 4 * prop.multiProcessorCount; shared_size = prop.sharedMemPerBlock; shared_needed = azint_bins * (2 * sizeof(float) + sizeof(uint32_t)); cudaMemcpy(gpu_azint_correction, integration.Corrections().data(), sizeof(float) * npixel, cudaMemcpyHostToDevice); cudaMemcpy(gpu_pixel_to_bin, integration.GetPixelToBin().data(), sizeof(uint16_t) * npixel, cudaMemcpyHostToDevice); } void AzIntEngineGPU::Run(const ImagePreprocessorBuffer &image, AzimuthalIntegrationProfile &profile) { if (image.size() != integration.GetPixelToBin().size()) throw std::runtime_error("ImageSpotFinder::AzimIntegration: Mismatch in size"); cuda_err(cudaMemsetAsync(gpu_sum, 0, sizeof(float) * azint_bins, *stream)); cuda_err(cudaMemsetAsync(gpu_sum2, 0, sizeof(float) * azint_bins, *stream)); cuda_err(cudaMemsetAsync(gpu_count, 0, sizeof(uint32_t) * azint_bins, *stream)); if (shared_needed < shared_size) { gpu_azim_shared<<>>( gpu_pixel_to_bin,gpu_azint_correction,image.getGPUBuffer(), gpu_sum, gpu_sum2, gpu_count, npixel, azint_bins ); } else { gpu_azim<<>>( gpu_pixel_to_bin,gpu_azint_correction,image.getGPUBuffer(), gpu_sum, gpu_sum2, gpu_count, npixel, azint_bins ); } cudaMemcpyAsync(azint_sum.data(), gpu_sum, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream); cudaMemcpyAsync(azint_sum2.data(), gpu_sum2, sizeof(float) * azint_bins, cudaMemcpyDeviceToHost, *stream); cudaMemcpyAsync(azint_count.data(), gpu_count, sizeof(uint32_t) * azint_bins, cudaMemcpyDeviceToHost, *stream); cuda_err(cudaStreamSynchronize(*stream)); profile.Clear(integration); profile.Add(azint_sum, azint_count); }