// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute
// SPDX-License-Identifier: GPL-3.0-only

#include "ImagePreprocessorGPU.h"

// Per-pixel preprocessing kernel.
//
// Converts raw detector pixels of type T to int32_t, applying a static pixel
// mask, a saturation limit and (for signed inputs only) an error sentinel,
// and accumulates per-image statistics.
//
// Launch layout: 1D grid, 1D blocks; a grid-stride loop makes any grid size
// valid (including <<<1, 1>>> for debugging). Uses a fixed amount of static
// shared memory (5 scalars); no dynamic shared memory.
//
// Output encoding per pixel:
//   masked            -> INT32_MIN
//   >= saturation     -> INT32_MAX
//   == err (signed T) -> INT32_MIN
//   otherwise         -> (int32_t) value
//
// stats must be pre-initialized by the host (counters zeroed, max_value set
// to INT64_MIN, min_value to INT64_MAX); the kernel only combines into it
// with atomics, so several blocks (or kernels on the same stream) may share
// one ImageStatistics instance.
//
// NOTE(review): (int32_t) v may wrap for uint32_t inputs if saturation_limit
// exceeds INT32_MAX — the host-side clamp in Analyze() is assumed to keep
// saturation_limit within int32_t range; confirm against its configuration.
template <class T>
__global__ void preprocess_kernel(const T *__restrict__ input,
                                  const uint8_t *__restrict__ mask,
                                  int32_t *__restrict__ output,
                                  ImageStatistics *__restrict__ stats,
                                  T saturation_limit,
                                  T err_value,
                                  int npixels) {
    // Block-wide accumulators; combined into once per thread, then flushed
    // to global memory by thread 0. This keeps global atomic traffic to a
    // handful of operations per block.
    __shared__ unsigned long long s_masked;
    __shared__ unsigned long long s_saturated;
    __shared__ unsigned long long s_error;
    __shared__ long long s_max;
    __shared__ long long s_min;

    if (threadIdx.x == 0) {
        s_masked = 0;
        s_saturated = 0;
        s_error = 0;
        s_max = INT64_MIN;   // identity for max
        s_min = INT64_MAX;   // identity for min
    }
    __syncthreads();

    // Thread-local accumulators — no synchronization needed in the hot loop.
    unsigned long long local_masked = 0;
    unsigned long long local_saturated = 0;
    unsigned long long local_error = 0;
    long long local_max = INT64_MIN;
    long long local_min = INT64_MAX;

    // The error sentinel only exists for signed pixel formats; for unsigned
    // formats the caller passes err_value == sat_value and this disables the
    // comparison entirely (compile-time constant per instantiation).
    constexpr bool is_signed = std::is_signed<T>::value;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < npixels;
         i += blockDim.x * gridDim.x) {
        T v = input[i];
        bool is_masked = mask[i];
        bool is_sat = v >= saturation_limit;
        bool is_err = is_signed && (v == err_value);
        bool valid = !(is_masked || is_sat || is_err);

        // Output (mask wins over saturation, saturation over error)
        output[i] = is_masked ? INT32_MIN
                  : is_sat    ? INT32_MAX
                  : is_err    ? INT32_MIN
                              : (int32_t) v;

        // Counters — each pixel is counted in at most one category,
        // with the same precedence as the output encoding above.
        local_masked += is_masked;
        local_saturated += (!is_masked && is_sat);
        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only over valid pixels
        if (valid) {
            int64_t val = (int64_t) v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
    }

    // Reduce thread-local results into shared memory.
    atomicAdd(&s_masked, local_masked);
    atomicAdd(&s_saturated, local_saturated);
    atomicAdd(&s_error, local_error);
    // local_min <= local_max holds iff this thread saw at least one valid
    // pixel; skipping the atomics otherwise avoids polluting the extrema
    // with the INT64_MIN/INT64_MAX identities.
    if (local_min <= local_max) {
        atomicMax((long long *) &s_max, (long long) local_max);
        atomicMin((long long *) &s_min, (long long) local_min);
    }
    __syncthreads();

    // One thread per block publishes the block result to global memory.
    if (threadIdx.x == 0) {
        atomicAdd(&stats->masked_pixel_count, s_masked);
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);
        atomicMax((long long *) &stats->max_value, (long long) s_max);
        atomicMin((long long *) &stats->min_value, (long long) s_min);
    }
}

// Uploads the static pixel mask to the GPU once and sizes the launch
// configuration from the device's SM count.
//
// NOTE(review): the std::shared_ptr element type is reconstructed as
// cudaStream_t (consistent with *stream being used directly as a stream
// handle below) — confirm against the declaration in the header.
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment,
                                           const PixelMask &mask,
                                           std::shared_ptr<cudaStream_t> stream)
        : ImagePreprocessor(experiment),
          stream(stream),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)),
          // Overshoot - if input image is 1- or 2-byte, then it is still fine,
          // while memory loss is minimal
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {
    // Setup mask: collapse the project mask to one byte per pixel
    // (0 = usable, 1 = masked), matching the kernel's uint8_t mask input.
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);
    cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);

    // Setup GPU settings: 4 blocks per SM gives the grid-stride kernel
    // enough resident blocks to hide memory latency.
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);
    threads = 128;
    blocks = 4 * prop.multiProcessorCount;
}

// Dispatches to the typed Analyze<T>() according to the compressed image
// pixel format. For signed formats the type's minimum acts as the error
// sentinel; unsigned formats have no sentinel, so err_value == sat_value
// (the kernel disables the error check for unsigned T anyway).
//
// Throws JFJochException for RGB/float modes, which are not supported here.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image,
                                              const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    switch (image_mode) {
        case CompressedImageMode::Int8:
            return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
        case CompressedImageMode::Int16:
            return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
        case CompressedImageMode::Int32:
            return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
        case CompressedImageMode::Uint8:
            return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
        case CompressedImageMode::Uint16:
            return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
        case CompressedImageMode::Uint32:
            return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
        default:
            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                                  "RGB/float mode not supported");
    }
}

// Typed analysis path: upload the raw image, run the preprocessing kernel,
// and download both the converted int32 image and the statistics.
//
// err_value  - sentinel marking error pixels (meaningful for signed T only)
// sat_value  - type maximum; clamped to the configured saturation_limit
//
// Blocking: returns only after cudaStreamSynchronize, so the returned
// ImageStatistics and processed_image.data() are ready for the caller.
//
// NOTE(review): CUDA API return codes are not checked anywhere in this
// path (matching the original); a failed launch or copy is silent until a
// later call — consider adding a checking wrapper that throws
// JFJochException.
template <class T>
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image,
                                              const uint8_t *input,
                                              T err_value, T sat_value) {
    // Never saturate above the experiment-configured limit.
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Synchronous upload of the raw image (legal regardless of stream state);
    // everything after this point is ordered on *stream.
    cudaMemcpy(gpu_decompressed_image, input, npixels * sizeof(T),
               cudaMemcpyHostToDevice);

    // Identities for min/max; counters zero-initialized.
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics),
                    cudaMemcpyHostToDevice, *stream);

    preprocess_kernel<T><<<blocks, threads, 0, *stream>>>(
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            processed_image.getGPUBuffer(),
            gpu_stats,
            sat_value,
            err_value,
            npixels);

    cudaMemcpyAsync(processed_image.data(), processed_image.getGPUBuffer(),
                    npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics),
                    cudaMemcpyDeviceToHost, *stream);
    cudaStreamSynchronize(*stream);
    return cpu_stats[0];
}