ImageProcessor: Work in progress on GPU code to preprocess images
This commit is contained in:
@@ -0,0 +1,175 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include "ImagePreprocessorGPU.h"
|
||||
|
||||
// Preprocessing kernel: converts an image from its native detector type T to a
// common int32 representation while classifying every pixel, and accumulates
// image-wide statistics into *stats.
//
// Per-pixel classification (precedence: masked > saturated > error):
//   masked    -> output INT32_MIN, counted in masked_pixel_count
//   saturated -> output INT32_MAX, counted in saturated_pixel_count
//   error     -> output INT32_MIN, counted in error_pixel_count
//               (error check only for signed T; for unsigned T err_value equals
//                the type maximum and such pixels are caught as saturated)
//   valid     -> output (int32_t)v, participates in min/max
//
// Launch: 1-D grid of 1-D blocks; a grid-stride loop makes any <<<blocks,
// threads>>> configuration cover npixels. Reduction strategy: thread-local
// accumulators -> shared-memory block accumulators (atomics) -> one set of
// global atomics per block, to keep contention on *stats low.
//
// Preconditions: *stats must be host-initialized before launch (counters 0,
// max_value = INT64_MIN, min_value = INT64_MAX). 64-bit atomicMin/atomicMax
// require compute capability 3.5+.
// NOTE(review): the final (int32_t)v cast assumes saturation_limit never
// exceeds INT32_MAX for wider-than-32-bit unsigned inputs — confirm at caller.
template<class T>
__global__ void preprocess_kernel(
        const T *__restrict__ input,          // raw image, npixels elements (device)
        const uint8_t *__restrict__ mask,     // non-zero byte => pixel is masked (device)
        int32_t *__restrict__ output,         // converted image, npixels elements (device)
        ImageStatistics *__restrict__ stats,  // global accumulator shared by all blocks (device)
        T saturation_limit,                   // v >= limit => saturated
        T err_value,                          // signed T only: v == err_value => error pixel
        int npixels) {
    // Shared block accumulators
    __shared__ unsigned long long s_masked;
    __shared__ unsigned long long s_saturated;
    __shared__ unsigned long long s_error;
    __shared__ long long s_max;
    __shared__ long long s_min;

    // One thread zeroes the block accumulators; identity elements for max/min.
    if (threadIdx.x == 0) {
        s_masked = 0;
        s_saturated = 0;
        s_error = 0;
        s_max = INT64_MIN;
        s_min = INT64_MAX;
    }
    // Barrier: all threads must see the initialized shared values before use.
    __syncthreads();

    // Thread-local accumulators
    unsigned long long local_masked = 0;
    unsigned long long local_saturated = 0;
    unsigned long long local_error = 0;
    long long local_max = INT64_MIN;
    long long local_min = INT64_MAX;

    // Compile-time: error-value comparison is only meaningful for signed types.
    constexpr bool is_signed = std::is_signed<T>::value;

    // Grid-stride loop over all pixels.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < npixels;
         i += blockDim.x * gridDim.x) {
        T v = input[i];

        bool is_masked = mask[i];
        bool is_sat = v >= saturation_limit;
        bool is_err = is_signed && (v == err_value);

        // Valid = none of the special conditions apply.
        bool valid = !(is_masked || is_sat || is_err);

        // Output: sentinel values for special pixels, plain cast otherwise.
        output[i] =
                is_masked ? INT32_MIN :
                is_sat ? INT32_MAX :
                is_err ? INT32_MIN :
                (int32_t)v;

        // Counters — each pixel is counted in at most one category,
        // mirroring the precedence of the output selection above.
        local_masked += is_masked;
        local_saturated+= (!is_masked && is_sat);
        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only for valid
        if (valid) {
            int64_t val = (int64_t)v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
    }

    // Reduce to shared memory
    atomicAdd(&s_masked, local_masked);
    atomicAdd(&s_saturated, local_saturated);
    atomicAdd(&s_error, local_error);

    // min <= max only holds if this thread saw at least one valid pixel;
    // threads with no valid pixels skip the min/max update entirely.
    if (local_min <= local_max) {
        atomicMax((long long*)&s_max, (long long)local_max);
        atomicMin((long long*)&s_min, (long long)local_min);
    }

    // Barrier: block totals must be complete before thread 0 publishes them.
    __syncthreads();

    // One thread writes block result
    // NOTE(review): assumes ImageStatistics counters are unsigned long long and
    // max_value/min_value are int64_t so the atomics/casts are well-defined —
    // confirm against ImagePreprocessorGPU.h.
    if (threadIdx.x == 0) {
        atomicAdd(&stats->masked_pixel_count, s_masked);
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);

        // Publishing INT64_MIN/INT64_MAX identities here is harmless: they can
        // never win against any real value in the global max/min.
        atomicMax((long long*)&stats->max_value, (long long)s_max);
        atomicMin((long long*)&stats->min_value, (long long)s_min);
    }
}
|
||||
|
||||
// Constructor: allocates GPU buffers sized for the experiment, uploads the
// pixel mask (compacted to 0/1 bytes), and derives the kernel launch
// configuration from the properties of the current CUDA device.
//
// Fix vs. previous revision: the device properties were queried for hard-coded
// device 0, which is wrong when the caller selected another GPU via
// cudaSetDevice; the current device is now queried first. The mask copy size
// is also spelled out as npixels * sizeof(uint8_t).
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
        : ImagePreprocessor(experiment),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
          gpu_image(npixels),
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {

    stream = std::make_shared<CudaStream>();

    // Setup mask: compact the project mask to one 0/1 byte per pixel, as
    // expected by preprocess_kernel.
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);

    // NOTE(review): CUDA return codes are not checked anywhere in this
    // constructor — consider wrapping with the project's error-check helper.
    cudaMemcpy(gpu_mask, mask_vec.data(), npixels * sizeof(uint8_t), cudaMemcpyHostToDevice);

    // Setup GPU settings: query the device that is actually current for this
    // thread (not device 0 unconditionally).
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);

    threads = 128;
    // Oversubscribe each SM ~4x; the kernel's grid-stride loop makes the exact
    // block count a performance knob, not a correctness requirement.
    blocks = 4 * prop.multiProcessorCount;
}
|
||||
|
||||
// Dispatch on the compressed-image pixel type: forwards to the templated
// Analyze<T> with the type-appropriate error and saturation sentinels.
// Signed types use their minimum as the error value; unsigned types have no
// distinct error value, so error and saturation sentinels coincide at the
// type maximum. RGB/float modes are rejected.
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    if (image_mode == CompressedImageMode::Int8)
        return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
    if (image_mode == CompressedImageMode::Int16)
        return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
    if (image_mode == CompressedImageMode::Int32)
        return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
    if (image_mode == CompressedImageMode::Uint8)
        return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
    if (image_mode == CompressedImageMode::Uint16)
        return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
    if (image_mode == CompressedImageMode::Uint32)
        return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
    throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
}
|
||||
|
||||
// Templated implementation: uploads the raw image, resets the statistics
// accumulator, runs preprocess_kernel<T>, and downloads both the converted
// int32 image and the statistics. Blocks until all work has completed, so the
// caller may reuse `input` and read `processed_image` immediately on return.
//
// Fix vs. previous revision: the input upload used a blocking cudaMemcpy on
// the legacy default stream, which serializes against the dedicated stream;
// it is now enqueued on the same stream as the rest of the pipeline, keeping
// the whole sequence stream-ordered. The final cudaStreamSynchronize still
// guarantees completion before return, so caller-visible semantics are
// unchanged.
//
// NOTE(review): assumes processed_image.size() >= npixels and that `input`
// holds npixels elements of T — confirm at call sites. CUDA return codes and
// cudaGetLastError() after the launch are still unchecked.
template<class T>
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image,
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {

    // Effective saturation threshold: the tighter of the data-type limit and
    // the experiment-level saturation_limit.
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Upload raw image (stream-ordered; pageable source degrades gracefully
    // to a staged copy but keeps its position in the stream).
    cudaMemcpyAsync(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice, *stream);

    // Reset statistics to reduction identities (counters zero-initialized).
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            gpu_image,
            gpu_stats,
            sat_value,
            err_value,
            npixels);
    // Download converted image and statistics; ordered after the kernel.
    cudaMemcpyAsync(processed_image.data(), gpu_image, npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);

    // Block until upload, kernel and downloads have all finished.
    cudaStreamSynchronize(*stream);

    return cpu_stats[0];
}
|
||||
|
||||
// Device-side pointer to the preprocessed int32 image buffer; contents are
// meaningful only after a completed Analyze() call.
const int32_t *ImagePreprocessorGPU::GetImageDevicePtr() const {
    const int32_t *device_ptr = gpu_image;
    return device_ptr;
}
|
||||
|
||||
// Shared handle to the CUDA stream on which this preprocessor enqueues its
// work; callers may use it to order their own operations after Analyze().
std::shared_ptr<CudaStream> ImagePreprocessorGPU::GetStream() const {
    auto stream_handle = stream; // copy shares ownership with this object
    return stream_handle;
}
|
||||
Reference in New Issue
Block a user