17619bd19a
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 15m13s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m49s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 16m45s
Build Packages / build:rpm (rocky8) (push) Successful in 17m36s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m27s
Build Packages / build:rpm (rocky9) (push) Successful in 18m28s
Build Packages / Generate python client (push) Successful in 1m31s
Build Packages / Unit tests (push) Successful in 1h0m56s
Build Packages / build:rpm (ubuntu2204) (push) Failing after 9m13s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 1m52s
Build Packages / XDS test (durin plugin) (push) Successful in 10m18s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m22s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m24s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 13m6s
Build Packages / DIALS test (push) Successful in 13m14s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 11m15s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Failing after 14m5s
162 lines
5.8 KiB
Plaintext
162 lines
5.8 KiB
Plaintext
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "ImagePreprocessorGPU.h"

template<class T>
// Grid-stride kernel: copies one image into an int32 output buffer while
// classifying every pixel as masked / saturated / error, and merges per-image
// statistics (category counts + min/max of valid pixels) into *stats.
//
// Output sentinel encoding: masked and error pixels -> INT32_MIN, saturated
// pixels -> INT32_MAX, valid pixels are narrowed to int32_t.
//
// *stats must be pre-seeded by the caller (counts zeroed, max_value =
// INT64_MIN, min_value = INT64_MAX) — this kernel only accumulates into it
// with global atomics, one batch per block.
//
// NOTE(review): atomicMax/atomicMin on signed long long require compute
// capability 3.5+ — confirm against the project's minimum supported GPU.
__global__ void preprocess_kernel(
    const T *__restrict__ input,
    const uint8_t *__restrict__ mask,     // nonzero byte -> pixel is masked out
    int32_t *__restrict__ output,
    ImageStatistics *__restrict__ stats,  // global accumulator (atomics)
    T saturation_limit,                   // v >= limit counts as saturated
    T err_value,                          // error sentinel (used for signed T only)
    int npixels) {
    // Shared block accumulators — each block reduces locally first so only
    // one thread per block touches the global statistics.
    __shared__ unsigned long long s_masked;
    __shared__ unsigned long long s_saturated;
    __shared__ unsigned long long s_error;
    __shared__ long long s_max;
    __shared__ long long s_min;

    if (threadIdx.x == 0) {
        s_masked = 0;
        s_saturated = 0;
        s_error = 0;
        s_max = INT64_MIN;  // identity element for max
        s_min = INT64_MAX;  // identity element for min
    }
    __syncthreads();

    // Thread-local accumulators (no atomics needed inside the pixel loop)
    unsigned long long local_masked = 0;
    unsigned long long local_saturated = 0;
    unsigned long long local_error = 0;
    long long local_max = INT64_MIN;
    long long local_min = INT64_MAX;

    // The error sentinel only applies to signed pixel types; for unsigned T
    // this folds the err_value comparison away at compile time.
    constexpr bool is_signed = std::is_signed<T>::value;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < npixels;
         i += blockDim.x * gridDim.x) {
        T v = input[i];

        bool is_masked = mask[i];
        bool is_sat = v >= saturation_limit;
        bool is_err = is_signed && (v == err_value);

        bool valid = !(is_masked || is_sat || is_err);

        // Output: precedence is masked > saturated > error > valid
        output[i] =
            is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;

        // Counters — the guards ensure each pixel lands in at most one
        // category, using the same precedence as the output encoding.
        local_masked += is_masked;
        local_saturated += (!is_masked && is_sat);
        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only for valid pixels
        if (valid) {
            int64_t val = (int64_t) v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
    }

    // Reduce to shared memory
    atomicAdd(&s_masked, local_masked);
    atomicAdd(&s_saturated, local_saturated);
    atomicAdd(&s_error, local_error);

    // local_min <= local_max holds iff this thread saw at least one valid
    // pixel; skipping the atomics otherwise keeps the identities intact.
    if (local_min <= local_max) {
        atomicMax((long long *) &s_max, (long long) local_max);
        atomicMin((long long *) &s_min, (long long) local_min);
    }

    __syncthreads();

    // One thread writes the block result into the global statistics
    if (threadIdx.x == 0) {
        atomicAdd(&stats->masked_pixel_count, s_masked);
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);

        atomicMax((long long *) &stats->max_value, (long long) s_max);
        atomicMin((long long *) &stats->min_value, (long long) s_min);
    }
}
|
|
|
|
// Sets up GPU resources for image preprocessing:
//  - uploads the pixel mask to the GPU as 0/1 bytes,
//  - sizes the kernel launch from the properties of the CURRENT CUDA device
//    (previously device 0 was hard-coded, which mis-sizes the grid on
//    multi-GPU hosts where another device is current for this thread).
//
// gpu_decompressed_image is sized for the widest supported pixel (4 bytes),
// so 1- and 2-byte images reuse the same buffer with minimal waste.
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
                                           std::shared_ptr<CudaStream> stream)
        : ImagePreprocessor(experiment),
          stream(stream),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {
    // Setup mask: collapse arbitrary nonzero mask values to 0/1 bytes,
    // matching what preprocess_kernel reads
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);

    cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);

    // Setup GPU settings — query the device that is actually current for
    // this thread instead of assuming device 0. If cudaGetDevice fails,
    // `device` keeps its initializer and we fall back to device 0 as before.
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);

    // Grid-stride kernel: a few blocks per SM is enough to saturate the GPU
    threads = 128;
    blocks = 4 * prop.multiProcessorCount;
}
|
|
|
|
// Dispatches preprocessing to the template instantiation matching the pixel
// type of the compressed image.
//
// For signed integer modes the type minimum serves as the detector error
// sentinel. Unsigned modes have no error value, so the type maximum is
// passed for both the error and saturation arguments (the kernel ignores
// err_value for unsigned types).
//
// Throws JFJochException for modes without an integer pixel representation.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image, const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    if (image_mode == CompressedImageMode::Int8)
        return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
    if (image_mode == CompressedImageMode::Int16)
        return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
    if (image_mode == CompressedImageMode::Int32)
        return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
    if (image_mode == CompressedImageMode::Uint8)
        return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
    if (image_mode == CompressedImageMode::Uint16)
        return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
    if (image_mode == CompressedImageMode::Uint32)
        return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
    throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
}
|
|
|
|
template<class T>
// Uploads one image, runs preprocess_kernel and downloads the int32 output
// plus the accumulated statistics.
//
// err_value - pixel value marking a detector error (meaningful for signed T
//             only; the kernel ignores it for unsigned types)
// sat_value - type maximum, clamped here to the configured saturation limit
//
// Blocking: synchronizes the stream before returning, so processed_image and
// the returned ImageStatistics are valid on the host when this returns.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image,
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Issue the upload on the same stream as everything else instead of a
    // blocking cudaMemcpy on the legacy default stream (which serializes with
    // all other streams on the device). Ordering is unchanged: the kernel is
    // enqueued on the same stream, and cudaStreamSynchronize below guarantees
    // completion before this function returns.
    cudaMemcpyAsync(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice, *stream);

    // Seed the statistics with the identity values the kernel's atomic
    // min/max merging expects; counts are zeroed by the designated initializer.
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);

    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            processed_image.getGPUBuffer(),
            gpu_stats,
            sat_value,
            err_value,
            npixels);

    cudaMemcpyAsync(processed_image.data(), processed_image.getGPUBuffer(), npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);

    cudaStreamSynchronize(*stream);

    return cpu_stats[0];
}