Jungfraujoch/tests/ROIIntegrationGPUTest.cpp

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include <catch2/catch_all.hpp>
#include "../common/CUDAWrapper.h"

#ifdef JFJOCH_USE_CUDA

#include <map>
#include <string>
#include <vector>

#include "../image_analysis/roi/ROIIntegrationCPU.h"
#include "../image_analysis/roi/ROIIntegrationGPU.h"
#include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h"
#include "../common/DiffractionExperiment.h"

namespace {
DiffractionExperiment make_roi_experiment() {
    DiffractionExperiment experiment(DetJF(1));
    // Overlapping boxes so some pixels belong to several ROIs at once (multi-bit
    // mask), exercising the per-bit accumulation that has to match between CPU and GPU.
    // ROIBox is (name, x_min, x_max, y_min, y_max), kept within the JF module bounds.
    experiment.ROI().SetROI(ROIDefinition{.boxes = {
        ROIBox("roiA", 10, 210, 20, 220),
        ROIBox("roiB", 100, 300, 100, 300),
        ROIBox("roiC", 0, 150, 0, 150),
        ROIBox("roiD", 50, 250, 50, 250),
    }});
    return experiment;
}

void compare_results(const std::map<std::string, ROIMessage> &cpu,
                     const std::map<std::string, ROIMessage> &gpu) {
    REQUIRE(cpu.size() == gpu.size());
    for (const auto &[name, c] : cpu) {
        INFO("ROI " << name);
        REQUIRE(gpu.contains(name));
        const auto &g = gpu.at(name);
        CHECK(g.sum           == c.sum);
        CHECK(g.sum_square    == c.sum_square);
        CHECK(g.max_count     == c.max_count);
        CHECK(g.pixels        == c.pixels);
        CHECK(g.x_weighted    == c.x_weighted);
        CHECK(g.y_weighted    == c.y_weighted);
        CHECK(g.pixels_masked == c.pixels_masked);
    }
}
} // namespace

// The GPU kernel reduces with atomics and two's-complement unsigned accumulators,
// while the CPU path is a plain serial loop. On identical input every per-ROI
// statistic must be bit-for-bit identical, so we run both and compare.
TEST_CASE("ROIIntegrationGPU_MatchesCPU") {
    if (get_gpu_count() == 0) {
        WARN("No CUDA GPU present. Skipping ROIIntegrationGPU_MatchesCPU");
        return;
    }

    const DiffractionExperiment experiment = make_roi_experiment();
    const auto roi_map = experiment.ExportROIMap();
    const size_t npixel = roi_map.size();
    const uint16_t roi_count = experiment.ROI().size();
    REQUIRE(roi_count == 4);

    // Deterministic image with both positive and negative values; negatives exercise
    // the signed weighted-sum path (val * x can be negative).
    std::vector<int32_t> values(npixel);
    for (size_t i = 0; i < npixel; i++)
        values[i] = static_cast<int32_t>((i * 2654435761u) % 1000) - 500;

    // Inject one saturated (INT32_MAX) and one masked (INT32_MIN) pixel into every ROI
    // so both the "max only, not summed" and "fully excluded" branches are covered.
    for (uint16_t r = 0; r < roi_count; r++) {
        bool injected_sat = false, injected_mask = false;
        for (size_t i = 0; i < npixel && !(injected_sat && injected_mask); i++) {
            if (!(roi_map[i] & (1u << r)))
                continue;
            if (!injected_sat) { values[i] = INT32_MAX; injected_sat = true; }
            else if (!injected_mask) { values[i] = INT32_MIN; injected_mask = true; }
        }
        REQUIRE(injected_sat);
        REQUIRE(injected_mask);
    }

    // CPU reference
    ImagePreprocessorBuffer cpu_image(npixel);
    for (size_t i = 0; i < npixel; i++)
        cpu_image[i] = values[i];

    ROIIntegrationCPU cpu(experiment);
    std::map<std::string, ROIMessage> out_cpu;
    cpu.Run(cpu_image, out_cpu);

    // GPU under test — identical input uploaded to the device
    auto stream = std::make_shared<CudaStream>();
    ImagePreprocessorBufferGPU gpu_image(npixel);
    for (size_t i = 0; i < npixel; i++)
        gpu_image[i] = values[i];

    REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(),
                            gpu_image.getBuffer().data(),
                            npixel * sizeof(int32_t),
                            cudaMemcpyHostToDevice,
                            *stream) == cudaSuccess);

    ROIIntegrationGPU gpu(experiment, stream);
    std::map<std::string, ROIMessage> out_gpu;
    gpu.Run(gpu_image, out_gpu);

    compare_results(out_cpu, out_gpu);
}

#endif