// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only #include #include "../common/CUDAWrapper.h" #ifdef JFJOCH_USE_CUDA #include #include #include #include "../image_analysis/roi/ROIIntegrationCPU.h" #include "../image_analysis/roi/ROIIntegrationGPU.h" #include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h" #include "../common/DiffractionExperiment.h" namespace { DiffractionExperiment make_roi_experiment() { DiffractionExperiment experiment(DetJF(1)); // Overlapping boxes so some pixels belong to several ROIs at once (multi-bit // mask), exercising the per-bit accumulation that has to match between CPU and GPU. // ROIBox is (name, x_min, x_max, y_min, y_max), kept within the JF module bounds. experiment.ROI().SetROI(ROIDefinition{.boxes = { ROIBox("roiA", 10, 210, 20, 220), ROIBox("roiB", 100, 300, 100, 300), ROIBox("roiC", 0, 150, 0, 150), ROIBox("roiD", 50, 250, 50, 250), }}); return experiment; } void compare_results(const std::map &cpu, const std::map &gpu) { REQUIRE(cpu.size() == gpu.size()); for (const auto &[name, c] : cpu) { INFO("ROI " << name); REQUIRE(gpu.contains(name)); const auto &g = gpu.at(name); CHECK(g.sum == c.sum); CHECK(g.sum_square == c.sum_square); CHECK(g.max_count == c.max_count); CHECK(g.pixels == c.pixels); CHECK(g.x_weighted == c.x_weighted); CHECK(g.y_weighted == c.y_weighted); CHECK(g.pixels_masked == c.pixels_masked); } } } // namespace // The GPU kernel reduces with atomics and two's-complement unsigned accumulators, // while the CPU path is a plain serial loop. On identical input every per-ROI // statistic must be bit-for-bit identical, so we run both and compare. TEST_CASE("ROIIntegrationGPU_MatchesCPU") { if (get_gpu_count() == 0) { WARN("No CUDA GPU present. Skipping ROIIntegrationGPU_MatchesCPU"); return; } const DiffractionExperiment experiment = make_roi_experiment(); const auto roi_map = experiment.ExportROIMap(); const size_t npixel = roi_map.size(); const uint16_t roi_count = experiment.ROI().size(); REQUIRE(roi_count == 4); // Deterministic image with both positive and negative values; negatives exercise // the signed weighted-sum path (val * x can be negative). std::vector values(npixel); for (size_t i = 0; i < npixel; i++) values[i] = static_cast((i * 2654435761u) % 1000) - 500; // Inject one saturated (INT32_MAX) and one masked (INT32_MIN) pixel into every ROI // so both the "max only, not summed" and "fully excluded" branches are covered. for (uint16_t r = 0; r < roi_count; r++) { bool injected_sat = false, injected_mask = false; for (size_t i = 0; i < npixel && !(injected_sat && injected_mask); i++) { if (!(roi_map[i] & (1u << r))) continue; if (!injected_sat) { values[i] = INT32_MAX; injected_sat = true; } else if (!injected_mask) { values[i] = INT32_MIN; injected_mask = true; } } REQUIRE(injected_sat); REQUIRE(injected_mask); } // CPU reference ImagePreprocessorBuffer cpu_image(npixel); for (size_t i = 0; i < npixel; i++) cpu_image[i] = values[i]; ROIIntegrationCPU cpu(experiment); std::map out_cpu; cpu.Run(cpu_image, out_cpu); // GPU under test — identical input uploaded to the device auto stream = std::make_shared(); ImagePreprocessorBufferGPU gpu_image(npixel); for (size_t i = 0; i < npixel; i++) gpu_image[i] = values[i]; REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(), npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess); ROIIntegrationGPU gpu(experiment, stream); std::map out_gpu; gpu.Run(gpu_image, out_gpu); compare_results(out_cpu, out_gpu); } #endif