bragg_integration: GPU box + profile-fit integrator (standalone engine)

Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch profile fit) under one roof as a base + CPU + GPU engine, mirroring the AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32 ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same buffer AzIntEngineGPU/ROIIntegrationGPU consume. The CUDA engine runs one block per reflection with shared-memory reductions across six kernels (reset, mask, box-sum, profile learning, profile build, Kabsch fit); the resolution shell is computed inline. The learning/fit hot path is single precision (FP64 is throttled on consumer GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame CUDA API calls into one reset kernel keeps launch-latency overhead low. Standalone for now: NOT wired into IndexAndRefine. See BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps. BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes (box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench] perf sweep. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 20:59:45 +02:00
parent 347228d008
commit ddddfb6ffc
10 changed files with 1371 additions and 0 deletions
@@ -0,0 +1,199 @@
+// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
+// SPDX-License-Identifier: GPL-3.0-only
+
+#include <catch2/catch_all.hpp>
+#include "../common/CUDAWrapper.h"
+
+#ifdef JFJOCH_USE_CUDA
+
+#include <chrono>
+#include <cmath>
+#include <vector>
+
+#include "../common/BraggIntegrationSettings.h"
+#include "../common/DetectorSetup.h"
+#include "../common/DiffractionExperiment.h"
+#include "../common/Reflection.h"
+#include "../image_analysis/bragg_integration/BraggIntegrationEngineCPU.h"
+#include "../image_analysis/bragg_integration/BraggIntegrationEngineGPU.h"
+#include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h"
+
+namespace {
+
+// A grid of clean Gaussian spots on a flat background, each seeding one predicted reflection.
+struct Scene {
+    std::vector<int32_t> image;
+    std::vector<Reflection> predicted;
+    size_t width = 0, height = 0;
+};
+
+Reflection MakeReflection(float x, float y, float d, int hkl) {
+    Reflection r{};
+    r.h = hkl; r.k = hkl; r.l = hkl;
+    r.predicted_x = x;
+    r.predicted_y = y;
+    r.d = d;
+    r.rlp = 1.0f;
+    r.partiality = 1.0f;
+    return r;
+}
+
+Scene BuildScene(size_t width, size_t height, int spacing = 60) {
+    Scene s;
+    s.width = width;
+    s.height = height;
+    s.image.assign(width * height, 12);   // flat background
+
+    // A grid of spots, well separated so background rings do not overlap the neighbours' disks.
+    // A spread of intensities (some weak, some very strong) and a spread of d (so several resolution
+    // shells are populated) exercises the strong-spot selection, shell learning and the fit.
+    const int margin = 45;
+    int hkl = 1;
+    for (int gy = 0; margin + gy * spacing < static_cast<int>(height) - margin; ++gy) {
+        for (int gx = 0; margin + gx * spacing < static_cast<int>(width) - margin; ++gx) {
+            const float cx = static_cast<float>(margin + gx * spacing) + 0.3f;   // sub-pixel offset
+            const float cy = static_cast<float>(margin + gy * spacing) - 0.2f;
+            const double amp = 150.0 + 60.0 * ((gx * 7 + gy * 13) % 30);   // 150..1890
+            const double sigma = 1.3;
+            for (int dy = -6; dy <= 6; ++dy)
+                for (int dx = -6; dx <= 6; ++dx) {
+                    const int x = static_cast<int>(std::lround(cx)) + dx;
+                    const int y = static_cast<int>(std::lround(cy)) + dy;
+                    if (x < 0 || y < 0 || x >= static_cast<int>(width) || y >= static_cast<int>(height)) continue;
+                    const double ex = x - cx, ey = y - cy;
+                    const double g = amp * std::exp(-(ex * ex + ey * ey) / (2.0 * sigma * sigma));
+                    s.image[y * width + x] += static_cast<int32_t>(std::lround(g));
+                }
+            const float d = 1.4f + 0.12f * static_cast<float>((gx + gy) % 12);   // 1.4..2.72 A
+            s.predicted.push_back(MakeReflection(cx, cy, d, hkl++));
+        }
+    }
+
+    // A few masked (INT32_MIN) and saturated (INT32_MAX) pixels in background gaps to exercise the
+    // validity rejection in both engines identically.
+    for (int k = 0; k < 20; ++k) {
+        const size_t idx = (static_cast<size_t>(k) * 2654435761u) % s.image.size();
+        s.image[idx] = (k % 2) ? INT32_MIN : INT32_MAX;
+    }
+    return s;
+}
+
+DiffractionExperiment MakeExperiment(IntegratorMode mode, std::optional<float> bandwidth_fwhm,
+                                     const DetectorSetup &det = DetJF(2)) {
+    DiffractionExperiment experiment(det);   // DetJF(2) (small) keeps the correctness test fast
+    experiment.DetectorDistance_mm(100.0f).IncidentEnergy_keV(WVL_1A_IN_KEV)
+              .BeamX_pxl(400.0f).BeamY_pxl(400.0f);
+    experiment.BandwidthFWHM(bandwidth_fwhm);
+    BraggIntegrationSettings settings;
+    settings.Integrator(mode);
+    experiment.ImportBraggIntegrationSettings(settings);
+    return experiment;
+}
+
+void CompareCpuVsGpu(IntegratorMode mode, std::optional<float> bandwidth_fwhm) {
+    const DiffractionExperiment experiment = MakeExperiment(mode, bandwidth_fwhm);
+    const size_t width = experiment.GetXPixelsNum();
+    const size_t height = experiment.GetYPixelsNum();
+    const size_t npixel = experiment.GetPixelsNum();
+    REQUIRE(npixel == width * height);
+
+    const Scene scene = BuildScene(width, height);
+    REQUIRE(scene.image.size() == npixel);
+    REQUIRE(scene.predicted.size() > 60);
+
+    // CPU reference
+    ImagePreprocessorBuffer cpu_image(npixel);
+    for (size_t i = 0; i < npixel; ++i)
+        cpu_image[i] = scene.image[i];
+    BraggIntegrationEngineCPU cpu(experiment);
+    const auto out_cpu = cpu.Run(cpu_image, scene.predicted, scene.predicted.size(), 5);
+
+    // GPU under test, identical input uploaded to the device
+    auto stream = std::make_shared<CudaStream>();
+    ImagePreprocessorBufferGPU gpu_image(npixel);
+    for (size_t i = 0; i < npixel; ++i)
+        gpu_image[i] = scene.image[i];
+    REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
+                            npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
+    BraggIntegrationEngineGPU gpu(experiment, stream);
+    const auto out_gpu = gpu.Run(gpu_image, scene.predicted, scene.predicted.size(), 5);
+
+    // The ok/observed decisions are deterministic geometry, so both engines return the same set in
+    // the same (predicted-index) order. Intensities differ only by float rounding and the unordered
+    // atomic summation of the learned profile, so compare up to a small tolerance.
+    REQUIRE(out_gpu.size() == out_cpu.size());
+    REQUIRE(out_cpu.size() > 40);
+    for (size_t i = 0; i < out_cpu.size(); ++i) {
+        INFO("mode " << static_cast<int>(mode) << " reflection " << i << " hkl " << out_cpu[i].h);
+        CHECK(out_gpu[i].h == out_cpu[i].h);
+        CHECK(out_gpu[i].image_number == out_cpu[i].image_number);
+        CHECK(out_gpu[i].bkg == Catch::Approx(out_cpu[i].bkg).epsilon(0.02).margin(0.5));
+        CHECK(out_gpu[i].I == Catch::Approx(out_cpu[i].I).epsilon(0.03).margin(2.0));
+        CHECK(out_gpu[i].sigma == Catch::Approx(out_cpu[i].sigma).epsilon(0.03).margin(0.5));
+    }
+}
+
+} // namespace
+
+TEST_CASE("BraggIntegrationEngineGPU_MatchesCPU") {
+    if (get_gpu_count() == 0) {
+        WARN("No CUDA GPU present. Skipping BraggIntegrationEngineGPU_MatchesCPU");
+        return;
+    }
+
+    SECTION("BoxSum")                    { CompareCpuVsGpu(IntegratorMode::BoxSum, std::nullopt); }
+    SECTION("ProfileGaussian mono")      { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, std::nullopt); }
+    SECTION("ProfileGaussian broadband") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, 0.03f); }
+    SECTION("ProfileEmpirical")          { CompareCpuVsGpu(IntegratorMode::ProfileEmpirical, std::nullopt); }
+}
+
+// Hidden ([.]) benchmark: the raison d'etre of the GPU port is < 2 ms/frame (vs ~142 ms on the CPU
+// for ProfileIntegrate2D). Run explicitly with:  ./jfjoch_test "[bragg_bench]"
+TEST_CASE("BraggIntegrationEngineGPU_Benchmark", "[.][bragg_bench]") {
+    if (get_gpu_count() == 0) {
+        WARN("No CUDA GPU present. Skipping benchmark");
+        return;
+    }
+    const DiffractionExperiment experiment = MakeExperiment(IntegratorMode::ProfileGaussian, std::nullopt, DetJF4M());
+    const size_t width = experiment.GetXPixelsNum();
+    const size_t height = experiment.GetYPixelsNum();
+    const size_t npixel = experiment.GetPixelsNum();
+    REQUIRE(npixel == width * height);
+
+    auto stream = std::make_shared<CudaStream>();
+    BraggIntegrationEngineGPU gpu(experiment, stream);
+    for (int spacing : {28, 40, 60, 90}) {
+        const Scene scene = BuildScene(width, height, spacing);
+        const size_t nrefl = scene.predicted.size();
+
+        ImagePreprocessorBufferGPU gpu_image(npixel);
+        for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i];
+        REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
+                                npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
+        cudaStreamSynchronize(*stream);
+
+        auto run = [&] { return gpu.Run(gpu_image, scene.predicted, nrefl, 0); };
+        for (int i = 0; i < 5; ++i) run();   // warm-up (allocations, JIT)
+
+        const int iters = 100;
+        const auto t0 = std::chrono::steady_clock::now();
+        size_t observed = 0;
+        for (int i = 0; i < iters; ++i) observed += run().size();
+        const auto t1 = std::chrono::steady_clock::now();
+        const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count() / iters;
+
+        BraggIntegrationEngineCPU cpu(experiment);
+        ImagePreprocessorBuffer cpu_image(npixel);
+        for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i];
+        const auto c0 = std::chrono::steady_clock::now();
+        const size_t cpu_observed = cpu.Run(cpu_image, scene.predicted, nrefl, 0).size();
+        const auto c1 = std::chrono::steady_clock::now();
+        const double cpu_ms = std::chrono::duration<double, std::milli>(c1 - c0).count();
+
+        WARN(width << "x" << height << " | " << nrefl << " refl (" << observed / iters
+             << " obs) | GPU " << ms << " ms | CPU " << cpu_ms << " ms (" << cpu_observed
+             << " obs) | speedup " << cpu_ms / ms << "x");
+    }
+}
+
+#endif