bragg_integration: GPU box + profile-fit integrator (standalone engine)
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s

Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch
profile fit) under one roof as a base + CPU + GPU engine, mirroring the
AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32
ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same
buffer AzIntEngineGPU/ROIIntegrationGPU consume.

The CUDA engine runs one block per reflection with shared-memory
reductions across six kernels (reset, mask, box-sum, profile learning,
profile build, Kabsch fit); the resolution shell is computed inline. The
learning/fit hot path is single precision (FP64 is throttled on consumer
GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame
CUDA API calls into one reset kernel keeps launch-latency overhead low.

Standalone for now: NOT wired into IndexAndRefine. See
BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps.
BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes
(box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench]
perf sweep.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 20:59:45 +02:00
co-authored by Claude Opus 4.8
parent 347228d008
commit ddddfb6ffc
10 changed files with 1371 additions and 0 deletions
+199
View File
@@ -0,0 +1,199 @@
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include <catch2/catch_all.hpp>
#include "../common/CUDAWrapper.h"
#ifdef JFJOCH_USE_CUDA
#include <chrono>
#include <cmath>
#include <vector>
#include "../common/BraggIntegrationSettings.h"
#include "../common/DetectorSetup.h"
#include "../common/DiffractionExperiment.h"
#include "../common/Reflection.h"
#include "../image_analysis/bragg_integration/BraggIntegrationEngineCPU.h"
#include "../image_analysis/bragg_integration/BraggIntegrationEngineGPU.h"
#include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h"
namespace {
// A grid of clean Gaussian spots on a flat background, each seeding one predicted reflection.
struct Scene {
std::vector<int32_t> image;
std::vector<Reflection> predicted;
size_t width = 0, height = 0;
};
Reflection MakeReflection(float x, float y, float d, int hkl) {
Reflection r{};
r.h = hkl; r.k = hkl; r.l = hkl;
r.predicted_x = x;
r.predicted_y = y;
r.d = d;
r.rlp = 1.0f;
r.partiality = 1.0f;
return r;
}
Scene BuildScene(size_t width, size_t height, int spacing = 60) {
Scene s;
s.width = width;
s.height = height;
s.image.assign(width * height, 12); // flat background
// A grid of spots, well separated so background rings do not overlap the neighbours' disks.
// A spread of intensities (some weak, some very strong) and a spread of d (so several resolution
// shells are populated) exercises the strong-spot selection, shell learning and the fit.
const int margin = 45;
int hkl = 1;
for (int gy = 0; margin + gy * spacing < static_cast<int>(height) - margin; ++gy) {
for (int gx = 0; margin + gx * spacing < static_cast<int>(width) - margin; ++gx) {
const float cx = static_cast<float>(margin + gx * spacing) + 0.3f; // sub-pixel offset
const float cy = static_cast<float>(margin + gy * spacing) - 0.2f;
const double amp = 150.0 + 60.0 * ((gx * 7 + gy * 13) % 30); // 150..1890
const double sigma = 1.3;
for (int dy = -6; dy <= 6; ++dy)
for (int dx = -6; dx <= 6; ++dx) {
const int x = static_cast<int>(std::lround(cx)) + dx;
const int y = static_cast<int>(std::lround(cy)) + dy;
if (x < 0 || y < 0 || x >= static_cast<int>(width) || y >= static_cast<int>(height)) continue;
const double ex = x - cx, ey = y - cy;
const double g = amp * std::exp(-(ex * ex + ey * ey) / (2.0 * sigma * sigma));
s.image[y * width + x] += static_cast<int32_t>(std::lround(g));
}
const float d = 1.4f + 0.12f * static_cast<float>((gx + gy) % 12); // 1.4..2.72 A
s.predicted.push_back(MakeReflection(cx, cy, d, hkl++));
}
}
// A few masked (INT32_MIN) and saturated (INT32_MAX) pixels in background gaps to exercise the
// validity rejection in both engines identically.
for (int k = 0; k < 20; ++k) {
const size_t idx = (static_cast<size_t>(k) * 2654435761u) % s.image.size();
s.image[idx] = (k % 2) ? INT32_MIN : INT32_MAX;
}
return s;
}
DiffractionExperiment MakeExperiment(IntegratorMode mode, std::optional<float> bandwidth_fwhm,
const DetectorSetup &det = DetJF(2)) {
DiffractionExperiment experiment(det); // DetJF(2) (small) keeps the correctness test fast
experiment.DetectorDistance_mm(100.0f).IncidentEnergy_keV(WVL_1A_IN_KEV)
.BeamX_pxl(400.0f).BeamY_pxl(400.0f);
experiment.BandwidthFWHM(bandwidth_fwhm);
BraggIntegrationSettings settings;
settings.Integrator(mode);
experiment.ImportBraggIntegrationSettings(settings);
return experiment;
}
void CompareCpuVsGpu(IntegratorMode mode, std::optional<float> bandwidth_fwhm) {
const DiffractionExperiment experiment = MakeExperiment(mode, bandwidth_fwhm);
const size_t width = experiment.GetXPixelsNum();
const size_t height = experiment.GetYPixelsNum();
const size_t npixel = experiment.GetPixelsNum();
REQUIRE(npixel == width * height);
const Scene scene = BuildScene(width, height);
REQUIRE(scene.image.size() == npixel);
REQUIRE(scene.predicted.size() > 60);
// CPU reference
ImagePreprocessorBuffer cpu_image(npixel);
for (size_t i = 0; i < npixel; ++i)
cpu_image[i] = scene.image[i];
BraggIntegrationEngineCPU cpu(experiment);
const auto out_cpu = cpu.Run(cpu_image, scene.predicted, scene.predicted.size(), 5);
// GPU under test, identical input uploaded to the device
auto stream = std::make_shared<CudaStream>();
ImagePreprocessorBufferGPU gpu_image(npixel);
for (size_t i = 0; i < npixel; ++i)
gpu_image[i] = scene.image[i];
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
BraggIntegrationEngineGPU gpu(experiment, stream);
const auto out_gpu = gpu.Run(gpu_image, scene.predicted, scene.predicted.size(), 5);
// The ok/observed decisions are deterministic geometry, so both engines return the same set in
// the same (predicted-index) order. Intensities differ only by float rounding and the unordered
// atomic summation of the learned profile, so compare up to a small tolerance.
REQUIRE(out_gpu.size() == out_cpu.size());
REQUIRE(out_cpu.size() > 40);
for (size_t i = 0; i < out_cpu.size(); ++i) {
INFO("mode " << static_cast<int>(mode) << " reflection " << i << " hkl " << out_cpu[i].h);
CHECK(out_gpu[i].h == out_cpu[i].h);
CHECK(out_gpu[i].image_number == out_cpu[i].image_number);
CHECK(out_gpu[i].bkg == Catch::Approx(out_cpu[i].bkg).epsilon(0.02).margin(0.5));
CHECK(out_gpu[i].I == Catch::Approx(out_cpu[i].I).epsilon(0.03).margin(2.0));
CHECK(out_gpu[i].sigma == Catch::Approx(out_cpu[i].sigma).epsilon(0.03).margin(0.5));
}
}
} // namespace
TEST_CASE("BraggIntegrationEngineGPU_MatchesCPU") {
if (get_gpu_count() == 0) {
WARN("No CUDA GPU present. Skipping BraggIntegrationEngineGPU_MatchesCPU");
return;
}
SECTION("BoxSum") { CompareCpuVsGpu(IntegratorMode::BoxSum, std::nullopt); }
SECTION("ProfileGaussian mono") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, std::nullopt); }
SECTION("ProfileGaussian broadband") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, 0.03f); }
SECTION("ProfileEmpirical") { CompareCpuVsGpu(IntegratorMode::ProfileEmpirical, std::nullopt); }
}
// Hidden ([.]) benchmark: the raison d'etre of the GPU port is < 2 ms/frame (vs ~142 ms on the CPU
// for ProfileIntegrate2D). Run explicitly with: ./jfjoch_test "[bragg_bench]"
TEST_CASE("BraggIntegrationEngineGPU_Benchmark", "[.][bragg_bench]") {
if (get_gpu_count() == 0) {
WARN("No CUDA GPU present. Skipping benchmark");
return;
}
const DiffractionExperiment experiment = MakeExperiment(IntegratorMode::ProfileGaussian, std::nullopt, DetJF4M());
const size_t width = experiment.GetXPixelsNum();
const size_t height = experiment.GetYPixelsNum();
const size_t npixel = experiment.GetPixelsNum();
REQUIRE(npixel == width * height);
auto stream = std::make_shared<CudaStream>();
BraggIntegrationEngineGPU gpu(experiment, stream);
for (int spacing : {28, 40, 60, 90}) {
const Scene scene = BuildScene(width, height, spacing);
const size_t nrefl = scene.predicted.size();
ImagePreprocessorBufferGPU gpu_image(npixel);
for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i];
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
cudaStreamSynchronize(*stream);
auto run = [&] { return gpu.Run(gpu_image, scene.predicted, nrefl, 0); };
for (int i = 0; i < 5; ++i) run(); // warm-up (allocations, JIT)
const int iters = 100;
const auto t0 = std::chrono::steady_clock::now();
size_t observed = 0;
for (int i = 0; i < iters; ++i) observed += run().size();
const auto t1 = std::chrono::steady_clock::now();
const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count() / iters;
BraggIntegrationEngineCPU cpu(experiment);
ImagePreprocessorBuffer cpu_image(npixel);
for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i];
const auto c0 = std::chrono::steady_clock::now();
const size_t cpu_observed = cpu.Run(cpu_image, scene.predicted, nrefl, 0).size();
const auto c1 = std::chrono::steady_clock::now();
const double cpu_ms = std::chrono::duration<double, std::milli>(c1 - c0).count();
WARN(width << "x" << height << " | " << nrefl << " refl (" << observed / iters
<< " obs) | GPU " << ms << " ms | CPU " << cpu_ms << " ms (" << cpu_observed
<< " obs) | speedup " << cpu_ms / ms << "x");
}
}
#endif