bragg_integration: GPU box + profile-fit integrator (standalone engine)
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch profile fit) under one roof as a base + CPU + GPU engine, mirroring the AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32 ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same buffer AzIntEngineGPU/ROIIntegrationGPU consume. The CUDA engine runs one block per reflection with shared-memory reductions across six kernels (reset, mask, box-sum, profile learning, profile build, Kabsch fit); the resolution shell is computed inline. The learning/fit hot path is single precision (FP64 is throttled on consumer GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame CUDA API calls into one reset kernel keeps launch-latency overhead low. Standalone for now: NOT wired into IndexAndRefine. See BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps. BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes (box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench] perf sweep. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,199 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include "../common/CUDAWrapper.h"
|
||||
|
||||
#ifdef JFJOCH_USE_CUDA
|
||||
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/BraggIntegrationSettings.h"
|
||||
#include "../common/DetectorSetup.h"
|
||||
#include "../common/DiffractionExperiment.h"
|
||||
#include "../common/Reflection.h"
|
||||
#include "../image_analysis/bragg_integration/BraggIntegrationEngineCPU.h"
|
||||
#include "../image_analysis/bragg_integration/BraggIntegrationEngineGPU.h"
|
||||
#include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// A grid of clean Gaussian spots on a flat background, each seeding one predicted reflection.
|
||||
struct Scene {
|
||||
std::vector<int32_t> image;
|
||||
std::vector<Reflection> predicted;
|
||||
size_t width = 0, height = 0;
|
||||
};
|
||||
|
||||
Reflection MakeReflection(float x, float y, float d, int hkl) {
|
||||
Reflection r{};
|
||||
r.h = hkl; r.k = hkl; r.l = hkl;
|
||||
r.predicted_x = x;
|
||||
r.predicted_y = y;
|
||||
r.d = d;
|
||||
r.rlp = 1.0f;
|
||||
r.partiality = 1.0f;
|
||||
return r;
|
||||
}
|
||||
|
||||
Scene BuildScene(size_t width, size_t height, int spacing = 60) {
|
||||
Scene s;
|
||||
s.width = width;
|
||||
s.height = height;
|
||||
s.image.assign(width * height, 12); // flat background
|
||||
|
||||
// A grid of spots, well separated so background rings do not overlap the neighbours' disks.
|
||||
// A spread of intensities (some weak, some very strong) and a spread of d (so several resolution
|
||||
// shells are populated) exercises the strong-spot selection, shell learning and the fit.
|
||||
const int margin = 45;
|
||||
int hkl = 1;
|
||||
for (int gy = 0; margin + gy * spacing < static_cast<int>(height) - margin; ++gy) {
|
||||
for (int gx = 0; margin + gx * spacing < static_cast<int>(width) - margin; ++gx) {
|
||||
const float cx = static_cast<float>(margin + gx * spacing) + 0.3f; // sub-pixel offset
|
||||
const float cy = static_cast<float>(margin + gy * spacing) - 0.2f;
|
||||
const double amp = 150.0 + 60.0 * ((gx * 7 + gy * 13) % 30); // 150..1890
|
||||
const double sigma = 1.3;
|
||||
for (int dy = -6; dy <= 6; ++dy)
|
||||
for (int dx = -6; dx <= 6; ++dx) {
|
||||
const int x = static_cast<int>(std::lround(cx)) + dx;
|
||||
const int y = static_cast<int>(std::lround(cy)) + dy;
|
||||
if (x < 0 || y < 0 || x >= static_cast<int>(width) || y >= static_cast<int>(height)) continue;
|
||||
const double ex = x - cx, ey = y - cy;
|
||||
const double g = amp * std::exp(-(ex * ex + ey * ey) / (2.0 * sigma * sigma));
|
||||
s.image[y * width + x] += static_cast<int32_t>(std::lround(g));
|
||||
}
|
||||
const float d = 1.4f + 0.12f * static_cast<float>((gx + gy) % 12); // 1.4..2.72 A
|
||||
s.predicted.push_back(MakeReflection(cx, cy, d, hkl++));
|
||||
}
|
||||
}
|
||||
|
||||
// A few masked (INT32_MIN) and saturated (INT32_MAX) pixels in background gaps to exercise the
|
||||
// validity rejection in both engines identically.
|
||||
for (int k = 0; k < 20; ++k) {
|
||||
const size_t idx = (static_cast<size_t>(k) * 2654435761u) % s.image.size();
|
||||
s.image[idx] = (k % 2) ? INT32_MIN : INT32_MAX;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
DiffractionExperiment MakeExperiment(IntegratorMode mode, std::optional<float> bandwidth_fwhm,
|
||||
const DetectorSetup &det = DetJF(2)) {
|
||||
DiffractionExperiment experiment(det); // DetJF(2) (small) keeps the correctness test fast
|
||||
experiment.DetectorDistance_mm(100.0f).IncidentEnergy_keV(WVL_1A_IN_KEV)
|
||||
.BeamX_pxl(400.0f).BeamY_pxl(400.0f);
|
||||
experiment.BandwidthFWHM(bandwidth_fwhm);
|
||||
BraggIntegrationSettings settings;
|
||||
settings.Integrator(mode);
|
||||
experiment.ImportBraggIntegrationSettings(settings);
|
||||
return experiment;
|
||||
}
|
||||
|
||||
void CompareCpuVsGpu(IntegratorMode mode, std::optional<float> bandwidth_fwhm) {
|
||||
const DiffractionExperiment experiment = MakeExperiment(mode, bandwidth_fwhm);
|
||||
const size_t width = experiment.GetXPixelsNum();
|
||||
const size_t height = experiment.GetYPixelsNum();
|
||||
const size_t npixel = experiment.GetPixelsNum();
|
||||
REQUIRE(npixel == width * height);
|
||||
|
||||
const Scene scene = BuildScene(width, height);
|
||||
REQUIRE(scene.image.size() == npixel);
|
||||
REQUIRE(scene.predicted.size() > 60);
|
||||
|
||||
// CPU reference
|
||||
ImagePreprocessorBuffer cpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i)
|
||||
cpu_image[i] = scene.image[i];
|
||||
BraggIntegrationEngineCPU cpu(experiment);
|
||||
const auto out_cpu = cpu.Run(cpu_image, scene.predicted, scene.predicted.size(), 5);
|
||||
|
||||
// GPU under test, identical input uploaded to the device
|
||||
auto stream = std::make_shared<CudaStream>();
|
||||
ImagePreprocessorBufferGPU gpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i)
|
||||
gpu_image[i] = scene.image[i];
|
||||
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
|
||||
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
|
||||
BraggIntegrationEngineGPU gpu(experiment, stream);
|
||||
const auto out_gpu = gpu.Run(gpu_image, scene.predicted, scene.predicted.size(), 5);
|
||||
|
||||
// The ok/observed decisions are deterministic geometry, so both engines return the same set in
|
||||
// the same (predicted-index) order. Intensities differ only by float rounding and the unordered
|
||||
// atomic summation of the learned profile, so compare up to a small tolerance.
|
||||
REQUIRE(out_gpu.size() == out_cpu.size());
|
||||
REQUIRE(out_cpu.size() > 40);
|
||||
for (size_t i = 0; i < out_cpu.size(); ++i) {
|
||||
INFO("mode " << static_cast<int>(mode) << " reflection " << i << " hkl " << out_cpu[i].h);
|
||||
CHECK(out_gpu[i].h == out_cpu[i].h);
|
||||
CHECK(out_gpu[i].image_number == out_cpu[i].image_number);
|
||||
CHECK(out_gpu[i].bkg == Catch::Approx(out_cpu[i].bkg).epsilon(0.02).margin(0.5));
|
||||
CHECK(out_gpu[i].I == Catch::Approx(out_cpu[i].I).epsilon(0.03).margin(2.0));
|
||||
CHECK(out_gpu[i].sigma == Catch::Approx(out_cpu[i].sigma).epsilon(0.03).margin(0.5));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_CASE("BraggIntegrationEngineGPU_MatchesCPU") {
|
||||
if (get_gpu_count() == 0) {
|
||||
WARN("No CUDA GPU present. Skipping BraggIntegrationEngineGPU_MatchesCPU");
|
||||
return;
|
||||
}
|
||||
|
||||
SECTION("BoxSum") { CompareCpuVsGpu(IntegratorMode::BoxSum, std::nullopt); }
|
||||
SECTION("ProfileGaussian mono") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, std::nullopt); }
|
||||
SECTION("ProfileGaussian broadband") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, 0.03f); }
|
||||
SECTION("ProfileEmpirical") { CompareCpuVsGpu(IntegratorMode::ProfileEmpirical, std::nullopt); }
|
||||
}
|
||||
|
||||
// Hidden ([.]) benchmark: the raison d'etre of the GPU port is < 2 ms/frame (vs ~142 ms on the CPU
|
||||
// for ProfileIntegrate2D). Run explicitly with: ./jfjoch_test "[bragg_bench]"
|
||||
TEST_CASE("BraggIntegrationEngineGPU_Benchmark", "[.][bragg_bench]") {
|
||||
if (get_gpu_count() == 0) {
|
||||
WARN("No CUDA GPU present. Skipping benchmark");
|
||||
return;
|
||||
}
|
||||
const DiffractionExperiment experiment = MakeExperiment(IntegratorMode::ProfileGaussian, std::nullopt, DetJF4M());
|
||||
const size_t width = experiment.GetXPixelsNum();
|
||||
const size_t height = experiment.GetYPixelsNum();
|
||||
const size_t npixel = experiment.GetPixelsNum();
|
||||
REQUIRE(npixel == width * height);
|
||||
|
||||
auto stream = std::make_shared<CudaStream>();
|
||||
BraggIntegrationEngineGPU gpu(experiment, stream);
|
||||
for (int spacing : {28, 40, 60, 90}) {
|
||||
const Scene scene = BuildScene(width, height, spacing);
|
||||
const size_t nrefl = scene.predicted.size();
|
||||
|
||||
ImagePreprocessorBufferGPU gpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i];
|
||||
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
|
||||
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
|
||||
cudaStreamSynchronize(*stream);
|
||||
|
||||
auto run = [&] { return gpu.Run(gpu_image, scene.predicted, nrefl, 0); };
|
||||
for (int i = 0; i < 5; ++i) run(); // warm-up (allocations, JIT)
|
||||
|
||||
const int iters = 100;
|
||||
const auto t0 = std::chrono::steady_clock::now();
|
||||
size_t observed = 0;
|
||||
for (int i = 0; i < iters; ++i) observed += run().size();
|
||||
const auto t1 = std::chrono::steady_clock::now();
|
||||
const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count() / iters;
|
||||
|
||||
BraggIntegrationEngineCPU cpu(experiment);
|
||||
ImagePreprocessorBuffer cpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i];
|
||||
const auto c0 = std::chrono::steady_clock::now();
|
||||
const size_t cpu_observed = cpu.Run(cpu_image, scene.predicted, nrefl, 0).size();
|
||||
const auto c1 = std::chrono::steady_clock::now();
|
||||
const double cpu_ms = std::chrono::duration<double, std::milli>(c1 - c0).count();
|
||||
|
||||
WARN(width << "x" << height << " | " << nrefl << " refl (" << observed / iters
|
||||
<< " obs) | GPU " << ms << " ms | CPU " << cpu_ms << " ms (" << cpu_observed
|
||||
<< " obs) | speedup " << cpu_ms / ms << "x");
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user