// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only #include #include "../common/CUDAWrapper.h" #ifdef JFJOCH_USE_CUDA #include #include #include #include "../common/BraggIntegrationSettings.h" #include "../common/DetectorSetup.h" #include "../common/DiffractionExperiment.h" #include "../common/Reflection.h" #include "../image_analysis/bragg_integration/BraggIntegrationEngineCPU.h" #include "../image_analysis/bragg_integration/BraggIntegrationEngineGPU.h" #include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h" namespace { // A grid of clean Gaussian spots on a flat background, each seeding one predicted reflection. struct Scene { std::vector image; std::vector predicted; size_t width = 0, height = 0; }; Reflection MakeReflection(float x, float y, float d, int hkl) { Reflection r{}; r.h = hkl; r.k = hkl; r.l = hkl; r.predicted_x = x; r.predicted_y = y; r.d = d; r.rlp = 1.0f; r.partiality = 1.0f; return r; } Scene BuildScene(size_t width, size_t height, int spacing = 60) { Scene s; s.width = width; s.height = height; s.image.assign(width * height, 12); // flat background // A grid of spots, well separated so background rings do not overlap the neighbours' disks. // A spread of intensities (some weak, some very strong) and a spread of d (so several resolution // shells are populated) exercises the strong-spot selection, shell learning and the fit. const int margin = 45; int hkl = 1; for (int gy = 0; margin + gy * spacing < static_cast(height) - margin; ++gy) { for (int gx = 0; margin + gx * spacing < static_cast(width) - margin; ++gx) { const float cx = static_cast(margin + gx * spacing) + 0.3f; // sub-pixel offset const float cy = static_cast(margin + gy * spacing) - 0.2f; const double amp = 150.0 + 60.0 * ((gx * 7 + gy * 13) % 30); // 150..1890 const double sigma = 1.3; for (int dy = -6; dy <= 6; ++dy) for (int dx = -6; dx <= 6; ++dx) { const int x = static_cast(std::lround(cx)) + dx; const int y = static_cast(std::lround(cy)) + dy; if (x < 0 || y < 0 || x >= static_cast(width) || y >= static_cast(height)) continue; const double ex = x - cx, ey = y - cy; const double g = amp * std::exp(-(ex * ex + ey * ey) / (2.0 * sigma * sigma)); s.image[y * width + x] += static_cast(std::lround(g)); } const float d = 1.4f + 0.12f * static_cast((gx + gy) % 12); // 1.4..2.72 A s.predicted.push_back(MakeReflection(cx, cy, d, hkl++)); } } // A few masked (INT32_MIN) and saturated (INT32_MAX) pixels in background gaps to exercise the // validity rejection in both engines identically. for (int k = 0; k < 20; ++k) { const size_t idx = (static_cast(k) * 2654435761u) % s.image.size(); s.image[idx] = (k % 2) ? INT32_MIN : INT32_MAX; } return s; } DiffractionExperiment MakeExperiment(IntegratorMode mode, std::optional bandwidth_fwhm, const DetectorSetup &det = DetJF(2)) { DiffractionExperiment experiment(det); // DetJF(2) (small) keeps the correctness test fast experiment.DetectorDistance_mm(100.0f).IncidentEnergy_keV(WVL_1A_IN_KEV) .BeamX_pxl(400.0f).BeamY_pxl(400.0f); experiment.BandwidthFWHM(bandwidth_fwhm); BraggIntegrationSettings settings; settings.Integrator(mode); experiment.ImportBraggIntegrationSettings(settings); return experiment; } void CompareCpuVsGpu(IntegratorMode mode, std::optional bandwidth_fwhm) { const DiffractionExperiment experiment = MakeExperiment(mode, bandwidth_fwhm); const size_t width = experiment.GetXPixelsNum(); const size_t height = experiment.GetYPixelsNum(); const size_t npixel = experiment.GetPixelsNum(); REQUIRE(npixel == width * height); const Scene scene = BuildScene(width, height); REQUIRE(scene.image.size() == npixel); REQUIRE(scene.predicted.size() > 60); // CPU reference ImagePreprocessorBuffer cpu_image(npixel); for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i]; BraggIntegrationEngineCPU cpu(experiment); const auto out_cpu = cpu.Run(cpu_image, scene.predicted, scene.predicted.size(), 5); // GPU under test, identical input uploaded to the device auto stream = std::make_shared(); ImagePreprocessorBufferGPU gpu_image(npixel); for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i]; REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(), npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess); BraggIntegrationEngineGPU gpu(experiment, stream); const auto out_gpu = gpu.Run(gpu_image, scene.predicted, scene.predicted.size(), 5); // The ok/observed decisions are deterministic geometry, so both engines return the same set in // the same (predicted-index) order. Intensities differ only by float rounding and the unordered // atomic summation of the learned profile, so compare up to a small tolerance. REQUIRE(out_gpu.size() == out_cpu.size()); REQUIRE(out_cpu.size() > 40); for (size_t i = 0; i < out_cpu.size(); ++i) { INFO("mode " << static_cast(mode) << " reflection " << i << " hkl " << out_cpu[i].h); CHECK(out_gpu[i].h == out_cpu[i].h); CHECK(out_gpu[i].image_number == out_cpu[i].image_number); CHECK(out_gpu[i].bkg == Catch::Approx(out_cpu[i].bkg).epsilon(0.02).margin(0.5)); CHECK(out_gpu[i].I == Catch::Approx(out_cpu[i].I).epsilon(0.03).margin(2.0)); CHECK(out_gpu[i].sigma == Catch::Approx(out_cpu[i].sigma).epsilon(0.03).margin(0.5)); } } } // namespace TEST_CASE("BraggIntegrationEngineGPU_MatchesCPU") { if (get_gpu_count() == 0) { WARN("No CUDA GPU present. Skipping BraggIntegrationEngineGPU_MatchesCPU"); return; } SECTION("BoxSum") { CompareCpuVsGpu(IntegratorMode::BoxSum, std::nullopt); } SECTION("ProfileGaussian mono") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, std::nullopt); } SECTION("ProfileGaussian broadband") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, 0.03f); } SECTION("ProfileEmpirical") { CompareCpuVsGpu(IntegratorMode::ProfileEmpirical, std::nullopt); } } // Hidden ([.]) benchmark: the raison d'etre of the GPU port is < 2 ms/frame (vs ~142 ms on the CPU // for ProfileIntegrate2D). Run explicitly with: ./jfjoch_test "[bragg_bench]" TEST_CASE("BraggIntegrationEngineGPU_Benchmark", "[.][bragg_bench]") { if (get_gpu_count() == 0) { WARN("No CUDA GPU present. Skipping benchmark"); return; } const DiffractionExperiment experiment = MakeExperiment(IntegratorMode::ProfileGaussian, std::nullopt, DetJF4M()); const size_t width = experiment.GetXPixelsNum(); const size_t height = experiment.GetYPixelsNum(); const size_t npixel = experiment.GetPixelsNum(); REQUIRE(npixel == width * height); auto stream = std::make_shared(); BraggIntegrationEngineGPU gpu(experiment, stream); for (int spacing : {28, 40, 60, 90}) { const Scene scene = BuildScene(width, height, spacing); const size_t nrefl = scene.predicted.size(); ImagePreprocessorBufferGPU gpu_image(npixel); for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i]; REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(), npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess); cudaStreamSynchronize(*stream); auto run = [&] { return gpu.Run(gpu_image, scene.predicted, nrefl, 0); }; for (int i = 0; i < 5; ++i) run(); // warm-up (allocations, JIT) const int iters = 100; const auto t0 = std::chrono::steady_clock::now(); size_t observed = 0; for (int i = 0; i < iters; ++i) observed += run().size(); const auto t1 = std::chrono::steady_clock::now(); const double ms = std::chrono::duration(t1 - t0).count() / iters; BraggIntegrationEngineCPU cpu(experiment); ImagePreprocessorBuffer cpu_image(npixel); for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i]; const auto c0 = std::chrono::steady_clock::now(); const size_t cpu_observed = cpu.Run(cpu_image, scene.predicted, nrefl, 0).size(); const auto c1 = std::chrono::steady_clock::now(); const double cpu_ms = std::chrono::duration(c1 - c0).count(); WARN(width << "x" << height << " | " << nrefl << " refl (" << observed / iters << " obs) | GPU " << ms << " ms | CPU " << cpu_ms << " ms (" << cpu_observed << " obs) | speedup " << cpu_ms / ms << "x"); } } #endif