bragg_integration: GPU box + profile-fit integrator (standalone engine)
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch profile fit) under one roof as a base + CPU + GPU engine, mirroring the AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32 ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same buffer AzIntEngineGPU/ROIIntegrationGPU consume. The CUDA engine runs one block per reflection with shared-memory reductions across six kernels (reset, mask, box-sum, profile learning, profile build, Kabsch fit); the resolution shell is computed inline. The learning/fit hot path is single precision (FP64 is throttled on consumer GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame CUDA API calls into one reset kernel keeps launch-latency overhead low. Standalone for now: NOT wired into IndexAndRefine. See BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps. BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes (box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench] perf sweep. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
# BraggIntegrationEngine — GPU box + profile-fit integrator
|
||||
|
||||
Status: **standalone, not yet wired into the pipeline.** This is a GPU (CUDA) reimplementation of
|
||||
`ProfileIntegrate2D.cpp` (the default Kabsch profile-fit integrator, ~142 ms/frame on CPU) and
|
||||
`BraggIntegrate2D.cpp` (the box-sum integrator) under one roof, following the
|
||||
`AzIntEngine` / `ROIIntegration` base + CPU + GPU pattern. It reproduces the CPU result up to
|
||||
floating-point precision and runs in **< 2 ms/frame** for realistic reflection counts.
|
||||
|
||||
## Files (all in `image_analysis/bragg_integration/`)
|
||||
|
||||
| file | role |
|
||||
|------|------|
|
||||
| `BraggIntegrationEngine.{h,cpp}` | base class: extracts the fixed per-experiment config (r-radii, ellipse coefficients, geometry) and owns `Finalize()` (polarization + `image_scale_corr` + `vector<Reflection>` assembly) |
|
||||
| `BraggIntegrationEngineCPU.{h,cpp}` | plain-C++ engine: the fallback **and** the numeric oracle for the GPU |
|
||||
| `BraggIntegrationEngineGPU.{h,cu}` | the CUDA engine |
|
||||
| `tests/BraggIntegrationEngineGPUTest.cpp` | GPU-vs-CPU equivalence test + hidden `[bragg_bench]` benchmark |
|
||||
|
||||
`CMakeLists.txt` builds the base + CPU always; the `.cu` + GPU header are added only under
|
||||
`JFJOCH_CUDA_AVAILABLE` (mirrors `azint`/`roi`). No change needed to `image_analysis/CMakeLists.txt`
|
||||
(the aggregate `JFJochImageAnalysis` already links `JFJochBraggIntegration` and
|
||||
`JFJochImagePreprocessing`, so the `ImagePreprocessorBuffer` symbols resolve at the final link).
|
||||
|
||||
## Interface
|
||||
|
||||
```cpp
|
||||
class BraggIntegrationEngine {
|
||||
BraggIntegrationEngine(const DiffractionExperiment&);
|
||||
virtual std::vector<Reflection> Run(const ImagePreprocessorBuffer& image, // preprocessed int32
|
||||
const std::vector<Reflection>& predicted, size_t npredicted,
|
||||
int64_t image_number) = 0;
|
||||
};
|
||||
// CPU: BraggIntegrationEngineCPU(experiment)
|
||||
// GPU: BraggIntegrationEngineGPU(experiment, std::shared_ptr<CudaStream>)
|
||||
```
|
||||
|
||||
The integrator mode comes from `experiment.GetBraggIntegrationSettings().GetIntegrator()`:
|
||||
`BoxSum` (= BraggIntegrate2D), `ProfileGaussian` (default), `ProfileEmpirical`. The box sum is also
|
||||
Pass A / the seed of the two profile modes, so it always runs.
|
||||
|
||||
## Key design points
|
||||
|
||||
- **Input is the preprocessed int32 buffer**, NOT the raw `CompressedImage`. Masked/bad pixels are
|
||||
`INT32_MIN`, saturated are `INT32_MAX` (a pixel is valid iff `v != INT32_MIN && v != INT32_MAX`).
|
||||
This is the same buffer `AzIntEngineGPU`/`ROIIntegrationGPU` consume. **Consequence:** the `±1`
|
||||
special/saturation band that `ProfileIntegrate2D` rejects on the lossy `CompressedImage` is handled
|
||||
upstream by the preprocessor instead — correct for the online path; sanity-check it for offline
|
||||
`jfjoch_process` if that ever uses this engine on a lossy-compressed stored file.
|
||||
- Config uses raw detector dims — `xpixel = GetXPixelsNum()`, `npixel = GetPixelsNum()` — matching how
|
||||
`MXAnalysisWithoutFPGA` sizes the `ImagePreprocessorBuffer` and the frame the predicted `predicted_x`
|
||||
live in.
|
||||
- **GPU: one CUDA block per reflection**, shared-memory reductions. Kernels (all in the `.cu`):
|
||||
`reset → mark_mask → boxsum → learn_profile → build_profiles → fit` (Kabsch, 4 iters). The
|
||||
resolution shell is computed inline (`compute_shell`), so there is no separate shell pass. `BoxSum`
|
||||
mode stops after `boxsum`.
|
||||
- **Precision:** the hot path (profile learning + Gaussian fit) is `float` (FP64 is throttled on
|
||||
consumer GPUs; the extraction is Poisson-noise limited, so float matches the double CPU to ~1e-4).
|
||||
`double` is kept only for the box-sum background mean / centroid, where it drives weak intensities.
|
||||
- Per frame only the predicted centres (`predicted_x/y`, `d`) are uploaded; the image is already on
|
||||
the device via `ImagePreprocessorBufferGPU::getGPUBuffer()`. Device buffers grow-only
|
||||
(`EnsureCapacity`). Everything runs on the passed-in `CudaStream`, one sync at the end.
|
||||
|
||||
## Build & test
|
||||
|
||||
```
|
||||
cd build && cmake . # pick up the new CMake sources
|
||||
make -j$(nproc) jfjoch_test
|
||||
cd tests
|
||||
./jfjoch_test "BraggIntegrationEngineGPU_MatchesCPU" # GPU==CPU, 4 modes (skips if no GPU)
|
||||
./jfjoch_test "[bragg_bench]" # perf sweep vs reflection count
|
||||
```
|
||||
|
||||
The equivalence test compares I / sigma / bkg within a small tolerance (float vs double + unordered
|
||||
atomic summation of the learned profile → ~1e-4, not bit-exact — unlike the integer ROI test).
|
||||
|
||||
## Performance (measured on a throttled RTX PRO 2000 Blackwell **laptop**, WSL2 — worst case)
|
||||
|
||||
~0.5 ms @528 refl, 1.1 ms @2600, 2.3 ms @5325 (stress). ~0.45 ms of that is fixed WSL2 kernel-launch
|
||||
latency; on a native-Linux datacenter GPU it is sub-millisecond throughout. 15–20× vs the CPU engine.
|
||||
(First cut was 10.8 ms with FP64 → 3.8 ms with float → the above after collapsing ~19 CUDA API
|
||||
calls/frame into one `reset` kernel + inline shell.)
|
||||
|
||||
## To integrate (the binding — do this on the target machine)
|
||||
|
||||
The engine is deliberately *not* called anywhere yet. Wiring it in:
|
||||
|
||||
1. **Provide the preprocessed buffer + stream to the integration call.** Today
|
||||
`MXAnalysisWithoutFPGA::Analyze` (`image_analysis/MXAnalysisWithoutFPGA.cpp`, ~line 101-104) wraps
|
||||
`preprocessor_buffer->getBuffer()` back into a `CompressedImage` and hands it to
|
||||
`indexer.ProcessImage(...)`. To use this engine you instead want to pass the
|
||||
`ImagePreprocessorBuffer` (the GPU subclass carries the device image) and the `CudaStream` through
|
||||
to where integration happens.
|
||||
2. **Swap the integrator call.** `IndexAndRefine.cpp` (~line 337-339) currently does:
|
||||
```cpp
|
||||
i_outcome.reflections = mode == IntegratorMode::BoxSum
|
||||
? BraggIntegrate2D(...) // CompressedImage
|
||||
: ProfileIntegrate2D(...);
|
||||
```
|
||||
Replace with a `BraggIntegrationEngine::Run(buffer, prediction.GetReflections(), nrefl, msg.number)`
|
||||
on a member engine constructed once (GPU when `get_gpu_count() > 0`, else CPU — mirror how
|
||||
`MXAnalysisWithoutFPGA` chooses `AzIntEngineGPU` vs `AzIntEngineCPU`). Construct with the same
|
||||
`stream` the other GPU engines share.
|
||||
3. **Keep the old functions** (`ProfileIntegrate2D`/`BraggIntegrate2D`) until the engine is validated
|
||||
end-to-end against them on real data (`jfjoch_process --integrator ... --dump-observations`, A/B vs
|
||||
`XDS_ASCII.HKL`), since the `±1`-band difference above is the one behavioural change.
|
||||
|
||||
The output `vector<Reflection>` is identical in shape (I, sigma, bkg, partiality, d, ...), so
|
||||
`ScaleImage`/`CalcISigma`/`Combine3D`/merge downstream consume it unchanged.
|
||||
@@ -0,0 +1,100 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include "BraggIntegrationEngine.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
namespace {
|
||||
|
||||
// Radial parallax broadening as the coefficient of tan^2(2theta), i.e. Var(z)/pixel^2 [px^2].
|
||||
// Copied verbatim from ProfileIntegrate2D: a photon converts at a random depth z (exponential,
|
||||
// attenuation length L, truncated at the sensor thickness), shifting the recorded spot radially by
|
||||
// z*tan(2theta). L is photoelectric-dominated (~lambda^3), so a per-material reference (13 keV) is
|
||||
// scaled by lambda^3; Si and CdTe are the sensors in use.
|
||||
double parallax_var_px2(const std::string &material, double thickness_um, double lambda_A, double pixel_um) {
|
||||
if (!(thickness_um > 0.0) || !(pixel_um > 0.0) || !(lambda_A > 0.0))
|
||||
return 0.0;
|
||||
const double L_ref = material == "CdTe" ? 42.6 : 273.0; // attenuation length [um] at 0.953 A
|
||||
const double s = lambda_A / 0.953;
|
||||
const double L = L_ref / (s * s * s);
|
||||
const double a = thickness_um / L, e = std::exp(-a);
|
||||
if (1.0 - e <= 0.0)
|
||||
return 0.0;
|
||||
const double mean = L * (1.0 - (1.0 + a) * e) / (1.0 - e);
|
||||
const double ez2 = L * L * (2.0 - (a * a + 2.0 * a + 2.0) * e) / (1.0 - e);
|
||||
const double var = std::max(0.0, ez2 - mean * mean); // um^2
|
||||
return var / (pixel_um * pixel_um);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BraggIntegrationEngine::BraggIntegrationEngine(const DiffractionExperiment &experiment)
|
||||
: geom(experiment.GetDiffractionGeometry()) {
|
||||
const auto settings = experiment.GetBraggIntegrationSettings();
|
||||
const auto &det = experiment.GetDetectorSetup();
|
||||
|
||||
mode = settings.GetIntegrator();
|
||||
empirical = mode == IntegratorMode::ProfileEmpirical;
|
||||
|
||||
// Same frame as the reflections' predicted_x/predicted_y and the ImagePreprocessorBuffer that
|
||||
// feeds this engine (MXAnalysisWithoutFPGA sizes that buffer to GetPixelsNum()).
|
||||
xpixel = experiment.GetXPixelsNum();
|
||||
ypixel = experiment.GetYPixelsNum();
|
||||
npixel = experiment.GetPixelsNum();
|
||||
|
||||
r1_sq = settings.GetR1() * settings.GetR1();
|
||||
r2 = settings.GetR2();
|
||||
r2_sq = r2 * r2;
|
||||
r3 = settings.GetR3();
|
||||
r3_sq = r3 * r3;
|
||||
min_sigma_ratio = settings.GetMinimumSigmaInRegardsToI();
|
||||
R = static_cast<int>(std::ceil(r2));
|
||||
G = 2 * R + 1;
|
||||
GG = G * G;
|
||||
|
||||
// A set bandwidth (broadband / stills) vs monochromatic (rotation) splits the treatment: the
|
||||
// background sigma-clip and radial-elongation terms are path-dependent (see ProfileIntegrate2D).
|
||||
bw_sigma = experiment.GetBandwidthFWHM().value_or(0.0f) / 2.3548f;
|
||||
broadband = bw_sigma > 0.0;
|
||||
apply_bkg_clip = broadband;
|
||||
|
||||
const double c_par = parallax_var_px2(det.GetSensorMaterial(), det.GetSensorThickness_um(),
|
||||
geom.GetWavelength_A(), geom.GetPixelSize_mm() * 1000.0);
|
||||
c_radial = c_par + (broadband ? 0.0 : bragg_engine::C_CAPTURE);
|
||||
F_px = geom.GetDetectorDistance_mm() / std::max(1e-6f, geom.GetPixelSize_mm());
|
||||
beam_x = geom.GetBeamX_pxl();
|
||||
beam_y = geom.GetBeamY_pxl();
|
||||
use_ellipse = !empirical && (bw_sigma > 0.0 || c_radial > 0.0);
|
||||
|
||||
polarization = experiment.GetPolarizationFactor();
|
||||
}
|
||||
|
||||
std::vector<Reflection> BraggIntegrationEngine::Finalize(const std::vector<Reflection> &predicted,
|
||||
size_t npredicted,
|
||||
const std::vector<BraggFitResult> &results,
|
||||
int64_t image_number) const {
|
||||
std::vector<Reflection> out;
|
||||
out.reserve(npredicted);
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &fr = results[i];
|
||||
if (!fr.ok)
|
||||
continue;
|
||||
Reflection refl = predicted[i];
|
||||
refl.I = fr.I;
|
||||
refl.sigma = fr.sigma;
|
||||
refl.bkg = fr.bkg;
|
||||
if (fr.has_observed) {
|
||||
refl.observed_x = fr.observed_x;
|
||||
refl.observed_y = fr.observed_y;
|
||||
}
|
||||
refl.observed = true;
|
||||
if (polarization)
|
||||
refl.rlp /= geom.CalcAzIntPolarizationCorr(refl.predicted_x, refl.predicted_y, polarization.value());
|
||||
refl.image_scale_corr = refl.rlp / refl.partiality;
|
||||
refl.image_number = static_cast<float>(image_number);
|
||||
out.push_back(refl);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@@ -0,0 +1,105 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#pragma once
|
||||
|
||||
// =============================================================================
|
||||
// BraggIntegrationEngine — box-sum + profile-fitting 2D integrator, GPU-ready
|
||||
// =============================================================================
|
||||
//
|
||||
// A reimplementation of BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch profile
|
||||
// fit) under one roof, following the AzIntEngine / ROIIntegration pattern: a base class that
|
||||
// extracts the fixed per-experiment configuration, a plain-C++ CPU engine (the fallback and the
|
||||
// numeric oracle), and a CUDA engine (BraggIntegrationEngineGPU) that reaches the same result up
|
||||
// to floating-point precision.
|
||||
//
|
||||
// Unlike BraggIntegrate2D/ProfileIntegrate2D, which read the raw CompressedImage per pixel type
|
||||
// and reject the special/saturation +/-1 band, this engine reads the already-preprocessed int32
|
||||
// image held in an ImagePreprocessorBuffer (the same buffer AzIntEngineGPU/ROIIntegrationGPU
|
||||
// consume): masked/bad pixels are INT32_MIN and saturated pixels INT32_MAX, so bad-pixel identity
|
||||
// is owned by the preprocessor and a pixel is valid iff v != INT32_MIN && v != INT32_MAX.
|
||||
//
|
||||
// The integrator is selected by BraggIntegrationSettings::Integrator:
|
||||
// BoxSum -> BraggIntegrate2D equivalent (rough disk sum minus ring-mean background)
|
||||
// ProfileGaussian -> per-reflection measured-width Gaussian profile fit (the default)
|
||||
// ProfileEmpirical-> per-shell learned empirical profile fit
|
||||
// The box sum is also the seed pass (Pass A) of the two profile modes, so it always runs.
|
||||
//
|
||||
// This class is intentionally standalone: it is NOT yet wired into IndexAndRefine. It takes a
|
||||
// preprocessed image + the predicted reflections and returns the same vector<Reflection> shape
|
||||
// (I, sigma, bkg, partiality, ...) that the downstream scaling/merge consumes unchanged.
|
||||
// =============================================================================
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "../../common/BraggIntegrationSettings.h"
|
||||
#include "../../common/DiffractionExperiment.h"
|
||||
#include "../../common/DiffractionGeometry.h"
|
||||
#include "../../common/Reflection.h"
|
||||
#include "../image_preprocessing/ImagePreprocessorBuffer.h"
|
||||
|
||||
namespace bragg_engine {
|
||||
// Shared with both engines so the CPU and GPU paths stay numerically aligned.
|
||||
constexpr int N_SHELL = 6; // resolution shells for per-shell profile learning
|
||||
constexpr double STRONG_I_OVER_SIGMA = 5.0; // strong-spot threshold that seeds the profile
|
||||
constexpr int MIN_STRONG_PER_SHELL = 30; // below this a shell falls back to the global profile
|
||||
constexpr double C_CAPTURE = 2.5; // weak-spot radial capture term (monochromatic only)
|
||||
} // namespace bragg_engine
|
||||
|
||||
// One reflection's extracted intensity, produced by the derived engine and turned into a
|
||||
// Reflection by Finalize() (which owns the polarization correction and scale bookkeeping).
|
||||
struct BraggFitResult {
|
||||
float I = 0.0f;
|
||||
float sigma = NAN;
|
||||
float bkg = 0.0f;
|
||||
float observed_x = 0.0f; // intensity-weighted centroid (BoxSum mode only)
|
||||
float observed_y = 0.0f;
|
||||
bool ok = false;
|
||||
bool has_observed = false;
|
||||
};
|
||||
|
||||
class BraggIntegrationEngine {
|
||||
protected:
|
||||
// --- fixed configuration extracted from the experiment (see ProfileIntegrate2D) ---
|
||||
IntegratorMode mode;
|
||||
bool empirical; // ProfileEmpirical (vs ProfileGaussian)
|
||||
|
||||
size_t xpixel, ypixel, npixel;
|
||||
|
||||
float r1_sq;
|
||||
float r2, r2_sq;
|
||||
float r3, r3_sq;
|
||||
float min_sigma_ratio;
|
||||
int R, G, GG; // profile-grid half-size, edge (2R+1) and area (G*G)
|
||||
|
||||
bool broadband; // a set bandwidth (stills) vs monochromatic (rotation)
|
||||
double bw_sigma; // bandwidth sigma [dimensionless, * Rpx -> px]
|
||||
bool apply_bkg_clip; // stills-only high-outlier background sigma-clip
|
||||
bool use_ellipse; // radially elongate the per-reflection Gaussian
|
||||
|
||||
double c_radial; // radial variance coefficient of tan^2(2theta): parallax + capture
|
||||
double F_px; // detector distance expressed in pixels
|
||||
float beam_x, beam_y;
|
||||
|
||||
DiffractionGeometry geom; // kept for the per-reflection polarization correction
|
||||
std::optional<float> polarization;
|
||||
|
||||
// Assemble output reflections from the per-reflection fit results (polarization + scale corr).
|
||||
std::vector<Reflection> Finalize(const std::vector<Reflection> &predicted, size_t npredicted,
|
||||
const std::vector<BraggFitResult> &results,
|
||||
int64_t image_number) const;
|
||||
|
||||
public:
|
||||
explicit BraggIntegrationEngine(const DiffractionExperiment &experiment);
|
||||
virtual ~BraggIntegrationEngine() = default;
|
||||
|
||||
// predicted[0..npredicted) are the reflections to extract; image is the preprocessed int32
|
||||
// frame (image.size() == npixel). Returns only the observed reflections.
|
||||
virtual std::vector<Reflection> Run(const ImagePreprocessorBuffer &image,
|
||||
const std::vector<Reflection> &predicted, size_t npredicted,
|
||||
int64_t image_number) = 0;
|
||||
};
|
||||
@@ -0,0 +1,296 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include "BraggIntegrationEngineCPU.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
using namespace bragg_engine;
|
||||
|
||||
namespace {
|
||||
|
||||
// The preprocessed buffer stores masked/bad pixels as INT32_MIN and saturated as INT32_MAX.
|
||||
inline bool valid(int32_t v) { return v != INT32_MIN && v != INT32_MAX; }
|
||||
|
||||
} // namespace
|
||||
|
||||
BraggIntegrationEngineCPU::BraggIntegrationEngineCPU(const DiffractionExperiment &experiment)
|
||||
: BraggIntegrationEngine(experiment) {}
|
||||
|
||||
std::vector<Reflection> BraggIntegrationEngineCPU::Run(const ImagePreprocessorBuffer &image,
|
||||
const std::vector<Reflection> &predicted,
|
||||
size_t npredicted, int64_t image_number) {
|
||||
std::vector<BraggFitResult> results(npredicted);
|
||||
if (image.size() != npixel || npredicted == 0)
|
||||
return Finalize(predicted, npredicted, results, image_number);
|
||||
|
||||
const int32_t *img = image.data();
|
||||
const int W = static_cast<int>(xpixel), H = static_cast<int>(ypixel);
|
||||
const bool do_clip = apply_bkg_clip && mode != IntegratorMode::BoxSum;
|
||||
|
||||
auto grid_idx = [this](int dx, int dy) { return (dy + R) * G + (dx + R); };
|
||||
|
||||
// --- Reflection mask: mark the r2 signal disk of every predicted reflection so a neighbour's
|
||||
// disk is excluded from this reflection's r2..r3 background ring. ---
|
||||
std::vector<uint8_t> refl_mask(npixel, 0);
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &r = predicted[i];
|
||||
const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r2 - 1.0f)));
|
||||
const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r2 + 1.0f)));
|
||||
const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r2 - 1.0f)));
|
||||
const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r2 + 1.0f)));
|
||||
for (int y = y0; y <= y1; ++y)
|
||||
for (int x = x0; x <= x1; ++x) {
|
||||
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
|
||||
if (d2 < r2_sq) refl_mask[y * W + x] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Pass A: box-sum every reflection (rough I, background, centroid, strong flag). ---
|
||||
struct Rough {
|
||||
double I = 0.0, sigma = NAN, bkg = 0.0, obs_x = 0.0, obs_y = 0.0;
|
||||
int cx = 0, cy = 0, shell = -1;
|
||||
bool ok = false, strong = false, has_obs = false;
|
||||
};
|
||||
std::vector<Rough> rough(npredicted);
|
||||
double inv_d2_min = std::numeric_limits<double>::max(), inv_d2_max = 0.0;
|
||||
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &r = predicted[i];
|
||||
Rough out;
|
||||
const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r3 - 1.0)));
|
||||
const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r3 + 1.0)));
|
||||
const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r3 - 1.0)));
|
||||
const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r3 + 1.0)));
|
||||
|
||||
int64_t I_sum = 0, I_sum_x = 0, I_sum_y = 0, n_inner = 0, n_inner_valid = 0;
|
||||
double bkg_sum = 0.0;
|
||||
int n_bkg = 0;
|
||||
for (int y = y0; y <= y1; ++y)
|
||||
for (int x = x0; x <= x1; ++x) {
|
||||
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
|
||||
const int32_t px = img[y * W + x];
|
||||
if (d2 < r1_sq) {
|
||||
++n_inner;
|
||||
if (!valid(px)) continue;
|
||||
I_sum += px;
|
||||
I_sum_x += static_cast<int64_t>(x) * px;
|
||||
I_sum_y += static_cast<int64_t>(y) * px;
|
||||
++n_inner_valid;
|
||||
} else if (d2 >= r2_sq && d2 < r3_sq) {
|
||||
if (refl_mask[y * W + x]) continue;
|
||||
if (!valid(px)) continue;
|
||||
bkg_sum += static_cast<double>(px);
|
||||
++n_bkg;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_inner_valid == n_inner && n_bkg > 5) {
|
||||
out.bkg = bkg_sum / n_bkg;
|
||||
// One high-outlier sigma-clip pass on the background ring (stills-only): reject pixels
|
||||
// above mean + 3*sqrt(mean) to strip a bandwidth-streaked neighbour that biases the mean.
|
||||
if (do_clip) {
|
||||
const double thr = out.bkg + 3.0 * std::sqrt(std::max(out.bkg, 1.0));
|
||||
double s = 0.0;
|
||||
int n = 0;
|
||||
for (int y = y0; y <= y1; ++y)
|
||||
for (int x = x0; x <= x1; ++x) {
|
||||
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
|
||||
if (!(d2 >= r2_sq && d2 < r3_sq)) continue;
|
||||
if (refl_mask[y * W + x]) continue;
|
||||
const int32_t px = img[y * W + x];
|
||||
if (!valid(px)) continue;
|
||||
if (static_cast<double>(px) <= thr) { s += px; ++n; }
|
||||
}
|
||||
if (n > 5) out.bkg = s / n;
|
||||
}
|
||||
out.I = static_cast<double>(I_sum) - static_cast<double>(n_inner) * out.bkg;
|
||||
out.sigma = std::max(1.0, out.I * min_sigma_ratio);
|
||||
if (I_sum > 0) {
|
||||
out.sigma = std::max(out.sigma, std::sqrt(static_cast<double>(I_sum)));
|
||||
out.obs_x = static_cast<double>(I_sum_x) / static_cast<double>(I_sum);
|
||||
out.obs_y = static_cast<double>(I_sum_y) / static_cast<double>(I_sum);
|
||||
out.has_obs = true;
|
||||
}
|
||||
out.cx = static_cast<int>(std::lround(r.predicted_x));
|
||||
out.cy = static_cast<int>(std::lround(r.predicted_y));
|
||||
out.ok = true;
|
||||
out.strong = out.sigma > 0.0 && out.I / out.sigma >= STRONG_I_OVER_SIGMA;
|
||||
if (r.d > 0.0f) {
|
||||
const double inv_d2 = 1.0 / (static_cast<double>(r.d) * r.d);
|
||||
inv_d2_min = std::min(inv_d2_min, inv_d2);
|
||||
inv_d2_max = std::max(inv_d2_max, inv_d2);
|
||||
}
|
||||
}
|
||||
rough[i] = out;
|
||||
}
|
||||
|
||||
// --- BoxSum mode is BraggIntegrate2D: emit the rough result directly. ---
|
||||
if (mode == IntegratorMode::BoxSum) {
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &rh = rough[i];
|
||||
if (!rh.ok) continue;
|
||||
results[i] = {static_cast<float>(rh.I), static_cast<float>(rh.sigma), static_cast<float>(rh.bkg),
|
||||
static_cast<float>(rh.obs_x), static_cast<float>(rh.obs_y), true, rh.has_obs};
|
||||
}
|
||||
return Finalize(predicted, npredicted, results, image_number);
|
||||
}
|
||||
|
||||
auto shell_of = [&](float d) {
|
||||
if (!(d > 0.0f) || inv_d2_max <= inv_d2_min) return 0;
|
||||
const double t = (1.0 / (static_cast<double>(d) * d) - inv_d2_min) / (inv_d2_max - inv_d2_min);
|
||||
return std::clamp(static_cast<int>(t * N_SHELL), 0, N_SHELL - 1);
|
||||
};
|
||||
for (size_t i = 0; i < npredicted; ++i)
|
||||
if (rough[i].ok) rough[i].shell = shell_of(predicted[i].d);
|
||||
|
||||
// --- Learn the profile per shell (+ global) from the strong spots. ---
|
||||
std::vector<std::vector<double>> shell_grid(N_SHELL, std::vector<double>(GG, 0.0));
|
||||
std::vector<int> shell_n(N_SHELL, 0);
|
||||
std::vector<double> global_grid(GG, 0.0);
|
||||
int global_n = 0;
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &rh = rough[i];
|
||||
if (!rh.ok || !rh.strong || rh.I <= 0.0) continue;
|
||||
for (int dy = -R; dy <= R; ++dy)
|
||||
for (int dx = -R; dx <= R; ++dx) {
|
||||
const int x = rh.cx + dx, y = rh.cy + dy;
|
||||
if (x < 0 || y < 0 || x >= W || y >= H) continue;
|
||||
const int32_t px = img[y * W + x];
|
||||
if (!valid(px)) continue;
|
||||
const double v = (static_cast<double>(px) - rh.bkg) / rh.I;
|
||||
shell_grid[rh.shell][grid_idx(dx, dy)] += v;
|
||||
global_grid[grid_idx(dx, dy)] += v;
|
||||
}
|
||||
++shell_n[rh.shell];
|
||||
++global_n;
|
||||
}
|
||||
|
||||
// Isotropic width (2nd moment) of a learned grid: over the r1 disk (monochromatic) or the full
|
||||
// grid (broadband); <r^2> = 2 sigma^2 in 2D.
|
||||
auto measure_sigma2 = [&](const std::vector<double> &grid) {
|
||||
double m2 = 0.0, m2w = 0.0;
|
||||
for (int dy = -R; dy <= R; ++dy)
|
||||
for (int dx = -R; dx <= R; ++dx) {
|
||||
if (!broadband && dx * dx + dy * dy >= r1_sq) continue;
|
||||
const double g = std::max(0.0, grid[grid_idx(dx, dy)]);
|
||||
m2 += g * (dx * dx + dy * dy);
|
||||
m2w += g;
|
||||
}
|
||||
return m2w > 0.0 ? std::max(0.25, (m2 / m2w) / 2.0) : 1.0;
|
||||
};
|
||||
// Normalised profile (sum = 1): empirical average grid, or an isotropic Gaussian of the measured
|
||||
// 2nd moment (only used by ProfileEmpirical; ProfileGaussian rebuilds per reflection in Pass B).
|
||||
auto build_profile = [&](const std::vector<double> &grid, int n) {
|
||||
std::vector<double> P(GG, 0.0);
|
||||
if (n <= 0) return P;
|
||||
double sum = 0.0;
|
||||
for (int k = 0; k < GG; ++k) {
|
||||
const double g = std::max(0.0, grid[k]);
|
||||
sum += g;
|
||||
if (empirical) P[k] = g;
|
||||
}
|
||||
if (sum <= 0.0) return P;
|
||||
if (empirical) {
|
||||
for (double &p : P) p /= sum;
|
||||
} else {
|
||||
const double sigma2 = measure_sigma2(grid);
|
||||
double gsum = 0.0;
|
||||
for (int dy = -R; dy <= R; ++dy)
|
||||
for (int dx = -R; dx <= R; ++dx) {
|
||||
const double g = std::exp(-(dx * dx + dy * dy) / (2.0 * sigma2));
|
||||
P[grid_idx(dx, dy)] = g;
|
||||
gsum += g;
|
||||
}
|
||||
for (double &p : P) p /= gsum;
|
||||
}
|
||||
return P;
|
||||
};
|
||||
|
||||
const std::vector<double> global_P = build_profile(global_grid, global_n);
|
||||
const double global_sigma2 = global_n > 0 ? measure_sigma2(global_grid) : 1.0;
|
||||
std::vector<std::vector<double>> shell_P(N_SHELL);
|
||||
std::vector<double> shell_sigma2(N_SHELL, global_sigma2);
|
||||
for (int s = 0; s < N_SHELL; ++s) {
|
||||
if (shell_n[s] >= MIN_STRONG_PER_SHELL) {
|
||||
shell_P[s] = build_profile(shell_grid[s], shell_n[s]);
|
||||
shell_sigma2[s] = measure_sigma2(shell_grid[s]);
|
||||
} else {
|
||||
shell_P[s] = global_P;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Pass B: profile-fit each reflection (Kabsch, de-biased variance v = B + I*P; iterate). ---
|
||||
std::vector<double> Pbuf;
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
const auto &rh = rough[i];
|
||||
if (!rh.ok) continue;
|
||||
const int sh = rh.shell < 0 ? 0 : rh.shell;
|
||||
|
||||
int Rf = R;
|
||||
const std::vector<double> *Pvec = &shell_P[sh];
|
||||
if (!empirical) {
|
||||
const double rx = predicted[i].predicted_x - beam_x, ry = predicted[i].predicted_y - beam_y;
|
||||
const double Rpx = std::hypot(rx, ry);
|
||||
const double tan2t = Rpx / F_px;
|
||||
const double s2t = shell_sigma2[sh];
|
||||
double s2r = s2t, ux = 1.0, uy = 0.0;
|
||||
bool elong = false;
|
||||
if (use_ellipse) {
|
||||
const double sbw = bw_sigma * Rpx;
|
||||
const double radial_extra = sbw * sbw + c_radial * tan2t * tan2t;
|
||||
if (Rpx > 1e-6 && radial_extra > 0.25) {
|
||||
ux = rx / Rpx; uy = ry / Rpx;
|
||||
s2r = s2t + radial_extra;
|
||||
elong = true;
|
||||
}
|
||||
}
|
||||
// Build the Gaussian per reflection, centred on the sub-pixel predicted position and (when
|
||||
// needed) radially elongated, on a grid grown to hold the streak.
|
||||
const double fx = predicted[i].predicted_x - rh.cx, fy = predicted[i].predicted_y - rh.cy;
|
||||
Rf = elong ? std::min(3 * R, static_cast<int>(std::ceil(r2 + 2.0 * std::sqrt(s2r)))) : R;
|
||||
const int Gf = 2 * Rf + 1;
|
||||
Pbuf.assign(static_cast<size_t>(Gf) * Gf, 0.0);
|
||||
double gs = 0.0;
|
||||
for (int dy = -Rf; dy <= Rf; ++dy)
|
||||
for (int dx = -Rf; dx <= Rf; ++dx) {
|
||||
const double ex = dx - fx, ey = dy - fy;
|
||||
const double rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
|
||||
const double g = std::exp(-rad * rad / (2.0 * s2r) - tn * tn / (2.0 * s2t));
|
||||
Pbuf[(dy + Rf) * Gf + (dx + Rf)] = g;
|
||||
gs += g;
|
||||
}
|
||||
for (double &p : Pbuf) p /= gs;
|
||||
Pvec = &Pbuf;
|
||||
}
|
||||
|
||||
const int Gf = 2 * Rf + 1;
|
||||
const double B = std::max(rh.bkg, 1.0);
|
||||
double I = rh.I, den = 0.0;
|
||||
for (int iter = 0; iter < 4; ++iter) {
|
||||
double num = 0.0;
|
||||
den = 0.0;
|
||||
for (int dy = -Rf; dy <= Rf; ++dy)
|
||||
for (int dx = -Rf; dx <= Rf; ++dx) {
|
||||
const double Pp = (*Pvec)[(dy + Rf) * Gf + (dx + Rf)];
|
||||
if (Pp <= 0.0) continue;
|
||||
const int x = rh.cx + dx, y = rh.cy + dy;
|
||||
if (x < 0 || y < 0 || x >= W || y >= H) continue;
|
||||
const int32_t px = img[y * W + x];
|
||||
if (!valid(px)) continue;
|
||||
const double v = B + std::max(0.0, I) * Pp;
|
||||
num += Pp * (static_cast<double>(px) - rh.bkg) / v;
|
||||
den += Pp * Pp / v;
|
||||
}
|
||||
if (den > 0.0) I = num / den; else break;
|
||||
}
|
||||
if (!(den > 0.0)) continue;
|
||||
|
||||
results[i] = {static_cast<float>(I), static_cast<float>(std::sqrt(1.0 / den)),
|
||||
static_cast<float>(rh.bkg), 0.0f, 0.0f, true, false};
|
||||
}
|
||||
|
||||
return Finalize(predicted, npredicted, results, image_number);
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "BraggIntegrationEngine.h"
|
||||
|
||||
// Plain-C++ reference/fallback engine: a faithful serial re-expression of BraggIntegrate2D (box
|
||||
// sum) and ProfileIntegrate2D (Kabsch profile fit) reading the preprocessed int32 image. Also the
|
||||
// numeric oracle the CUDA engine is checked against.
|
||||
class BraggIntegrationEngineCPU : public BraggIntegrationEngine {
|
||||
public:
|
||||
explicit BraggIntegrationEngineCPU(const DiffractionExperiment &experiment);
|
||||
std::vector<Reflection> Run(const ImagePreprocessorBuffer &image,
|
||||
const std::vector<Reflection> &predicted, size_t npredicted,
|
||||
int64_t image_number) override;
|
||||
};
|
||||
@@ -0,0 +1,480 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include "BraggIntegrationEngineGPU.h"
|
||||
|
||||
using namespace bragg_engine;
|
||||
|
||||
namespace {
|
||||
|
||||
inline void cuda_err(cudaError_t val) {
|
||||
if (val != cudaSuccess)
|
||||
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
|
||||
}
|
||||
|
||||
// Fixed scalars passed by value to every kernel (mirrors BraggIntegrationEngine's members).
|
||||
struct BraggGpuParams {
|
||||
int W, H;
|
||||
float r1_sq, r2, r2_sq, r3, r3_sq;
|
||||
float min_sigma_ratio;
|
||||
int R, G, GG;
|
||||
int do_clip; // background sigma-clip (stills, profile modes)
|
||||
int empirical; // ProfileEmpirical vs ProfileGaussian
|
||||
int broadband;
|
||||
int use_ellipse;
|
||||
float bw_sigma;
|
||||
float c_radial;
|
||||
float F_px;
|
||||
float beam_x, beam_y;
|
||||
};
|
||||
|
||||
__device__ inline bool valid(int32_t v) { return v != INT32_MIN && v != INT32_MAX; }
|
||||
|
||||
// --- Mark the r2 signal disk of every predicted reflection (race-free: all writes are 1). ---
|
||||
__global__ void mark_mask(const float *px_x, const float *px_y, uint8_t *mask, BraggGpuParams p, int n) {
|
||||
const int i = blockIdx.x;
|
||||
if (i >= n) return;
|
||||
const float cx = px_x[i], cy = px_y[i];
|
||||
const int x0 = max(0, (int) floorf(cx - p.r2 - 1.0f));
|
||||
const int x1 = min(p.W - 1, (int) ceilf(cx + p.r2 + 1.0f));
|
||||
const int y0 = max(0, (int) floorf(cy - p.r2 - 1.0f));
|
||||
const int y1 = min(p.H - 1, (int) ceilf(cy + p.r2 + 1.0f));
|
||||
const int bw = x1 - x0 + 1, bh = y1 - y0 + 1;
|
||||
if (bw <= 0 || bh <= 0) return;
|
||||
for (int t = threadIdx.x; t < bw * bh; t += blockDim.x) {
|
||||
const int x = x0 + t % bw, y = y0 + t / bw;
|
||||
const float ddx = (float) x - cx, ddy = (float) y - cy;
|
||||
if (ddx * ddx + ddy * ddy < p.r2_sq) mask[y * p.W + x] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Pass A box-sum: rough I / background / centroid / strong flag, one block per reflection. ---
|
||||
__global__ void boxsum(const float *px_x, const float *px_y, const float *dd,
|
||||
const int32_t *img, const uint8_t *mask, BraggGpuParams p, int n,
|
||||
int *cx_o, int *cy_o, float *I_o, float *sigma_o, float *bkg_o,
|
||||
float *obsx_o, float *obsy_o, uint8_t *ok_o, uint8_t *strong_o,
|
||||
uint8_t *hasobs_o, unsigned long long *invd2mm) {
|
||||
const int i = blockIdx.x;
|
||||
if (i >= n) return;
|
||||
|
||||
__shared__ unsigned long long s_Isum, s_Ix, s_Iy;
|
||||
__shared__ int s_ninner, s_ninner_valid, s_nbkg;
|
||||
__shared__ double s_bkgsum;
|
||||
__shared__ int s_accept;
|
||||
__shared__ double s_bkg, s_thr, s_clipsum;
|
||||
__shared__ int s_clipn;
|
||||
if (threadIdx.x == 0) {
|
||||
s_Isum = 0; s_Ix = 0; s_Iy = 0;
|
||||
s_ninner = 0; s_ninner_valid = 0; s_nbkg = 0; s_bkgsum = 0.0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const float cx = px_x[i], cy = px_y[i];
|
||||
const int x0 = max(0, (int) floorf(cx - p.r3 - 1.0f));
|
||||
const int x1 = min(p.W - 1, (int) ceilf(cx + p.r3 + 1.0f));
|
||||
const int y0 = max(0, (int) floorf(cy - p.r3 - 1.0f));
|
||||
const int y1 = min(p.H - 1, (int) ceilf(cy + p.r3 + 1.0f));
|
||||
const int bw = x1 - x0 + 1, bh = y1 - y0 + 1;
|
||||
const int area = (bw > 0 && bh > 0) ? bw * bh : 0;
|
||||
|
||||
long long l_Isum = 0, l_Ix = 0, l_Iy = 0;
|
||||
int l_ni = 0, l_niv = 0, l_nb = 0;
|
||||
double l_bkg = 0.0;
|
||||
for (int t = threadIdx.x; t < area; t += blockDim.x) {
|
||||
const int x = x0 + t % bw, y = y0 + t / bw;
|
||||
const float ddx = (float) x - cx, ddy = (float) y - cy;
|
||||
const float d2 = ddx * ddx + ddy * ddy;
|
||||
const int32_t px = img[y * p.W + x];
|
||||
if (d2 < p.r1_sq) {
|
||||
++l_ni;
|
||||
if (valid(px)) { l_Isum += px; l_Ix += (long long) x * px; l_Iy += (long long) y * px; ++l_niv; }
|
||||
} else if (d2 >= p.r2_sq && d2 < p.r3_sq) {
|
||||
if (mask[y * p.W + x]) continue;
|
||||
if (!valid(px)) continue;
|
||||
l_bkg += (double) px; ++l_nb;
|
||||
}
|
||||
}
|
||||
atomicAdd(&s_Isum, (unsigned long long) l_Isum);
|
||||
atomicAdd(&s_Ix, (unsigned long long) l_Ix);
|
||||
atomicAdd(&s_Iy, (unsigned long long) l_Iy);
|
||||
atomicAdd(&s_ninner, l_ni);
|
||||
atomicAdd(&s_ninner_valid, l_niv);
|
||||
atomicAdd(&s_nbkg, l_nb);
|
||||
atomicAdd(&s_bkgsum, l_bkg);
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_accept = (s_ninner_valid == s_ninner && s_nbkg > 5) ? 1 : 0;
|
||||
s_bkg = s_accept ? (s_bkgsum / (double) s_nbkg) : 0.0;
|
||||
s_thr = s_bkg + 3.0 * sqrt(fmax(s_bkg, 1.0));
|
||||
s_clipsum = 0.0; s_clipn = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Second ring pass for the stills sigma-clip (re-reads the annulus; avoids storing bkg values).
|
||||
if (s_accept && p.do_clip) {
|
||||
double c_l = 0.0; int cn_l = 0;
|
||||
for (int t = threadIdx.x; t < area; t += blockDim.x) {
|
||||
const int x = x0 + t % bw, y = y0 + t / bw;
|
||||
const float ddx = (float) x - cx, ddy = (float) y - cy;
|
||||
const float d2 = ddx * ddx + ddy * ddy;
|
||||
if (!(d2 >= p.r2_sq && d2 < p.r3_sq)) continue;
|
||||
if (mask[y * p.W + x]) continue;
|
||||
const int32_t px = img[y * p.W + x];
|
||||
if (!valid(px)) continue;
|
||||
if ((double) px <= s_thr) { c_l += px; ++cn_l; }
|
||||
}
|
||||
atomicAdd(&s_clipsum, c_l); atomicAdd(&s_clipn, cn_l);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x != 0) return;
|
||||
if (!s_accept) { ok_o[i] = 0; strong_o[i] = 0; hasobs_o[i] = 0; return; }
|
||||
|
||||
double bkg = s_bkg;
|
||||
if (p.do_clip && s_clipn > 5) bkg = s_clipsum / (double) s_clipn;
|
||||
const long long Isum = (long long) s_Isum;
|
||||
const double I = (double) Isum - (double) s_ninner * bkg;
|
||||
double sigma = fmax(1.0, I * (double) p.min_sigma_ratio);
|
||||
uint8_t hasobs = 0; double ox = 0.0, oy = 0.0;
|
||||
if (Isum > 0) {
|
||||
sigma = fmax(sigma, sqrt((double) Isum));
|
||||
ox = (double) (long long) s_Ix / (double) Isum;
|
||||
oy = (double) (long long) s_Iy / (double) Isum;
|
||||
hasobs = 1;
|
||||
}
|
||||
cx_o[i] = (int) lroundf(cx);
|
||||
cy_o[i] = (int) lroundf(cy);
|
||||
I_o[i] = (float) I; sigma_o[i] = (float) sigma; bkg_o[i] = (float) bkg;
|
||||
obsx_o[i] = (float) ox; obsy_o[i] = (float) oy; hasobs_o[i] = hasobs;
|
||||
ok_o[i] = 1;
|
||||
strong_o[i] = (sigma > 0.0 && I / sigma >= STRONG_I_OVER_SIGMA) ? 1 : 0;
|
||||
|
||||
const float d = dd[i];
|
||||
if (d > 0.0f) {
|
||||
// Positive doubles keep IEEE bit-pattern ordering, so atomicMin/Max on the ull view works.
|
||||
const unsigned long long b = (unsigned long long) __double_as_longlong(1.0 / ((double) d * d));
|
||||
atomicMin(&invd2mm[0], b);
|
||||
atomicMax(&invd2mm[1], b);
|
||||
}
|
||||
}
|
||||
|
||||
// Resolution shell of one reflection from the global inv-d^2 range (mirrors CPU shell_of). Computed
|
||||
// inline in learn_profile and fit so no separate shell array/kernel is needed.
|
||||
__device__ inline int compute_shell(float d, const unsigned long long *invd2mm) {
|
||||
const unsigned long long mn = invd2mm[0], mx = invd2mm[1];
|
||||
if (!(d > 0.0f) || mx <= mn) return 0;
|
||||
const double invd2 = 1.0 / ((double) d * d);
|
||||
const double dmn = __longlong_as_double((long long) mn), dmx = __longlong_as_double((long long) mx);
|
||||
const int s = (int) ((invd2 - dmn) / (dmx - dmn) * N_SHELL);
|
||||
return s < 0 ? 0 : (s >= N_SHELL ? N_SHELL - 1 : s);
|
||||
}
|
||||
|
||||
// --- Zero the profile accumulators and seed the inv-d^2 range, in one launch (replaces a handful
|
||||
// of small cudaMemsetAsync calls, which matter when kernel-launch latency is high). ---
|
||||
__global__ void reset(float *shell_grid, float *global_grid, int *shell_n, int *global_n,
|
||||
unsigned long long *invd2mm, int GG) {
|
||||
for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < N_SHELL * GG; k += blockDim.x * gridDim.x)
|
||||
shell_grid[k] = 0.0f;
|
||||
for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < GG; k += blockDim.x * gridDim.x)
|
||||
global_grid[k] = 0.0f;
|
||||
if (blockIdx.x == 0 && threadIdx.x < N_SHELL) shell_n[threadIdx.x] = 0;
|
||||
if (blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
*global_n = 0;
|
||||
invd2mm[0] = ~0ull; // min seed
|
||||
invd2mm[1] = 0ull; // max seed
|
||||
}
|
||||
}
|
||||
|
||||
// --- Learn the profile: each strong spot adds its bkg-subtracted, I-normalised grid to its shell
|
||||
// (and the global grid). One block per reflection. ---
|
||||
__global__ void learn_profile(const int32_t *img, const int *cx_a, const int *cy_a, const float *dd,
|
||||
const unsigned long long *invd2mm,
|
||||
const float *I_a, const float *bkg_a, const uint8_t *ok_a, const uint8_t *strong_a,
|
||||
float *shell_grid, float *global_grid, int *shell_n, int *global_n,
|
||||
BraggGpuParams p, int n) {
|
||||
const int i = blockIdx.x;
|
||||
if (i >= n || !ok_a[i] || !strong_a[i]) return;
|
||||
const float I = I_a[i];
|
||||
if (!(I > 0.0f)) return;
|
||||
const int cx = cx_a[i], cy = cy_a[i], sh = compute_shell(dd[i], invd2mm);
|
||||
const float bkg = bkg_a[i];
|
||||
float *sg = shell_grid + (size_t) sh * p.GG;
|
||||
for (int k = threadIdx.x; k < p.GG; k += blockDim.x) {
|
||||
const int x = cx + (k % p.G - p.R), y = cy + (k / p.G - p.R);
|
||||
if (x < 0 || y < 0 || x >= p.W || y >= p.H) continue;
|
||||
const int32_t px = img[y * p.W + x];
|
||||
if (!valid(px)) continue;
|
||||
const float v = ((float) px - bkg) / I;
|
||||
atomicAdd(&sg[k], v);
|
||||
atomicAdd(&global_grid[k], v);
|
||||
}
|
||||
if (threadIdx.x == 0) { atomicAdd(&shell_n[sh], 1); atomicAdd(global_n, 1); }
|
||||
}
|
||||
|
||||
// --- Reduce each learned grid to its 2nd-moment width (and, for empirical, a normalised profile).
|
||||
// One block per grid: blocks [0,N_SHELL) are the shells, block N_SHELL is the global grid. ---
|
||||
__global__ void build_profiles(const float *shell_grid, const float *global_grid,
|
||||
const int *shell_n, const int *global_n,
|
||||
float *shell_P, float *global_P, float *shell_sigma2, float *global_sigma2,
|
||||
BraggGpuParams p) {
|
||||
const int b = blockIdx.x;
|
||||
const float *grid; int nstrong; float *P; float *sig2;
|
||||
if (b < N_SHELL) { grid = shell_grid + (size_t) b * p.GG; nstrong = shell_n[b]; P = shell_P + (size_t) b * p.GG; sig2 = &shell_sigma2[b]; }
|
||||
else { grid = global_grid; nstrong = *global_n; P = global_P; sig2 = global_sigma2; }
|
||||
|
||||
__shared__ float s_m2, s_m2w, s_sum;
|
||||
if (threadIdx.x == 0) { s_m2 = 0.0f; s_m2w = 0.0f; s_sum = 0.0f; }
|
||||
__syncthreads();
|
||||
|
||||
float l_m2 = 0.0f, l_m2w = 0.0f, l_sum = 0.0f;
|
||||
for (int k = threadIdx.x; k < p.GG; k += blockDim.x) {
|
||||
const int dx = k % p.G - p.R, dy = k / p.G - p.R;
|
||||
const float g = fmaxf(0.0f, grid[k]);
|
||||
const int r2i = dx * dx + dy * dy;
|
||||
if (p.broadband || (float) r2i < p.r1_sq) { l_m2 += g * (float) r2i; l_m2w += g; }
|
||||
l_sum += g;
|
||||
if (p.empirical) P[k] = g; // pre-store clamped grid for in-place normalisation below
|
||||
}
|
||||
atomicAdd(&s_m2, l_m2); atomicAdd(&s_m2w, l_m2w); atomicAdd(&s_sum, l_sum);
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
*sig2 = (nstrong > 0 && s_m2w > 0.0f) ? fmaxf(0.25f, (s_m2 / s_m2w) / 2.0f) : 1.0f;
|
||||
|
||||
if (p.empirical) {
|
||||
const float sum = s_sum;
|
||||
const bool normalise = nstrong > 0 && sum > 0.0f;
|
||||
for (int k = threadIdx.x; k < p.GG; k += blockDim.x) P[k] = normalise ? P[k] / sum : 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Pass B Kabsch profile fit: I = sum P(c-B)/v over sum P^2/v, v = B + max(I,0)P (iterate).
|
||||
// One block per reflection; the (possibly elongated) profile is built in shared memory. ---
|
||||
__global__ void fit(const int32_t *img, const float *px_x, const float *px_y,
|
||||
const int *cx_a, const int *cy_a, const float *dd, const unsigned long long *invd2mm,
|
||||
const float *I_seed, const float *bkg_a, const uint8_t *ok_a,
|
||||
const float *shell_P, const float *global_P,
|
||||
const float *shell_sigma2, const float *global_sigma2, const int *shell_n,
|
||||
float *I_o, float *sigma_o, uint8_t *ok_o, BraggGpuParams p, int n) {
|
||||
const int i = blockIdx.x;
|
||||
if (i >= n) return;
|
||||
extern __shared__ float Pbuf[];
|
||||
__shared__ float s_gs, s_num, s_den, s_I;
|
||||
__shared__ int s_Rf, s_Gf;
|
||||
|
||||
if (!ok_a[i]) { if (threadIdx.x == 0) ok_o[i] = 0; return; }
|
||||
|
||||
const int cx = cx_a[i], cy = cy_a[i];
|
||||
const int sh = compute_shell(dd[i], invd2mm);
|
||||
const bool use_shell = shell_n[sh] >= MIN_STRONG_PER_SHELL; // else fall back to the global profile
|
||||
const float bkg = bkg_a[i];
|
||||
|
||||
if (p.empirical) {
|
||||
const float *Psrc = use_shell ? (shell_P + (size_t) sh * p.GG) : global_P;
|
||||
if (threadIdx.x == 0) { s_Rf = p.R; s_Gf = p.G; }
|
||||
__syncthreads();
|
||||
for (int k = threadIdx.x; k < p.GG; k += blockDim.x) Pbuf[k] = Psrc[k];
|
||||
__syncthreads();
|
||||
} else {
|
||||
const float s2t = use_shell ? shell_sigma2[sh] : *global_sigma2;
|
||||
const float rx = px_x[i] - p.beam_x, ry = px_y[i] - p.beam_y;
|
||||
const float Rpx = sqrtf(rx * rx + ry * ry);
|
||||
const float tan2t = Rpx / p.F_px;
|
||||
float s2r = s2t, ux = 1.0f, uy = 0.0f;
|
||||
bool elong = false;
|
||||
if (p.use_ellipse) {
|
||||
const float sbw = p.bw_sigma * Rpx;
|
||||
const float radial_extra = sbw * sbw + p.c_radial * tan2t * tan2t;
|
||||
if (Rpx > 1e-6f && radial_extra > 0.25f) { ux = rx / Rpx; uy = ry / Rpx; s2r = s2t + radial_extra; elong = true; }
|
||||
}
|
||||
const int Rf = elong ? min(3 * p.R, (int) ceilf(p.r2 + 2.0f * sqrtf(s2r))) : p.R;
|
||||
const int Gf = 2 * Rf + 1;
|
||||
if (threadIdx.x == 0) { s_Rf = Rf; s_Gf = Gf; s_gs = 0.0f; }
|
||||
__syncthreads();
|
||||
const float fx = px_x[i] - cx, fy = px_y[i] - cy;
|
||||
float l_gs = 0.0f;
|
||||
for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) {
|
||||
const float ex = (k % Gf - Rf) - fx, ey = (k / Gf - Rf) - fy;
|
||||
const float rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
|
||||
const float g = expf(-rad * rad / (2.0f * s2r) - tn * tn / (2.0f * s2t));
|
||||
Pbuf[k] = g; l_gs += g;
|
||||
}
|
||||
atomicAdd(&s_gs, l_gs);
|
||||
__syncthreads();
|
||||
const float gs = s_gs;
|
||||
for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) Pbuf[k] /= gs;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
const int Rf = s_Rf, Gf = s_Gf, GfGf = Gf * Gf;
|
||||
const float B = fmaxf(bkg, 1.0f);
|
||||
if (threadIdx.x == 0) s_I = I_seed[i];
|
||||
__syncthreads();
|
||||
|
||||
for (int iter = 0; iter < 4; ++iter) {
|
||||
if (threadIdx.x == 0) { s_num = 0.0f; s_den = 0.0f; }
|
||||
__syncthreads();
|
||||
const float Ihere = s_I;
|
||||
float l_num = 0.0f, l_den = 0.0f;
|
||||
for (int k = threadIdx.x; k < GfGf; k += blockDim.x) {
|
||||
const float Pp = Pbuf[k];
|
||||
if (Pp <= 0.0f) continue;
|
||||
const int x = cx + (k % Gf - Rf), y = cy + (k / Gf - Rf);
|
||||
if (x < 0 || y < 0 || x >= p.W || y >= p.H) continue;
|
||||
const int32_t px = img[y * p.W + x];
|
||||
if (!valid(px)) continue;
|
||||
const float v = B + fmaxf(0.0f, Ihere) * Pp;
|
||||
l_num += Pp * ((float) px - bkg) / v;
|
||||
l_den += Pp * Pp / v;
|
||||
}
|
||||
atomicAdd(&s_num, l_num); atomicAdd(&s_den, l_den);
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0 && s_den > 0.0f) s_I = s_num / s_den;
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
if (s_den > 0.0f) { I_o[i] = s_I; sigma_o[i] = sqrtf(1.0f / s_den); ok_o[i] = 1; }
|
||||
else ok_o[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BraggIntegrationEngineGPU::BraggIntegrationEngineGPU(const DiffractionExperiment &experiment,
|
||||
std::shared_ptr<CudaStream> stream)
|
||||
: BraggIntegrationEngine(experiment),
|
||||
stream(std::move(stream)),
|
||||
d_mask(npixel),
|
||||
d_shell_grid(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
|
||||
d_global_grid(GG),
|
||||
d_shell_P(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
|
||||
d_global_P(GG),
|
||||
d_shell_sigma2(bragg_engine::N_SHELL),
|
||||
d_global_sigma2(1),
|
||||
d_shell_n(bragg_engine::N_SHELL),
|
||||
d_global_n(1),
|
||||
d_invd2(2) {
|
||||
threads = 128;
|
||||
|
||||
// Fit profile grid: R for empirical / box, up to 3R (radially elongated) for the Gaussian.
|
||||
const int max_Rf = empirical ? R : 3 * R;
|
||||
const int max_Gf = 2 * max_Rf + 1;
|
||||
fit_shared_bytes = static_cast<size_t>(max_Gf) * max_Gf * sizeof(float);
|
||||
|
||||
cudaDeviceProp prop{};
|
||||
cuda_err(cudaGetDeviceProperties(&prop, 0));
|
||||
if (fit_shared_bytes > prop.sharedMemPerBlock)
|
||||
throw JFJochException(JFJochExceptionCategory::GPUCUDAError,
|
||||
"BraggIntegrationEngineGPU: profile grid exceeds shared memory (r2 too large)");
|
||||
}
|
||||
|
||||
void BraggIntegrationEngineGPU::EnsureCapacity(size_t n) {
|
||||
if (n <= capacity)
|
||||
return;
|
||||
d_px_x = CudaDevicePtr<float>(n);
|
||||
d_px_y = CudaDevicePtr<float>(n);
|
||||
d_d = CudaDevicePtr<float>(n);
|
||||
d_cx = CudaDevicePtr<int>(n);
|
||||
d_cy = CudaDevicePtr<int>(n);
|
||||
d_I = CudaDevicePtr<float>(n);
|
||||
d_sigma = CudaDevicePtr<float>(n);
|
||||
d_bkg = CudaDevicePtr<float>(n);
|
||||
d_obs_x = CudaDevicePtr<float>(n);
|
||||
d_obs_y = CudaDevicePtr<float>(n);
|
||||
d_ok = CudaDevicePtr<uint8_t>(n);
|
||||
d_strong = CudaDevicePtr<uint8_t>(n);
|
||||
d_has_obs = CudaDevicePtr<uint8_t>(n);
|
||||
|
||||
h_px_x.resize(n); h_px_y.resize(n); h_d.resize(n);
|
||||
h_I.resize(n); h_sigma.resize(n); h_bkg.resize(n);
|
||||
h_obs_x.resize(n); h_obs_y.resize(n);
|
||||
h_ok.resize(n); h_has_obs.resize(n);
|
||||
capacity = n;
|
||||
}
|
||||
|
||||
std::vector<Reflection> BraggIntegrationEngineGPU::Run(const ImagePreprocessorBuffer &image,
|
||||
const std::vector<Reflection> &predicted,
|
||||
size_t npredicted, int64_t image_number) {
|
||||
std::vector<BraggFitResult> results(npredicted);
|
||||
if (image.size() != npixel || npredicted == 0)
|
||||
return Finalize(predicted, npredicted, results, image_number);
|
||||
|
||||
const int32_t *img = image.getGPUBuffer();
|
||||
if (img == nullptr)
|
||||
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
|
||||
"BraggIntegrationEngineGPU: image buffer is not on the GPU");
|
||||
|
||||
EnsureCapacity(npredicted);
|
||||
const int n = static_cast<int>(npredicted);
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
h_px_x[i] = predicted[i].predicted_x;
|
||||
h_px_y[i] = predicted[i].predicted_y;
|
||||
h_d[i] = predicted[i].d;
|
||||
}
|
||||
cuda_err(cudaMemcpyAsync(d_px_x, h_px_x.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
|
||||
cuda_err(cudaMemcpyAsync(d_px_y, h_px_y.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
|
||||
cuda_err(cudaMemcpyAsync(d_d, h_d.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
|
||||
|
||||
BraggGpuParams p{
|
||||
.W = static_cast<int>(xpixel), .H = static_cast<int>(ypixel),
|
||||
.r1_sq = r1_sq, .r2 = r2, .r2_sq = r2_sq, .r3 = r3, .r3_sq = r3_sq,
|
||||
.min_sigma_ratio = min_sigma_ratio,
|
||||
.R = R, .G = G, .GG = GG,
|
||||
.do_clip = (apply_bkg_clip && mode != IntegratorMode::BoxSum) ? 1 : 0,
|
||||
.empirical = empirical ? 1 : 0,
|
||||
.broadband = broadband ? 1 : 0,
|
||||
.use_ellipse = use_ellipse ? 1 : 0,
|
||||
.bw_sigma = static_cast<float>(bw_sigma), .c_radial = static_cast<float>(c_radial),
|
||||
.F_px = static_cast<float>(F_px),
|
||||
.beam_x = beam_x, .beam_y = beam_y,
|
||||
};
|
||||
|
||||
// Pass A: reset accumulators, mask, then box-sum.
|
||||
cuda_err(cudaMemsetAsync(d_mask, 0, npixel, *stream));
|
||||
reset<<<32, 256, 0, *stream>>>(d_shell_grid, d_global_grid, d_shell_n, d_global_n, d_invd2, GG);
|
||||
mark_mask<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_mask, p, n);
|
||||
boxsum<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_d, img, d_mask, p, n,
|
||||
d_cx, d_cy, d_I, d_sigma, d_bkg, d_obs_x, d_obs_y,
|
||||
d_ok, d_strong, d_has_obs, d_invd2);
|
||||
|
||||
if (mode != IntegratorMode::BoxSum) {
|
||||
// Pass B: learn (shell computed inline) -> build -> fit.
|
||||
learn_profile<<<n, threads, 0, *stream>>>(img, d_cx, d_cy, d_d, d_invd2, d_I, d_bkg, d_ok, d_strong,
|
||||
d_shell_grid, d_global_grid, d_shell_n, d_global_n, p, n);
|
||||
build_profiles<<<bragg_engine::N_SHELL + 1, threads, 0, *stream>>>(
|
||||
d_shell_grid, d_global_grid, d_shell_n, d_global_n,
|
||||
d_shell_P, d_global_P, d_shell_sigma2, d_global_sigma2, p);
|
||||
fit<<<n, threads, fit_shared_bytes, *stream>>>(img, d_px_x, d_px_y, d_cx, d_cy, d_d, d_invd2,
|
||||
d_I, d_bkg, d_ok, d_shell_P, d_global_P,
|
||||
d_shell_sigma2, d_global_sigma2, d_shell_n,
|
||||
d_I, d_sigma, d_ok, p, n);
|
||||
}
|
||||
|
||||
cuda_err(cudaMemcpyAsync(h_I.data(), d_I, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
cuda_err(cudaMemcpyAsync(h_sigma.data(), d_sigma, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
cuda_err(cudaMemcpyAsync(h_bkg.data(), d_bkg, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
cuda_err(cudaMemcpyAsync(h_ok.data(), d_ok, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
const bool boxsum_mode = mode == IntegratorMode::BoxSum;
|
||||
if (boxsum_mode) {
|
||||
cuda_err(cudaMemcpyAsync(h_obs_x.data(), d_obs_x, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
cuda_err(cudaMemcpyAsync(h_obs_y.data(), d_obs_y, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
cuda_err(cudaMemcpyAsync(h_has_obs.data(), d_has_obs, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));
|
||||
}
|
||||
cuda_err(cudaStreamSynchronize(*stream));
|
||||
|
||||
for (size_t i = 0; i < npredicted; ++i) {
|
||||
if (!h_ok[i]) continue;
|
||||
results[i].I = h_I[i];
|
||||
results[i].sigma = h_sigma[i];
|
||||
results[i].bkg = h_bkg[i];
|
||||
results[i].ok = true;
|
||||
if (boxsum_mode && h_has_obs[i]) {
|
||||
results[i].observed_x = h_obs_x[i];
|
||||
results[i].observed_y = h_obs_y[i];
|
||||
results[i].has_observed = true;
|
||||
}
|
||||
}
|
||||
return Finalize(predicted, npredicted, results, image_number);
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "BraggIntegrationEngine.h"
|
||||
#include "../indexing/CUDAMemHelpers.h"
|
||||
|
||||
// CUDA engine: reproduces BraggIntegrationEngineCPU up to floating-point precision. Each stage is a
|
||||
// kernel with one CUDA block per reflection cooperating over the small window via shared-memory
|
||||
// reductions (the natural mapping for thousands of independent, tiny per-spot integrations).
|
||||
//
|
||||
// Pipeline (profile modes): reset -> mark_mask -> boxsum -> learn_profile -> build_profiles -> fit
|
||||
// (the resolution shell is computed inline, so there is no separate shell pass). BoxSum mode stops
|
||||
// after boxsum (that pass is the BraggIntegrate2D box integrator and the seed of the profile fit).
|
||||
// The preprocessed image already lives on the device (ImagePreprocessorBufferGPU::getGPUBuffer());
|
||||
// only the per-frame predicted centres are uploaded.
|
||||
class BraggIntegrationEngineGPU : public BraggIntegrationEngine {
|
||||
std::shared_ptr<CudaStream> stream;
|
||||
int threads;
|
||||
size_t fit_shared_bytes;
|
||||
|
||||
size_t capacity = 0; // per-reflection device/host arrays hold at least this many reflections
|
||||
|
||||
// --- per-reflection device arrays (grown by EnsureCapacity) ---
|
||||
CudaDevicePtr<float> d_px_x, d_px_y, d_d;
|
||||
CudaDevicePtr<int> d_cx, d_cy;
|
||||
CudaDevicePtr<float> d_I, d_sigma, d_bkg, d_obs_x, d_obs_y;
|
||||
CudaDevicePtr<uint8_t> d_ok, d_strong, d_has_obs;
|
||||
|
||||
// --- fixed-size device arrays ---
|
||||
// The learning/fit math is single precision: FP64 is heavily throttled on consumer GPUs and the
|
||||
// extraction is Poisson-noise limited, so float reproduces the double CPU path to ~1e-4.
|
||||
CudaDevicePtr<uint8_t> d_mask; // per-pixel r2-disk reflection mask
|
||||
CudaDevicePtr<float> d_shell_grid, d_global_grid; // learned profile accumulators (N_SHELL*GG, GG)
|
||||
CudaDevicePtr<float> d_shell_P, d_global_P; // normalised profiles (empirical mode)
|
||||
CudaDevicePtr<float> d_shell_sigma2, d_global_sigma2;
|
||||
CudaDevicePtr<int> d_shell_n, d_global_n;
|
||||
CudaDevicePtr<unsigned long long> d_invd2; // [min,max] inv-d^2 as monotonic bit patterns
|
||||
|
||||
// --- host staging (copied back once per frame) ---
|
||||
std::vector<float> h_px_x, h_px_y, h_d;
|
||||
std::vector<float> h_I, h_sigma, h_bkg, h_obs_x, h_obs_y;
|
||||
std::vector<uint8_t> h_ok, h_has_obs;
|
||||
|
||||
void EnsureCapacity(size_t n);
|
||||
|
||||
public:
|
||||
BraggIntegrationEngineGPU(const DiffractionExperiment &experiment, std::shared_ptr<CudaStream> stream);
|
||||
std::vector<Reflection> Run(const ImagePreprocessorBuffer &image,
|
||||
const std::vector<Reflection> &predicted, size_t npredicted,
|
||||
int64_t image_number) override;
|
||||
};
|
||||
@@ -3,6 +3,10 @@ ADD_LIBRARY(JFJochBraggIntegration STATIC
|
||||
BraggIntegrate2D.h
|
||||
ProfileIntegrate2D.cpp
|
||||
ProfileIntegrate2D.h
|
||||
BraggIntegrationEngine.cpp
|
||||
BraggIntegrationEngine.h
|
||||
BraggIntegrationEngineCPU.cpp
|
||||
BraggIntegrationEngineCPU.h
|
||||
Regression.h
|
||||
CalcISigma.cpp
|
||||
CalcISigma.h
|
||||
@@ -10,3 +14,8 @@ ADD_LIBRARY(JFJochBraggIntegration STATIC
|
||||
)
|
||||
|
||||
TARGET_LINK_LIBRARIES(JFJochBraggIntegration JFJochCommon)
|
||||
|
||||
IF (JFJOCH_CUDA_AVAILABLE)
|
||||
TARGET_SOURCES(JFJochBraggIntegration PRIVATE ../indexing/CUDAMemHelpers.h
|
||||
BraggIntegrationEngineGPU.cu BraggIntegrationEngineGPU.h)
|
||||
ENDIF()
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include "../common/CUDAWrapper.h"
|
||||
|
||||
#ifdef JFJOCH_USE_CUDA
|
||||
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/BraggIntegrationSettings.h"
|
||||
#include "../common/DetectorSetup.h"
|
||||
#include "../common/DiffractionExperiment.h"
|
||||
#include "../common/Reflection.h"
|
||||
#include "../image_analysis/bragg_integration/BraggIntegrationEngineCPU.h"
|
||||
#include "../image_analysis/bragg_integration/BraggIntegrationEngineGPU.h"
|
||||
#include "../image_analysis/image_preprocessing/ImagePreprocessorBufferGPU.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// A grid of clean Gaussian spots on a flat background, each seeding one predicted reflection.
|
||||
struct Scene {
|
||||
std::vector<int32_t> image;
|
||||
std::vector<Reflection> predicted;
|
||||
size_t width = 0, height = 0;
|
||||
};
|
||||
|
||||
Reflection MakeReflection(float x, float y, float d, int hkl) {
|
||||
Reflection r{};
|
||||
r.h = hkl; r.k = hkl; r.l = hkl;
|
||||
r.predicted_x = x;
|
||||
r.predicted_y = y;
|
||||
r.d = d;
|
||||
r.rlp = 1.0f;
|
||||
r.partiality = 1.0f;
|
||||
return r;
|
||||
}
|
||||
|
||||
Scene BuildScene(size_t width, size_t height, int spacing = 60) {
|
||||
Scene s;
|
||||
s.width = width;
|
||||
s.height = height;
|
||||
s.image.assign(width * height, 12); // flat background
|
||||
|
||||
// A grid of spots, well separated so background rings do not overlap the neighbours' disks.
|
||||
// A spread of intensities (some weak, some very strong) and a spread of d (so several resolution
|
||||
// shells are populated) exercises the strong-spot selection, shell learning and the fit.
|
||||
const int margin = 45;
|
||||
int hkl = 1;
|
||||
for (int gy = 0; margin + gy * spacing < static_cast<int>(height) - margin; ++gy) {
|
||||
for (int gx = 0; margin + gx * spacing < static_cast<int>(width) - margin; ++gx) {
|
||||
const float cx = static_cast<float>(margin + gx * spacing) + 0.3f; // sub-pixel offset
|
||||
const float cy = static_cast<float>(margin + gy * spacing) - 0.2f;
|
||||
const double amp = 150.0 + 60.0 * ((gx * 7 + gy * 13) % 30); // 150..1890
|
||||
const double sigma = 1.3;
|
||||
for (int dy = -6; dy <= 6; ++dy)
|
||||
for (int dx = -6; dx <= 6; ++dx) {
|
||||
const int x = static_cast<int>(std::lround(cx)) + dx;
|
||||
const int y = static_cast<int>(std::lround(cy)) + dy;
|
||||
if (x < 0 || y < 0 || x >= static_cast<int>(width) || y >= static_cast<int>(height)) continue;
|
||||
const double ex = x - cx, ey = y - cy;
|
||||
const double g = amp * std::exp(-(ex * ex + ey * ey) / (2.0 * sigma * sigma));
|
||||
s.image[y * width + x] += static_cast<int32_t>(std::lround(g));
|
||||
}
|
||||
const float d = 1.4f + 0.12f * static_cast<float>((gx + gy) % 12); // 1.4..2.72 A
|
||||
s.predicted.push_back(MakeReflection(cx, cy, d, hkl++));
|
||||
}
|
||||
}
|
||||
|
||||
// A few masked (INT32_MIN) and saturated (INT32_MAX) pixels in background gaps to exercise the
|
||||
// validity rejection in both engines identically.
|
||||
for (int k = 0; k < 20; ++k) {
|
||||
const size_t idx = (static_cast<size_t>(k) * 2654435761u) % s.image.size();
|
||||
s.image[idx] = (k % 2) ? INT32_MIN : INT32_MAX;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
DiffractionExperiment MakeExperiment(IntegratorMode mode, std::optional<float> bandwidth_fwhm,
|
||||
const DetectorSetup &det = DetJF(2)) {
|
||||
DiffractionExperiment experiment(det); // DetJF(2) (small) keeps the correctness test fast
|
||||
experiment.DetectorDistance_mm(100.0f).IncidentEnergy_keV(WVL_1A_IN_KEV)
|
||||
.BeamX_pxl(400.0f).BeamY_pxl(400.0f);
|
||||
experiment.BandwidthFWHM(bandwidth_fwhm);
|
||||
BraggIntegrationSettings settings;
|
||||
settings.Integrator(mode);
|
||||
experiment.ImportBraggIntegrationSettings(settings);
|
||||
return experiment;
|
||||
}
|
||||
|
||||
void CompareCpuVsGpu(IntegratorMode mode, std::optional<float> bandwidth_fwhm) {
|
||||
const DiffractionExperiment experiment = MakeExperiment(mode, bandwidth_fwhm);
|
||||
const size_t width = experiment.GetXPixelsNum();
|
||||
const size_t height = experiment.GetYPixelsNum();
|
||||
const size_t npixel = experiment.GetPixelsNum();
|
||||
REQUIRE(npixel == width * height);
|
||||
|
||||
const Scene scene = BuildScene(width, height);
|
||||
REQUIRE(scene.image.size() == npixel);
|
||||
REQUIRE(scene.predicted.size() > 60);
|
||||
|
||||
// CPU reference
|
||||
ImagePreprocessorBuffer cpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i)
|
||||
cpu_image[i] = scene.image[i];
|
||||
BraggIntegrationEngineCPU cpu(experiment);
|
||||
const auto out_cpu = cpu.Run(cpu_image, scene.predicted, scene.predicted.size(), 5);
|
||||
|
||||
// GPU under test, identical input uploaded to the device
|
||||
auto stream = std::make_shared<CudaStream>();
|
||||
ImagePreprocessorBufferGPU gpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i)
|
||||
gpu_image[i] = scene.image[i];
|
||||
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
|
||||
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
|
||||
BraggIntegrationEngineGPU gpu(experiment, stream);
|
||||
const auto out_gpu = gpu.Run(gpu_image, scene.predicted, scene.predicted.size(), 5);
|
||||
|
||||
// The ok/observed decisions are deterministic geometry, so both engines return the same set in
|
||||
// the same (predicted-index) order. Intensities differ only by float rounding and the unordered
|
||||
// atomic summation of the learned profile, so compare up to a small tolerance.
|
||||
REQUIRE(out_gpu.size() == out_cpu.size());
|
||||
REQUIRE(out_cpu.size() > 40);
|
||||
for (size_t i = 0; i < out_cpu.size(); ++i) {
|
||||
INFO("mode " << static_cast<int>(mode) << " reflection " << i << " hkl " << out_cpu[i].h);
|
||||
CHECK(out_gpu[i].h == out_cpu[i].h);
|
||||
CHECK(out_gpu[i].image_number == out_cpu[i].image_number);
|
||||
CHECK(out_gpu[i].bkg == Catch::Approx(out_cpu[i].bkg).epsilon(0.02).margin(0.5));
|
||||
CHECK(out_gpu[i].I == Catch::Approx(out_cpu[i].I).epsilon(0.03).margin(2.0));
|
||||
CHECK(out_gpu[i].sigma == Catch::Approx(out_cpu[i].sigma).epsilon(0.03).margin(0.5));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_CASE("BraggIntegrationEngineGPU_MatchesCPU") {
|
||||
if (get_gpu_count() == 0) {
|
||||
WARN("No CUDA GPU present. Skipping BraggIntegrationEngineGPU_MatchesCPU");
|
||||
return;
|
||||
}
|
||||
|
||||
SECTION("BoxSum") { CompareCpuVsGpu(IntegratorMode::BoxSum, std::nullopt); }
|
||||
SECTION("ProfileGaussian mono") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, std::nullopt); }
|
||||
SECTION("ProfileGaussian broadband") { CompareCpuVsGpu(IntegratorMode::ProfileGaussian, 0.03f); }
|
||||
SECTION("ProfileEmpirical") { CompareCpuVsGpu(IntegratorMode::ProfileEmpirical, std::nullopt); }
|
||||
}
|
||||
|
||||
// Hidden ([.]) benchmark: the raison d'etre of the GPU port is < 2 ms/frame (vs ~142 ms on the CPU
|
||||
// for ProfileIntegrate2D). Run explicitly with: ./jfjoch_test "[bragg_bench]"
|
||||
TEST_CASE("BraggIntegrationEngineGPU_Benchmark", "[.][bragg_bench]") {
|
||||
if (get_gpu_count() == 0) {
|
||||
WARN("No CUDA GPU present. Skipping benchmark");
|
||||
return;
|
||||
}
|
||||
const DiffractionExperiment experiment = MakeExperiment(IntegratorMode::ProfileGaussian, std::nullopt, DetJF4M());
|
||||
const size_t width = experiment.GetXPixelsNum();
|
||||
const size_t height = experiment.GetYPixelsNum();
|
||||
const size_t npixel = experiment.GetPixelsNum();
|
||||
REQUIRE(npixel == width * height);
|
||||
|
||||
auto stream = std::make_shared<CudaStream>();
|
||||
BraggIntegrationEngineGPU gpu(experiment, stream);
|
||||
for (int spacing : {28, 40, 60, 90}) {
|
||||
const Scene scene = BuildScene(width, height, spacing);
|
||||
const size_t nrefl = scene.predicted.size();
|
||||
|
||||
ImagePreprocessorBufferGPU gpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i) gpu_image[i] = scene.image[i];
|
||||
REQUIRE(cudaMemcpyAsync(gpu_image.getGPUBuffer(), gpu_image.getBuffer().data(),
|
||||
npixel * sizeof(int32_t), cudaMemcpyHostToDevice, *stream) == cudaSuccess);
|
||||
cudaStreamSynchronize(*stream);
|
||||
|
||||
auto run = [&] { return gpu.Run(gpu_image, scene.predicted, nrefl, 0); };
|
||||
for (int i = 0; i < 5; ++i) run(); // warm-up (allocations, JIT)
|
||||
|
||||
const int iters = 100;
|
||||
const auto t0 = std::chrono::steady_clock::now();
|
||||
size_t observed = 0;
|
||||
for (int i = 0; i < iters; ++i) observed += run().size();
|
||||
const auto t1 = std::chrono::steady_clock::now();
|
||||
const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count() / iters;
|
||||
|
||||
BraggIntegrationEngineCPU cpu(experiment);
|
||||
ImagePreprocessorBuffer cpu_image(npixel);
|
||||
for (size_t i = 0; i < npixel; ++i) cpu_image[i] = scene.image[i];
|
||||
const auto c0 = std::chrono::steady_clock::now();
|
||||
const size_t cpu_observed = cpu.Run(cpu_image, scene.predicted, nrefl, 0).size();
|
||||
const auto c1 = std::chrono::steady_clock::now();
|
||||
const double cpu_ms = std::chrono::duration<double, std::milli>(c1 - c0).count();
|
||||
|
||||
WARN(width << "x" << height << " | " << nrefl << " refl (" << observed / iters
|
||||
<< " obs) | GPU " << ms << " ms | CPU " << cpu_ms << " ms (" << cpu_observed
|
||||
<< " obs) | speedup " << cpu_ms / ms << "x");
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -34,6 +34,7 @@ ADD_EXECUTABLE(jfjoch_test
|
||||
ROIMapTest.cpp
|
||||
ROIIntegrationCPUTest.cpp
|
||||
ROIIntegrationGPUTest.cpp
|
||||
BraggIntegrationEngineGPUTest.cpp
|
||||
LossyFilterTest.cpp
|
||||
ImageBufferTest.cpp
|
||||
PixelMaskTest.cpp
|
||||
|
||||
Reference in New Issue
Block a user