Files
Jungfraujoch/image_analysis/bragg_integration/BraggIntegrationEngineCPU.cpp
T
leonarski_fandClaude Opus 4.8 ddddfb6ffc
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
bragg_integration: GPU box + profile-fit integrator (standalone engine)
Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch
profile fit) under one roof as a base + CPU + GPU engine, mirroring the
AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32
ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same
buffer AzIntEngineGPU/ROIIntegrationGPU consume.

The CUDA engine runs one block per reflection with shared-memory
reductions across six kernels (reset, mask, box-sum, profile learning,
profile build, Kabsch fit); the resolution shell is computed inline. The
learning/fit hot path is single precision (FP64 is throttled on consumer
GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame
CUDA API calls into one reset kernel keeps launch-latency overhead low.

Standalone for now: NOT wired into IndexAndRefine. See
BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps.
BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes
(box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench]
perf sweep.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 20:59:45 +02:00

297 lines
14 KiB
C++

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "BraggIntegrationEngineCPU.h"
#include <algorithm>
#include <cmath>
#include <limits>
using namespace bragg_engine;
namespace {
// The preprocessed buffer stores masked/bad pixels as INT32_MIN and saturated as INT32_MAX.
inline bool valid(int32_t v) { return v != INT32_MIN && v != INT32_MAX; }
} // namespace
BraggIntegrationEngineCPU::BraggIntegrationEngineCPU(const DiffractionExperiment &experiment)
: BraggIntegrationEngine(experiment) {}
std::vector<Reflection> BraggIntegrationEngineCPU::Run(const ImagePreprocessorBuffer &image,
const std::vector<Reflection> &predicted,
size_t npredicted, int64_t image_number) {
std::vector<BraggFitResult> results(npredicted);
if (image.size() != npixel || npredicted == 0)
return Finalize(predicted, npredicted, results, image_number);
const int32_t *img = image.data();
const int W = static_cast<int>(xpixel), H = static_cast<int>(ypixel);
const bool do_clip = apply_bkg_clip && mode != IntegratorMode::BoxSum;
auto grid_idx = [this](int dx, int dy) { return (dy + R) * G + (dx + R); };
// --- Reflection mask: mark the r2 signal disk of every predicted reflection so a neighbour's
// disk is excluded from this reflection's r2..r3 background ring. ---
std::vector<uint8_t> refl_mask(npixel, 0);
for (size_t i = 0; i < npredicted; ++i) {
const auto &r = predicted[i];
const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r2 - 1.0f)));
const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r2 + 1.0f)));
const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r2 - 1.0f)));
const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r2 + 1.0f)));
for (int y = y0; y <= y1; ++y)
for (int x = x0; x <= x1; ++x) {
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
if (d2 < r2_sq) refl_mask[y * W + x] = 1;
}
}
// --- Pass A: box-sum every reflection (rough I, background, centroid, strong flag). ---
struct Rough {
double I = 0.0, sigma = NAN, bkg = 0.0, obs_x = 0.0, obs_y = 0.0;
int cx = 0, cy = 0, shell = -1;
bool ok = false, strong = false, has_obs = false;
};
std::vector<Rough> rough(npredicted);
double inv_d2_min = std::numeric_limits<double>::max(), inv_d2_max = 0.0;
for (size_t i = 0; i < npredicted; ++i) {
const auto &r = predicted[i];
Rough out;
const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r3 - 1.0)));
const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r3 + 1.0)));
const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r3 - 1.0)));
const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r3 + 1.0)));
int64_t I_sum = 0, I_sum_x = 0, I_sum_y = 0, n_inner = 0, n_inner_valid = 0;
double bkg_sum = 0.0;
int n_bkg = 0;
for (int y = y0; y <= y1; ++y)
for (int x = x0; x <= x1; ++x) {
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
const int32_t px = img[y * W + x];
if (d2 < r1_sq) {
++n_inner;
if (!valid(px)) continue;
I_sum += px;
I_sum_x += static_cast<int64_t>(x) * px;
I_sum_y += static_cast<int64_t>(y) * px;
++n_inner_valid;
} else if (d2 >= r2_sq && d2 < r3_sq) {
if (refl_mask[y * W + x]) continue;
if (!valid(px)) continue;
bkg_sum += static_cast<double>(px);
++n_bkg;
}
}
if (n_inner_valid == n_inner && n_bkg > 5) {
out.bkg = bkg_sum / n_bkg;
// One high-outlier sigma-clip pass on the background ring (stills-only): reject pixels
// above mean + 3*sqrt(mean) to strip a bandwidth-streaked neighbour that biases the mean.
if (do_clip) {
const double thr = out.bkg + 3.0 * std::sqrt(std::max(out.bkg, 1.0));
double s = 0.0;
int n = 0;
for (int y = y0; y <= y1; ++y)
for (int x = x0; x <= x1; ++x) {
const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
if (!(d2 >= r2_sq && d2 < r3_sq)) continue;
if (refl_mask[y * W + x]) continue;
const int32_t px = img[y * W + x];
if (!valid(px)) continue;
if (static_cast<double>(px) <= thr) { s += px; ++n; }
}
if (n > 5) out.bkg = s / n;
}
out.I = static_cast<double>(I_sum) - static_cast<double>(n_inner) * out.bkg;
out.sigma = std::max(1.0, out.I * min_sigma_ratio);
if (I_sum > 0) {
out.sigma = std::max(out.sigma, std::sqrt(static_cast<double>(I_sum)));
out.obs_x = static_cast<double>(I_sum_x) / static_cast<double>(I_sum);
out.obs_y = static_cast<double>(I_sum_y) / static_cast<double>(I_sum);
out.has_obs = true;
}
out.cx = static_cast<int>(std::lround(r.predicted_x));
out.cy = static_cast<int>(std::lround(r.predicted_y));
out.ok = true;
out.strong = out.sigma > 0.0 && out.I / out.sigma >= STRONG_I_OVER_SIGMA;
if (r.d > 0.0f) {
const double inv_d2 = 1.0 / (static_cast<double>(r.d) * r.d);
inv_d2_min = std::min(inv_d2_min, inv_d2);
inv_d2_max = std::max(inv_d2_max, inv_d2);
}
}
rough[i] = out;
}
// --- BoxSum mode is BraggIntegrate2D: emit the rough result directly. ---
if (mode == IntegratorMode::BoxSum) {
for (size_t i = 0; i < npredicted; ++i) {
const auto &rh = rough[i];
if (!rh.ok) continue;
results[i] = {static_cast<float>(rh.I), static_cast<float>(rh.sigma), static_cast<float>(rh.bkg),
static_cast<float>(rh.obs_x), static_cast<float>(rh.obs_y), true, rh.has_obs};
}
return Finalize(predicted, npredicted, results, image_number);
}
auto shell_of = [&](float d) {
if (!(d > 0.0f) || inv_d2_max <= inv_d2_min) return 0;
const double t = (1.0 / (static_cast<double>(d) * d) - inv_d2_min) / (inv_d2_max - inv_d2_min);
return std::clamp(static_cast<int>(t * N_SHELL), 0, N_SHELL - 1);
};
for (size_t i = 0; i < npredicted; ++i)
if (rough[i].ok) rough[i].shell = shell_of(predicted[i].d);
// --- Learn the profile per shell (+ global) from the strong spots. ---
std::vector<std::vector<double>> shell_grid(N_SHELL, std::vector<double>(GG, 0.0));
std::vector<int> shell_n(N_SHELL, 0);
std::vector<double> global_grid(GG, 0.0);
int global_n = 0;
for (size_t i = 0; i < npredicted; ++i) {
const auto &rh = rough[i];
if (!rh.ok || !rh.strong || rh.I <= 0.0) continue;
for (int dy = -R; dy <= R; ++dy)
for (int dx = -R; dx <= R; ++dx) {
const int x = rh.cx + dx, y = rh.cy + dy;
if (x < 0 || y < 0 || x >= W || y >= H) continue;
const int32_t px = img[y * W + x];
if (!valid(px)) continue;
const double v = (static_cast<double>(px) - rh.bkg) / rh.I;
shell_grid[rh.shell][grid_idx(dx, dy)] += v;
global_grid[grid_idx(dx, dy)] += v;
}
++shell_n[rh.shell];
++global_n;
}
// Isotropic width (2nd moment) of a learned grid: over the r1 disk (monochromatic) or the full
// grid (broadband); <r^2> = 2 sigma^2 in 2D.
auto measure_sigma2 = [&](const std::vector<double> &grid) {
double m2 = 0.0, m2w = 0.0;
for (int dy = -R; dy <= R; ++dy)
for (int dx = -R; dx <= R; ++dx) {
if (!broadband && dx * dx + dy * dy >= r1_sq) continue;
const double g = std::max(0.0, grid[grid_idx(dx, dy)]);
m2 += g * (dx * dx + dy * dy);
m2w += g;
}
return m2w > 0.0 ? std::max(0.25, (m2 / m2w) / 2.0) : 1.0;
};
// Normalised profile (sum = 1): empirical average grid, or an isotropic Gaussian of the measured
// 2nd moment (only used by ProfileEmpirical; ProfileGaussian rebuilds per reflection in Pass B).
auto build_profile = [&](const std::vector<double> &grid, int n) {
std::vector<double> P(GG, 0.0);
if (n <= 0) return P;
double sum = 0.0;
for (int k = 0; k < GG; ++k) {
const double g = std::max(0.0, grid[k]);
sum += g;
if (empirical) P[k] = g;
}
if (sum <= 0.0) return P;
if (empirical) {
for (double &p : P) p /= sum;
} else {
const double sigma2 = measure_sigma2(grid);
double gsum = 0.0;
for (int dy = -R; dy <= R; ++dy)
for (int dx = -R; dx <= R; ++dx) {
const double g = std::exp(-(dx * dx + dy * dy) / (2.0 * sigma2));
P[grid_idx(dx, dy)] = g;
gsum += g;
}
for (double &p : P) p /= gsum;
}
return P;
};
const std::vector<double> global_P = build_profile(global_grid, global_n);
const double global_sigma2 = global_n > 0 ? measure_sigma2(global_grid) : 1.0;
std::vector<std::vector<double>> shell_P(N_SHELL);
std::vector<double> shell_sigma2(N_SHELL, global_sigma2);
for (int s = 0; s < N_SHELL; ++s) {
if (shell_n[s] >= MIN_STRONG_PER_SHELL) {
shell_P[s] = build_profile(shell_grid[s], shell_n[s]);
shell_sigma2[s] = measure_sigma2(shell_grid[s]);
} else {
shell_P[s] = global_P;
}
}
// --- Pass B: profile-fit each reflection (Kabsch, de-biased variance v = B + I*P; iterate). ---
std::vector<double> Pbuf;
for (size_t i = 0; i < npredicted; ++i) {
const auto &rh = rough[i];
if (!rh.ok) continue;
const int sh = rh.shell < 0 ? 0 : rh.shell;
int Rf = R;
const std::vector<double> *Pvec = &shell_P[sh];
if (!empirical) {
const double rx = predicted[i].predicted_x - beam_x, ry = predicted[i].predicted_y - beam_y;
const double Rpx = std::hypot(rx, ry);
const double tan2t = Rpx / F_px;
const double s2t = shell_sigma2[sh];
double s2r = s2t, ux = 1.0, uy = 0.0;
bool elong = false;
if (use_ellipse) {
const double sbw = bw_sigma * Rpx;
const double radial_extra = sbw * sbw + c_radial * tan2t * tan2t;
if (Rpx > 1e-6 && radial_extra > 0.25) {
ux = rx / Rpx; uy = ry / Rpx;
s2r = s2t + radial_extra;
elong = true;
}
}
// Build the Gaussian per reflection, centred on the sub-pixel predicted position and (when
// needed) radially elongated, on a grid grown to hold the streak.
const double fx = predicted[i].predicted_x - rh.cx, fy = predicted[i].predicted_y - rh.cy;
Rf = elong ? std::min(3 * R, static_cast<int>(std::ceil(r2 + 2.0 * std::sqrt(s2r)))) : R;
const int Gf = 2 * Rf + 1;
Pbuf.assign(static_cast<size_t>(Gf) * Gf, 0.0);
double gs = 0.0;
for (int dy = -Rf; dy <= Rf; ++dy)
for (int dx = -Rf; dx <= Rf; ++dx) {
const double ex = dx - fx, ey = dy - fy;
const double rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
const double g = std::exp(-rad * rad / (2.0 * s2r) - tn * tn / (2.0 * s2t));
Pbuf[(dy + Rf) * Gf + (dx + Rf)] = g;
gs += g;
}
for (double &p : Pbuf) p /= gs;
Pvec = &Pbuf;
}
const int Gf = 2 * Rf + 1;
const double B = std::max(rh.bkg, 1.0);
double I = rh.I, den = 0.0;
for (int iter = 0; iter < 4; ++iter) {
double num = 0.0;
den = 0.0;
for (int dy = -Rf; dy <= Rf; ++dy)
for (int dx = -Rf; dx <= Rf; ++dx) {
const double Pp = (*Pvec)[(dy + Rf) * Gf + (dx + Rf)];
if (Pp <= 0.0) continue;
const int x = rh.cx + dx, y = rh.cy + dy;
if (x < 0 || y < 0 || x >= W || y >= H) continue;
const int32_t px = img[y * W + x];
if (!valid(px)) continue;
const double v = B + std::max(0.0, I) * Pp;
num += Pp * (static_cast<double>(px) - rh.bkg) / v;
den += Pp * Pp / v;
}
if (den > 0.0) I = num / den; else break;
}
if (!(den > 0.0)) continue;
results[i] = {static_cast<float>(I), static_cast<float>(std::sqrt(1.0 / den)),
static_cast<float>(rh.bkg), 0.0f, 0.0f, true, false};
}
return Finalize(predicted, npredicted, results, image_number);
}