Jungfraujoch/image_analysis/bragg_integration/BraggIntegrationEngineCPU.cpp

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "BraggIntegrationEngineCPU.h"

#include <algorithm>
#include <cmath>
#include <limits>

using namespace bragg_engine;

namespace {

// The preprocessed buffer stores masked/bad pixels as INT32_MIN and saturated as INT32_MAX.
inline bool valid(int32_t v) { return v != INT32_MIN && v != INT32_MAX; }

}   // namespace

BraggIntegrationEngineCPU::BraggIntegrationEngineCPU(const DiffractionExperiment &experiment)
    : BraggIntegrationEngine(experiment) {}

std::vector<Reflection> BraggIntegrationEngineCPU::Run(const ImagePreprocessorBuffer &image,
                                                       const std::vector<Reflection> &predicted,
                                                       size_t npredicted, int64_t image_number) {
    std::vector<BraggFitResult> results(npredicted);
    if (image.size() != npixel || npredicted == 0)
        return Finalize(predicted, npredicted, results, image_number);

    const int32_t *img = image.data();
    const int W = static_cast<int>(xpixel), H = static_cast<int>(ypixel);
    const bool do_clip = apply_bkg_clip && mode != IntegratorMode::BoxSum;

    auto grid_idx = [this](int dx, int dy) { return (dy + R) * G + (dx + R); };

    // --- Reflection mask: mark the r2 signal disk of every predicted reflection so a neighbour's
    //     disk is excluded from this reflection's r2..r3 background ring. ---
    std::vector<uint8_t> refl_mask(npixel, 0);
    for (size_t i = 0; i < npredicted; ++i) {
        const auto &r = predicted[i];
        const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r2 - 1.0f)));
        const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r2 + 1.0f)));
        const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r2 - 1.0f)));
        const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r2 + 1.0f)));
        for (int y = y0; y <= y1; ++y)
            for (int x = x0; x <= x1; ++x) {
                const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
                if (d2 < r2_sq) refl_mask[y * W + x] = 1;
            }
    }

    // --- Pass A: box-sum every reflection (rough I, background, centroid, strong flag). ---
    struct Rough {
        double I = 0.0, sigma = NAN, bkg = 0.0, obs_x = 0.0, obs_y = 0.0;
        int cx = 0, cy = 0, shell = -1;
        bool ok = false, strong = false, has_obs = false;
    };
    std::vector<Rough> rough(npredicted);
    double inv_d2_min = std::numeric_limits<double>::max(), inv_d2_max = 0.0;

    for (size_t i = 0; i < npredicted; ++i) {
        const auto &r = predicted[i];
        Rough out;
        const int x0 = std::max(0, static_cast<int>(std::floor(r.predicted_x - r3 - 1.0)));
        const int x1 = std::min(W - 1, static_cast<int>(std::ceil(r.predicted_x + r3 + 1.0)));
        const int y0 = std::max(0, static_cast<int>(std::floor(r.predicted_y - r3 - 1.0)));
        const int y1 = std::min(H - 1, static_cast<int>(std::ceil(r.predicted_y + r3 + 1.0)));

        int64_t I_sum = 0, I_sum_x = 0, I_sum_y = 0, n_inner = 0, n_inner_valid = 0;
        double bkg_sum = 0.0;
        int n_bkg = 0;
        for (int y = y0; y <= y1; ++y)
            for (int x = x0; x <= x1; ++x) {
                const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
                const int32_t px = img[y * W + x];
                if (d2 < r1_sq) {
                    ++n_inner;
                    if (!valid(px)) continue;
                    I_sum += px;
                    I_sum_x += static_cast<int64_t>(x) * px;
                    I_sum_y += static_cast<int64_t>(y) * px;
                    ++n_inner_valid;
                } else if (d2 >= r2_sq && d2 < r3_sq) {
                    if (refl_mask[y * W + x]) continue;
                    if (!valid(px)) continue;
                    bkg_sum += static_cast<double>(px);
                    ++n_bkg;
                }
            }

        if (n_inner_valid == n_inner && n_bkg > 5) {
            out.bkg = bkg_sum / n_bkg;
            // One high-outlier sigma-clip pass on the background ring (stills-only): reject pixels
            // above mean + 3*sqrt(mean) to strip a bandwidth-streaked neighbour that biases the mean.
            if (do_clip) {
                const double thr = out.bkg + 3.0 * std::sqrt(std::max(out.bkg, 1.0));
                double s = 0.0;
                int n = 0;
                for (int y = y0; y <= y1; ++y)
                    for (int x = x0; x <= x1; ++x) {
                        const double d2 = (x - r.predicted_x) * (x - r.predicted_x) + (y - r.predicted_y) * (y - r.predicted_y);
                        if (!(d2 >= r2_sq && d2 < r3_sq)) continue;
                        if (refl_mask[y * W + x]) continue;
                        const int32_t px = img[y * W + x];
                        if (!valid(px)) continue;
                        if (static_cast<double>(px) <= thr) { s += px; ++n; }
                    }
                if (n > 5) out.bkg = s / n;
            }
            out.I = static_cast<double>(I_sum) - static_cast<double>(n_inner) * out.bkg;
            out.sigma = std::max(1.0, out.I * min_sigma_ratio);
            if (I_sum > 0) {
                out.sigma = std::max(out.sigma, std::sqrt(static_cast<double>(I_sum)));
                out.obs_x = static_cast<double>(I_sum_x) / static_cast<double>(I_sum);
                out.obs_y = static_cast<double>(I_sum_y) / static_cast<double>(I_sum);
                out.has_obs = true;
            }
            out.cx = static_cast<int>(std::lround(r.predicted_x));
            out.cy = static_cast<int>(std::lround(r.predicted_y));
            out.ok = true;
            out.strong = out.sigma > 0.0 && out.I / out.sigma >= STRONG_I_OVER_SIGMA;
            if (r.d > 0.0f) {
                const double inv_d2 = 1.0 / (static_cast<double>(r.d) * r.d);
                inv_d2_min = std::min(inv_d2_min, inv_d2);
                inv_d2_max = std::max(inv_d2_max, inv_d2);
            }
        }
        rough[i] = out;
    }

    // --- BoxSum mode is BraggIntegrate2D: emit the rough result directly. ---
    if (mode == IntegratorMode::BoxSum) {
        for (size_t i = 0; i < npredicted; ++i) {
            const auto &rh = rough[i];
            if (!rh.ok) continue;
            results[i] = {static_cast<float>(rh.I), static_cast<float>(rh.sigma), static_cast<float>(rh.bkg),
                          static_cast<float>(rh.obs_x), static_cast<float>(rh.obs_y), true, rh.has_obs};
        }
        return Finalize(predicted, npredicted, results, image_number);
    }

    auto shell_of = [&](float d) {
        if (!(d > 0.0f) || inv_d2_max <= inv_d2_min) return 0;
        const double t = (1.0 / (static_cast<double>(d) * d) - inv_d2_min) / (inv_d2_max - inv_d2_min);
        return std::clamp(static_cast<int>(t * N_SHELL), 0, N_SHELL - 1);
    };
    for (size_t i = 0; i < npredicted; ++i)
        if (rough[i].ok) rough[i].shell = shell_of(predicted[i].d);

    // --- Learn the profile per shell (+ global) from the strong spots. ---
    std::vector<std::vector<double>> shell_grid(N_SHELL, std::vector<double>(GG, 0.0));
    std::vector<int> shell_n(N_SHELL, 0);
    std::vector<double> global_grid(GG, 0.0);
    int global_n = 0;
    for (size_t i = 0; i < npredicted; ++i) {
        const auto &rh = rough[i];
        if (!rh.ok || !rh.strong || rh.I <= 0.0) continue;
        for (int dy = -R; dy <= R; ++dy)
            for (int dx = -R; dx <= R; ++dx) {
                const int x = rh.cx + dx, y = rh.cy + dy;
                if (x < 0 || y < 0 || x >= W || y >= H) continue;
                const int32_t px = img[y * W + x];
                if (!valid(px)) continue;
                const double v = (static_cast<double>(px) - rh.bkg) / rh.I;
                shell_grid[rh.shell][grid_idx(dx, dy)] += v;
                global_grid[grid_idx(dx, dy)] += v;
            }
        ++shell_n[rh.shell];
        ++global_n;
    }

    // Isotropic width (2nd moment) of a learned grid: over the r1 disk (monochromatic) or the full
    // grid (broadband); <r^2> = 2 sigma^2 in 2D.
    auto measure_sigma2 = [&](const std::vector<double> &grid) {
        double m2 = 0.0, m2w = 0.0;
        for (int dy = -R; dy <= R; ++dy)
            for (int dx = -R; dx <= R; ++dx) {
                if (!broadband && dx * dx + dy * dy >= r1_sq) continue;
                const double g = std::max(0.0, grid[grid_idx(dx, dy)]);
                m2 += g * (dx * dx + dy * dy);
                m2w += g;
            }
        return m2w > 0.0 ? std::max(0.25, (m2 / m2w) / 2.0) : 1.0;
    };
    // Normalised profile (sum = 1): empirical average grid, or an isotropic Gaussian of the measured
    // 2nd moment (only used by ProfileEmpirical; ProfileGaussian rebuilds per reflection in Pass B).
    auto build_profile = [&](const std::vector<double> &grid, int n) {
        std::vector<double> P(GG, 0.0);
        if (n <= 0) return P;
        double sum = 0.0;
        for (int k = 0; k < GG; ++k) {
            const double g = std::max(0.0, grid[k]);
            sum += g;
            if (empirical) P[k] = g;
        }
        if (sum <= 0.0) return P;
        if (empirical) {
            for (double &p : P) p /= sum;
        } else {
            const double sigma2 = measure_sigma2(grid);
            double gsum = 0.0;
            for (int dy = -R; dy <= R; ++dy)
                for (int dx = -R; dx <= R; ++dx) {
                    const double g = std::exp(-(dx * dx + dy * dy) / (2.0 * sigma2));
                    P[grid_idx(dx, dy)] = g;
                    gsum += g;
                }
            for (double &p : P) p /= gsum;
        }
        return P;
    };

    const std::vector<double> global_P = build_profile(global_grid, global_n);
    const double global_sigma2 = global_n > 0 ? measure_sigma2(global_grid) : 1.0;
    std::vector<std::vector<double>> shell_P(N_SHELL);
    std::vector<double> shell_sigma2(N_SHELL, global_sigma2);
    for (int s = 0; s < N_SHELL; ++s) {
        if (shell_n[s] >= MIN_STRONG_PER_SHELL) {
            shell_P[s] = build_profile(shell_grid[s], shell_n[s]);
            shell_sigma2[s] = measure_sigma2(shell_grid[s]);
        } else {
            shell_P[s] = global_P;
        }
    }

    // --- Pass B: profile-fit each reflection (Kabsch, de-biased variance v = B + I*P; iterate). ---
    std::vector<double> Pbuf;
    for (size_t i = 0; i < npredicted; ++i) {
        const auto &rh = rough[i];
        if (!rh.ok) continue;
        const int sh = rh.shell < 0 ? 0 : rh.shell;

        int Rf = R;
        const std::vector<double> *Pvec = &shell_P[sh];
        if (!empirical) {
            const double rx = predicted[i].predicted_x - beam_x, ry = predicted[i].predicted_y - beam_y;
            const double Rpx = std::hypot(rx, ry);
            const double tan2t = Rpx / F_px;
            const double s2t = shell_sigma2[sh];
            double s2r = s2t, ux = 1.0, uy = 0.0;
            bool elong = false;
            if (use_ellipse) {
                const double sbw = bw_sigma * Rpx;
                const double radial_extra = sbw * sbw + c_radial * tan2t * tan2t;
                if (Rpx > 1e-6 && radial_extra > 0.25) {
                    ux = rx / Rpx; uy = ry / Rpx;
                    s2r = s2t + radial_extra;
                    elong = true;
                }
            }
            // Build the Gaussian per reflection, centred on the sub-pixel predicted position and (when
            // needed) radially elongated, on a grid grown to hold the streak.
            const double fx = predicted[i].predicted_x - rh.cx, fy = predicted[i].predicted_y - rh.cy;
            Rf = elong ? std::min(3 * R, static_cast<int>(std::ceil(r2 + 2.0 * std::sqrt(s2r)))) : R;
            const int Gf = 2 * Rf + 1;
            Pbuf.assign(static_cast<size_t>(Gf) * Gf, 0.0);
            double gs = 0.0;
            for (int dy = -Rf; dy <= Rf; ++dy)
                for (int dx = -Rf; dx <= Rf; ++dx) {
                    const double ex = dx - fx, ey = dy - fy;
                    const double rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
                    const double g = std::exp(-rad * rad / (2.0 * s2r) - tn * tn / (2.0 * s2t));
                    Pbuf[(dy + Rf) * Gf + (dx + Rf)] = g;
                    gs += g;
                }
            for (double &p : Pbuf) p /= gs;
            Pvec = &Pbuf;
        }

        const int Gf = 2 * Rf + 1;
        const double B = std::max(rh.bkg, 1.0);
        double I = rh.I, den = 0.0;
        for (int iter = 0; iter < 4; ++iter) {
            double num = 0.0;
            den = 0.0;
            for (int dy = -Rf; dy <= Rf; ++dy)
                for (int dx = -Rf; dx <= Rf; ++dx) {
                    const double Pp = (*Pvec)[(dy + Rf) * Gf + (dx + Rf)];
                    if (Pp <= 0.0) continue;
                    const int x = rh.cx + dx, y = rh.cy + dy;
                    if (x < 0 || y < 0 || x >= W || y >= H) continue;
                    const int32_t px = img[y * W + x];
                    if (!valid(px)) continue;
                    const double v = B + std::max(0.0, I) * Pp;
                    num += Pp * (static_cast<double>(px) - rh.bkg) / v;
                    den += Pp * Pp / v;
                }
            if (den > 0.0) I = num / den; else break;
        }
        if (!(den > 0.0)) continue;

        results[i] = {static_cast<float>(I), static_cast<float>(std::sqrt(1.0 / den)),
                      static_cast<float>(rh.bkg), 0.0f, 0.0f, true, false};
    }

    return Finalize(predicted, npredicted, results, image_number);
}