Jungfraujoch/image_analysis/pixel_refinement/PixelRefine.cpp

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "PixelRefine.h"

#include <algorithm>
#include <cmath>
#include <limits>
#include <utility>
#include <vector>

#include <Eigen/Dense>
#include <ceres/ceres.h>
#include <ceres/rotation.h>

#include "../geom_refinement/LatticeReduction.h"

namespace {

// Per-pixel observation, in *raw* detector counts (no per-pixel solid-angle or
// polarization correction - same units the "normal" integrator works in; the
// per-reflection polarization correction is applied via ReflGroup::pol).
struct PixelObs {
    double x, y;          // detector pixel coordinate
    double Iobs;          // raw pixel value (signal + background)
    double Ibkg;          // local background estimate (per-shoebox level, raw counts)
    double weight;        // 1/sigma_pixel (used only by the optional orientation refinement)
};

// One reflection together with the pixels of its shoebox.
struct ReflGroup {
    int h, k, l;
    double d;
    double Itrue;             // reference intensity (held fixed)
    double R_bw_sq;           // bandwidth radial-width^2 contribution (0 = monochromatic)
    double pol;               // per-reflection polarization correction (raw = true * pol)
    double Ibkg;              // local flat background (raw counts, constant over the shoebox)
    double predicted_x, predicted_y;
    double R1_eff = 0.0;      // tangential profile width to use (Term 2)
    std::vector<PixelObs> pixels;
};

// Median of a vector (in place, partially reorders it).
double MedianInPlace(std::vector<double> &v) {
    if (v.empty())
        return 0.0;
    const size_t mid = v.size() / 2;
    std::nth_element(v.begin(), v.begin() + mid, v.end());
    if (v.size() % 2 == 1)
        return v[mid];
    const double hi = v[mid];
    std::nth_element(v.begin(), v.begin() + mid - 1, v.begin() + mid);
    return 0.5 * (v[mid - 1] + hi);
}

// Mask marking the *core* (radius `radius`) of every predicted spot, so that the
// local-background sampling of one reflection never picks up a neighbouring
// reflection's signal. Same idea as BraggIntegrate2D::BuildReflectionMask.
std::vector<uint8_t> BuildSpotMask(const std::vector<Reflection> &predicted, int nrefl,
                                   size_t xpixel, size_t ypixel, int radius) {
    std::vector<uint8_t> mask(xpixel * ypixel, 0);
    const double r_sq = static_cast<double>(radius) * radius;
    for (int i = 0; i < nrefl; ++i) {
        const auto &r = predicted[i];
        const int cx = static_cast<int>(std::lround(r.predicted_x));
        const int cy = static_cast<int>(std::lround(r.predicted_y));
        const int x0 = std::max(0, cx - radius);
        const int x1 = std::min<int>(static_cast<int>(xpixel) - 1, cx + radius);
        const int y0 = std::max(0, cy - radius);
        const int y1 = std::min<int>(static_cast<int>(ypixel) - 1, cy + radius);
        for (int y = y0; y <= y1; ++y) {
            for (int x = x0; x <= x1; ++x) {
                const double dx = x - r.predicted_x;
                const double dy = y - r.predicted_y;
                if (dx * dx + dy * dy <= r_sq)
                    mask[static_cast<size_t>(xpixel) * y + x] = 1;
            }
        }
    }
    return mask;
}

// Square shoebox bounds (inclusive) around a predicted spot, clamped to the
// detector. The centre is rounded to the nearest pixel with std::lround so the
// signal box is centred identically to the spot-core mask (BuildSpotMask) and
// the local-background ring (EstimateLocalBackground), which also lround.
struct ShoeboxBox { int min_x, max_x, min_y, max_y; };
ShoeboxBox ShoeboxBounds(double px, double py, int radius, size_t xpixel, size_t ypixel) {
    const int cx = static_cast<int>(std::lround(px));
    const int cy = static_cast<int>(std::lround(py));
    return {
        std::max(cx - radius, 0),
        std::min<int>(cx + radius, static_cast<int>(xpixel) - 1),
        std::max(cy - radius, 0),
        std::min<int>(cy + radius, static_cast<int>(ypixel) - 1)
    };
}

// Local flat background around one shoebox, in raw detector counts. Samples the
// square ring shoebox_radius < max(|dx|,|dy|) <= bkg_outer_radius centred on the
// spot, dropping pixels that belong to any spot core (spot_mask) or carry a
// masked/saturated sentinel, and returns their MEAN. (The median, used previously,
// sits below the mean for a skewed background and so biases the subtracted intensity
// positive.) Mirrors the local-background region of BraggIntegrate2D.
template<class T>
bool EstimateLocalBackground(const T *image,
                             const std::vector<uint8_t> &spot_mask,
                             size_t xpixel, size_t ypixel,
                             double cx, double cy,
                             int shoebox_radius, int bkg_outer_radius,
                             double &bkg_mean) {
    const int icx = static_cast<int>(std::lround(cx));
    const int icy = static_cast<int>(std::lround(cy));
    const int x0 = std::max(0, icx - bkg_outer_radius);
    const int x1 = std::min<int>(static_cast<int>(xpixel) - 1, icx + bkg_outer_radius);
    const int y0 = std::max(0, icy - bkg_outer_radius);
    const int y1 = std::min<int>(static_cast<int>(ypixel) - 1, icy + bkg_outer_radius);

    std::vector<double> vals;
    vals.reserve(static_cast<size_t>((x1 - x0 + 1) * (y1 - y0 + 1)));
    for (int y = y0; y <= y1; ++y) {
        for (int x = x0; x <= x1; ++x) {
            // Skip the square shoebox core: that is signal, not background.
            if (std::abs(x - icx) <= shoebox_radius && std::abs(y - icy) <= shoebox_radius)
                continue;
            const size_t np = static_cast<size_t>(xpixel) * y + x;
            if (spot_mask[np])
                continue;
            const T raw = image[np];
            if (raw == std::numeric_limits<T>::max())
                continue;
            if (std::is_signed_v<T> && raw == std::numeric_limits<T>::min())
                continue;
            vals.push_back(static_cast<double>(raw));
        }
    }

    if (vals.size() < 5)
        return false;

    // Mean background, NOT median. The median of a right-skewed (Poisson) background sits
    // below the mean, so subtracting it under-subtracts and biases every weak integrated
    // intensity positive - which averages up over multiplicity into fake <I/sig> in the
    // no-signal high-resolution shells. The mean is unbiased; modern fast detectors put
    // few zingers in the ring, so it is not robustified here.
    double sum = 0.0;
    for (const double v : vals)
        sum += v;
    bkg_mean = sum / static_cast<double>(vals.size());
    return true;
}

// Per-pixel: map a detector pixel through the current geometry into the
// reference reciprocal frame. Cheap (a few trig + one rotation); depends on the
// pixel and the detector geometry, not on the lattice.
template<typename T>
void ObservedRecip(const T *beam, const T *distance_mm, const T *detector_rot,
                   double obs_x, double obs_y,
                   double pixel_size, double inv_lambda,
                   Eigen::Matrix<T, 3, 1> &e_obs_recip) {
    // PyFAI convention (left-handed for rot1/rot2): rot3 = 0 assumed.
    const T c1 = ceres::cos(detector_rot[0]);
    const T s1 = ceres::sin(detector_rot[0]);
    const T c2 = ceres::cos(detector_rot[1]);
    const T s2 = ceres::sin(detector_rot[1]);

    const T det_x = (T(obs_x) - beam[0]) * T(pixel_size);
    const T det_y = (T(obs_y) - beam[1]) * T(pixel_size);
    const T det_z = T(distance_mm[0]);

    const T t1_x = c1 * det_x + s1 * det_z;
    const T t1_y = det_y;
    const T t1_z = -s1 * det_x + c1 * det_z;

    const T x = t1_x;
    const T y = c2 * t1_y + s2 * t1_z;
    const T z = -s2 * t1_y + c2 * t1_z;

    const T inv_norm = T(1) / ceres::sqrt(x * x + y * y + z * z);

    T recip_raw[3] = {
        x * inv_norm * T(inv_lambda),
        y * inv_norm * T(inv_lambda),
        (z * inv_norm - T(1.0)) * T(inv_lambda)
    };
    e_obs_recip = Eigen::Matrix<T, 3, 1>(recip_raw[0], recip_raw[1], recip_raw[2]);
}

// Per-reflection: predicted node g_hkl, |g_hkl|^2, and the Ewald-sphere normal.
// This is the expensive part (symmetry-aware B matrix, three rotations, cross
// products) - it depends only on the lattice (p0,p1,p2) and hkl, so for a whole
// shoebox it can be computed once. Convention identical to XtalOptimizer.
template<typename T>
bool PredictedNode(const T *p0, const T *p1, const T *p2,
                   double exp_h, double exp_k, double exp_l,
                   gemmi::CrystalSystem symmetry, double inv_lambda,
                   Eigen::Matrix<T, 3, 1> &e_pred_recip,
                   Eigen::Matrix<T, 3, 1> &n_radial, T &q_sq) {
    Eigen::Matrix<T, 3, 1> e_uc_len = Eigen::Matrix<T, 3, 1>::Zero();
    Eigen::Matrix<T, 3, 3> Bmat = Eigen::Matrix<T, 3, 3>::Identity();

    if (symmetry == gemmi::CrystalSystem::Hexagonal) {
        e_uc_len << p1[0], p1[0], p1[2];
        Bmat(0, 1) = T(-0.5);
        Bmat(1, 1) = T(sqrt(3.0) / 2.0);
    } else if (symmetry == gemmi::CrystalSystem::Orthorhombic) {
        e_uc_len << p1[0], p1[1], p1[2];
    } else if (symmetry == gemmi::CrystalSystem::Tetragonal) {
        e_uc_len << p1[0], p1[0], p1[2];
    } else if (symmetry == gemmi::CrystalSystem::Cubic) {
        e_uc_len << p1[0], p1[0], p1[0];
    } else if (symmetry == gemmi::CrystalSystem::Monoclinic) {
        e_uc_len << p1[0], p1[1], p1[2];
        Bmat(0, 2) = ceres::cos(p2[0]);
        Bmat(2, 2) = ceres::sin(p2[0]);
    } else {
        const T ca = ceres::cos(p2[0]);
        const T cb = ceres::cos(p2[1]);
        const T cg = ceres::cos(p2[2]);
        const T sg = ceres::sin(p2[2]);

        e_uc_len << p1[0], p1[1], p1[2];

        Bmat(0, 1) = cg;
        Bmat(1, 1) = sg;

        const T cx = cb;
        const T cy = (ca - cb * cg) / sg;
        const T v = T(1) - cx * cx - cy * cy;
        const T cz = (v >= T(0)) ? ceres::sqrt(v) : T(0);

        Bmat(0, 2) = cx;
        Bmat(1, 2) = cy;
        Bmat(2, 2) = cz;
    }

    const T L0 = e_uc_len[0];
    const T L1 = e_uc_len[1];
    const T L2 = e_uc_len[2];

    T col0_unrot[3] = {Bmat(0, 0) * L0, Bmat(1, 0) * L0, Bmat(2, 0) * L0};
    T col1_unrot[3] = {Bmat(0, 1) * L1, Bmat(1, 1) * L1, Bmat(2, 1) * L1};
    T col2_unrot[3] = {Bmat(0, 2) * L2, Bmat(1, 2) * L2, Bmat(2, 2) * L2};

    T col0_rot[3], col1_rot[3], col2_rot[3];
    ceres::AngleAxisRotatePoint(p0, col0_unrot, col0_rot);
    ceres::AngleAxisRotatePoint(p0, col1_unrot, col1_rot);
    ceres::AngleAxisRotatePoint(p0, col2_unrot, col2_rot);

    const Eigen::Matrix<T, 3, 1> A(col0_rot[0], col0_rot[1], col0_rot[2]);
    const Eigen::Matrix<T, 3, 1> Bv(col1_rot[0], col1_rot[1], col1_rot[2]);
    const Eigen::Matrix<T, 3, 1> C(col2_rot[0], col2_rot[1], col2_rot[2]);

    const Eigen::Matrix<T, 3, 1> BxC = Bv.cross(C);
    const Eigen::Matrix<T, 3, 1> CxA = C.cross(A);
    const Eigen::Matrix<T, 3, 1> AxB = A.cross(Bv);

    const T Vol = A.dot(BxC);
    if (ceres::abs(Vol) < T(1e-12))
        return false;
    const T invV = T(1) / Vol;

    e_pred_recip = (BxC * T(exp_h) + CxA * T(exp_k) + AxB * T(exp_l)) * invV;
    q_sq = e_pred_recip.squaredNorm();

    // Ewald sphere centre at -k_i = (0,0,-inv_lambda); radial normal at g_hkl.
    const Eigen::Matrix<T, 3, 1> S_pred(
        e_pred_recip[0],
        e_pred_recip[1],
        e_pred_recip[2] + T(inv_lambda));
    const T S_pred_norm = S_pred.norm();
    if (S_pred_norm < T(1e-10))
        return false;

    n_radial = S_pred / S_pred_norm;
    return true;
}

// Geometry terms for one shoebox pixel under a FIXED geometry (PixelRefine no
// longer refines geometry, so this is a plain double evaluation, not a Ceres cost):
//   q_sq        = |g_hkl|^2                 (predicted node, for the B-factor)
//   eps_radial  = deviation along the Ewald normal (the partiality direction)
//   eps_tang_sq = squared deviation in the tangent plane (the profile direction)
bool GeometryProbe(double obs_x, double obs_y, double lambda, double pixel_size,
                   int h, int k, int l, gemmi::CrystalSystem symmetry,
                   const double beam[2], double dist_mm, const double detector_rot[2],
                   const double p0[3], const double p1[3], const double p2[3],
                   double &q_sq, double &eps_radial, double &eps_tang_sq) {
    const double inv_lambda = 1.0 / lambda;
    Eigen::Vector3d e_obs;
    ObservedRecip<double>(beam, &dist_mm, detector_rot, obs_x, obs_y, pixel_size, inv_lambda, e_obs);

    Eigen::Vector3d e_pred, n_radial;
    if (!PredictedNode<double>(p0, p1, p2, h, k, l, symmetry, inv_lambda, e_pred, n_radial, q_sq))
        return false;

    const Eigen::Vector3d delta_q = e_obs - e_pred;
    eps_radial = delta_q.dot(n_radial);
    eps_tang_sq = (delta_q - eps_radial * n_radial).squaredNorm();
    return true;
}

// Pulls a scalar parameter towards an expected value with a fixed weight (the
// data-scaled prior). Identical in spirit to ScaleOnTheFly's regularizer: it is what
// keeps the per-image scale G from wandering on weakly-constrained images and
// scrambling the cross-image merge.
struct ScalarRegularizer {
    ScalarRegularizer(double weight, double expected) : weight(weight), expected(expected) {}
    template<typename T>
    bool operator()(const T *p, T *residual) const {
        residual[0] = T(weight) * (p[0] - T(expected));
        return true;
    }
    double weight;
    double expected;
};

// Term 1 of the factored likelihood (FACTORED_MODEL.md): the per-reflection
// *intensity* (0th-moment) residual. The profile-fit amplitude J should equal the
// scaled reference J_model = G * exp(-B/4d^2) * partiality * pol * I_ref. One scalar
// residual per reflection, weighted by the model-expected (Fisher) sigma_J. This is
// the scaling residual - integration and scaling become one objective, and the empty
// pixels (which make no residual of their own) stop dominating the fit. Geometry is
// fixed, so J, partiality and sigma_J are constants and only G and B are free.
struct IntensityResidual {
    IntensityResidual(double J, double sigma_J, double partiality, double pol,
                      double I_ref, double inv_4d2)
        : J(J), inv_sigma(1.0 / sigma_J), partiality(partiality), pol(pol),
          I_ref(I_ref), inv_4d2(inv_4d2) {}
    template<typename T>
    bool operator()(const T *const G, const T *const B, T *residual) const {
        const T B_term = ceres::exp(-B[0] * T(inv_4d2));
        const T J_model = G[0] * B_term * T(partiality) * T(pol) * T(I_ref);
        residual[0] = (J_model - T(J)) * T(inv_sigma);
        return true;
    }
    double J, inv_sigma, partiality, pol, I_ref, inv_4d2;
};

// Per-shoebox per-pixel forward-model cost (raw counts), used ONLY by the optional
// orientation refinement: I_pred = G*Itrue*B_term*P_radial*P_tang*pol + Ibkg. The
// expensive node geometry is computed once per reflection; the cheap ObservedRecip +
// Gaussian profile run per pixel. Shares ObservedRecip/PredictedNode with GeometryProbe.
struct ShoeboxResidual {
    ShoeboxResidual(const ReflGroup &g, double lambda, double pixel_size,
                    gemmi::CrystalSystem symmetry)
        : pixels(g.pixels), Itrue(g.Itrue), R_bw_sq(g.R_bw_sq), pol(g.pol),
          exp_h(g.h), exp_k(g.k), exp_l(g.l),
          inv_lambda(1.0 / lambda), pixel_size(pixel_size), symmetry(symmetry) {}
    template<typename T>
    bool operator()(const T *const *params, T *residual) const {
        // 0 beam 1 dist 2 detector_rot 3 p0 4 p1 5 p2 6 scale 7 B 8 R
        const T *beam = params[0]; const T *distance_mm = params[1]; const T *detector_rot = params[2];
        const T *p0 = params[3]; const T *p1 = params[4]; const T *p2 = params[5];
        const T *scale_factor = params[6]; const T *B = params[7]; const T *R = params[8];
        if (R[0] < T(1e-10) || R[1] < T(1e-10))
            return false;
        Eigen::Matrix<T, 3, 1> e_pred_recip, n_radial;
        T q_sq;
        if (!PredictedNode(p0, p1, p2, exp_h, exp_k, exp_l, symmetry, inv_lambda,
                           e_pred_recip, n_radial, q_sq))
            return false;
        const T B_term = ceres::exp(-B[0] * q_sq / T(4.0));
        const T R0_eff_sq = R[0] * R[0] + T(R_bw_sq);
        for (size_t i = 0; i < pixels.size(); ++i) {
            const PixelObs &obs = pixels[i];
            Eigen::Matrix<T, 3, 1> e_obs_recip;
            ObservedRecip(beam, distance_mm, detector_rot, obs.x, obs.y, pixel_size, inv_lambda, e_obs_recip);
            const Eigen::Matrix<T, 3, 1> delta_q = e_obs_recip - e_pred_recip;
            const T eps_radial = delta_q.dot(n_radial);
            const T eps_tang_sq = (delta_q - eps_radial * n_radial).squaredNorm();
            const T P_radial = ceres::exp(-eps_radial * eps_radial / R0_eff_sq);
            const T P_tang = ceres::exp(-eps_tang_sq / (R[1] * R[1])) / (T(M_PI) * R[1] * R[1]);
            const T signal = scale_factor[0] * T(Itrue) * B_term * P_radial * P_tang * T(pol);
            residual[i] = (signal + T(obs.Ibkg) - T(obs.Iobs)) * T(obs.weight);
        }
        return true;
    }
    std::vector<PixelObs> pixels;
    const double Itrue, R_bw_sq, pol;
    const double exp_h, exp_k, exp_l;
    const double inv_lambda, pixel_size;
    gemmi::CrystalSystem symmetry;
};

// Anchors the orientation (angle-axis vector) to its prior with a data-scaled weight, so
// the per-image fit can only make a small, signal-supported sub-spot correction.
struct OrientationRegularizer {
    OrientationRegularizer(double weight, const double prior[3]) : weight(weight) {
        for (int i = 0; i < 3; ++i)
            prior_[i] = prior[i];
    }
    template<typename T>
    bool operator()(const T *p0, T *residual) const {
        for (int i = 0; i < 3; ++i)
            residual[i] = T(weight) * (p0[i] - T(prior_[i]));
        return true;
    }
    double weight;
    double prior_[3];
};

} // namespace

PixelRefine::PixelRefine(const DiffractionExperiment &experiment,
                         const std::vector<MergedReflection> &reference)
    : xpixel(experiment.GetXPixelsNum()),
      ypixel(experiment.GetYPixelsNum()),
      experiment(experiment),
      hkl_key_generator(experiment.GetScalingSettings().GetMergeFriedel(),
                        experiment.GetSpaceGroupNumber().value_or(1)) {
    for (const auto &ref: reference)
        reference_data[hkl_key_generator(ref)] = ref.I;
}

void PixelRefine::BuildParameterBlocks(const PixelRefineData &data,
                                       double beam[2], double &dist_mm,
                                       double detector_rot[2],
                                       double latt_vec0[3], double latt_vec1[3], double latt_vec2[3]) const {
    beam[0] = data.geom.GetBeamX_pxl();
    beam[1] = data.geom.GetBeamY_pxl();
    dist_mm = data.geom.GetDetectorDistance_mm();
    detector_rot[0] = data.geom.GetPoniRot1_rad();
    detector_rot[1] = data.geom.GetPoniRot2_rad();

    for (int i = 0; i < 3; ++i)
        latt_vec0[i] = latt_vec1[i] = latt_vec2[i] = 0.0;

    double beta = data.latt.GetUnitCell().beta;
    switch (data.crystal_system) {
        case gemmi::CrystalSystem::Orthorhombic:
            LatticeToRodriguesAndLengths_GS(data.latt, latt_vec0, latt_vec1);
            break;
        case gemmi::CrystalSystem::Tetragonal:
            LatticeToRodriguesAndLengths_GS(data.latt, latt_vec0, latt_vec1);
            latt_vec1[0] = (latt_vec1[0] + latt_vec1[1]) / 2.0;
            break;
        case gemmi::CrystalSystem::Cubic:
            LatticeToRodriguesAndLengths_GS(data.latt, latt_vec0, latt_vec1);
            latt_vec1[0] = (latt_vec1[0] + latt_vec1[1] + latt_vec1[2]) / 3.0;
            break;
        case gemmi::CrystalSystem::Hexagonal:
            LatticeToRodriguesAndLengths_Hex(data.latt, latt_vec0, latt_vec1);
            break;
        case gemmi::CrystalSystem::Monoclinic:
            LatticeToRodriguesLengthsBeta_Mono(data.latt, latt_vec0, latt_vec1, beta);
            latt_vec2[0] = beta;
            break;
        default: {
            LatticeToRodriguesAndLengths_GS(data.latt, latt_vec0, latt_vec1);
            const auto uc = data.latt.GetUnitCell();
            latt_vec2[0] = uc.alpha * M_PI / 180.0;
            latt_vec2[1] = uc.beta * M_PI / 180.0;
            latt_vec2[2] = uc.gamma * M_PI / 180.0;
            break;
        }
    }
}

template<class T>
void PixelRefine::Run(const T *image,
                      BraggPrediction &prediction,
                      PixelRefineData &data) {
    data.solved = false;
    data.reflections.clear();

    const double lambda = data.geom.GetWavelength_A();
    const double pixel_size = data.geom.GetPixelSize_mm();

    const BraggPredictionSettings settings_prediction{
        .high_res_A = experiment.GetBraggIntegrationSettings().GetDMinLimit_A(),
        .ewald_dist_cutoff = static_cast<float>(data.ewald_dist_cutoff),
        .max_hkl = 100,
        .centering = data.centering,
        .bandwidth_sigma = static_cast<float>(data.bandwidth)  // relative Δλ/λ sigma
    };

    const int radius = data.shoebox_radius;
    const int bkg_outer_radius = std::max(radius + 1, data.bkg_outer_radius);

    // Per-reflection polarization correction (raw = true * pol), evaluated once at
    // the predicted spot - same handling as BraggIntegrate2D. Identity if disabled.
    const auto pol_factor = experiment.GetPolarizationFactor();
    auto polarization = [&](double x, double y) -> double {
        if (!pol_factor)
            return 1.0;
        return data.geom.CalcAzIntPolarizationCorr(static_cast<float>(x), static_cast<float>(y),
                                                   pol_factor.value());
    };

    // Bandwidth radial-width^2 (in the code's R = sqrt(2)*sigma convention):
    //   R_bw^2 = (b*lambda)^2 / (2 d^4),   b = relative bandwidth (sigma).
    // A fixed per-reflection constant; data.bandwidth == 0 -> monochromatic no-op.
    const double bw = data.bandwidth;
    auto bandwidth_radial_sq = [&](double d) -> double {
        if (bw <= 0.0 || d <= 0.0)
            return 0.0;
        const double bl = bw * lambda;
        return bl * bl / (2.0 * d * d * d * d);
    };

    // Geometry is FIXED here: orientation/cell/detector were already refined upstream
    // by XtalOptimizer (IndexAndRefine::RefineGeometryIfNeeded). PixelRefine is an
    // intensity-only operation - it predicts shoeboxes with this geometry, measures the
    // tangential profile width, and fits the per-image scale G (and B) to the reference.
    double beam[2], dist_mm, detector_rot[2];
    double latt_vec0[3], latt_vec1[3], latt_vec2[3];
    BuildParameterBlocks(data, beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2);

    // ---- 1-2. Predict shoeboxes + collect pixels (a lambda so it can be re-run after
    //           an orientation refinement moves the predicted positions). ----------
    // A spot-core mask over ALL predictions keeps each reflection's background ring from
    // picking up a neighbour's signal. Pixels carry a fit weight (background-limited
    // variance, signal-weighted toward the predicted centre) used only by the optional
    // orientation refinement - the factored integration weights by v = Ibkg directly.
    std::vector<ReflGroup> groups;
    auto build_groups = [&]() {
        groups.clear();
        DiffractionExperiment exp_iter = experiment;
        exp_iter.BeamX_pxl(data.geom.GetBeamX_pxl())
                .BeamY_pxl(data.geom.GetBeamY_pxl())
                .DetectorDistance_mm(data.geom.GetDetectorDistance_mm())
                .PoniRot1_rad(data.geom.GetPoniRot1_rad())
                .PoniRot2_rad(data.geom.GetPoniRot2_rad());
        const int nrefl = prediction.Calc(exp_iter, data.latt, settings_prediction);
        const auto &predicted = prediction.GetReflections();
        const auto spot_mask = BuildSpotMask(predicted, nrefl, xpixel, ypixel, radius);
        for (int ri = 0; ri < nrefl; ++ri) {
            const auto &refl = predicted[ri];
            const auto hkl = hkl_key_generator(refl);
            if (!reference_data.contains(hkl))
                continue;
            double Ibkg = 0.0;
            if (!EstimateLocalBackground(image, spot_mask, xpixel, ypixel,
                                         refl.predicted_x, refl.predicted_y,
                                         radius, bkg_outer_radius, Ibkg))
                continue;
            ReflGroup g;
            g.h = refl.h;
            g.k = refl.k;
            g.l = refl.l;
            g.d = refl.d;
            g.Itrue = reference_data[hkl];
            g.R_bw_sq = bandwidth_radial_sq(refl.d);
            g.pol = polarization(refl.predicted_x, refl.predicted_y);
            g.Ibkg = Ibkg;
            g.predicted_x = refl.predicted_x;
            g.predicted_y = refl.predicted_y;
            const auto box = ShoeboxBounds(refl.predicted_x, refl.predicted_y, radius, xpixel, ypixel);
            for (int y = box.min_y; y <= box.max_y; ++y) {
                for (int x = box.min_x; x <= box.max_x; ++x) {
                    const size_t npixel = xpixel * y + x;
                    if (image[npixel] == std::numeric_limits<T>::max())
                        continue;
                    if (std::is_signed_v<T> && (image[npixel] == std::numeric_limits<T>::min()))
                        continue;
                    double weight = 1.0 / std::sqrt(std::max(Ibkg, 1.0));
                    if (data.fit_signal_sigma_pix > 0.0) {
                        const double dx = x - refl.predicted_x, dy = y - refl.predicted_y;
                        const double s2 = data.fit_signal_sigma_pix * data.fit_signal_sigma_pix;
                        weight *= std::exp(-0.5 * (dx * dx + dy * dy) / s2);
                    }
                    g.pixels.push_back({static_cast<double>(x), static_cast<double>(y),
                                        static_cast<double>(image[npixel]), Ibkg, weight});
                }
            }
            if (!g.pixels.empty())
                groups.push_back(std::move(g));
        }
    };
    build_groups();
    if (groups.empty())
        return;

    // ---- Optional per-image orientation refinement (off by default) -----------------
    // The pre-factored per-pixel step: refine the orientation (and a nuisance scale)
    // against the shoebox pixels via ShoeboxResidual, regularised to the spot-centroid
    // orientation, then re-predict. Dropped from the factored model (geometry fixed to
    // XtalOptimizer); behind PixelRefineData::refine_orientation to A/B its R-free effect.
    if (data.refine_orientation) {
        const double orient_prior[3] = {latt_vec0[0], latt_vec0[1], latt_vec0[2]};
        ceres::Problem oprob;
        size_t npix = 0;
        for (const auto &g : groups) {
            auto *cost = new ceres::DynamicAutoDiffCostFunction<ShoeboxResidual>(
                new ShoeboxResidual(g, lambda, pixel_size, data.crystal_system));
            for (int b : {2, 1, 2, 3, 3, 3, 1, 1, 2})
                cost->AddParameterBlock(b);
            cost->SetNumResiduals(static_cast<int>(g.pixels.size()));
            oprob.AddResidualBlock(cost, nullptr, beam, &dist_mm, detector_rot,
                                   latt_vec0, latt_vec1, latt_vec2,
                                   &data.scale_factor, &data.B_factor, data.R);
            npix += g.pixels.size();
        }
        // Refine only orientation (latt_vec0) + the nuisance scale G; everything else fixed.
        oprob.SetParameterBlockConstant(beam);
        oprob.SetParameterBlockConstant(&dist_mm);
        oprob.SetParameterBlockConstant(detector_rot);
        oprob.SetParameterBlockConstant(latt_vec1);
        oprob.SetParameterBlockConstant(latt_vec2);
        oprob.SetParameterBlockConstant(&data.B_factor);
        oprob.SetParameterBlockConstant(data.R);
        oprob.SetParameterLowerBound(&data.scale_factor, 0, 0.0);
        if (data.orient_reg_sigma_deg > 0.0 && npix > 0) {
            const double sigma_rad = std::max(data.orient_reg_sigma_deg * M_PI / 180.0, 1e-9);
            const double w = std::sqrt(static_cast<double>(npix)) / sigma_rad;
            oprob.AddResidualBlock(new ceres::AutoDiffCostFunction<OrientationRegularizer, 3, 3>(
                new OrientationRegularizer(w, orient_prior)), nullptr, latt_vec0);
        }
        if (data.scale_reg_sigma > 0.0) {
            const double w = std::sqrt(static_cast<double>(groups.size()) / data.scale_reg_sigma);
            oprob.AddResidualBlock(new ceres::AutoDiffCostFunction<ScalarRegularizer, 1, 1>(
                new ScalarRegularizer(w, 1.0)), nullptr, &data.scale_factor);
        }
        ceres::Solver::Options oopt;
        oopt.linear_solver_type = ceres::DENSE_QR;
        oopt.logging_type = ceres::LoggingType::SILENT;
        oopt.minimizer_progress_to_stdout = false;
        oopt.max_solver_time_in_seconds = data.max_time_s;
        oopt.num_threads = 1;
        ceres::Solver::Summary osum;
        ceres::Solve(oopt, &oprob, &osum);

        // Write the refined orientation back into data.latt (cell held at latt_vec1/2),
        // then re-predict + rebuild groups at the new orientation. Scale is reset; Term 1
        // re-fits it properly below.
        switch (data.crystal_system) {
            case gemmi::CrystalSystem::Tetragonal:
                latt_vec1[1] = latt_vec1[0];
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1, M_PI/2, M_PI/2, M_PI/2);
                break;
            case gemmi::CrystalSystem::Cubic:
                latt_vec1[1] = latt_vec1[0];
                latt_vec1[2] = latt_vec1[0];
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1, M_PI/2, M_PI/2, M_PI/2);
                break;
            case gemmi::CrystalSystem::Hexagonal:
                latt_vec1[1] = latt_vec1[0];
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1, M_PI/2, M_PI/2, 2.0*M_PI/3.0);
                break;
            case gemmi::CrystalSystem::Monoclinic:
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1, M_PI/2, latt_vec2[0], M_PI/2);
                break;
            case gemmi::CrystalSystem::Orthorhombic:
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1, M_PI/2, M_PI/2, M_PI/2);
                break;
            default:
                data.latt = AngleAxisAndCellToLattice(latt_vec0, latt_vec1,
                                                      latt_vec2[0], latt_vec2[1], latt_vec2[2]);
                break;
        }
        data.scale_factor = 1.0;
        build_groups();
        if (groups.empty())
            return;
    }

    // ---- 3. Term 2: per-resolution tangential profile width R1 ----------------
    // R1 = sqrt(2*<eps_t^2>) from the intensity-weighted tangential second moment of
    // the strong spots, binned by resolution (low res small spots, high res larger).
    // A *shape* statistic, normalised by the total intensity, so it is decoupled from
    // the per-image scale - which is what makes measuring it (rather than fitting it,
    // where it is degenerate with G) stable. Weak spots fall back to the global R[1].
    for (auto &g : groups)
        g.R1_eff = data.R[1];
    {
        constexpr int n_bins = 6;
        const double box_px = (2.0 * radius + 1.0) * (2.0 * radius + 1.0);
        double s2min = 1e30, s2max = 0.0;
        for (const auto &g : groups) {
            const double s2 = 1.0 / (g.d * g.d);
            s2min = std::min(s2min, s2);
            s2max = std::max(s2max, s2);
        }
        const double span = std::max(s2max - s2min, 1e-12);
        auto bin_of = [&](double d) {
            return std::clamp(static_cast<int>((1.0 / (d * d) - s2min) / span * n_bins), 0, n_bins - 1);
        };
        std::vector<std::vector<double>> bin_M2(n_bins);
        for (const auto &g : groups) {
            double sw = 0.0, sw_et2 = 0.0;
            for (const auto &px : g.pixels) {
                double q_sq, eps_r, eps_t_sq;
                if (!GeometryProbe(px.x, px.y, lambda, pixel_size, g.h, g.k, g.l, data.crystal_system,
                                   beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                                   q_sq, eps_r, eps_t_sq))
                    continue;
                const double w = std::max(px.Iobs - g.Ibkg, 0.0);
                sw += w;
                sw_et2 += w * eps_t_sq;
            }
            if (sw <= 0.0)
                continue;
            const double signif = sw / std::sqrt(std::max(box_px * g.Ibkg, 1.0));
            if (signif >= 5.0) // only well-measured spots define the shape
                bin_M2[bin_of(g.d)].push_back(sw_et2 / sw);
        }
        std::vector<double> bin_R1(n_bins, data.R[1]);
        for (int b = 0; b < n_bins; ++b)
            if (bin_M2[b].size() >= 5) {
                const double r1 = data.r1_multiplier * std::sqrt(2.0 * std::max(MedianInPlace(bin_M2[b]), 0.0));
                if (std::isfinite(r1) && r1 > 1e-4)
                    bin_R1[b] = std::clamp(r1, 1e-4, 0.05);
            }
        for (auto &g : groups)
            g.R1_eff = bin_R1[bin_of(g.d)];
    }

    // ---- 4. Term 1: one intensity residual per reflection; fit G (and B) ------
    // J / partiality / sigma_J are computed here as constants (geometry & R fixed),
    // and only the per-image scale G and Debye-Waller B are optimised.
    ceres::Problem problem;
    size_t n_blocks = 0;
    const double R0 = data.R[0];
    for (const auto &g : groups) {
        const double R1 = g.R1_eff; // Term 2: per-resolution profile width
        if (!(R1 > 0.0) || !(R0 > 0.0))
            continue;
        double num = 0.0, den = 0.0, rad = 0.0;
        std::vector<std::pair<double, double>> pt_sig; // (P_t, Iobs-Bg) for the Fisher pass
        pt_sig.reserve(g.pixels.size());
        for (const auto &px : g.pixels) {
            double q_sq, eps_r, eps_t_sq;
            if (!GeometryProbe(px.x, px.y, lambda, pixel_size, g.h, g.k, g.l, data.crystal_system,
                               beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                               q_sq, eps_r, eps_t_sq))
                continue;
            const double P_t = std::exp(-eps_t_sq / (R1 * R1)) / (M_PI * R1 * R1);
            const double R0_eff_sq = R0 * R0 + g.R_bw_sq;
            const double P_rad = std::exp(-eps_r * eps_r / R0_eff_sq);
            const double v = std::max(g.Ibkg, 1.0);
            const double sig = px.Iobs - g.Ibkg;
            num += P_t * sig / v;
            den += P_t * P_t / v;
            rad += P_rad * P_t * P_t / v;
            pt_sig.emplace_back(P_t, sig);
        }
        if (!(den > 0.0))
            continue;
        const double J = num / den;
        const double partiality = rad / den;
        // Model-expected (Fisher) variance: v_p = background + expected signal J*P_t,
        // not the per-pixel observed counts (which down-bias) - so the weight tracks
        // information, and an expected-strong reflection that is absent hurts.
        double den_f = 0.0;
        for (const auto &[P_t, sig] : pt_sig) {
            const double v_f = std::max(g.Ibkg + std::max(J, 0.0) * P_t, 1.0);
            den_f += P_t * P_t / v_f;
        }
        const double sigma_J = std::sqrt(1.0 / std::max(den_f, 1e-30));
        const double inv_4d2 = (g.d > 0.0) ? 1.0 / (4.0 * g.d * g.d) : 0.0;
        auto *cost = new ceres::AutoDiffCostFunction<IntensityResidual, 1, 1, 1>(
            new IntensityResidual(J, sigma_J, partiality, g.pol, g.Itrue, inv_4d2));
        problem.AddResidualBlock(cost, nullptr, &data.scale_factor, &data.B_factor);
        ++n_blocks;
    }
    data.residual_count = n_blocks;
    if (n_blocks == 0)
        return;

    // G >= 0; B fixed unless requested; G regularized -> 1 with weight sqrt(n/sigma)
    // (mirrors ScaleOnTheFly) so weakly-measured images cannot drift and scramble the merge.
    problem.SetParameterLowerBound(&data.scale_factor, 0, 0.0);
    if (!data.refine_B)
        problem.SetParameterBlockConstant(&data.B_factor);
    if (data.scale_reg_sigma > 0.0) {
        const double w = std::sqrt(static_cast<double>(groups.size()) / data.scale_reg_sigma);
        auto *reg = new ceres::AutoDiffCostFunction<ScalarRegularizer, 1, 1>(
            new ScalarRegularizer(w, 1.0));
        problem.AddResidualBlock(reg, nullptr, &data.scale_factor);
    }

    ceres::Solver::Options options;
    options.linear_solver_type = ceres::DENSE_QR;
    options.minimizer_progress_to_stdout = false;
    options.logging_type = ceres::LoggingType::SILENT;
    options.max_solver_time_in_seconds = data.max_time_s;
    options.num_threads = 1;
    ceres::Solver::Summary summary;
    ceres::Solve(options, &problem, &summary);
    data.final_cost = summary.final_cost;
    data.solved = summary.IsSolutionUsable();

    // ---- 5. Extract integrated reflections ------------------------------------
    // Profile fitting gives the recorded amplitude (fitting the tangential profile P_t
    // against the background-subtracted pixels):
    //     J      = sum_p[ P_t,p (Iobs_p - Ibkg)/v_p ] / sum_p[ P_t,p^2 / v_p ]
    //            ~ G * Itrue * B_term * partiality * pol     (recorded raw counts)
    //     var(J) = 1 / sum_p[ P_t,p^2 / v_p ]
    //
    // Output split (Merge multiplies r.I * image_scale_corr and weights by
    // 1/(sigma*image_scale_corr)^2):
    //   r.I              = J / (B_term * partiality * pol) = G * Itrue
    //   r.sigma          = sqrt(var(J)) / (B_term * partiality * pol)
    //   r.partiality     = profile-weighted P_radial in (0,1]  (the rocking fraction)
    //   r.completeness   = live/total tangential profile in (0,1]  (detector clipping)
    //   r.image_scale_corr = 1/G                              (per-image scale ONLY)
    // so r.I * image_scale_corr = Itrue: B, partiality and polarization live on the
    // intensity, G lives on image_scale_corr - one clean meaning per field. We walk the
    // full (unclamped) shoebox once: every grid point feeds the total tangential profile
    // (completeness denominator); points that are real, live pixels also feed the fit.
    data.reflections.reserve(groups.size());
    for (const auto &g : groups) {
        const int cx = static_cast<int>(std::lround(g.predicted_x));
        const int cy = static_cast<int>(std::lround(g.predicted_y));
        const double B_term = std::exp(-data.B_factor / (4.0 * g.d * g.d));

        double num = 0.0, den = 0.0, bkg_sum = 0.0, radial_sum = 0.0;
        double prof_live = 0.0, prof_full = 0.0; // tangential profile: captured / total
        size_t n = 0;

        for (int y = cy - radius; y <= cy + radius; ++y) {
            for (int x = cx - radius; x <= cx + radius; ++x) {
                double q_sq, eps_r, eps_t_sq;
                if (!GeometryProbe(static_cast<double>(x), static_cast<double>(y),
                                   lambda, pixel_size, g.h, g.k, g.l, data.crystal_system,
                                   beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                                   q_sq, eps_r, eps_t_sq))
                    continue;
                if (!(data.R[0] > 0.0) || !(g.R1_eff > 0.0))
                    continue;

                // Tangential profile shape (area-normalized) -> the fit template, using the
                // per-reflection R1_eff (Term 2).
                const double R1 = g.R1_eff;
                const double P_t = std::exp(-eps_t_sq / (R1 * R1)) / (M_PI * R1 * R1);
                prof_full += P_t; // whole shoebox, on- or off-detector

                // Only real, unmasked detector pixels carry signal.
                if (x < 0 || x >= static_cast<int>(xpixel) || y < 0 || y >= static_cast<int>(ypixel))
                    continue;
                const size_t np = static_cast<size_t>(xpixel) * y + x;
                if (image[np] == std::numeric_limits<T>::max())
                    continue;
                if (std::is_signed_v<T> && image[np] == std::numeric_limits<T>::min())
                    continue;

                const double Iobs = static_cast<double>(image[np]); // raw counts
                // Background-limited variance (constant over the shoebox): weighting by
                // the observed count biases the amplitude negative where signal is weakest.
                const double v = std::max(g.Ibkg, 1.0);

                // Peak-normalized radial factor (the partiality), in (0,1]. MUST use the
                // same P_t^2/v weights as the amplitude, else an R0_eff-dependent (hence
                // resolution-dependent) factor is left behind in r.I.
                const double R0_eff_sq = data.R[0] * data.R[0] + g.R_bw_sq;
                const double P_radial = std::exp(-eps_r * eps_r / R0_eff_sq);

                const double w = P_t * P_t / v;
                num += P_t * (Iobs - g.Ibkg) / v;
                den += w;
                radial_sum += P_radial * w;
                prof_live += P_t;
                bkg_sum += g.Ibkg;
                ++n;
            }
        }

        Reflection r{};
        r.h = g.h;
        r.k = g.k;
        r.l = g.l;
        r.d = static_cast<float>(g.d);
        r.predicted_x = static_cast<float>(g.predicted_x);
        r.predicted_y = static_cast<float>(g.predicted_y);
        r.observed_x = NAN;
        r.observed_y = NAN;
        r.rlp = 1.0f;
        r.partiality = (den > 0.0) ? static_cast<float>(radial_sum / den) : 1.0f;
        r.completeness = (prof_full > 0.0) ? static_cast<float>(prof_live / prof_full) : 1.0f;

        if (den > 0.0 && n > 0) {
            const double I_amp = num / den;             // ~ G*Itrue*B_term*partiality*pol
            const double sigma_amp = std::sqrt(1.0 / den);
            const double corr = static_cast<double>(r.partiality) * B_term * g.pol;
            r.bkg = static_cast<float>(bkg_sum / static_cast<double>(n));
            r.observed = true;

            if (corr > 0.0 && data.scale_factor > 0.0) {
                r.I = static_cast<float>(I_amp / corr);       // = G*Itrue
                r.sigma = static_cast<float>(sigma_amp / corr);
                r.image_scale_corr = static_cast<float>(1.0 / data.scale_factor); // G only
            } else {
                r.I = static_cast<float>(I_amp);
                r.sigma = static_cast<float>(sigma_amp);
                r.image_scale_corr = NAN;
            }
        } else {
            r.I = 0.0f;
            r.sigma = NAN;
            r.bkg = 0.0f;
            r.observed = false;
        }
        data.reflections.push_back(r);
    }

    // ---- 6. Per-image CC vs reference (diagnostic) ----------------------------
    // Pearson CC of the scaled estimate (r.I * image_scale_corr = Itrue_est) against
    // the reference intensities, over the matched reflections.
    {
        double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
        size_t cn = 0;
        for (const auto &r : data.reflections) {
            if (!r.observed || !std::isfinite(r.I) || !std::isfinite(r.image_scale_corr))
                continue;
            const auto it = reference_data.find(hkl_key_generator(r));
            if (it == reference_data.end())
                continue;
            const double x = static_cast<double>(r.I) * static_cast<double>(r.image_scale_corr);
            const double y = it->second;
            if (!std::isfinite(x) || !std::isfinite(y))
                continue;
            sx += x; sy += y; sxx += x * x; syy += y * y; sxy += x * y; ++cn;
        }
        data.cc = NAN;
        data.cc_n = static_cast<int64_t>(cn);
        if (cn >= 3) {
            const double nd = static_cast<double>(cn);
            const double cov = sxy - sx * sy / nd;
            const double vx = sxx - sx * sx / nd;
            const double vy = syy - sy * sy / nd;
            if (vx > 0.0 && vy > 0.0)
                data.cc = cov / std::sqrt(vx * vy);
        }
    }
}

template void PixelRefine::Run<int8_t>(const int8_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<int16_t>(const int16_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<int32_t>(const int32_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint8_t>(const uint8_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint16_t>(const uint16_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint32_t>(const uint32_t *, BraggPrediction &, PixelRefineData &);