Jungfraujoch/image_analysis/pixel_refinement/PixelRefine.cpp

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "../../common/JFJochMath.h"
#include "PixelRefine.h"

#include <algorithm>
#include <cmath>
#include <limits>
#include <utility>
#include <vector>

#include <Eigen/Dense>
#include <ceres/ceres.h>

namespace {

// Per-pixel observation, in *raw* detector counts (no per-pixel solid-angle or
// polarization correction - same units the "normal" integrator works in; the
// per-reflection polarization correction is applied via ReflGroup::pol).
struct PixelObs {
    double x, y;          // detector pixel coordinate
    double Iobs;          // raw pixel value (signal + background)
    double Ibkg;          // local background estimate (per-shoebox level, raw counts)
};

// One reflection together with the pixels of its shoebox.
struct ReflGroup {
    int h, k, l;
    double d;
    double Itrue;             // reference intensity (held fixed)
    double R_bw_sq;           // bandwidth radial-width^2 contribution (0 = monochromatic)
    double pol;               // per-reflection polarization correction (raw = true * pol)
    double Ibkg;              // local flat background (raw counts, constant over the shoebox)
    double predicted_x, predicted_y;
    double R1_eff = 0.0;      // tangential profile width to use (Term 2)
    std::vector<PixelObs> pixels;
};

// Median of a vector (in place, partially reorders it).
double MedianInPlace(std::vector<double> &v) {
    if (v.empty())
        return 0.0;
    const size_t mid = v.size() / 2;
    std::nth_element(v.begin(), v.begin() + mid, v.end());
    if (v.size() % 2 == 1)
        return v[mid];
    const double hi = v[mid];
    std::nth_element(v.begin(), v.begin() + mid - 1, v.begin() + mid);
    return 0.5 * (v[mid - 1] + hi);
}

// Mask marking the *core* (radius `radius`) of every predicted spot, so that the
// local-background sampling of one reflection never picks up a neighbouring
// reflection's signal. Same idea as BraggIntegrate2D::BuildReflectionMask.
std::vector<uint8_t> BuildSpotMask(const std::vector<Reflection> &predicted, int nrefl,
                                   size_t xpixel, size_t ypixel, int radius) {
    std::vector<uint8_t> mask(xpixel * ypixel, 0);
    const double r_sq = static_cast<double>(radius) * radius;
    for (int i = 0; i < nrefl; ++i) {
        const auto &r = predicted[i];
        const int cx = static_cast<int>(std::lround(r.predicted_x));
        const int cy = static_cast<int>(std::lround(r.predicted_y));
        const int x0 = std::max(0, cx - radius);
        const int x1 = std::min<int>(static_cast<int>(xpixel) - 1, cx + radius);
        const int y0 = std::max(0, cy - radius);
        const int y1 = std::min<int>(static_cast<int>(ypixel) - 1, cy + radius);
        for (int y = y0; y <= y1; ++y) {
            for (int x = x0; x <= x1; ++x) {
                const double dx = x - r.predicted_x;
                const double dy = y - r.predicted_y;
                if (dx * dx + dy * dy <= r_sq)
                    mask[static_cast<size_t>(xpixel) * y + x] = 1;
            }
        }
    }
    return mask;
}

// Square shoebox bounds (inclusive) around a predicted spot, clamped to the
// detector. The centre is rounded to the nearest pixel with std::lround so the
// signal box is centred identically to the spot-core mask (BuildSpotMask) and
// the local-background ring (EstimateLocalBackground), which also lround.
struct ShoeboxBox { int min_x, max_x, min_y, max_y; };
ShoeboxBox ShoeboxBounds(double px, double py, int radius, size_t xpixel, size_t ypixel) {
    const int cx = static_cast<int>(std::lround(px));
    const int cy = static_cast<int>(std::lround(py));
    return {
        std::max(cx - radius, 0),
        std::min<int>(cx + radius, static_cast<int>(xpixel) - 1),
        std::max(cy - radius, 0),
        std::min<int>(cy + radius, static_cast<int>(ypixel) - 1)
    };
}

// Local flat background around one shoebox, in raw detector counts. Samples the
// square ring shoebox_radius < max(|dx|,|dy|) <= bkg_outer_radius centred on the
// spot, dropping pixels that belong to any spot core (spot_mask) or carry a
// masked/saturated sentinel, and returns their MEAN. (The median, used previously,
// sits below the mean for a skewed background and so biases the subtracted intensity
// positive.) Mirrors the local-background region of BraggIntegrate2D.
template<class T>
bool EstimateLocalBackground(const T *image,
                             const std::vector<uint8_t> &spot_mask,
                             size_t xpixel, size_t ypixel,
                             double cx, double cy,
                             int shoebox_radius, int bkg_outer_radius,
                             double &bkg_mean) {
    const int icx = static_cast<int>(std::lround(cx));
    const int icy = static_cast<int>(std::lround(cy));
    const int x0 = std::max(0, icx - bkg_outer_radius);
    const int x1 = std::min<int>(static_cast<int>(xpixel) - 1, icx + bkg_outer_radius);
    const int y0 = std::max(0, icy - bkg_outer_radius);
    const int y1 = std::min<int>(static_cast<int>(ypixel) - 1, icy + bkg_outer_radius);

    std::vector<double> vals;
    vals.reserve(static_cast<size_t>((x1 - x0 + 1) * (y1 - y0 + 1)));
    for (int y = y0; y <= y1; ++y) {
        for (int x = x0; x <= x1; ++x) {
            // Skip the square shoebox core: that is signal, not background.
            if (std::abs(x - icx) <= shoebox_radius && std::abs(y - icy) <= shoebox_radius)
                continue;
            const size_t np = static_cast<size_t>(xpixel) * y + x;
            if (spot_mask[np])
                continue;
            const T raw = image[np];
            if (raw == std::numeric_limits<T>::max())
                continue;
            if (std::is_signed_v<T> && raw == std::numeric_limits<T>::min())
                continue;
            vals.push_back(static_cast<double>(raw));
        }
    }

    if (vals.size() < 5)
        return false;

    // Mean background, NOT median. The median of a right-skewed (Poisson) background sits
    // below the mean, so subtracting it under-subtracts and biases every weak integrated
    // intensity positive - which averages up over multiplicity into fake <I/sig> in the
    // no-signal high-resolution shells. The mean is unbiased; modern fast detectors put
    // few zingers in the ring, so it is not robustified here.
    double sum = 0.0;
    for (const double v : vals)
        sum += v;
    bkg_mean = sum / static_cast<double>(vals.size());
    return true;
}

// Per-pixel: map a detector pixel through the current geometry into the
// reference reciprocal frame. Cheap (a few trig + one rotation); depends on the
// pixel and the detector geometry, not on the lattice.
template<typename T>
void ObservedRecip(const T *beam, const T *distance_mm, const T *detector_rot,
                   double obs_x, double obs_y,
                   double pixel_size, double inv_lambda,
                   Eigen::Matrix<T, 3, 1> &e_obs_recip) {
    // PyFAI convention (left-handed for rot1/rot2): rot3 = 0 assumed.
    const T c1 = ceres::cos(detector_rot[0]);
    const T s1 = ceres::sin(detector_rot[0]);
    const T c2 = ceres::cos(detector_rot[1]);
    const T s2 = ceres::sin(detector_rot[1]);

    const T det_x = (T(obs_x) - beam[0]) * T(pixel_size);
    const T det_y = (T(obs_y) - beam[1]) * T(pixel_size);
    const T det_z = T(distance_mm[0]);

    const T t1_x = c1 * det_x + s1 * det_z;
    const T t1_y = det_y;
    const T t1_z = -s1 * det_x + c1 * det_z;

    const T x = t1_x;
    const T y = c2 * t1_y + s2 * t1_z;
    const T z = -s2 * t1_y + c2 * t1_z;

    const T inv_norm = T(1) / ceres::sqrt(x * x + y * y + z * z);

    T recip_raw[3] = {
        x * inv_norm * T(inv_lambda),
        y * inv_norm * T(inv_lambda),
        (z * inv_norm - T(1.0)) * T(inv_lambda)
    };
    e_obs_recip = Eigen::Matrix<T, 3, 1>(recip_raw[0], recip_raw[1], recip_raw[2]);
}

// Per-reflection: predicted node g_hkl, |g_hkl|^2, and the Ewald-sphere normal.
// (a0,a1,a2) are the three real-space lattice column vectors in the lab frame
// (latt.Vec0/1/2()), used exactly as indexed: PixelRefine does not refine the cell,
// so there is no symmetry manifold to constrain - a general (triclinic) reciprocal
// inverse reproduces every crystal system from the actual cell. Depends only on the
// lattice and hkl, so for a whole shoebox it is computed once.
bool PredictedNode(const double *a0, const double *a1, const double *a2,
                   double exp_h, double exp_k, double exp_l, double inv_lambda,
                   Eigen::Vector3d &e_pred_recip,
                   Eigen::Vector3d &n_radial, double &q_sq) {
    const Eigen::Vector3d A(a0[0], a0[1], a0[2]);
    const Eigen::Vector3d Bv(a1[0], a1[1], a1[2]);
    const Eigen::Vector3d C(a2[0], a2[1], a2[2]);

    const Eigen::Vector3d BxC = Bv.cross(C);
    const Eigen::Vector3d CxA = C.cross(A);
    const Eigen::Vector3d AxB = A.cross(Bv);

    const double Vol = A.dot(BxC);
    if (std::abs(Vol) < 1e-12)
        return false;
    const double invV = 1.0 / Vol;

    e_pred_recip = (BxC * exp_h + CxA * exp_k + AxB * exp_l) * invV;
    q_sq = e_pred_recip.squaredNorm();

    // Ewald sphere centre at -k_i = (0,0,-inv_lambda); radial normal at g_hkl.
    const Eigen::Vector3d S_pred(
        e_pred_recip[0], e_pred_recip[1], e_pred_recip[2] + inv_lambda);
    const double S_pred_norm = S_pred.norm();
    if (S_pred_norm < 1e-10)
        return false;

    n_radial = S_pred / S_pred_norm;
    return true;
}

// Geometry terms for one shoebox pixel under a FIXED geometry (PixelRefine no
// longer refines geometry, so this is a plain double evaluation, not a Ceres cost):
//   q_sq        = |g_hkl|^2                 (predicted node, for the B-factor)
//   eps_radial  = deviation along the Ewald normal (the partiality direction)
//   eps_tang_sq = squared deviation in the tangent plane (the profile direction)
bool GeometryProbe(double obs_x, double obs_y, double lambda, double pixel_size,
                   int h, int k, int l,
                   const double beam[2], double dist_mm, const double detector_rot[2],
                   const double p0[3], const double p1[3], const double p2[3],
                   double &q_sq, double &eps_radial, double &eps_tang_sq) {
    const double inv_lambda = 1.0 / lambda;
    Eigen::Vector3d e_obs;
    ObservedRecip<double>(beam, &dist_mm, detector_rot, obs_x, obs_y, pixel_size, inv_lambda, e_obs);

    Eigen::Vector3d e_pred, n_radial;
    if (!PredictedNode(p0, p1, p2, h, k, l, inv_lambda, e_pred, n_radial, q_sq))
        return false;

    const Eigen::Vector3d delta_q = e_obs - e_pred;
    eps_radial = delta_q.dot(n_radial);
    eps_tang_sq = (delta_q - eps_radial * n_radial).squaredNorm();
    return true;
}

// Pulls a scalar parameter towards an expected value with a fixed weight (the
// data-scaled prior). Identical in spirit to ScaleOnTheFly's regularizer: it is what
// keeps the per-image scale G from wandering on weakly-constrained images and
// scrambling the cross-image merge.
struct ScalarRegularizer {
    ScalarRegularizer(double weight, double expected) : weight(weight), expected(expected) {}
    template<typename T>
    bool operator()(const T *p, T *residual) const {
        residual[0] = T(weight) * (p[0] - T(expected));
        return true;
    }
    double weight;
    double expected;
};

// Term 1 of the factored likelihood (FACTORED_MODEL.md): the per-reflection
// *intensity* (0th-moment) residual. The profile-fit amplitude J should equal the
// scaled reference J_model = G * exp(-B/4d^2) * partiality * pol * I_ref. One scalar
// residual per reflection, weighted by the model-expected (Fisher) sigma_J. This is
// the scaling residual - integration and scaling become one objective, and the empty
// pixels (which make no residual of their own) stop dominating the fit. Geometry is
// fixed, so J, partiality and sigma_J are constants and only G and B are free.
struct IntensityResidual {
    IntensityResidual(double J, double sigma_J, double partiality, double pol,
                      double I_ref, double inv_4d2)
        : J(J), inv_sigma(1.0 / sigma_J), partiality(partiality), pol(pol),
          I_ref(I_ref), inv_4d2(inv_4d2) {}
    template<typename T>
    bool operator()(const T *const G, const T *const B, T *residual) const {
        const T B_term = ceres::exp(-B[0] * T(inv_4d2));
        const T J_model = G[0] * B_term * T(partiality) * T(pol) * T(I_ref);
        residual[0] = (J_model - T(J)) * T(inv_sigma);
        return true;
    }
    double J, inv_sigma, partiality, pol, I_ref, inv_4d2;
};

} // namespace

PixelRefine::PixelRefine(const DiffractionExperiment &experiment,
                         const std::vector<MergedReflection> &reference)
    : xpixel(experiment.GetXPixelsNum()),
      ypixel(experiment.GetYPixelsNum()),
      experiment(experiment),
      hkl_key_generator(experiment.GetScalingSettings().GetMergeFriedel(),
                        experiment.GetSpaceGroupNumber().value_or(1)) {
    for (const auto &ref: reference)
        reference_data[hkl_key_generator(ref)] = ref.I;
}

void PixelRefine::BuildParameterBlocks(const PixelRefineData &data,
                                       double beam[2], double &dist_mm,
                                       double detector_rot[2],
                                       double latt_vec0[3], double latt_vec1[3], double latt_vec2[3]) const {
    beam[0] = data.geom.GetBeamX_pxl();
    beam[1] = data.geom.GetBeamY_pxl();
    dist_mm = data.geom.GetDetectorDistance_mm();
    detector_rot[0] = data.geom.GetPoniRot1_rad();
    detector_rot[1] = data.geom.GetPoniRot2_rad();

    // The three real-space lattice columns in the lab frame, used as-is. PixelRefine
    // does not refine the cell, so there is no symmetry manifold to constrain; the
    // indexed cell is taken exactly, with no re-idealisation to ideal symmetry.
    const Coord &a = data.latt.Vec0();
    const Coord &b = data.latt.Vec1();
    const Coord &c = data.latt.Vec2();
    for (int i = 0; i < 3; ++i) {
        latt_vec0[i] = a[i];
        latt_vec1[i] = b[i];
        latt_vec2[i] = c[i];
    }
}

template<class T>
void PixelRefine::Run(const T *image,
                      BraggPrediction &prediction,
                      PixelRefineData &data) {
    data.solved = false;
    data.reflections.clear();

    const double lambda = data.geom.GetWavelength_A();
    const double pixel_size = data.geom.GetPixelSize_mm();

    const BraggPredictionSettings settings_prediction{
        .high_res_A = experiment.GetBraggIntegrationSettings().GetDMinLimit_A(),
        .ewald_dist_cutoff = static_cast<float>(data.ewald_dist_cutoff),
        .max_hkl = 100,
        .centering = data.centering,
        .bandwidth_sigma = static_cast<float>(data.bandwidth)  // relative Δλ/λ sigma
    };

    const int radius = data.shoebox_radius;
    const int bkg_outer_radius = std::max(radius + 1, data.bkg_outer_radius);

    // Per-reflection polarization correction (raw = true * pol), evaluated once at
    // the predicted spot - same handling as BraggIntegrate2D. Identity if disabled.
    const auto pol_factor = experiment.GetPolarizationFactor();
    auto polarization = [&](double x, double y) -> double {
        if (!pol_factor)
            return 1.0;
        return data.geom.CalcAzIntPolarizationCorr(static_cast<float>(x), static_cast<float>(y),
                                                   pol_factor.value());
    };

    // Bandwidth radial-width^2 (in the code's R = sqrt(2)*sigma convention):
    //   R_bw^2 = (b*lambda)^2 / (2 d^4),   b = relative bandwidth (sigma).
    // A fixed per-reflection constant; data.bandwidth == 0 -> monochromatic no-op.
    const double bw = data.bandwidth;
    auto bandwidth_radial_sq = [&](double d) -> double {
        if (bw <= 0.0 || d <= 0.0)
            return 0.0;
        const double bl = bw * lambda;
        return bl * bl / (2.0 * d * d * d * d);
    };

    // Geometry is FIXED here: orientation/cell/detector were already refined upstream
    // by XtalOptimizer (IndexAndRefine::RefineGeometryIfNeeded). PixelRefine is an
    // intensity-only operation - it predicts shoeboxes with this geometry, measures the
    // tangential profile width, and fits the per-image scale G (and B) to the reference.
    double beam[2], dist_mm, detector_rot[2];
    double latt_vec0[3], latt_vec1[3], latt_vec2[3];
    BuildParameterBlocks(data, beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2);

    // ---- 1. Predict shoeboxes for the current geometry ------------------------
    DiffractionExperiment exp_iter = experiment;
    exp_iter.BeamX_pxl(data.geom.GetBeamX_pxl())
            .BeamY_pxl(data.geom.GetBeamY_pxl())
            .DetectorDistance_mm(data.geom.GetDetectorDistance_mm())
            .PoniRot1_rad(data.geom.GetPoniRot1_rad())
            .PoniRot2_rad(data.geom.GetPoniRot2_rad());
    const int nrefl = prediction.Calc(exp_iter, data.latt, settings_prediction);

    // ---- 2. Collect per-reflection shoebox pixels + local background ----------
    // GetReflections() returns the full pre-sized buffer; only the first nrefl
    // entries are valid for this image. A spot-core mask over ALL predictions keeps
    // each reflection's background ring from picking up a neighbour's signal.
    const auto &predicted = prediction.GetReflections();
    const auto spot_mask = BuildSpotMask(predicted, nrefl, xpixel, ypixel, radius);

    std::vector<ReflGroup> groups;
    for (int ri = 0; ri < nrefl; ++ri) {
        const auto &refl = predicted[ri];
        const auto hkl = hkl_key_generator(refl);
        if (!reference_data.contains(hkl))
            continue;

        // Local flat background from the ring around the shoebox (raw counts). If we
        // cannot estimate a clean local background the reflection is dropped, exactly
        // as BraggIntegrate2D marks it unobserved when too few background pixels survive.
        double Ibkg = 0.0;
        if (!EstimateLocalBackground(image, spot_mask, xpixel, ypixel,
                                     refl.predicted_x, refl.predicted_y,
                                     radius, bkg_outer_radius, Ibkg))
            continue;

        ReflGroup g;
        g.h = refl.h;
        g.k = refl.k;
        g.l = refl.l;
        g.d = refl.d;
        g.Itrue = reference_data[hkl];
        g.R_bw_sq = bandwidth_radial_sq(refl.d);
        g.pol = polarization(refl.predicted_x, refl.predicted_y);
        g.Ibkg = Ibkg;
        g.predicted_x = refl.predicted_x;
        g.predicted_y = refl.predicted_y;

        const auto box = ShoeboxBounds(refl.predicted_x, refl.predicted_y, radius, xpixel, ypixel);
        for (int y = box.min_y; y <= box.max_y; ++y) {
            for (int x = box.min_x; x <= box.max_x; ++x) {
                const size_t npixel = xpixel * y + x;
                // Skip sentinel (masked / saturated) pixels.
                if (image[npixel] == std::numeric_limits<T>::max())
                    continue;
                if (std::is_signed_v<T> && (image[npixel] == std::numeric_limits<T>::min()))
                    continue;
                g.pixels.push_back({static_cast<double>(x), static_cast<double>(y),
                                    static_cast<double>(image[npixel]), Ibkg});
            }
        }
        if (!g.pixels.empty())
            groups.push_back(std::move(g));
    }
    if (groups.empty())
        return;

    // ---- 3. Term 2: per-resolution tangential profile width R1 ----------------
    // R1 = sqrt(2*<eps_t^2>) from the intensity-weighted tangential second moment of
    // the strong spots, binned by resolution (low res small spots, high res larger).
    // A *shape* statistic, normalised by the total intensity, so it is decoupled from
    // the per-image scale - which is what makes measuring it (rather than fitting it,
    // where it is degenerate with G) stable. Weak spots fall back to the global R[1].
    for (auto &g : groups)
        g.R1_eff = data.R[1];
    {
        constexpr int n_bins = 6;
        const double box_px = (2.0 * radius + 1.0) * (2.0 * radius + 1.0);
        double s2min = 1e30, s2max = 0.0;
        for (const auto &g : groups) {
            const double s2 = 1.0 / (g.d * g.d);
            s2min = std::min(s2min, s2);
            s2max = std::max(s2max, s2);
        }
        const double span = std::max(s2max - s2min, 1e-12);
        auto bin_of = [&](double d) {
            return std::clamp(static_cast<int>((1.0 / (d * d) - s2min) / span * n_bins), 0, n_bins - 1);
        };
        std::vector<std::vector<double>> bin_M2(n_bins);
        for (const auto &g : groups) {
            double sw = 0.0, sw_et2 = 0.0;
            for (const auto &px : g.pixels) {
                double q_sq, eps_r, eps_t_sq;
                if (!GeometryProbe(px.x, px.y, lambda, pixel_size, g.h, g.k, g.l,
                                   beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                                   q_sq, eps_r, eps_t_sq))
                    continue;
                const double w = std::max(px.Iobs - g.Ibkg, 0.0);
                sw += w;
                sw_et2 += w * eps_t_sq;
            }
            if (sw <= 0.0)
                continue;
            const double signif = sw / std::sqrt(std::max(box_px * g.Ibkg, 1.0));
            if (signif >= 5.0) // only well-measured spots define the shape
                bin_M2[bin_of(g.d)].push_back(sw_et2 / sw);
        }
        std::vector<double> bin_R1(n_bins, data.R[1]);
        for (int b = 0; b < n_bins; ++b)
            if (bin_M2[b].size() >= 5) {
                const double r1 = data.r1_multiplier * std::sqrt(2.0 * std::max(MedianInPlace(bin_M2[b]), 0.0));
                if (std::isfinite(r1) && r1 > 1e-4)
                    bin_R1[b] = std::clamp(r1, 1e-4, 0.05);
            }
        for (auto &g : groups)
            g.R1_eff = bin_R1[bin_of(g.d)];
    }

    // ---- 4. Term 1: one intensity residual per reflection; fit G (and B) ------
    // J / partiality / sigma_J are computed here as constants (geometry & R fixed),
    // and only the per-image scale G and Debye-Waller B are optimised.
    ceres::Problem problem;
    size_t n_blocks = 0;
    const double R0 = data.R[0];
    for (const auto &g : groups) {
        const double R1 = g.R1_eff; // Term 2: per-resolution profile width
        if (!(R1 > 0.0) || !(R0 > 0.0))
            continue;
        double num = 0.0, den = 0.0, rad = 0.0;
        std::vector<std::pair<double, double>> pt_sig; // (P_t, Iobs-Bg) for the Fisher pass
        pt_sig.reserve(g.pixels.size());
        for (const auto &px : g.pixels) {
            double q_sq, eps_r, eps_t_sq;
            if (!GeometryProbe(px.x, px.y, lambda, pixel_size, g.h, g.k, g.l,
                               beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                               q_sq, eps_r, eps_t_sq))
                continue;
            const double P_t = std::exp(-eps_t_sq / (R1 * R1)) / (PI * R1 * R1);
            const double R0_eff_sq = R0 * R0 + g.R_bw_sq;
            const double P_rad = std::exp(-eps_r * eps_r / R0_eff_sq);
            const double v = std::max(g.Ibkg, 1.0);
            const double sig = px.Iobs - g.Ibkg;
            num += P_t * sig / v;
            den += P_t * P_t / v;
            rad += P_rad * P_t * P_t / v;
            pt_sig.emplace_back(P_t, sig);
        }
        if (!(den > 0.0))
            continue;
        const double J = num / den;
        const double partiality = rad / den;
        // Model-expected (Fisher) variance: v_p = background + expected signal J*P_t,
        // not the per-pixel observed counts (which down-bias) - so the weight tracks
        // information, and an expected-strong reflection that is absent hurts.
        double den_f = 0.0;
        for (const auto &[P_t, sig] : pt_sig) {
            const double v_f = std::max(g.Ibkg + std::max(J, 0.0) * P_t, 1.0);
            den_f += P_t * P_t / v_f;
        }
        const double sigma_J = std::sqrt(1.0 / std::max(den_f, 1e-30));
        const double inv_4d2 = (g.d > 0.0) ? 1.0 / (4.0 * g.d * g.d) : 0.0;
        auto *cost = new ceres::AutoDiffCostFunction<IntensityResidual, 1, 1, 1>(
            new IntensityResidual(J, sigma_J, partiality, g.pol, g.Itrue, inv_4d2));
        problem.AddResidualBlock(cost, nullptr, &data.scale_factor, &data.B_factor);
        ++n_blocks;
    }
    data.residual_count = n_blocks;
    if (n_blocks == 0)
        return;

    // G >= 0; B fixed unless requested; G regularized -> 1 with weight sqrt(n/sigma)
    // (mirrors ScaleOnTheFly) so weakly-measured images cannot drift and scramble the merge.
    problem.SetParameterLowerBound(&data.scale_factor, 0, 0.0);
    if (!data.refine_B)
        problem.SetParameterBlockConstant(&data.B_factor);
    if (data.scale_reg_sigma > 0.0) {
        const double w = std::sqrt(static_cast<double>(groups.size()) / data.scale_reg_sigma);
        auto *reg = new ceres::AutoDiffCostFunction<ScalarRegularizer, 1, 1>(
            new ScalarRegularizer(w, 1.0));
        problem.AddResidualBlock(reg, nullptr, &data.scale_factor);
    }

    ceres::Solver::Options options;
    options.linear_solver_type = ceres::DENSE_QR;
    options.minimizer_progress_to_stdout = false;
    options.logging_type = ceres::LoggingType::SILENT;
    options.max_solver_time_in_seconds = data.max_time_s;
    options.num_threads = 1;
    ceres::Solver::Summary summary;
    ceres::Solve(options, &problem, &summary);
    data.final_cost = summary.final_cost;
    data.solved = summary.IsSolutionUsable();

    // ---- 5. Extract integrated reflections ------------------------------------
    // Profile fitting gives the recorded amplitude (fitting the tangential profile P_t
    // against the background-subtracted pixels):
    //     J      = sum_p[ P_t,p (Iobs_p - Ibkg)/v_p ] / sum_p[ P_t,p^2 / v_p ]
    //            ~ G * Itrue * B_term * partiality * pol     (recorded raw counts)
    //     var(J) = 1 / sum_p[ P_t,p^2 / v_p ]
    //
    // Output split (Merge multiplies r.I * image_scale_corr and weights by
    // 1/(sigma*image_scale_corr)^2):
    //   r.I              = J / (B_term * partiality * pol) = G * Itrue
    //   r.sigma          = sqrt(var(J)) / (B_term * partiality * pol)
    //   r.partiality     = profile-weighted P_radial in (0,1]  (the rocking fraction)
    //   r.completeness   = live/total tangential profile in (0,1]  (detector clipping)
    //   r.image_scale_corr = 1/G                              (per-image scale ONLY)
    // so r.I * image_scale_corr = Itrue: B, partiality and polarization live on the
    // intensity, G lives on image_scale_corr - one clean meaning per field. We walk the
    // full (unclamped) shoebox once: every grid point feeds the total tangential profile
    // (completeness denominator); points that are real, live pixels also feed the fit.
    data.reflections.reserve(groups.size());
    for (const auto &g : groups) {
        const int cx = static_cast<int>(std::lround(g.predicted_x));
        const int cy = static_cast<int>(std::lround(g.predicted_y));
        const double B_term = std::exp(-data.B_factor / (4.0 * g.d * g.d));

        double num = 0.0, den = 0.0, bkg_sum = 0.0, radial_sum = 0.0;
        double prof_live = 0.0, prof_full = 0.0; // tangential profile: captured / total
        size_t n = 0;

        for (int y = cy - radius; y <= cy + radius; ++y) {
            for (int x = cx - radius; x <= cx + radius; ++x) {
                double q_sq, eps_r, eps_t_sq;
                if (!GeometryProbe(static_cast<double>(x), static_cast<double>(y),
                                   lambda, pixel_size, g.h, g.k, g.l,
                                   beam, dist_mm, detector_rot, latt_vec0, latt_vec1, latt_vec2,
                                   q_sq, eps_r, eps_t_sq))
                    continue;
                if (!(data.R[0] > 0.0) || !(g.R1_eff > 0.0))
                    continue;

                // Tangential profile shape (area-normalized) -> the fit template, using the
                // per-reflection R1_eff (Term 2).
                const double R1 = g.R1_eff;
                const double P_t = std::exp(-eps_t_sq / (R1 * R1)) / (PI * R1 * R1);
                prof_full += P_t; // whole shoebox, on- or off-detector

                // Only real, unmasked detector pixels carry signal.
                if (x < 0 || x >= static_cast<int>(xpixel) || y < 0 || y >= static_cast<int>(ypixel))
                    continue;
                const size_t np = static_cast<size_t>(xpixel) * y + x;
                if (image[np] == std::numeric_limits<T>::max())
                    continue;
                if (std::is_signed_v<T> && image[np] == std::numeric_limits<T>::min())
                    continue;

                const double Iobs = static_cast<double>(image[np]); // raw counts
                // Background-limited variance (constant over the shoebox): weighting by
                // the observed count biases the amplitude negative where signal is weakest.
                const double v = std::max(g.Ibkg, 1.0);

                // Peak-normalized radial factor (the partiality), in (0,1]. MUST use the
                // same P_t^2/v weights as the amplitude, else an R0_eff-dependent (hence
                // resolution-dependent) factor is left behind in r.I.
                const double R0_eff_sq = data.R[0] * data.R[0] + g.R_bw_sq;
                const double P_radial = std::exp(-eps_r * eps_r / R0_eff_sq);

                const double w = P_t * P_t / v;
                num += P_t * (Iobs - g.Ibkg) / v;
                den += w;
                radial_sum += P_radial * w;
                prof_live += P_t;
                bkg_sum += g.Ibkg;
                ++n;
            }
        }

        Reflection r{};
        r.h = g.h;
        r.k = g.k;
        r.l = g.l;
        r.d = static_cast<float>(g.d);
        r.predicted_x = static_cast<float>(g.predicted_x);
        r.predicted_y = static_cast<float>(g.predicted_y);
        r.observed_x = NAN;
        r.observed_y = NAN;
        r.rlp = 1.0f;
        r.partiality = (den > 0.0) ? static_cast<float>(radial_sum / den) : 1.0f;
        r.completeness = (prof_full > 0.0) ? static_cast<float>(prof_live / prof_full) : 1.0f;

        if (den > 0.0 && n > 0) {
            const double I_amp = num / den;             // ~ G*Itrue*B_term*partiality*pol
            const double sigma_amp = std::sqrt(1.0 / den);
            const double corr = static_cast<double>(r.partiality) * B_term * g.pol;
            r.bkg = static_cast<float>(bkg_sum / static_cast<double>(n));
            r.observed = true;

            if (corr > 0.0 && data.scale_factor > 0.0) {
                r.I = static_cast<float>(I_amp / corr);       // = G*Itrue
                r.sigma = static_cast<float>(sigma_amp / corr);
                r.image_scale_corr = static_cast<float>(1.0 / data.scale_factor); // G only
            } else {
                r.I = static_cast<float>(I_amp);
                r.sigma = static_cast<float>(sigma_amp);
                r.image_scale_corr = NAN;
            }
        } else {
            r.I = 0.0f;
            r.sigma = NAN;
            r.bkg = 0.0f;
            r.observed = false;
        }
        data.reflections.push_back(r);
    }

    // ---- 6. Per-image CC vs reference (diagnostic) ----------------------------
    // Pearson CC of the scaled estimate (r.I * image_scale_corr = Itrue_est) against
    // the reference intensities, over the matched reflections.
    {
        double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
        size_t cn = 0;
        for (const auto &r : data.reflections) {
            if (!r.observed || !std::isfinite(r.I) || !std::isfinite(r.image_scale_corr))
                continue;
            const auto it = reference_data.find(hkl_key_generator(r));
            if (it == reference_data.end())
                continue;
            const double x = static_cast<double>(r.I) * static_cast<double>(r.image_scale_corr);
            const double y = it->second;
            if (!std::isfinite(x) || !std::isfinite(y))
                continue;
            sx += x; sy += y; sxx += x * x; syy += y * y; sxy += x * y; ++cn;
        }
        data.cc = NAN;
        data.cc_n = static_cast<int64_t>(cn);
        if (cn >= 3) {
            const double nd = static_cast<double>(cn);
            const double cov = sxy - sx * sy / nd;
            const double vx = sxx - sx * sx / nd;
            const double vy = syy - sy * sy / nd;
            if (vx > 0.0 && vy > 0.0)
                data.cc = cov / std::sqrt(vx * vy);
        }
    }
}

template void PixelRefine::Run<int8_t>(const int8_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<int16_t>(const int16_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<int32_t>(const int32_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint8_t>(const uint8_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint16_t>(const uint16_t *, BraggPrediction &, PixelRefineData &);
template void PixelRefine::Run<uint32_t>(const uint32_t *, BraggPrediction &, PixelRefineData &);