Jungfraujoch/image_analysis/scale_merge/RotationScaleMerge.cpp

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "RotationScaleMerge.h"

#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <future>
#include <limits>
#include <random>

#include <gemmi/reciproc.hpp>
#include <gemmi/symmetry.hpp>

#include "HKLKey.h"
#include "../../common/CorrelationCoefficient.h"
#include "../../common/CrystalLattice.h"
#include "../../common/Definitions.h"
#include "../../common/JFJochException.h"
#include "../../common/ResolutionShells.h"

namespace {
    // These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
    // implementation is numerically identical - see the comments there for the physics.
    constexpr size_t MIN_REFLECTIONS = 20;   // per-frame scale needs at least this many
    constexpr double SCALE_ROBUST_K = 3.0;   // Cauchy loss scale (sigma units) for the per-frame G fit
    constexpr float  MAX_FRAME_GAP = 2.0f;   // a rocking event is a run of frames no more apart than this
    constexpr double CHI2_1_MEDIAN = 0.454936;

    double SafeInv(double x, double fallback) {
        if (!std::isfinite(x) || x == 0.0)
            return fallback;
        return 1.0 / x;
    }

    // Kabsch rotation partiality: the fraction of a reflection recorded in the sampled slice, from the
    // erf of the rocking angle relative to the mosaic width. Identical to ScaleOnTheFly's RotationPartiality
    // (and the predictor's), so recomputing here just swaps in the smoothed mosaicity.
    float RotationPartiality(double delta_phi_deg, double zeta, double mosaicity_deg, double wedge_deg) {
        const double half_wedge = wedge_deg / 2.0;
        const double c1 = zeta / std::sqrt(2.0);
        const double arg_plus = (delta_phi_deg + half_wedge) * c1 / mosaicity_deg;
        const double arg_minus = (delta_phi_deg - half_wedge) * c1 / mosaicity_deg;
        return static_cast<float>((std::erf(arg_plus) - std::erf(arg_minus)) / 2.0);
    }

    // Deterministic CC1/2 half from the frame's stable index (splitmix64), matching Merge.cpp.
    int HalfForImage(int64_t image_id) {
        uint64_t z = static_cast<uint64_t>(image_id) + 0x9e3779b97f4a7c15ULL;
        z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
        z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
        z = z ^ (z >> 31);
        return static_cast<int>(z & 1ULL);
    }

    struct ScaleObs { double coeff, Iobs, weight; };

    // Robust per-frame scale (linear in G): the exact objective ScaleOnTheFly::SolveScaleIRLS solves.
    double SolveScaleIRLS(const std::vector<ScaleObs> &obs, double robust_k) {
        auto weighted_scale = [&obs](auto robust_weight) {
            double num = 0.0, den = 0.0;
            for (const auto &o : obs) {
                const double rw = robust_weight(o);
                const double w2 = o.weight * o.weight;
                num += rw * w2 * o.coeff * o.Iobs;
                den += rw * w2 * o.coeff * o.coeff;
            }
            return den > 0.0 ? num / den : NAN;
        };

        double G = weighted_scale([](const ScaleObs &) { return 1.0; });
        if (!std::isfinite(G))
            return 1.0;
        G = std::max(0.0, G);

        const double k2 = robust_k * robust_k;
        for (int iter = 0; iter < 30; ++iter) {
            const double G_prev = G;
            const double G_next = weighted_scale([&](const ScaleObs &o) {
                const double res = o.weight * (G * o.coeff - o.Iobs);
                return 1.0 / (1.0 + res * res / k2);
            });
            if (!std::isfinite(G_next))
                break;
            G = std::max(0.0, G_next);
            if (std::abs(G - G_prev) <= 1e-7 * std::max(G, 1.0))
                break;
        }
        return G;
    }

    // Run fn(i) for i in [0, n) over `nthreads` workers pulling from a shared atomic counter - the same
    // self-load-balancing pattern the rest of the codebase uses (heavy frames don't stall light ones).
    // Work-stealing per-item parallel: one atomic fetch per item. Use ONLY when the per-item work is
    // heavy and uneven (e.g. per-frame fits) - the atomic amortises. For millions of tiny uniform items
    // use ParallelChunks instead; a per-item atomic there is pure contention.
    template <typename Fn>
    void ParallelFor(int n, size_t nthreads, Fn fn) {
        if (n <= 0) return;
        if (nthreads <= 1 || n == 1) {
            for (int i = 0; i < n; ++i) fn(i);
            return;
        }
        const size_t local = std::min(nthreads, static_cast<size_t>(n));
        std::atomic<int> next = 0;
        std::vector<std::future<void>> futures;
        futures.reserve(local);
        for (size_t t = 0; t < local; ++t)
            futures.emplace_back(std::async(std::launch::async, [&] {
                for (int i = next.fetch_add(1); i < n; i = next.fetch_add(1))
                    fn(i);
            }));
        for (auto &f : futures) f.get();
    }

    // Chunked parallel: each worker gets one contiguous [lo, hi) range, no per-item synchronisation.
    // Right for millions of cheap uniform items (the CPU stand-in for a flat CUDA grid-stride kernel).
    template <typename Fn>
    void ParallelChunks(int n, size_t nthreads, Fn fn) {
        if (n <= 0) return;
        const int nt = static_cast<int>(std::max<size_t>(1, std::min(nthreads, static_cast<size_t>(n))));
        if (nt == 1) { fn(0, n); return; }
        const int chunk = (n + nt - 1) / nt;
        std::vector<std::future<void>> futures;
        futures.reserve(nt);
        for (int t = 0; t < nt; ++t) {
            const int lo = t * chunk, hi = std::min(n, lo + chunk);
            if (lo >= hi) break;
            futures.emplace_back(std::async(std::launch::async, [&fn, lo, hi] { fn(lo, hi); }));
        }
        for (auto &f : futures) f.get();
    }

    double median_of(std::vector<double> &v) {
        std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
        return v[v.size() / 2];
    }
}

RotationScaleMerge::RotationScaleMerge(const DiffractionExperiment &experiment,
                                       std::vector<IntegrationOutcome> &partial_outcomes,
                                       std::optional<UnitCell> reference_cell,
                                       int scaling_iterations, float ice_ring_half_width_q,
                                       size_t nthreads, Logger &logger,
                                       std::string observation_dump_path)
    : x(experiment), partials_out(partial_outcomes), reference_cell(std::move(reference_cell)),
      nthreads(nthreads == 0 ? std::thread::hardware_concurrency() : nthreads), logger(logger),
      observation_dump_path(std::move(observation_dump_path)) {
    const auto s = x.GetScalingSettings();
    min_partiality = s.GetMinPartiality();
    d_min_limit = s.GetHighResolutionLimit_A();
    merge_friedel = s.GetMergeFriedel();
    capture_uncertainty_coeff = s.GetCaptureUncertaintyCoeff();
    reject_nsigma = s.GetOutlierRejectNsigma();
    reject_outliers = reject_nsigma > 0.0;
    rfree_fraction = s.GetRfreeFraction();
    scale_fulls = s.GetScaleFulls();
    scaling_iter = std::max(1, scaling_iterations);
    if (const auto forced = s.GetForcedMosaicity(); forced.has_value())
        mosaicity_deg = *forced;
    else
        mosaicity_deg = s.GetDefaultMosaicity();
    ice_half_width_q = ice_ring_half_width_q;
}

void RotationScaleMerge::Ingest() {
    n_frames = static_cast<int>(partials_out.size());

    size_t total = 0;
    for (const auto &o : partials_out) total += o.reflections.size();

    partials.clear();
    partials.reserve(total);
    frame_start.assign(n_frames, 0);
    frame_count.assign(n_frames, 0);
    frame_cell_ok.assign(n_frames, 1);
    g_partial.assign(n_frames, 1.0);

    const float dist_tol = x.GetIndexingSettings().GetUnitCellDistTolerance();
    const float ang_tol = x.GetIndexingSettings().GetUnitCellAngleTolerance_deg();

    for (int o = 0; o < n_frames; ++o) {
        frame_start[o] = static_cast<int32_t>(partials.size());
        if (reference_cell) {
            const auto cell = partials_out[o].latt.GetUnitCell();
            frame_cell_ok[o] = cell.is_close(*reference_cell, dist_tol, ang_tol) ? 1 : 0;
        }
        for (const auto &r : partials_out[o].reflections) {
            Obs obs{};
            obs.h = r.h; obs.k = r.k; obs.l = r.l;
            obs.I = r.I; obs.sigma = r.sigma; obs.d = r.d; obs.rlp = r.rlp;
            obs.partiality = r.partiality; obs.zeta = r.zeta; obs.delta_phi = r.delta_phi_deg; obs.bkg = r.bkg;
            obs.image_number = r.image_number;
            obs.frame = o;
            obs.on_ice = r.on_ice_ring ? 1 : 0;
            obs.corr = r.image_scale_corr;
            obs.group = -1;
            partials.push_back(obs);
        }
        frame_count[o] = static_cast<int32_t>(partials.size()) - frame_start[o];
    }

    // Sort ONCE by (raw h,k,l, image_number) and split into raw-hkl runs. This is the one expensive sort;
    // both the 3D combine (event split) and the per-space-group ASU grouping reuse this order.
    perm.resize(partials.size());
    for (int i = 0; i < static_cast<int>(partials.size()); ++i) perm[i] = i;
    std::sort(perm.begin(), perm.end(), [&](int32_t a, int32_t b) {
        const auto &x1 = partials[a];
        const auto &y1 = partials[b];
        if (x1.h != y1.h) return x1.h < y1.h;
        if (x1.k != y1.k) return x1.k < y1.k;
        if (x1.l != y1.l) return x1.l < y1.l;
        return x1.image_number < y1.image_number;
    });
    rawrun_start.clear(); rawrun_count.clear();
    rawrun_h.clear(); rawrun_k.clear(); rawrun_l.clear(); rawrun_d.clear();
    for (int i = 0; i < static_cast<int>(perm.size()); ) {
        const auto &o0 = partials[perm[i]];
        int j = i;
        float d = NAN;
        while (j < static_cast<int>(perm.size())) {
            const auto &o = partials[perm[j]];
            if (o.h != o0.h || o.k != o0.k || o.l != o0.l) break;
            if (!std::isfinite(d) && std::isfinite(o.d) && o.d > 0.0f) d = o.d;
            ++j;
        }
        rawrun_start.push_back(i);
        rawrun_count.push_back(j - i);
        rawrun_h.push_back(o0.h); rawrun_k.push_back(o0.k); rawrun_l.push_back(o0.l);
        rawrun_d.push_back(d);
        i = j;
    }
    rawrun_group.assign(rawrun_start.size(), -1);
    logger.Info("RotationScaleMerge: ingested {} partial observations from {} frames ({} distinct hkl)",
                total, n_frames, rawrun_start.size());

    SmoothMosaicityAndPartiality();

#ifdef JFJOCH_USE_CUDA
    // Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
    // fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine.
    gpu_ = std::make_unique<RotationScaleMergeGPU>();
    gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
    if (gpu_active_) {
        const int n = static_cast<int>(partials.size());
        std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
        std::vector<uint8_t> onice(n);
        std::vector<int32_t> frm(n);
        for (int i = 0; i < n; ++i) {
            const auto &o = partials[i];
            I[i] = o.I; sigma[i] = o.sigma; rlp[i] = o.rlp; part[i] = o.partiality;
            zeta[i] = o.zeta; onice[i] = o.on_ice; frm[i] = o.frame; corr[i] = o.corr;
            bkg[i] = o.bkg; img[i] = o.image_number; dd[i] = o.d;
        }
        gpu_->SetPartials(n, n_frames, I.data(), sigma.data(), rlp.data(), part.data(), zeta.data(),
                          onice.data(), frm.data(), corr.data(), frame_start.data(), frame_count.data());
        gpu_->SetCombineInputs(bkg.data(), img.data(), dd.data());
        gpu_->SetRawRuns(static_cast<int>(rawrun_start.size()), static_cast<int>(perm.size()), perm.data(),
                         rawrun_start.data(), rawrun_count.data(),
                         rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
        gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
        logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
                    gpu_combine_ ? " + combine + scale-fulls" : "");
    }
#endif
}

void RotationScaleMerge::SmoothMosaicityAndPartiality() {
    // Raw per-frame mosaicity as measured at integration (image-local, deterministic).
    std::vector<double> mos_raw(n_frames, NAN);
    for (int o = 0; o < n_frames; ++o) {
        const auto &m = partials_out[o].mosaicity_deg;
        if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
    }

    // Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
    // With smoothing off, fall back to the per-frame value (still deterministic, just unsmoothed).
    const auto ss = x.GetScalingSettings();
    const double smooth_deg = ss.GetSmoothGDegrees();
    const auto gon = x.GetGoniometer();
    const double osc = gon ? std::fabs(gon->GetIncrement_deg()) : 0.0;
    mos_smooth.assign(n_frames, NAN);
    if (smooth_deg > 0.0 && osc > 1e-6) {
        int window = std::max(1, static_cast<int>(std::lround(smooth_deg / osc)));
        if (window % 2 == 0) ++window;
        const int half = window / 2;
        for (int o = 0; o < n_frames; ++o) {
            double sum = 0.0;
            int cnt = 0;
            for (int j = std::max(0, o - half); j <= std::min(n_frames - 1, o + half); ++j)
                if (std::isfinite(mos_raw[j])) { sum += mos_raw[j]; ++cnt; }
            if (cnt > 0) mos_smooth[o] = static_cast<float>(sum / cnt);
        }
    } else {
        for (int o = 0; o < n_frames; ++o) mos_smooth[o] = static_cast<float>(mos_raw[o]);
    }

    // Recompute each partial's partiality from the smoothed mosaicity (same wedge the predictor used).
    // Frames without a mosaicity keep the stored partiality.
    const double wedge = gon ? std::fabs(gon->GetWedge_deg()) : 0.0;
    ParallelChunks(static_cast<int>(partials.size()), nthreads, [&](int lo, int hi) {
        for (int i = lo; i < hi; ++i) {
            auto &o = partials[i];
            const float mos = mos_smooth[o.frame];
            if (std::isfinite(mos) && mos > 1e-6f && std::isfinite(o.zeta) && o.zeta > 0.0f
                && std::isfinite(o.delta_phi))
                o.partiality = RotationPartiality(o.delta_phi, o.zeta, mos, wedge);
        }
    });
    logger.Info("Recomputed partiality from frame-order-smoothed mosaicity");
}

int RotationScaleMerge::ComputeAsuGroups(const HKLKeyGenerator &keygen) {
    // One ASU reduction per distinct raw hkl (not per observation): a raw hkl is eligible if it is not
    // systematically absent and its resolution is in range. Group the eligible raw hkls by ASU key
    // (sort the ~#distinct-hkl keys, not the millions of observations), then hand out dense ids.
    const int n_run = static_cast<int>(rawrun_start.size());
    std::vector<uint64_t> key(n_run);
    std::vector<uint8_t> eligible(n_run, 0);
    // The gemmi ASU reduction / absence test per raw hkl is the cost here and is independent per run
    // (HKLKeyGenerator is const, so concurrent reads are safe) - compute keys in parallel chunks.
    ParallelChunks(n_run, nthreads, [&](int lo, int hi) {
        for (int r = lo; r < hi; ++r) {
            rawrun_group[r] = -1;
            if (keygen.IsSystematicallyAbsent(rawrun_h[r], rawrun_k[r], rawrun_l[r]))
                continue;
            const float d = rawrun_d[r];   // resolution is a per-raw-hkl property (all its partials share d)
            if (!std::isfinite(d) || d <= 0.0f) continue;
            if (d_min_limit && d < *d_min_limit) continue;
            key[r] = keygen(rawrun_h[r], rawrun_k[r], rawrun_l[r]).pack();
            eligible[r] = 1;
        }
    });
    std::vector<int32_t> idx;
    idx.reserve(n_run);
    for (int r = 0; r < n_run; ++r)
        if (eligible[r]) idx.push_back(r);
    std::sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) { return key[a] < key[b]; });

    group_h.clear(); group_k.clear(); group_l.clear();
    int n_groups = 0;
    for (size_t j = 0; j < idx.size(); ++j) {
        if (j == 0 || key[idx[j]] != key[idx[j - 1]]) {
            const int r = idx[j];
            const auto hkl = keygen(rawrun_h[r], rawrun_k[r], rawrun_l[r]);
            group_h.push_back(hkl.plus ? hkl.h : -hkl.h);
            group_k.push_back(hkl.plus ? hkl.k : -hkl.k);
            group_l.push_back(hkl.plus ? hkl.l : -hkl.l);
            ++n_groups;
        }
        rawrun_group[idx[j]] = n_groups - 1;
    }

    // Stamp each partial's group from its raw hkl, keeping the exact per-observation AcceptReflection
    // finiteness (finite I, finite rlp != 0, finite sigma > 0) that grouping by raw hkl alone would miss.
    // Parallel over raw-hkl runs: distinct runs own disjoint perm ranges, hence disjoint observations.
    ParallelChunks(n_run, nthreads, [&](int rlo, int rhi) {
        for (int r = rlo; r < rhi; ++r) {
            const int g = rawrun_group[r];
            const int lo = rawrun_start[r], hi = rawrun_start[r] + rawrun_count[r];
            for (int p = lo; p < hi; ++p) {
                auto &o = partials[perm[p]];
                o.group = (g >= 0 && std::isfinite(o.I) && std::isfinite(o.rlp) && o.rlp != 0.0f
                           && std::isfinite(o.sigma) && o.sigma > 0.0f) ? g : -1;
            }
        }
    });

#ifdef JFJOCH_USE_CUDA
    // Group-ordered permutation (obs bucketed by ASU group, obs-index order) + its CSR, so the GPU
    // reduction is a deterministic segmented reduction (fixed order, no atomics). Built per space group.
    if (gpu_active_) {
        const int n = static_cast<int>(partials.size());
        std::vector<int32_t> group_ids(n), gcount(n_groups, 0);
        for (int i = 0; i < n; ++i) {
            group_ids[i] = partials[i].group;
            if (partials[i].group >= 0) ++gcount[partials[i].group];
        }
        std::vector<int32_t> gstart(n_groups, 0);
        int acc = 0;
        for (int g = 0; g < n_groups; ++g) { gstart[g] = acc; acc += gcount[g]; }
        std::vector<int32_t> gperm(acc), gfill = gstart;
        for (int i = 0; i < n; ++i) {
            const int g = partials[i].group;
            if (g >= 0) gperm[gfill[g]++] = i;
        }
        gpu_->SetGroups(n_groups, group_ids.data(), gperm.data(), acc, gstart.data(), gcount.data());
    }
#endif
    return n_groups;
}

void RotationScaleMerge::ReduceGroupMeans(const std::vector<Obs> &obs, int n_groups,
                                          bool exclude_ice, const std::vector<char> &masked,
                                          std::vector<double> &out_mean) const {
    // Inverse-variance per-group mean of I*corr = the merge reference (a segmented reduction over the
    // groups; the CPU stand-in for a CUDA reduce_by_key). No cell mask here: the scaling reference
    // (MergeAll) is built without a reference cell - only the final merge applies it.
    std::vector<double> sw(n_groups, 0.0), swI(n_groups, 0.0);
    for (const auto &o : obs) {
        if (o.group < 0) continue;
        if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) continue;
        if (exclude_ice && o.on_ice) continue;
        if (!masked.empty()) {
            const int ring = IceRingIndex(o.d, ice_half_width_q);
            if (ring >= 0 && ring < static_cast<int>(masked.size()) && masked[ring]) continue;
        }
        if (o.partiality < min_partiality) continue;
        const float I_corr = o.I * o.corr;
        const float sigma_corr = o.sigma * o.corr;
        if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f) continue;
        const double w = 1.0 / (static_cast<double>(sigma_corr) * sigma_corr);
        sw[o.group] += w;
        swI[o.group] += w * I_corr;
    }
    out_mean.assign(n_groups, NAN);
    for (int g = 0; g < n_groups; ++g)
        if (sw[g] > 0.0) out_mean[g] = swI[g] / sw[g];
}

void RotationScaleMerge::FitPerFrameG(std::vector<Obs> &obs, const std::vector<int32_t> &fstart,
                                      const std::vector<int32_t> &fcount,
                                      const std::vector<double> &group_mean_in,
                                      bool unity, std::vector<double> &g) {
    std::vector<uint8_t> scaled(fstart.size(), 0);
    ParallelFor(static_cast<int>(fstart.size()), nthreads, [&](int f) {
        std::vector<ScaleObs> so;
        so.reserve(fcount[f]);
        const int lo = fstart[f], hi = fstart[f] + fcount[f];
        for (int i = lo; i < hi; ++i) {
            const auto &o = obs[i];
            if (o.group < 0) continue;
            if (o.on_ice) continue;
            const double mean = group_mean_in[o.group];
            if (!std::isfinite(mean)) continue;
            double coeff;
            if (unity) {
                coeff = mean;                       // partiality already folded into the full, rlp = 1
            } else {
                if (!(std::isfinite(o.zeta) && o.zeta > 0.0f)) continue;   // Rotation model needs zeta > 0
                coeff = o.partiality * SafeInv(o.rlp, 1.0) * mean;
            }
            so.push_back({coeff, static_cast<double>(o.I), SafeInv(o.sigma, 1.0)});
        }
        if (so.size() < MIN_REFLECTIONS) return;   // leave g[f]/corr untouched (as ScaleOnTheFly does)
        g[f] = SolveScaleIRLS(so, SCALE_ROBUST_K);
        scaled[f] = 1;
    });
    // Remember which frames were fitted this call (so the caller updates corr only there).
    frame_scaled_scratch = std::move(scaled);
}

void RotationScaleMerge::UpdateCorr(std::vector<Obs> &obs, const std::vector<double> &g,
                                    const std::vector<uint8_t> &frame_scaled) const {
    ParallelChunks(static_cast<int>(obs.size()), nthreads, [&](int lo, int hi) {
        for (int i = lo; i < hi; ++i) {
            auto &o = obs[i];
            if (!frame_scaled[o.frame]) continue;
            const double denom = static_cast<double>(o.partiality) * g[o.frame];   // B_term = 1 (no B refine)
            if (std::isfinite(o.rlp) && std::isfinite(denom) && denom > 0.0)
                o.corr = static_cast<float>(o.rlp / denom);
            else
                o.corr = NAN;
        }
    });
}

void RotationScaleMerge::SmoothG(std::vector<Obs> &obs, std::vector<double> &g, int window) const {
    const int n = static_cast<int>(g.size());
    const int half = window / 2;
    std::vector<double> g_smooth(n, NAN);
    for (int o = 0; o < n; ++o) {
        double sum_log = 0.0;
        int count = 0;
        for (int j = std::max(0, o - half); j <= std::min(n - 1, o + half); ++j) {
            if (frame_scaled_scratch[j] && std::isfinite(g[j]) && g[j] > 0.0) {
                sum_log += std::log(g[j]);
                ++count;
            }
        }
        if (count > 0) g_smooth[o] = std::exp(sum_log / count);
    }
    for (auto &o : obs) {
        const int f = o.frame;
        if (!frame_scaled_scratch[f] || !std::isfinite(g[f]) || g[f] <= 0.0 || !std::isfinite(g_smooth[f]))
            continue;
        if (std::isfinite(o.corr))
            o.corr = static_cast<float>(o.corr * (g[f] / g_smooth[f]));
    }
    for (int f = 0; f < n; ++f)
        if (frame_scaled_scratch[f] && std::isfinite(g[f]) && g[f] > 0.0 && std::isfinite(g_smooth[f]))
            g[f] = g_smooth[f];
}

void RotationScaleMerge::Combine() {
    fulls.clear();
    g_full.assign(n_frames, 1.0);

    // Combine one raw-hkl run into fulls, appended to `out`. Independent per run (the events of one hkl
    // touch no shared state), so runs parallelise cleanly. Returns the number of usable partials seen.
    // `dump` (serial path only) writes each emitted full for the diagnostic observation dump.
    auto process_rawrun = [&](int r, std::vector<Obs> &out, std::ofstream *dump) -> size_t {
        const int lo = rawrun_start[r], hi = rawrun_start[r] + rawrun_count[r];
        std::vector<int32_t> ev;   // usable perm-indices of this raw hkl, in image-number order
        ev.reserve(hi - lo);
        for (int p = lo; p < hi; ++p) {
            const auto &o = partials[perm[p]];
            if (!std::isfinite(o.corr) || o.corr <= 0.0f) continue;
            if (!std::isfinite(o.I) || !std::isfinite(o.sigma) || o.sigma <= 0.0f) continue;
            ev.push_back(perm[p]);
        }
        const int group = rawrun_group[r];

        size_t i = 0;
        while (i < ev.size()) {
            size_t kk = i + 1;
            float last_frame = partials[ev[i]].image_number;
            while (kk < ev.size()) {
                const float frame = partials[ev[kk]].image_number;
                if (frame - last_frame > MAX_FRAME_GAP) break;
                last_frame = frame;
                ++kk;
            }

            double pooled_bkg = 0.0;
            int n_pool = 0;
            for (size_t m = i; m < kk; ++m) {
                const float b = partials[ev[m]].bkg;
                if (std::isfinite(b)) { pooled_bkg += b; ++n_pool; }
            }
            pooled_bkg = n_pool > 0 ? pooled_bkg / n_pool : 0.0;
            auto pooled_I = [&](const Obs &r2) {
                const double n_bkg = std::max(0.0, static_cast<double>(r2.sigma) * r2.sigma - r2.I)
                                     / std::max(r2.bkg, 1.0f);
                return static_cast<double>(r2.I) + n_bkg * (static_cast<double>(r2.bkg) - pooled_bkg);
            };

            double sum_w = 0.0, sum_wI = 0.0, sum_partiality = 0.0;
            float d = NAN;
            int peak_outcome = partials[ev[i]].frame;
            float peak_frame = partials[ev[i]].image_number;
            float peak_partiality = -1.0f;
            const bool on_ice = partials[ev[i]].on_ice;
            for (size_t m = i; m < kk; ++m) {
                const auto &r2 = partials[ev[m]];
                const double sigma_corr = static_cast<double>(r2.sigma) * r2.corr;
                const double w = 1.0 / (sigma_corr * sigma_corr);
                sum_w += w;
                sum_wI += w * pooled_I(r2) * r2.corr;
                sum_partiality += r2.partiality;
                if (r2.partiality > peak_partiality) {
                    peak_partiality = r2.partiality;
                    peak_outcome = r2.frame;
                    peak_frame = r2.image_number;
                }
                if (!std::isfinite(d) && std::isfinite(r2.d) && r2.d > 0.0f) d = r2.d;
            }
            double F = sum_wI / sum_w;

            for (int iter = 0; iter < 3; ++iter) {
                sum_w = 0.0; sum_wI = 0.0;
                for (size_t m = i; m < kk; ++m) {
                    const auto &r2 = partials[ev[m]];
                    const double corr = r2.corr;
                    const double I_corr = pooled_I(r2) * corr;
                    const double sigma_corr = static_cast<double>(r2.sigma) * corr;
                    const double bkg_var = sigma_corr * sigma_corr - corr * I_corr;
                    double var = std::max(0.0, bkg_var) + corr * std::max(0.0, F);
                    if (!(var > 0.0)) var = sigma_corr * sigma_corr;
                    const double w = 1.0 / var;
                    sum_w += w;
                    sum_wI += w * I_corr;
                }
                F = sum_wI / sum_w;
            }

            const int n_frames_event = static_cast<int>(kk - i);
            i = kk;

            if (sum_w <= 0.0 || sum_partiality < min_partiality)
                continue;

            double sigma_full = 1.0 / std::sqrt(sum_w);
            if (capture_uncertainty_coeff > 0.0) {
                const double frac = std::min(1.0, sum_partiality);
                const double extra = capture_uncertainty_coeff * (1.0 - frac) * std::max(0.0, F);
                sigma_full = std::sqrt(sigma_full * sigma_full + extra * extra);
            }

            Obs full{};
            full.h = rawrun_h[r]; full.k = rawrun_k[r]; full.l = rawrun_l[r];
            full.I = static_cast<float>(F);
            full.sigma = static_cast<float>(sigma_full);
            full.d = d;
            full.rlp = 1.0f;
            full.partiality = 1.0f;
            full.corr = 1.0f;
            full.image_number = peak_frame;
            full.frame = peak_outcome;
            full.on_ice = on_ice ? 1 : 0;
            full.group = group;   // the raw hkl's ASU group for the current space group (<0 = absent)
            out.push_back(full);

            if (dump != nullptr)
                *dump << full.h << ' ' << full.k << ' ' << full.l << ' ' << full.I << ' '
                      << full.sigma << ' ' << d << ' ' << n_frames_event << ' ' << sum_partiality << ' '
                      << static_cast<int>(peak_frame) << '\n';
        }
        return ev.size();
    };

    const int n_run = static_cast<int>(rawrun_start.size());
    size_t n_used = 0;

    if (!observation_dump_path.empty() || nthreads <= 1) {
        // Serial (diagnostic dump needs a single writer).
        std::ofstream dump;
        if (!observation_dump_path.empty()) {
            dump.open(observation_dump_path);
            dump << "# h k l I sigma d n_frames captured_fraction\n";
        }
        fulls.reserve(n_run);
        for (int r = 0; r < n_run; ++r)
            n_used += process_rawrun(r, fulls, dump.is_open() ? &dump : nullptr);
    } else {
        // Parallel over contiguous rawrun chunks; concatenate the per-thread fulls in run order so the
        // result is deterministic.
        const int nt = static_cast<int>(std::min(nthreads, static_cast<size_t>(n_run)));
        const int chunk = (n_run + nt - 1) / nt;
        std::vector<std::vector<Obs>> part(nt);
        std::vector<size_t> used(nt, 0);
        std::vector<std::future<void>> futures;
        futures.reserve(nt);
        for (int t = 0; t < nt; ++t) {
            const int r0 = t * chunk, r1 = std::min(n_run, r0 + chunk);
            if (r0 >= r1) break;
            futures.emplace_back(std::async(std::launch::async, [&, t, r0, r1] {
                part[t].reserve(r1 - r0);
                for (int r = r0; r < r1; ++r) used[t] += process_rawrun(r, part[t], nullptr);
            }));
        }
        for (auto &f : futures) f.get();
        size_t total = 0;
        for (int t = 0; t < nt; ++t) { total += part[t].size(); n_used += used[t]; }
        fulls.reserve(total);
        for (int t = 0; t < nt; ++t)
            fulls.insert(fulls.end(), part[t].begin(), part[t].end());
    }

    SortFullsByFrame();
    logger.Info("3D combine: {} fulls from {} partials", fulls.size(), n_used);
}

void RotationScaleMerge::SortFullsByFrame() {
    // Sort the fulls by their (peak) frame and build per-frame CSR ranges, so the scale-fulls step can
    // fit a per-frame G by slicing contiguous ranges (the same layout the partials use).
    std::sort(fulls.begin(), fulls.end(),
              [](const Obs &a, const Obs &b) { return a.frame < b.frame; });
    fulls_frame_start.assign(n_frames, 0);
    fulls_frame_count.assign(n_frames, 0);
    for (int i = 0; i < static_cast<int>(fulls.size()); ) {
        const int f = fulls[i].frame;
        fulls_frame_start[f] = i;
        int j = i;
        while (j < static_cast<int>(fulls.size()) && fulls[j].frame == f) ++j;
        fulls_frame_count[f] = j - i;
        i = j;
    }
}

void RotationScaleMerge::FinalizePerFrameScale(int n_groups, const std::vector<double> &partial_group_mean,
                                               const std::vector<uint8_t> &frame_scaled) {
    // Per-frame CC vs the merged reference (CalculateGlobalCC), computed once now (not every iteration).
    std::vector<double> cc(n_frames, NAN);
    std::vector<int64_t> cc_n(n_frames, 0);
    ParallelFor(n_frames, nthreads, [&](int f) {
        double sx = 0, sy = 0, sx2 = 0, sy2 = 0, sxy = 0;
        size_t n = 0;
        const int lo = frame_start[f], hi = frame_start[f] + frame_count[f];
        for (int i = lo; i < hi; ++i) {
            const auto &o = partials[i];
            if (o.on_ice) continue;
            if (o.group < 0) continue;
            if (o.partiality < min_partiality) continue;
            if (!std::isfinite(o.I) || !std::isfinite(o.corr) || o.corr <= 0.0f) continue;
            if (!std::isfinite(o.sigma) || o.sigma <= 0.0f) continue;
            const double mean = partial_group_mean[o.group];
            if (!std::isfinite(mean)) continue;
            const double img = static_cast<double>(o.I) * o.corr;
            sx += img; sy += mean; sx2 += img * img; sy2 += mean * mean; sxy += img * mean;
            ++n;
        }
        if (n < MIN_REFLECTIONS) return;
        const double nd = static_cast<double>(n);
        const double cov = sxy - sx * sy / nd;
        const double vx = sx2 - sx * sx / nd;
        const double vy = sy2 - sy * sy / nd;
        if (vx > 0.0 && vy > 0.0) { cc[f] = cov / std::sqrt(vx * vy); cc_n[f] = static_cast<int64_t>(n); }
    });

    for (int f = 0; f < n_frames; ++f) {
        auto &o = partials_out[f];
        if (frame_scaled[f]) {
            o.image_scale_g = static_cast<float>(g_partial[f]);
            o.mosaicity_deg = (f < static_cast<int>(mos_smooth.size()) && std::isfinite(mos_smooth[f]))
                                  ? mos_smooth[f] : static_cast<float>(mosaicity_deg);
            if (std::isfinite(cc[f])) { o.image_scale_cc = static_cast<float>(cc[f]); o.image_scale_cc_n = cc_n[f]; }
            else { o.image_scale_cc.reset(); o.image_scale_cc_n.reset(); }
        } else {
            o.image_scale_g.reset();
            o.image_scale_cc.reset();
            o.image_scale_cc_n.reset();
            o.mosaicity_deg.reset();
        }
        o.image_scale_b_factor_Ang2.reset();
        o.image_scale_wedge_deg.reset();
    }
}

namespace {
    // Possible unique reflections per shell for the completeness column - mirrors CalcPossibleReflections.
    void PossiblePerShell(int space_group_number, const UnitCell &cell, double d_min, double d_max,
                          const ResolutionShells &shells, bool merge_friedel, std::vector<int> &possible) {
        gemmi::UnitCell gemmi_cell = cell;
        const gemmi::SpaceGroup *sg = gemmi::find_spacegroup_by_number(space_group_number);
        if (sg == nullptr) return;
        const std::vector<gemmi::Miller> hkls = gemmi::make_miller_vector(gemmi_cell, sg, d_min, d_max, true);
        const gemmi::GroupOps gops = sg->operations();
        CrystalLattice lattice(cell);
        const auto astar = lattice.Astar(), bstar = lattice.Bstar(), cstar = lattice.Cstar();
        for (const auto &hkl : hkls) {
            const auto q = hkl[0] * astar + hkl[1] * bstar + hkl[2] * cstar;
            const auto qlen = q.Length();
            if (qlen < 1e-6) continue;
            const auto shell = shells.GetShell(1.0 / qlen);
            if (!shell.has_value()) continue;
            const int s = *shell;
            if (s >= 0 && s < static_cast<int>(possible.size()))
                possible[s] += (merge_friedel || gops.is_reflection_centric(hkl)) ? 1 : 2;
        }
    }
}

RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool for_search,
                                                            const std::vector<char> &masked) {
    // A full is usable for the merge / error model if it passes AddImage's filters (with the current
    // ice/masked-ring context). group >= 0 already encodes "not absent and passes AcceptReflection".
    auto masked_ring = [&](const Obs &o) {
        if (masked.empty()) return false;
        const int ring = IceRingIndex(o.d, ice_half_width_q);
        return ring >= 0 && ring < static_cast<int>(masked.size()) && masked[ring];
    };
    auto usable_merge = [&](const Obs &o) {
        if (o.group < 0) return false;
        if (!frame_cell_ok[o.frame]) return false;
        if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) return false;
        if (for_search && o.on_ice) return false;
        if (masked_ring(o)) return false;
        if (o.partiality < min_partiality) return false;
        const float I_corr = o.I * o.corr, sigma_corr = o.sigma * o.corr;
        return std::isfinite(I_corr) && std::isfinite(sigma_corr) && sigma_corr > 0.0f;
    };

    // ---- Error model: fit dev2 = a*sigma^2 + b^2*<I>^2 from symmetry-equivalent scatter. ----
    std::vector<double> em_mean(n_groups, NAN);
    std::vector<float> reject_median(n_groups, NAN);
    double error_model_a = 1.0, error_model_b = 0.0, error_model_chi2 = 0.0;
    bool error_model_active = false;
    {
        // Per-group inverse-variance mean over usable fulls (>=2 obs), and the leverage-corrected samples.
        std::vector<double> sw(n_groups, 0.0), swI(n_groups, 0.0);
        std::vector<int> cnt(n_groups, 0);
        for (const auto &o : fulls) {
            if (!usable_merge(o)) continue;
            const double sigma_corr = static_cast<double>(o.sigma) * o.corr;
            const double w = 1.0 / (sigma_corr * sigma_corr);
            sw[o.group] += w; swI[o.group] += w * (static_cast<double>(o.I) * o.corr); cnt[o.group]++;
        }
        for (int g = 0; g < n_groups; ++g)
            if (cnt[g] >= 2 && sw[g] > 0.0) em_mean[g] = swI[g] / sw[g];

        if (reject_outliers) {
            std::vector<std::vector<float>> iv(n_groups);
            for (const auto &o : fulls)
                if (usable_merge(o) && cnt[o.group] >= 2)
                    iv[o.group].push_back(o.I * o.corr);
            for (int g = 0; g < n_groups; ++g)
                if (!iv[g].empty()) {
                    std::nth_element(iv[g].begin(), iv[g].begin() + iv[g].size() / 2, iv[g].end());
                    reject_median[g] = iv[g][iv[g].size() / 2];
                }
        }

        struct Sample { double s2, I2, dev2; };
        std::vector<Sample> samples;
        samples.reserve(fulls.size());
        for (const auto &o : fulls) {
            if (!usable_merge(o) || cnt[o.group] < 2) continue;
            const double mean = em_mean[o.group];
            if (!std::isfinite(mean)) continue;
            const double sigma_corr = static_cast<double>(o.sigma) * o.corr;
            const double s2 = sigma_corr * sigma_corr;
            const double w = 1.0 / s2;
            const double factor = 1.0 - w / sw[o.group];
            if (factor < 0.05) continue;
            const double resid = static_cast<double>(o.I) * o.corr - mean;
            samples.push_back({s2, mean * mean, resid * resid / factor});
        }

        constexpr int n_bins = 16;
        if (samples.size() >= static_cast<size_t>(8 * n_bins)) {
            std::sort(samples.begin(), samples.end(),
                      [](const Sample &a, const Sample &b) { return a.I2 < b.I2; });
            std::vector<double> bs2, bI2, bd2;
            const size_t per = samples.size() / n_bins;
            for (int bin = 0; bin < n_bins; ++bin) {
                const size_t lo = bin * per;
                const size_t hi = (bin == n_bins - 1) ? samples.size() : lo + per;
                std::vector<double> vs2, vI2, vd2;
                for (size_t i = lo; i < hi; ++i) {
                    vs2.push_back(samples[i].s2); vI2.push_back(samples[i].I2); vd2.push_back(samples[i].dev2);
                }
                bs2.push_back(median_of(vs2));
                bI2.push_back(median_of(vI2));
                bd2.push_back(median_of(vd2) / CHI2_1_MEDIAN);
            }
            std::vector<double> bd2_sorted = bd2;
            const double dev2_floor = std::max(1e-30, 1e-3 * median_of(bd2_sorted));
            double Ass = 0, AsI = 0, AII = 0, Bs = 0, BI = 0;
            for (int bin = 0; bin < n_bins; ++bin) {
                const double s2 = bs2[bin], I2 = bI2[bin], d2 = bd2[bin];
                const double d2w = std::max(d2, dev2_floor);
                const double wgt = 1.0 / (d2w * d2w);
                Ass += wgt * s2 * s2; AsI += wgt * s2 * I2; AII += wgt * I2 * I2;
                Bs += wgt * s2 * d2; BI += wgt * I2 * d2;
            }
            const double det = Ass * AII - AsI * AsI;
            if (det > 1e-10 * Ass * AII) {
                error_model_a = std::clamp((Bs * AII - BI * AsI) / det, 0.25, 100.0);
                const double b2 = std::max((Ass * BI - AsI * Bs) / det, 0.0);
                error_model_b = std::sqrt(b2);
                error_model_active = true;
                std::vector<double> chi2;
                chi2.reserve(samples.size());
                for (const auto &s : samples) {
                    const double v = error_model_a * s.s2 + b2 * s.I2;
                    if (v > 0.0) chi2.push_back(s.dev2 / v);
                }
                error_model_chi2 = chi2.empty() ? 0.0 : median_of(chi2) / CHI2_1_MEDIAN;
            }
        }
    }
    if (error_model_active)
        logger.Info("Error model: a={:.3f} b={:.3f} ISa={:.1f} chi2={:.2f}", error_model_a, error_model_b,
                    error_model_b > 0 ? 1.0 / error_model_b : 0.0, error_model_chi2);

    auto corrected_sigma = [&](float I_corr, float sigma_corr, int g) -> float {
        if (!error_model_active) return sigma_corr;
        const double I_for_b = std::isfinite(em_mean[g]) ? em_mean[g] : I_corr;
        const double v = error_model_a * static_cast<double>(sigma_corr) * sigma_corr
                         + (error_model_b * I_for_b) * (error_model_b * I_for_b);
        return v > 0.0 ? static_cast<float>(std::sqrt(v)) : sigma_corr;
    };

    // ---- Merge: per-group inverse-variance sums with corrected sigma + deterministic half sets. ----
    struct Accum { double swI = 0, sw = 0, swIh[2] = {0, 0}, swh[2] = {0, 0}; size_t nh[2] = {0, 0}; float d = NAN; };
    std::vector<Accum> acc(n_groups);
    size_t reject_count = 0;
    for (const auto &o : fulls) {
        if (!usable_merge(o)) continue;
        const float I_corr = o.I * o.corr;
        float sigma_corr = o.sigma * o.corr;
        sigma_corr = corrected_sigma(I_corr, sigma_corr, o.group);
        if (reject_outliers && error_model_active && std::isfinite(reject_median[o.group])
            && std::fabs(I_corr - reject_median[o.group]) > reject_nsigma * sigma_corr) {
            ++reject_count;
            continue;
        }
        const double w = 1.0 / (static_cast<double>(sigma_corr) * sigma_corr);
        const double wI = w * I_corr;
        const int half = HalfForImage(o.frame);
        auto &a = acc[o.group];
        a.swI += wI; a.sw += w;
        a.swIh[half] += wI; a.swh[half] += w; a.nh[half]++;
        if (!std::isfinite(a.d) && std::isfinite(o.d) && o.d > 0.0f) a.d = o.d;
    }

    // ---- Export merged reflections (+ resolution-shell R-free flags). ----
    Result result;
    result.isa = error_model_b > 0 ? 1.0 / error_model_b : 0.0;
    std::vector<double> merged_I(n_groups, NAN);
    float d_min = std::numeric_limits<float>::max(), d_max = 0.0f;
    for (int g = 0; g < n_groups; ++g) {
        const auto &a = acc[g];
        if (a.sw <= 0.0) continue;
        MergedReflection mr{};
        mr.h = group_h[g]; mr.k = group_k[g]; mr.l = group_l[g];
        mr.I = static_cast<float>(a.swI / a.sw);
        mr.sigma = 1.0f / std::sqrt(static_cast<float>(a.sw));
        mr.I_half[0] = mr.I_half[1] = NAN;
        mr.sigma_half[0] = mr.sigma_half[1] = NAN;
        mr.d = a.d;
        if (a.nh[0] + a.nh[1] > 0 && a.swh[0] > 0.0 && a.swh[1] > 0.0) {
            for (int i = 0; i < 2; ++i) {
                mr.I_half[i] = static_cast<float>(a.swIh[i] / a.swh[i]);
                mr.sigma_half[i] = 1.0f / std::sqrt(static_cast<float>(a.swh[i]));
            }
        }
        if (!std::isfinite(a.d) || a.d <= 0.0f) continue;
        d_min = std::min(d_min, a.d);
        d_max = std::max(d_max, a.d);
        merged_I[g] = mr.I;
        result.merged.push_back(mr);
    }

    if (rfree_fraction > 0.0 && !result.merged.empty() && d_min < d_max && d_min > 0.0f) {
        constexpr int n_shells = 20;
        ResolutionShells shells(d_min * 0.999f, d_max * 1.001f, n_shells);
        std::vector<std::vector<size_t>> shell_groups(n_shells);
        for (size_t i = 0; i < result.merged.size(); ++i) {
            const auto shell = shells.GetShell(result.merged[i].d);
            if (shell && *shell >= 0 && *shell < n_shells) shell_groups[*shell].push_back(i);
        }
        std::mt19937 rng(12345u);
        std::bernoulli_distribution dist(rfree_fraction);
        for (const auto &grp : shell_groups)
            for (const size_t i : grp) result.merged[i].rfree_flag = dist(rng);
    }

    if (reject_count > 0)
        logger.Info("Merge outlier rejection: dropped {} observations", reject_count);

    // ---- Statistics (10 shells): completeness, multiplicity, <I/sigma>, R_meas, CC1/2. ----
    constexpr int n_shells = 10;
    float sd_min = std::numeric_limits<float>::max(), sd_max = 0.0f;
    for (const auto &m : result.merged) {
        if (!std::isfinite(m.d) || m.d <= 0.0f) continue;
        if (d_min_limit && m.d < *d_min_limit) continue;
        sd_min = std::min(sd_min, m.d); sd_max = std::max(sd_max, m.d);
    }
    if (!(sd_min < sd_max && sd_min > 0.0f))
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "RotationScaleMerge: resolution calculation failed");
    const float d_min_pad = sd_min * 0.999f, d_max_pad = sd_max * 1.001f;
    ResolutionShells shells(d_min_pad, d_max_pad, n_shells);
    const auto shell_mean_1_d2 = shells.GetShellMeanOneOverResSq();
    const auto shell_min_res = shells.GetShellMinRes();

    struct ShellAcc {
        int unique = 0, total_obs = 0, possible = 0;
        double sum_i_over_sigma = 0.0; int n_i_over_sigma = 0;
        CorrelationCoefficient cc_half;
    };
    std::vector<ShellAcc> sa(n_shells);
    std::vector<int> possible(n_shells, 0);
    if (reference_cell)
        PossiblePerShell(x.GetSpaceGroupNumber().value_or(1), *reference_cell, d_min_pad, d_max_pad,
                         shells, merge_friedel, possible);
    for (int s = 0; s < n_shells; ++s) sa[s].possible = possible[s];

    CorrelationCoefficient cc_half_overall;
    for (const auto &m : result.merged) {
        const auto shell = shells.GetShell(m.d);
        if (!shell || *shell < 0 || *shell >= n_shells) continue;
        if (std::isfinite(m.I) && std::isfinite(m.sigma) && m.sigma > 0.0) {
            auto &s = sa[*shell];
            s.unique++;
            s.sum_i_over_sigma += m.I / m.sigma; s.n_i_over_sigma++;
            if (std::isfinite(m.I_half[0]) && std::isfinite(m.I_half[1])) {
                s.cc_half.Add(m.I_half[0], m.I_half[1]);
                cc_half_overall.Add(m.I_half[0], m.I_half[1]);
            }
        }
    }

    // R_meas: re-walk the fulls (Mask = cell only; no ice / masked-ring / error-model), accumulate
    // |I_i - <I>| per reflection.
    struct RmeasObs { double sum_abs_dev = 0, sum_I = 0; int n = 0, shell = -1; };
    std::vector<RmeasObs> rmeas(n_groups);
    for (const auto &o : fulls) {
        if (o.group < 0) continue;
        if (!frame_cell_ok[o.frame]) continue;
        if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) continue;
        if (o.partiality < min_partiality) continue;
        const float I_corr = o.I * o.corr, sigma_corr = o.sigma * o.corr;
        if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f) continue;
        const auto shell = shells.GetShell(o.d);
        if (!shell || *shell < 0 || *shell >= n_shells) continue;
        sa[*shell].total_obs++;
        if (std::isfinite(merged_I[o.group])) {
            auto &r = rmeas[o.group];
            r.sum_abs_dev += std::fabs(static_cast<double>(I_corr) - merged_I[o.group]);
            r.sum_I += I_corr; r.n++; r.shell = *shell;
        }
    }
    std::vector<double> rmeas_num(n_shells, 0.0), rmeas_den(n_shells, 0.0);
    double rmeas_num_all = 0.0, rmeas_den_all = 0.0;
    for (const auto &r : rmeas) {
        if (r.n < 2 || r.shell < 0 || r.shell >= n_shells) continue;
        const double factor = std::sqrt(static_cast<double>(r.n) / (r.n - 1));
        rmeas_num[r.shell] += factor * r.sum_abs_dev; rmeas_den[r.shell] += r.sum_I;
        rmeas_num_all += factor * r.sum_abs_dev; rmeas_den_all += r.sum_I;
    }

    MergeStatistics &out = result.statistics;
    out.shells.resize(n_shells);
    for (int s = 0; s < n_shells; ++s) {
        auto &ss = out.shells[s];
        ss.mean_one_over_d2 = shell_mean_1_d2[s];
        ss.d_min = shell_min_res[s];
        ss.d_max = s == 0 ? d_max_pad : shell_min_res[s - 1];
        ss.total_observations = sa[s].total_obs;
        ss.unique_reflections = sa[s].unique;
        ss.possible_unique_reflections = sa[s].possible;
        ss.mean_i_over_sigma = sa[s].n_i_over_sigma > 0 ? sa[s].sum_i_over_sigma / sa[s].n_i_over_sigma : 0.0;
        ss.cc_half = sa[s].cc_half.GetCC();
        ss.cc_ref = NAN;
        ss.r_meas = rmeas_den[s] > 0.0 ? rmeas_num[s] / rmeas_den[s] : NAN;
    }
    auto &overall = out.overall;
    overall.d_min = sd_min; overall.d_max = sd_max;
    double sum_ios = 0.0; int n_ios = 0;
    for (int s = 0; s < n_shells; ++s) {
        overall.total_observations += sa[s].total_obs;
        overall.unique_reflections += sa[s].unique;
        overall.possible_unique_reflections += sa[s].possible;
        sum_ios += sa[s].sum_i_over_sigma; n_ios += sa[s].n_i_over_sigma;
    }
    overall.mean_i_over_sigma = n_ios > 0 ? sum_ios / n_ios : 0.0;
    overall.cc_half = cc_half_overall.GetCC();
    overall.cc_ref = NAN;
    overall.r_meas = rmeas_den_all > 0.0 ? rmeas_num_all / rmeas_den_all : NAN;

    logger.Info("Merge complete ({} unique reflections)", result.merged.size());
    return result;
}

RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
                                                   const std::vector<char> &masked_ice_rings) {
    auto t_last = std::chrono::steady_clock::now();
    auto lap = [&](const char *what) {
        const auto now = std::chrono::steady_clock::now();
        logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
        t_last = now;
    };

    const int sg_number = x.GetSpaceGroupNumber().value_or(1);
    HKLKeyGenerator keygen(merge_friedel, sg_number);

    // --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
    const int n_groups = ComputeAsuGroups(keygen);   // one ASU grouping, shared by partials and fulls
    lap("group hkl");
    std::vector<double> partial_mean;
    bool scaled_on_gpu = false;
#ifdef JFJOCH_USE_CUDA
    if (gpu_active_) {
        // Refresh corr on the device (smooth-G mutated it on the host between passes), run the whole
        // scaling loop on the GPU, then read corr + per-frame G/scaled back.
        std::vector<float> corr(partials.size());
        for (size_t i = 0; i < partials.size(); ++i) corr[i] = partials[i].corr;
        gpu_->SetCorr(corr.data());
        gpu_->ScalePartials(scaling_iter, SCALE_ROBUST_K, min_partiality, d_min_limit.has_value());
        gpu_->GetCorr(corr.data());
        frame_scaled_scratch.assign(n_frames, 0);
        gpu_->GetG(g_partial.data(), frame_scaled_scratch.data());
        for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
        scaled_on_gpu = true;
    }
#endif
    if (!scaled_on_gpu) {
        for (int it = 0; it < scaling_iter; ++it) {
            ReduceGroupMeans(partials, n_groups, false, {}, partial_mean);
            FitPerFrameG(partials, frame_start, frame_count, partial_mean, /*unity=*/false, g_partial);
            UpdateCorr(partials, g_partial, frame_scaled_scratch);
        }
    }
    lap("scale partials");
    const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;

    // --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
    const auto s = x.GetScalingSettings();
    const double smooth_g_deg = s.GetSmoothGDegrees();
    const auto gonio = x.GetGoniometer();
    const double osc_deg = gonio ? std::fabs(gonio->GetIncrement_deg()) : 0.0;
    if (smooth_g_deg > 0.0 && osc_deg > 1e-6) {
        int window = std::max(1, static_cast<int>(std::lround(smooth_g_deg / osc_deg)));
        if (window % 2 == 0) ++window;
        SmoothG(partials, g_partial, window);
    }

    // Per-frame CC + write G/CC/mosaicity back onto the partials (once).
    ReduceGroupMeans(partials, n_groups, false, {}, partial_mean);
    FinalizePerFrameScale(n_groups, partial_mean, partial_scaled);

    // --- 3. 3D combine of per-frame partials into fulls (fulls inherit their ASU group here). ---
    bool combined_on_gpu = false;
    bool scaled_fulls_on_gpu = false;
#ifdef JFJOCH_USE_CUDA
    // GPU combine (+ scale-fulls) keeps the fulls resident on the device: combine, then build the frame /
    // ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
    // stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
    // scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
    if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
        std::vector<float> corr(partials.size());          // refresh the smoothed corr on the device
        for (size_t i = 0; i < partials.size(); ++i) corr[i] = partials[i].corr;
        gpu_->SetCorr(corr.data());
        const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
        g_full.assign(n_frames, 1.0);

        if (scale_fulls && nf > 0) {
            // Frame + group CSRs over the emit-ordered fulls, built by counting sort on the host (stable,
            // deterministic). frame is always in [0, n_frames); group is <0 for absent/out-of-range fulls.
            std::vector<int32_t> ff(nf), fg(nf);
            gpu_->GetFullsKeys(ff.data(), fg.data());
            std::vector<int32_t> f_start(n_frames, 0), f_count(n_frames, 0), f_perm(nf);
            for (int i = 0; i < nf; ++i) ++f_count[ff[i]];
            for (int f = 1; f < n_frames; ++f) f_start[f] = f_start[f - 1] + f_count[f - 1];
            { std::vector<int32_t> fill = f_start; for (int i = 0; i < nf; ++i) f_perm[fill[ff[i]]++] = i; }
            gpu_->SetFullsFrameCSR(f_perm.data(), nf, f_start.data(), f_count.data());

            std::vector<int32_t> g_count(n_groups, 0), g_start(n_groups, 0);
            for (int i = 0; i < nf; ++i) if (fg[i] >= 0) ++g_count[fg[i]];
            int acc = 0;
            for (int g = 0; g < n_groups; ++g) { g_start[g] = acc; acc += g_count[g]; }
            std::vector<int32_t> g_perm(acc);
            { std::vector<int32_t> fill = g_start; for (int i = 0; i < nf; ++i) if (fg[i] >= 0) g_perm[fill[fg[i]]++] = i; }
            gpu_->SetFullsGroups(g_perm.data(), acc, g_start.data(), g_count.data());

            gpu_->ScaleFulls(scaling_iter, SCALE_ROBUST_K, min_partiality);
            scaled_fulls_on_gpu = true;
        }

        fulls.assign(nf, Obs{});
        std::vector<int32_t> fh(nf), fk(nf), fl(nf), fframe(nf), fgroup(nf);
        std::vector<float> fI(nf), fsig(nf), fd(nf), fimg(nf), fcorr(nf, 1.0f);
        std::vector<uint8_t> fon(nf);
        gpu_->GetFulls(fh.data(), fk.data(), fl.data(), fI.data(), fsig.data(), fd.data(),
                       fimg.data(), fframe.data(), fon.data(), fgroup.data());
        if (scaled_fulls_on_gpu) gpu_->GetFullsCorr(fcorr.data());
        for (int i = 0; i < nf; ++i) {
            Obs &o = fulls[i];
            o.h = fh[i]; o.k = fk[i]; o.l = fl[i];
            o.I = fI[i]; o.sigma = fsig[i]; o.d = fd[i];
            o.rlp = 1.0f; o.partiality = 1.0f; o.corr = fcorr[i];
            o.image_number = fimg[i]; o.frame = fframe[i];
            o.on_ice = fon[i]; o.group = fgroup[i];
        }
        logger.Info("3D combine{} (GPU): {} fulls", scaled_fulls_on_gpu ? " + scale-fulls" : "", nf);
        combined_on_gpu = true;
    }
#endif
    if (!combined_on_gpu)
        Combine();
    lap("combine");

    // --- 4. Scale the fulls (XDS order, Unity model). ---
    if (scale_fulls && !scaled_fulls_on_gpu) {
        std::vector<double> full_mean;
        for (int it = 0; it < scaling_iter; ++it) {
            ReduceGroupMeans(fulls, n_groups, false, {}, full_mean);
            FitPerFrameG(fulls, fulls_frame_start, fulls_frame_count, full_mean, /*unity=*/true, g_full);
            UpdateCorr(fulls, g_full, frame_scaled_scratch);
        }
        logger.Info("Scaled fulls (XDS order, Unity model)");
    }
    lap("scale fulls");

    // --- 5. Error model + merge + statistics. ---
    auto r = MergeAndStats(n_groups, for_search, masked_ice_rings);
    lap("merge+stats");
    return r;
}