RotationScaleMerge: drop profiling + env gates, honour forced mosaicity

Remove the [rsm] per-stage lap timing and the JFJOCH_RSM_NO_GPU / JFJOCH_RSM_CPU_COMBINE env gates now that the GPU-resident path is the validated default (it runs whenever a GPU is present, with the CPU loops as the bit-parity fallback; the diagnostic-dump path still uses the CPU combine).

Honour a fixed (forced) mosaicity: SmoothMosaicityAndPartiality now overrides every frame with GetForcedMosaicity() when set, instead of always reading the per-frame integration value - so the caller can route the --mosaicity case through RotationScaleMerge (its partiality recompute makes it a natural fit) rather than a separate path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 11:41:31 +02:00
parent 34b3c3c4e7
commit fccf9b83e7
2 changed files with 34 additions and 47 deletions
@@ -7,7 +7,6 @@
 #include <atomic>
 #include <cmath>
 #include <cstdint>
-#include <cstdlib>
 #include <fstream>
 #include <future>
 #include <limits>
@@ -24,8 +23,8 @@
 #include "../../common/ResolutionShells.h"

 namespace {
-    // These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
-    // implementation is numerically identical - see the comments there for the physics.
+    // These mirror the per-image ScaleOnTheFly / Merge rocking-curve physics verbatim so this flat
+    // implementation is numerically identical - see the comments there for the details.
    constexpr size_t MIN_REFLECTIONS = 20;   // per-frame scale needs at least this many
    constexpr double SCALE_ROBUST_K = 3.0;   // Cauchy loss scale (sigma units) for the per-frame G fit
    constexpr float  MAX_FRAME_GAP = 2.0f;   // a rocking event is a run of frames no more apart than this
@@ -253,7 +252,7 @@ void RotationScaleMerge::Ingest() {
    // Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
    // fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying, plus
    // the combine only for the diagnostic-dump path.
    gpu_ = std::make_unique<RotationScaleMergeGPU>();
-    gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
+    gpu_active_ = gpu_->Available();
    if (gpu_active_) {
        const int n = static_cast<int>(partials.size());
        std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
@@ -272,19 +271,23 @@ void RotationScaleMerge::Ingest() {
                         rawrun_start.data(), rawrun_count.data(),
                         rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
        gpu_->SetFrameCellOk(frame_cell_ok.data());
-        gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
-        logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
-                    gpu_combine_ ? " + combine + scale-fulls" : "");
+        logger.Info("RotationScaleMerge: GPU scaling + combine + scale-fulls + merge active");
    }
 #endif
 }

 void RotationScaleMerge::SmoothMosaicityAndPartiality() {
-    // Raw per-frame mosaicity as measured at integration (image-local, deterministic).
+    // Per-frame mosaicity to recompute partiality from. A forced (fixed) mosaicity overrides every frame;
+    // otherwise use the per-frame value measured at integration (image-local, deterministic).
+    const auto forced_mosaicity = x.GetScalingSettings().GetForcedMosaicity();
    std::vector<double> mos_raw(n_frames, NAN);
-    for (int o = 0; o < n_frames; ++o) {
-        const auto &m = partials_out[o].mosaicity_deg;
-        if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
+    if (forced_mosaicity.has_value() && std::isfinite(*forced_mosaicity) && *forced_mosaicity > 0.0) {
+        for (int o = 0; o < n_frames; ++o) mos_raw[o] = *forced_mosaicity;
+    } else {
+        for (int o = 0; o < n_frames; ++o) {
+            const auto &m = partials_out[o].mosaicity_deg;
+            if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
+        }
    }

    // Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
@@ -1173,19 +1176,11 @@ RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool

 RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
                                                   const std::vector<char> &masked_ice_rings) {
-    auto t_last = std::chrono::steady_clock::now();
-    auto lap = [&](const char *what) {
-        const auto now = std::chrono::steady_clock::now();
-        logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
-        t_last = now;
-    };
-
    const int sg_number = x.GetSpaceGroupNumber().value_or(1);
    HKLKeyGenerator keygen(merge_friedel, sg_number);

    // --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
    const int n_groups = ComputeAsuGroups(keygen);   // one ASU grouping, shared by partials and fulls
-    lap("group hkl");
    std::vector<double> partial_mean;
    bool scaled_on_gpu = false;
 #ifdef JFJOCH_USE_CUDA
@@ -1206,7 +1201,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
            UpdateCorr(partials, g_partial, frame_scaled_scratch);
        }
    }
-    lap("scale partials");
    const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;

    // --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
@@ -1242,9 +1236,9 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
    }

 #ifdef JFJOCH_USE_CUDA
-    // The GPU keeps corr resident through scaling + smooth-G; only a CPU combine (JFJOCH_RSM_CPU_COMBINE,
-    // or the diagnostic dump) reads host partials[].corr, so refresh it just for that path.
-    if (gpu_active_ && (!gpu_combine_ || !observation_dump_path.empty())) {
+    // The GPU keeps corr resident through scaling + smooth-G; only the diagnostic dump falls back to the
+    // CPU combine, which reads host partials[].corr, so refresh it just for that path.
+    if (gpu_active_ && !observation_dump_path.empty()) {
        std::vector<float> corr(partials.size());
        gpu_->GetCorr(corr.data());
        for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
@@ -1278,7 +1272,7 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
    // ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
    // stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
    // scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
-    if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
+    if (gpu_active_ && observation_dump_path.empty()) {
        // The smoothed corr is already resident (scaling + smooth-G ran on the device, no round-trip).
        const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
        g_full.assign(n_frames, 1.0);
@@ -1327,7 +1321,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
 #endif
    if (!combined_on_gpu)
        Combine();
-    lap("combine");

    // --- 4. Scale the fulls (XDS order, Unity model). ---
    if (scale_fulls && !scaled_fulls_on_gpu) {
@@ -1339,10 +1332,8 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
        }
        logger.Info("Scaled fulls (XDS order, Unity model)");
    }
-    lap("scale fulls");

    // --- 5. Error model + merge + statistics. ---
    auto r = MergeAndStats(n_groups, for_search, masked_ice_rings, combined_on_gpu && scaled_fulls_on_gpu);
-    lap("merge+stats");
    return r;
 }
@@ -19,21 +19,21 @@
 #include "RotationScaleMergeGPU.h"
 #endif

-// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path).
+// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path): recompute the
+// per-frame partiality from the (smoothed) mosaicity, robustly fit a per-image scale G, 3D-combine each
+// rocking event's partials into fulls, refit a per-frame scale on the fulls (XDS order), and merge with
+// a global error model.
 //
-// This is a distinct, faster path from ScaleOnTheFly + MergeAll + CombineRotationObservations +
-// MergeOnTheFly. Those rebuild a std::map keyed by hkl on *every* scaling iteration and every merge
-// (7-14 map rebuilds per space-group pass), which dominates the offline wall clock. Here the per-frame
-// partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is computed once per
-// space group (by a sort, not a map) and reused across all scaling iterations; every hot step is a flat
-// loop over those vectors, so it also maps directly onto CUDA kernels (segmented reduction + per-frame
-// solve). CC1/2 and the per-image CC are computed once at the end, not every iteration.
+// The per-frame partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is
+// computed once per space group (by a sort, not a map) and reused across all scaling iterations; every
+// hot step is a flat loop over those vectors, so the whole pipeline maps onto CUDA kernels (segmented
+// reduction + per-frame solve) and runs GPU-resident when a GPU is present, with the CPU loops as the
+// bit-parity fallback. CC1/2 and the per-image CC are computed once at the end, not every iteration.
 //
-// It reproduces the numerics of the CPU pipeline exactly (same robust IRLS per-frame G, same 3D combine,
-// same XDS-order scale-fulls, same global error model, same merge statistics) - the speed-up is purely
-// from the data layout, not from cutting corners. It is used only for the self-scaling rotation case
-// with per-image G (Rotation partiality, no B refinement, no external reference, no absorption surface);
-// stills, B-factor refinement, reference scaling and the absorption surface stay on the classic path.
+// Used only for the self-scaling rotation case with per-image G (Rotation partiality; a fixed/forced
+// mosaicity is honoured by the recompute). It does NOT support B-factor refinement, external-reference
+// scaling, an absorption surface or wedge refinement - the caller rejects those combinations. Stills use
+// the per-image ScaleOnTheFly (fixed partiality) instead.
 class RotationScaleMerge {
 public:
    struct Result {
@@ -130,15 +130,11 @@ private:
    std::vector<int32_t> group_h, group_k, group_l;

 #ifdef JFJOCH_USE_CUDA
-    // GPU engine for the partial-scaling loop (segmented reduce + per-frame IRLS + corr update). Null /
-    // inactive when no GPU; the CPU loops are the fallback. Built in Ingest.
+    // GPU engine: the whole hot path (scaling, combine, scale-fulls, per-frame CC, smooth-G, merge +
+    // error model) runs on the device, resident, when a GPU is present. Null / inactive otherwise, with
+    // the CPU loops as the bit-parity fallback. Built in Ingest.
    std::unique_ptr<RotationScaleMergeGPU> gpu_;
    bool gpu_active_ = false;
-    // Phase-2 GPU combine + scale-fulls (partials->fulls, scaled, kept resident on the device). On by
-    // default when a GPU is present - validated bit-parity + run-to-run deterministic vs the CPU path;
-    // set JFJOCH_RSM_CPU_COMBINE to fall back to the CPU combine/scale-fulls. See
-    // RotationScaleMergeGPU::Combine.
-    bool gpu_combine_ = false;
 #endif

    // --- helpers (each a flat pass; see the .cpp) ---