From fccf9b83e7c11e30dec957f2d4e2ad3cb4d8770b Mon Sep 17 00:00:00 2001
From: Filip Leonarski <filip.leonarski@psi.ch>
Date: Fri, 3 Jul 2026 11:41:31 +0200
Subject: [PATCH] RotationScaleMerge: drop profiling + env gates, honour forced
 mosaicity

Remove the [rsm] per-stage lap timing and the JFJOCH_RSM_NO_GPU / JFJOCH_RSM_CPU_COMBINE
env gates now that the GPU-resident path is the validated default (it runs whenever a GPU
is present, with the CPU loops as the bit-parity fallback; the diagnostic-dump path still
uses the CPU combine).

Honour a fixed (forced) mosaicity: SmoothMosaicityAndPartiality now overrides every frame
with GetForcedMosaicity() when set, instead of always reading the per-frame integration
value - so the caller can route the --mosaicity case through RotationScaleMerge (its
partiality recompute makes it a natural fit) rather than a separate path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../scale_merge/RotationScaleMerge.cpp        | 45 ++++++++-----------
 .../scale_merge/RotationScaleMerge.h          | 36 +++++++--------
 2 files changed, 34 insertions(+), 47 deletions(-)
diff --git a/image_analysis/scale_merge/RotationScaleMerge.cpp b/image_analysis/scale_merge/RotationScaleMerge.cpp
index fca148f4..f754e408 100644
--- a/image_analysis/scale_merge/RotationScaleMerge.cpp
+++ b/image_analysis/scale_merge/RotationScaleMerge.cpp
@@ -7,7 +7,6 @@
 #include <atomic>
 #include <cmath>
 #include <cstdint>
-#include <cstdlib>
 #include <fstream>
 #include <future>
 #include <limits>
@@ -24,8 +23,8 @@
 #include "../../common/ResolutionShells.h"
 
 namespace {
-    // These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
-    // implementation is numerically identical - see the comments there for the physics.
+    // These mirror the per-image ScaleOnTheFly / Merge rocking-curve physics verbatim so this flat
+    // implementation is numerically identical - see the comments there for the details.
     constexpr size_t MIN_REFLECTIONS = 20;   // per-frame scale needs at least this many
     constexpr double SCALE_ROBUST_K = 3.0;   // Cauchy loss scale (sigma units) for the per-frame G fit
     constexpr float  MAX_FRAME_GAP = 2.0f;   // a rocking event is a run of frames no more apart than this
@@ -253,7 +252,7 @@ void RotationScaleMerge::Ingest() {
     // Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
     // fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine.
     gpu_ = std::make_unique<RotationScaleMergeGPU>();
-    gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
+    gpu_active_ = gpu_->Available();
     if (gpu_active_) {
         const int n = static_cast<int>(partials.size());
         std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
@@ -272,19 +271,23 @@ void RotationScaleMerge::Ingest() {
                          rawrun_start.data(), rawrun_count.data(),
                          rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
         gpu_->SetFrameCellOk(frame_cell_ok.data());
-        gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
-        logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
-                    gpu_combine_ ? " + combine + scale-fulls" : "");
+        logger.Info("RotationScaleMerge: GPU scaling + combine + scale-fulls + merge active");
     }
 #endif
 }
 
 void RotationScaleMerge::SmoothMosaicityAndPartiality() {
-    // Raw per-frame mosaicity as measured at integration (image-local, deterministic).
+    // Per-frame mosaicity to recompute partiality from. A forced (fixed) mosaicity overrides every frame;
+    // otherwise use the per-frame value measured at integration (image-local, deterministic).
+    const auto forced_mosaicity = x.GetScalingSettings().GetForcedMosaicity();
     std::vector<double> mos_raw(n_frames, NAN);
-    for (int o = 0; o < n_frames; ++o) {
-        const auto &m = partials_out[o].mosaicity_deg;
-        if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
+    if (forced_mosaicity.has_value() && std::isfinite(*forced_mosaicity) && *forced_mosaicity > 0.0) {
+        for (int o = 0; o < n_frames; ++o) mos_raw[o] = *forced_mosaicity;
+    } else {
+        for (int o = 0; o < n_frames; ++o) {
+            const auto &m = partials_out[o].mosaicity_deg;
+            if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
+        }
     }
 
     // Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
@@ -1173,19 +1176,11 @@ RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool
 
 RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
                                                    const std::vector<char> &masked_ice_rings) {
-    auto t_last = std::chrono::steady_clock::now();
-    auto lap = [&](const char *what) {
-        const auto now = std::chrono::steady_clock::now();
-        logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
-        t_last = now;
-    };
-
     const int sg_number = x.GetSpaceGroupNumber().value_or(1);
     HKLKeyGenerator keygen(merge_friedel, sg_number);
 
     // --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
     const int n_groups = ComputeAsuGroups(keygen);   // one ASU grouping, shared by partials and fulls
-    lap("group hkl");
     std::vector<double> partial_mean;
     bool scaled_on_gpu = false;
 #ifdef JFJOCH_USE_CUDA
@@ -1206,7 +1201,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
             UpdateCorr(partials, g_partial, frame_scaled_scratch);
         }
     }
-    lap("scale partials");
     const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;
 
     // --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
@@ -1242,9 +1236,9 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
     }
 
 #ifdef JFJOCH_USE_CUDA
-    // The GPU keeps corr resident through scaling + smooth-G; only a CPU combine (JFJOCH_RSM_CPU_COMBINE,
-    // or the diagnostic dump) reads host partials[].corr, so refresh it just for that path.
-    if (gpu_active_ && (!gpu_combine_ || !observation_dump_path.empty())) {
+    // The GPU keeps corr resident through scaling + smooth-G; only the diagnostic dump falls back to the
+    // CPU combine, which reads host partials[].corr, so refresh it just for that path.
+    if (gpu_active_ && !observation_dump_path.empty()) {
         std::vector<float> corr(partials.size());
         gpu_->GetCorr(corr.data());
         for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
@@ -1278,7 +1272,7 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
     // ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
     // stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
     // scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
-    if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
+    if (gpu_active_ && observation_dump_path.empty()) {
         // The smoothed corr is already resident (scaling + smooth-G ran on the device, no round-trip).
         const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
         g_full.assign(n_frames, 1.0);
@@ -1327,7 +1321,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
 #endif
     if (!combined_on_gpu)
         Combine();
-    lap("combine");
 
     // --- 4. Scale the fulls (XDS order, Unity model). ---
     if (scale_fulls && !scaled_fulls_on_gpu) {
@@ -1339,10 +1332,8 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
         }
         logger.Info("Scaled fulls (XDS order, Unity model)");
     }
-    lap("scale fulls");
 
     // --- 5. Error model + merge + statistics. ---
     auto r = MergeAndStats(n_groups, for_search, masked_ice_rings, combined_on_gpu && scaled_fulls_on_gpu);
-    lap("merge+stats");
     return r;
 }
diff --git a/image_analysis/scale_merge/RotationScaleMerge.h b/image_analysis/scale_merge/RotationScaleMerge.h
index 0803c084..f5ab0ae7 100644
--- a/image_analysis/scale_merge/RotationScaleMerge.h
+++ b/image_analysis/scale_merge/RotationScaleMerge.h
@@ -19,21 +19,21 @@
 #include "RotationScaleMergeGPU.h"
 #endif
 
-// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path).
+// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path): recompute the
+// per-frame partiality from the (smoothed) mosaicity, robustly fit a per-image scale G, 3D-combine each
+// rocking event's partials into fulls, refit a per-frame scale on the fulls (XDS order), and merge with
+// a global error model.
 //
-// This is a distinct, faster path from ScaleOnTheFly + MergeAll + CombineRotationObservations +
-// MergeOnTheFly. Those rebuild a std::map keyed by hkl on *every* scaling iteration and every merge
-// (7-14 map rebuilds per space-group pass), which dominates the offline wall clock. Here the per-frame
-// partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is computed once per
-// space group (by a sort, not a map) and reused across all scaling iterations; every hot step is a flat
-// loop over those vectors, so it also maps directly onto CUDA kernels (segmented reduction + per-frame
-// solve). CC1/2 and the per-image CC are computed once at the end, not every iteration.
+// The per-frame partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is
+// computed once per space group (by a sort, not a map) and reused across all scaling iterations; every
+// hot step is a flat loop over those vectors, so the whole pipeline maps onto CUDA kernels (segmented
+// reduction + per-frame solve) and runs GPU-resident when a GPU is present, with the CPU loops as the
+// bit-parity fallback. CC1/2 and the per-image CC are computed once at the end, not every iteration.
 //
-// It reproduces the numerics of the CPU pipeline exactly (same robust IRLS per-frame G, same 3D combine,
-// same XDS-order scale-fulls, same global error model, same merge statistics) - the speed-up is purely
-// from the data layout, not from cutting corners. It is used only for the self-scaling rotation case
-// with per-image G (Rotation partiality, no B refinement, no external reference, no absorption surface);
-// stills, B-factor refinement, reference scaling and the absorption surface stay on the classic path.
+// Used only for the self-scaling rotation case with per-image G (Rotation partiality, a fixed/forced
+// mosaicity is honoured by the recompute). It does NOT support B-factor refinement, external-reference
+// scaling, an absorption surface or wedge refinement - the caller rejects those combinations. Stills use
+// the per-image ScaleOnTheFly (fixed partiality) instead.
 class RotationScaleMerge {
 public:
     struct Result {
@@ -130,15 +130,11 @@ private:
     std::vector<int32_t> group_h, group_k, group_l;
 
 #ifdef JFJOCH_USE_CUDA
-    // GPU engine for the partial-scaling loop (segmented reduce + per-frame IRLS + corr update). Null /
-    // inactive when no GPU; the CPU loops are the fallback. Built in Ingest.
+    // GPU engine: the whole hot path (scaling, combine, scale-fulls, per-frame CC, smooth-G, merge +
+    // error model) runs on the device, resident, when a GPU is present. Null / inactive otherwise, with
+    // the CPU loops as the bit-parity fallback. Built in Ingest.
     std::unique_ptr<RotationScaleMergeGPU> gpu_;
     bool gpu_active_ = false;
-    // Phase-2 GPU combine + scale-fulls (partials->fulls, scaled, kept resident on the device). On by
-    // default when a GPU is present - validated bit-parity + run-to-run deterministic vs the CPU path;
-    // set JFJOCH_RSM_CPU_COMBINE to fall back to the CPU combine/scale-fulls. See
-    // RotationScaleMergeGPU::Combine.
-    bool gpu_combine_ = false;
 #endif
 
     // --- helpers (each a flat pass; see the .cpp) ---