From fccf9b83e7c11e30dec957f2d4e2ad3cb4d8770b Mon Sep 17 00:00:00 2001 From: Filip Leonarski Date: Fri, 3 Jul 2026 11:41:31 +0200 Subject: [PATCH] RotationScaleMerge: drop profiling + env gates, honour forced mosaicity Remove the [rsm] per-stage lap timing and the JFJOCH_RSM_NO_GPU / JFJOCH_RSM_CPU_COMBINE env gates now that the GPU-resident path is the validated default (it runs whenever a GPU is present, with the CPU loops as the bit-parity fallback; the diagnostic-dump path still uses the CPU combine). Honour a fixed (forced) mosaicity: SmoothMosaicityAndPartiality now overrides every frame with GetForcedMosaicity() when set, instead of always reading the per-frame integration value - so the caller can route the --mosaicity case through RotationScaleMerge (its partiality recompute makes it a natural fit) rather than a separate path. Co-Authored-By: Claude Opus 4.8 --- .../scale_merge/RotationScaleMerge.cpp | 45 ++++++++----------- .../scale_merge/RotationScaleMerge.h | 36 +++++++-------- 2 files changed, 34 insertions(+), 47 deletions(-) diff --git a/image_analysis/scale_merge/RotationScaleMerge.cpp b/image_analysis/scale_merge/RotationScaleMerge.cpp index fca148f4..f754e408 100644 --- a/image_analysis/scale_merge/RotationScaleMerge.cpp +++ b/image_analysis/scale_merge/RotationScaleMerge.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -24,8 +23,8 @@ #include "../../common/ResolutionShells.h" namespace { - // These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat - // implementation is numerically identical - see the comments there for the physics. + // These mirror the per-image ScaleOnTheFly / Merge rocking-curve physics verbatim so this flat + // implementation is numerically identical - see the comments there for the details. constexpr size_t MIN_REFLECTIONS = 20; // per-frame scale needs at least this many constexpr double SCALE_ROBUST_K = 3.0; // Cauchy loss scale (sigma units) for the per-frame G fit constexpr float MAX_FRAME_GAP = 2.0f; // a rocking event is a run of frames no more apart than this @@ -253,7 +252,7 @@ void RotationScaleMerge::Ingest() { // Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs // fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine. gpu_ = std::make_unique(); - gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr); + gpu_active_ = gpu_->Available(); if (gpu_active_) { const int n = static_cast(partials.size()); std::vector I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n); @@ -272,19 +271,23 @@ void RotationScaleMerge::Ingest() { rawrun_start.data(), rawrun_count.data(), rawrun_h.data(), rawrun_k.data(), rawrun_l.data()); gpu_->SetFrameCellOk(frame_cell_ok.data()); - gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr; - logger.Info("RotationScaleMerge: GPU partial-scaling{} active", - gpu_combine_ ? " + combine + scale-fulls" : ""); + logger.Info("RotationScaleMerge: GPU scaling + combine + scale-fulls + merge active"); } #endif } void RotationScaleMerge::SmoothMosaicityAndPartiality() { - // Raw per-frame mosaicity as measured at integration (image-local, deterministic). + // Per-frame mosaicity to recompute partiality from. A forced (fixed) mosaicity overrides every frame; + // otherwise use the per-frame value measured at integration (image-local, deterministic). + const auto forced_mosaicity = x.GetScalingSettings().GetForcedMosaicity(); std::vector mos_raw(n_frames, NAN); - for (int o = 0; o < n_frames; ++o) { - const auto &m = partials_out[o].mosaicity_deg; - if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m; + if (forced_mosaicity.has_value() && std::isfinite(*forced_mosaicity) && *forced_mosaicity > 0.0) { + for (int o = 0; o < n_frames; ++o) mos_raw[o] = *forced_mosaicity; + } else { + for (int o = 0; o < n_frames; ++o) { + const auto &m = partials_out[o].mosaicity_deg; + if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m; + } } // Frame-order moving average with the same window as smooth-G (a rotation range -> frame count). @@ -1173,19 +1176,11 @@ RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, const std::vector &masked_ice_rings) { - auto t_last = std::chrono::steady_clock::now(); - auto lap = [&](const char *what) { - const auto now = std::chrono::steady_clock::now(); - logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration(now - t_last).count()); - t_last = now; - }; - const int sg_number = x.GetSpaceGroupNumber().value_or(1); HKLKeyGenerator keygen(merge_friedel, sg_number); // --- 1. Per-frame partial scaling (Rotation model, per-image G only). --- const int n_groups = ComputeAsuGroups(keygen); // one ASU grouping, shared by partials and fulls - lap("group hkl"); std::vector partial_mean; bool scaled_on_gpu = false; #ifdef JFJOCH_USE_CUDA @@ -1206,7 +1201,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, UpdateCorr(partials, g_partial, frame_scaled_scratch); } } - lap("scale partials"); const std::vector partial_scaled = frame_scaled_scratch; // --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. --- @@ -1242,9 +1236,9 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, } #ifdef JFJOCH_USE_CUDA - // The GPU keeps corr resident through scaling + smooth-G; only a CPU combine (JFJOCH_RSM_CPU_COMBINE, - // or the diagnostic dump) reads host partials[].corr, so refresh it just for that path. - if (gpu_active_ && (!gpu_combine_ || !observation_dump_path.empty())) { + // The GPU keeps corr resident through scaling + smooth-G; only the diagnostic dump falls back to the + // CPU combine, which reads host partials[].corr, so refresh it just for that path. + if (gpu_active_ && !observation_dump_path.empty()) { std::vector corr(partials.size()); gpu_->GetCorr(corr.data()); for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i]; @@ -1278,7 +1272,7 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, // ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU // stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity // scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback. - if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) { + if (gpu_active_ && observation_dump_path.empty()) { // The smoothed corr is already resident (scaling + smooth-G ran on the device, no round-trip). const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff); g_full.assign(n_frames, 1.0); @@ -1327,7 +1321,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, #endif if (!combined_on_gpu) Combine(); - lap("combine"); // --- 4. Scale the fulls (XDS order, Unity model). --- if (scale_fulls && !scaled_fulls_on_gpu) { @@ -1339,10 +1332,8 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search, } logger.Info("Scaled fulls (XDS order, Unity model)"); } - lap("scale fulls"); // --- 5. Error model + merge + statistics. --- auto r = MergeAndStats(n_groups, for_search, masked_ice_rings, combined_on_gpu && scaled_fulls_on_gpu); - lap("merge+stats"); return r; } diff --git a/image_analysis/scale_merge/RotationScaleMerge.h b/image_analysis/scale_merge/RotationScaleMerge.h index 0803c084..f5ab0ae7 100644 --- a/image_analysis/scale_merge/RotationScaleMerge.h +++ b/image_analysis/scale_merge/RotationScaleMerge.h @@ -19,21 +19,21 @@ #include "RotationScaleMergeGPU.h" #endif -// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path). +// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path): recompute the +// per-frame partiality from the (smoothed) mosaicity, robustly fit a per-image scale G, 3D-combine each +// rocking event's partials into fulls, refit a per-frame scale on the fulls (XDS order), and merge with +// a global error model. // -// This is a distinct, faster path from ScaleOnTheFly + MergeAll + CombineRotationObservations + -// MergeOnTheFly. Those rebuild a std::map keyed by hkl on *every* scaling iteration and every merge -// (7-14 map rebuilds per space-group pass), which dominates the offline wall clock. Here the per-frame -// partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is computed once per -// space group (by a sort, not a map) and reused across all scaling iterations; every hot step is a flat -// loop over those vectors, so it also maps directly onto CUDA kernels (segmented reduction + per-frame -// solve). CC1/2 and the per-image CC are computed once at the end, not every iteration. +// The per-frame partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is +// computed once per space group (by a sort, not a map) and reused across all scaling iterations; every +// hot step is a flat loop over those vectors, so the whole pipeline maps onto CUDA kernels (segmented +// reduction + per-frame solve) and runs GPU-resident when a GPU is present, with the CPU loops as the +// bit-parity fallback. CC1/2 and the per-image CC are computed once at the end, not every iteration. // -// It reproduces the numerics of the CPU pipeline exactly (same robust IRLS per-frame G, same 3D combine, -// same XDS-order scale-fulls, same global error model, same merge statistics) - the speed-up is purely -// from the data layout, not from cutting corners. It is used only for the self-scaling rotation case -// with per-image G (Rotation partiality, no B refinement, no external reference, no absorption surface); -// stills, B-factor refinement, reference scaling and the absorption surface stay on the classic path. +// Used only for the self-scaling rotation case with per-image G (Rotation partiality, a fixed/forced +// mosaicity is honoured by the recompute). It does NOT support B-factor refinement, external-reference +// scaling, an absorption surface or wedge refinement - the caller rejects those combinations. Stills use +// the per-image ScaleOnTheFly (fixed partiality) instead. class RotationScaleMerge { public: struct Result { @@ -130,15 +130,11 @@ private: std::vector group_h, group_k, group_l; #ifdef JFJOCH_USE_CUDA - // GPU engine for the partial-scaling loop (segmented reduce + per-frame IRLS + corr update). Null / - // inactive when no GPU; the CPU loops are the fallback. Built in Ingest. + // GPU engine: the whole hot path (scaling, combine, scale-fulls, per-frame CC, smooth-G, merge + + // error model) runs on the device, resident, when a GPU is present. Null / inactive otherwise, with + // the CPU loops as the bit-parity fallback. Built in Ingest. std::unique_ptr gpu_; bool gpu_active_ = false; - // Phase-2 GPU combine + scale-fulls (partials->fulls, scaled, kept resident on the device). On by - // default when a GPU is present - validated bit-parity + run-to-run deterministic vs the CPU path; - // set JFJOCH_RSM_CPU_COMBINE to fall back to the CPU combine/scale-fulls. See - // RotationScaleMergeGPU::Combine. - bool gpu_combine_ = false; #endif // --- helpers (each a flat pass; see the .cpp) ---