RotationScaleMerge: drop profiling + env gates, honour forced mosaicity

Remove the [rsm] per-stage lap timing and the JFJOCH_RSM_NO_GPU / JFJOCH_RSM_CPU_COMBINE
env gates now that the GPU-resident path is the validated default. It runs whenever a GPU
is present, with the CPU loops as the bit-parity fallback; when a GPU is active, only the
diagnostic-dump path still uses the CPU combine.

Honour a fixed (forced) mosaicity: SmoothMosaicityAndPartiality now overrides every frame
with GetForcedMosaicity() when it is set (finite and positive), instead of always reading
the per-frame integration value. The caller can therefore route the --mosaicity case
through RotationScaleMerge (its partiality recompute makes it a natural fit) rather than
through a separate path.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-03 11:41:31 +02:00
co-authored by Claude Opus 4.8
parent 34b3c3c4e7
commit fccf9b83e7
2 changed files with 34 additions and 47 deletions
@@ -7,7 +7,6 @@
#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <future>
#include <limits>
@@ -24,8 +23,8 @@
#include "../../common/ResolutionShells.h"
namespace {
// These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
// implementation is numerically identical - see the comments there for the physics.
// These mirror the per-image ScaleOnTheFly / Merge rocking-curve physics verbatim so this flat
// implementation is numerically identical - see the comments there for the details.
constexpr size_t MIN_REFLECTIONS = 20; // per-frame scale needs at least this many
constexpr double SCALE_ROBUST_K = 3.0; // Cauchy loss scale (sigma units) for the per-frame G fit
constexpr float MAX_FRAME_GAP = 2.0f; // a rocking event is a run of frames no more apart than this
@@ -253,7 +252,7 @@ void RotationScaleMerge::Ingest() {
// Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
// fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine.
gpu_ = std::make_unique<RotationScaleMergeGPU>();
gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
gpu_active_ = gpu_->Available();
if (gpu_active_) {
const int n = static_cast<int>(partials.size());
std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
@@ -272,19 +271,23 @@ void RotationScaleMerge::Ingest() {
rawrun_start.data(), rawrun_count.data(),
rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
gpu_->SetFrameCellOk(frame_cell_ok.data());
gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
gpu_combine_ ? " + combine + scale-fulls" : "");
logger.Info("RotationScaleMerge: GPU scaling + combine + scale-fulls + merge active");
}
#endif
}
void RotationScaleMerge::SmoothMosaicityAndPartiality() {
// Raw per-frame mosaicity as measured at integration (image-local, deterministic).
// Per-frame mosaicity to recompute partiality from. A forced (fixed) mosaicity overrides every frame;
// otherwise use the per-frame value measured at integration (image-local, deterministic).
const auto forced_mosaicity = x.GetScalingSettings().GetForcedMosaicity();
std::vector<double> mos_raw(n_frames, NAN);
for (int o = 0; o < n_frames; ++o) {
const auto &m = partials_out[o].mosaicity_deg;
if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
if (forced_mosaicity.has_value() && std::isfinite(*forced_mosaicity) && *forced_mosaicity > 0.0) {
for (int o = 0; o < n_frames; ++o) mos_raw[o] = *forced_mosaicity;
} else {
for (int o = 0; o < n_frames; ++o) {
const auto &m = partials_out[o].mosaicity_deg;
if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
}
}
// Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
@@ -1173,19 +1176,11 @@ RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool
RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
const std::vector<char> &masked_ice_rings) {
auto t_last = std::chrono::steady_clock::now();
auto lap = [&](const char *what) {
const auto now = std::chrono::steady_clock::now();
logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
t_last = now;
};
const int sg_number = x.GetSpaceGroupNumber().value_or(1);
HKLKeyGenerator keygen(merge_friedel, sg_number);
// --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
const int n_groups = ComputeAsuGroups(keygen); // one ASU grouping, shared by partials and fulls
lap("group hkl");
std::vector<double> partial_mean;
bool scaled_on_gpu = false;
#ifdef JFJOCH_USE_CUDA
@@ -1206,7 +1201,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
UpdateCorr(partials, g_partial, frame_scaled_scratch);
}
}
lap("scale partials");
const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;
// --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
@@ -1242,9 +1236,9 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
}
#ifdef JFJOCH_USE_CUDA
// The GPU keeps corr resident through scaling + smooth-G; only a CPU combine (JFJOCH_RSM_CPU_COMBINE,
// or the diagnostic dump) reads host partials[].corr, so refresh it just for that path.
if (gpu_active_ && (!gpu_combine_ || !observation_dump_path.empty())) {
// The GPU keeps corr resident through scaling + smooth-G; only the diagnostic dump falls back to the
// CPU combine, which reads host partials[].corr, so refresh it just for that path.
if (gpu_active_ && !observation_dump_path.empty()) {
std::vector<float> corr(partials.size());
gpu_->GetCorr(corr.data());
for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
@@ -1278,7 +1272,7 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
// ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
// stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
// scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
if (gpu_active_ && observation_dump_path.empty()) {
// The smoothed corr is already resident (scaling + smooth-G ran on the device, no round-trip).
const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
g_full.assign(n_frames, 1.0);
@@ -1327,7 +1321,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
#endif
if (!combined_on_gpu)
Combine();
lap("combine");
// --- 4. Scale the fulls (XDS order, Unity model). ---
if (scale_fulls && !scaled_fulls_on_gpu) {
@@ -1339,10 +1332,8 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
}
logger.Info("Scaled fulls (XDS order, Unity model)");
}
lap("scale fulls");
// --- 5. Error model + merge + statistics. ---
auto r = MergeAndStats(n_groups, for_search, masked_ice_rings, combined_on_gpu && scaled_fulls_on_gpu);
lap("merge+stats");
return r;
}
+16 -20
View File
@@ -19,21 +19,21 @@
#include "RotationScaleMergeGPU.h"
#endif
// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path).
// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path): recompute the
// per-frame partiality from the (smoothed) mosaicity, robustly fit a per-image scale G, 3D-combine each
// rocking event's partials into fulls, refit a per-frame scale on the fulls (XDS order), and merge with
// a global error model.
//
// This is a distinct, faster path from ScaleOnTheFly + MergeAll + CombineRotationObservations +
// MergeOnTheFly. Those rebuild a std::map keyed by hkl on *every* scaling iteration and every merge
// (7-14 map rebuilds per space-group pass), which dominates the offline wall clock. Here the per-frame
// partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is computed once per
// space group (by a sort, not a map) and reused across all scaling iterations; every hot step is a flat
// loop over those vectors, so it also maps directly onto CUDA kernels (segmented reduction + per-frame
// solve). CC1/2 and the per-image CC are computed once at the end, not every iteration.
// The per-frame partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is
// computed once per space group (by a sort, not a map) and reused across all scaling iterations; every
// hot step is a flat loop over those vectors, so the whole pipeline maps onto CUDA kernels (segmented
// reduction + per-frame solve) and runs GPU-resident when a GPU is present, with the CPU loops as the
// bit-parity fallback. CC1/2 and the per-image CC are computed once at the end, not every iteration.
//
// It reproduces the numerics of the CPU pipeline exactly (same robust IRLS per-frame G, same 3D combine,
// same XDS-order scale-fulls, same global error model, same merge statistics) - the speed-up is purely
// from the data layout, not from cutting corners. It is used only for the self-scaling rotation case
// with per-image G (Rotation partiality, no B refinement, no external reference, no absorption surface);
// stills, B-factor refinement, reference scaling and the absorption surface stay on the classic path.
// Used only for the self-scaling rotation case with per-image G and Rotation partiality; a fixed/forced
// mosaicity is honoured by the partiality recompute. It does NOT support B-factor refinement,
// external-reference scaling, an absorption surface or wedge refinement - the caller rejects those
// combinations. Stills use the per-image ScaleOnTheFly (fixed partiality) instead.
class RotationScaleMerge {
public:
struct Result {
@@ -130,15 +130,11 @@ private:
std::vector<int32_t> group_h, group_k, group_l;
#ifdef JFJOCH_USE_CUDA
// GPU engine for the partial-scaling loop (segmented reduce + per-frame IRLS + corr update). Null /
// inactive when no GPU; the CPU loops are the fallback. Built in Ingest.
// GPU engine: the whole hot path (scaling, combine, scale-fulls, per-frame CC, smooth-G, merge +
// error model) runs on the device, resident, when a GPU is present. Null / inactive otherwise, with
// the CPU loops as the bit-parity fallback. Built in Ingest.
std::unique_ptr<RotationScaleMergeGPU> gpu_;
bool gpu_active_ = false;
// Phase-2 GPU combine + scale-fulls (partials->fulls, scaled, kept resident on the device). On by
// default when a GPU is present - validated bit-parity + run-to-run deterministic vs the CPU path;
// set JFJOCH_RSM_CPU_COMBINE to fall back to the CPU combine/scale-fulls. See
// RotationScaleMergeGPU::Combine.
bool gpu_combine_ = false;
#endif
// --- helpers (each a flat pass; see the .cpp) ---