RotationScaleMerge: drop profiling + env gates, honour forced mosaicity
Remove the [rsm] per-stage lap timing and the JFJOCH_RSM_NO_GPU / JFJOCH_RSM_CPU_COMBINE env gates now that the GPU-resident path is the validated default (it runs whenever a GPU is present, with the CPU loops as the bit-parity fallback; the diagnostic-dump path still uses the CPU combine). Honour a fixed (forced) mosaicity: SmoothMosaicityAndPartiality now overrides every frame with GetForcedMosaicity() when set, instead of always reading the per-frame integration value - so the caller can route the --mosaicity case through RotationScaleMerge (its partiality recompute makes it a natural fit) rather than a separate path. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,6 @@
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <future>
|
||||
#include <limits>
|
||||
@@ -24,8 +23,8 @@
|
||||
#include "../../common/ResolutionShells.h"
|
||||
|
||||
namespace {
|
||||
// These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
|
||||
// implementation is numerically identical - see the comments there for the physics.
|
||||
// These mirror the per-image ScaleOnTheFly / Merge rocking-curve physics verbatim so this flat
|
||||
// implementation is numerically identical - see the comments there for the details.
|
||||
constexpr size_t MIN_REFLECTIONS = 20; // per-frame scale needs at least this many
|
||||
constexpr double SCALE_ROBUST_K = 3.0; // Cauchy loss scale (sigma units) for the per-frame G fit
|
||||
constexpr float MAX_FRAME_GAP = 2.0f; // a rocking event is a run of frames no more apart than this
|
||||
@@ -253,7 +252,7 @@ void RotationScaleMerge::Ingest() {
|
||||
// Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
|
||||
// fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine.
|
||||
gpu_ = std::make_unique<RotationScaleMergeGPU>();
|
||||
gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
|
||||
gpu_active_ = gpu_->Available();
|
||||
if (gpu_active_) {
|
||||
const int n = static_cast<int>(partials.size());
|
||||
std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
|
||||
@@ -272,19 +271,23 @@ void RotationScaleMerge::Ingest() {
|
||||
rawrun_start.data(), rawrun_count.data(),
|
||||
rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
|
||||
gpu_->SetFrameCellOk(frame_cell_ok.data());
|
||||
gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
|
||||
logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
|
||||
gpu_combine_ ? " + combine + scale-fulls" : "");
|
||||
logger.Info("RotationScaleMerge: GPU scaling + combine + scale-fulls + merge active");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void RotationScaleMerge::SmoothMosaicityAndPartiality() {
|
||||
// Raw per-frame mosaicity as measured at integration (image-local, deterministic).
|
||||
// Per-frame mosaicity to recompute partiality from. A forced (fixed) mosaicity overrides every frame;
|
||||
// otherwise use the per-frame value measured at integration (image-local, deterministic).
|
||||
const auto forced_mosaicity = x.GetScalingSettings().GetForcedMosaicity();
|
||||
std::vector<double> mos_raw(n_frames, NAN);
|
||||
for (int o = 0; o < n_frames; ++o) {
|
||||
const auto &m = partials_out[o].mosaicity_deg;
|
||||
if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
|
||||
if (forced_mosaicity.has_value() && std::isfinite(*forced_mosaicity) && *forced_mosaicity > 0.0) {
|
||||
for (int o = 0; o < n_frames; ++o) mos_raw[o] = *forced_mosaicity;
|
||||
} else {
|
||||
for (int o = 0; o < n_frames; ++o) {
|
||||
const auto &m = partials_out[o].mosaicity_deg;
|
||||
if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
|
||||
}
|
||||
}
|
||||
|
||||
// Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
|
||||
@@ -1173,19 +1176,11 @@ RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool
|
||||
|
||||
RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
const std::vector<char> &masked_ice_rings) {
|
||||
auto t_last = std::chrono::steady_clock::now();
|
||||
auto lap = [&](const char *what) {
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
|
||||
t_last = now;
|
||||
};
|
||||
|
||||
const int sg_number = x.GetSpaceGroupNumber().value_or(1);
|
||||
HKLKeyGenerator keygen(merge_friedel, sg_number);
|
||||
|
||||
// --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
|
||||
const int n_groups = ComputeAsuGroups(keygen); // one ASU grouping, shared by partials and fulls
|
||||
lap("group hkl");
|
||||
std::vector<double> partial_mean;
|
||||
bool scaled_on_gpu = false;
|
||||
#ifdef JFJOCH_USE_CUDA
|
||||
@@ -1206,7 +1201,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
UpdateCorr(partials, g_partial, frame_scaled_scratch);
|
||||
}
|
||||
}
|
||||
lap("scale partials");
|
||||
const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;
|
||||
|
||||
// --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
|
||||
@@ -1242,9 +1236,9 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
}
|
||||
|
||||
#ifdef JFJOCH_USE_CUDA
|
||||
// The GPU keeps corr resident through scaling + smooth-G; only a CPU combine (JFJOCH_RSM_CPU_COMBINE,
|
||||
// or the diagnostic dump) reads host partials[].corr, so refresh it just for that path.
|
||||
if (gpu_active_ && (!gpu_combine_ || !observation_dump_path.empty())) {
|
||||
// The GPU keeps corr resident through scaling + smooth-G; only the diagnostic dump falls back to the
|
||||
// CPU combine, which reads host partials[].corr, so refresh it just for that path.
|
||||
if (gpu_active_ && !observation_dump_path.empty()) {
|
||||
std::vector<float> corr(partials.size());
|
||||
gpu_->GetCorr(corr.data());
|
||||
for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
|
||||
@@ -1278,7 +1272,7 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
// ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
|
||||
// stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
|
||||
// scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
|
||||
if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
|
||||
if (gpu_active_ && observation_dump_path.empty()) {
|
||||
// The smoothed corr is already resident (scaling + smooth-G ran on the device, no round-trip).
|
||||
const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
|
||||
g_full.assign(n_frames, 1.0);
|
||||
@@ -1327,7 +1321,6 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
#endif
|
||||
if (!combined_on_gpu)
|
||||
Combine();
|
||||
lap("combine");
|
||||
|
||||
// --- 4. Scale the fulls (XDS order, Unity model). ---
|
||||
if (scale_fulls && !scaled_fulls_on_gpu) {
|
||||
@@ -1339,10 +1332,8 @@ RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
||||
}
|
||||
logger.Info("Scaled fulls (XDS order, Unity model)");
|
||||
}
|
||||
lap("scale fulls");
|
||||
|
||||
// --- 5. Error model + merge + statistics. ---
|
||||
auto r = MergeAndStats(n_groups, for_search, masked_ice_rings, combined_on_gpu && scaled_fulls_on_gpu);
|
||||
lap("merge+stats");
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -19,21 +19,21 @@
|
||||
#include "RotationScaleMergeGPU.h"
|
||||
#endif
|
||||
|
||||
// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path).
|
||||
// Dedicated, allocate-once scale+combine+merge for rotation data (the -P rot3d path): recompute the
|
||||
// per-frame partiality from the (smoothed) mosaicity, robustly fit a per-image scale G, 3D-combine each
|
||||
// rocking event's partials into fulls, refit a per-frame scale on the fulls (XDS order), and merge with
|
||||
// a global error model.
|
||||
//
|
||||
// This is a distinct, faster path from ScaleOnTheFly + MergeAll + CombineRotationObservations +
|
||||
// MergeOnTheFly. Those rebuild a std::map keyed by hkl on *every* scaling iteration and every merge
|
||||
// (7-14 map rebuilds per space-group pass), which dominates the offline wall clock. Here the per-frame
|
||||
// partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is computed once per
|
||||
// space group (by a sort, not a map) and reused across all scaling iterations; every hot step is a flat
|
||||
// loop over those vectors, so it also maps directly onto CUDA kernels (segmented reduction + per-frame
|
||||
// solve). CC1/2 and the per-image CC are computed once at the end, not every iteration.
|
||||
// The per-frame partial observations are ingested ONCE into flat vectors; the hkl->ASU grouping is
|
||||
// computed once per space group (by a sort, not a map) and reused across all scaling iterations; every
|
||||
// hot step is a flat loop over those vectors, so the whole pipeline maps onto CUDA kernels (segmented
|
||||
// reduction + per-frame solve) and runs GPU-resident when a GPU is present, with the CPU loops as the
|
||||
// bit-parity fallback. CC1/2 and the per-image CC are computed once at the end, not every iteration.
|
||||
//
|
||||
// It reproduces the numerics of the CPU pipeline exactly (same robust IRLS per-frame G, same 3D combine,
|
||||
// same XDS-order scale-fulls, same global error model, same merge statistics) - the speed-up is purely
|
||||
// from the data layout, not from cutting corners. It is used only for the self-scaling rotation case
|
||||
// with per-image G (Rotation partiality, no B refinement, no external reference, no absorption surface);
|
||||
// stills, B-factor refinement, reference scaling and the absorption surface stay on the classic path.
|
||||
// Used only for the self-scaling rotation case with per-image G (Rotation partiality, a fixed/forced
|
||||
// mosaicity is honoured by the recompute). It does NOT support B-factor refinement, external-reference
|
||||
// scaling, an absorption surface or wedge refinement - the caller rejects those combinations. Stills use
|
||||
// the per-image ScaleOnTheFly (fixed partiality) instead.
|
||||
class RotationScaleMerge {
|
||||
public:
|
||||
struct Result {
|
||||
@@ -130,15 +130,11 @@ private:
|
||||
std::vector<int32_t> group_h, group_k, group_l;
|
||||
|
||||
#ifdef JFJOCH_USE_CUDA
|
||||
// GPU engine for the partial-scaling loop (segmented reduce + per-frame IRLS + corr update). Null /
|
||||
// inactive when no GPU; the CPU loops are the fallback. Built in Ingest.
|
||||
// GPU engine: the whole hot path (scaling, combine, scale-fulls, per-frame CC, smooth-G, merge +
|
||||
// error model) runs on the device, resident, when a GPU is present. Null / inactive otherwise, with
|
||||
// the CPU loops as the bit-parity fallback. Built in Ingest.
|
||||
std::unique_ptr<RotationScaleMergeGPU> gpu_;
|
||||
bool gpu_active_ = false;
|
||||
// Phase-2 GPU combine + scale-fulls (partials->fulls, scaled, kept resident on the device). On by
|
||||
// default when a GPU is present - validated bit-parity + run-to-run deterministic vs the CPU path;
|
||||
// set JFJOCH_RSM_CPU_COMBINE to fall back to the CPU combine/scale-fulls. See
|
||||
// RotationScaleMergeGPU::Combine.
|
||||
bool gpu_combine_ = false;
|
||||
#endif
|
||||
|
||||
// --- helpers (each a flat pass; see the .cpp) ---
|
||||
|
||||
Reference in New Issue
Block a user