Steps 1-2 (GPU 3D-combine + resident scale-fulls) are validated bit-parity and run-to-run deterministic against the CPU path across the rotation battery, and cut the combine+scale-fulls region from ~0.46s to ~0.32s on lyso, so make them the default when a GPU is present (consistent with phase-1 partial scaling already being default-on). JFJOCH_RSM_CPU_COMBINE forces the CPU combine/scale-fulls for A/B or debugging; JFJOCH_RSM_NO_GPU still disables the whole GPU path. The only battery crystal whose reported metrics move is EP_cs_01-24 (CC1/2 2%, unindexable noise) whose upstream integration is itself nondeterministic; its merged intensities/CC/completeness are unchanged, only the ill-conditioned error-model b. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1177 lines
56 KiB
C++
1177 lines
56 KiB
C++
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "RotationScaleMerge.h"
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <fstream>
|
|
#include <future>
|
|
#include <limits>
|
|
#include <random>
|
|
|
|
#include <gemmi/reciproc.hpp>
|
|
#include <gemmi/symmetry.hpp>
|
|
|
|
#include "HKLKey.h"
|
|
#include "../../common/CorrelationCoefficient.h"
|
|
#include "../../common/CrystalLattice.h"
|
|
#include "../../common/Definitions.h"
|
|
#include "../../common/JFJochException.h"
|
|
#include "../../common/ResolutionShells.h"
|
|
|
|
namespace {
|
|
// These mirror the classic path (ScaleOnTheFly / Merge / Combine3D) verbatim so this flat
|
|
// implementation is numerically identical - see the comments there for the physics.
|
|
constexpr size_t MIN_REFLECTIONS = 20; // per-frame scale needs at least this many
|
|
constexpr double SCALE_ROBUST_K = 3.0; // Cauchy loss scale (sigma units) for the per-frame G fit
|
|
constexpr float MAX_FRAME_GAP = 2.0f; // a rocking event is a run of frames no more apart than this
|
|
constexpr double CHI2_1_MEDIAN = 0.454936;
|
|
|
|
double SafeInv(double x, double fallback) {
|
|
if (!std::isfinite(x) || x == 0.0)
|
|
return fallback;
|
|
return 1.0 / x;
|
|
}
|
|
|
|
// Kabsch rotation partiality: the fraction of a reflection recorded in the sampled slice, from the
|
|
// erf of the rocking angle relative to the mosaic width. Identical to ScaleOnTheFly's RotationPartiality
|
|
// (and the predictor's), so recomputing here just swaps in the smoothed mosaicity.
|
|
float RotationPartiality(double delta_phi_deg, double zeta, double mosaicity_deg, double wedge_deg) {
|
|
const double half_wedge = wedge_deg / 2.0;
|
|
const double c1 = zeta / std::sqrt(2.0);
|
|
const double arg_plus = (delta_phi_deg + half_wedge) * c1 / mosaicity_deg;
|
|
const double arg_minus = (delta_phi_deg - half_wedge) * c1 / mosaicity_deg;
|
|
return static_cast<float>((std::erf(arg_plus) - std::erf(arg_minus)) / 2.0);
|
|
}
|
|
|
|
// Deterministic CC1/2 half from the frame's stable index (splitmix64), matching Merge.cpp.
|
|
int HalfForImage(int64_t image_id) {
|
|
uint64_t z = static_cast<uint64_t>(image_id) + 0x9e3779b97f4a7c15ULL;
|
|
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
|
|
z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
|
|
z = z ^ (z >> 31);
|
|
return static_cast<int>(z & 1ULL);
|
|
}
|
|
|
|
struct ScaleObs { double coeff, Iobs, weight; };
|
|
|
|
// Robust per-frame scale (linear in G): the exact objective ScaleOnTheFly::SolveScaleIRLS solves.
|
|
double SolveScaleIRLS(const std::vector<ScaleObs> &obs, double robust_k) {
|
|
auto weighted_scale = [&obs](auto robust_weight) {
|
|
double num = 0.0, den = 0.0;
|
|
for (const auto &o : obs) {
|
|
const double rw = robust_weight(o);
|
|
const double w2 = o.weight * o.weight;
|
|
num += rw * w2 * o.coeff * o.Iobs;
|
|
den += rw * w2 * o.coeff * o.coeff;
|
|
}
|
|
return den > 0.0 ? num / den : NAN;
|
|
};
|
|
|
|
double G = weighted_scale([](const ScaleObs &) { return 1.0; });
|
|
if (!std::isfinite(G))
|
|
return 1.0;
|
|
G = std::max(0.0, G);
|
|
|
|
const double k2 = robust_k * robust_k;
|
|
for (int iter = 0; iter < 30; ++iter) {
|
|
const double G_prev = G;
|
|
const double G_next = weighted_scale([&](const ScaleObs &o) {
|
|
const double res = o.weight * (G * o.coeff - o.Iobs);
|
|
return 1.0 / (1.0 + res * res / k2);
|
|
});
|
|
if (!std::isfinite(G_next))
|
|
break;
|
|
G = std::max(0.0, G_next);
|
|
if (std::abs(G - G_prev) <= 1e-7 * std::max(G, 1.0))
|
|
break;
|
|
}
|
|
return G;
|
|
}
|
|
|
|
// Run fn(i) for i in [0, n) over `nthreads` workers pulling from a shared atomic counter - the same
|
|
// self-load-balancing pattern the rest of the codebase uses (heavy frames don't stall light ones).
|
|
// Work-stealing per-item parallel: one atomic fetch per item. Use ONLY when the per-item work is
|
|
// heavy and uneven (e.g. per-frame fits) - the atomic amortises. For millions of tiny uniform items
|
|
// use ParallelChunks instead; a per-item atomic there is pure contention.
|
|
template <typename Fn>
|
|
void ParallelFor(int n, size_t nthreads, Fn fn) {
|
|
if (n <= 0) return;
|
|
if (nthreads <= 1 || n == 1) {
|
|
for (int i = 0; i < n; ++i) fn(i);
|
|
return;
|
|
}
|
|
const size_t local = std::min(nthreads, static_cast<size_t>(n));
|
|
std::atomic<int> next = 0;
|
|
std::vector<std::future<void>> futures;
|
|
futures.reserve(local);
|
|
for (size_t t = 0; t < local; ++t)
|
|
futures.emplace_back(std::async(std::launch::async, [&] {
|
|
for (int i = next.fetch_add(1); i < n; i = next.fetch_add(1))
|
|
fn(i);
|
|
}));
|
|
for (auto &f : futures) f.get();
|
|
}
|
|
|
|
// Chunked parallel: each worker gets one contiguous [lo, hi) range, no per-item synchronisation.
|
|
// Right for millions of cheap uniform items (the CPU stand-in for a flat CUDA grid-stride kernel).
|
|
template <typename Fn>
|
|
void ParallelChunks(int n, size_t nthreads, Fn fn) {
|
|
if (n <= 0) return;
|
|
const int nt = static_cast<int>(std::max<size_t>(1, std::min(nthreads, static_cast<size_t>(n))));
|
|
if (nt == 1) { fn(0, n); return; }
|
|
const int chunk = (n + nt - 1) / nt;
|
|
std::vector<std::future<void>> futures;
|
|
futures.reserve(nt);
|
|
for (int t = 0; t < nt; ++t) {
|
|
const int lo = t * chunk, hi = std::min(n, lo + chunk);
|
|
if (lo >= hi) break;
|
|
futures.emplace_back(std::async(std::launch::async, [&fn, lo, hi] { fn(lo, hi); }));
|
|
}
|
|
for (auto &f : futures) f.get();
|
|
}
|
|
|
|
double median_of(std::vector<double> &v) {
|
|
std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
|
|
return v[v.size() / 2];
|
|
}
|
|
}
|
|
|
|
RotationScaleMerge::RotationScaleMerge(const DiffractionExperiment &experiment,
|
|
std::vector<IntegrationOutcome> &partial_outcomes,
|
|
std::optional<UnitCell> reference_cell,
|
|
int scaling_iterations, float ice_ring_half_width_q,
|
|
size_t nthreads, Logger &logger,
|
|
std::string observation_dump_path)
|
|
: x(experiment), partials_out(partial_outcomes), reference_cell(std::move(reference_cell)),
|
|
nthreads(nthreads == 0 ? std::thread::hardware_concurrency() : nthreads), logger(logger),
|
|
observation_dump_path(std::move(observation_dump_path)) {
|
|
const auto s = x.GetScalingSettings();
|
|
min_partiality = s.GetMinPartiality();
|
|
d_min_limit = s.GetHighResolutionLimit_A();
|
|
merge_friedel = s.GetMergeFriedel();
|
|
capture_uncertainty_coeff = s.GetCaptureUncertaintyCoeff();
|
|
reject_nsigma = s.GetOutlierRejectNsigma();
|
|
reject_outliers = reject_nsigma > 0.0;
|
|
rfree_fraction = s.GetRfreeFraction();
|
|
scale_fulls = s.GetScaleFulls();
|
|
scaling_iter = std::max(1, scaling_iterations);
|
|
if (const auto forced = s.GetForcedMosaicity(); forced.has_value())
|
|
mosaicity_deg = *forced;
|
|
else
|
|
mosaicity_deg = s.GetDefaultMosaicity();
|
|
ice_half_width_q = ice_ring_half_width_q;
|
|
}
|
|
|
|
void RotationScaleMerge::Ingest() {
|
|
n_frames = static_cast<int>(partials_out.size());
|
|
|
|
size_t total = 0;
|
|
for (const auto &o : partials_out) total += o.reflections.size();
|
|
|
|
partials.clear();
|
|
partials.reserve(total);
|
|
frame_start.assign(n_frames, 0);
|
|
frame_count.assign(n_frames, 0);
|
|
frame_cell_ok.assign(n_frames, 1);
|
|
g_partial.assign(n_frames, 1.0);
|
|
|
|
const float dist_tol = x.GetIndexingSettings().GetUnitCellDistTolerance();
|
|
const float ang_tol = x.GetIndexingSettings().GetUnitCellAngleTolerance_deg();
|
|
|
|
for (int o = 0; o < n_frames; ++o) {
|
|
frame_start[o] = static_cast<int32_t>(partials.size());
|
|
if (reference_cell) {
|
|
const auto cell = partials_out[o].latt.GetUnitCell();
|
|
frame_cell_ok[o] = cell.is_close(*reference_cell, dist_tol, ang_tol) ? 1 : 0;
|
|
}
|
|
for (const auto &r : partials_out[o].reflections) {
|
|
Obs obs{};
|
|
obs.h = r.h; obs.k = r.k; obs.l = r.l;
|
|
obs.I = r.I; obs.sigma = r.sigma; obs.d = r.d; obs.rlp = r.rlp;
|
|
obs.partiality = r.partiality; obs.zeta = r.zeta; obs.delta_phi = r.delta_phi_deg; obs.bkg = r.bkg;
|
|
obs.image_number = r.image_number;
|
|
obs.frame = o;
|
|
obs.on_ice = r.on_ice_ring ? 1 : 0;
|
|
obs.corr = r.image_scale_corr;
|
|
obs.group = -1;
|
|
partials.push_back(obs);
|
|
}
|
|
frame_count[o] = static_cast<int32_t>(partials.size()) - frame_start[o];
|
|
}
|
|
|
|
// Sort ONCE by (raw h,k,l, image_number) and split into raw-hkl runs. This is the one expensive sort;
|
|
// both the 3D combine (event split) and the per-space-group ASU grouping reuse this order.
|
|
perm.resize(partials.size());
|
|
for (int i = 0; i < static_cast<int>(partials.size()); ++i) perm[i] = i;
|
|
std::sort(perm.begin(), perm.end(), [&](int32_t a, int32_t b) {
|
|
const auto &x1 = partials[a];
|
|
const auto &y1 = partials[b];
|
|
if (x1.h != y1.h) return x1.h < y1.h;
|
|
if (x1.k != y1.k) return x1.k < y1.k;
|
|
if (x1.l != y1.l) return x1.l < y1.l;
|
|
return x1.image_number < y1.image_number;
|
|
});
|
|
rawrun_start.clear(); rawrun_count.clear();
|
|
rawrun_h.clear(); rawrun_k.clear(); rawrun_l.clear(); rawrun_d.clear();
|
|
for (int i = 0; i < static_cast<int>(perm.size()); ) {
|
|
const auto &o0 = partials[perm[i]];
|
|
int j = i;
|
|
float d = NAN;
|
|
while (j < static_cast<int>(perm.size())) {
|
|
const auto &o = partials[perm[j]];
|
|
if (o.h != o0.h || o.k != o0.k || o.l != o0.l) break;
|
|
if (!std::isfinite(d) && std::isfinite(o.d) && o.d > 0.0f) d = o.d;
|
|
++j;
|
|
}
|
|
rawrun_start.push_back(i);
|
|
rawrun_count.push_back(j - i);
|
|
rawrun_h.push_back(o0.h); rawrun_k.push_back(o0.k); rawrun_l.push_back(o0.l);
|
|
rawrun_d.push_back(d);
|
|
i = j;
|
|
}
|
|
rawrun_group.assign(rawrun_start.size(), -1);
|
|
logger.Info("RotationScaleMerge: ingested {} partial observations from {} frames ({} distinct hkl)",
|
|
total, n_frames, rawrun_start.size());
|
|
|
|
SmoothMosaicityAndPartiality();
|
|
|
|
#ifdef JFJOCH_USE_CUDA
|
|
// Bring the partial-scaling loop onto the GPU when one is present. Upload the immutable per-obs
|
|
// fields once (corr lives on the device, refreshed each pass); the CPU keeps the sort/keying/combine.
|
|
gpu_ = std::make_unique<RotationScaleMergeGPU>();
|
|
gpu_active_ = gpu_->Available() && (std::getenv("JFJOCH_RSM_NO_GPU") == nullptr);
|
|
if (gpu_active_) {
|
|
const int n = static_cast<int>(partials.size());
|
|
std::vector<float> I(n), sigma(n), rlp(n), part(n), zeta(n), corr(n), bkg(n), img(n), dd(n);
|
|
std::vector<uint8_t> onice(n);
|
|
std::vector<int32_t> frm(n);
|
|
for (int i = 0; i < n; ++i) {
|
|
const auto &o = partials[i];
|
|
I[i] = o.I; sigma[i] = o.sigma; rlp[i] = o.rlp; part[i] = o.partiality;
|
|
zeta[i] = o.zeta; onice[i] = o.on_ice; frm[i] = o.frame; corr[i] = o.corr;
|
|
bkg[i] = o.bkg; img[i] = o.image_number; dd[i] = o.d;
|
|
}
|
|
gpu_->SetPartials(n, n_frames, I.data(), sigma.data(), rlp.data(), part.data(), zeta.data(),
|
|
onice.data(), frm.data(), corr.data(), frame_start.data(), frame_count.data());
|
|
gpu_->SetCombineInputs(bkg.data(), img.data(), dd.data());
|
|
gpu_->SetRawRuns(static_cast<int>(rawrun_start.size()), static_cast<int>(perm.size()), perm.data(),
|
|
rawrun_start.data(), rawrun_count.data(),
|
|
rawrun_h.data(), rawrun_k.data(), rawrun_l.data());
|
|
gpu_combine_ = std::getenv("JFJOCH_RSM_CPU_COMBINE") == nullptr;
|
|
logger.Info("RotationScaleMerge: GPU partial-scaling{} active",
|
|
gpu_combine_ ? " + combine + scale-fulls" : "");
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void RotationScaleMerge::SmoothMosaicityAndPartiality() {
|
|
// Raw per-frame mosaicity as measured at integration (image-local, deterministic).
|
|
std::vector<double> mos_raw(n_frames, NAN);
|
|
for (int o = 0; o < n_frames; ++o) {
|
|
const auto &m = partials_out[o].mosaicity_deg;
|
|
if (m && std::isfinite(*m) && *m > 0.0f) mos_raw[o] = *m;
|
|
}
|
|
|
|
// Frame-order moving average with the same window as smooth-G (a rotation range -> frame count).
|
|
// With smoothing off, fall back to the per-frame value (still deterministic, just unsmoothed).
|
|
const auto ss = x.GetScalingSettings();
|
|
const double smooth_deg = ss.GetSmoothGDegrees();
|
|
const auto gon = x.GetGoniometer();
|
|
const double osc = gon ? std::fabs(gon->GetIncrement_deg()) : 0.0;
|
|
mos_smooth.assign(n_frames, NAN);
|
|
if (smooth_deg > 0.0 && osc > 1e-6) {
|
|
int window = std::max(1, static_cast<int>(std::lround(smooth_deg / osc)));
|
|
if (window % 2 == 0) ++window;
|
|
const int half = window / 2;
|
|
for (int o = 0; o < n_frames; ++o) {
|
|
double sum = 0.0;
|
|
int cnt = 0;
|
|
for (int j = std::max(0, o - half); j <= std::min(n_frames - 1, o + half); ++j)
|
|
if (std::isfinite(mos_raw[j])) { sum += mos_raw[j]; ++cnt; }
|
|
if (cnt > 0) mos_smooth[o] = static_cast<float>(sum / cnt);
|
|
}
|
|
} else {
|
|
for (int o = 0; o < n_frames; ++o) mos_smooth[o] = static_cast<float>(mos_raw[o]);
|
|
}
|
|
|
|
// Recompute each partial's partiality from the smoothed mosaicity (same wedge the predictor used).
|
|
// Frames without a mosaicity keep the stored partiality.
|
|
const double wedge = gon ? std::fabs(gon->GetWedge_deg()) : 0.0;
|
|
ParallelChunks(static_cast<int>(partials.size()), nthreads, [&](int lo, int hi) {
|
|
for (int i = lo; i < hi; ++i) {
|
|
auto &o = partials[i];
|
|
const float mos = mos_smooth[o.frame];
|
|
if (std::isfinite(mos) && mos > 1e-6f && std::isfinite(o.zeta) && o.zeta > 0.0f
|
|
&& std::isfinite(o.delta_phi))
|
|
o.partiality = RotationPartiality(o.delta_phi, o.zeta, mos, wedge);
|
|
}
|
|
});
|
|
logger.Info("Recomputed partiality from frame-order-smoothed mosaicity");
|
|
}
|
|
|
|
int RotationScaleMerge::ComputeAsuGroups(const HKLKeyGenerator &keygen) {
|
|
// One ASU reduction per distinct raw hkl (not per observation): a raw hkl is eligible if it is not
|
|
// systematically absent and its resolution is in range. Group the eligible raw hkls by ASU key
|
|
// (sort the ~#distinct-hkl keys, not the millions of observations), then hand out dense ids.
|
|
const int n_run = static_cast<int>(rawrun_start.size());
|
|
std::vector<uint64_t> key(n_run);
|
|
std::vector<uint8_t> eligible(n_run, 0);
|
|
// The gemmi ASU reduction / absence test per raw hkl is the cost here and is independent per run
|
|
// (HKLKeyGenerator is const, so concurrent reads are safe) - compute keys in parallel chunks.
|
|
ParallelChunks(n_run, nthreads, [&](int lo, int hi) {
|
|
for (int r = lo; r < hi; ++r) {
|
|
rawrun_group[r] = -1;
|
|
if (keygen.IsSystematicallyAbsent(rawrun_h[r], rawrun_k[r], rawrun_l[r]))
|
|
continue;
|
|
const float d = rawrun_d[r]; // resolution is a per-raw-hkl property (all its partials share d)
|
|
if (!std::isfinite(d) || d <= 0.0f) continue;
|
|
if (d_min_limit && d < *d_min_limit) continue;
|
|
key[r] = keygen(rawrun_h[r], rawrun_k[r], rawrun_l[r]).pack();
|
|
eligible[r] = 1;
|
|
}
|
|
});
|
|
std::vector<int32_t> idx;
|
|
idx.reserve(n_run);
|
|
for (int r = 0; r < n_run; ++r)
|
|
if (eligible[r]) idx.push_back(r);
|
|
std::sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) { return key[a] < key[b]; });
|
|
|
|
group_h.clear(); group_k.clear(); group_l.clear();
|
|
int n_groups = 0;
|
|
for (size_t j = 0; j < idx.size(); ++j) {
|
|
if (j == 0 || key[idx[j]] != key[idx[j - 1]]) {
|
|
const int r = idx[j];
|
|
const auto hkl = keygen(rawrun_h[r], rawrun_k[r], rawrun_l[r]);
|
|
group_h.push_back(hkl.plus ? hkl.h : -hkl.h);
|
|
group_k.push_back(hkl.plus ? hkl.k : -hkl.k);
|
|
group_l.push_back(hkl.plus ? hkl.l : -hkl.l);
|
|
++n_groups;
|
|
}
|
|
rawrun_group[idx[j]] = n_groups - 1;
|
|
}
|
|
|
|
// Stamp each partial's group from its raw hkl, keeping the exact per-observation AcceptReflection
|
|
// finiteness (finite I, finite rlp != 0, finite sigma > 0) that grouping by raw hkl alone would miss.
|
|
// Parallel over raw-hkl runs: distinct runs own disjoint perm ranges, hence disjoint observations.
|
|
ParallelChunks(n_run, nthreads, [&](int rlo, int rhi) {
|
|
for (int r = rlo; r < rhi; ++r) {
|
|
const int g = rawrun_group[r];
|
|
const int lo = rawrun_start[r], hi = rawrun_start[r] + rawrun_count[r];
|
|
for (int p = lo; p < hi; ++p) {
|
|
auto &o = partials[perm[p]];
|
|
o.group = (g >= 0 && std::isfinite(o.I) && std::isfinite(o.rlp) && o.rlp != 0.0f
|
|
&& std::isfinite(o.sigma) && o.sigma > 0.0f) ? g : -1;
|
|
}
|
|
}
|
|
});
|
|
|
|
#ifdef JFJOCH_USE_CUDA
|
|
// Group-ordered permutation (obs bucketed by ASU group, obs-index order) + its CSR, so the GPU
|
|
// reduction is a deterministic segmented reduction (fixed order, no atomics). Built per space group.
|
|
if (gpu_active_) {
|
|
const int n = static_cast<int>(partials.size());
|
|
std::vector<int32_t> group_ids(n), gcount(n_groups, 0);
|
|
for (int i = 0; i < n; ++i) {
|
|
group_ids[i] = partials[i].group;
|
|
if (partials[i].group >= 0) ++gcount[partials[i].group];
|
|
}
|
|
std::vector<int32_t> gstart(n_groups, 0);
|
|
int acc = 0;
|
|
for (int g = 0; g < n_groups; ++g) { gstart[g] = acc; acc += gcount[g]; }
|
|
std::vector<int32_t> gperm(acc), gfill = gstart;
|
|
for (int i = 0; i < n; ++i) {
|
|
const int g = partials[i].group;
|
|
if (g >= 0) gperm[gfill[g]++] = i;
|
|
}
|
|
gpu_->SetGroups(n_groups, group_ids.data(), gperm.data(), acc, gstart.data(), gcount.data());
|
|
}
|
|
#endif
|
|
return n_groups;
|
|
}
|
|
|
|
void RotationScaleMerge::ReduceGroupMeans(const std::vector<Obs> &obs, int n_groups,
|
|
bool exclude_ice, const std::vector<char> &masked,
|
|
std::vector<double> &out_mean) const {
|
|
// Inverse-variance per-group mean of I*corr = the merge reference (a segmented reduction over the
|
|
// groups; the CPU stand-in for a CUDA reduce_by_key). No cell mask here: the scaling reference
|
|
// (MergeAll) is built without a reference cell - only the final merge applies it.
|
|
std::vector<double> sw(n_groups, 0.0), swI(n_groups, 0.0);
|
|
for (const auto &o : obs) {
|
|
if (o.group < 0) continue;
|
|
if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) continue;
|
|
if (exclude_ice && o.on_ice) continue;
|
|
if (!masked.empty()) {
|
|
const int ring = IceRingIndex(o.d, ice_half_width_q);
|
|
if (ring >= 0 && ring < static_cast<int>(masked.size()) && masked[ring]) continue;
|
|
}
|
|
if (o.partiality < min_partiality) continue;
|
|
const float I_corr = o.I * o.corr;
|
|
const float sigma_corr = o.sigma * o.corr;
|
|
if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f) continue;
|
|
const double w = 1.0 / (static_cast<double>(sigma_corr) * sigma_corr);
|
|
sw[o.group] += w;
|
|
swI[o.group] += w * I_corr;
|
|
}
|
|
out_mean.assign(n_groups, NAN);
|
|
for (int g = 0; g < n_groups; ++g)
|
|
if (sw[g] > 0.0) out_mean[g] = swI[g] / sw[g];
|
|
}
|
|
|
|
void RotationScaleMerge::FitPerFrameG(std::vector<Obs> &obs, const std::vector<int32_t> &fstart,
|
|
const std::vector<int32_t> &fcount,
|
|
const std::vector<double> &group_mean_in,
|
|
bool unity, std::vector<double> &g) {
|
|
std::vector<uint8_t> scaled(fstart.size(), 0);
|
|
ParallelFor(static_cast<int>(fstart.size()), nthreads, [&](int f) {
|
|
std::vector<ScaleObs> so;
|
|
so.reserve(fcount[f]);
|
|
const int lo = fstart[f], hi = fstart[f] + fcount[f];
|
|
for (int i = lo; i < hi; ++i) {
|
|
const auto &o = obs[i];
|
|
if (o.group < 0) continue;
|
|
if (o.on_ice) continue;
|
|
const double mean = group_mean_in[o.group];
|
|
if (!std::isfinite(mean)) continue;
|
|
double coeff;
|
|
if (unity) {
|
|
coeff = mean; // partiality already folded into the full, rlp = 1
|
|
} else {
|
|
if (!(std::isfinite(o.zeta) && o.zeta > 0.0f)) continue; // Rotation model needs zeta > 0
|
|
coeff = o.partiality * SafeInv(o.rlp, 1.0) * mean;
|
|
}
|
|
so.push_back({coeff, static_cast<double>(o.I), SafeInv(o.sigma, 1.0)});
|
|
}
|
|
if (so.size() < MIN_REFLECTIONS) return; // leave g[f]/corr untouched (as ScaleOnTheFly does)
|
|
g[f] = SolveScaleIRLS(so, SCALE_ROBUST_K);
|
|
scaled[f] = 1;
|
|
});
|
|
// Remember which frames were fitted this call (so the caller updates corr only there).
|
|
frame_scaled_scratch = std::move(scaled);
|
|
}
|
|
|
|
void RotationScaleMerge::UpdateCorr(std::vector<Obs> &obs, const std::vector<double> &g,
|
|
const std::vector<uint8_t> &frame_scaled) const {
|
|
ParallelChunks(static_cast<int>(obs.size()), nthreads, [&](int lo, int hi) {
|
|
for (int i = lo; i < hi; ++i) {
|
|
auto &o = obs[i];
|
|
if (!frame_scaled[o.frame]) continue;
|
|
const double denom = static_cast<double>(o.partiality) * g[o.frame]; // B_term = 1 (no B refine)
|
|
if (std::isfinite(o.rlp) && std::isfinite(denom) && denom > 0.0)
|
|
o.corr = static_cast<float>(o.rlp / denom);
|
|
else
|
|
o.corr = NAN;
|
|
}
|
|
});
|
|
}
|
|
|
|
void RotationScaleMerge::SmoothG(std::vector<Obs> &obs, std::vector<double> &g, int window) const {
|
|
const int n = static_cast<int>(g.size());
|
|
const int half = window / 2;
|
|
std::vector<double> g_smooth(n, NAN);
|
|
for (int o = 0; o < n; ++o) {
|
|
double sum_log = 0.0;
|
|
int count = 0;
|
|
for (int j = std::max(0, o - half); j <= std::min(n - 1, o + half); ++j) {
|
|
if (frame_scaled_scratch[j] && std::isfinite(g[j]) && g[j] > 0.0) {
|
|
sum_log += std::log(g[j]);
|
|
++count;
|
|
}
|
|
}
|
|
if (count > 0) g_smooth[o] = std::exp(sum_log / count);
|
|
}
|
|
for (auto &o : obs) {
|
|
const int f = o.frame;
|
|
if (!frame_scaled_scratch[f] || !std::isfinite(g[f]) || g[f] <= 0.0 || !std::isfinite(g_smooth[f]))
|
|
continue;
|
|
if (std::isfinite(o.corr))
|
|
o.corr = static_cast<float>(o.corr * (g[f] / g_smooth[f]));
|
|
}
|
|
for (int f = 0; f < n; ++f)
|
|
if (frame_scaled_scratch[f] && std::isfinite(g[f]) && g[f] > 0.0 && std::isfinite(g_smooth[f]))
|
|
g[f] = g_smooth[f];
|
|
}
|
|
|
|
void RotationScaleMerge::Combine() {
|
|
fulls.clear();
|
|
g_full.assign(n_frames, 1.0);
|
|
|
|
// Combine one raw-hkl run into fulls, appended to `out`. Independent per run (the events of one hkl
|
|
// touch no shared state), so runs parallelise cleanly. Returns the number of usable partials seen.
|
|
// `dump` (serial path only) writes each emitted full for the diagnostic observation dump.
|
|
auto process_rawrun = [&](int r, std::vector<Obs> &out, std::ofstream *dump) -> size_t {
|
|
const int lo = rawrun_start[r], hi = rawrun_start[r] + rawrun_count[r];
|
|
std::vector<int32_t> ev; // usable perm-indices of this raw hkl, in image-number order
|
|
ev.reserve(hi - lo);
|
|
for (int p = lo; p < hi; ++p) {
|
|
const auto &o = partials[perm[p]];
|
|
if (!std::isfinite(o.corr) || o.corr <= 0.0f) continue;
|
|
if (!std::isfinite(o.I) || !std::isfinite(o.sigma) || o.sigma <= 0.0f) continue;
|
|
ev.push_back(perm[p]);
|
|
}
|
|
const int group = rawrun_group[r];
|
|
|
|
size_t i = 0;
|
|
while (i < ev.size()) {
|
|
size_t kk = i + 1;
|
|
float last_frame = partials[ev[i]].image_number;
|
|
while (kk < ev.size()) {
|
|
const float frame = partials[ev[kk]].image_number;
|
|
if (frame - last_frame > MAX_FRAME_GAP) break;
|
|
last_frame = frame;
|
|
++kk;
|
|
}
|
|
|
|
double pooled_bkg = 0.0;
|
|
int n_pool = 0;
|
|
for (size_t m = i; m < kk; ++m) {
|
|
const float b = partials[ev[m]].bkg;
|
|
if (std::isfinite(b)) { pooled_bkg += b; ++n_pool; }
|
|
}
|
|
pooled_bkg = n_pool > 0 ? pooled_bkg / n_pool : 0.0;
|
|
auto pooled_I = [&](const Obs &r2) {
|
|
const double n_bkg = std::max(0.0, static_cast<double>(r2.sigma) * r2.sigma - r2.I)
|
|
/ std::max(r2.bkg, 1.0f);
|
|
return static_cast<double>(r2.I) + n_bkg * (static_cast<double>(r2.bkg) - pooled_bkg);
|
|
};
|
|
|
|
double sum_w = 0.0, sum_wI = 0.0, sum_partiality = 0.0;
|
|
float d = NAN;
|
|
int peak_outcome = partials[ev[i]].frame;
|
|
float peak_frame = partials[ev[i]].image_number;
|
|
float peak_partiality = -1.0f;
|
|
const bool on_ice = partials[ev[i]].on_ice;
|
|
for (size_t m = i; m < kk; ++m) {
|
|
const auto &r2 = partials[ev[m]];
|
|
const double sigma_corr = static_cast<double>(r2.sigma) * r2.corr;
|
|
const double w = 1.0 / (sigma_corr * sigma_corr);
|
|
sum_w += w;
|
|
sum_wI += w * pooled_I(r2) * r2.corr;
|
|
sum_partiality += r2.partiality;
|
|
if (r2.partiality > peak_partiality) {
|
|
peak_partiality = r2.partiality;
|
|
peak_outcome = r2.frame;
|
|
peak_frame = r2.image_number;
|
|
}
|
|
if (!std::isfinite(d) && std::isfinite(r2.d) && r2.d > 0.0f) d = r2.d;
|
|
}
|
|
double F = sum_wI / sum_w;
|
|
|
|
for (int iter = 0; iter < 3; ++iter) {
|
|
sum_w = 0.0; sum_wI = 0.0;
|
|
for (size_t m = i; m < kk; ++m) {
|
|
const auto &r2 = partials[ev[m]];
|
|
const double corr = r2.corr;
|
|
const double I_corr = pooled_I(r2) * corr;
|
|
const double sigma_corr = static_cast<double>(r2.sigma) * corr;
|
|
const double bkg_var = sigma_corr * sigma_corr - corr * I_corr;
|
|
double var = std::max(0.0, bkg_var) + corr * std::max(0.0, F);
|
|
if (!(var > 0.0)) var = sigma_corr * sigma_corr;
|
|
const double w = 1.0 / var;
|
|
sum_w += w;
|
|
sum_wI += w * I_corr;
|
|
}
|
|
F = sum_wI / sum_w;
|
|
}
|
|
|
|
const int n_frames_event = static_cast<int>(kk - i);
|
|
i = kk;
|
|
|
|
if (sum_w <= 0.0 || sum_partiality < min_partiality)
|
|
continue;
|
|
|
|
double sigma_full = 1.0 / std::sqrt(sum_w);
|
|
if (capture_uncertainty_coeff > 0.0) {
|
|
const double frac = std::min(1.0, sum_partiality);
|
|
const double extra = capture_uncertainty_coeff * (1.0 - frac) * std::max(0.0, F);
|
|
sigma_full = std::sqrt(sigma_full * sigma_full + extra * extra);
|
|
}
|
|
|
|
Obs full{};
|
|
full.h = rawrun_h[r]; full.k = rawrun_k[r]; full.l = rawrun_l[r];
|
|
full.I = static_cast<float>(F);
|
|
full.sigma = static_cast<float>(sigma_full);
|
|
full.d = d;
|
|
full.rlp = 1.0f;
|
|
full.partiality = 1.0f;
|
|
full.corr = 1.0f;
|
|
full.image_number = peak_frame;
|
|
full.frame = peak_outcome;
|
|
full.on_ice = on_ice ? 1 : 0;
|
|
full.group = group; // the raw hkl's ASU group for the current space group (<0 = absent)
|
|
out.push_back(full);
|
|
|
|
if (dump != nullptr)
|
|
*dump << full.h << ' ' << full.k << ' ' << full.l << ' ' << full.I << ' '
|
|
<< full.sigma << ' ' << d << ' ' << n_frames_event << ' ' << sum_partiality << ' '
|
|
<< static_cast<int>(peak_frame) << '\n';
|
|
}
|
|
return ev.size();
|
|
};
|
|
|
|
const int n_run = static_cast<int>(rawrun_start.size());
|
|
size_t n_used = 0;
|
|
|
|
if (!observation_dump_path.empty() || nthreads <= 1) {
|
|
// Serial (diagnostic dump needs a single writer).
|
|
std::ofstream dump;
|
|
if (!observation_dump_path.empty()) {
|
|
dump.open(observation_dump_path);
|
|
dump << "# h k l I sigma d n_frames captured_fraction\n";
|
|
}
|
|
fulls.reserve(n_run);
|
|
for (int r = 0; r < n_run; ++r)
|
|
n_used += process_rawrun(r, fulls, dump.is_open() ? &dump : nullptr);
|
|
} else {
|
|
// Parallel over contiguous rawrun chunks; concatenate the per-thread fulls in run order so the
|
|
// result is deterministic.
|
|
const int nt = static_cast<int>(std::min(nthreads, static_cast<size_t>(n_run)));
|
|
const int chunk = (n_run + nt - 1) / nt;
|
|
std::vector<std::vector<Obs>> part(nt);
|
|
std::vector<size_t> used(nt, 0);
|
|
std::vector<std::future<void>> futures;
|
|
futures.reserve(nt);
|
|
for (int t = 0; t < nt; ++t) {
|
|
const int r0 = t * chunk, r1 = std::min(n_run, r0 + chunk);
|
|
if (r0 >= r1) break;
|
|
futures.emplace_back(std::async(std::launch::async, [&, t, r0, r1] {
|
|
part[t].reserve(r1 - r0);
|
|
for (int r = r0; r < r1; ++r) used[t] += process_rawrun(r, part[t], nullptr);
|
|
}));
|
|
}
|
|
for (auto &f : futures) f.get();
|
|
size_t total = 0;
|
|
for (int t = 0; t < nt; ++t) { total += part[t].size(); n_used += used[t]; }
|
|
fulls.reserve(total);
|
|
for (int t = 0; t < nt; ++t)
|
|
fulls.insert(fulls.end(), part[t].begin(), part[t].end());
|
|
}
|
|
|
|
SortFullsByFrame();
|
|
logger.Info("3D combine: {} fulls from {} partials", fulls.size(), n_used);
|
|
}
|
|
|
|
void RotationScaleMerge::SortFullsByFrame() {
|
|
// Sort the fulls by their (peak) frame and build per-frame CSR ranges, so the scale-fulls step can
|
|
// fit a per-frame G by slicing contiguous ranges (the same layout the partials use).
|
|
std::sort(fulls.begin(), fulls.end(),
|
|
[](const Obs &a, const Obs &b) { return a.frame < b.frame; });
|
|
fulls_frame_start.assign(n_frames, 0);
|
|
fulls_frame_count.assign(n_frames, 0);
|
|
for (int i = 0; i < static_cast<int>(fulls.size()); ) {
|
|
const int f = fulls[i].frame;
|
|
fulls_frame_start[f] = i;
|
|
int j = i;
|
|
while (j < static_cast<int>(fulls.size()) && fulls[j].frame == f) ++j;
|
|
fulls_frame_count[f] = j - i;
|
|
i = j;
|
|
}
|
|
}
|
|
|
|
void RotationScaleMerge::FinalizePerFrameScale(int n_groups, const std::vector<double> &partial_group_mean,
|
|
const std::vector<uint8_t> &frame_scaled) {
|
|
// Per-frame CC vs the merged reference (CalculateGlobalCC), computed once now (not every iteration).
|
|
std::vector<double> cc(n_frames, NAN);
|
|
std::vector<int64_t> cc_n(n_frames, 0);
|
|
ParallelFor(n_frames, nthreads, [&](int f) {
|
|
double sx = 0, sy = 0, sx2 = 0, sy2 = 0, sxy = 0;
|
|
size_t n = 0;
|
|
const int lo = frame_start[f], hi = frame_start[f] + frame_count[f];
|
|
for (int i = lo; i < hi; ++i) {
|
|
const auto &o = partials[i];
|
|
if (o.on_ice) continue;
|
|
if (o.group < 0) continue;
|
|
if (o.partiality < min_partiality) continue;
|
|
if (!std::isfinite(o.I) || !std::isfinite(o.corr) || o.corr <= 0.0f) continue;
|
|
if (!std::isfinite(o.sigma) || o.sigma <= 0.0f) continue;
|
|
const double mean = partial_group_mean[o.group];
|
|
if (!std::isfinite(mean)) continue;
|
|
const double img = static_cast<double>(o.I) * o.corr;
|
|
sx += img; sy += mean; sx2 += img * img; sy2 += mean * mean; sxy += img * mean;
|
|
++n;
|
|
}
|
|
if (n < MIN_REFLECTIONS) return;
|
|
const double nd = static_cast<double>(n);
|
|
const double cov = sxy - sx * sy / nd;
|
|
const double vx = sx2 - sx * sx / nd;
|
|
const double vy = sy2 - sy * sy / nd;
|
|
if (vx > 0.0 && vy > 0.0) { cc[f] = cov / std::sqrt(vx * vy); cc_n[f] = static_cast<int64_t>(n); }
|
|
});
|
|
|
|
for (int f = 0; f < n_frames; ++f) {
|
|
auto &o = partials_out[f];
|
|
if (frame_scaled[f]) {
|
|
o.image_scale_g = static_cast<float>(g_partial[f]);
|
|
o.mosaicity_deg = (f < static_cast<int>(mos_smooth.size()) && std::isfinite(mos_smooth[f]))
|
|
? mos_smooth[f] : static_cast<float>(mosaicity_deg);
|
|
if (std::isfinite(cc[f])) { o.image_scale_cc = static_cast<float>(cc[f]); o.image_scale_cc_n = cc_n[f]; }
|
|
else { o.image_scale_cc.reset(); o.image_scale_cc_n.reset(); }
|
|
} else {
|
|
o.image_scale_g.reset();
|
|
o.image_scale_cc.reset();
|
|
o.image_scale_cc_n.reset();
|
|
o.mosaicity_deg.reset();
|
|
}
|
|
o.image_scale_b_factor_Ang2.reset();
|
|
o.image_scale_wedge_deg.reset();
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
// Possible unique reflections per shell for the completeness column - mirrors CalcPossibleReflections.
|
|
void PossiblePerShell(int space_group_number, const UnitCell &cell, double d_min, double d_max,
|
|
const ResolutionShells &shells, bool merge_friedel, std::vector<int> &possible) {
|
|
gemmi::UnitCell gemmi_cell = cell;
|
|
const gemmi::SpaceGroup *sg = gemmi::find_spacegroup_by_number(space_group_number);
|
|
if (sg == nullptr) return;
|
|
const std::vector<gemmi::Miller> hkls = gemmi::make_miller_vector(gemmi_cell, sg, d_min, d_max, true);
|
|
const gemmi::GroupOps gops = sg->operations();
|
|
CrystalLattice lattice(cell);
|
|
const auto astar = lattice.Astar(), bstar = lattice.Bstar(), cstar = lattice.Cstar();
|
|
for (const auto &hkl : hkls) {
|
|
const auto q = hkl[0] * astar + hkl[1] * bstar + hkl[2] * cstar;
|
|
const auto qlen = q.Length();
|
|
if (qlen < 1e-6) continue;
|
|
const auto shell = shells.GetShell(1.0 / qlen);
|
|
if (!shell.has_value()) continue;
|
|
const int s = *shell;
|
|
if (s >= 0 && s < static_cast<int>(possible.size()))
|
|
possible[s] += (merge_friedel || gops.is_reflection_centric(hkl)) ? 1 : 2;
|
|
}
|
|
}
|
|
}
|
|
|
|
RotationScaleMerge::Result RotationScaleMerge::MergeAndStats(int n_groups, bool for_search,
|
|
const std::vector<char> &masked) {
|
|
// A full is usable for the merge / error model if it passes AddImage's filters (with the current
|
|
// ice/masked-ring context). group >= 0 already encodes "not absent and passes AcceptReflection".
|
|
auto masked_ring = [&](const Obs &o) {
|
|
if (masked.empty()) return false;
|
|
const int ring = IceRingIndex(o.d, ice_half_width_q);
|
|
return ring >= 0 && ring < static_cast<int>(masked.size()) && masked[ring];
|
|
};
|
|
auto usable_merge = [&](const Obs &o) {
|
|
if (o.group < 0) return false;
|
|
if (!frame_cell_ok[o.frame]) return false;
|
|
if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) return false;
|
|
if (for_search && o.on_ice) return false;
|
|
if (masked_ring(o)) return false;
|
|
if (o.partiality < min_partiality) return false;
|
|
const float I_corr = o.I * o.corr, sigma_corr = o.sigma * o.corr;
|
|
return std::isfinite(I_corr) && std::isfinite(sigma_corr) && sigma_corr > 0.0f;
|
|
};
|
|
|
|
// ---- Error model: fit dev2 = a*sigma^2 + b^2*<I>^2 from symmetry-equivalent scatter. ----
|
|
std::vector<double> em_mean(n_groups, NAN);
|
|
std::vector<float> reject_median(n_groups, NAN);
|
|
double error_model_a = 1.0, error_model_b = 0.0, error_model_chi2 = 0.0;
|
|
bool error_model_active = false;
|
|
{
|
|
// Per-group inverse-variance mean over usable fulls (>=2 obs), and the leverage-corrected samples.
|
|
std::vector<double> sw(n_groups, 0.0), swI(n_groups, 0.0);
|
|
std::vector<int> cnt(n_groups, 0);
|
|
for (const auto &o : fulls) {
|
|
if (!usable_merge(o)) continue;
|
|
const double sigma_corr = static_cast<double>(o.sigma) * o.corr;
|
|
const double w = 1.0 / (sigma_corr * sigma_corr);
|
|
sw[o.group] += w; swI[o.group] += w * (static_cast<double>(o.I) * o.corr); cnt[o.group]++;
|
|
}
|
|
for (int g = 0; g < n_groups; ++g)
|
|
if (cnt[g] >= 2 && sw[g] > 0.0) em_mean[g] = swI[g] / sw[g];
|
|
|
|
if (reject_outliers) {
|
|
std::vector<std::vector<float>> iv(n_groups);
|
|
for (const auto &o : fulls)
|
|
if (usable_merge(o) && cnt[o.group] >= 2)
|
|
iv[o.group].push_back(o.I * o.corr);
|
|
for (int g = 0; g < n_groups; ++g)
|
|
if (!iv[g].empty()) {
|
|
std::nth_element(iv[g].begin(), iv[g].begin() + iv[g].size() / 2, iv[g].end());
|
|
reject_median[g] = iv[g][iv[g].size() / 2];
|
|
}
|
|
}
|
|
|
|
struct Sample { double s2, I2, dev2; };
|
|
std::vector<Sample> samples;
|
|
samples.reserve(fulls.size());
|
|
for (const auto &o : fulls) {
|
|
if (!usable_merge(o) || cnt[o.group] < 2) continue;
|
|
const double mean = em_mean[o.group];
|
|
if (!std::isfinite(mean)) continue;
|
|
const double sigma_corr = static_cast<double>(o.sigma) * o.corr;
|
|
const double s2 = sigma_corr * sigma_corr;
|
|
const double w = 1.0 / s2;
|
|
const double factor = 1.0 - w / sw[o.group];
|
|
if (factor < 0.05) continue;
|
|
const double resid = static_cast<double>(o.I) * o.corr - mean;
|
|
samples.push_back({s2, mean * mean, resid * resid / factor});
|
|
}
|
|
|
|
constexpr int n_bins = 16;
|
|
if (samples.size() >= static_cast<size_t>(8 * n_bins)) {
|
|
std::sort(samples.begin(), samples.end(),
|
|
[](const Sample &a, const Sample &b) { return a.I2 < b.I2; });
|
|
std::vector<double> bs2, bI2, bd2;
|
|
const size_t per = samples.size() / n_bins;
|
|
for (int bin = 0; bin < n_bins; ++bin) {
|
|
const size_t lo = bin * per;
|
|
const size_t hi = (bin == n_bins - 1) ? samples.size() : lo + per;
|
|
std::vector<double> vs2, vI2, vd2;
|
|
for (size_t i = lo; i < hi; ++i) {
|
|
vs2.push_back(samples[i].s2); vI2.push_back(samples[i].I2); vd2.push_back(samples[i].dev2);
|
|
}
|
|
bs2.push_back(median_of(vs2));
|
|
bI2.push_back(median_of(vI2));
|
|
bd2.push_back(median_of(vd2) / CHI2_1_MEDIAN);
|
|
}
|
|
std::vector<double> bd2_sorted = bd2;
|
|
const double dev2_floor = std::max(1e-30, 1e-3 * median_of(bd2_sorted));
|
|
double Ass = 0, AsI = 0, AII = 0, Bs = 0, BI = 0;
|
|
for (int bin = 0; bin < n_bins; ++bin) {
|
|
const double s2 = bs2[bin], I2 = bI2[bin], d2 = bd2[bin];
|
|
const double d2w = std::max(d2, dev2_floor);
|
|
const double wgt = 1.0 / (d2w * d2w);
|
|
Ass += wgt * s2 * s2; AsI += wgt * s2 * I2; AII += wgt * I2 * I2;
|
|
Bs += wgt * s2 * d2; BI += wgt * I2 * d2;
|
|
}
|
|
const double det = Ass * AII - AsI * AsI;
|
|
if (det > 1e-10 * Ass * AII) {
|
|
error_model_a = std::clamp((Bs * AII - BI * AsI) / det, 0.25, 100.0);
|
|
const double b2 = std::max((Ass * BI - AsI * Bs) / det, 0.0);
|
|
error_model_b = std::sqrt(b2);
|
|
error_model_active = true;
|
|
std::vector<double> chi2;
|
|
chi2.reserve(samples.size());
|
|
for (const auto &s : samples) {
|
|
const double v = error_model_a * s.s2 + b2 * s.I2;
|
|
if (v > 0.0) chi2.push_back(s.dev2 / v);
|
|
}
|
|
error_model_chi2 = chi2.empty() ? 0.0 : median_of(chi2) / CHI2_1_MEDIAN;
|
|
}
|
|
}
|
|
}
|
|
if (error_model_active)
|
|
logger.Info("Error model: a={:.3f} b={:.3f} ISa={:.1f} chi2={:.2f}", error_model_a, error_model_b,
|
|
error_model_b > 0 ? 1.0 / error_model_b : 0.0, error_model_chi2);
|
|
|
|
auto corrected_sigma = [&](float I_corr, float sigma_corr, int g) -> float {
|
|
if (!error_model_active) return sigma_corr;
|
|
const double I_for_b = std::isfinite(em_mean[g]) ? em_mean[g] : I_corr;
|
|
const double v = error_model_a * static_cast<double>(sigma_corr) * sigma_corr
|
|
+ (error_model_b * I_for_b) * (error_model_b * I_for_b);
|
|
return v > 0.0 ? static_cast<float>(std::sqrt(v)) : sigma_corr;
|
|
};
|
|
|
|
// ---- Merge: per-group inverse-variance sums with corrected sigma + deterministic half sets. ----
|
|
struct Accum { double swI = 0, sw = 0, swIh[2] = {0, 0}, swh[2] = {0, 0}; size_t nh[2] = {0, 0}; float d = NAN; };
|
|
std::vector<Accum> acc(n_groups);
|
|
size_t reject_count = 0;
|
|
for (const auto &o : fulls) {
|
|
if (!usable_merge(o)) continue;
|
|
const float I_corr = o.I * o.corr;
|
|
float sigma_corr = o.sigma * o.corr;
|
|
sigma_corr = corrected_sigma(I_corr, sigma_corr, o.group);
|
|
if (reject_outliers && error_model_active && std::isfinite(reject_median[o.group])
|
|
&& std::fabs(I_corr - reject_median[o.group]) > reject_nsigma * sigma_corr) {
|
|
++reject_count;
|
|
continue;
|
|
}
|
|
const double w = 1.0 / (static_cast<double>(sigma_corr) * sigma_corr);
|
|
const double wI = w * I_corr;
|
|
const int half = HalfForImage(o.frame);
|
|
auto &a = acc[o.group];
|
|
a.swI += wI; a.sw += w;
|
|
a.swIh[half] += wI; a.swh[half] += w; a.nh[half]++;
|
|
if (!std::isfinite(a.d) && std::isfinite(o.d) && o.d > 0.0f) a.d = o.d;
|
|
}
|
|
|
|
// ---- Export merged reflections (+ resolution-shell R-free flags). ----
|
|
Result result;
|
|
result.isa = error_model_b > 0 ? 1.0 / error_model_b : 0.0;
|
|
std::vector<double> merged_I(n_groups, NAN);
|
|
float d_min = std::numeric_limits<float>::max(), d_max = 0.0f;
|
|
for (int g = 0; g < n_groups; ++g) {
|
|
const auto &a = acc[g];
|
|
if (a.sw <= 0.0) continue;
|
|
MergedReflection mr{};
|
|
mr.h = group_h[g]; mr.k = group_k[g]; mr.l = group_l[g];
|
|
mr.I = static_cast<float>(a.swI / a.sw);
|
|
mr.sigma = 1.0f / std::sqrt(static_cast<float>(a.sw));
|
|
mr.I_half[0] = mr.I_half[1] = NAN;
|
|
mr.sigma_half[0] = mr.sigma_half[1] = NAN;
|
|
mr.d = a.d;
|
|
if (a.nh[0] + a.nh[1] > 0 && a.swh[0] > 0.0 && a.swh[1] > 0.0) {
|
|
for (int i = 0; i < 2; ++i) {
|
|
mr.I_half[i] = static_cast<float>(a.swIh[i] / a.swh[i]);
|
|
mr.sigma_half[i] = 1.0f / std::sqrt(static_cast<float>(a.swh[i]));
|
|
}
|
|
}
|
|
if (!std::isfinite(a.d) || a.d <= 0.0f) continue;
|
|
d_min = std::min(d_min, a.d);
|
|
d_max = std::max(d_max, a.d);
|
|
merged_I[g] = mr.I;
|
|
result.merged.push_back(mr);
|
|
}
|
|
|
|
if (rfree_fraction > 0.0 && !result.merged.empty() && d_min < d_max && d_min > 0.0f) {
|
|
constexpr int n_shells = 20;
|
|
ResolutionShells shells(d_min * 0.999f, d_max * 1.001f, n_shells);
|
|
std::vector<std::vector<size_t>> shell_groups(n_shells);
|
|
for (size_t i = 0; i < result.merged.size(); ++i) {
|
|
const auto shell = shells.GetShell(result.merged[i].d);
|
|
if (shell && *shell >= 0 && *shell < n_shells) shell_groups[*shell].push_back(i);
|
|
}
|
|
std::mt19937 rng(12345u);
|
|
std::bernoulli_distribution dist(rfree_fraction);
|
|
for (const auto &grp : shell_groups)
|
|
for (const size_t i : grp) result.merged[i].rfree_flag = dist(rng);
|
|
}
|
|
|
|
if (reject_count > 0)
|
|
logger.Info("Merge outlier rejection: dropped {} observations", reject_count);
|
|
|
|
// ---- Statistics (10 shells): completeness, multiplicity, <I/sigma>, R_meas, CC1/2. ----
|
|
constexpr int n_shells = 10;
|
|
float sd_min = std::numeric_limits<float>::max(), sd_max = 0.0f;
|
|
for (const auto &m : result.merged) {
|
|
if (!std::isfinite(m.d) || m.d <= 0.0f) continue;
|
|
if (d_min_limit && m.d < *d_min_limit) continue;
|
|
sd_min = std::min(sd_min, m.d); sd_max = std::max(sd_max, m.d);
|
|
}
|
|
if (!(sd_min < sd_max && sd_min > 0.0f))
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
|
|
"RotationScaleMerge: resolution calculation failed");
|
|
const float d_min_pad = sd_min * 0.999f, d_max_pad = sd_max * 1.001f;
|
|
ResolutionShells shells(d_min_pad, d_max_pad, n_shells);
|
|
const auto shell_mean_1_d2 = shells.GetShellMeanOneOverResSq();
|
|
const auto shell_min_res = shells.GetShellMinRes();
|
|
|
|
struct ShellAcc {
|
|
int unique = 0, total_obs = 0, possible = 0;
|
|
double sum_i_over_sigma = 0.0; int n_i_over_sigma = 0;
|
|
CorrelationCoefficient cc_half;
|
|
};
|
|
std::vector<ShellAcc> sa(n_shells);
|
|
std::vector<int> possible(n_shells, 0);
|
|
if (reference_cell)
|
|
PossiblePerShell(x.GetSpaceGroupNumber().value_or(1), *reference_cell, d_min_pad, d_max_pad,
|
|
shells, merge_friedel, possible);
|
|
for (int s = 0; s < n_shells; ++s) sa[s].possible = possible[s];
|
|
|
|
CorrelationCoefficient cc_half_overall;
|
|
for (const auto &m : result.merged) {
|
|
const auto shell = shells.GetShell(m.d);
|
|
if (!shell || *shell < 0 || *shell >= n_shells) continue;
|
|
if (std::isfinite(m.I) && std::isfinite(m.sigma) && m.sigma > 0.0) {
|
|
auto &s = sa[*shell];
|
|
s.unique++;
|
|
s.sum_i_over_sigma += m.I / m.sigma; s.n_i_over_sigma++;
|
|
if (std::isfinite(m.I_half[0]) && std::isfinite(m.I_half[1])) {
|
|
s.cc_half.Add(m.I_half[0], m.I_half[1]);
|
|
cc_half_overall.Add(m.I_half[0], m.I_half[1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// R_meas: re-walk the fulls (Mask = cell only; no ice / masked-ring / error-model), accumulate
|
|
// |I_i - <I>| per reflection.
|
|
struct RmeasObs { double sum_abs_dev = 0, sum_I = 0; int n = 0, shell = -1; };
|
|
std::vector<RmeasObs> rmeas(n_groups);
|
|
for (const auto &o : fulls) {
|
|
if (o.group < 0) continue;
|
|
if (!frame_cell_ok[o.frame]) continue;
|
|
if (!(o.corr > 0.0f) || !std::isfinite(o.corr)) continue;
|
|
if (o.partiality < min_partiality) continue;
|
|
const float I_corr = o.I * o.corr, sigma_corr = o.sigma * o.corr;
|
|
if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f) continue;
|
|
const auto shell = shells.GetShell(o.d);
|
|
if (!shell || *shell < 0 || *shell >= n_shells) continue;
|
|
sa[*shell].total_obs++;
|
|
if (std::isfinite(merged_I[o.group])) {
|
|
auto &r = rmeas[o.group];
|
|
r.sum_abs_dev += std::fabs(static_cast<double>(I_corr) - merged_I[o.group]);
|
|
r.sum_I += I_corr; r.n++; r.shell = *shell;
|
|
}
|
|
}
|
|
std::vector<double> rmeas_num(n_shells, 0.0), rmeas_den(n_shells, 0.0);
|
|
double rmeas_num_all = 0.0, rmeas_den_all = 0.0;
|
|
for (const auto &r : rmeas) {
|
|
if (r.n < 2 || r.shell < 0 || r.shell >= n_shells) continue;
|
|
const double factor = std::sqrt(static_cast<double>(r.n) / (r.n - 1));
|
|
rmeas_num[r.shell] += factor * r.sum_abs_dev; rmeas_den[r.shell] += r.sum_I;
|
|
rmeas_num_all += factor * r.sum_abs_dev; rmeas_den_all += r.sum_I;
|
|
}
|
|
|
|
MergeStatistics &out = result.statistics;
|
|
out.shells.resize(n_shells);
|
|
for (int s = 0; s < n_shells; ++s) {
|
|
auto &ss = out.shells[s];
|
|
ss.mean_one_over_d2 = shell_mean_1_d2[s];
|
|
ss.d_min = shell_min_res[s];
|
|
ss.d_max = s == 0 ? d_max_pad : shell_min_res[s - 1];
|
|
ss.total_observations = sa[s].total_obs;
|
|
ss.unique_reflections = sa[s].unique;
|
|
ss.possible_unique_reflections = sa[s].possible;
|
|
ss.mean_i_over_sigma = sa[s].n_i_over_sigma > 0 ? sa[s].sum_i_over_sigma / sa[s].n_i_over_sigma : 0.0;
|
|
ss.cc_half = sa[s].cc_half.GetCC();
|
|
ss.cc_ref = NAN;
|
|
ss.r_meas = rmeas_den[s] > 0.0 ? rmeas_num[s] / rmeas_den[s] : NAN;
|
|
}
|
|
auto &overall = out.overall;
|
|
overall.d_min = sd_min; overall.d_max = sd_max;
|
|
double sum_ios = 0.0; int n_ios = 0;
|
|
for (int s = 0; s < n_shells; ++s) {
|
|
overall.total_observations += sa[s].total_obs;
|
|
overall.unique_reflections += sa[s].unique;
|
|
overall.possible_unique_reflections += sa[s].possible;
|
|
sum_ios += sa[s].sum_i_over_sigma; n_ios += sa[s].n_i_over_sigma;
|
|
}
|
|
overall.mean_i_over_sigma = n_ios > 0 ? sum_ios / n_ios : 0.0;
|
|
overall.cc_half = cc_half_overall.GetCC();
|
|
overall.cc_ref = NAN;
|
|
overall.r_meas = rmeas_den_all > 0.0 ? rmeas_num_all / rmeas_den_all : NAN;
|
|
|
|
logger.Info("Merge complete ({} unique reflections)", result.merged.size());
|
|
return result;
|
|
}
|
|
|
|
RotationScaleMerge::Result RotationScaleMerge::Run(bool for_search,
|
|
const std::vector<char> &masked_ice_rings) {
|
|
auto t_last = std::chrono::steady_clock::now();
|
|
auto lap = [&](const char *what) {
|
|
const auto now = std::chrono::steady_clock::now();
|
|
logger.Info("[rsm] {}: {:.2f} s", what, std::chrono::duration<double>(now - t_last).count());
|
|
t_last = now;
|
|
};
|
|
|
|
const int sg_number = x.GetSpaceGroupNumber().value_or(1);
|
|
HKLKeyGenerator keygen(merge_friedel, sg_number);
|
|
|
|
// --- 1. Per-frame partial scaling (Rotation model, per-image G only). ---
|
|
const int n_groups = ComputeAsuGroups(keygen); // one ASU grouping, shared by partials and fulls
|
|
lap("group hkl");
|
|
std::vector<double> partial_mean;
|
|
bool scaled_on_gpu = false;
|
|
#ifdef JFJOCH_USE_CUDA
|
|
if (gpu_active_) {
|
|
// Refresh corr on the device (smooth-G mutated it on the host between passes), run the whole
|
|
// scaling loop on the GPU, then read corr + per-frame G/scaled back.
|
|
std::vector<float> corr(partials.size());
|
|
for (size_t i = 0; i < partials.size(); ++i) corr[i] = partials[i].corr;
|
|
gpu_->SetCorr(corr.data());
|
|
gpu_->ScalePartials(scaling_iter, SCALE_ROBUST_K, min_partiality, d_min_limit.has_value());
|
|
gpu_->GetCorr(corr.data());
|
|
frame_scaled_scratch.assign(n_frames, 0);
|
|
gpu_->GetG(g_partial.data(), frame_scaled_scratch.data());
|
|
for (size_t i = 0; i < partials.size(); ++i) partials[i].corr = corr[i];
|
|
scaled_on_gpu = true;
|
|
}
|
|
#endif
|
|
if (!scaled_on_gpu) {
|
|
for (int it = 0; it < scaling_iter; ++it) {
|
|
ReduceGroupMeans(partials, n_groups, false, {}, partial_mean);
|
|
FitPerFrameG(partials, frame_start, frame_count, partial_mean, /*unity=*/false, g_partial);
|
|
UpdateCorr(partials, g_partial, frame_scaled_scratch);
|
|
}
|
|
}
|
|
lap("scale partials");
|
|
const std::vector<uint8_t> partial_scaled = frame_scaled_scratch;
|
|
|
|
// --- 2. Smooth G across frames (XDS DELPHI-like) before the combine. ---
|
|
const auto s = x.GetScalingSettings();
|
|
const double smooth_g_deg = s.GetSmoothGDegrees();
|
|
const auto gonio = x.GetGoniometer();
|
|
const double osc_deg = gonio ? std::fabs(gonio->GetIncrement_deg()) : 0.0;
|
|
if (smooth_g_deg > 0.0 && osc_deg > 1e-6) {
|
|
int window = std::max(1, static_cast<int>(std::lround(smooth_g_deg / osc_deg)));
|
|
if (window % 2 == 0) ++window;
|
|
SmoothG(partials, g_partial, window);
|
|
}
|
|
|
|
// Per-frame CC + write G/CC/mosaicity back onto the partials (once).
|
|
ReduceGroupMeans(partials, n_groups, false, {}, partial_mean);
|
|
FinalizePerFrameScale(n_groups, partial_mean, partial_scaled);
|
|
|
|
// --- 3. 3D combine of per-frame partials into fulls (fulls inherit their ASU group here). ---
|
|
bool combined_on_gpu = false;
|
|
bool scaled_fulls_on_gpu = false;
|
|
#ifdef JFJOCH_USE_CUDA
|
|
// GPU combine (+ scale-fulls) keeps the fulls resident on the device: combine, then build the frame /
|
|
// ASU-group CSRs on the host from just the small key arrays (a deterministic counting sort - no GPU
|
|
// stable-sort), scale the fulls in place, and download only once. Mirrors Combine() + the Unity
|
|
// scale-fulls loop below. The diagnostic dump (serial, one writer) has no GPU path -> CPU fallback.
|
|
if (gpu_active_ && gpu_combine_ && observation_dump_path.empty()) {
|
|
std::vector<float> corr(partials.size()); // refresh the smoothed corr on the device
|
|
for (size_t i = 0; i < partials.size(); ++i) corr[i] = partials[i].corr;
|
|
gpu_->SetCorr(corr.data());
|
|
const int nf = gpu_->Combine(rawrun_group.data(), min_partiality, capture_uncertainty_coeff);
|
|
g_full.assign(n_frames, 1.0);
|
|
|
|
if (scale_fulls && nf > 0) {
|
|
// Frame + group CSRs over the emit-ordered fulls, built by counting sort on the host (stable,
|
|
// deterministic). frame is always in [0, n_frames); group is <0 for absent/out-of-range fulls.
|
|
std::vector<int32_t> ff(nf), fg(nf);
|
|
gpu_->GetFullsKeys(ff.data(), fg.data());
|
|
std::vector<int32_t> f_start(n_frames, 0), f_count(n_frames, 0), f_perm(nf);
|
|
for (int i = 0; i < nf; ++i) ++f_count[ff[i]];
|
|
for (int f = 1; f < n_frames; ++f) f_start[f] = f_start[f - 1] + f_count[f - 1];
|
|
{ std::vector<int32_t> fill = f_start; for (int i = 0; i < nf; ++i) f_perm[fill[ff[i]]++] = i; }
|
|
gpu_->SetFullsFrameCSR(f_perm.data(), nf, f_start.data(), f_count.data());
|
|
|
|
std::vector<int32_t> g_count(n_groups, 0), g_start(n_groups, 0);
|
|
for (int i = 0; i < nf; ++i) if (fg[i] >= 0) ++g_count[fg[i]];
|
|
int acc = 0;
|
|
for (int g = 0; g < n_groups; ++g) { g_start[g] = acc; acc += g_count[g]; }
|
|
std::vector<int32_t> g_perm(acc);
|
|
{ std::vector<int32_t> fill = g_start; for (int i = 0; i < nf; ++i) if (fg[i] >= 0) g_perm[fill[fg[i]]++] = i; }
|
|
gpu_->SetFullsGroups(g_perm.data(), acc, g_start.data(), g_count.data());
|
|
|
|
gpu_->ScaleFulls(scaling_iter, SCALE_ROBUST_K, min_partiality);
|
|
scaled_fulls_on_gpu = true;
|
|
}
|
|
|
|
fulls.assign(nf, Obs{});
|
|
std::vector<int32_t> fh(nf), fk(nf), fl(nf), fframe(nf), fgroup(nf);
|
|
std::vector<float> fI(nf), fsig(nf), fd(nf), fimg(nf), fcorr(nf, 1.0f);
|
|
std::vector<uint8_t> fon(nf);
|
|
gpu_->GetFulls(fh.data(), fk.data(), fl.data(), fI.data(), fsig.data(), fd.data(),
|
|
fimg.data(), fframe.data(), fon.data(), fgroup.data());
|
|
if (scaled_fulls_on_gpu) gpu_->GetFullsCorr(fcorr.data());
|
|
for (int i = 0; i < nf; ++i) {
|
|
Obs &o = fulls[i];
|
|
o.h = fh[i]; o.k = fk[i]; o.l = fl[i];
|
|
o.I = fI[i]; o.sigma = fsig[i]; o.d = fd[i];
|
|
o.rlp = 1.0f; o.partiality = 1.0f; o.corr = fcorr[i];
|
|
o.image_number = fimg[i]; o.frame = fframe[i];
|
|
o.on_ice = fon[i]; o.group = fgroup[i];
|
|
}
|
|
logger.Info("3D combine{} (GPU): {} fulls", scaled_fulls_on_gpu ? " + scale-fulls" : "", nf);
|
|
combined_on_gpu = true;
|
|
}
|
|
#endif
|
|
if (!combined_on_gpu)
|
|
Combine();
|
|
lap("combine");
|
|
|
|
// --- 4. Scale the fulls (XDS order, Unity model). ---
|
|
if (scale_fulls && !scaled_fulls_on_gpu) {
|
|
std::vector<double> full_mean;
|
|
for (int it = 0; it < scaling_iter; ++it) {
|
|
ReduceGroupMeans(fulls, n_groups, false, {}, full_mean);
|
|
FitPerFrameG(fulls, fulls_frame_start, fulls_frame_count, full_mean, /*unity=*/true, g_full);
|
|
UpdateCorr(fulls, g_full, frame_scaled_scratch);
|
|
}
|
|
logger.Info("Scaled fulls (XDS order, Unity model)");
|
|
}
|
|
lap("scale fulls");
|
|
|
|
// --- 5. Error model + merge + statistics. ---
|
|
auto r = MergeAndStats(n_groups, for_search, masked_ice_rings);
|
|
lap("merge+stats");
|
|
return r;
|
|
}
|