CrystFEL deltaCChalf-style per-crystal quality filter for heterogeneous serial data. Each image is assigned to one CC1/2 half, so removing it perturbs only that half's per-reflection means; deltaCChalf_i = CC1/2(all) - CC1/2(without image i). A negative value means dropping the image RAISES CC1/2 (it disagrees with the consensus). Images whose deltaCChalf is a low-side statistical outlier (< mean - N*stddev) are skipped when merging. Reference-free. Two passes over the retained integration outcomes; per-image contributions are re-derived rather than stored, so memory stays O(unique reflections + images) for full 200k-frame runs. New CLI flag --reject-delta-cchalf <N> (default: off). Validation (jet FFBIDX +C+S, sigma4): removing 17/4000 (3 sigma) raises CC1/2 95.1->96.1%, CCref 54.9->55.2; 2 sigma -> 96.1/55.3. Dataset-appropriate: it HELPS heterogeneous serial data (some crystals genuinely bad) but slightly trims a homogeneous single rotation crystal (c2 94.6->93.8 - no bad crystals, the relative cut still removes the tail), so it is opt-in. R-free is the real test (user's full 200k). Note: the reported overall N_obs still counts all observations; the exported merge (and CC1/2) correctly exclude the rejected images. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
709 lines
28 KiB
C++
709 lines
28 KiB
C++
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "Merge.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <limits>
|
|
#include <random>
|
|
#include <unordered_map>
|
|
|
|
#include <spdlog/fmt/fmt.h>
|
|
|
|
#include <gemmi/reciproc.hpp>
|
|
|
|
#include "../../common/CorrelationCoefficient.h"
|
|
#include "../../common/ResolutionShells.h"
|
|
#include "HKLKey.h"
|
|
|
|
MergeOnTheFly::MergeOnTheFly(const DiffractionExperiment &x)
|
|
: space_group_number(x.GetSpaceGroupNumber().value_or(1)),
|
|
scaling_settings(x.GetScalingSettings()),
|
|
indexing_settings(x.GetIndexingSettings()),
|
|
high_resolution_limit(scaling_settings.GetHighResolutionLimit_A()),
|
|
// A min-image-CC of 0 (the default) means "no limit": leave the optional
|
|
// empty so the per-image CC cut is inactive. Otherwise a 0.0 threshold
|
|
// would silently drop every image with a non-positive per-image CC (which
|
|
// also wrongly zeroed N_obs in MergeStats, since it masks with cc_mask=true
|
|
// while the merge keeps all images).
|
|
image_cc_limit(scaling_settings.GetMinCCForImage() > 0.0
|
|
? std::optional<double>(scaling_settings.GetMinCCForImage())
|
|
: std::nullopt),
|
|
min_partiality(scaling_settings.GetMinPartiality()),
|
|
generator(scaling_settings.GetMergeFriedel(), space_group_number),
|
|
reject_outliers(scaling_settings.GetOutlierRejectNsigma() > 0.0),
|
|
reject_nsigma(scaling_settings.GetOutlierRejectNsigma()) {
|
|
}
|
|
|
|
MergeOnTheFly &MergeOnTheFly::ReferenceCell(const std::optional<UnitCell> &cell) {
|
|
reference_cell = cell;
|
|
return *this;
|
|
}
|
|
|
|
void MergeOnTheFly::AddImage(const IntegrationOutcome &outcome, bool cc_mask) {
|
|
std::unique_lock ul(merged_mutex);
|
|
const int half = half_dist(rng);
|
|
|
|
if (Mask(outcome, cc_mask))
|
|
return;
|
|
|
|
for (const auto &r: outcome.reflections) {
|
|
if (generator.IsSystematicallyAbsent(r))
|
|
continue;
|
|
|
|
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr))
|
|
continue;
|
|
if (!AcceptReflection(r, high_resolution_limit))
|
|
continue;
|
|
if (r.partiality < min_partiality)
|
|
continue;
|
|
|
|
const float I_corr = r.I * r.image_scale_corr;
|
|
float sigma_corr = r.sigma * r.image_scale_corr;
|
|
if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0)
|
|
continue;
|
|
auto hkl = generator(r);
|
|
auto hkl_key = hkl.pack();
|
|
sigma_corr = CorrectedSigma(I_corr, sigma_corr, hkl_key);
|
|
|
|
// Robust outlier rejection: drop this observation if it sits more than
|
|
// reject_nsigma error-model sigmas from the reflection's median. Needs the active
|
|
// error model so sigma_corr reflects the real scatter (else the threshold is the
|
|
// bare counting sigma and would cull good partials).
|
|
if (reject_outliers && error_model_active) {
|
|
const auto mit = reject_median_I.find(hkl_key);
|
|
if (mit != reject_median_I.end() &&
|
|
std::fabs(I_corr - mit->second) > reject_nsigma * sigma_corr) {
|
|
++reject_count;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
auto it = accumulator.find(hkl_key);
|
|
if (it == accumulator.end())
|
|
it = accumulator.emplace(hkl_key, MergeAccum{
|
|
.h = hkl.plus ? hkl.h : -hkl.h,
|
|
.k = hkl.plus ? hkl.k : -hkl.k,
|
|
.l = hkl.plus ? hkl.l : -hkl.l,
|
|
}).first;
|
|
|
|
const float w = 1.0f / (sigma_corr * sigma_corr);
|
|
const float wI = w * I_corr;
|
|
|
|
it->second.sum_wI += wI;
|
|
it->second.sum_w += w;
|
|
|
|
it->second.sum_wI_half[half] += wI;
|
|
it->second.sum_w_half[half] += w;
|
|
it->second.n_half[half]++;
|
|
|
|
if (!std::isfinite(it->second.d) && std::isfinite(r.d) && r.d > 0.0f)
|
|
it->second.d = r.d;
|
|
}
|
|
}
|
|
|
|
float MergeOnTheFly::CorrectedSigma(float I_corr, float sigma_corr, uint64_t hkl_key) const {
|
|
if (!error_model_active)
|
|
return sigma_corr;
|
|
|
|
// Intensity for the (b*I)^2 term: the reflection's mean (constant over its
|
|
// observations), falling back to this observation only if the mean is unknown.
|
|
const auto it = error_model_mean_I.find(hkl_key);
|
|
const double I_for_b = (it != error_model_mean_I.end()) ? it->second : I_corr;
|
|
|
|
const double v = error_model_a * static_cast<double>(sigma_corr) * sigma_corr
|
|
+ (error_model_b * I_for_b) * (error_model_b * I_for_b);
|
|
return (v > 0.0) ? static_cast<float>(std::sqrt(v)) : sigma_corr;
|
|
}
|
|
|
|
void MergeOnTheFly::RefineErrorModel(const std::vector<IntegrationOutcome> &outcomes) {
|
|
// Reset to identity up front: every early return below then leaves the model
|
|
// inactive (CorrectedSigma returns sigma unchanged) rather than keeping a stale
|
|
// a/b from a previous call alongside a freshly-cleared mean map.
|
|
error_model_active = false;
|
|
error_model_a = 1.0;
|
|
error_model_b = 0.0;
|
|
error_model_mean_I.clear();
|
|
reject_median_I.clear();
|
|
reject_count = 0;
|
|
|
|
// --- 1. Collect accepted, scaled observations grouped by symmetry-equivalent hkl,
|
|
// applying exactly the filters AddImage uses. ---
|
|
struct Obs { float I, sigma; };
|
|
std::unordered_map<uint64_t, std::vector<Obs>> groups;
|
|
|
|
for (const auto &outcome: outcomes) {
|
|
if (Mask(outcome, false))
|
|
continue;
|
|
for (const auto &r: outcome.reflections) {
|
|
if (generator.IsSystematicallyAbsent(r))
|
|
continue;
|
|
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr))
|
|
continue;
|
|
if (!AcceptReflection(r, high_resolution_limit))
|
|
continue;
|
|
if (r.partiality < min_partiality)
|
|
continue;
|
|
const float I_corr = r.I * r.image_scale_corr;
|
|
const float sigma_corr = r.sigma * r.image_scale_corr;
|
|
if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f)
|
|
continue;
|
|
groups[generator(r).pack()].push_back({I_corr, sigma_corr});
|
|
}
|
|
}
|
|
|
|
// --- 2. One global pool of (sigma^2, <I>^2, bias-corrected squared deviation). For an
|
|
// observation in a group of n, the residual from the inverse-variance mean has
|
|
// E[(I_i - <I>)^2] = sigma_i^2 (1 - h_i), h_i = w_i / sum_w (its leverage). The
|
|
// (b*I)^2 term uses the reflection mean, so the mean (not I_i) is the abscissa. ---
|
|
struct Sample { double s2, I2, dev2; };
|
|
std::vector<Sample> samples;
|
|
|
|
for (const auto &[key, obs]: groups) {
|
|
if (obs.size() < 2)
|
|
continue;
|
|
double sum_w = 0.0, sum_wI = 0.0;
|
|
for (const auto &o: obs) {
|
|
const double w = 1.0 / (static_cast<double>(o.sigma) * o.sigma);
|
|
sum_w += w;
|
|
sum_wI += w * o.I;
|
|
}
|
|
if (!(sum_w > 0.0))
|
|
continue;
|
|
const double mean = sum_wI / sum_w;
|
|
error_model_mean_I[key] = static_cast<float>(mean);
|
|
// Robust centre for outlier rejection: the median intensity (resists the very
|
|
// outliers the inverse-variance mean is being protected from). Only when active.
|
|
if (reject_outliers) {
|
|
std::vector<float> iv;
|
|
iv.reserve(obs.size());
|
|
for (const auto &o: obs)
|
|
iv.push_back(o.I);
|
|
std::nth_element(iv.begin(), iv.begin() + iv.size() / 2, iv.end());
|
|
reject_median_I[key] = iv[iv.size() / 2];
|
|
}
|
|
const double I2 = mean * mean;
|
|
for (const auto &o: obs) {
|
|
const double w = 1.0 / (static_cast<double>(o.sigma) * o.sigma);
|
|
const double factor = 1.0 - w / sum_w;
|
|
if (factor < 0.05)
|
|
continue;
|
|
const double resid = static_cast<double>(o.I) - mean;
|
|
samples.push_back({static_cast<double>(o.sigma) * o.sigma, I2, resid * resid / factor});
|
|
}
|
|
}
|
|
|
|
// --- 3. Fit global dev2 = a*sigma^2 + b^2*<I>^2. Bin by intensity (the per-observation
|
|
// dev2 is chi-square-1 noisy) and take medians; weight the bins by 1/dev2^2 so it
|
|
// is a *relative* fit - otherwise the strong bins (which fix b) swamp the weak
|
|
// bins (which fix a) and the weak sigmas stay over-confident. ---
|
|
constexpr int n_bins = 16;
|
|
if (samples.size() < static_cast<size_t>(8 * n_bins))
|
|
return; // too little multiplicity to fit -> leave identity
|
|
|
|
std::sort(samples.begin(), samples.end(),
|
|
[](const Sample &p, const Sample &q) { return p.I2 < q.I2; });
|
|
|
|
auto median = [](std::vector<double> &v) {
|
|
std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
|
|
return v[v.size() / 2];
|
|
};
|
|
|
|
// Per-intensity-bin medians of (sigma^2, <I>^2, dev2).
|
|
std::vector<double> bs2, bI2, bd2;
|
|
bs2.reserve(n_bins); bI2.reserve(n_bins); bd2.reserve(n_bins);
|
|
const size_t per = samples.size() / n_bins;
|
|
for (int bin = 0; bin < n_bins; ++bin) {
|
|
const size_t lo = bin * per;
|
|
const size_t hi = (bin == n_bins - 1) ? samples.size() : lo + per;
|
|
std::vector<double> vs2, vI2, vd2;
|
|
vs2.reserve(hi - lo); vI2.reserve(hi - lo); vd2.reserve(hi - lo);
|
|
for (size_t i = lo; i < hi; ++i) {
|
|
vs2.push_back(samples[i].s2);
|
|
vI2.push_back(samples[i].I2);
|
|
vd2.push_back(samples[i].dev2);
|
|
}
|
|
bs2.push_back(median(vs2));
|
|
bI2.push_back(median(vI2));
|
|
bd2.push_back(median(vd2));
|
|
}
|
|
|
|
// Relative-weighted (1/dev2^2) least squares for (a, b^2). Floor the weight's dev2 at a
|
|
// small fraction of the typical bin dev2: an absolute floor (1e-30) does not stop a
|
|
// near-zero-scatter bin from acquiring a runaway weight and hijacking the fit, so the
|
|
// floor must scale with the data. The regression target keeps the unfloored dev2.
|
|
std::vector<double> bd2_sorted = bd2;
|
|
const double dev2_floor = std::max(1e-30, 1e-3 * median(bd2_sorted));
|
|
double Ass = 0, AsI = 0, AII = 0, Bs = 0, BI = 0;
|
|
for (int bin = 0; bin < n_bins; ++bin) {
|
|
const double s2 = bs2[bin], I2 = bI2[bin], d2 = bd2[bin];
|
|
const double d2w = std::max(d2, dev2_floor);
|
|
const double wgt = 1.0 / (d2w * d2w);
|
|
Ass += wgt * s2 * s2;
|
|
AsI += wgt * s2 * I2;
|
|
AII += wgt * I2 * I2;
|
|
Bs += wgt * s2 * d2;
|
|
BI += wgt * I2 * d2;
|
|
}
|
|
|
|
// Reject a near-collinear (ill-conditioned) system *relatively*: det lies in
|
|
// [0, Ass*AII] by Cauchy-Schwarz, so compare against that scale rather than 1e-30.
|
|
const double det = Ass * AII - AsI * AsI;
|
|
if (!(det > 1e-10 * Ass * AII))
|
|
return;
|
|
const double a = std::clamp((Bs * AII - BI * AsI) / det, 0.25, 100.0);
|
|
const double b2 = std::max((Ass * BI - AsI * Bs) / det, 0.0);
|
|
|
|
error_model_a = a;
|
|
error_model_b = std::sqrt(b2);
|
|
error_model_active = true;
|
|
}
|
|
|
|
// Per-crystal CC1/2-delta rejection (CrystFEL deltaCChalf style). Each image is assigned
|
|
// to one CC1/2 half, so removing an image only perturbs that half's per-reflection means.
|
|
// deltaCChalf_i = CC1/2(all) - CC1/2(without image i): a NEGATIVE value means removing the
|
|
// image RAISES CC1/2, i.e. it is inconsistent with the consensus. We flag images whose
|
|
// deltaCChalf is a low-side statistical outlier (< mean - nsigma*stddev). Reference-free.
|
|
// Two passes over the (retained) outcomes; per-image contributions are re-derived, not
|
|
// stored, so memory stays O(unique reflections + images) for full 200k-frame datasets.
|
|
std::vector<char> MergeOnTheFly::DeltaCChalfReject(const std::vector<IntegrationOutcome> &outcomes,
|
|
double nsigma) const {
|
|
struct Acc { double swI[2] = {0, 0}; double sw[2] = {0, 0}; size_t n[2] = {0, 0}; };
|
|
std::unordered_map<uint64_t, Acc> acc;
|
|
std::vector<int> img_half(outcomes.size(), 0);
|
|
|
|
std::mt19937 lrng{2026061600u};
|
|
std::bernoulli_distribution hd{0.5};
|
|
|
|
// ---- pass 1: accumulate half-set sums, record each image's half ----
|
|
auto contribution = [&](const Reflection &r, uint64_t &key, double &wI, double &w) -> bool {
|
|
if (generator.IsSystematicallyAbsent(r)) return false;
|
|
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr)) return false;
|
|
if (!AcceptReflection(r, high_resolution_limit)) return false;
|
|
if (r.partiality < min_partiality) return false;
|
|
const double I = static_cast<double>(r.I) * r.image_scale_corr;
|
|
const double s = static_cast<double>(r.sigma) * r.image_scale_corr;
|
|
if (!std::isfinite(I) || !std::isfinite(s) || s <= 0.0) return false;
|
|
w = 1.0 / (s * s);
|
|
wI = w * I;
|
|
key = generator(r).pack();
|
|
return true;
|
|
};
|
|
|
|
for (size_t i = 0; i < outcomes.size(); ++i) {
|
|
const int half = hd(lrng) ? 1 : 0;
|
|
img_half[i] = half;
|
|
for (const auto &r : outcomes[i].reflections) {
|
|
uint64_t key; double wI, w;
|
|
if (!contribution(r, key, wI, w)) continue;
|
|
auto &a = acc[key];
|
|
a.swI[half] += wI; a.sw[half] += w; a.n[half]++;
|
|
}
|
|
}
|
|
|
|
// ---- baseline Pearson over reflections present in BOTH halves (x=half0, y=half1) ----
|
|
auto pearson = [](double N, double Sx, double Sy, double Sxx, double Syy, double Sxy) -> double {
|
|
const double cov = N * Sxy - Sx * Sy;
|
|
const double vx = N * Sxx - Sx * Sx, vy = N * Syy - Sy * Sy;
|
|
const double den = std::sqrt(vx * vy);
|
|
return den > 0.0 ? cov / den : 0.0;
|
|
};
|
|
double N = 0, Sx = 0, Sy = 0, Sxx = 0, Syy = 0, Sxy = 0;
|
|
for (const auto &kv : acc) {
|
|
const auto &a = kv.second;
|
|
if (a.n[0] == 0 || a.n[1] == 0) continue;
|
|
const double x = a.swI[0] / a.sw[0], y = a.swI[1] / a.sw[1];
|
|
N += 1; Sx += x; Sy += y; Sxx += x * x; Syy += y * y; Sxy += x * y;
|
|
}
|
|
const double cc_base = pearson(N, Sx, Sy, Sxx, Syy, Sxy);
|
|
|
|
// ---- pass 2: leave-one-out deltaCChalf per image ----
|
|
std::vector<double> delta(outcomes.size(), 0.0);
|
|
for (size_t i = 0; i < outcomes.size(); ++i) {
|
|
const int h = img_half[i];
|
|
// aggregate this image's contributions per reflection key (an image may, rarely,
|
|
// touch the same ASU reflection twice)
|
|
std::unordered_map<uint64_t, std::pair<double, double>> mine; // key -> (sum wI, sum w), count via .first
|
|
std::unordered_map<uint64_t, size_t> mine_n;
|
|
for (const auto &r : outcomes[i].reflections) {
|
|
uint64_t key; double wI, w;
|
|
if (!contribution(r, key, wI, w)) continue;
|
|
auto &p = mine[key]; p.first += wI; p.second += w; mine_n[key]++;
|
|
}
|
|
double n = N, sx = Sx, sy = Sy, sxx = Sxx, syy = Syy, sxy = Sxy;
|
|
for (const auto &m : mine) {
|
|
const auto &a = acc.at(m.first);
|
|
if (a.n[0] == 0 || a.n[1] == 0) continue; // reflection not in CC1/2
|
|
const double x0 = a.swI[0] / a.sw[0], y0 = a.swI[1] / a.sw[1];
|
|
n -= 1; sx -= x0; sy -= y0; sxx -= x0 * x0; syy -= y0 * y0; sxy -= x0 * y0;
|
|
const double swI_h = a.swI[h] - m.second.first;
|
|
const double sw_h = a.sw[h] - m.second.second;
|
|
if (a.n[h] - mine_n[m.first] == 0 || sw_h <= 0.0) continue; // reflection drops half h
|
|
const double mean_h = swI_h / sw_h;
|
|
const double xnew = (h == 0) ? mean_h : x0;
|
|
const double ynew = (h == 1) ? mean_h : y0;
|
|
n += 1; sx += xnew; sy += ynew; sxx += xnew * xnew; syy += ynew * ynew; sxy += xnew * ynew;
|
|
}
|
|
delta[i] = cc_base - pearson(n, sx, sy, sxx, syy, sxy);
|
|
}
|
|
|
|
// ---- reject low-side outliers: delta < mean - nsigma*stddev ----
|
|
double dm = 0, dv = 0;
|
|
for (double d : delta) dm += d;
|
|
dm /= std::max<size_t>(1, delta.size());
|
|
for (double d : delta) dv += (d - dm) * (d - dm);
|
|
const double dstd = std::sqrt(dv / std::max<size_t>(1, delta.size()));
|
|
const double cut = dm - nsigma * dstd;
|
|
|
|
std::vector<char> reject(outcomes.size(), 0);
|
|
for (size_t i = 0; i < outcomes.size(); ++i)
|
|
reject[i] = (outcomes[i].reflections.empty() ? 0 : (delta[i] < cut ? 1 : 0));
|
|
return reject;
|
|
}
|
|
|
|
bool MergeOnTheFly::Mask(const IntegrationOutcome &outcome, bool cc_mask) {
|
|
if (reference_cell) {
|
|
auto cell = outcome.latt.GetUnitCell();
|
|
if (!cell.is_close(*reference_cell,
|
|
indexing_settings.GetUnitCellDistTolerance(),
|
|
indexing_settings.GetUnitCellAngleTolerance_deg()))
|
|
return true;
|
|
}
|
|
|
|
if (cc_mask && image_cc_limit) {
|
|
if (!outcome.image_scale_cc
|
|
|| std::isnan(outcome.image_scale_cc.value())
|
|
|| outcome.image_scale_cc.value() < image_cc_limit.value())
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
std::vector<MergedReflection> MergeOnTheFly::ExportReflections() {
|
|
std::unique_lock ul(merged_mutex);
|
|
|
|
float d_min = std::numeric_limits<float>::max();
|
|
float d_max = 0.0f;
|
|
|
|
std::vector<MergedReflection> out;
|
|
out.reserve(accumulator.size());
|
|
for (const auto &accum: accumulator | std::views::values) {
|
|
if (accum.sum_w <= 0.0)
|
|
continue;
|
|
|
|
MergedReflection mr{
|
|
.h = accum.h,
|
|
.k = accum.k,
|
|
.l = accum.l,
|
|
.I = static_cast<float>(accum.sum_wI / accum.sum_w),
|
|
.sigma = 1.0f / std::sqrt(static_cast<float>(accum.sum_w)),
|
|
.I_half = {NAN, NAN},
|
|
.sigma_half = {NAN, NAN},
|
|
.d = accum.d
|
|
};
|
|
|
|
if (accum.n_half[0] + accum.n_half[1] > 0 && accum.sum_w_half[0] > 0.0 && accum.sum_w_half[1] > 0.0) {
|
|
for (int i = 0; i < 2; ++i) {
|
|
mr.I_half[i] = static_cast<float>(accum.sum_wI_half[i] / accum.sum_w_half[i]);
|
|
mr.sigma_half[i] = 1.0f / std::sqrt(static_cast<float>(accum.sum_w_half[i]));
|
|
}
|
|
}
|
|
|
|
if (!std::isfinite(accum.d) || accum.d <= 0.0f)
|
|
continue;
|
|
|
|
d_min = std::min(d_min, accum.d);
|
|
d_max = std::max(d_max, accum.d);
|
|
|
|
out.emplace_back(mr);
|
|
}
|
|
|
|
const double rfree_fraction = scaling_settings.GetRfreeFraction();
|
|
if (rfree_fraction > 0.0 && !out.empty()) {
|
|
if (d_min < d_max && d_min > 0.0f) {
|
|
constexpr int n_shells = 20;
|
|
|
|
const float d_min_pad = d_min * 0.999f;
|
|
const float d_max_pad = d_max * 1.001f;
|
|
|
|
ResolutionShells shells(d_min_pad, d_max_pad, n_shells);
|
|
std::vector<std::vector<size_t>> shell_groups(n_shells);
|
|
|
|
for (size_t i = 0; i < out.size(); ++i) {
|
|
const auto shell = shells.GetShell(out[i].d);
|
|
if (!shell.has_value())
|
|
continue;
|
|
|
|
const int s = *shell;
|
|
if (s >= 0 && s < n_shells)
|
|
shell_groups[s].push_back(i);
|
|
}
|
|
|
|
std::mt19937 rfree_rng(12345u);
|
|
std::bernoulli_distribution rfree_dist(rfree_fraction);
|
|
|
|
for (const auto &group: shell_groups) {
|
|
for (const size_t idx: group)
|
|
out[idx].rfree_flag = rfree_dist(rfree_rng);
|
|
}
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
|
|
std::vector<MergedReflection> MergeAll(const DiffractionExperiment &x,
|
|
const std::vector<IntegrationOutcome> &integration_outcome,
|
|
bool mask) {
|
|
MergeOnTheFly merge(x);
|
|
for (const auto &outcome: integration_outcome)
|
|
merge.AddImage(outcome, mask);
|
|
return merge.ExportReflections();
|
|
}
|
|
|
|
struct ShellAccum {
|
|
int total_obs = 0;
|
|
int unique = 0;
|
|
int possible = 0;
|
|
|
|
double sum_i_over_sigma = 0.0;
|
|
int n_i_over_sigma = 0;
|
|
|
|
CorrelationCoefficient cc_half;
|
|
CorrelationCoefficient cc_ref;
|
|
};
|
|
|
|
void CalcPossibleReflections(int space_group_number ,
|
|
const UnitCell &cell,
|
|
double d_min,
|
|
double d_max,
|
|
const ResolutionShells &shells,
|
|
std::vector<ShellAccum> &acc) {
|
|
gemmi::UnitCell gemmi_cell = cell;
|
|
const gemmi::SpaceGroup *sg = gemmi::find_spacegroup_by_number(space_group_number);
|
|
if (sg == nullptr)
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
|
|
"Invalid space group number " + std::to_string(space_group_number));
|
|
|
|
// Generate unique reflections
|
|
std::vector<gemmi::Miller> possible_hkls = gemmi::make_miller_vector(gemmi_cell, sg, d_min, d_max, true);
|
|
CrystalLattice lattice(cell);
|
|
const auto astar = lattice.Astar();
|
|
const auto bstar = lattice.Bstar();
|
|
const auto cstar = lattice.Cstar();
|
|
|
|
for (const auto &hkl: possible_hkls) {
|
|
const auto q = hkl[0] * astar + hkl[1] * bstar + hkl[2] * cstar;
|
|
const auto qlen = q.Length();
|
|
if (qlen < 1e-6)
|
|
continue;
|
|
const auto d = 1.0 / qlen;
|
|
const auto shell = shells.GetShell(d);
|
|
if (!shell.has_value())
|
|
continue;
|
|
const int s = *shell;
|
|
if (s >= 0 && s < acc.size())
|
|
acc[s].possible++;
|
|
}
|
|
}
|
|
|
|
|
|
MergeStatistics MergeOnTheFly::MergeStats(const std::vector<MergedReflection> &merged,
|
|
const std::vector<IntegrationOutcome > &integration_outcome,
|
|
const std::vector<MergedReflection> &reference) {
|
|
|
|
constexpr int n_shells = 10;
|
|
|
|
auto d_min_limit_A = scaling_settings.GetHighResolutionLimit_A();
|
|
|
|
std::unordered_map<uint64_t, float> reference_intensities;
|
|
if (!reference.empty()) {
|
|
reference_intensities.reserve(reference.size());
|
|
for (const auto &r: reference) {
|
|
if (!std::isfinite(r.I))
|
|
continue;
|
|
|
|
const auto hkl = generator(r);
|
|
reference_intensities[hkl.pack()] = r.I;
|
|
}
|
|
}
|
|
|
|
float d_min = std::numeric_limits<float>::max();
|
|
float d_max = 0.0f;
|
|
|
|
for (const auto &m: merged) {
|
|
if (!std::isfinite(m.d) || m.d <= 0.0f)
|
|
continue;
|
|
if (d_min_limit_A && m.d < d_min_limit_A)
|
|
continue;
|
|
|
|
d_min = std::min(d_min, m.d);
|
|
d_max = std::max(d_max, m.d);
|
|
}
|
|
|
|
if (!(d_min < d_max && d_min > 0.0f))
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
|
|
"MergeStats: Error in resolution calculation");
|
|
|
|
const float d_min_pad = d_min * 0.999f;
|
|
const float d_max_pad = d_max * 1.001f;
|
|
|
|
ResolutionShells shells(d_min_pad, d_max_pad, n_shells);
|
|
const auto shell_mean_1_d2 = shells.GetShellMeanOneOverResSq();
|
|
const auto shell_min_res = shells.GetShellMinRes();
|
|
|
|
std::vector<ShellAccum> acc(n_shells);
|
|
|
|
if (reference_cell.has_value())
|
|
CalcPossibleReflections(space_group_number, reference_cell.value(),
|
|
d_min_pad, d_max_pad, shells, acc);
|
|
|
|
CorrelationCoefficient cc_half_overall;
|
|
CorrelationCoefficient cc_ref_overall;
|
|
|
|
for (const auto &m: merged) {
|
|
const auto shell = shells.GetShell(m.d);
|
|
if (!shell.has_value())
|
|
continue;
|
|
|
|
const int s = *shell;
|
|
if (s >= 0 && s < n_shells) {
|
|
if (std::isfinite(m.I) && std::isfinite(m.sigma) && m.sigma > 0.0) {
|
|
acc[s].unique++;
|
|
acc[s].sum_i_over_sigma += m.I / m.sigma;
|
|
++acc[s].n_i_over_sigma;
|
|
|
|
if (!reference_intensities.empty()) {
|
|
const auto hkl = generator(m);
|
|
const auto ref_it = reference_intensities.find(hkl.pack());
|
|
if (ref_it != reference_intensities.end() && std::isfinite(ref_it->second)) {
|
|
acc[s].cc_ref.Add(m.I, ref_it->second);
|
|
cc_ref_overall.Add(m.I, ref_it->second);
|
|
}
|
|
}
|
|
|
|
if (std::isfinite(m.I_half[0]) && std::isfinite(m.I_half[1])) {
|
|
acc[s].cc_half.Add(m.I_half[0], m.I_half[1]);
|
|
cc_half_overall.Add(m.I_half[0], m.I_half[1]);
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < integration_outcome.size(); ++i) {
|
|
if (Mask(integration_outcome[i], true))
|
|
continue;
|
|
|
|
for (const auto &r: integration_outcome[i].reflections) {
|
|
if (generator.IsSystematicallyAbsent(r))
|
|
continue;
|
|
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr))
|
|
continue;
|
|
if (!AcceptReflection(r, d_min_limit_A))
|
|
continue;
|
|
if (r.partiality < min_partiality)
|
|
continue;
|
|
|
|
const float I_corr = r.I * r.image_scale_corr;
|
|
const float sigma_corr = r.sigma * r.image_scale_corr;
|
|
if (!std::isfinite(I_corr) || !std::isfinite(sigma_corr) || sigma_corr <= 0.0f)
|
|
continue;
|
|
|
|
const auto shell = shells.GetShell(r.d);
|
|
if (!shell.has_value())
|
|
continue;
|
|
const int s = *shell;
|
|
if (s >= 0 && s < n_shells)
|
|
acc[s].total_obs++;
|
|
}
|
|
}
|
|
|
|
MergeStatistics out;
|
|
out.shells.resize(n_shells);
|
|
|
|
for (int s = 0; s < n_shells; ++s) {
|
|
const auto &sa = acc[s];
|
|
auto &ss = out.shells[s];
|
|
|
|
ss.mean_one_over_d2 = shell_mean_1_d2[s];
|
|
ss.d_min = shell_min_res[s];
|
|
ss.d_max = s == 0 ? d_max_pad : shell_min_res[s - 1];
|
|
ss.total_observations = sa.total_obs;
|
|
ss.unique_reflections = sa.unique;
|
|
ss.possible_unique_reflections = sa.possible;
|
|
ss.mean_i_over_sigma = sa.n_i_over_sigma > 0
|
|
? sa.sum_i_over_sigma / sa.n_i_over_sigma
|
|
: 0.0;
|
|
|
|
ss.cc_half = sa.cc_half.GetCC();
|
|
ss.cc_ref = sa.cc_ref.GetCC();
|
|
}
|
|
|
|
auto &overall = out.overall;
|
|
overall.d_min = d_min;
|
|
overall.d_max = d_max;
|
|
|
|
int all_possible = 0;
|
|
int all_unique = 0;
|
|
double sum_i_over_sigma = 0.0;
|
|
int n_i_over_sigma = 0;
|
|
|
|
|
|
for (const auto &sa: acc) {
|
|
overall.total_observations += sa.total_obs;
|
|
all_unique += sa.unique;
|
|
all_possible += sa.possible;
|
|
sum_i_over_sigma += sa.sum_i_over_sigma;
|
|
n_i_over_sigma += sa.n_i_over_sigma;
|
|
|
|
}
|
|
|
|
overall.possible_unique_reflections = all_possible;
|
|
overall.unique_reflections = all_unique;
|
|
overall.mean_i_over_sigma = n_i_over_sigma > 0 ? sum_i_over_sigma / n_i_over_sigma : 0.0;
|
|
overall.cc_half = cc_half_overall.GetCC();
|
|
overall.cc_ref = cc_ref_overall.GetCC();
|
|
|
|
return out;
|
|
}
|
|
|
|
std::ostream &operator<<(std::ostream &output, const MergeStatisticsShell &in) {
|
|
double completeness = in.possible_unique_reflections > 0
|
|
? static_cast<double>(in.unique_reflections) / in.possible_unique_reflections * 100.0 : 0.0;
|
|
|
|
output << fmt::format("{:8d} {:8d} {:8d} {:7.1f}% {:8.1f} {:7.1f}% {:7.1f}%",
|
|
in.total_observations,
|
|
in.unique_reflections,
|
|
in.possible_unique_reflections,
|
|
completeness,
|
|
in.mean_i_over_sigma,
|
|
in.cc_half*100.0,
|
|
in.cc_ref*100.0);
|
|
return output;
|
|
}
|
|
|
|
std::ostream &operator<<(std::ostream &output, const MergeStatistics &in) {
|
|
output << std::endl;
|
|
output << fmt::format(" {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}",
|
|
"d_min", "N_obs", "N_uniq", "N_possib", "Compl","<I/sig>", "CC1/2", "CCref")
|
|
<< std::endl;
|
|
output << fmt::format(" {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s}",
|
|
"", "", "", "", "", "", "", "") << std::endl;
|
|
for (const auto &sh: in.shells) {
|
|
if (sh.unique_reflections == 0)
|
|
continue;
|
|
output << fmt::format(" {:8.2f} ", sh.d_min);
|
|
output << sh;
|
|
output << std::endl;
|
|
}
|
|
output << fmt::format(" {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s} {:->8s}",
|
|
"", "", "", "", "", "", "", "") << std::endl;
|
|
|
|
output << fmt::format(" {:>8s} ", "Overall");
|
|
output << in.overall;
|
|
output << std::endl;
|
|
output << std::endl;
|
|
return output;
|
|
}
|