Merge: per-crystal CC1/2-delta rejection (--reject-delta-cchalf)

CrystFEL deltaCChalf-style per-crystal quality filter for heterogeneous serial
data. Each image is assigned to one CC1/2 half, so removing it perturbs only that
half's per-reflection means; deltaCChalf_i = CC1/2(all) - CC1/2(without image i).
A negative value means dropping the image RAISES CC1/2 (it disagrees with the
consensus). Images whose deltaCChalf is a low-side statistical outlier
(< mean - N*stddev) are skipped when merging. Reference-free.

Two passes over the retained integration outcomes; per-image contributions are
re-derived rather than stored, so memory stays O(unique reflections + images) for
full 200k-frame runs. New CLI flag --reject-delta-cchalf <N> (default: off).

Validation (jet FFBIDX +C+S, sigma4): removing 17/4000 (3 sigma) raises CC1/2
95.1->96.1%, CCref 54.9->55.2; 2 sigma -> 96.1/55.3. Dataset-appropriate: it HELPS
heterogeneous serial data (some crystals genuinely bad) but slightly trims a
homogeneous single rotation crystal (c2 94.6->93.8 - no bad crystals, the relative
cut still removes the tail), so it is opt-in. R-free is the real test (user's full
200k). Note: the reported overall N_obs still counts all observations; the exported
merge (and CC1/2) correctly exclude the rejected images.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:32:52 +02:00
parent f878fb9d5d
commit 545ebdf868
3 changed files with 129 additions and 3 deletions
+102
View File
@@ -260,6 +260,108 @@ void MergeOnTheFly::RefineErrorModel(const std::vector<IntegrationOutcome> &outc
error_model_active = true;
}
// Per-crystal CC1/2-delta rejection (CrystFEL deltaCChalf style). Each image is assigned
// to one CC1/2 half, so removing an image only perturbs that half's per-reflection means.
// deltaCChalf_i = CC1/2(all) - CC1/2(without image i): a NEGATIVE value means removing the
// image RAISES CC1/2, i.e. it is inconsistent with the consensus. We flag images whose
// deltaCChalf is a low-side statistical outlier (< mean - nsigma*stddev). Reference-free.
// Two passes over the (retained) outcomes; per-image contributions are re-derived, not
// stored, so memory stays O(unique reflections + images) for full 200k-frame datasets.
std::vector<char> MergeOnTheFly::DeltaCChalfReject(const std::vector<IntegrationOutcome> &outcomes,
double nsigma) const {
struct Acc { double swI[2] = {0, 0}; double sw[2] = {0, 0}; size_t n[2] = {0, 0}; };
std::unordered_map<uint64_t, Acc> acc;
std::vector<int> img_half(outcomes.size(), 0);
std::mt19937 lrng{2026061600u};
std::bernoulli_distribution hd{0.5};
// ---- pass 1: accumulate half-set sums, record each image's half ----
auto contribution = [&](const Reflection &r, uint64_t &key, double &wI, double &w) -> bool {
if (generator.IsSystematicallyAbsent(r)) return false;
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr)) return false;
if (!AcceptReflection(r, high_resolution_limit)) return false;
if (r.partiality < min_partiality) return false;
const double I = static_cast<double>(r.I) * r.image_scale_corr;
const double s = static_cast<double>(r.sigma) * r.image_scale_corr;
if (!std::isfinite(I) || !std::isfinite(s) || s <= 0.0) return false;
w = 1.0 / (s * s);
wI = w * I;
key = generator(r).pack();
return true;
};
for (size_t i = 0; i < outcomes.size(); ++i) {
const int half = hd(lrng) ? 1 : 0;
img_half[i] = half;
for (const auto &r : outcomes[i].reflections) {
uint64_t key; double wI, w;
if (!contribution(r, key, wI, w)) continue;
auto &a = acc[key];
a.swI[half] += wI; a.sw[half] += w; a.n[half]++;
}
}
// ---- baseline Pearson over reflections present in BOTH halves (x=half0, y=half1) ----
auto pearson = [](double N, double Sx, double Sy, double Sxx, double Syy, double Sxy) -> double {
const double cov = N * Sxy - Sx * Sy;
const double vx = N * Sxx - Sx * Sx, vy = N * Syy - Sy * Sy;
const double den = std::sqrt(vx * vy);
return den > 0.0 ? cov / den : 0.0;
};
double N = 0, Sx = 0, Sy = 0, Sxx = 0, Syy = 0, Sxy = 0;
for (const auto &kv : acc) {
const auto &a = kv.second;
if (a.n[0] == 0 || a.n[1] == 0) continue;
const double x = a.swI[0] / a.sw[0], y = a.swI[1] / a.sw[1];
N += 1; Sx += x; Sy += y; Sxx += x * x; Syy += y * y; Sxy += x * y;
}
const double cc_base = pearson(N, Sx, Sy, Sxx, Syy, Sxy);
// ---- pass 2: leave-one-out deltaCChalf per image ----
std::vector<double> delta(outcomes.size(), 0.0);
for (size_t i = 0; i < outcomes.size(); ++i) {
const int h = img_half[i];
// aggregate this image's contributions per reflection key (an image may, rarely,
// touch the same ASU reflection twice)
std::unordered_map<uint64_t, std::pair<double, double>> mine; // key -> (sum wI, sum w), count via .first
std::unordered_map<uint64_t, size_t> mine_n;
for (const auto &r : outcomes[i].reflections) {
uint64_t key; double wI, w;
if (!contribution(r, key, wI, w)) continue;
auto &p = mine[key]; p.first += wI; p.second += w; mine_n[key]++;
}
double n = N, sx = Sx, sy = Sy, sxx = Sxx, syy = Syy, sxy = Sxy;
for (const auto &m : mine) {
const auto &a = acc.at(m.first);
if (a.n[0] == 0 || a.n[1] == 0) continue; // reflection not in CC1/2
const double x0 = a.swI[0] / a.sw[0], y0 = a.swI[1] / a.sw[1];
n -= 1; sx -= x0; sy -= y0; sxx -= x0 * x0; syy -= y0 * y0; sxy -= x0 * y0;
const double swI_h = a.swI[h] - m.second.first;
const double sw_h = a.sw[h] - m.second.second;
if (a.n[h] - mine_n[m.first] == 0 || sw_h <= 0.0) continue; // reflection drops half h
const double mean_h = swI_h / sw_h;
const double xnew = (h == 0) ? mean_h : x0;
const double ynew = (h == 1) ? mean_h : y0;
n += 1; sx += xnew; sy += ynew; sxx += xnew * xnew; syy += ynew * ynew; sxy += xnew * ynew;
}
delta[i] = cc_base - pearson(n, sx, sy, sxx, syy, sxy);
}
// ---- reject low-side outliers: delta < mean - nsigma*stddev ----
double dm = 0, dv = 0;
for (double d : delta) dm += d;
dm /= std::max<size_t>(1, delta.size());
for (double d : delta) dv += (d - dm) * (d - dm);
const double dstd = std::sqrt(dv / std::max<size_t>(1, delta.size()));
const double cut = dm - nsigma * dstd;
std::vector<char> reject(outcomes.size(), 0);
for (size_t i = 0; i < outcomes.size(); ++i)
reject[i] = (outcomes[i].reflections.empty() ? 0 : (delta[i] < cut ? 1 : 0));
return reject;
}
bool MergeOnTheFly::Mask(const IntegrationOutcome &outcome, bool cc_mask) {
if (reference_cell) {
auto cell = outcome.latt.GetUnitCell();
+6
View File
@@ -114,6 +114,12 @@ public:
void AddImage(const IntegrationOutcome& outcome, bool cc_mask = false);
// Per-crystal CC1/2-delta rejection (CrystFEL deltaCChalf): returns a per-image flag
// marking images whose removal would raise CC1/2 by a low-side outlier amount
// (deltaCChalf < mean - nsigma*stddev). Skip the flagged images when merging.
[[nodiscard]] std::vector<char> DeltaCChalfReject(const std::vector<IntegrationOutcome> &outcomes,
double nsigma) const;
MergeStatistics MergeStats(const std::vector<MergedReflection> &merged,
const std::vector<IntegrationOutcome> &reflections,
const std::vector<MergedReflection> &reference = {});
+21 -3
View File
@@ -73,6 +73,7 @@ void print_usage() {
std::cout << " --scaling-high-resolution <num> High resolution limit for spot finding (default: no limit)" << std::endl;
std::cout << " --min-partiality <num> Minimum partiality to accept reflection (default: 0.02)" << std::endl;
std::cout << " --reject-outliers <num> Per-observation merge outlier rejection, N sigma from the per-reflection median (default: off; e.g. 6, XDS/DIALS-style)" << std::endl;
std::cout << " --reject-delta-cchalf <num> Per-crystal CC1/2-delta rejection: drop images with deltaCChalf below mean - N*stddev (default: off; e.g. 2.5)" << std::endl;
std::cout << " --min-image-cc <num> Per-image CC limit in percent (default: no limit)" << std::endl;
std::cout << " --scaling-iterations <num> Number of scaling iterations with no reference data (default: 3)" << std::endl;
std::cout << " --scaling-output <txt> Output format for scaling results mtz|cif|txt (default: mtz)" << std::endl;
@@ -101,7 +102,8 @@ enum {
OPT_BANDWIDTH,
OPT_INTEGRATION_RADIUS,
OPT_REJECT_OUTLIERS,
OPT_PROFILE_MULTIPLIER
OPT_PROFILE_MULTIPLIER,
OPT_REJECT_DELTA_CCHALF
};
static option long_options[] = {
@@ -140,6 +142,7 @@ static option long_options[] = {
{"bandwidth", required_argument, nullptr, OPT_BANDWIDTH},
{"integration-radius", required_argument, nullptr, OPT_INTEGRATION_RADIUS},
{"reject-outliers", required_argument, nullptr, OPT_REJECT_OUTLIERS},
{"reject-delta-cchalf", required_argument, nullptr, OPT_REJECT_DELTA_CCHALF},
{"profile-multiplier", required_argument, nullptr, OPT_PROFILE_MULTIPLIER},
{nullptr, 0, nullptr, 0}
};
@@ -323,6 +326,7 @@ int main(int argc, char **argv) {
std::optional<float> d_min_scale_merge;
std::optional<std::string> integration_radius_arg; // "r1" or "r1,r2,r3"
std::optional<double> outlier_reject_nsigma; // merge per-observation outlier rejection
std::optional<double> delta_cchalf_nsigma; // per-crystal CC1/2-delta rejection
std::optional<float> profile_multiplier; // PixelRefine Term-2 profile-width multiplier
if (argc == 1) {
@@ -519,6 +523,9 @@ int main(int argc, char **argv) {
case OPT_REJECT_OUTLIERS:
outlier_reject_nsigma = std::stod(optarg);
break;
case OPT_REJECT_DELTA_CCHALF:
delta_cchalf_nsigma = std::stod(optarg);
break;
case OPT_PROFILE_MULTIPLIER:
profile_multiplier = std::stof(optarg);
break;
@@ -1057,8 +1064,19 @@ int main(int argc, char **argv) {
const double isa = (b > 0.0) ? 1.0 / b : std::numeric_limits<double>::infinity();
logger.Info("Error model: sigma'^2 = {:.3f} sigma^2 + ({:.4f} I)^2 ISa = {:.1f}", a, b, isa);
}
for (auto &i : indexer.GetIntegrationOutcome())
merge_engine.AddImage(i);
const auto &merge_outcomes = indexer.GetIntegrationOutcome();
std::vector<char> dcch_reject;
if (delta_cchalf_nsigma) {
dcch_reject = merge_engine.DeltaCChalfReject(merge_outcomes, *delta_cchalf_nsigma);
const size_t nrej = std::count(dcch_reject.begin(), dcch_reject.end(), static_cast<char>(1));
logger.Info("CC1/2-delta rejection (deltaCChalf < mean - {:.1f} stddev) removed {} / {} images",
*delta_cchalf_nsigma, nrej, dcch_reject.size());
}
for (size_t i = 0; i < merge_outcomes.size(); ++i) {
if (!dcch_reject.empty() && dcch_reject[i])
continue;
merge_engine.AddImage(merge_outcomes[i]);
}
if (merge_engine.RejectedCount() > 0)
logger.Info("Outlier rejection (>{:.1f} sigma from the per-reflection median) removed {} observations",
experiment.GetScalingSettings().GetOutlierRejectNsigma(), merge_engine.RejectedCount());