Merge: per-crystal CC1/2-delta rejection (--reject-delta-cchalf)
CrystFEL deltaCChalf-style per-crystal quality filter for heterogeneous serial data. Each image is assigned to one CC1/2 half, so removing it perturbs only that half's per-reflection means; deltaCChalf_i = CC1/2(all) - CC1/2(without image i). A negative value means dropping the image RAISES CC1/2 (it disagrees with the consensus). Images whose deltaCChalf is a low-side statistical outlier (< mean - N*stddev) are skipped when merging. Reference-free. Two passes over the retained integration outcomes; per-image contributions are re-derived rather than stored, so memory stays O(unique reflections + images) for full 200k-frame runs. New CLI flag --reject-delta-cchalf <N> (default: off). Validation (jet FFBIDX +C+S, sigma4): removing 17/4000 (3 sigma) raises CC1/2 95.1->96.1%, CCref 54.9->55.2; 2 sigma -> 96.1/55.3. Dataset-appropriate: it HELPS heterogeneous serial data (some crystals genuinely bad) but slightly trims a homogeneous single rotation crystal (c2 94.6->93.8 - no bad crystals, the relative cut still removes the tail), so it is opt-in. R-free is the real test (user's full 200k). Note: the reported overall N_obs still counts all observations; the exported merge (and CC1/2) correctly exclude the rejected images. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -260,6 +260,108 @@ void MergeOnTheFly::RefineErrorModel(const std::vector<IntegrationOutcome> &outc
|
||||
error_model_active = true;
|
||||
}
|
||||
|
||||
// Per-crystal CC1/2-delta rejection (CrystFEL deltaCChalf style). Each image is assigned
|
||||
// to one CC1/2 half, so removing an image only perturbs that half's per-reflection means.
|
||||
// deltaCChalf_i = CC1/2(all) - CC1/2(without image i): a NEGATIVE value means removing the
|
||||
// image RAISES CC1/2, i.e. it is inconsistent with the consensus. We flag images whose
|
||||
// deltaCChalf is a low-side statistical outlier (< mean - nsigma*stddev). Reference-free.
|
||||
// Two passes over the (retained) outcomes; per-image contributions are re-derived, not
|
||||
// stored, so memory stays O(unique reflections + images) for full 200k-frame datasets.
|
||||
std::vector<char> MergeOnTheFly::DeltaCChalfReject(const std::vector<IntegrationOutcome> &outcomes,
|
||||
double nsigma) const {
|
||||
struct Acc { double swI[2] = {0, 0}; double sw[2] = {0, 0}; size_t n[2] = {0, 0}; };
|
||||
std::unordered_map<uint64_t, Acc> acc;
|
||||
std::vector<int> img_half(outcomes.size(), 0);
|
||||
|
||||
std::mt19937 lrng{2026061600u};
|
||||
std::bernoulli_distribution hd{0.5};
|
||||
|
||||
// ---- pass 1: accumulate half-set sums, record each image's half ----
|
||||
auto contribution = [&](const Reflection &r, uint64_t &key, double &wI, double &w) -> bool {
|
||||
if (generator.IsSystematicallyAbsent(r)) return false;
|
||||
if (r.image_scale_corr <= 0.0 || !std::isfinite(r.image_scale_corr)) return false;
|
||||
if (!AcceptReflection(r, high_resolution_limit)) return false;
|
||||
if (r.partiality < min_partiality) return false;
|
||||
const double I = static_cast<double>(r.I) * r.image_scale_corr;
|
||||
const double s = static_cast<double>(r.sigma) * r.image_scale_corr;
|
||||
if (!std::isfinite(I) || !std::isfinite(s) || s <= 0.0) return false;
|
||||
w = 1.0 / (s * s);
|
||||
wI = w * I;
|
||||
key = generator(r).pack();
|
||||
return true;
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < outcomes.size(); ++i) {
|
||||
const int half = hd(lrng) ? 1 : 0;
|
||||
img_half[i] = half;
|
||||
for (const auto &r : outcomes[i].reflections) {
|
||||
uint64_t key; double wI, w;
|
||||
if (!contribution(r, key, wI, w)) continue;
|
||||
auto &a = acc[key];
|
||||
a.swI[half] += wI; a.sw[half] += w; a.n[half]++;
|
||||
}
|
||||
}
|
||||
|
||||
// ---- baseline Pearson over reflections present in BOTH halves (x=half0, y=half1) ----
|
||||
auto pearson = [](double N, double Sx, double Sy, double Sxx, double Syy, double Sxy) -> double {
|
||||
const double cov = N * Sxy - Sx * Sy;
|
||||
const double vx = N * Sxx - Sx * Sx, vy = N * Syy - Sy * Sy;
|
||||
const double den = std::sqrt(vx * vy);
|
||||
return den > 0.0 ? cov / den : 0.0;
|
||||
};
|
||||
double N = 0, Sx = 0, Sy = 0, Sxx = 0, Syy = 0, Sxy = 0;
|
||||
for (const auto &kv : acc) {
|
||||
const auto &a = kv.second;
|
||||
if (a.n[0] == 0 || a.n[1] == 0) continue;
|
||||
const double x = a.swI[0] / a.sw[0], y = a.swI[1] / a.sw[1];
|
||||
N += 1; Sx += x; Sy += y; Sxx += x * x; Syy += y * y; Sxy += x * y;
|
||||
}
|
||||
const double cc_base = pearson(N, Sx, Sy, Sxx, Syy, Sxy);
|
||||
|
||||
// ---- pass 2: leave-one-out deltaCChalf per image ----
|
||||
std::vector<double> delta(outcomes.size(), 0.0);
|
||||
for (size_t i = 0; i < outcomes.size(); ++i) {
|
||||
const int h = img_half[i];
|
||||
// aggregate this image's contributions per reflection key (an image may, rarely,
|
||||
// touch the same ASU reflection twice)
|
||||
std::unordered_map<uint64_t, std::pair<double, double>> mine; // key -> (sum wI, sum w), count via .first
|
||||
std::unordered_map<uint64_t, size_t> mine_n;
|
||||
for (const auto &r : outcomes[i].reflections) {
|
||||
uint64_t key; double wI, w;
|
||||
if (!contribution(r, key, wI, w)) continue;
|
||||
auto &p = mine[key]; p.first += wI; p.second += w; mine_n[key]++;
|
||||
}
|
||||
double n = N, sx = Sx, sy = Sy, sxx = Sxx, syy = Syy, sxy = Sxy;
|
||||
for (const auto &m : mine) {
|
||||
const auto &a = acc.at(m.first);
|
||||
if (a.n[0] == 0 || a.n[1] == 0) continue; // reflection not in CC1/2
|
||||
const double x0 = a.swI[0] / a.sw[0], y0 = a.swI[1] / a.sw[1];
|
||||
n -= 1; sx -= x0; sy -= y0; sxx -= x0 * x0; syy -= y0 * y0; sxy -= x0 * y0;
|
||||
const double swI_h = a.swI[h] - m.second.first;
|
||||
const double sw_h = a.sw[h] - m.second.second;
|
||||
if (a.n[h] - mine_n[m.first] == 0 || sw_h <= 0.0) continue; // reflection drops half h
|
||||
const double mean_h = swI_h / sw_h;
|
||||
const double xnew = (h == 0) ? mean_h : x0;
|
||||
const double ynew = (h == 1) ? mean_h : y0;
|
||||
n += 1; sx += xnew; sy += ynew; sxx += xnew * xnew; syy += ynew * ynew; sxy += xnew * ynew;
|
||||
}
|
||||
delta[i] = cc_base - pearson(n, sx, sy, sxx, syy, sxy);
|
||||
}
|
||||
|
||||
// ---- reject low-side outliers: delta < mean - nsigma*stddev ----
|
||||
double dm = 0, dv = 0;
|
||||
for (double d : delta) dm += d;
|
||||
dm /= std::max<size_t>(1, delta.size());
|
||||
for (double d : delta) dv += (d - dm) * (d - dm);
|
||||
const double dstd = std::sqrt(dv / std::max<size_t>(1, delta.size()));
|
||||
const double cut = dm - nsigma * dstd;
|
||||
|
||||
std::vector<char> reject(outcomes.size(), 0);
|
||||
for (size_t i = 0; i < outcomes.size(); ++i)
|
||||
reject[i] = (outcomes[i].reflections.empty() ? 0 : (delta[i] < cut ? 1 : 0));
|
||||
return reject;
|
||||
}
|
||||
|
||||
bool MergeOnTheFly::Mask(const IntegrationOutcome &outcome, bool cc_mask) {
|
||||
if (reference_cell) {
|
||||
auto cell = outcome.latt.GetUnitCell();
|
||||
|
||||
@@ -114,6 +114,12 @@ public:
|
||||
|
||||
void AddImage(const IntegrationOutcome& outcome, bool cc_mask = false);
|
||||
|
||||
// Per-crystal CC1/2-delta rejection (CrystFEL deltaCChalf): returns a per-image flag
|
||||
// marking images whose removal would raise CC1/2 by a low-side outlier amount
|
||||
// (deltaCChalf < mean - nsigma*stddev). Skip the flagged images when merging.
|
||||
[[nodiscard]] std::vector<char> DeltaCChalfReject(const std::vector<IntegrationOutcome> &outcomes,
|
||||
double nsigma) const;
|
||||
|
||||
MergeStatistics MergeStats(const std::vector<MergedReflection> &merged,
|
||||
const std::vector<IntegrationOutcome> &reflections,
|
||||
const std::vector<MergedReflection> &reference = {});
|
||||
|
||||
@@ -73,6 +73,7 @@ void print_usage() {
|
||||
std::cout << " --scaling-high-resolution <num> High resolution limit for spot finding (default: no limit)" << std::endl;
|
||||
std::cout << " --min-partiality <num> Minimum partiality to accept reflection (default: 0.02)" << std::endl;
|
||||
std::cout << " --reject-outliers <num> Per-observation merge outlier rejection, N sigma from the per-reflection median (default: off; e.g. 6, XDS/DIALS-style)" << std::endl;
|
||||
std::cout << " --reject-delta-cchalf <num> Per-crystal CC1/2-delta rejection: drop images with deltaCChalf below mean - N*stddev (default: off; e.g. 2.5)" << std::endl;
|
||||
std::cout << " --min-image-cc <num> Per-image CC limit in percent (default: no limit)" << std::endl;
|
||||
std::cout << " --scaling-iterations <num> Number of scaling iterations with no reference data (default: 3)" << std::endl;
|
||||
std::cout << " --scaling-output <txt> Output format for scaling results mtz|cif|txt (default: mtz)" << std::endl;
|
||||
@@ -101,7 +102,8 @@ enum {
|
||||
OPT_BANDWIDTH,
|
||||
OPT_INTEGRATION_RADIUS,
|
||||
OPT_REJECT_OUTLIERS,
|
||||
OPT_PROFILE_MULTIPLIER
|
||||
OPT_PROFILE_MULTIPLIER,
|
||||
OPT_REJECT_DELTA_CCHALF
|
||||
};
|
||||
|
||||
static option long_options[] = {
|
||||
@@ -140,6 +142,7 @@ static option long_options[] = {
|
||||
{"bandwidth", required_argument, nullptr, OPT_BANDWIDTH},
|
||||
{"integration-radius", required_argument, nullptr, OPT_INTEGRATION_RADIUS},
|
||||
{"reject-outliers", required_argument, nullptr, OPT_REJECT_OUTLIERS},
|
||||
{"reject-delta-cchalf", required_argument, nullptr, OPT_REJECT_DELTA_CCHALF},
|
||||
{"profile-multiplier", required_argument, nullptr, OPT_PROFILE_MULTIPLIER},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
@@ -323,6 +326,7 @@ int main(int argc, char **argv) {
|
||||
std::optional<float> d_min_scale_merge;
|
||||
std::optional<std::string> integration_radius_arg; // "r1" or "r1,r2,r3"
|
||||
std::optional<double> outlier_reject_nsigma; // merge per-observation outlier rejection
|
||||
std::optional<double> delta_cchalf_nsigma; // per-crystal CC1/2-delta rejection
|
||||
std::optional<float> profile_multiplier; // PixelRefine Term-2 profile-width multiplier
|
||||
|
||||
if (argc == 1) {
|
||||
@@ -519,6 +523,9 @@ int main(int argc, char **argv) {
|
||||
case OPT_REJECT_OUTLIERS:
|
||||
outlier_reject_nsigma = std::stod(optarg);
|
||||
break;
|
||||
case OPT_REJECT_DELTA_CCHALF:
|
||||
delta_cchalf_nsigma = std::stod(optarg);
|
||||
break;
|
||||
case OPT_PROFILE_MULTIPLIER:
|
||||
profile_multiplier = std::stof(optarg);
|
||||
break;
|
||||
@@ -1057,8 +1064,19 @@ int main(int argc, char **argv) {
|
||||
const double isa = (b > 0.0) ? 1.0 / b : std::numeric_limits<double>::infinity();
|
||||
logger.Info("Error model: sigma'^2 = {:.3f} sigma^2 + ({:.4f} I)^2 ISa = {:.1f}", a, b, isa);
|
||||
}
|
||||
for (auto &i : indexer.GetIntegrationOutcome())
|
||||
merge_engine.AddImage(i);
|
||||
const auto &merge_outcomes = indexer.GetIntegrationOutcome();
|
||||
std::vector<char> dcch_reject;
|
||||
if (delta_cchalf_nsigma) {
|
||||
dcch_reject = merge_engine.DeltaCChalfReject(merge_outcomes, *delta_cchalf_nsigma);
|
||||
const size_t nrej = std::count(dcch_reject.begin(), dcch_reject.end(), static_cast<char>(1));
|
||||
logger.Info("CC1/2-delta rejection (deltaCChalf < mean - {:.1f} stddev) removed {} / {} images",
|
||||
*delta_cchalf_nsigma, nrej, dcch_reject.size());
|
||||
}
|
||||
for (size_t i = 0; i < merge_outcomes.size(); ++i) {
|
||||
if (!dcch_reject.empty() && dcch_reject[i])
|
||||
continue;
|
||||
merge_engine.AddImage(merge_outcomes[i]);
|
||||
}
|
||||
if (merge_engine.RejectedCount() > 0)
|
||||
logger.Info("Outlier rejection (>{:.1f} sigma from the per-reflection median) removed {} observations",
|
||||
experiment.GetScalingSettings().GetOutlierRejectNsigma(), merge_engine.RejectedCount());
|
||||
|
||||
Reference in New Issue
Block a user