Jungfraujoch/image_analysis/LoadFCalcFromMtz.cpp

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "LoadFCalcFromMtz.h"

#include <cmath>
#include <cstdint>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <gemmi/mtz.hpp>
#include <gemmi/symmetry.hpp>

namespace {

// Reference intensities can come from a calculated structure factor F-model (squared to an
// intensity), or from a merged/observed mean-intensity column (used directly - this is what lets
// reference-based scaling be self-seeded from the data's own previous merge). column_with_one_of_labels would
// hide which label matched, so the priority list is walked explicitly to record the choice.
const gemmi::Mtz::Column* SelectDefaultColumn(const gemmi::Mtz& mtz, bool& square) {
    if (const auto* col = mtz.column_with_label("F-model", nullptr, 'F')) {
        square = true;
        return col;
    }
    for (const char* label : {"IMEAN", "I", "IOBS", "Iobs", "I-obs"}) {
        if (const auto* col = mtz.column_with_label(label, nullptr, 'J')) {
            square = false;
            return col;
        }
    }
    for (const auto& c : mtz.columns)
        if (c.type == 'J') {
            square = false;
            return &c;
        }
    return nullptr;
}

} // namespace

ReferenceMtzData LoadReferenceMtz(const std::string& path,
                                  const std::optional<std::string>& column) {
    gemmi::Mtz mtz;
    mtz.read_file_gz(path, true);

    ReferenceMtzData out;

    // Columns the caller may pick as reference intensities/structure factors.
    for (const auto& c : mtz.columns)
        if (c.type == 'J' || c.type == 'F')
            out.candidate_columns.push_back({c.label, c.type});

    const gemmi::Mtz::Column* col = nullptr;
    if (column) {
        col = mtz.column_with_label(*column, nullptr);
        if (col == nullptr)
            throw std::runtime_error("Reference MTZ has no column '" + *column + "'");
        out.squared = (col->type == 'F');
        out.default_column = false;
    } else {
        col = SelectDefaultColumn(mtz, out.squared);
        if (col == nullptr)
            throw std::runtime_error("MTZ has no F-model or intensity (J) column to use as reference");
        out.default_column = true;
    }
    out.used_column = col->label;
    out.used_column_type = col->type;

    // Cell and space group the reference was recorded in, for display and the consistency check.
    const gemmi::UnitCell& gc = mtz.get_cell();
    const bool cell_valid = gc.a > 0 && gc.b > 0 && gc.c > 0;
    if (cell_valid)
        out.cell = UnitCell{static_cast<float>(gc.a), static_cast<float>(gc.b), static_cast<float>(gc.c),
                            static_cast<float>(gc.alpha), static_cast<float>(gc.beta),
                            static_cast<float>(gc.gamma)};
    if (mtz.spacegroup != nullptr) {
        out.space_group_number = mtz.spacegroup->number;
        out.space_group_name = mtz.spacegroup->short_name();
        out.point_group = mtz.spacegroup->point_group_hm();
    }

    out.reflections.reserve(static_cast<std::size_t>(mtz.nreflections));

    const std::size_t stride = mtz.columns.size();
    double d_min = std::numeric_limits<double>::max();
    double d_max = 0.0;

    for (int i = 0; i < mtz.nreflections; ++i) {
        const float v = (*col)[static_cast<std::size_t>(i)];
        if (std::isnan(v))
            continue;

        const std::size_t row = static_cast<std::size_t>(i) * stride;
        MergedReflection r;
        r.h = static_cast<int32_t>(mtz.data[row + 0]);
        r.k = static_cast<int32_t>(mtz.data[row + 1]);
        r.l = static_cast<int32_t>(mtz.data[row + 2]);
        r.I = out.squared ? v * v : v;
        r.sigma = NAN;

        if (cell_valid) {
            r.d = static_cast<float>(gc.calculate_d({{r.h, r.k, r.l}}));
            if (std::isfinite(r.d) && r.d > 0.0f) {
                d_min = std::min<double>(d_min, r.d);
                d_max = std::max<double>(d_max, r.d);
            }
        }

        out.reflections.emplace_back(r);
    }

    if (d_max > 0.0) {
        out.d_min = d_min;
        out.d_max = d_max;
    }

    return out;
}

std::string ReferenceConsistencyWarning(const ReferenceMtzData& reference,
                                        const std::optional<UnitCell>& data_cell,
                                        std::optional<int> data_space_group_number) {
    std::vector<std::string> issues;

    if (reference.cell && data_cell) {
        const auto& r = *reference.cell;
        const auto& d = *data_cell;
        auto rel = [](float x, float y) { return y != 0.0f ? std::abs(x - y) / std::abs(y) : 0.0f; };
        const bool len_off = rel(r.a, d.a) > 0.02f || rel(r.b, d.b) > 0.02f || rel(r.c, d.c) > 0.02f;
        const bool ang_off = std::abs(r.alpha - d.alpha) > 1.5f || std::abs(r.beta - d.beta) > 1.5f
                          || std::abs(r.gamma - d.gamma) > 1.5f;
        if (len_off || ang_off) {
            std::ostringstream ss;
            ss.setf(std::ios::fixed);
            ss.precision(2);
            ss << "unit cell differs (reference " << r.a << " " << r.b << " " << r.c << " "
               << r.alpha << " " << r.beta << " " << r.gamma << ", data " << d.a << " " << d.b
               << " " << d.c << " " << d.alpha << " " << d.beta << " " << d.gamma << ")";
            issues.push_back(ss.str());
        }
    }

    if (!reference.point_group.empty() && data_space_group_number) {
        const auto* sg = gemmi::find_spacegroup_by_number(*data_space_group_number);
        if (sg != nullptr && reference.point_group != sg->point_group_hm())
            issues.push_back("point group differs (reference " + reference.point_group + ", data "
                             + std::string(sg->point_group_hm()) + ")");
    }

    if (issues.empty())
        return {};

    std::string out = "reference may not match the data: ";
    for (std::size_t i = 0; i < issues.size(); ++i)
        out += (i ? "; " : "") + issues[i];
    return out;
}