Files
Jungfraujoch/gemmi_gph/mtz.cpp
T

992 lines
36 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright 2019-2023 Global Phasing Ltd.
#include <gemmi/mtz.hpp>
#include <cstring> // for memcpy
#include <algorithm> // for stable_sort
#include <gemmi/atof.hpp> // for fast_atof
#include <gemmi/atox.hpp> // for simple_atoi, read_word
#include <gemmi/gz.hpp>
#include <gemmi/sprintf.hpp>
namespace gemmi {
namespace {
// Reduces an angle given in degrees modulo 360.
// Values already in [0, 360) are returned unchanged.
double wrap_degrees(double phi) {
  const bool already_wrapped = (phi >= 0 && phi < 360.);
  return already_wrapped ? phi : phi - std::floor(phi / 360.) * 360.;
}
// Adds `shift` (converted with deg()) to the phase `phi`, optionally negates
// the sum, and stores the result after passing it through wrap_degrees().
void shift_phase(float& phi, double shift, bool negate=false) {
  double shifted = phi + deg(shift);
  if (negate)
    shifted = -shifted;
  phi = float(wrap_degrees(shifted));
}
// apply phase shift to HendricksonLattman coefficients HLA, HLB, HLC and HLD
void shift_hl_coefficients(float& a, float& b, float& c, float& d,
double shift, bool negate=false) {
double sinx = std::sin(shift);
double cosx = std::cos(shift);
double sin2x = 2 * sinx * cosx;
double cos2x = sq(cosx)- sq(sinx);
// a sin(x+y) + b cos(x+y) = a sin(x) cos(y) - b sin(x) sin(y)
// + a cos(x) sin(y) + b cos(x) cos(y)
float a_ = float(a * cosx - b * sinx);
float b_ = float(a * sinx + b * cosx);
float c_ = float(c * cos2x - d * sin2x);
float d_ = float(c * sin2x + d * cos2x);
a = a_; // cos(phi)
b = negate ? -b_ : b_; // sin(phi)
c = c_; // cos(2 phi)
d = negate ? -d_ : d_; // sin(2 phi)
}
// Returns {min, max} of the values in [begin, end), skipping NaNs.
// Both elements are NaN if the range is empty or contains only NaNs.
// (The function is generic because it was used in other places in the past.)
template <typename T, typename FP=typename std::iterator_traits<T>::value_type>
std::array<FP,2> calculate_min_max_disregarding_nans(T begin, T end) {
  std::array<FP,2> result = {{NAN, NAN}};
  // find the first value that is not NaN
  T it = begin;
  for (; it != end; ++it)
    if (!std::isnan(*it))
      break;
  if (it == end)
    return result;
  result[0] = result[1] = *it;
  // comparisons with NaN are false, so later NaNs change nothing
  for (++it; it != end; ++it) {
    if (*it < result[0])
      result[0] = *it;
    else if (*it > result[1])
      result[1] = *it;
  }
  return result;
}
// Skips the first word of `line` (a run of non-whitespace characters) and the
// whitespace that follows it. Returns a pointer to the next word, or to the
// terminating NUL if there is none.
const char* skip_word_and_space(const char* line) {
  // std::isspace() is undefined for negative values other than EOF,
  // so bytes >= 0x80 must go through unsigned char first.
  auto is_ws = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  while (*line != '\0' && !is_ws(*line))
    ++line;
  while (is_ws(*line))
    ++line;
  return line;
}
// Reads six consecutive numbers from `line` with fast_atof() and passes them,
// in order (a, b, c, alpha, beta, gamma), to the UnitCell constructor.
UnitCell read_cell_parameters(const char* line) {
  double par[6];
  // each call continues where the previous one stopped
  for (double& value : par)
    value = fast_atof(line, &line);
  return UnitCell(par[0], par[1], par[2], par[3], par[4], par[5]);
}
} // anonymous namespace
// Averages the six cell parameters stored as the first six floats of each
// batch header.
// If `rmsd` is not null it must point to 6 doubles; they are zeroed first and,
// when the average is computed, filled with the root-mean-square deviations
// of the batch values from the average.
// Returns the global `cell` if any batch header lacks valid (positive) cell
// parameters or if the average is approximately equal to the global cell;
// returns default-constructed parameters if there are no batches.
UnitCellParameters Mtz::get_average_cell_from_batch_headers(double* rmsd) const {
  if (rmsd)
    for (int i = 0; i < 6; ++i)
      rmsd[i] = 0.;
  std::array<double, 6> avg = {0., 0., 0., 0., 0., 0.};
  for (const Batch& batch : batches) {
    // a truncated batch header cannot hold cell parameters
    if (batch.floats.size() < 6)
      return cell;
    for (int i = 0; i < 6; ++i) {
      // if batch headers are not set correctly, return global cell
      // (written as !(x > 0) so that NaN is rejected as well)
      if (!(batch.floats[i] > 0))
        return cell;
      avg[i] += batch.floats[i];
    }
  }
  // all sums are still zero only if there were no batches
  if (avg[0] <= 0 || avg[1] <= 0 || avg[2] <= 0 ||
      avg[3] <= 0 || avg[4] <= 0 || avg[5] <= 0)
    return UnitCellParameters();
  size_t n = batches.size();
  for (int i = 0; i < 6; ++i)
    avg[i] /= n;
  if (rmsd) {
    for (const Batch& batch : batches)
      for (int i = 0; i < 6; ++i)
        rmsd[i] += sq(avg[i] - batch.floats[i]);
    for (int i = 0; i < 6; ++i)
      rmsd[i] = std::sqrt(rmsd[i] / n);
  }
  // If average parameters are almost equal to the global cell, use the latter
  // to avoid 32-bit precision artifacts (58.28 -> 58.279998).
  if (UnitCellParameters(avg).approx(cell, 1e-4))
    return cell;
  return UnitCellParameters(avg);
}
// Returns {min, max} of 1/d^2 over all reflections, where the first three
// values of each data row are passed to UnitCell::calculate_1_d2_double().
// The global cell and every dataset cell that differs from it (and from the
// previously used dataset cell) contribute to the range.
// Fails with "No data." if data is absent or there are fewer than 3 columns.
std::array<double,2> Mtz::calculate_min_max_1_d2() const {
  if (!has_data() || columns.size() < 3)
    fail("No data.");
  double lowest = INFINITY;
  double highest = 0.;
  // widens [lowest, highest] using the given unit cell
  auto scan_with_cell = [&](const UnitCell& uc) {
    const size_t stride = columns.size();
    for (size_t row = 0; row < data.size(); row += stride) {
      const double inv_d2 = uc.calculate_1_d2_double(data[row+0], data[row+1], data[row+2]);
      if (inv_d2 < lowest)
        lowest = inv_d2;
      if (inv_d2 > highest)
        highest = inv_d2;
    }
  };
  if (cell.is_crystal() && cell.a > 0)
    scan_with_cell(cell);
  const UnitCell* last_used = nullptr;
  for (const Dataset& ds : datasets) {
    const bool usable = ds.cell.is_crystal() && ds.cell.a > 0;
    if (usable && ds.cell != cell && (!last_used || ds.cell != *last_used)) {
      scan_with_cell(ds.cell);
      last_used = &ds.cell;
    }
  }
  // nothing was scanned (or no finite value found)
  if (lowest == INFINITY)
    lowest = 0;
  return {{lowest, highest}};
}
// Reads the first 20 bytes of the file: checks the "MTZ " signature,
// determines the byte order and stores the header offset in `header_offset`.
// Then asks the stream to skip 60 more bytes.
void Mtz::read_first_bytes(AnyStream& stream) {
  char buf[20] = {0};
  if (!stream.read(buf, 20))
    fail("Could not read the MTZ file (is it empty?)");
  if (std::memcmp(buf, "MTZ ", 4) != 0)
    fail("Not an MTZ file - it does not start with 'MTZ '");
  // Bytes 9-12 have so-called machine stamp:
  // "The first 4 half-bytes represent the real, complex, integer and
  // character formats".
  // We don't try to handle all the combinations here, only the two most
  // common: big endian (for all types) and little endian (for all types).
  // BE is denoted by 1 and LE by 4.
  // If we get a value different than 1 and 4 we assume the native byte order.
  const int foreign_stamp = is_little_endian() ? 0x10 : 0x40;
  if ((buf[9] & 0xf0) == foreign_stamp)
    toggle_endianness();
  std::int32_t offset32;
  std::memcpy(&offset32, buf + 4, 4);
  if (!same_byte_order)
    swap_four_bytes(&offset32);
  if (offset32 != -1) {
    header_offset = (int64_t) offset32;
  } else {
    // -1 in the 32-bit field: the offset is stored as 64-bit at byte 12
    std::memcpy(&header_offset, buf + 12, 8);
    if (!same_byte_order)
      swap_eight_bytes(&header_offset);
  }
  stream.skip(60);
}
// Reads the 80-character main header records up to (and including) END and
// fills the corresponding members (title, cell, columns, datasets, ...).
// If `save_headers` is not null, every record read is appended to it.
// Fails if the stream position does not match `header_offset` or if the
// NCOL/COLUMN/BATCH records are inconsistent with each other.
// If `data` is already populated, it is trimmed to ncol * nreflections values.
void Mtz::read_main_headers(AnyStream& stream, std::vector<std::string>* save_headers) {
  // Counts read from the file are used as reserve() hints only when they are
  // plausible; reserve() is just an optimization, so skipping it is harmless,
  // while a negative or absurd value would throw or exhaust memory.
  const int max_reserve_hint = 10000;
  char line[81] = {0};
  std::ptrdiff_t header_pos = 4 * std::ptrdiff_t(header_offset - 1);
  // temporary check
  long cur_pos = stream.tell();
  if (cur_pos != header_pos && cur_pos != -1)
    fail(cat("wrong pos ", int(header_pos), " ", int(cur_pos)));
  int ncol = 0;
  bool has_batch = false;
  while (stream.read(line, 80)) {
    if (save_headers)
      save_headers->emplace_back(line, line+80);
    if (ialpha3_id(line) == ialpha3_id("END"))
      break;
    const char* args = skip_word_and_space(line);
    switch (ialpha4_id(line)) {
      case ialpha4_id("VERS"):
        version_stamp = rtrim_str(args);
        break;
      case ialpha4_id("TITL"):
        title = rtrim_str(args);
        break;
      case ialpha4_id("NCOL"): {
        ncol = simple_atoi(args, &args);
        nreflections = simple_atoi(args, &args);
        int nbatches = simple_atoi(args);
        // sanity check; a negative number of reflections would later be
        // converted to a huge unsigned size
        if (nreflections < 0 || nbatches < 0 || nbatches > 10000000)
          fail("Wrong NCOL header");
        batches.resize(nbatches);
        break;
      }
      case ialpha4_id("CELL"):
        cell = read_cell_parameters(args);
        break;
      case ialpha4_id("SORT"):
        for (int& n : sort_order)
          n = simple_atoi(args, &args);
        break;
      case ialpha4_id("SYMI"): {
        nsymop = simple_atoi(args, &args);
        if (nsymop > 0 && nsymop <= max_reserve_hint)
          symops.reserve(nsymop);
        simple_atoi(args, &args); // ignore number of primitive operations
        args = skip_word_and_space(skip_blank(args)); // ignore lattice type
        spacegroup_number = simple_atoi(args, &args);
        args = skip_blank(args);
        // the name is either a single word or a quoted string
        if (*args != '\'')
          spacegroup_name = read_word(args);
        else if (const char* end = std::strchr(++args, '\''))
          spacegroup_name.assign(args, end);
        // ignore point group which is at the end of args
        break;
      }
      case ialpha4_id("SYMM"):
        symops.push_back(parse_triplet(args));
        break;
      case ialpha4_id("RESO"):
        min_1_d2 = fast_atof(args, &args);
        max_1_d2 = fast_atof(args, &args);
        break;
      case ialpha4_id("VALM"):
        // a value starting with 'N' (e.g. NAN) leaves valm unchanged
        if (*args != 'N') {
          const char* endptr;
          float v = (float) fast_atof(args, &endptr);
          if (*endptr == '\0' || is_space(*endptr))
            valm = v;
          else
            logger.note("Unexpected VALM value: " + rtrim_str(args));
        }
        break;
      case ialpha4_id("COLU"): {
        columns.emplace_back();
        Column& col = columns.back();
        col.label = read_word(args, &args);
        col.type = read_word(args, &args)[0];
        col.min_value = (float) fast_atof(args, &args);
        col.max_value = (float) fast_atof(args, &args);
        col.dataset_id = simple_atoi(args);
        col.parent = this;
        col.idx = columns.size() - 1;
        break;
      }
      case ialpha4_id("COLS"):
        // COLSRC is undocumented. CMTZ (libccp4) adds it after COLUMN:
        // COLUMN IMEAN J -300.600006 4619 1
        // COLSRC IMEAN CREATED_07/08/2019_11:00:23 1
        if (!columns.empty() && columns.back().label == read_word(args, &args))
          columns.back().source = read_word(args);
        else
          logger.note("MTZ: COLSRC is not after matching COLUMN");
        break;
      case ialpha4_id("COLG"):
        // Column group - not used.
        break;
      case ialpha4_id("NDIF"): {
        int ndif = simple_atoi(args);
        if (ndif > 0 && ndif <= max_reserve_hint)
          datasets.reserve(ndif);
        break;
      }
      case ialpha4_id("PROJ"):
        datasets.emplace_back();
        datasets.back().id = simple_atoi(args, &args);
        datasets.back().project_name = read_word(skip_word_and_space(args));
        datasets.back().wavelength = 0.0;
        break;
      case ialpha4_id("CRYS"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().crystal_name = read_word(args);
        else
          logger.note("MTZ CRYSTAL line: unusual numbering.");
        break;
      case ialpha4_id("DATA"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().dataset_name = read_word(args);
        else
          logger.note("MTZ DATASET line: unusual numbering.");
        break;
      case ialpha4_id("DCEL"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().cell = read_cell_parameters(args);
        else
          logger.note("MTZ DCELL line: unusual numbering.");
        break;
      // case("DRES"): not in use yet
      case ialpha4_id("DWAV"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().wavelength = fast_atof(args);
        else
          logger.note("MTZ DWAV line: unusual numbering.");
        break;
      case ialpha4_id("BATCH"):
        // We take number of batches from the NCOL record and serial numbers
        // from BH. This header could be used only to check consistency.
        has_batch = true;
        break;
      default:
        logger.note("Unknown header: " + rtrim_str(line));
    }
  }
  if (ncol != (int) columns.size())
    fail("Number of COLU records inconsistent with NCOL record.");
  if (has_batch != !batches.empty())
    fail("BATCH header inconsistent with NCOL record.");
  // adjust data size, if necessary
  if (!data.empty()) {
    size_t expected_size = columns.size() * nreflections;
    if (data.size() > expected_size)
      data.resize(expected_size);
    else if (data.size() < expected_size)
      fail("internal error, wrong data size");
  }
}
// Reads the records that follow the main headers, until MTZE(NDOFHEADERS):
// - MTZHIST n, followed by n history lines (stored trimmed in `history`),
// - MTZBATS, followed by one batch header per element of `batches`
//   (BH record, TITLE record, binary ints and floats, BHCH record).
// Whatever remains in the stream afterwards is stored in `appended_text`.
// Fails on missing or inconsistent BH/BHCH records.
void Mtz::read_history_and_batch_headers(AnyStream& stream) {
  char buf[81] = {0};
  int n_headers = 0;
  while (stream.read(buf, 80) && ialpha4_id(buf) != ialpha4_id("MTZE")) {
    if (n_headers != 0) {
      const char* start = skip_blank(buf);
      // trim within the 80 characters that were read; using start+80 would
      // look past the end of buf when the line has leading blanks
      const char* end = rtrim_cstr(start, buf+80);
      history.emplace_back(start, end);
      --n_headers;
    } else if (ialpha4_id(buf) == ialpha4_id("MTZH")) {
      n_headers = simple_atoi(skip_word_and_space(buf+4));
      if (n_headers < 0 || n_headers > 30) {
        logger.note("Wrong MTZ: number of headers should be between 0 and 30");
        return;
      }
      history.reserve(n_headers);
    } else if (ialpha4_id(buf) == ialpha4_id("MTZB")) {
      for (Batch& batch : batches) {
        stream.read(buf, 80);
        if (ialpha3_id(buf) != ialpha3_id("BH "))
          fail("Missing BH header");
        const char* args = skip_blank(buf + 2);
        batch.number = simple_atoi(args, &args);
        int total_words = simple_atoi(args, &args);
        int int_words = simple_atoi(args, &args);
        int float_words = simple_atoi(args);
        // negative counts would be converted to huge sizes in resize()
        if (int_words < 0 || float_words < 0 ||
            total_words != int_words + float_words || total_words > 1000)
          fail("Wrong BH header");
        stream.read(buf, 80); // TITLE
        // The title text starts after the 6-character "TITLE " keyword;
        // the keyword itself is not part of the title (the writer adds it).
        const char* title_start = buf + 6;
        const char* end = rtrim_cstr(title_start, buf+76);
        batch.title.assign(title_start, end);
        batch.ints.resize(int_words);
        stream.read(batch.ints.data(), int_words * 4);
        batch.floats.resize(float_words);
        stream.read(batch.floats.data(), float_words * 4);
        stream.read(buf, 80);
        if (ialpha4_id(buf) != ialpha4_id("BHCH"))
          fail("Missing BHCH header");
        split_str_into_multi(buf + 5, " \t", batch.axes);
      }
    }
  }
  appended_text = stream.read_rest();
}
void Mtz::setup_spacegroup() {
spacegroup = find_spacegroup_by_name(spacegroup_name, cell.alpha, cell.gamma);
if (!spacegroup) {
logger.note("MTZ: unrecognized spacegroup name: " + spacegroup_name);
return;
}
if (spacegroup->ccp4 != spacegroup_number)
logger.note("MTZ: inconsistent spacegroup name and number");
cell.set_cell_images_from_spacegroup(spacegroup);
for (Dataset& d : datasets)
d.cell.set_cell_images_from_spacegroup(spacegroup);
}
// Reads (or, if `do_read` is false, skips) the reflection data: the 4-byte
// values located between the 20-word file preamble and the headers.
// We should be at byte 80 when this function is called.
// Values are byte-swapped if the file has non-native byte order.
void Mtz::read_raw_data(AnyStream& stream, bool do_read) {
  // header_offset is a 1-based word index; the data starts at word 21.
  // A smaller value would wrap around to a huge unsigned size below.
  if (header_offset < 21)
    fail("Wrong MTZ header offset");
  size_t n = size_t(header_offset - 1 - 20);
  if (!do_read) {
    if (!stream.skip(4 * n))
      fail("ignoring mtz data segment failed");
    return;
  }
  data.resize(n);
  if (!stream.read(data.data(), 4 * n))
    fail("Error when reading MTZ data");
  if (!same_byte_order)
    for (float& f : data)
      swap_four_bytes(&f);
}
// Reads a whole MTZ file from `stream` in a single forward pass:
// preamble, raw data (read or skipped, depending on `with_data`),
// main headers, history and batch headers. Then sets up the space group.
void Mtz::read_stream(AnyStream& stream, bool with_data) {
read_first_bytes(stream);
// The older implementation of MTZ reading first read the headers,
// then the data. This required jumping to the headers at the end,
// then back to the beginning of the data (byte 80).
// The current implementation avoids calling seek(), allowing
// incremental reading of streams (stdin, gzipped files, etc).
read_raw_data(stream, with_data);
read_main_headers(stream, nullptr);
read_history_and_batch_headers(stream);
setup_spacegroup();
// a file without dataset records gets one default dataset with id 0,
// the global cell and wavelength 0
if (datasets.empty())
datasets.push_back({0, "HKL_base", "HKL_base", "HKL_base", cell, 0.});
}
// Returns the offset in `data` of the first row, at or after the row that
// contains offset `start`, whose Miller indices equal `hkl`;
// (size_t)-1 if there is no such row.
// It is for probing/testing individual reflections, no need to optimize it.
size_t Mtz::find_offset_of_hkl(const Miller& hkl, size_t start) const {
  if (!has_data() || columns.size() < 3)
    fail("No data.");
  const size_t width = columns.size();
  // round `start` down to the beginning of its row
  size_t offset = (start / width) * width;
  while (offset + 2 < data.size()) {
    if (get_hkl(offset) == hkl)
      return offset;
    offset += width;
  }
  return (size_t)-1;
}
// Moves reflections of a merged MTZ that are outside of the reciprocal-space
// ASU into it (`tnt_asu` is passed to ReciprocalAsu), adjusting dependent
// columns: phases (type P) and Hendrickson-Lattman coefficients (type A) are
// shifted; for Friedel mates of non-centric reflections the (+)/(-) column
// pairs are swapped and type D columns change sign.
// Does nothing if the space group is not set; fails for unmerged data.
void Mtz::ensure_asu(bool tnt_asu) {
  if (!is_merged())
    fail("Mtz::ensure_asu() is for merged MTZ only");
  if (!spacegroup)
    return;
  GroupOps gops = spacegroup->operations();
  ReciprocalAsu asu(spacegroup, tnt_asu);
  const std::vector<int> phase_cols = positions_of_columns_with_type('P');
  const std::vector<int> hl_cols = positions_of_columns_with_type('A');
  const std::vector<int> dano_cols = positions_of_columns_with_type('D');
  const std::vector<std::pair<int,int>> pm_cols = positions_of_plus_minus_columns();
  const bool has_phase_like = !phase_cols.empty() || !hl_cols.empty();
  const bool only_plain_columns = !has_phase_like && pm_cols.empty() && dano_cols.empty();
  const bool centric = only_plain_columns || gops.is_centrosymmetric();
  const size_t width = columns.size();
  for (size_t row = 0; row < data.size(); row += width) {
    Miller hkl = get_hkl(row);
    if (asu.is_in(hkl))
      continue;
    auto moved = asu.to_asu(hkl, gops);
    // cf. impl::move_to_asu() in asudata.hpp
    set_hkl(row, moved.first);
    if (only_plain_columns)
      continue;
    const int isym = moved.second;
    // even isym is treated as the Friedel mate
    const bool friedel_mate = (isym % 2 == 0);
    if (has_phase_like) {
      const Op& op = gops.sym_ops[(isym - 1) / 2];
      const double shift = op.phase_shift(hkl);
      for (int col : phase_cols)
        shift_phase(data[row + col], shift, friedel_mate);
      // we expect coefficients HLA, HLB, HLC and HLD - in this order;
      // an incomplete trailing group is ignored
      for (size_t k = 0; k + 3 < hl_cols.size(); k += 4)
        shift_hl_coefficients(data[row + hl_cols[k]], data[row + hl_cols[k+1]],
                              data[row + hl_cols[k+2]], data[row + hl_cols[k+3]],
                              shift, friedel_mate);
    }
    // usually, centric reflections have empty F(-), so avoid swapping it
    if (friedel_mate && !centric && !gops.is_reflection_centric(hkl)) {
      for (const std::pair<int,int>& cols : pm_cols)
        std::swap(data[row + cols.first], data[row + cols.second]);
      for (int col : dano_cols)
        data[row + col] = -data[row + col];
    }
  }
}
// Reindexes the reflections with the operator `op`, which must have no
// translation and must preserve the hand of the axes (otherwise it fails).
// Reflections that would get fractional indices are removed.
// The space group (if set) and the unit cells of the file, of datasets and
// of batches are changed accordingly.
void Mtz::reindex(const Op& op) {
  if (op.tran != Op::Tran{0, 0, 0})
    gemmi::fail("reindexing operator must not have a translation");
  if (op.det_rot() < 0)
    gemmi::fail("reindexing operator must preserve the hand of the axes");
  switch_to_original_hkl();  // changes hkl for unmerged data only
  Op xyz_op = op.as_xyz();
  logger.mesg("Real space transformation: ", xyz_op.triplet());
  bool row_removal = false;
  // change Miller indices
  for (size_t n = 0; n < data.size(); n += columns.size()) {
    Miller hkl_den = op.apply_to_hkl_without_division(get_hkl(n));
    Miller hkl = Op::divide_hkl_by_DEN(hkl_den);
    // the division was exact only if multiplying back restores the value
    if (hkl[0] * Op::DEN == hkl_den[0] &&
        hkl[1] * Op::DEN == hkl_den[1] &&
        hkl[2] * Op::DEN == hkl_den[2]) {
      set_hkl(n, hkl);
    } else {  // fractional hkl - remove
      row_removal = true;
      data[n] = NAN;  // mark for removal
    }
  }
  // remove reflections marked for removal
  if (row_removal) {
    int n_before = nreflections;
    remove_rows_if([](const float* h) { return std::isnan(*h); });
    logger.mesg("Reflections removed (because of fractional indices): ", n_before - nreflections);
  }
  switch_to_asu_hkl();  // revert switch_to_original_hkl() for unmerged data
  // change space group
  if (spacegroup) {
    GroupOps gops = spacegroup->operations();
    gops.change_basis_backward(xyz_op);
    const SpaceGroup* new_sg = find_spacegroup_by_ops(gops);
    if (!new_sg)
      fail("reindexing: failed to determine new space group name");
    if (new_sg != spacegroup) {
      logger.mesg("Space group changed from ", spacegroup->xhm(), " to ", new_sg->xhm(), '.');
      set_spacegroup(new_sg);
    } else {
      logger.mesg("Space group stays the same: ", spacegroup->xhm(), '.');
    }
  }
  // change unit cell parameters
  cell = cell.changed_basis_backward(xyz_op, false);
  for (Mtz::Dataset& ds : datasets)
    ds.cell = ds.cell.changed_basis_backward(xyz_op, false);
  for (Mtz::Batch& batch : batches)
    batch.set_cell(batch.get_cell().changed_basis_backward(xyz_op, false));
}
// Expands the reflection list to space group P1: for every reflection,
// symmetry-equivalent copies are appended (skipping indices equal to the
// original, to an already added copy, or to the Friedel mate of either).
// Phases (type P) and Hendrickson-Lattman coefficients (type A) of the
// copies are shifted. Resets `sort_order` and sets the space group to P1.
// Does nothing if the space group is not set or there is no data.
void Mtz::expand_to_p1() {
  if (!spacegroup || !has_data())
    return;
  std::vector<int> phase_columns = positions_of_columns_with_type('P');
  std::vector<int> abcd_columns = positions_of_columns_with_type('A');
  bool has_phases = (!phase_columns.empty() || !abcd_columns.empty());
  GroupOps gops = spacegroup->operations();
  const size_t ncol = columns.size();
  // upper bound of the final size, to avoid repeated reallocations
  data.reserve(gops.sym_ops.size() * data.size());
  size_t orig_size = data.size();
  std::vector<Miller> hkl_copies;
  for (size_t n = 0; n < orig_size; n += ncol) {
    hkl_copies.clear();
    Miller hkl = get_hkl(n);
    // the first operation is skipped - the original row is already there
    for (auto op = gops.sym_ops.begin() + 1; op < gops.sym_ops.end(); ++op) {
      Miller new_hkl = op->apply_to_hkl(hkl);
      Op::Miller negated{{-new_hkl[0], -new_hkl[1], -new_hkl[2]}};
      if (new_hkl != hkl && !in_vector(new_hkl, hkl_copies) &&
          negated != hkl && !in_vector(negated, hkl_copies)) {
        hkl_copies.push_back(new_hkl);
        size_t offset = data.size();
        // Duplicate row n at the end. vector::insert() must not be given
        // iterators into the same vector, so grow first and then copy
        // using iterators obtained after the resize.
        data.resize(offset + ncol);
        std::copy_n(data.begin() + n, ncol, data.begin() + offset);
        set_hkl(offset, new_hkl);
        if (has_phases) {
          double shift = op->phase_shift(hkl);
          if (shift != 0) {
            for (int col : phase_columns)
              shift_phase(data[offset + col], shift);
            for (auto i = abcd_columns.begin(); i+3 < abcd_columns.end(); i += 4)
              // we expect coefficients HLA, HLB, HLC and HLD - in this order
              shift_hl_coefficients(data[offset + *(i+0)], data[offset + *(i+1)],
                                    data[offset + *(i+2)], data[offset + *(i+3)], shift);
          }
        }
      }
    }
  }
  nreflections = int(data.size() / ncol);
  sort_order = {{0, 0, 0, 0, 0}};
  set_spacegroup(&get_spacegroup_p1());
}
bool Mtz::switch_to_original_hkl() {
if (indices_switched_to_original)
return false;
if (!has_data())
fail("switch_to_original_hkl(): data not read yet");
if (nreflections == 0) {
// This function can be called before the data is populated
// to set indices_switched_to_original, which is not exposed in Python.
indices_switched_to_original = true;
return true;
}
const Column* col = column_with_label("M/ISYM");
if (col == nullptr || col->type != 'Y' || col->idx < 3)
return false;
std::vector<Op> inv_symops;
inv_symops.reserve(symops.size());
for (const Op& op : symops)
inv_symops.push_back(op.inverse());
for (size_t n = 0; n + col->idx < data.size(); n += columns.size()) {
int isym = static_cast<int>(data[n + col->idx]) & 0xFF;
const Op& op = inv_symops.at((isym - 1) / 2);
Miller hkl = op.apply_to_hkl(get_hkl(n));
int sign = (isym & 1) ? 1 : -1;
for (int i = 0; i < 3; ++i)
data[n+i] = static_cast<float>(sign * hkl[i]);
}
indices_switched_to_original = true;
return true;
}
bool Mtz::switch_to_asu_hkl() {
if (!indices_switched_to_original)
return false;
if (!has_data())
fail("switch_to_asu_hkl(): data not read yet");
const Column* col = column_with_label("M/ISYM");
if (col == nullptr || col->type != 'Y' || col->idx < 3 || !spacegroup)
return false;
size_t misym_idx = col->idx;
UnmergedHklMover hkl_mover(spacegroup);
for (size_t n = 0; n + col->idx < data.size(); n += columns.size()) {
Miller hkl = get_hkl(n);
int isym = hkl_mover.move_to_asu(hkl); // modifies hkl
set_hkl(n, hkl);
float& misym = data[n + misym_idx];
misym = float(((int)misym & ~0xff) | isym);
}
indices_switched_to_original = false;
return true;
}
// Reads an MTZ file (wrapped in MaybeGzipped) from `path`.
// A std::runtime_error raised while reading is re-reported through fail()
// with ": <path>" appended to the message.
void Mtz::read_file_gz(const std::string& path, bool with_data) {
  try {
    read_input(MaybeGzipped(path), with_data);
  } catch (const std::runtime_error& err) {
    // append path to the error like in read_file(), but shouldn't the path go first?
    const std::string message = err.what();
    fail(message + ": " + path);
  }
}
// Returns row indices (0 .. nreflections-1) ordered by the values in the
// first `use_first` columns, compared column by column. The sort is stable.
// Fails if there is no data or if `use_first` is not in [1, ncolumns).
std::vector<int> Mtz::sorted_row_indices(int use_first) const {
  if (!has_data())
    fail("No data.");
  if (use_first <= 0 || use_first >= (int) columns.size())
    fail("Wrong use_first arg in Mtz::sort.");
  std::vector<int> indices(nreflections);
  for (int i = 0; i != nreflections; ++i)
    indices[i] = i;
  // Row offsets are computed in size_t: row * ncolumns can exceed the range
  // of int for large files.
  const size_t width = columns.size();
  std::stable_sort(indices.begin(), indices.end(), [&](int i, int j) {
    const float* a = &data[i * width];
    const float* b = &data[j * width];
    for (int n = 0; n < use_first; ++n)
      if (a[n] != b[n])
        return a[n] < b[n];
    return false;
  });
  return indices;
}
// Sorts the data rows by the first `use_first` columns (stable sort) and
// records the sort keys in `sort_order` as 1-based column numbers.
// Returns false if the rows were already in order, true if data was reordered.
bool Mtz::sort(int use_first) {
  std::vector<int> indices = sorted_row_indices(use_first);
  // sort_order has a fixed number of slots; only that many keys are
  // recorded even if use_first is larger, remaining slots are set to 0
  int rank = 0;
  for (int& key : sort_order) {
    key = rank < use_first ? rank + 1 : 0;
    ++rank;
  }
  if (std::is_sorted(indices.begin(), indices.end()))
    return false;
  std::vector<float> new_data(data.size());
  size_t w = columns.size();
  for (size_t i = 0; i != indices.size(); ++i)
    std::memcpy(&new_data[i * w], &data[indices[i] * w], w * sizeof(float));
  data.swap(new_data);
  return true;
}
// Inserts a new column with the given label and type at position `pos`
// (negative `pos` means: at the end) and returns a reference to it.
// A negative `dataset_id` selects the id of the last dataset; otherwise the
// id is checked with dataset(). If `expand_data` is set, the data rows are
// expanded by one value at `pos` with expand_data_rows().
// Fails if there are no datasets or if `pos` is after the end.
Mtz::Column& Mtz::add_column(const std::string& label, char type,
                             int dataset_id, int pos, bool expand_data) {
  if (datasets.empty())
    fail("No datasets.");
  if (dataset_id >= 0)
    dataset(dataset_id);  // check if such dataset exist
  else
    dataset_id = datasets.back().id;
  const int ncol = (int) columns.size();
  if (pos > ncol)
    fail("Requested column position after the end.");
  if (pos < 0)
    pos = ncol;
  auto inserted = columns.emplace(columns.begin() + pos);
  inserted->label = label;
  inserted->type = type;
  inserted->dataset_id = dataset_id;
  inserted->parent = this;
  inserted->idx = pos;
  // columns after the new one moved by one position
  for (auto later = inserted + 1; later != columns.end(); ++later)
    later->idx++;
  if (expand_data)
    expand_data_rows(1, pos);
  return *inserted;
}
namespace { // helper functions for copying, replacing and removing columns
// Fails (with a message prefixed by `msg`) if the data of `mtz` has not been
// read or if `idx` is not a valid 0-based column index.
void check_column(const Mtz& mtz, size_t idx, const char* msg) {
  const bool data_loaded = mtz.has_data();
  if (!data_loaded)
    fail(msg, ": data not read yet");
  const size_t ncol = mtz.columns.size();
  if (idx >= ncol)
    fail(msg, ": no column with 0-based index ", std::to_string(idx));
}
// Checks that `src_col` (a column of `mtz`) is followed by at least
// trailing_cols.size() columns and that the label of each of them equals the
// corresponding non-empty entry of `trailing_cols` (an empty entry matches
// any label). Fails otherwise, and also if the data of `mtz` was not read.
void check_trailing_cols(const Mtz& mtz, const Mtz::Column& src_col,
                         const std::vector<std::string>& trailing_cols) {
  assert(src_col.parent == &mtz);
  if (!mtz.has_data())
    fail("data in source mtz not read yet");
  if (src_col.idx + trailing_cols.size() >= mtz.columns.size())
    fail("Not enough columns after " + src_col.label);
  for (size_t i = 0; i < trailing_cols.size(); ++i) {
    const std::string& found = mtz.columns[src_col.idx + i + 1].label;
    // report the label that was actually found, not the label of src_col
    if (!trailing_cols[i].empty() && trailing_cols[i] != found)
      fail("expected trailing column ", trailing_cols[i], ", found ", found);
  }
}
// Overwrites 1 + trailing_cols.size() consecutive columns of `mtz`, starting
// at `dest_idx`, with `src_col` and the columns that follow it in its parent.
// Column metadata (type, label, min/max, source, dataset id) is copied first.
// Values are copied row by row if source and destination are the same Mtz;
// otherwise rows are matched by Miller indices and values of unmatched
// destination rows are left unchanged.
// No checks are done here - the callers validate the arguments.
void do_replace_column(Mtz& mtz, size_t dest_idx, const Mtz::Column& src_col,
                       const std::vector<std::string>& trailing_cols) {
  const Mtz* src_mtz = src_col.parent;
  const size_t src_idx = src_col.idx;
  const size_t ncopied = trailing_cols.size() + 1;
  for (size_t k = 0; k < ncopied; ++k) {
    const Mtz::Column& from = src_mtz->columns[src_idx + k];
    Mtz::Column& to = mtz.columns[dest_idx + k];
    to.type = from.type;
    to.label = from.label;
    to.min_value = from.min_value;
    to.max_value = from.max_value;
    to.source = from.source;
    to.dataset_id = from.dataset_id;
  }
  if (src_mtz == &mtz) {
    // internal copying
    const size_t stride = mtz.columns.size();
    for (size_t row = 0; row < mtz.data.size(); row += stride)
      for (size_t k = 0; k < ncopied; ++k)
        mtz.data[row + dest_idx + k] = mtz.data[row + src_idx + k];
    return;
  }
  // external copying - need to match indices
  const std::vector<int> dst_rows = mtz.sorted_row_indices();
  const std::vector<int> src_rows = src_mtz->sorted_row_indices();
  // cf. for_matching_reflections()
  const size_t dst_stride = mtz.columns.size();
  const size_t src_stride = src_mtz->columns.size();
  size_t di = 0;
  size_t si = 0;
  // walk both sorted lists in parallel, advancing the one with smaller hkl
  while (di != dst_rows.size() && si != src_rows.size()) {
    const size_t dst_offset = dst_rows[di] * dst_stride;
    const size_t src_offset = src_rows[si] * src_stride;
    const Miller dst_hkl = mtz.get_hkl(dst_offset);
    const Miller src_hkl = src_mtz->get_hkl(src_offset);
    if (dst_hkl == src_hkl) {
      // copy values
      for (size_t k = 0; k < ncopied; ++k)
        mtz.data[dst_offset + dest_idx + k] = src_mtz->data[src_offset + src_idx + k];
      ++di;
      ++si;
    } else if (dst_hkl < src_hkl) {
      ++di;
    } else {
      ++si;
    }
  }
}
} // anonymous namespace
// Replaces the column at `dest_idx` (and trailing_cols.size() columns after
// it) with `src_col` and the columns that follow it in its parent Mtz.
// Both the source and the destination columns are validated first.
// Returns a reference to the column at `dest_idx`.
Mtz::Column& Mtz::replace_column(size_t dest_idx, const Mtz::Column& src_col,
                                 const std::vector<std::string>& trailing_cols) {
  const Mtz& src_mtz = *src_col.parent;
  check_trailing_cols(src_mtz, src_col, trailing_cols);
  // the last destination column must exist, too
  const size_t last_dest_idx = dest_idx + trailing_cols.size();
  check_column(*this, last_dest_idx, "replace_column()");
  do_replace_column(*this, dest_idx, src_col, trailing_cols);
  return columns[dest_idx];
}
// Inserts copies of `src_col` and of trailing_cols.size() columns that follow
// it as new columns starting at `dest_idx` (negative means: at the end).
// `src_col` may belong to this Mtz or to another one.
// Returns a reference to the first inserted column.
Mtz::Column& Mtz::copy_column(int dest_idx, const Mtz::Column& src_col,
                              const std::vector<std::string>& trailing_cols) {
  // check input consistency
  if (!has_data())
    fail("copy_column(): data not read yet");
  check_trailing_cols(*src_col.parent, src_col, trailing_cols);
  const int ntrailing = (int) trailing_cols.size();
  if (dest_idx < 0)
    dest_idx = (int) columns.size();
  // if src_col is from this Mtz it may get invalidated when adding columns,
  // so remember the position it will have after the insertion
  int own_src_idx = -1;
  if (src_col.parent == this) {
    own_src_idx = (int) src_col.idx;
    if (own_src_idx >= dest_idx)
      own_src_idx += 1 + ntrailing;
  }
  // add new columns (metadata only), then make room in all data rows at once
  for (int k = 0; k <= ntrailing; ++k)
    add_column("", ' ', -1, dest_idx + k, false);
  expand_data_rows(1 + trailing_cols.size(), dest_idx);
  // copy the data;
  // most of the work (hkl-based row matching and data copying) is done here:
  if (own_src_idx < 0)
    do_replace_column(*this, dest_idx, src_col, trailing_cols);
  else
    do_replace_column(*this, dest_idx, columns[own_src_idx], trailing_cols);
  return columns[dest_idx];
}
// Removes the column with 0-based index `idx` together with its values in
// all data rows, and renumbers the columns that followed it.
void Mtz::remove_column(size_t idx) {
  check_column(*this, idx, "remove_column()");
  // erase() returns an iterator to the first column after the removed one
  for (auto it = columns.erase(columns.begin() + idx); it != columns.end(); ++it)
    --it->idx;
  vector_remove_column(data, columns.size(), idx);
  assert(columns.size() * nreflections == data.size());
}
// WRITE(fmt, ...) formats one header record into `buf`, pads it with spaces
// to 80 characters and passes it to `write`. It is used only in
// Mtz::write_to_stream() and relies on its locals `buf` and `write`.
#define WRITE(...) do { \
    int len = snprintf_z(buf, 81, __VA_ARGS__); \
    if (len < 80) \
      std::memset(buf + len, ' ', 80 - len); \
    if (write(buf, 80, 1) != 1) \
      sys_fail("Writing MTZ file failed"); \
  } while(0)

// Serializes the MTZ file through `write`, a callable with fwrite-like
// signature (ptr, size, nmemb) that returns the number of items written.
// Output order: 80-byte preamble, reflection data, main header records,
// history, batch headers, end-of-headers record and appended text.
// Fails if there is no data or no space group, or if writing fails.
template<typename Write>
void Mtz::write_to_stream(Write write) const {
  // uses: data, spacegroup, nreflections, batches, cell, sort_order,
  //       valm, columns, datasets, history
  if (!has_data())
    fail("Cannot write Mtz which has no data");
  if (!spacegroup)
    fail("Cannot write Mtz which has no space group");
  char buf[81] = {'M', 'T', 'Z', ' ', '\0'};
  // Position of the headers in 4-byte words (1-based). If it does not fit
  // in 32 bits, -1 is stored in the 32-bit field and the real value in the
  // 64-bit field; otherwise the 64-bit field is zero.
  std::int64_t real_header_start = (int64_t) columns.size() * nreflections + 21;
  std::int32_t header_start = (int32_t) real_header_start;
  if (real_header_start > std::numeric_limits<int32_t>::max()) {
    header_start = -1;
  } else {
    real_header_start = 0;
  }
  std::memcpy(buf + 4, &header_start, 4);
  std::int32_t machst = is_little_endian() ? 0x00004144 : 0x11110000;
  std::memcpy(buf + 8, &machst, 4);
  std::memcpy(buf + 12, &real_header_start, 8);
  if (write(buf, 80, 1) != 1 ||
      write(data.data(), 4, data.size()) != data.size())
    fail("Writing MTZ file failed");
  WRITE("VERS MTZ:V1.1");
  WRITE("TITLE %s", title.c_str());
  WRITE("NCOL %8zu %12d %8zu", columns.size(), nreflections, batches.size());
  if (cell.is_crystal())
    WRITE("CELL  %9.4f %9.4f %9.4f %9.4f %9.4f %9.4f",
          cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma);
  WRITE("SORT  %3d %3d %3d %3d %3d", sort_order[0], sort_order[1],
        sort_order[2], sort_order[3], sort_order[4]);
  GroupOps ops = spacegroup->operations();
  char lat_type = spacegroup->ccp4_lattice_type();
  WRITE("SYMINF %3d %2d %c %5d %*s'%c%s' PG%s",
        ops.order(),               // number of symmetry operations
        (int) ops.sym_ops.size(),  // number of primitive operations
        lat_type,                  // lattice type
        spacegroup->ccp4,          // space group number
        20 - (int) std::strlen(spacegroup->hm), "",
        lat_type,                  // space group name (first letter)
        spacegroup->hm + 1,        // space group name (the rest)
        spacegroup->point_group_hm()); // point group name
  // If we have symops that are the same as spacegroup->operations(),
  // write symops to preserve the order of SYMM records.
  if (!symops.empty() && ops.is_same_as(split_centering_vectors(symops)))
    for (Op op : symops)
      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
  else
    for (Op op : ops)
      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
  auto reso = calculate_min_max_1_d2();
  WRITE("RESO %-20.12f %-20.12f", reso[0], reso[1]);
  if (std::isnan(valm))
    WRITE("VALM NAN");
  else
    WRITE("VALM %f", valm);
  // formats a number with 9 decimal places, truncated to 17 characters
  auto format17 = [](float f) {
    char buffer[18];
    int len = snprintf_z(buffer, 18, "%.9f", f);
    return std::string(buffer, len > 0 ? std::min(len, 17) : 0);
  };
  for (const Column& col : columns) {
    // min and max are recalculated from the data
    auto minmax = calculate_min_max_disregarding_nans(col.begin(), col.end());
    const char* label = !col.label.empty() ? col.label.c_str() : "_";
    WRITE("COLUMN %-30s %c %17s %17s %4d",
          label, col.type,
          format17(minmax[0]).c_str(), format17(minmax[1]).c_str(),
          col.dataset_id);
    if (!col.source.empty())
      WRITE("COLSRC %-30s %-36s %4d", label, col.source.c_str(), col.dataset_id);
  }
  WRITE("NDIF %8zu", datasets.size());
  for (const Dataset& ds : datasets) {
    WRITE("PROJECT %7d %s", ds.id, ds.project_name.c_str());
    WRITE("CRYSTAL %7d %s", ds.id, ds.crystal_name.c_str());
    WRITE("DATASET %7d %s", ds.id, ds.dataset_name.c_str());
    // fall back to the global cell if the dataset has no usable cell
    const UnitCell& uc = (ds.cell.is_crystal() && ds.cell.a > 0 ? ds.cell : cell);
    WRITE("DCELL %9d %10.4f%10.4f%10.4f%10.4f%10.4f%10.4f",
          ds.id, uc.a, uc.b, uc.c, uc.alpha, uc.beta, uc.gamma);
    WRITE("DWAVEL %8d %10.5f", ds.id, ds.wavelength);
  }
  // BATCH records: the keyword followed by up to 12 six-character numbers.
  // `pos` is the position in buf where the next number goes (0 = new record).
  int pos = 0;
  for (const Batch& batch : batches) {
    if (pos == 0) {
      std::memcpy(buf, "BATCH ", 6);  // NOLINT(bugprone-not-null-terminated-result)
      pos = 6;
    }
    snprintf_z(buf + pos, 7, "%6d", batch.number);
    pos += 6;
    // flush when the record is full (pos == 78) or after the last batch;
    // only the unused tail is blanked, not the number just written
    if (pos > 72 || &batch == &batches.back()) {
      std::memset(buf + pos, ' ', 80 - pos);
      if (write(buf, 80, 1) != 1)
        fail("Writing MTZ file failed");
      pos = 0;
    }
  }
  WRITE("END");
  if (!history.empty()) {
    // According to mtzformat.html the file can have only up to 30 history
    // lines, but we don't enforce it here.
    WRITE("MTZHIST %3zu", history.size());
    for (const std::string& line : history)
      WRITE("%s", line.c_str());
  }
  if (!batches.empty()) {
    WRITE("MTZBATS");
    for (const Batch& batch : batches) {
      // keep the numbers the same as in files written by libccp4
      WRITE("BH %8d %7zu %7zu %7zu",
            batch.number, batch.ints.size() + batch.floats.size(),
            batch.ints.size(), batch.floats.size());
      WRITE("TITLE %.70s", batch.title.c_str());
      if (batch.ints.size() != 29 || batch.floats.size() != 156)
        fail("wrong size of binaries batch headers");
      if (write(batch.ints.data(), 4, batch.ints.size()) != batch.ints.size() ||
          write(batch.floats.data(), 4, batch.floats.size()) != batch.floats.size())
        fail("Writing MTZ file failed");
      WRITE("BHCH  %7.7s %7.7s %7.7s",
            batch.axes.size() > 0 ? batch.axes[0].c_str() : "",
            batch.axes.size() > 1 ? batch.axes[1].c_str() : "",
            batch.axes.size() > 2 ? batch.axes[2].c_str() : "");
    }
  }
  WRITE("MTZENDOFHEADERS");
  if (!appended_text.empty()) {
    if (write(appended_text.data(), appended_text.size(), 1) != 1)
      fail("Writing MTZ file failed");
  }
}
#undef WRITE
void Mtz::write_to_cstream(std::FILE* stream) const {
write_to_stream([&](const void *ptr, size_t size, size_t nmemb) {
return std::fwrite(ptr, size, nmemb, stream);
});
}
// Replaces the content of `str` with the serialized MTZ file.
void Mtz::write_to_string(std::string& str) const {
  // Calculate the size beforehand to avoid memory re-allocations
  // and minimize memory usage. It hasn't been benchmarked against
  // a single-pass writing.
  const size_t total = size_to_write();
  str.resize(total);
  char* out = &str[0];
  write_to_buffer(out, total);
}
// Writes the MTZ file to `path` (opened in mode "wb").
// A std::runtime_error raised while writing is re-reported through fail()
// with ": <path>" appended to the message.
void Mtz::write_to_file(const std::string& path) const {
  fileptr_t f = file_open(path.c_str(), "wb");
  try {
    write_to_cstream(f.get());
  } catch (const std::runtime_error& err) {
    const std::string message = err.what();
    fail(message + ": " + path);
  }
}
// Returns the number of bytes that write_to_stream() would output.
// It runs the serialization with a writer that only counts bytes.
size_t Mtz::size_to_write() const {
  size_t total = 0;
  auto count_only = [&total](const void*, size_t size, size_t nmemb) {
    total += size * nmemb;
    return nmemb;  // report success without writing anything
  };
  write_to_stream(count_only);
  return total;
}
// Serializes the MTZ file into `buf`, which has room for `maxlen` bytes.
// Returns the number of bytes written; fails if the buffer is too small
// (the check is done before each chunk is copied).
size_t Mtz::write_to_buffer(char* buf, size_t maxlen) const {
  size_t written = 0;
  char* cursor = buf;
  auto copy_chunk = [&written, &cursor, maxlen](const void* ptr, size_t size, size_t nmemb) {
    const size_t chunk = size * nmemb;
    written += chunk;
    if (written > maxlen)
      fail("Mtz::write_to_buffer: size too small");
    memcpy(cursor, ptr, chunk);
    cursor += chunk;
    return nmemb;
  };
  write_to_stream(copy_chunk);
  return written;
}
} // namespace gemmi