Jungfraujoch/gemmi_gph/mtz.cpp

// Copyright 2019-2023 Global Phasing Ltd.

#include <gemmi/mtz.hpp>
#include <cstring>            // for memcpy
#include <algorithm>          // for stable_sort
#include <gemmi/atof.hpp>     // for fast_atof
#include <gemmi/atox.hpp>     // for simple_atoi, read_word
#include <gemmi/gz.hpp>
#include <gemmi/sprintf.hpp>

namespace gemmi {

namespace {

double wrap_degrees(double phi) {
  if (phi >= 0 && phi < 360.)
    return phi;
  return phi - std::floor(phi / 360.) * 360.;
}

void shift_phase(float& phi, double shift, bool negate=false) {
  double phi_ = phi + deg(shift);
  phi = float(wrap_degrees(negate ? -phi_ : phi_));
}

// apply phase shift to Hendrickson–Lattman coefficients HLA, HLB, HLC and HLD
void shift_hl_coefficients(float& a, float& b, float& c, float& d,
                           double shift, bool negate=false) {
  double sinx = std::sin(shift);
  double cosx = std::cos(shift);
  double sin2x = 2 * sinx * cosx;
  double cos2x = sq(cosx)- sq(sinx);
  // a sin(x+y) + b cos(x+y) = a sin(x) cos(y) - b sin(x) sin(y)
  //                         + a cos(x) sin(y) + b cos(x) cos(y)
  float a_ = float(a * cosx - b * sinx);
  float b_ = float(a * sinx + b * cosx);
  float c_ = float(c * cos2x - d * sin2x);
  float d_ = float(c * sin2x + d * cos2x);
  a = a_;                 // cos(phi)
  b = negate ? -b_ : b_;  // sin(phi)
  c = c_;                 // cos(2 phi)
  d = negate ? -d_ : d_;  // sin(2 phi)
}

// this function is generic because it was used in other places in the past
template <typename T, typename FP=typename std::iterator_traits<T>::value_type>
std::array<FP,2> calculate_min_max_disregarding_nans(T begin, T end) {
  std::array<FP,2> minmax = {{NAN, NAN}};
  T i = begin;
  while (i != end && std::isnan(*i))
    ++i;
  if (i != end) {
    minmax[0] = minmax[1] = *i;
    while (++i != end) {
      if (*i < minmax[0])
        minmax[0] = *i;
      else if (*i > minmax[1])
        minmax[1] = *i;
    }
  }
  return minmax;
}

const char* skip_word_and_space(const char* line) {
  while (*line != '\0' && !std::isspace(*line))
    ++line;
  while (std::isspace(*line))
    ++line;
  return line;
}

UnitCell read_cell_parameters(const char* line) {
  double a = fast_atof(line, &line);
  double b = fast_atof(line, &line);
  double c = fast_atof(line, &line);
  double alpha = fast_atof(line, &line);
  double beta = fast_atof(line, &line);
  double gamma = fast_atof(line, &line);
  return UnitCell(a, b, c, alpha, beta, gamma);
}

} // anonymous namespace

UnitCellParameters Mtz::get_average_cell_from_batch_headers(double* rmsd) const {
  if (rmsd)
    for (int i = 0; i < 6; ++i)
      rmsd[i] = 0.;
  std::array<double, 6> avg = {0., 0., 0., 0., 0., 0.};
  for (const Batch& batch : batches)
    for (int i = 0; i < 6; ++i) {
      // if batch headers are not set correctly, return global cell
      if (batch.floats[i] <= 0)
        return cell;
      avg[i] += batch.floats[i];
    }
  if (avg[0] <= 0 || avg[1] <= 0 || avg[2] <= 0 ||
      avg[3] <= 0 || avg[4] <= 0 || avg[5] <= 0)
    return UnitCellParameters();
  size_t n = batches.size();
  for (int i = 0; i < 6; ++i)
    avg[i] /= n;
  if (rmsd) {
    for (const Batch& batch : batches)
      for (int i = 0; i < 6; ++i)
        rmsd[i] += sq(avg[i] - batch.floats[i]);
    for (int i = 0; i < 6; ++i)
      rmsd[i] = std::sqrt(rmsd[i] / n);
  }
  // If average parameters are almost equal to the global cell, use the latter
  // to avoid 32-bit precision artifacts (58.28 -> 58.279998).
  if (UnitCellParameters(avg).approx(cell, 1e-4))
    return cell;
  return UnitCellParameters(avg);
}

std::array<double,2> Mtz::calculate_min_max_1_d2() const {
  auto extend_min_max_1_d2 = [&](const UnitCell& uc, double& min, double& max) {
    for (size_t i = 0; i < data.size(); i += columns.size()) {
      double res = uc.calculate_1_d2_double(data[i+0], data[i+1], data[i+2]);
      if (res < min)
        min = res;
      if (res > max)
        max = res;
    }
  };
  if (!has_data() || columns.size() < 3)
    fail("No data.");
  double min_value = INFINITY;
  double max_value = 0.;
  if (cell.is_crystal() && cell.a > 0)
    extend_min_max_1_d2(cell, min_value, max_value);
  const UnitCell* prev_cell = nullptr;
  for (const Dataset& ds : datasets)
    if (ds.cell.is_crystal() && ds.cell.a > 0 && ds.cell != cell &&
        (!prev_cell || ds.cell != *prev_cell)) {
      extend_min_max_1_d2(ds.cell, min_value, max_value);
      prev_cell = &ds.cell;
    }
  if (min_value == INFINITY)
    min_value = 0;
  return {{min_value, max_value}};
}

void Mtz::read_first_bytes(AnyStream& stream) {
  char buf[20] = {0};

  if (!stream.read(buf, 20))
    fail("Could not read the MTZ file (is it empty?)");
  if (buf[0] != 'M' || buf[1] != 'T' || buf[2] != 'Z' || buf[3] != ' ')
    fail("Not an MTZ file - it does not start with 'MTZ '");

  // Bytes 9-12 have so-called machine stamp:
  // "The first 4 half-bytes represent the real, complex, integer and
  // character formats".
  // We don't try to handle all the combinations here, only the two most
  // common: big endian (for all types) and little endian (for all types).
  // BE is denoted by 1 and LE by 4.
  // If we get a value different than 1 and 4 we assume the native byte order.
  if ((buf[9] & 0xf0) == (is_little_endian() ? 0x10 : 0x40))
    toggle_endianness();

  std::int32_t tmp_header_offset;
  std::memcpy(&tmp_header_offset, buf + 4, 4);
  if (!same_byte_order)
    swap_four_bytes(&tmp_header_offset);

  if (tmp_header_offset == -1) {
    std::memcpy(&header_offset, buf + 12, 8);
    if (!same_byte_order) {
      swap_eight_bytes(&header_offset);
    }
  } else {
    header_offset = (int64_t) tmp_header_offset;
  }
  stream.skip(60);
}

void Mtz::read_main_headers(AnyStream& stream, std::vector<std::string>* save_headers) {
  char line[81] = {0};
  std::ptrdiff_t header_pos = 4 * std::ptrdiff_t(header_offset - 1);
  // temporary check
  long cur_pos = stream.tell();
  if (cur_pos != header_pos && cur_pos != -1)
    fail(cat("wrong pos ", int(header_pos), "  ", int(stream.tell())));
  int ncol = 0;
  bool has_batch = false;
  while (stream.read(line, 80)) {
    if (save_headers)
      save_headers->emplace_back(line, line+80);
    if (ialpha3_id(line) == ialpha3_id("END"))
      break;
    const char* args = skip_word_and_space(line);
    switch (ialpha4_id(line)) {
      case ialpha4_id("VERS"):
        version_stamp = rtrim_str(args);
        break;
      case ialpha4_id("TITL"):
        title = rtrim_str(args);
        break;
      case ialpha4_id("NCOL"): {
        ncol = simple_atoi(args, &args);
        nreflections = simple_atoi(args, &args);
        int nbatches = simple_atoi(args);
        if (nbatches < 0 || nbatches > 10000000)  // sanity check
          fail("Wrong NCOL header");
        batches.resize(nbatches);
        break;
      }
      case ialpha4_id("CELL"):
        cell = read_cell_parameters(args);
        break;
      case ialpha4_id("SORT"):
        for (int& n : sort_order)
          n = simple_atoi(args, &args);
        break;
      case ialpha4_id("SYMI"): {
        nsymop = simple_atoi(args, &args);
        symops.reserve(nsymop);
        simple_atoi(args, &args); // ignore number of primitive operations
        args = skip_word_and_space(skip_blank(args)); // ignore lattice type
        spacegroup_number = simple_atoi(args, &args);
        args = skip_blank(args);
        if (*args != '\'')
          spacegroup_name = read_word(args);
        else if (const char* end = std::strchr(++args, '\''))
          spacegroup_name.assign(args, end);
        // ignore point group which is at the end of args
        break;
      }
      case ialpha4_id("SYMM"):
        symops.push_back(parse_triplet(args));
        break;
      case ialpha4_id("RESO"):
        min_1_d2 = fast_atof(args, &args);
        max_1_d2 = fast_atof(args, &args);
        break;
      case ialpha4_id("VALM"):
        if (*args != 'N') {
          const char* endptr;
          float v = (float) fast_atof(args, &endptr);
          if (*endptr == '\0' || is_space(*endptr))
            valm = v;
          else
            logger.note("Unexpected VALM value: " + rtrim_str(args));
        }
        break;
      case ialpha4_id("COLU"): {
        columns.emplace_back();
        Column& col = columns.back();
        col.label = read_word(args, &args);
        col.type = read_word(args, &args)[0];
        col.min_value = (float) fast_atof(args, &args);
        col.max_value = (float) fast_atof(args, &args);
        col.dataset_id = simple_atoi(args);
        col.parent = this;
        col.idx = columns.size() - 1;
        break;
      }
      case ialpha4_id("COLS"):
        // COLSRC is undocumented. CMTZ (libccp4) adds it after COLUMN:
        // COLUMN IMEAN                          J       -300.600006              4619    1
        // COLSRC IMEAN                          CREATED_07/08/2019_11:00:23              1
        if (!columns.empty() && columns.back().label == read_word(args, &args))
          columns.back().source = read_word(args);
        else
          logger.note("MTZ: COLSRC is not after matching COLUMN");
        break;
      case ialpha4_id("COLG"):
        // Column group - not used.
        break;
      case ialpha4_id("NDIF"):
        datasets.reserve(simple_atoi(args));
        break;
      case ialpha4_id("PROJ"):
        datasets.emplace_back();
        datasets.back().id = simple_atoi(args, &args);
        datasets.back().project_name = read_word(skip_word_and_space(args));
        datasets.back().wavelength = 0.0;
        break;
      case ialpha4_id("CRYS"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().crystal_name = read_word(args);
        else
          logger.note("MTZ CRYSTAL line: unusual numbering.");
        break;
      case ialpha4_id("DATA"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().dataset_name = read_word(args);
        else
          logger.note("MTZ DATASET line: unusual numbering.");
        break;
      case ialpha4_id("DCEL"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().cell = read_cell_parameters(args);
        else
          logger.note("MTZ DCELL line: unusual numbering.");
        break;
      // case("DRES"): not in use yet
      case ialpha4_id("DWAV"):
        if (simple_atoi(args, &args) == last_dataset().id)
          datasets.back().wavelength = fast_atof(args);
        else
          logger.note("MTZ DWAV line: unusual numbering.");
        break;
      case ialpha4_id("BATCH"):
        // We take number of batches from the NCOL record and serial numbers
        // from BH. This header could be used only to check consistency.
        has_batch = true;
        break;
      default:
        logger.note("Unknown header: " + rtrim_str(line));
    }
  }
  if (ncol != (int) columns.size())
    fail("Number of COLU records inconsistent with NCOL record.");
  if (has_batch != !batches.empty())
    fail("BATCH header inconsistent with NCOL record.");
  // adjust data size, if necessary
  if (!data.empty()) {
    size_t expected_size = columns.size() * nreflections;
    if (data.size() > expected_size)
      data.resize(expected_size);
    else if (data.size() < expected_size)
      fail("internal error, wrong data size");
  }
}

void Mtz::read_history_and_batch_headers(AnyStream& stream) {
  char buf[81] = {0};
  int n_headers = 0;
  while (stream.read(buf, 80) && ialpha4_id(buf) != ialpha4_id("MTZE")) {
    if (n_headers != 0) {
      const char* start = skip_blank(buf);
      const char* end = rtrim_cstr(start, start+80);
      history.emplace_back(start, end);
      --n_headers;
    } else if (ialpha4_id(buf) == ialpha4_id("MTZH")) {
      n_headers = simple_atoi(skip_word_and_space(buf+4));
      if (n_headers < 0 || n_headers > 30) {
        logger.note("Wrong MTZ: number of headers should be between 0 and 30");
        return;
      }
      history.reserve(n_headers);
    } else if (ialpha4_id(buf) == ialpha4_id("MTZB")) {
      for (Batch& batch : batches) {
        stream.read(buf, 80);
        if (ialpha3_id(buf) != ialpha3_id("BH "))
          fail("Missing BH header");
        const char* args = skip_blank(buf + 2);
        batch.number = simple_atoi(args, &args);
        int total_words = simple_atoi(args, &args);
        int int_words = simple_atoi(args, &args);
        int float_words = simple_atoi(args);
        if (total_words != int_words + float_words || total_words > 1000)
          fail("Wrong BH header");
        stream.read(buf, 80); // TITLE
        const char* end = rtrim_cstr(buf + 6, buf+76);
        batch.title.assign(buf, end - buf);
        batch.ints.resize(int_words);
        stream.read(batch.ints.data(), int_words * 4);
        batch.floats.resize(float_words);
        stream.read(batch.floats.data(), float_words * 4);
        stream.read(buf, 80);
        if (ialpha4_id(buf) != ialpha4_id("BHCH"))
          fail("Missing BHCH header");
        split_str_into_multi(buf + 5, " \t", batch.axes);
      }
    }
  }
  appended_text = stream.read_rest();
}

void Mtz::setup_spacegroup() {
  spacegroup = find_spacegroup_by_name(spacegroup_name, cell.alpha, cell.gamma);
  if (!spacegroup) {
    logger.note("MTZ: unrecognized spacegroup name: " + spacegroup_name);
    return;
  }
  if (spacegroup->ccp4 != spacegroup_number)
    logger.note("MTZ: inconsistent spacegroup name and number");
  cell.set_cell_images_from_spacegroup(spacegroup);
  for (Dataset& d : datasets)
    d.cell.set_cell_images_from_spacegroup(spacegroup);
}

// we should be at byte 80
void Mtz::read_raw_data(AnyStream& stream, bool do_read) {
  size_t n = size_t(header_offset - 1 - 20);
  if (!do_read) {
    if (!stream.skip(4 * n))
      fail("ignoring mtz data segment failed");
    return;
  }
  data.resize(n);
  if (!stream.read(data.data(), 4 * n))
    fail("Error when reading MTZ data");
  if (!same_byte_order)
    for (float& f : data)
      swap_four_bytes(&f);
}

void Mtz::read_stream(AnyStream& stream, bool with_data) {
  read_first_bytes(stream);
  // The older implementation of MTZ reading first read the headers,
  // then the data. This required jumping to the headers at the end,
  // then back to the beginning of the data (byte 80).
  // The current implementation avoids calling seek(), allowing
  // incremental reading of streams (stdin, gzipped files, etc).
  read_raw_data(stream, with_data);
  read_main_headers(stream, nullptr);
  read_history_and_batch_headers(stream);
  setup_spacegroup();
  if (datasets.empty())
    datasets.push_back({0, "HKL_base", "HKL_base", "HKL_base", cell, 0.});
}

// for probing/testing individual reflections, no need to optimize it
size_t Mtz::find_offset_of_hkl(const Miller& hkl, size_t start) const {
  if (!has_data() || columns.size() < 3)
    fail("No data.");
  if (start != 0)
    start -= (start % columns.size());
  for (size_t n = start; n + 2 < data.size(); n += columns.size())
    if (get_hkl(n) == hkl)
      return n;
  return (size_t)-1;
}

void Mtz::ensure_asu(bool tnt_asu) {
  if (!is_merged())
    fail("Mtz::ensure_asu() is for merged MTZ only");
  if (!spacegroup)
    return;
  GroupOps gops = spacegroup->operations();
  ReciprocalAsu asu(spacegroup, tnt_asu);
  std::vector<int> phase_columns = positions_of_columns_with_type('P');
  std::vector<int> abcd_columns = positions_of_columns_with_type('A');
  std::vector<int> dano_columns = positions_of_columns_with_type('D');
  std::vector<std::pair<int,int>> plus_minus_columns = positions_of_plus_minus_columns();
  bool no_special_columns = phase_columns.empty() && abcd_columns.empty() &&
                            plus_minus_columns.empty() && dano_columns.empty();
  bool centric = no_special_columns || gops.is_centrosymmetric();
  for (size_t n = 0; n < data.size(); n += columns.size()) {
    Miller hkl = get_hkl(n);
    if (asu.is_in(hkl))
      continue;
    auto result = asu.to_asu(hkl, gops);
    // cf. impl::move_to_asu() in asudata.hpp
    set_hkl(n, result.first);
    if (no_special_columns)
      continue;
    int isym = result.second;
    if (!phase_columns.empty() || !abcd_columns.empty()) {
      const Op& op = gops.sym_ops[(isym - 1) / 2];
      double shift = op.phase_shift(hkl);
      bool negate = (isym % 2 == 0);
      for (int col : phase_columns)
        shift_phase(data[n + col], shift, negate);
      for (auto i = abcd_columns.begin(); i+3 < abcd_columns.end(); i += 4)
        // we expect coefficients HLA, HLB, HLC and HLD - in this order
        shift_hl_coefficients(data[n + *(i+0)], data[n + *(i+1)],
                              data[n + *(i+2)], data[n + *(i+3)],
                              shift, negate);
    }
    if (isym % 2 == 0 && !centric &&
        // usually, centric reflections have empty F(-), so avoid swapping it
        !gops.is_reflection_centric(hkl)) {
      for (std::pair<int,int> cols : plus_minus_columns)
        std::swap(data[n + cols.first], data[n + cols.second]);
      for (int col : dano_columns)
        data[n + col] = -data[n + col];
    }
  }
}

void Mtz::reindex(const Op& op) {
  if (op.tran != Op::Tran{0, 0, 0})
    gemmi::fail("reindexing operator must not have a translation");
  if (op.det_rot() < 0)
    gemmi::fail("reindexing operator must preserve the hand of the axes");
  switch_to_original_hkl();  // changes hkl for unmerged data only
  Op xyz_op = op.as_xyz();
  logger.mesg("Real space transformation: ", op.as_xyz().triplet());
  bool row_removal = false;
  // change Miller indices
  for (size_t n = 0; n < data.size(); n += columns.size()) {
    Miller hkl_den = op.apply_to_hkl_without_division(get_hkl(n));
    Miller hkl = Op::divide_hkl_by_DEN(hkl_den);
    if (hkl[0] * Op::DEN == hkl_den[0] &&
        hkl[1] * Op::DEN == hkl_den[1] &&
        hkl[2] * Op::DEN == hkl_den[2]) {
      set_hkl(n, hkl);
    } else {  // fractional hkl - remove
      row_removal = true;
      data[n] = NAN;  // mark for removal
    }
  }

  // remove reflections marked for removal
  if (row_removal) {
    int n_before = nreflections;
    remove_rows_if([](const float* h) { return std::isnan(*h); });
    logger.mesg("Reflections removed (because of fractional indices): ", n_before - nreflections);
  }

  switch_to_asu_hkl();  // revert switch_to_original_hkl() for unmerged data

  // change space group
  if (spacegroup) {
    GroupOps gops = spacegroup->operations();
    gops.change_basis_backward(xyz_op);
    const SpaceGroup* new_sg = find_spacegroup_by_ops(gops);
    if (!new_sg)
      fail("reindexing: failed to determine new space group name");
    if (new_sg != spacegroup) {
      logger.mesg("Space group changed from ", spacegroup->xhm(), " to ", new_sg->xhm(), '.');
      set_spacegroup(new_sg);
    } else {
      logger.mesg("Space group stays the same:", spacegroup->xhm(), '.');
    }
  }

  // change unit cell parameters
  cell = cell.changed_basis_backward(xyz_op, false);
  for (Mtz::Dataset& ds : datasets)
    ds.cell = ds.cell.changed_basis_backward(xyz_op, false);
  for (Mtz::Batch& batch : batches)
    batch.set_cell(batch.get_cell().changed_basis_backward(xyz_op, false));
}

void Mtz::expand_to_p1() {
  if (!spacegroup || !has_data())
    return;
  std::vector<int> phase_columns = positions_of_columns_with_type('P');
  std::vector<int> abcd_columns = positions_of_columns_with_type('A');
  bool has_phases = (!phase_columns.empty() || !abcd_columns.empty());
  GroupOps gops = spacegroup->operations();
  data.reserve(gops.sym_ops.size() * data.size());
  size_t orig_size = data.size();
  std::vector<Miller> hkl_copies;
  for (size_t n = 0; n < orig_size; n += columns.size()) {
    hkl_copies.clear();
    Miller hkl = get_hkl(n);
    // no reallocations because of reserve() above
    auto orig_iter = data.begin() + n;
    for (auto op = gops.sym_ops.begin() + 1; op < gops.sym_ops.end(); ++op) {
      Miller new_hkl = op->apply_to_hkl(hkl);
      Op::Miller negated{{-new_hkl[0], -new_hkl[1], -new_hkl[2]}};
      if (new_hkl != hkl && !in_vector(new_hkl, hkl_copies) &&
          negated != hkl && !in_vector(negated, hkl_copies)) {
        hkl_copies.push_back(new_hkl);
        size_t offset = data.size();
        data.insert(data.end(), orig_iter, orig_iter + columns.size());
        set_hkl(offset, new_hkl);
        if (has_phases) {
          double shift = op->phase_shift(hkl);
          if (shift != 0) {
            for (int col : phase_columns)
              shift_phase(data[offset + col], shift);
            for (auto i = abcd_columns.begin(); i+3 < abcd_columns.end(); i += 4)
              // we expect coefficients HLA, HLB, HLC and HLD - in this order
              shift_hl_coefficients(data[offset + *(i+0)], data[offset + *(i+1)],
                                    data[offset + *(i+2)], data[offset + *(i+3)], shift);
          }
        }
      }
    }
  }
  nreflections = int(data.size() / columns.size());
  sort_order = {{0, 0, 0, 0, 0}};
  set_spacegroup(&get_spacegroup_p1());
}

bool Mtz::switch_to_original_hkl() {
  if (indices_switched_to_original)
    return false;
  if (!has_data())
    fail("switch_to_original_hkl(): data not read yet");
  if (nreflections == 0) {
    // This function can be called before the data is populated
    // to set indices_switched_to_original, which is not exposed in Python.
    indices_switched_to_original = true;
    return true;
  }
  const Column* col = column_with_label("M/ISYM");
  if (col == nullptr || col->type != 'Y' || col->idx < 3)
    return false;
  std::vector<Op> inv_symops;
  inv_symops.reserve(symops.size());
  for (const Op& op : symops)
    inv_symops.push_back(op.inverse());
  for (size_t n = 0; n + col->idx < data.size(); n += columns.size()) {
    int isym = static_cast<int>(data[n + col->idx]) & 0xFF;
    const Op& op = inv_symops.at((isym - 1) / 2);
    Miller hkl = op.apply_to_hkl(get_hkl(n));
    int sign = (isym & 1) ? 1 : -1;
    for (int i = 0; i < 3; ++i)
      data[n+i] = static_cast<float>(sign * hkl[i]);
  }
  indices_switched_to_original = true;
  return true;
}

bool Mtz::switch_to_asu_hkl() {
  if (!indices_switched_to_original)
    return false;
  if (!has_data())
    fail("switch_to_asu_hkl(): data not read yet");
  const Column* col = column_with_label("M/ISYM");
  if (col == nullptr || col->type != 'Y' || col->idx < 3 || !spacegroup)
    return false;
  size_t misym_idx = col->idx;
  UnmergedHklMover hkl_mover(spacegroup);
  for (size_t n = 0; n + col->idx < data.size(); n += columns.size()) {
    Miller hkl = get_hkl(n);
    int isym = hkl_mover.move_to_asu(hkl);  // modifies hkl
    set_hkl(n, hkl);
    float& misym = data[n + misym_idx];
    misym = float(((int)misym & ~0xff) | isym);
  }
  indices_switched_to_original = false;
  return true;
}

void Mtz::read_file_gz(const std::string& path, bool with_data) {
  try {
    read_input(MaybeGzipped(path), with_data);
  } catch (std::runtime_error& e) {
    // append path to the error like in read_file(), but shouldn't the path go first?
    fail(std::string(e.what()) + ": " + path);
  }
}

std::vector<int> Mtz::sorted_row_indices(int use_first) const {
  if (!has_data())
    fail("No data.");
  if (use_first <= 0 || use_first >= (int) columns.size())
    fail("Wrong use_first arg in Mtz::sort.");
  std::vector<int> indices(nreflections);
  for (int i = 0; i != nreflections; ++i)
    indices[i] = i;
  std::stable_sort(indices.begin(), indices.end(), [&](int i, int j) {
    int a = i * (int) columns.size();
    int b = j * (int) columns.size();
    for (int n = 0; n < use_first; ++n)
      if (data[a+n] != data[b+n])
        return data[a+n] < data[b+n];
    return false;
  });
  return indices;
}

bool Mtz::sort(int use_first) {
  std::vector<int> indices = sorted_row_indices(use_first);
  sort_order = {{0, 0, 0, 0, 0}};
  for (int i = 0; i < use_first; ++i)
    sort_order[i] = i + 1;
  if (std::is_sorted(indices.begin(), indices.end()))
    return false;
  std::vector<float> new_data(data.size());
  size_t w = columns.size();
  for (size_t i = 0; i != indices.size(); ++i)
    std::memcpy(&new_data[i * w], &data[indices[i] * w], w * sizeof(float));
  data.swap(new_data);
  return true;
}

Mtz::Column& Mtz::add_column(const std::string& label, char type,
                             int dataset_id, int pos, bool expand_data) {
  if (datasets.empty())
    fail("No datasets.");
  if (dataset_id < 0)
    dataset_id = datasets.back().id;
  else
    dataset(dataset_id); // check if such dataset exist
  if (pos > (int) columns.size())
    fail("Requested column position after the end.");
  if (pos < 0)
    pos = (int) columns.size();
  auto col = columns.emplace(columns.begin() + pos);
  for (auto i = col + 1; i != columns.end(); ++i)
    i->idx++;
  col->dataset_id = dataset_id;
  col->type = type;
  col->label = label;
  col->parent = this;
  col->idx = pos;
  if (expand_data)
    expand_data_rows(1, pos);
  return *col;
}


namespace {  // helper functions for copying, replacing and removing columns

void check_column(const Mtz& mtz, size_t idx, const char* msg) {
  if (!mtz.has_data())
    fail(msg, ": data not read yet");
  if (idx >= mtz.columns.size())
    fail(msg, ": no column with 0-based index ", std::to_string(idx));
}

void check_trailing_cols(const Mtz& mtz, const Mtz::Column& src_col,
                         const std::vector<std::string>& trailing_cols) {
  assert(src_col.parent == &mtz);
  if (!mtz.has_data())
    fail("data in source mtz not read yet");
  if (src_col.idx + trailing_cols.size() >= mtz.columns.size())
    fail("Not enough columns after " + src_col.label);
  for (size_t i = 0; i < trailing_cols.size(); ++i)
    if (!trailing_cols[i].empty() &&
        trailing_cols[i] != mtz.columns[src_col.idx + i + 1].label)
      fail("expected trailing column ", trailing_cols[i], ", found ", src_col.label);
}

void do_replace_column(Mtz& mtz, size_t dest_idx, const Mtz::Column& src_col,
                       const std::vector<std::string>& trailing_cols) {
  const Mtz* src_mtz = src_col.parent;
  for (size_t i = 0; i <= trailing_cols.size(); ++i) {
    Mtz::Column& dst = mtz.columns[dest_idx + i];
    const Mtz::Column& src = src_mtz->columns[src_col.idx + i];
    dst.type = src.type;
    dst.label = src.label;
    dst.min_value = src.min_value;
    dst.max_value = src.max_value;
    dst.source = src.source;
    dst.dataset_id = src.dataset_id;
  }
  if (src_mtz == &mtz) {
    // internal copying
    for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size())
      for (size_t i = 0; i <= trailing_cols.size(); ++i)
        mtz.data[n + dest_idx + i] = mtz.data[n + src_col.idx + i];
  } else {
    // external copying - need to match indices
    std::vector<int> dst_indices = mtz.sorted_row_indices();
    std::vector<int> src_indices = src_mtz->sorted_row_indices();
    // cf. for_matching_reflections()
    size_t dst_stride = mtz.columns.size();
    size_t src_stride = src_mtz->columns.size();
    auto dst = dst_indices.begin();
    auto src = src_indices.begin();
    while (dst != dst_indices.end() && src != src_indices.end()) {
      Miller dst_hkl = mtz.get_hkl(*dst * dst_stride);
      Miller src_hkl = src_mtz->get_hkl(*src * src_stride);
      if (dst_hkl == src_hkl) {
        // copy values
        for (size_t i = 0; i <= trailing_cols.size(); ++i)
          mtz.data[*dst * dst_stride + dest_idx + i] =
            src_mtz->data[*src * src_stride + src_col.idx + i];
        ++dst;
        ++src;
      } else if (dst_hkl < src_hkl) {
        ++dst;
      } else {
        ++src;
      }
    }
  }
}

} // anonymous namespace

Mtz::Column& Mtz::replace_column(size_t dest_idx, const Mtz::Column& src_col,
                                 const std::vector<std::string>& trailing_cols) {
  check_trailing_cols(*src_col.parent, src_col, trailing_cols);
  check_column(*this, dest_idx + trailing_cols.size(), "replace_column()");
  do_replace_column(*this, dest_idx, src_col, trailing_cols);
  return columns[dest_idx];
}

Mtz::Column& Mtz::copy_column(int dest_idx, const Mtz::Column& src_col,
                              const std::vector<std::string>& trailing_cols) {
  // check input consistency
  if (!has_data())
    fail("copy_column(): data not read yet");
  check_trailing_cols(*src_col.parent, src_col, trailing_cols);
  // add new columns
  if (dest_idx < 0)
    dest_idx = (int) columns.size();
  // if src_col is from this Mtz it may get invalidated when adding columns
  int col_idx = -1;
  if (src_col.parent == this) {
    col_idx = (int) src_col.idx;
    if (col_idx >= dest_idx)
      col_idx += 1 + (int)trailing_cols.size();
  }
  for (int i = 0; i <= (int) trailing_cols.size(); ++i)
    add_column("", ' ', -1, dest_idx + i, false);
  expand_data_rows(1 + trailing_cols.size(), dest_idx);
  // copy the data
  const Column& src_col_now = col_idx < 0 ? src_col : columns[col_idx];
  // most of the work (hkl-based row matching and data copying) is done here:
  do_replace_column(*this, dest_idx, src_col_now, trailing_cols);
  return columns[dest_idx];
}

void Mtz::remove_column(size_t idx) {
  check_column(*this, idx, "remove_column()");
  columns.erase(columns.begin() + idx);
  for (size_t i = idx; i < columns.size(); ++i)
    --columns[i].idx;
  vector_remove_column(data, columns.size(), idx);
  assert(columns.size() * nreflections == data.size());
}


#define WRITE(...) do { \
    int len = snprintf_z(buf, 81, __VA_ARGS__); \
    if (len < 80) \
      std::memset(buf + len, ' ', 80 - len); \
    if (write(buf, 80, 1) != 1) \
      sys_fail("Writing MTZ file failed"); \
  } while(0)

template<typename Write>
void Mtz::write_to_stream(Write write) const {
  // uses: data, spacegroup, nreflections, batches, cell, sort_order,
  //       valm, columns, datasets, history
  if (!has_data())
    fail("Cannot write Mtz which has no data");
  if (!spacegroup)
    fail("Cannot write Mtz which has no space group");
  char buf[81] = {'M', 'T', 'Z', ' ', '\0'};
  std::int64_t real_header_start = (int64_t) columns.size() * nreflections + 21;
  std::int32_t header_start = (int32_t) real_header_start;
  if (real_header_start > std::numeric_limits<int32_t>::max()) {
    header_start = -1;
  } else {
    real_header_start = 0;
  }
  std::memcpy(buf + 4, &header_start, 4);
  std::int32_t machst = is_little_endian() ? 0x00004144 : 0x11110000;
  std::memcpy(buf + 8, &machst, 4);
  std::memcpy(buf + 12, &real_header_start, 8);
  if (write(buf, 80, 1) != 1 ||
      write(data.data(), 4, data.size()) != data.size())
    fail("Writing MTZ file failed");
  WRITE("VERS MTZ:V1.1");
  WRITE("TITLE %s", title.c_str());
  WRITE("NCOL %8zu %12d %8zu", columns.size(), nreflections, batches.size());
  if (cell.is_crystal())
    WRITE("CELL  %9.4f %9.4f %9.4f %9.4f %9.4f %9.4f",
          cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma);
  WRITE("SORT  %3d %3d %3d %3d %3d", sort_order[0], sort_order[1],
        sort_order[2], sort_order[3], sort_order[4]);
  GroupOps ops = spacegroup->operations();
  char lat_type = spacegroup->ccp4_lattice_type();
  WRITE("SYMINF %3d %2d %c %5d %*s'%c%s' PG%s",
        ops.order(),               // number of symmetry operations
        (int) ops.sym_ops.size(),  // number of primitive operations
        lat_type,                  // lattice type
        spacegroup->ccp4,          // space group number
        20 - (int) std::strlen(spacegroup->hm), "",
        lat_type,                  // space group name (first letter)
        spacegroup->hm + 1,        // space group name (the rest)
        spacegroup->point_group_hm()); // point group name
  // If we have symops that are the same as spacegroup->operations(),
  // write symops to preserve the order of SYMM records.
  if (!symops.empty() && ops.is_same_as(split_centering_vectors(symops)))
    for (Op op : symops)
      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
  else
    for (Op op : ops)
      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
  auto reso = calculate_min_max_1_d2();
  WRITE("RESO %-20.12f %-20.12f", reso[0], reso[1]);
  if (std::isnan(valm))
    WRITE("VALM NAN");
  else
    WRITE("VALM %f", valm);
  auto format17 = [](float f) {
    char buffer[18];
    int len = snprintf_z(buffer, 18, "%.9f", f);
    return std::string(buffer, len > 0 ? std::min(len, 17) : 0);
  };
  for (const Column& col : columns) {
    auto minmax = calculate_min_max_disregarding_nans(col.begin(), col.end());
    const char* label = !col.label.empty() ? col.label.c_str() : "_";
    WRITE("COLUMN %-30s %c %17s %17s %4d",
          label, col.type,
          format17(minmax[0]).c_str(), format17(minmax[1]).c_str(),
          col.dataset_id);
    if (!col.source.empty())
      WRITE("COLSRC %-30s %-36s  %4d", label, col.source.c_str(), col.dataset_id);
  }
  WRITE("NDIF %8zu", datasets.size());
  for (const Dataset& ds : datasets) {
    WRITE("PROJECT %7d %s", ds.id, ds.project_name.c_str());
    WRITE("CRYSTAL %7d %s", ds.id, ds.crystal_name.c_str());
    WRITE("DATASET %7d %s", ds.id, ds.dataset_name.c_str());
    const UnitCell& uc = (ds.cell.is_crystal() && ds.cell.a > 0 ? ds.cell : cell);
    WRITE("DCELL %9d %10.4f%10.4f%10.4f%10.4f%10.4f%10.4f",
          ds.id, uc.a, uc.b, uc.c, uc.alpha, uc.beta, uc.gamma);
    WRITE("DWAVEL %8d %10.5f", ds.id, ds.wavelength);
  }
  int pos = 0;
  for (const Batch& batch : batches) {
    if (pos == 0)
      std::memcpy(buf, "BATCH ", 6);  // NOLINT(bugprone-not-null-terminated-result)
    pos += 6;
    snprintf_z(buf + pos, 7, "%6d", batch.number);
    if (pos > 72 || &batch == &batches.back()) {
      std::memset(buf + pos, ' ', 80 - pos);
      if (write(buf, 80, 1) != 1)
        fail("Writing MTZ file failed");
      pos = 0;
    }
  }
  WRITE("END");
  if (!history.empty()) {
    // According to mtzformat.html the file can have only up to 30 history
    // lines, but we don't enforce it here.
    WRITE("MTZHIST %3zu", history.size());
    for (const std::string& line : history)
      WRITE("%s", line.c_str());
  }
  if (!batches.empty()) {
    WRITE("MTZBATS");
    for (const Batch& batch : batches) {
      // keep the numbers the same as in files written by libccp4
      WRITE("BH %8d %7zu %7zu %7zu",
            batch.number, batch.ints.size() + batch.floats.size(),
            batch.ints.size(), batch.floats.size());
      WRITE("TITLE %.70s", batch.title.c_str());
      if (batch.ints.size() != 29 || batch.floats.size() != 156)
        fail("wrong size of binaries batch headers");
      write(batch.ints.data(), 4, batch.ints.size());
      write(batch.floats.data(), 4, batch.floats.size());
      WRITE("BHCH  %7.7s %7.7s %7.7s",
            batch.axes.size() > 0 ? batch.axes[0].c_str() : "",
            batch.axes.size() > 1 ? batch.axes[1].c_str() : "",
            batch.axes.size() > 2 ? batch.axes[2].c_str() : "");
    }
  }
  WRITE("MTZENDOFHEADERS");
  if (!appended_text.empty()) {
    if (write(appended_text.data(), appended_text.size(), 1) != 1)
      fail("Writing MTZ file failed");
  }
}

#undef WRITE

void Mtz::write_to_cstream(std::FILE* stream) const {
  write_to_stream([&](const void *ptr, size_t size, size_t nmemb) {
      return std::fwrite(ptr, size, nmemb, stream);
  });
}

void Mtz::write_to_string(std::string& str) const {
  // Calculate the size beforehand to avoid memory re-allocations
  // and minimize memory usage. It hasn't been benchmarked against
  // a single-pass writing.
  size_t nbytes = size_to_write();
  str.resize(nbytes);
  write_to_buffer(&str[0], nbytes);
}

void Mtz::write_to_file(const std::string& path) const {
  fileptr_t f = file_open(path.c_str(), "wb");
  try {
    write_to_cstream(f.get());
  } catch (std::runtime_error& e) {
    fail(std::string(e.what()) + ": " + path);
  }
}

size_t Mtz::size_to_write() const {
  size_t nbytes = 0;
  write_to_stream([&](const void *, size_t size, size_t nmemb) {
      nbytes += size * nmemb;
      return nmemb;
  });
  return nbytes;
}

size_t Mtz::write_to_buffer(char* buf, size_t maxlen) const {
  size_t len = 0;
  write_to_stream([&](const void *ptr, size_t size, size_t nmemb) {
      len += size * nmemb;
      if (len > maxlen)
        fail("Mtz::write_to_buffer: size too small");
      memcpy(buf, ptr, size * nmemb);
      buf += size * nmemb;
      return nmemb;
  });
  return len;
}

} // namespace gemmi