Gemmi: Add more functionality from gemmi 0.7.5

2026-05-13 17:26:51 +02:00
parent b27b140bf0
commit 08bf186766
27 changed files with 10507 additions and 3 deletions
@@ -153,7 +153,7 @@ ADD_SUBDIRECTORY(reader)
 ADD_SUBDIRECTORY(detector_control)
 ADD_SUBDIRECTORY(image_puller)
 ADD_SUBDIRECTORY(preview)
-ADD_SUBDIRECTORY(symmetry)
+ADD_SUBDIRECTORY(gemmi_gph)
 ADD_SUBDIRECTORY(xds-plugin)

 IF (JFJOCH_WRITER_ONLY)
@@ -0,0 +1,8 @@
+# Static library named "gemmi" built from the listed .cpp files; a few
+# headers are listed next to the sources.
+ADD_LIBRARY(gemmi STATIC symmetry.cpp gz.cpp mtz.cpp sprintf.cpp xds_ascii.cpp
+        gemmi/cellred.hpp
+        gemmi/symmetry.hpp
+        gemmi/fail.hpp
+        gemmi/unitcell.hpp
+        gemmi/math.hpp)
+# Targets linking against gemmi get this directory on their include path.
+TARGET_INCLUDE_DIRECTORIES(gemmi PUBLIC .)
+# NOTE(review): no libraries are linked here although gz.cpp is compiled in
+# and the gz header added by this commit says it uses zlib - confirm that
+# zlib is linked by the consuming targets.
+TARGET_LINK_LIBRARIES(gemmi )
@@ -0,0 +1,41 @@
+// Copyright 2020 Global Phasing Ltd.
+//
+// Functions that convert strings to floating-point numbers ignoring locale.
+// Simple wrappers around fastfloat::from_chars().
+
+#ifndef GEMMI_ATOF_HPP_
+#define GEMMI_ATOF_HPP_
+
+#include "atox.hpp"   // for is_space
+#include "third_party/fast_float.h"
+
+namespace gemmi {
+
+using fast_float::from_chars_result;
+
+// Parses a double from the range [start, end). Leading C-locale whitespace
+// and at most one '+' are skipped before fast_float::from_chars() is called;
+// its result is returned unchanged.
+inline from_chars_result fast_from_chars(const char* start, const char* end, double& d) {
+  const char* cursor = start;
+  for (; cursor < end && is_space(*cursor); ++cursor) {}
+  const bool has_plus = cursor < end && *cursor == '+';
+  if (has_plus)
+    cursor += 1;
+  return fast_float::from_chars(cursor, end, d);
+}
+
+// Variant for NUL-terminated input: skips leading whitespace and one '+',
+// then parses up to the terminating NUL found with strlen().
+inline from_chars_result fast_from_chars(const char* start, double& d) {
+  const char* cursor = start;
+  for (; is_space(*cursor); ++cursor) {}
+  if (*cursor == '+')
+    cursor += 1;
+  const char* const terminator = cursor + std::strlen(cursor);
+  return fast_float::from_chars(cursor, terminator, d);
+}
+
+// atof-like helper on top of fast_from_chars(). The value starts at 0 and is
+// returned as-is if the parser does not assign it. If endptr is given,
+// it receives the position reported by the parser.
+inline double fast_atof(const char* p, const char** endptr=nullptr) {
+  double value = 0;
+  const from_chars_result parsed = fast_from_chars(p, value);
+  if (endptr != nullptr)
+    *endptr = parsed.ptr;
+  return value;
+}
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,135 @@
+// Copyright 2018 Global Phasing Ltd.
+//
+// Locale-independent functions that convert strings to integers,
+// equivalents of standard isspace and isdigit, and a few helper functions.
+//
+// This file is named similarly to the standard functions atoi() and atof().
+// But the functions here are not meant to be equivalent to the standard
+// library functions. They are locale-independent (a good thing when reading
+// numbers from files). They don't set errno, don't signal overflow and
+// underflow. Due to the limited scope these functions tend to be faster
+// than the standard-library ones.
+
+#ifndef GEMMI_ATOX_HPP_
+#define GEMMI_ATOX_HPP_
+
+#include <cstdint>
+#include <stdexcept>  // for invalid_argument
+#include <string>
+
+namespace gemmi {
+
+// Counterpart of std::isspace for the C locale (no handling of EOF):
+// true for the space character (32) and for codes 9-13, false otherwise.
+inline bool is_space(char c) {
+  const std::uint8_t code = static_cast<std::uint8_t>(c);
+  if (code == 32)
+    return true;
+  return code >= 9 && code <= 13;
+}
+
+// Counterpart of std::isblank for the C locale (no handling of EOF):
+// true only for space and horizontal tab.
+inline bool is_blank(char c) {
+  switch (c) {
+    case ' ':
+    case '\t':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Counterpart of std::isdigit for the C locale (no handling of EOF):
+// true for the characters '0' through '9'.
+inline bool is_digit(char c) {
+  const bool below_zero = c < '0';
+  const bool above_nine = c > '9';
+  return !(below_zero || above_nine);
+}
+
+// Returns a pointer to the first character of p that is not a space or tab.
+// A null p is returned unchanged.
+inline const char* skip_blank(const char* p) {
+  if (p == nullptr)
+    return p;
+  const char* cursor = p;
+  for (; is_blank(*cursor); ++cursor) {}
+  return cursor;
+}
+
+// Returns a pointer to the first whitespace character or terminating NUL
+// at or after p. A null p is returned unchanged.
+inline const char* skip_word(const char* p) {
+  if (p == nullptr)
+    return p;
+  const char* cursor = p;
+  for (; !(*cursor == '\0' || is_space(*cursor)); ++cursor) {}
+  return cursor;
+}
+
+// Returns the first word of line: leading blanks are skipped and the word
+// ends at the next whitespace character or at the end of the string.
+inline std::string read_word(const char* line) {
+  const char* const first = skip_blank(line);
+  const char* const last = skip_word(first);
+  return std::string(first, last);
+}
+
+// Like read_word(line), and additionally stores in *endptr the position
+// just past the returned word. endptr must not be null.
+inline std::string read_word(const char* line, const char** endptr) {
+  const char* const first = skip_blank(line);
+  const char* const last = skip_word(first);
+  *endptr = last;
+  return std::string(first, last);
+}
+
+// Converts text to int. Leading C-locale whitespace and one optional sign
+// are accepted. If length is non-zero, only the first length characters of p
+// are examined (p need not be NUL-terminated); if it is zero, p must be
+// NUL-terminated.
+// With checked=true, std::invalid_argument is thrown unless the text consists
+// of optional whitespace, optional sign, at least one digit and optional
+// trailing whitespace, followed by either a NUL or the end of the
+// length-limited field.
+// With checked=false, parsing stops at the first non-digit and 0 is returned
+// if there are no digits.
+// No checking for overflow.
+inline int string_to_int(const char* p, bool checked, size_t length=0) {
+  int mult = -1;
+  int n = 0;
+  size_t i = 0;
+  // true while p[i] may be read: always for NUL-terminated input,
+  // otherwise only within the first `length` characters
+  auto readable = [&]() { return length == 0 || i < length; };
+  while (readable() && is_space(p[i]))
+    ++i;
+  if (readable()) {
+    if (p[i] == '-') {
+      mult = 1;
+      ++i;
+    } else if (p[i] == '+') {
+      ++i;
+    }
+  }
+  bool has_digits = false;
+  // use negative numbers because INT_MIN < -INT_MAX
+  for (; readable() && is_digit(p[i]); ++i) {
+    n = n * 10 - (p[i] - '0');
+    has_digits = true;
+  }
+  if (checked) {
+    while (readable() && is_space(p[i]))
+      ++i;
+    // a fully consumed length-limited field counts as the end of input,
+    // whatever character happens to follow it
+    const bool at_end = !readable() || p[i] == '\0';
+    if (!has_digits || !at_end) {
+      // show the text up to and including the offending character,
+      // but never the NUL terminator
+      size_t shown = length;
+      if (shown == 0)
+        shown = (p[i] == '\0') ? i : i + 1;
+      throw std::invalid_argument("not an integer: " + std::string(p, shown));
+    }
+  }
+  return mult * n;
+}
+
+// Overload for std::string: forwards str.c_str() with the default length
+// (0), i.e. the text is treated as NUL-terminated.
+inline int string_to_int(const std::string& str, bool checked) {
+  return string_to_int(str.c_str(), checked);
+}
+
+// atoi-like conversion: skips leading whitespace, accepts one optional sign
+// and reads consecutive digits. If endptr is given, it receives the position
+// of the first unread character. No overflow checking.
+inline int simple_atoi(const char* p, const char** endptr=nullptr) {
+  const char* cursor = p;
+  while (is_space(*cursor))
+    ++cursor;
+  const bool negative = (*cursor == '-');
+  if (negative || *cursor == '+')
+    ++cursor;
+  // accumulate a negative sum because INT_MIN < -INT_MAX
+  int neg_sum = 0;
+  while (is_digit(*cursor)) {
+    neg_sum = neg_sum * 10 - (*cursor - '0');
+    ++cursor;
+  }
+  if (endptr != nullptr)
+    *endptr = cursor;
+  return negative ? neg_sum : -neg_sum;
+}
+
+// Like simple_atoi(), but no sign is accepted: skips leading whitespace and
+// reads consecutive digits. If endptr is given, it receives the position of
+// the first unread character. No overflow checking.
+inline int no_sign_atoi(const char* p, const char** endptr=nullptr) {
+  const char* cursor = p;
+  while (is_space(*cursor))
+    ++cursor;
+  int value = 0;
+  while (is_digit(*cursor)) {
+    value = value * 10 + (*cursor - '0');
+    ++cursor;
+  }
+  if (endptr != nullptr)
+    *endptr = cursor;
+  return value;
+}
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,173 @@
+// Copyright 2018 Global Phasing Ltd.
+//
+// File-related utilities.
+
+#ifndef GEMMI_FILEUTIL_HPP_
+#define GEMMI_FILEUTIL_HPP_
+
+#include <cassert>
+#include <cstdio>    // for FILE, fopen, fclose
+#include <cstdint>
+#include <cstdlib>   // for malloc, realloc
+#include <cstring>   // for strlen
+#include <initializer_list>
+#include <memory>    // for unique_ptr
+#include "fail.hpp"  // for sys_fail
+
+#if defined(_WIN32) && !defined(GEMMI_USE_FOPEN)
+#include "utf.hpp"
+#endif
+
+namespace gemmi {
+
+// Strips the directory part (everything up to the last '/' or '\\') and then,
+// for each entry of exts in the given order, removes that suffix if the
+// remaining name ends with it and is longer than it.
+inline std::string path_basename(const std::string& path,
+                                 std::initializer_list<const char*> exts) {
+  const size_t sep = path.find_last_of("\\/");
+  std::string name = (sep == std::string::npos) ? path : path.substr(sep + 1);
+  for (auto it = exts.begin(); it != exts.end(); ++it) {
+    const size_t ext_len = std::strlen(*it);
+    if (name.size() <= ext_len)
+      continue;  // a name consisting only of the suffix is kept
+    const size_t stem_len = name.length() - ext_len;
+    if (name.compare(stem_len, ext_len, *it, ext_len) == 0)
+      name.resize(stem_len);
+  }
+  return name;
+}
+
+// file operations
+
+/// Deleter for fileptr_t: closes the stream with fclose() only when
+/// use_fclose is set, so the same pointer type can also hold borrowed streams.
+struct needs_fclose {
+  bool use_fclose;
+  void operator()(std::FILE* f) const noexcept {
+    if (!use_fclose)
+      return;
+    std::fclose(f);
+  }
+};
+
+typedef std::unique_ptr<std::FILE, needs_fclose> fileptr_t;
+
+// Opens path with the given fopen-style mode and returns a handle whose
+// deleter calls fclose(). If opening fails, sys_fail() is called with
+// a message naming the path (with " for writing" appended when the mode
+// starts with 'w').
+// On Windows, unless GEMMI_USE_FOPEN is defined, path and mode are converted
+// with UTF8_to_wchar() and passed to _wfopen().
+inline fileptr_t file_open(const char* path, const char* mode) {
+  std::FILE* file;
+#if defined(_WIN32) && !defined(GEMMI_USE_FOPEN)
+  std::wstring wpath = UTF8_to_wchar(path);
+  std::wstring wmode = UTF8_to_wchar(mode);
+  if ((file = ::_wfopen(wpath.c_str(), wmode.c_str())) == nullptr)
+#else
+  if ((file = std::fopen(path, mode)) == nullptr)
+#endif
+    // this statement is the body of whichever `if` was selected above
+    sys_fail(std::string("Failed to open ") + path +
+             (*mode == 'w' ? " for writing" : ""));
+  return fileptr_t(file, needs_fclose{true});
+}
+
+// Helper for treating "-" as stdin or stdout: if path is exactly "-",
+// dash_stream is returned wrapped in a handle that will not close it;
+// otherwise the file is opened with file_open().
+inline fileptr_t file_open_or(const char* path, const char* mode,
+                              std::FILE* dash_stream) {
+  const bool is_dash = (path[0] == '-' && path[1] == '\0');
+  if (!is_dash)
+    return file_open(path, mode);
+  return fileptr_t(dash_stream, needs_fclose{false});
+}
+
+// Returns the size of the open file f, obtained by seeking to the end and
+// calling ftell(); the position is then set back to the beginning.
+// path is used only in the message passed to sys_fail() on failure.
+inline std::size_t file_size(std::FILE* f, const std::string& path) {
+  const int seek_end_status = std::fseek(f, 0, SEEK_END);
+  if (seek_end_status != 0)
+    sys_fail(path + ": fseek failed");
+  const long end_pos = std::ftell(f);
+  if (end_pos < 0)
+    sys_fail(path + ": ftell failed");
+  const int rewind_status = std::fseek(f, 0, SEEK_SET);
+  if (rewind_status != 0)
+    sys_fail(path + ": fseek failed");
+  return end_pos;
+}
+
+// Helper for working with binary files: reports whether the first byte
+// in memory of the 32-bit value 1 is 1 (i.e. the host is little-endian).
+inline bool is_little_endian() {
+  std::uint32_t probe = 1;
+  const char* const first_byte = reinterpret_cast<char *>(&probe);
+  return first_byte[0] == 1;
+}
+
+// Reverses, in place, the order of the 2 bytes starting at start.
+inline void swap_two_bytes(void* start) {
+  char* const b = static_cast<char*>(start);
+  const char first = b[0];
+  b[0] = b[1];
+  b[1] = first;
+}
+
+// Reverses, in place, the order of the 4 bytes starting at start.
+inline void swap_four_bytes(void* start) {
+  char* lo = static_cast<char*>(start);
+  char* hi = lo + 3;
+  for (; lo < hi; ++lo, --hi)
+    std::swap(*lo, *hi);
+}
+
+// Reverses, in place, the order of the 8 bytes starting at start.
+inline void swap_eight_bytes(void* start) {
+  char* const b = static_cast<char*>(start);
+  for (int i = 0; i < 4; ++i)
+    std::swap(b[i], b[7 - i]);
+}
+
+
+// Owning char buffer allocated with malloc/realloc and released with free,
+// together with a separately stored size.
+class CharArray {
+  std::unique_ptr<char, decltype(&std::free)> ptr_;
+  size_t size_;
+public:
+  // Empty buffer: null pointer, size 0.
+  CharArray() : ptr_(nullptr, &std::free), size_(0) {}
+  // Allocates n bytes. The malloc() result is not checked here; use
+  // operator bool to find out whether the buffer is non-null.
+  explicit CharArray(size_t n) : ptr_((char*)std::malloc(n), &std::free), size_(n) {}
+  explicit operator bool() const { return (bool)ptr_; }
+  char* data() { return ptr_.get(); }
+  const char* data() const { return ptr_.get(); }
+  size_t size() const { return size_; }
+  // Changes only the recorded size; the allocation is left untouched.
+  void set_size(size_t n) { size_ = n; }
+
+  // Reallocates the buffer to n bytes; calls fail() if realloc() returns
+  // null for a non-zero n (the old buffer is then still owned by ptr_).
+  void resize(size_t n) {
+    char* new_ptr = (char*) std::realloc(ptr_.get(), n);
+    if (!new_ptr && n != 0)
+      fail("Out of memory.");
+    // the old pointer has been handed to realloc(), so drop ownership
+    // without freeing it before adopting the new pointer
+    (void) ptr_.release();  // NOLINT(bugprone-unused-return-value)
+    ptr_.reset(new_ptr);
+    size_ = n;
+  }
+
+  // Remove first n bytes making space for more text at the returned position.
+  // NOTE(review): exactly n bytes are moved to the front and data() + n is
+  // returned, which matches the description above only when size() == 2*n;
+  // for n > size()/2 the memmove source range extends past size() bytes -
+  // confirm how callers use it.
+  char* roll(size_t n) {
+    assert(n <= size());
+    std::memmove(data(), data() + n, n);
+    return data() + n;
+  }
+};
+
+
+/// Reads the whole file at path into a memory buffer. The size is determined
+/// up front with file_size() (fseek/ftell) and the content is read with
+/// a single fread(); sys_fail() is called if that read does not succeed.
+inline CharArray read_file_into_buffer(const std::string& path) {
+  fileptr_t file = file_open(path.c_str(), "rb");
+  const size_t nbytes = file_size(file.get(), path);
+  CharArray content(nbytes);
+  const size_t items_read = std::fread(content.data(), nbytes, 1, file.get());
+  if (items_read != 1)
+    sys_fail(path + ": fread failed");
+  return content;
+}
+
+// Reads stdin until fread() returns fewer bytes than requested. The buffer
+// starts at 16 KiB and is doubled whenever it becomes full; on return its
+// size is set to the number of bytes read.
+inline CharArray read_stdin_into_buffer() {
+  CharArray buffer(16 * 1024);
+  size_t filled = 0;
+  while (true) {
+    filled += std::fread(buffer.data() + filled, 1, buffer.size() - filled, stdin);
+    if (filled == buffer.size()) {
+      buffer.resize(2*filled);  // full: grow and keep reading
+      continue;
+    }
+    buffer.set_size(filled);
+    return buffer;
+  }
+}
+
+// Reads the whole input into memory. Compressed input is uncompressed by
+// the input object itself; otherwise stdin or the file at input.path()
+// is read.
+template<typename T>
+inline CharArray read_into_buffer(T&& input) {
+  if (!input.is_compressed()) {
+    if (!input.is_stdin())
+      return read_file_into_buffer(input.path());
+    return read_stdin_into_buffer();
+  }
+  return input.uncompress_into_buffer();
+}
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,52 @@
+// Copyright 2017 Global Phasing Ltd.
+//
+// Functions for transparent reading of gzipped files. Uses zlib.
+
+#ifndef GEMMI_GZ_HPP_
+#define GEMMI_GZ_HPP_
+#include <string>
+#include "fail.hpp"     // GEMMI_DLL
+#include "input.hpp"    // BasicInput
+#include "util.hpp"     // iends_with
+
+namespace gemmi {
+
+GEMMI_DLL extern const char* const zlib_description;
+
+GEMMI_DLL size_t estimate_uncompressed_size(const std::string& path);
+
+// the same interface as FileStream and MemoryStream
+// All member functions are only declared here; their definitions are not
+// part of this header.
+struct GEMMI_DLL GzStream final : public AnyStream {
+  // f_ is stored as an opaque handle (see the private member below).
+  GzStream(void* f_) : f(f_) {}
+  char* gets(char* line, int size) override;
+  int getc() override;
+  bool read(void* buf, size_t len) override;
+  bool skip(size_t n) override;
+  long tell() override;
+  std::string read_rest() override;
+
+private:
+  void* f;  // implementation detail
+};
+
+// Input that is treated as gzip-compressed when its path ends with ".gz"
+// (as decided by iends_with()); extends BasicInput.
+// NOTE(review): the class declares a destructor and holds a raw handle
+// (file_) but declares no copy/move operations - confirm that instances
+// are never copied.
+class GEMMI_DLL MaybeGzipped : public BasicInput {
+public:
+  explicit MaybeGzipped(const std::string& path);
+  ~MaybeGzipped();
+  size_t gzread_checked(void* buf, size_t len);
+  bool is_compressed() const { return iends_with(path(), ".gz"); }
+  // path() without its last 3 characters if compressed, otherwise path().
+  // Returns by value, unlike BasicInput::basepath() which it hides.
+  std::string basepath() const {
+    return is_compressed() ? path().substr(0, path().size() - 3) : path();
+  }
+
+  CharArray uncompress_into_buffer(size_t limit=0);
+
+  std::unique_ptr<AnyStream> create_stream();
+
+private:
+  void* file_ = nullptr;
+};
+
+} // namespace gemmi
+
+#endif
@@ -0,0 +1,168 @@
+// Copyright 2018 Global Phasing Ltd.
+//
+// Input abstraction.
+// Used to decouple file reading and decompression.
+
+#ifndef GEMMI_INPUT_HPP_
+#define GEMMI_INPUT_HPP_
+
+#include <cstddef> // for ptrdiff_t
+#include <cstdio>  // for FILE, fseek, fread
+#include <cstring> // for memchr
+#include <string>
+#include "fileutil.hpp"  // for fileptr_t
+
+namespace gemmi {
+
+// Abstract stream interface; base class for FileStream, MemoryStream
+// and GzStream.
+struct AnyStream {
+  virtual ~AnyStream() = default;
+
+  virtual char* gets(char* line, int size) = 0;   // for pdb, copy_line()
+  virtual int getc() = 0;                         // for copy_line()
+  virtual bool read(void* buf, size_t len) = 0;   // for ccp4, mtz
+
+  // these are not used in GzStream because MemoryStream is used for mtz
+  virtual long tell() = 0; // temporary, for testing
+  virtual bool skip(size_t n) = 0;  // for reading mtz without data
+  virtual std::string read_rest() { return {}; }  // for mtz (appendix)
+
+  // Reads one line with gets() and returns the number of characters stored
+  // in line (0 if gets() fails). If the stored text does not end with '\n',
+  // the remainder of the line is read with getc() and discarded.
+  size_t copy_line(char* line, int size) {        // for pdb, xds_ascii
+    if (gets(line, size) == nullptr)
+      return 0;
+    const size_t stored = std::strlen(line);
+    const bool truncated = stored > 0 && line[stored-1] != '\n';
+    if (truncated) {
+      // stop at the newline, or at a value that is 0 or EOF
+      int c = getc();
+      while (c > 0 && c != '\n')
+        c = getc();
+    }
+    return stored;
+  }
+};
+
+// AnyStream backed by a C FILE*.
+struct FileStream final : public AnyStream {
+  // Wraps an already open stream; it is not closed by this object.
+  FileStream(std::FILE* f_) : f(f_, needs_fclose{false}) {}
+  // Opens path with file_open_or(), so "-" stands for stdin.
+  FileStream(const char* path, const char* mode) : f(file_open_or(path, mode, stdin)) {}
+
+  char* gets(char* line, int size) override { return std::fgets(line, size, f.get()); }
+  int getc() override { return std::fgetc(f.get()); }
+  // True only if all len bytes were read.
+  bool read(void* buf, size_t len) override { return std::fread(buf, len, 1, f.get()) == 1; }
+
+  // Returns everything from the current position up to the point where
+  // fread() returns a short count; empty if the first fgetc() gives EOF.
+  std::string read_rest() override {
+    std::string ret;
+    int c = std::fgetc(f.get());
+    if (c != EOF) {
+      ret += (char)c;
+      char buf[512];
+      for (;;) {
+        size_t n = std::fread(buf, 1, sizeof(buf), f.get());
+        ret.append(buf, n);
+        if (n != sizeof(buf))
+          break;
+      }
+    }
+    return ret;
+  }
+
+  long tell() override {
+    return std::ftell(f.get());
+  }
+
+  // Advances the position by n bytes. A relative seek is tried first; if it
+  // fails, the bytes are read and discarded in chunks of up to 512 bytes.
+  // Returns false only if that fallback read comes up short.
+  bool skip(size_t n) override {
+#if defined(_MSC_VER)
+    int result = _fseeki64(f.get(), (std::ptrdiff_t)n, SEEK_CUR);
+#elif defined(__MINGW32__)
+    int result = fseeko(f.get(), (_off_t)n, SEEK_CUR);
+#else
+    int result = std::fseek(f.get(), (long)n, SEEK_CUR);
+#endif
+    if (result != 0) {
+      char buf[512];
+      while (n >= sizeof(buf)) {
+        if (std::fread(buf, sizeof(buf), 1, f.get()) != 1)
+          return false;
+        n -= sizeof(buf);
+      }
+      if (n > 0 && std::fread(buf, n, 1, f.get()) != 1)
+        return false;
+    }
+    return true;
+  }
+
+private:
+  fileptr_t f;
+};
+
+// AnyStream over the in-memory byte range [start_, start_ + size).
+// Only pointers are stored, so the data must outlive the stream.
+// The current position always stays within [start, end].
+struct MemoryStream final : public AnyStream {
+  MemoryStream(const char* start_, size_t size)
+    : start(start_), end(start_ + size), cur(start_) {}
+
+  // fgets-like: copies at most size-1 bytes, stopping after a '\n', and
+  // NUL-terminates line. Returns nullptr at the end of the data or when
+  // size is not positive.
+  char* gets(char* line, int size) override {
+    if (size <= 0 || cur >= end)
+      return nullptr;
+    --size; // fgets reads in at most one less than size characters
+    if (size > end - cur)
+      size = int(end - cur);
+    const char* nl = (const char*) std::memchr(cur, '\n', size);
+    size_t len = nl ? nl - cur + 1 : size;
+    std::memcpy(line, cur, len);
+    line[len] = '\0';
+    cur += len;
+    return line;
+  }
+
+  // fgetc-like: the byte is returned as an unsigned char converted to int,
+  // so bytes >= 0x80 are positive and never collide with EOF.
+  int getc() override { return cur < end ? (unsigned char) *cur++ : EOF; }
+
+  // Copies len bytes to buf; returns false (copying nothing) if fewer than
+  // len bytes remain.
+  bool read(void* buf, size_t len) override {
+    if (len > size_t(end - cur))
+      return false;
+    std::memcpy(buf, cur, len);
+    cur += len;
+    return true;
+  }
+
+  // Returns all remaining bytes and moves the position to the end.
+  std::string read_rest() override {
+    const char* last = cur;
+    cur = end;
+    return std::string(last, end);
+  }
+
+  long tell() override {
+    return cur - start;
+  }
+
+  // Advances by n bytes. Returns true only if the new position is before
+  // the end; otherwise the position is clamped to the end.
+  bool skip(size_t n) override {
+    if (n >= size_t(end - cur)) {
+      cur = end;
+      return false;
+    }
+    cur += n;
+    return true;
+  }
+
+private:
+  const char* const start;
+  const char* const end;
+  const char* cur;
+};
+
+// Input identified by a path, without decompression support.
+class BasicInput {
+public:
+  explicit BasicInput(const std::string& path) : path_(path) {}
+
+  const std::string& path() const { return path_; }
+  // Same as path() in this class.
+  const std::string& basepath() const { return path_; }
+
+  // Does the path stand for stdin?
+  // Each reading function needs to call it (some functions use stdin
+  // and some std::cin, so we don't try to unify it here).
+  bool is_stdin() const { return path() == "-"; }
+
+  // providing the same interface as MaybeGzipped
+  bool is_compressed() const { return false; }
+  // for reading (uncompressing into memory) the whole file at once;
+  // this class always returns an empty CharArray
+  CharArray uncompress_into_buffer(size_t=0) { return {}; }
+
+  // Creates a FileStream for path() opened in "rb" mode.
+  std::unique_ptr<AnyStream> create_stream() {
+    return std::unique_ptr<AnyStream>(new FileStream(path().c_str(), "rb"));
+  }
+
+private:
+  std::string path_;
+};
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,287 @@
+// Copyright 2018 Global Phasing Ltd.
+//
+// Bidirectional iterators (over elements of any container) that can filter,
+// uniquify, group, or iterate with a stride.
+
+#ifndef GEMMI_ITERATOR_HPP_
+#define GEMMI_ITERATOR_HPP_
+#include <iterator>     // for bidirectional_iterator_tag
+#include <type_traits>  // for remove_cv
+#include <vector>
+
+namespace gemmi {
+
+// Disable warning "X<T>::operator X<T>() const will not be called for
+// implicit or explicit conversions", which is triggered when templates
+// StrideIter, IndirectIter and others are expanded with const Value.
+#if defined(__INTEL_COMPILER) || defined(__NVCOMPILER)
+  #pragma diagnostic push
+  #pragma diag_suppress = conversion_function_not_usable
+#elif defined(__NVCC__)
+  #pragma nv_diagnostic push
+  #pragma nv_diag_suppress = conversion_function_not_usable
+#endif
+
+// implements concept BidirectionalIterator
+// The iterator derives from Policy, which must provide value_type,
+// reference, const_policy, increment(), decrement(), equal() and
+// dereference(); every operator below forwards to one of these.
+template <typename Policy>
+struct BidirIterator : Policy {
+  using value_type = typename std::remove_cv<typename Policy::value_type>::type;
+  using difference_type = std::ptrdiff_t;
+  using pointer = typename Policy::value_type*;
+  using reference = typename Policy::reference;
+  using iterator_category = std::bidirectional_iterator_tag;
+
+  BidirIterator() = default;
+  // p is a named variable here, so the policy is copy-constructed from it.
+  BidirIterator(Policy&& p) : Policy(p) {}
+
+  BidirIterator& operator++() { Policy::increment(); return *this; }
+  BidirIterator operator++(int) { BidirIterator x = *this; ++*this; return x; }
+  BidirIterator& operator--() { Policy::decrement(); return *this; }
+  BidirIterator operator--(int) { BidirIterator x = *this; --*this; return x; }
+  bool operator==(const BidirIterator &o) const { return Policy::equal(o); }
+  bool operator!=(const BidirIterator &o) const { return !Policy::equal(o); }
+  reference operator*() { return Policy::dereference(); }
+  pointer operator->() { return &Policy::dereference(); }
+  // Conversion to the iterator built on the policy's const_policy.
+  using const_variant = BidirIterator<typename Policy::const_policy>;
+  operator const_variant() const {
+    return const_variant(static_cast<const Policy&>(*this));
+  }
+};
+
+// Policy for BidirIterator that moves a pointer by a fixed stride and
+// dereferences the element at a fixed offset from that pointer.
+template<typename Value>
+class StrideIterPolicy {
+public:
+  using value_type = Value;
+  using reference = Value&;
+  using const_policy = StrideIterPolicy<Value const>;
+
+  StrideIterPolicy() : cur_(nullptr), offset_(0), stride_(0) {}
+  StrideIterPolicy(Value* ptr, std::size_t offset, size_t stride)
+    : cur_(ptr), offset_(offset), stride_(static_cast<unsigned>(stride)) {}
+
+  void increment() { cur_ = cur_ + stride_; }
+  void decrement() { cur_ = cur_ - stride_; }
+  // Only the pointers are compared; offset and stride are ignored.
+  bool equal(const StrideIterPolicy& o) const { return o.cur_ == cur_; }
+  Value& dereference() { return *(cur_ + offset_); }
+  operator const_policy() const { return const_policy(cur_, offset_, stride_); }
+
+private:
+  Value* cur_;
+  std::size_t offset_;
+  unsigned stride_;
+};
+template<typename Value>
+using StrideIter = BidirIterator<StrideIterPolicy<Value>>;
+
+
+// Policy for BidirIterator that walks a vector<int> of positions and
+// dereferences through redir_->value_at(position).
+template<typename Redirect, typename Value>
+class IndirectIterPolicy {
+public:
+  using value_type = Value;
+  using reference = Value&;
+  // The default constructor sets only redir_; cur_ is default-constructed.
+  IndirectIterPolicy() : redir_(nullptr) {}
+  IndirectIterPolicy(Redirect* redir, std::vector<int>::const_iterator cur)
+    : redir_(redir), cur_(cur) {}
+  void increment() { ++cur_; }
+  void decrement() { --cur_; }
+  // Only the position iterators are compared, not redir_.
+  bool equal(const IndirectIterPolicy& o) const { return cur_ == o.cur_; }
+  Value& dereference() { return redir_->value_at(*cur_); }
+  using const_policy = IndirectIterPolicy<Redirect const, Value const>;
+  operator const_policy() const { return const_policy(redir_, cur_); }
+  // TODO: what should be done with absent optional tags (*cur_ < 0)?
+private:
+  Redirect* redir_;
+  std::vector<int>::const_iterator cur_; // points into positions
+};
+template<typename Redirect, typename Value>
+using IndirectIter = BidirIterator<IndirectIterPolicy<Redirect, Value>>;
+
+
+// Policy for BidirIterator that visits the first element of each run of
+// consecutive elements sharing the same group_key().
+template<typename Vector, typename Value>
+class UniqIterPolicy {
+public:
+  using value_type = Value;
+  using reference = Value&;
+  UniqIterPolicy() : vec_(nullptr), pos_(0) {}
+  UniqIterPolicy(Vector* vec, std::size_t pos) : vec_(vec), pos_(pos) {}
+  // Reads (*vec_)[pos_], so pos_ must be a valid index when this is called.
+  void increment() {
+    // move to the first element of the next group
+    const auto& key = (*vec_)[pos_].group_key();
+    ++pos_;
+    while (pos_ != vec_->size() && (*vec_)[pos_].group_key() == key)
+      ++pos_;
+  }
+  // Decrements pos_ first, so pos_ must be non-zero when this is called.
+  void decrement() {
+    --pos_; // now we are at the last element of the previous group
+    const auto& key = (*vec_)[pos_].group_key();
+    while (pos_ != 0 && (*vec_)[pos_-1].group_key() == key)
+      --pos_; // move to the group beginning
+  }
+  // Only positions are compared, not the vectors.
+  bool equal(const UniqIterPolicy& o) const { return pos_ == o.pos_; }
+  Value& dereference() { return (*vec_)[pos_]; }
+  using const_policy = UniqIterPolicy<Vector const, Value const>;
+  operator const_policy() const { return const_policy(vec_, pos_); }
+private:
+  Vector* vec_;
+  std::size_t pos_;
+};
+template<typename Vector, typename Value>
+using UniqIter = BidirIterator<UniqIterPolicy<Vector, Value>>;
+
+// Range over a mutable vector whose begin()/end() return UniqIter
+// iterators positioned at index 0 and at vec.size().
+template<typename Value, typename Vector=std::vector<Value>>
+struct UniqProxy {
+  Vector& vec;
+  using iterator = UniqIter<Vector, Value>;
+  iterator begin() { return {{&vec, 0}}; }
+  iterator end() { return {{&vec, vec.size()}}; }
+};
+// Read-only counterpart of UniqProxy: holds a const reference and yields
+// iterators over const elements.
+template<typename Value, typename Vector=std::vector<Value>>
+struct ConstUniqProxy {
+  const Vector& vec;
+  using iterator = UniqIter<const Vector, const Value>;
+  iterator begin() const { return {{&vec, 0}}; }
+  iterator end() const { return {{&vec, vec.size()}}; }
+};
+
+
+// Policy for BidirIterator whose value is a span (of type Value) covering
+// one run of consecutive elements with the same group_key().
+// Value must provide begin(), end(), size(), set_begin(), set_size(),
+// is_beginning() and is_ending(). Vector is used here only to name the
+// const_policy type.
+template<typename Vector, typename Value>
+class GroupingIterPolicy {
+public:
+  using value_type = Value;
+  using reference = Value&;
+  GroupingIterPolicy() = default;
+  GroupingIterPolicy(const Value& span) : span_(span) {}
+  // Moves the span to start at its current end and grows it while the
+  // element at end() has the same key as the element at begin().
+  void increment() {
+    span_.set_begin(span_.end());
+    span_.set_size(0);
+    while (!span_.is_ending() &&
+           span_.begin()->group_key() == span_.end()->group_key())
+      span_.set_size(span_.size() + 1);
+  }
+  // Steps back to the element before begin() and extends the span backwards
+  // while the preceding element has the same key.
+  void decrement() {
+    span_.set_begin(span_.begin() - 1);
+    span_.set_size(1);
+    while (!span_.is_beginning() &&
+           span_.begin()->group_key() == (span_.begin() - 1)->group_key()) {
+      span_.set_begin(span_.begin() - 1);
+      span_.set_size(span_.size() + 1);
+    }
+  }
+  // Two policies are equal when their spans begin at the same place.
+  bool equal(const GroupingIterPolicy& o) const {
+    return span_.begin() == o.span_.begin();
+  }
+  Value& dereference() { return span_; }
+  using const_policy = GroupingIterPolicy<Vector const, Value const>;
+  operator const_policy() const { return const_policy(span_); }
+private:
+  Value span_;
+};
+template<typename Vector, typename Value>
+using GroupingIter = BidirIterator<GroupingIterPolicy<Vector, Value>>;
+
+
+// Policy for BidirIterator that visits only those elements of *vec_ for
+// which filter_->matches(element) is true.
+template<typename Filter, typename Vector, typename Value>
+class FilterIterPolicy {
+public:
+  using value_type = Value;
+  using reference = Value&;
+  FilterIterPolicy() : filter_(nullptr), vec_(nullptr), pos_(0) {}
+  // Starts at pos, or at the first matching element after it
+  // (vec->size() if there is none).
+  FilterIterPolicy(const Filter* filter, Vector* vec, std::size_t pos)
+      : filter_(filter), vec_(vec), pos_(pos) {
+    while (pos_ != vec_->size() && !matches(pos_))
+      ++pos_;
+  }
+  bool matches(std::size_t p) const { return filter_->matches((*vec_)[p]); }
+  // Moves to the next matching element, or to vec_->size().
+  void increment() { while (++pos_ < vec_->size() && !matches(pos_)) {} }
+  // Moves to the previous matching element; stops at index 0 regardless.
+  void decrement() { while (pos_ != 0 && !matches(--pos_)) {} }
+  // Only positions are compared, not the filters or vectors.
+  bool equal(const FilterIterPolicy& o) const { return pos_ == o.pos_; }
+  Value& dereference() { return (*vec_)[pos_]; }
+  using const_policy = FilterIterPolicy<Filter, Vector const, Value const>;
+  // The filter must be passed along: the only non-default constructor
+  // takes (filter, vec, pos).
+  operator const_policy() const { return const_policy(filter_, vec_, pos_); }
+private:
+  const Filter* filter_;
+  Vector* vec_;
+  std::size_t pos_;
+};
+template<typename Filter, typename Vector, typename Value>
+using FilterIter = BidirIterator<FilterIterPolicy<Filter, Vector, Value>>;
+
+// Range over the elements of vec accepted by filter; begin()/end() return
+// FilterIter iterators constructed at index 0 and at vec.size().
+// Both members are references, so filter and vec must outlive the proxy.
+template<typename Filter, typename Value>
+struct FilterProxy {
+  const Filter& filter;
+  std::vector<Value>& vec;
+  using iterator = FilterIter<Filter, std::vector<Value>, Value>;
+  iterator begin() { return {{&filter, &vec, 0}}; }
+  iterator end() { return {{&filter, &vec, vec.size()}}; }
+};
+
+// Read-only counterpart of FilterProxy: holds a const reference to the
+// vector and yields iterators over const elements.
+template<typename Filter, typename Value>
+struct ConstFilterProxy {
+  const Filter& filter;
+  const std::vector<Value>& vec;
+  using iterator = FilterIter<Filter, const std::vector<Value>, const Value>;
+  iterator begin() const { return {{&filter, &vec, 0}}; }
+  iterator end() const { return {{&filter, &vec, vec.size()}}; }
+};
+
+
+// View of the items in [start, end) that have the same group_key() as
+// *start. extent() is the length of the whole range and size() the number
+// of items belonging to the group; the group is "sparse" when other items
+// are interleaved (extent() > size()).
+template<typename Item>
+struct ItemGroup {
+  using element_type = Item;
+
+  // Counts the group members in [start, end). The scan begins at start
+  // itself (which always matches its own key), so an empty range
+  // (start == end, including two null pointers) is handled.
+  ItemGroup(Item* start, const Item* end)
+      : size_(int(end - start)), extent_(int(end - start)), start_(start) {
+    for (const Item* i = start; i != end; ++i)
+      if (i->group_key() != start->group_key())
+        --size_;
+  }
+
+  // Forward iterator that skips items whose key differs from that of the
+  // item it is leaving.
+  struct iterator {
+    Item* ptr;
+    const Item* end;
+    bool operator==(const iterator& o) const { return ptr == o.ptr; }
+    bool operator!=(const iterator& o) const { return ptr != o.ptr; }
+    iterator& operator++() {
+      const Item* prev = ptr++;
+      while (ptr != end && ptr->group_key() != prev->group_key())
+        ++ptr;
+      return *this;
+    }
+    Item& operator*() { return *ptr; }
+    Item* operator->() { return ptr; }
+  };
+  iterator begin() { return iterator{start_, start_+extent_}; }
+  iterator end() { return iterator{start_+extent_, start_+extent_}; }
+
+  size_t size() const { return (size_t) size_; }
+  int extent() const { return extent_; }
+  bool empty() const { return size_ == 0; }
+  Item& front() { return *start_; }
+  const Item& front() const { return *start_; }
+  // Last item of the underlying range (index extent_ - 1).
+  Item& back() { return start_[extent_ - 1]; }
+  const Item& back() const { return start_[extent_ - 1]; }
+
+  // constant time unless sparse (extent_ > size_)
+  // i must be smaller than size(); the sparse search is not bounded.
+  Item& operator[](std::size_t i) {
+    if (size_ == extent_ || i == 0)
+      return start_[i];
+    for (Item* ptr = start_ + 1; ; ++ptr)
+      if (ptr->group_key() == start_->group_key())
+        if (--i == 0)
+          return *ptr;
+  }
+  const Item& operator[](std::size_t i) const {
+    return const_cast<ItemGroup*>(this)->operator[](i);
+  }
+
+private:
+  int size_ = 0;
+  int extent_ = 0;
+  Item* start_ = nullptr;
+};
+
+#if defined(__INTEL_COMPILER) || defined(__NVCOMPILER)
+  #pragma diagnostic pop
+#elif defined(__NVCC__)
+  #pragma nv_diagnostic pop
+#endif
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,71 @@
+// Copyright Global Phasing Ltd.
+//
+// Logger - a tiny utility for passing messages through a callback.
+
+#ifndef GEMMI_LOGGER_HPP_
+#define GEMMI_LOGGER_HPP_
+
+#include <cstdio>      // for fprintf
+#include <functional>  // for function
+#include "fail.hpp"    // for GEMMI_COLD
+#include "util.hpp"    // for cat
+
+namespace gemmi {
+
+/// Passes messages (including warnings/errors) to a callback function.
+/// Messages are passed as strings without a trailing newline.
+/// They have syslog-like severity levels: 8=debug, 6=info, 5=notice, 3=error,
+/// allowing the use of a threshold to filter them.
+/// Quirk: Errors double as both errors and warnings. Unrecoverable errors
+///        don't go through this class; Logger only handles errors that can
+///        be downgraded to warnings. If a callback is set, the error is passed
+///        as a warning message. Otherwise, it's thrown as std::runtime_error.
+struct Logger {
+  /// A function that handles messages.
+  std::function<void(const std::string&)> callback;
+  /// Pass messages of this level and all lower (more severe) levels:
+  /// 8=all, 6=all but debug, 5=notes and warnings, 3=warnings, 0=none
+  int threshold = 6;
+
+  /// suspend() and resume() are used internally to avoid duplicate messages
+  /// when the same function is called (internally) multiple times.
+  /// They shift the threshold by 100 down and back up.
+  void suspend() { threshold -= 100; }
+  void resume()  { threshold += 100; }
+
+  /// Send a message without any prefix, at the numeric level N.
+  /// Nothing happens if threshold < N or if no callback is set.
+  template<int N, class... Args> void level(Args const&... args) const {
+    if (threshold >= N && callback)
+      callback(cat(args...));
+  }
+
+  /// Send a debug message.
+  template<class... Args> void debug(Args const&... args) const { level<8>("Debug: ", args...); }
+  /// Send a message without any prefix.
+  template<class... Args> void mesg(Args const&... args) const { level<6>(args...); }
+  /// Send a note (a notice, a significant message).
+  template<class... Args> void note(Args const&... args) const { level<5>("Note: ", args...); }
+
+  /// Send a warning/error (see Quirk above).
+  /// With threshold < 3 the message is dropped, even when no callback is set.
+  template<class... Args> GEMMI_COLD void err(Args const&... args) const {
+    if (threshold >= 3) {
+      std::string msg = cat(args...);
+      if (callback == nullptr)
+        fail(msg);
+      callback("Warning: " + msg);
+    }
+  }
+
+  // predefined callbacks
+
+  /// to be used as: logger.callback = Logger::to_stderr;
+  static void to_stderr(const std::string& s) {
+    std::fprintf(stderr, "%s\n", s.c_str());
+  }
+  /// to be used as: logger.callback = Logger::to_stdout;
+  static void to_stdout(const std::string& s) {
+    std::fprintf(stdout, "%s\n", s.c_str());
+  }
+};
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,600 @@
+// Copyright 2019 Global Phasing Ltd.
+//
+// MTZ reflection file format.
+
+#ifndef GEMMI_MTZ_HPP_
+#define GEMMI_MTZ_HPP_
+
+#include <cassert>
+#include <cmath>         // for isnan
+#include <cstdint>       // for int32_t
+#include <algorithm>     // for copy
+#include <array>
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "fail.hpp"      // for fail
+#include "input.hpp"     // for AnyStream, FileStream, CharArray
+#include "iterator.hpp"  // for StrideIter
+#include "logger.hpp"    // for Logger
+#include "math.hpp"      // for rad, Mat33
+#include "symmetry.hpp"  // for find_spacegroup_by_name, SpaceGroup
+#include "unitcell.hpp"  // for UnitCell
+#include "util.hpp"      // for ialpha4_id, rtrim_str, ialpha3_id, ...
+
+namespace gemmi {
+
+// Unmerged MTZ files always store in-asu hkl indices and symmetry operation
+// encoded in the M/ISYM column. This helper moves a reflection into the ASU
+// and reports the matching ISYM value; it is meant for writing such files.
+struct UnmergedHklMover {
+  // A null spacegroup is accepted; the list of operations is then left
+  // default-constructed.
+  UnmergedHklMover(const SpaceGroup* spacegroup) : asu_(spacegroup) {
+    if (spacegroup != nullptr)
+      group_ops_ = spacegroup->operations();
+  }
+
+  // Replaces hkl with the value returned by ReciprocalAsu::to_asu() and
+  // returns the ISYM value to be stored in M/ISYM.
+  int move_to_asu(std::array<int, 3>& hkl) {
+    std::pair<Miller, int> moved = asu_.to_asu(hkl, group_ops_);
+    hkl = moved.first;
+    return moved.second;
+  }
+
+private:
+  ReciprocalAsu asu_;
+  GroupOps group_ops_;
+};
+
+// Header-level information of an MTZ file; base class of Mtz.
+// NOTE(review): most members presumably mirror MTZ header records
+// (TITLE, SORT, VALM, ...) -- confirm against the reader in mtz.cpp.
+struct MtzMetadata {
+  std::string source_path;  // input file path, if known
+  bool same_byte_order = true;  // flipped by Mtz::toggle_endianness()
+  bool indices_switched_to_original = false;
+  std::int64_t header_offset = 0;  // byte-swapped by Mtz::toggle_endianness()
+  std::string version_stamp;
+  std::string title;
+  int nreflections = 0;  // number of rows expected in Mtz::data (see Mtz::has_data())
+  std::array<int, 5> sort_order = {};
+  // 1/d^2 limits; Mtz::resolution_low()/resolution_high() are derived from them
+  double min_1_d2 = NAN;
+  double max_1_d2 = NAN;
+  float valm = NAN;
+  int nsymop = 0;
+  UnitCell cell;
+  int spacegroup_number = 0;
+  std::string spacegroup_name;
+  std::vector<Op> symops;
+  const SpaceGroup* spacegroup = nullptr;  // non-owning; may be null
+  std::vector<std::string> history;
+  std::string appended_text;
+  // used to report non-critical problems when reading a file (also used in mtz2cif)
+  Logger logger;
+};
+
+/// In-memory MTZ file: header data (inherited from MtzMetadata), datasets,
+/// columns, batch headers and the reflection data.
+/// `data` holds nreflections rows of columns.size() floats each.
+struct GEMMI_DLL Mtz : public MtzMetadata {
+  /// One dataset. It is filled by aggregate initialization in add_base()
+  /// and add_dataset(), so the order of members matters.
+  struct Dataset {
+    int id;
+    std::string project_name;
+    std::string crystal_name;
+    std::string dataset_name;
+    UnitCell cell;
+    double wavelength;  // 0 means not set
+  };
+
+  /// Description of one column together with accessors to its values.
+  /// Values are interleaved in parent->data: element n of this column
+  /// is parent->data[idx + n * stride()].
+  struct Column {
+    int dataset_id;
+    char type;
+    std::string label;
+    float min_value = NAN;
+    float max_value = NAN;
+    std::string source;  // from COLSRC
+    Mtz* parent;         // non-owning; re-pointed when Mtz is copied or moved
+    std::size_t idx;     // index of this column in parent->columns
+
+    Dataset& dataset() { return parent->dataset(dataset_id); }
+    const Dataset& dataset() const { return parent->dataset(dataset_id); }
+    bool has_data() const { return parent->has_data(); }
+    /// Number of values in the column, or 0 if the parent has no data.
+    int size() const { return has_data() ? parent->nreflections : 0; }
+    /// Distance (in floats) between consecutive values of this column.
+    size_t stride() const { return parent->columns.size(); }
+    // operator[] is unchecked, at() is bounds-checked by std::vector::at()
+    float& operator[](std::size_t n) { return parent->data[idx + n * stride()]; }
+    float operator[](std::size_t n) const { return parent->data[idx + n * stride()]; }
+    float& at(std::size_t n) { return parent->data.at(idx + n * stride()); }
+    float at(std::size_t n) const { return parent->data.at(idx + n * stride()); }
+    bool is_integer() const {
+      return type == 'H' || type == 'B' || type == 'Y' || type == 'I';
+    }
+
+    /// Returns the column that directly follows this one if it belongs to
+    /// the same dataset and has type next_type, otherwise nullptr.
+    const Column* get_next_column_if_type(char next_type) const {
+      if (idx + 1 < parent->columns.size()) {
+        const Column& next_col = parent->columns[idx + 1];
+        if (next_col.dataset_id == dataset_id && next_col.type == next_type)
+          return &next_col;
+      }
+      return nullptr;
+    }
+
+    using iterator = StrideIter<float>;
+    iterator begin() {
+      assert(parent);
+      assert(&parent->columns[idx] == this);
+      return iterator({parent->data.data(), idx, stride()});
+    }
+    iterator end() {
+      return iterator({parent->data.data() + parent->data.size(), idx,
+                       stride()});
+    }
+    using const_iterator = StrideIter<const float>;
+    // the const overloads reuse the non-const ones; the result is converted
+    // to const_iterator
+    const_iterator begin() const { return const_cast<Column*>(this)->begin(); }
+    const_iterator end() const { return const_cast<Column*>(this)->end(); }
+  };
+
+  /// Batch header: a number, a title, 29 ints, 156 floats and axis names.
+  struct Batch {
+    Batch() {
+      ints.resize(29, 0);
+      floats.resize(156, 0.);
+      // write the same values that are written by CCP4 progs such as COMBAT
+      ints[0] = 29 + 156;
+      ints[1] = 29;
+      ints[2] = 156;
+      // COMBAT sets BSCALE=1, but Pointless sets it to 0.
+      //floats[43] = 1.f; // batch scale
+    }
+    int number = 0;
+    std::string title;
+    std::vector<int> ints;
+    std::vector<float> floats;
+    std::vector<std::string> axes;
+
+    /// Unit cell built from floats[0..5] (a, b, c, alpha, beta, gamma).
+    UnitCell get_cell() const {
+      return UnitCell(floats[0], floats[1], floats[2],
+                      floats[3], floats[4], floats[5]);
+    }
+    /// Stores the six cell parameters (narrowed to float) in floats[0..5].
+    void set_cell(const UnitCell& uc) {
+      floats[0] = (float) uc.a;
+      floats[1] = (float) uc.b;
+      floats[2] = (float) uc.c;
+      floats[3] = (float) uc.alpha;
+      floats[4] = (float) uc.beta;
+      floats[5] = (float) uc.gamma;
+    }
+
+    int dataset_id() const { return ints[20]; }
+    void set_dataset_id(int id) { ints[20] = id; }
+    float wavelength() const { return floats[86]; }
+    void set_wavelength(float lambda) { floats[86] = lambda; }
+    float phi_start() const { return floats[36]; }
+    float phi_end() const { return floats[37]; }
+    /// 3x3 matrix from floats[6..14]; consecutive floats go down a column.
+    Mat33 matrix_U() const {
+      return Mat33(floats[6], floats[9],  floats[12],
+                   floats[7], floats[10], floats[13],
+                   floats[8], floats[11], floats[14]);
+    }
+  };
+
+  std::vector<Dataset> datasets;
+  std::vector<Column> columns;
+  std::vector<Batch> batches;
+  std::vector<float> data;  // row-major: nreflections x columns.size()
+
+  /// If with_base is true, dataset HKL_base and columns H, K, L are added.
+  explicit Mtz(bool with_base=false) {
+    if (with_base)
+      add_base();
+  }
+  Mtz(Mtz&& o) noexcept { *this = std::move(o); }
+  Mtz& operator=(Mtz&& o) noexcept {
+    MtzMetadata::operator=(std::move(o));
+    datasets = std::move(o.datasets);
+    columns = std::move(o.columns);
+    batches = std::move(o.batches);
+    data = std::move(o.data);
+    // columns keep a back-pointer to their Mtz, it must follow the move
+    for (Mtz::Column& col : columns)
+      col.parent = this;
+    return *this;
+  }
+
+  // explicit to be aware where we make copies
+  explicit Mtz(const Mtz& o) : MtzMetadata(o) {
+    datasets = o.datasets;
+    columns = o.columns;
+    batches = o.batches;
+    data = o.data;
+    // copied columns still point to `o`; re-point them to this object
+    for (Mtz::Column& col : columns)
+      col.parent = this;
+  }
+
+  Mtz& operator=(Mtz const&) = delete;
+
+  /// Adds dataset 0 (HKL_base) and the three index columns H, K, L (type 'H').
+  void add_base() {
+    datasets.push_back({0, "HKL_base", "HKL_base", "HKL_base", cell, 0.});
+    for (int i = 0; i != 3; ++i)
+      add_column(std::string(1, "HKL"[i]), 'H', 0, i, false);
+  }
+
+  // Functions to use after MTZ headers (and data) is read.
+
+  double resolution_high() const { return std::sqrt(1.0 / max_1_d2); }
+  double resolution_low() const  { return std::sqrt(1.0 / min_1_d2); }
+
+  /// Returns the cell of the dataset with the given ID if that cell is set
+  /// (is_crystal() and a > 0), otherwise the global cell.
+  UnitCell& get_cell(int dataset=-1) {
+    for (Dataset& ds : datasets)
+      if (ds.id == dataset && ds.cell.is_crystal() && ds.cell.a > 0)
+        return ds.cell;
+    return cell;
+  }
+
+  const UnitCell& get_cell(int dataset=-1) const {
+    return const_cast<Mtz*>(this)->get_cell(dataset);
+  }
+
+  /// Sets both the global cell and the cell of every dataset.
+  void set_cell_for_all(const UnitCell& new_cell) {
+    cell = new_cell;
+    cell.set_cell_images_from_spacegroup(spacegroup);  // probably not needed
+    for (Dataset& ds : datasets)
+      ds.cell = cell;
+  }
+
+  UnitCellParameters get_average_cell_from_batch_headers(double* rmsd) const;
+
+  /// Sets spacegroup together with its number and name;
+  /// nullptr resets them to 0 and an empty string.
+  void set_spacegroup(const SpaceGroup* new_sg) {
+    spacegroup = new_sg;
+    spacegroup_number = new_sg ? spacegroup->ccp4 : 0;
+    spacegroup_name = new_sg ? spacegroup->hm : "";
+  }
+
+  /// Returns the last dataset; calls fail() if there are no datasets.
+  Dataset& last_dataset() {
+    if (datasets.empty())
+      fail("MTZ dataset not found (missing DATASET header line?).");
+    return datasets.back();
+  }
+
+  /// Returns the dataset with the given ID; calls fail() if it is absent.
+  Dataset& dataset(int id) {
+    // fast path: the ID is often equal to the position in the vector
+    if ((size_t)id < datasets.size() && datasets[id].id == id)
+      return datasets[id];
+    for (Dataset& d : datasets)
+      if (d.id == id)
+        return d;
+    fail("MTZ file has no dataset with ID " + std::to_string(id));
+  }
+  const Dataset& dataset(int id) const {
+    return const_cast<Mtz*>(this)->dataset(id);
+  }
+
+  /// Returns the first dataset with the given dataset_name, or nullptr.
+  Dataset* dataset_with_name(const std::string& name) {
+    for (Dataset& d : datasets)
+      if (d.dataset_name == name)
+        return &d;
+    return nullptr;
+  }
+  const Dataset* dataset_with_name(const std::string& label) const {
+    return const_cast<Mtz*>(this)->dataset_with_name(label);
+  }
+
+  /// Number of columns with the given label.
+  int count(const std::string& label) const {
+    int n = 0;
+    for (const Column& col : columns)
+      if (col.label == label)
+        ++n;
+    return n;
+  }
+
+  /// Number of columns with the given type.
+  int count_type(char type) const {
+    int n = 0;
+    for (const Column& col : columns)
+      if (col.type == type)
+        ++n;
+    return n;
+  }
+
+  /// Returns the first column with the given label, optionally restricted
+  /// to dataset ds and to a column type ('*' matches any type), or nullptr.
+  Column* column_with_label(const std::string& label, const Dataset* ds=nullptr, char type='*') {
+    for (Column& col : columns)
+      if (col.label == label && (!ds || ds->id == col.dataset_id)
+                             && (type == '*' || type == col.type))
+        return &col;
+    return nullptr;
+  }
+  const Column* column_with_label(const std::string& label, const Dataset* ds=nullptr,
+                                  char type='*') const {
+    return const_cast<Mtz*>(this)->column_with_label(label, ds, type);
+  }
+
+  /// Like column_with_label(), but calls fail() instead of returning nullptr.
+  const Column& get_column_with_label(const std::string& label, const Dataset* ds=nullptr) const {
+    if (const Column* col = column_with_label(label, ds))
+      return *col;
+    fail("Column label not found: " + label);
+  }
+
+  std::vector<const Column*> columns_with_type(char type) const {
+    std::vector<const Column*> cols;
+    for (const Column& col : columns)
+      if (col.type == type)
+        cols.push_back(&col);
+    return cols;
+  }
+
+  std::vector<int> positions_of_columns_with_type(char col_type) const {
+    std::vector<int> cols;
+    for (int i = 0; i < (int) columns.size(); ++i)
+      if (columns[i].type == col_type)
+        cols.push_back(i);
+    return cols;
+  }
+
+  // F(+)/(-) pairs should have type G (and L for sigma),
+  // I(+)/(-) -- K (M for sigma), but E(+)/(-) has no special column type,
+  // so here we use column labels not types.
+  /// Returns pairs (position of X(+), position of X(-)) for columns that
+  /// have the same type and dataset and labels differing only in the sign.
+  std::vector<std::pair<int,int>> positions_of_plus_minus_columns() const {
+    std::vector<std::pair<int,int>> r;
+    for (int i = 0; i < (int) columns.size(); ++i) {
+      const Column& col = columns[i];
+      size_t sign_pos = col.label.find("(+)");
+      if (sign_pos != std::string::npos) {
+        std::string minus_label = columns[i].label;
+        minus_label[sign_pos+1] = '-';  // "(+)" -> "(-)"
+        for (int j = 0; j < (int) columns.size(); ++j)
+          if (columns[j].label == minus_label &&
+              columns[j].type == col.type &&
+              columns[j].dataset_id == col.dataset_id) {
+            r.emplace_back(i, j);
+            break;
+          }
+      }
+    }
+    return r;
+  }
+
+  /// the order of labels matters
+  const Column* column_with_one_of_labels(std::initializer_list<const char*> labels,
+                                          char type='*') const {
+    for (const char* label : labels)
+      if (const Column* col = column_with_label(label, nullptr, type))
+        return col;
+    return nullptr;
+  }
+
+  /// the order of labels doesn't matter
+  Column* column_with_type_and_any_of_labels(char type, std::initializer_list<const char*> labels) {
+    for (Column& col : columns)
+      if (col.type == type) {
+        for (const char* label : labels)
+          if (col.label == label)
+            return &col;
+      }
+    return nullptr;
+  }
+
+  /// First column of type 'I' with one of the usual free-flag labels, or nullptr.
+  Column* rfree_column() {
+    // cf. MtzToCif::default_spec in mtz2cif.hpp
+    return column_with_type_and_any_of_labels('I',
+        {"FREE", "RFREE", "FREER", "FreeR_flag", "R-free-flags", "FreeRflag", "R_FREE_FLAGS"});
+  }
+  const Column* rfree_column() const {
+    return const_cast<Mtz*>(this)->rfree_column();
+  }
+
+  /// First column of type 'J' with one of the usual mean-intensity labels, or nullptr.
+  Column* imean_column() {
+    return column_with_type_and_any_of_labels('J', {"IMEAN", "I", "IOBS", "I-obs"});
+  }
+  const Column* imean_column() const {
+    return const_cast<Mtz*>(this)->imean_column();
+  }
+
+  /// First column of type 'K' with one of the usual I(+) labels, or nullptr.
+  Column* iplus_column() {
+    return column_with_type_and_any_of_labels('K', {"I(+)", "IOBS(+)", "I-obs(+)", "Iplus"});
+  }
+  const Column* iplus_column() const {
+    return const_cast<Mtz*>(this)->iplus_column();
+  }
+
+  /// First column of type 'K' with one of the usual I(-) labels, or nullptr.
+  Column* iminus_column() {
+    return column_with_type_and_any_of_labels('K', {"I(-)", "IOBS(-)", "I-obs(-)", "Iminus"});
+  }
+  const Column* iminus_column() const {
+    return const_cast<Mtz*>(this)->iminus_column();
+  }
+
+  /// True if the size of data agrees with the number of columns and reflections.
+  bool has_data() const {
+    return data.size() == columns.size() * nreflections;
+  }
+
+  bool is_merged() const { return batches.empty(); }
+
+  /// Calculates min/max for all combinations of reflections and unit cells,
+  /// where unit cells are a global CELL and per-dataset DCELL.
+  std::array<double,2> calculate_min_max_1_d2() const;
+
+  /// Recalculates min_1_d2 and max_1_d2 with calculate_min_max_1_d2().
+  void update_reso() {
+    std::array<double,2> reso = calculate_min_max_1_d2();
+    min_1_d2 = reso[0];
+    max_1_d2 = reso[1];
+  }
+
+  // Functions for reading MTZ headers and data.
+
+  void toggle_endianness() {
+    same_byte_order = !same_byte_order;
+    swap_eight_bytes(&header_offset);
+  }
+
+  void read_first_bytes(AnyStream& stream);
+
+  /// read headers until END
+  void read_main_headers(AnyStream& stream, std::vector<std::string>* save_headers);
+
+  /// read the part between END and MTZENDOFHEADERS
+  void read_history_and_batch_headers(AnyStream& stream);
+
+  void setup_spacegroup();
+
+  void read_raw_data(AnyStream& stream, bool do_read=true);
+
+  void read_all_headers(AnyStream& stream);
+
+  void read_stream(AnyStream& stream, bool with_data);
+
+  /// Reads headers and data from a file. A std::runtime_error raised while
+  /// reading is re-reported through fail() with the path appended.
+  void read_file(const std::string& path) {
+    try {
+      source_path = path;
+      FileStream stream(path.c_str(), "rb");
+      read_stream(stream, true);
+    } catch (std::system_error&) {
+      throw;  // system_error::what() includes path, don't add anything
+    } catch (std::runtime_error& e) {
+      fail(std::string(e.what()) + ": " + path);
+    }
+  }
+
+  /// Reads from any input that provides path() and create_stream().
+  template<typename Input>
+  void read_input(Input&& input, bool with_data) {
+    source_path = input.path();
+    read_stream(*input.create_stream(), with_data);
+  }
+
+  /// the same as read_input(MaybeGzipped(path), with_data)
+  void read_file_gz(const std::string& path, bool with_data=true);
+
+  std::vector<int> sorted_row_indices(int use_first=3) const;
+  bool sort(int use_first=3);
+
+  /// Miller indices from three consecutive floats starting at data[offset]
+  /// (unchecked access).
+  Miller get_hkl(size_t offset) const {
+    return {{(int)data[offset], (int)data[offset+1], (int)data[offset+2]}};
+  }
+  /// Writes hkl into three consecutive floats starting at data[offset]
+  /// (unchecked access).
+  void set_hkl(size_t offset, const Miller& hkl) {
+    for (int i = 0; i != 3; ++i)
+      data[offset + i] = static_cast<float>(hkl[i]);
+  }
+
+  /// Returns offset of the first hkl or (size_t)-1. Can be slow.
+  size_t find_offset_of_hkl(const Miller& hkl, size_t start=0) const;
+
+  /// (for merged MTZ only) change HKL to ASU equivalent, adjust phases, etc
+  void ensure_asu(bool tnt_asu=false);
+
+  /// Reindex data, usually followed by ensure_asu(). Outputs messages through logger.
+  void reindex(const Op& op);
+
+  /// Change symmetry to P1 and expand reflections. Does not sort.
+  /// Similar to command EXPAND in SFTOOLS.
+  void expand_to_p1();
+
+  /// (for unmerged MTZ only) change HKL according to M/ISYM
+  bool switch_to_original_hkl();
+
+  /// (for unmerged MTZ only) change HKL to ASU equivalent and set ISYM
+  bool switch_to_asu_hkl();
+
+  /// Appends a dataset whose project, crystal and dataset names are all
+  /// `name`. Its ID is one more than the highest ID in use (0 if none),
+  /// the cell is the global cell and the wavelength is unset.
+  Dataset& add_dataset(const std::string& name) {
+    int id = 0;
+    for (const Dataset& d : datasets)
+      if (d.id >= id)
+        id = d.id + 1;
+    datasets.push_back({id, name, name, name, cell, 0.0});
+    return datasets.back();
+  }
+
+  Column& add_column(const std::string& label, char type,
+                     int dataset_id, int pos, bool expand_data);
+
+  // trailing_cols refers to columns right after src_col that are also copied.
+  Column& replace_column(size_t dest_idx, const Column& src_col,
+                         const std::vector<std::string>& trailing_cols={});
+
+  // If dest_idx < 0, the new column(s) are appended at the end;
+  // otherwise existing ones are overwritten.
+  Column& copy_column(int dest_idx, const Column& src_col,
+                      const std::vector<std::string>& trailing_cols={});
+
+  void remove_column(size_t idx);
+
+  /// Removes every row for which condition(pointer to the first value of
+  /// the row) is true, compacting data in place and updating nreflections.
+  /// Calls fail() if has_data() is false.
+  template <typename Func>
+  void remove_rows_if(Func condition) {
+    if (!has_data())
+      fail("No data.");
+    size_t width = columns.size();
+    // Without columns has_data() implies that data is empty: there is
+    // nothing to remove and the row count below would divide by zero.
+    if (width == 0)
+      return;
+    auto out = data.begin();
+    for (auto r = data.begin(); r < data.end(); r += width)
+      if (!condition(&*r)) {
+        if (r != out)
+          std::copy(r, r + width, out);
+        out += width;
+      }
+    data.erase(out, data.end());
+    nreflections = int(data.size() / width);
+  }
+
+  /// Makes room in data for `added` columns that are already present in
+  /// `columns`. New values are NAN. pos_ is the position of the first new
+  /// column in a row; -1 means after the existing values.
+  void expand_data_rows(size_t added, int pos_=-1) {
+    size_t old_row_size = columns.size() - added;
+    if (data.size() != old_row_size * nreflections)
+      fail("Internal error");
+    size_t pos = pos_ == -1 ? old_row_size : (size_t) pos_;
+    if (pos > old_row_size)
+      fail("expand_data_rows(): pos out of range");
+    vector_insert_columns(data, old_row_size, (size_t)nreflections, added, pos, NAN);
+  }
+
+  /// Replaces data with a copy of n floats and sets nreflections accordingly.
+  /// n must be a multiple of the number of columns, and columns must
+  /// already be defined; otherwise fail() is called.
+  void set_data(const float* new_data, size_t n) {
+    size_t ncols = columns.size();
+    // guard against n % 0, which is undefined behaviour
+    if (ncols == 0)
+      fail("Mtz.set_data(): no columns defined.");
+    if (n % ncols != 0)
+      fail("Mtz.set_data(): expected " + std::to_string(ncols) + " columns.");
+    nreflections = int(n / ncols);
+    data.assign(new_data, new_data + n);
+  }
+
+  // Function for writing MTZ file
+  void write_to_cstream(std::FILE* stream) const;
+  void write_to_string(std::string& str) const;
+  void write_to_file(const std::string& path) const;
+  size_t size_to_write() const;
+  size_t write_to_buffer(char* buf, size_t maxlen) const;
+
+private:
+  template<typename Write> void write_to_stream(Write write) const;
+};
+
+
+// Reads an MTZ file (headers and data) with Mtz::read_file() and returns it.
+inline Mtz read_mtz_file(const std::string& path) {
+  Mtz loaded;
+  loaded.read_file(path);
+  return loaded;
+}
+
+// Reads MTZ from an input object accepted by Mtz::read_input();
+// with_data is passed through to it.
+template<typename Input>
+Mtz read_mtz(Input&& input, bool with_data) {
+  Mtz loaded;
+  loaded.read_input(std::forward<Input>(input), with_data);
+  return loaded;
+}
+
+// Abstraction of data source, cf. ReflnDataProxy.
+// Read-only view of the data stored in Mtz::data. It must stay an aggregate:
+// it is created with brace initialization (see data_proxy()).
+struct MtzDataProxy {
+  const Mtz& mtz_;
+  using num_type = float;
+  // number of values per row
+  size_t stride() const { return mtz_.columns.size(); }
+  // total number of values
+  size_t size() const { return mtz_.data.size(); }
+  float get_num(size_t n) const { return mtz_.data[n]; }
+  const UnitCell& unit_cell() const { return mtz_.cell; }
+  const SpaceGroup* spacegroup() const { return mtz_.spacegroup; }
+  Miller get_hkl(size_t offset) const { return mtz_.get_hkl(offset); }
+
+  // Position of the column with the given label; fail() if it is absent.
+  size_t column_index(const std::string& label) const {
+    const Mtz::Column* found = mtz_.column_with_label(label);
+    if (found == nullptr)
+      fail("MTZ file has no column with label: " + label);
+    return found->idx;
+  }
+};
+
+// Like MtzDataProxy, but here the data is stored outside of the Mtz class:
+// the Mtz object provides the columns and the row count, data_ the values.
+struct MtzExternalDataProxy : MtzDataProxy {
+  const float* data_;  // non-owning
+  MtzExternalDataProxy(const Mtz& mtz, const float* data)
+    : MtzDataProxy{mtz}, data_(data) {}
+  // the expected number of values, calculated from the Mtz headers
+  size_t size() const { return mtz_.columns.size() * mtz_.nreflections; }
+  float get_num(size_t n) const { return data_[n]; }
+  Miller get_hkl(size_t offset) const {
+    const float* row = data_ + offset;
+    return {{(int)row[0], (int)row[1], (int)row[2]}};
+  }
+};
+
+// Wraps mtz in a MtzDataProxy; the proxy refers to mtz and doesn't copy it.
+inline MtzDataProxy data_proxy(const Mtz& mtz) { return MtzDataProxy{mtz}; }
+
+} // namespace gemmi
+
+#endif
@@ -0,0 +1,80 @@
+// Copyright 2017 Global Phasing Ltd.
+//
+// interface to stb_sprintf: snprintf_z, to_str(float|double)
+
+#ifndef GEMMI_SPRINTF_HPP_
+#define GEMMI_SPRINTF_HPP_
+
+#include <string>
+#ifdef __has_include
+# if __has_include(<charconv>) && !(defined(_MSVC_LANG) && _MSVC_LANG < 201703L)
+#  include <charconv>
+# endif
+#endif
+
+#if __cpp_lib_to_chars < 201611L
+# include <algorithm> // for min
+#endif
+
+#include "fail.hpp"  // for GEMMI_DLL
+
+namespace gemmi {
+
+// On MinGW format(printf) doesn't support %zu.
+#if (defined(__GNUC__) && !defined(__MINGW32__)) || defined(__clang__)
+# define GEMMI_ATTRIBUTE_FORMAT(fmt,va) __attribute__((format(printf,fmt,va)))
+#else
+# define GEMMI_ATTRIBUTE_FORMAT(fmt,va)
+#endif
+/// stb_snprintf in gemmi namespace - like snprintf, but ignores locale
+/// and is always zero-terminated (hence _z).
+GEMMI_DLL int snprintf_z(char *buf, int count, char const *fmt, ...)
+                                                         GEMMI_ATTRIBUTE_FORMAT(3,4);
+/// stb_sprintf in gemmi namespace
+GEMMI_DLL int sprintf_z(char *buf, char const *fmt, ...) GEMMI_ATTRIBUTE_FORMAT(2,3);
+
+/// Formats d with "%.9g" (locale-independent, through sprintf_z).
+inline std::string to_str(double d) {
+  char buf[24];
+  const int n = sprintf_z(buf, "%.9g", d);
+  if (n <= 0)
+    return std::string();
+  return std::string(buf, n);
+}
+
+/// Formats d with "%.6g" (locale-independent, through sprintf_z).
+inline std::string to_str(float d) {
+  char buf[16];
+  const int n = sprintf_z(buf, "%.6g", d);
+  if (n <= 0)
+    return std::string();
+  return std::string(buf, n);
+}
+
+/// Formats d with Prec digits after the decimal point ("%.*f") if |d| < 1e8,
+/// and with "%g" otherwise (which also covers NaN and infinity).
+/// Prec must be in the range 0..6.
+template<int Prec>
+std::string to_str_prec(double d) {
+  static_assert(Prec >= 0 && Prec < 7, "unsupported precision");
+  // The longest "%.*f" output is 17 characters: sign, 9 integer digits
+  // (a value just below 1e8 can be rounded up to 100000000), the decimal
+  // point and 6 decimals. With the terminating NUL it didn't fit into the
+  // previous 16-byte buffer, so the buffer is larger and the write is bounded.
+  char buf[24];
+  const int size = static_cast<int>(sizeof(buf));
+  int len = d > -1e8 && d < 1e8 ? snprintf_z(buf, size, "%.*f", Prec, d)
+                                : snprintf_z(buf, size, "%g", d);
+  // defensive: never read past the buffer, in case snprintf_z reports
+  // the untruncated length
+  if (len >= size)
+    len = size - 1;
+  return std::string(buf, len > 0 ? len : 0);
+}
+
+/// zero-terminated to_chars()
+/// Writes value as decimal digits into [first, last), always leaving room
+/// for the terminating NUL. Returns a pointer to that NUL.
+inline char* to_chars_z(char* first, char* last, int value) {
+#if __cpp_lib_to_chars >= 201611L
+  // the last byte is reserved for the terminator
+  char* const stop = std::to_chars(first, last-1, value).ptr;
+  *stop = '\0';
+  return stop;
+#else
+  const int written = snprintf_z(first, int(last - first), "%d", value);
+  return std::min(first + written, last - 1);
+#endif
+}
+/// The same as above, for size_t.
+inline char* to_chars_z(char* first, char* last, size_t value) {
+#if __cpp_lib_to_chars >= 201611L
+  // the last byte is reserved for the terminator
+  char* const stop = std::to_chars(first, last-1, value).ptr;
+  *stop = '\0';
+  return stop;
+#else
+  const int written = snprintf_z(first, int(last - first), "%zu", value);
+  return std::min(first + written, last - 1);
+#endif
+}
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,315 @@
+// Copyright 2017 Global Phasing Ltd.
+//
+// Utilities. Mostly for working with strings and vectors.
+
+#ifndef GEMMI_UTIL_HPP_
+#define GEMMI_UTIL_HPP_
+
+#include <cassert>
+#include <cctype>     // for isspace
+#include <cstring>    // for strncmp
+#include <algorithm>  // for equal, find, remove_if
+#include <iterator>   // for begin, end, make_move_iterator
+#include <string>
+#include <vector>
+
+namespace gemmi {
+
+//   #####   string helpers   #####
+
+// append_to_str() appends one value to a string. Integers are converted
+// with std::to_string, doubles are rejected at compile time (deleted
+// overload), anything else must be usable with std::string::operator+=.
+inline void append_to_str(std::string& out, int v) { out.append(std::to_string(v)); }
+inline void append_to_str(std::string& out, size_t v) { out.append(std::to_string(v)); }
+void append_to_str(std::string& out, double) = delete;
+template<typename T>
+void append_to_str(std::string& out, const T& v) { out += v; }
+
+// cat_to() appends all the arguments, in order, to `out`.
+inline void cat_to(std::string&) {}
+template <typename T, typename... Args>
+void cat_to(std::string& out, const T& first, Args const&... rest) {
+  append_to_str(out, first);
+  cat_to(out, rest...);
+}
+// cat() returns all the arguments concatenated into a new string.
+template <class... Args>
+std::string cat(Args const&... args) {
+  std::string joined;
+  cat_to(joined, args...);
+  return joined;
+}
+
+// True if str begins with prefix (an empty prefix always matches).
+inline bool starts_with(const std::string& str, const std::string& prefix) {
+  const size_t n = prefix.length();
+  if (str.length() < n)
+    return false;
+  return str.compare(0, n, prefix) == 0;
+}
+
+// The same for a C string and a string literal (N includes the final NUL).
+template<size_t N> bool starts_with(const char* a, const char (&b)[N]) {
+  return std::strncmp(a, b, N-1) == 0;
+}
+
+// True if str ends with suffix (an empty suffix always matches).
+inline bool ends_with(const std::string& str, const std::string& suffix) {
+  const size_t n = suffix.length();
+  if (str.length() < n)
+    return false;
+  return str.compare(str.length() - n, n, suffix) == 0;
+}
+
+// can be faster than std::tolower() b/c it takes char not int
+// Only A-Z are changed, regardless of locale.
+inline char lower(char c) {
+  return (c >= 'A' && c <= 'Z') ? char(c | 0x20) : c;
+}
+
+// works as expected only for a-zA-Z
+inline char alpha_up(char c) { return char(c & ~0x20); }
+
+// Returns a copy of str with A-Z changed to lowercase.
+inline std::string to_lower(std::string str) {
+  for (char& c : str)
+    c = lower(c);
+  return str;
+}
+
+// Returns a copy of str with a-z changed to uppercase.
+inline std::string to_upper(std::string str) {
+  for (char& c : str)
+    if (c >= 'a' && c <= 'z')
+      c = alpha_up(c);
+  return str;
+}
+
+// case-insensitive character comparison
+inline bool isame(char a, char b) {
+  if (a == b)
+    return true;
+  // a letter and its other-case variant differ only in bit 0x20
+  if ((a ^ b) != 0x20)
+    return false;
+  const int folded = a | 0x20;
+  return folded >= 'a' && folded <= 'z';
+}
+
+// Case-insensitive comparisons. The second arg must be lowercase.
+
+// True if str, starting from offset, is equal to low (ignoring case in str).
+inline bool iequal_from(const std::string& str, size_t offset, const std::string& low) {
+  if (str.length() != low.length() + offset)
+    return false;
+  for (size_t i = 0; i != low.length(); ++i)
+    if (low[i] != lower(str[offset + i]))
+      return false;
+  return true;
+}
+
+inline bool iequal(const std::string& str, const std::string& low) {
+  return iequal_from(str, 0, low);
+}
+
+inline bool istarts_with(const std::string& str, const std::string& prefix) {
+  if (str.length() < prefix.length())
+    return false;
+  for (size_t i = 0; i != prefix.length(); ++i)
+    if (prefix[i] != lower(str[i]))
+      return false;
+  return true;
+}
+inline bool iends_with(const std::string& str, const std::string& suffix) {
+  const size_t sl = suffix.length();
+  if (str.length() < sl)
+    return false;
+  const size_t shift = str.length() - sl;
+  for (size_t i = 0; i != sl; ++i)
+    if (suffix[i] != lower(str[shift + i]))
+      return false;
+  return true;
+}
+
+// Like iends_with(), but the suffix may be followed by ".gz".
+inline bool giends_with(const std::string& str, const std::string& suffix) {
+  if (iends_with(str, suffix))
+    return true;
+  return iends_with(str, suffix + ".gz");
+}
+
+// Returns str without leading and trailing whitespace (space, \r, \n, \t).
+inline std::string trim_str(const std::string& str) {
+  const char* const ws = " \r\n\t";
+  const std::string::size_type first = str.find_first_not_of(ws);
+  if (first == std::string::npos)
+    return std::string{};
+  // a non-whitespace character exists, so `last` is valid and >= first
+  const std::string::size_type last = str.find_last_not_of(ws);
+  return str.substr(first, last + 1 - first);
+}
+
+// Returns str without trailing whitespace (space, \r, \n, \t).
+inline std::string rtrim_str(const std::string& str) {
+  const std::string::size_type last = str.find_last_not_of(" \r\n\t");
+  if (last == std::string::npos)
+    return std::string();
+  return str.substr(0, last + 1);
+}
+
+// Returns a pointer just past the last non-whitespace character of the
+// string that begins at start, i.e. the new end after right-trimming.
+// end is after the last character of the string (typically \0);
+// if it is null, the string must be NUL-terminated.
+// Returns nullptr if start is null.
+inline const char* rtrim_cstr(const char* start, const char* end=nullptr) {
+  if (!start)
+    return nullptr;
+  if (!end)
+    end = start + std::strlen(start);
+  // std::isspace() has undefined behavior for negative values other than
+  // EOF, so a (possibly signed) char must be converted to unsigned char.
+  while (end > start && std::isspace(static_cast<unsigned char>(end[-1])))
+    --end;
+  return end;
+}
+
+namespace impl {
+// length of a separator that is either a single char or a string
+inline size_t length(char) { return 1; }
+inline size_t length(const std::string& s) { return s.length(); }
+}
+
+// takes a single separator (usually char or string);
+// may return empty fields
+// The fields are appended to result; at least one field is always added.
+template<typename S>
+void split_str_into(const std::string& str, S sep,
+                    std::vector<std::string>& result) {
+  const std::size_t sep_len = impl::length(sep);
+  std::size_t field_start = 0;
+  for (;;) {
+    const std::size_t hit = str.find(sep, field_start);
+    if (hit == std::string::npos)
+      break;
+    result.emplace_back(str, field_start, hit - field_start);
+    field_start = hit + sep_len;
+  }
+  // the rest of the string after the last separator
+  result.emplace_back(str, field_start);
+}
+
+// Like split_str_into(), but returns the fields in a new vector.
+template<typename S>
+std::vector<std::string> split_str(const std::string& str, S sep) {
+  std::vector<std::string> fields;
+  split_str_into(str, sep, fields);
+  return fields;
+}
+
+// _multi variants take multiple 1-char separators as a string;
+// discard empty fields
+// The fields are appended to result.
+inline void split_str_into_multi(const std::string& str, const char* seps,
+                                 std::vector<std::string>& result) {
+  for (std::size_t pos = str.find_first_not_of(seps); pos != std::string::npos; ) {
+    // stop may be npos - then the field runs to the end of the string
+    // and the next search returns npos, which ends the loop
+    const std::size_t stop = str.find_first_of(seps, pos);
+    result.emplace_back(str, pos, stop - pos);
+    pos = str.find_first_not_of(seps, stop);
+  }
+}
+
+// Like split_str_into_multi(), but returns the fields in a new vector.
+// By default, fields are separated by spaces and tabs.
+inline std::vector<std::string> split_str_multi(const std::string& str,
+                                                const char* seps=" \t") {
+  std::vector<std::string> fields;
+  split_str_into_multi(str, seps, fields);
+  return fields;
+}
+
+// Joins getter(*i) for i in [begin, end), separated with sep.
+template<typename T, typename S, typename F>
+std::string join_str(T begin, T end, const S& sep, const F& getter) {
+  std::string joined;
+  T it = begin;
+  if (it != end) {
+    joined += getter(*it);
+    for (++it; it != end; ++it) {
+      joined += sep;
+      joined += getter(*it);
+    }
+  }
+  return joined;
+}
+
+// Joins strings from the range [begin, end), separated with sep.
+template<typename T, typename S>
+std::string join_str(T begin, T end, const S& sep) {
+  return join_str(begin, end, sep, [](const std::string& t) { return t; });
+}
+
+// Joins getter(item) for all items of a container, separated with sep.
+template<typename T, typename S, typename F>
+std::string join_str(const T& iterable, const S& sep, const F& getter) {
+  return join_str(iterable.begin(), iterable.end(), sep, getter);
+}
+
+// Joins strings from a container, separated with sep.
+template<typename T, typename S>
+std::string join_str(const T& iterable, const S& sep) {
+  return join_str(iterable.begin(), iterable.end(), sep);
+}
+
+// Appends item to str, preceded by sep unless str is empty.
+template<typename T, typename S>
+void string_append_sep(std::string& str, S sep, const T& item) {
+  if (str.empty()) {
+    str += item;
+    return;
+  }
+  str += sep;
+  str += item;
+}
+
+// Replaces, in place, all non-overlapping occurrences of `old` in s
+// with `new_`, scanning from left to right. The inserted text is not
+// searched again, so new_ may contain old.
+// An empty `old` leaves s unchanged.
+inline void replace_all(std::string &s,
+                        const std::string &old, const std::string &new_) {
+  // find() matches an empty string at every position,
+  // so without this check the loop below would never end
+  if (old.empty())
+    return;
+  std::string::size_type pos = 0;
+  while ((pos = s.find(old, pos)) != std::string::npos) {
+    s.replace(pos, old.size(), new_);
+    pos += new_.size();
+  }
+}
+
+// list is a comma separated string (or separated with another char - sep)
+// Returns true if name is equal to one of the items of the list.
+inline bool is_in_list(const std::string& name, const std::string& list,
+                       char sep=',') {
+  // name can't be a proper item of a list that is not longer than name
+  if (name.length() >= list.length())
+    return name == list;
+  size_t start = 0;
+  for (;;) {
+    // if end is npos, compare() takes the rest of the list
+    const size_t end = list.find(sep, start);
+    if (list.compare(start, end - start, name) == 0)
+      return true;
+    if (end == std::string::npos)
+      return false;
+    start = end + 1;
+  }
+}
+
+//   #####   vector helpers   #####
+
+// True if v contains an element equal to x.
+template <class T>
+bool in_vector(const T& x, const std::vector<T>& v) {
+  for (const T& item : v)
+    if (item == x)
+      return true;
+  return false;
+}
+
+// True if predicate f returns true for any element of v.
+template <typename F, typename T>
+bool in_vector_f(F f, const std::vector<T>& v) {
+  return std::any_of(v.begin(), v.end(), f);
+}
+
+// Pointer just past the last element of v.
+template <class T>
+T* vector_end_ptr(std::vector<T>& v) { return v.data() + v.size(); }
+template <class T>
+const T* vector_end_ptr(const std::vector<T>& v) { return v.data() + v.size(); }
+
+// Moves elements of src to the end of dst.
+// If dst is empty, the whole vector is moved instead.
+template <class T>
+void vector_move_extend(std::vector<T>& dst, std::vector<T>&& src) {
+  if (!dst.empty()) {
+    dst.insert(dst.end(), std::make_move_iterator(src.begin()),
+                          std::make_move_iterator(src.end()));
+    return;
+  }
+  dst = std::move(src);
+}
+
+// wrapper around the erase-remove idiom
+template <class T, typename F>
+void vector_remove_if(std::vector<T>& v, F&& condition) {
+  auto new_end = std::remove_if(v.begin(), v.end(), condition);
+  v.erase(new_end, v.end());
+}
+
+/// \par data - 2d array (old_width x length) in a vector
+/// Insert \par n new columns at position pos.
+/// The new elements are set to new_value.
+template <class T>
+void vector_insert_columns(std::vector<T>& data, size_t old_width,
+                           size_t length, size_t n, size_t pos, const T& new_value) {
+  assert(data.size() == old_width * length);
+  assert(pos <= old_width);
+  data.resize(data.size() + n * length);
+  // Rows are rewritten in place from the last element to the first one,
+  // so the write position never overtakes the elements still to be read.
+  size_t write = data.size();
+  for (size_t row = length; row-- != 0; ) {
+    const size_t row_start = row * old_width;
+    for (size_t col = old_width; col-- != pos; )
+      data[--write] = data[row_start + col];
+    for (size_t k = 0; k != n; ++k)
+      data[--write] = new_value;
+    for (size_t col = pos; col-- != 0; )
+      data[--write] = data[row_start + col];
+  }
+  assert(write == 0);
+}
+/// \par data - 2d array with new_width+1 columns, in a vector
+/// Remove column at position pos.
+/// The remaining elements are compacted in place and the vector is shrunk.
+/// Empty data (no rows) is left empty.
+template <class T>
+void vector_remove_column(std::vector<T>& data, size_t new_width, size_t pos) {
+  assert(pos <= new_width);
+  // Without this check the final resize(pos) would grow an empty vector
+  // to `pos` default-constructed elements.
+  if (data.empty())
+    return;
+  // Copy runs of new_width elements that lie between consecutive removed
+  // elements; the outer increment skips the removed element itself.
+  for (size_t source = pos + 1; source < data.size(); ++source)
+    for (size_t i = 0; i < new_width && source < data.size(); ++i)
+      data[pos++] = data[source++];
+  data.resize(pos);
+}
+
+
+//   #####   other helpers   #####
+
+// Numeric ID used for case-insensitive comparison of 4 letters.
+// s must have 4 chars or 3 chars + NUL, ' ' and NUL are equivalent in s.
+// The four bytes are packed into one int and bit 0x20 (the one that
+// differs between upper- and lowercase letters) is cleared in each byte.
+constexpr int ialpha4_id(const char* s) {
+  return ((s[0] << 24) | (s[1] << 16) | (s[2] << 8) | s[3]) & ~0x20202020;
+}
+// Numeric ID used for case-insensitive comparison of 3 letters.
+constexpr int ialpha3_id(const char* s) {
+  return ((s[0] << 16) | (s[1] << 8) | s[2]) & ~0x20202020;
+}
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,183 @@
+// Copyright 2020 Global Phasing Ltd.
+//
+// Read XDS files: XDS_ASCII.HKL and INTEGRATE.HKL.
+
+#ifndef GEMMI_XDS_ASCII_HPP_
+#define GEMMI_XDS_ASCII_HPP_
+
+#include "input.hpp"     // for AnyStream, FileStream
+#include "unitcell.hpp"  // for UnitCell
+#include "util.hpp"      // for starts_with
+
+namespace gemmi {
+
+// Returns true if the wavelength is within the tolerance of one of
+// the recognised lines listed below.
+// from Pointless docs: likely in-house source, in which case
+// the unpolarised value is left unchanged (recognised wavelengths
+// are CuKalpha 1.5418 +- 0.0019, Mo 0.7107 +- 0.0002, Cr 2.29 +- 0.01)
+inline bool likely_in_house_source(double wavelength) {
+  if (std::fabs(wavelength - 1.5418) < 0.0019)
+    return true;
+  if (std::fabs(wavelength - 0.7107) < 0.0002)
+    return true;
+  return std::fabs(wavelength - 2.29) < 0.01;
+}
+
+// Metadata of an XDS file, without the reflection list.
+// It is the base class of XdsAscii, which adds the reflections.
+struct XdsAsciiMetadata {
+  // Per-ISET data; items are looked up by id in XdsAscii::find_or_add_iset().
+  struct Iset {
+    int id;
+    std::string input_file;
+    double wavelength = 0.;
+    std::array<double,6> cell_constants = {0., 0., 0., 0., 0., 0.};
+    //statistics set by gather_iset_statistics()
+    // (-1 is the initial value of each of them)
+    int frame_number_min = -1;
+    int frame_number_max = -1;
+    int frame_count = -1;
+    int reflection_count = -1;
+
+    Iset(int id_) : id(id_) {}
+  };
+  std::string source_path;
+  // XdsAscii::is_merged() tests this value (read_columns < 8)
+  int read_columns = 0;  // doesn't include ITEM_ISET from XSCALE
+  int spacegroup_number = 0;
+  double wavelength = 0.;
+  std::array<double,6> cell_constants = {0., 0., 0., 0., 0., 0.};
+  // rows are the cell axes a, b, c - see XdsAscii::get_orientation();
+  // a row of zeros means the axes are not known (XdsAscii::has_cell_axes())
+  Mat33 cell_axes{0.};
+  Vec3 incident_beam_dir;
+  double oscillation_range = 0.;
+  Vec3 rotation_axis;
+  double starting_angle = 0.;
+  double reflecting_range_esd = 0.;
+  char friedels_law = '\0';
+  int starting_frame = 1;
+  int nx = 0;  // detector size - number of pixels
+  int ny = 0;
+  double qx = 0.;  // pixel size in mm
+  double qy = 0.;
+  double orgx = 0.;
+  double orgy = 0.;
+  double detector_distance = 0.;
+  std::string generated_by;
+  std::string version_str;
+  std::vector<Iset> isets;
+};
+
+// Content of an XDS reflection file: metadata (inherited from
+// XdsAsciiMetadata) and the list of reflections in `data`.
+struct GEMMI_DLL XdsAscii : XdsAsciiMetadata {
+  // One reflection record. Only iset has a default value;
+  // the other members are not initialized here.
+  struct Refl {
+    Miller hkl;
+    int iset = 1;
+    double iobs;
+    double sigma;
+    double xd;
+    double yd;
+    double zd;
+    double rlp;
+    double peak;
+    double corr;  // is it always integer?
+    double maxc;
+
+    // Frame number calculated as floor(zd + 1).
+    // ZD can be negative for a few reflections
+    int frame() const { return (int) std::floor(zd + 1); }
+  };
+  std::vector<Refl> data;
+
+  XdsAscii() = default;
+  // copies the metadata only; data is left empty
+  XdsAscii(const XdsAsciiMetadata& m) : XdsAsciiMetadata(m) {}
+
+  // Returns the Iset with the given id; if there is none, appends a new one.
+  // The returned reference points into the isets vector.
+  Iset& find_or_add_iset(int id) {
+    for (Iset& i : isets)
+      if (i.id == id)
+        return i;
+    isets.emplace_back(id);
+    return isets.back();
+  }
+  void read_stream(AnyStream& reader, const std::string& source);
+
+  // Reads from an input object that provides create_stream() and path().
+  template<typename T>
+  void read_input(T&& input) {
+    read_stream(*input.create_stream(), input.path());
+  }
+
+  // merged/unmerged is recognized from the number of columns that were read
+  bool is_merged() const { return read_columns < 8; }
+
+  // set a few Iset properties in isets
+  void gather_iset_statistics();
+
+  // starting_angle + oscillation_range * (refl.zd - starting_frame + 1)
+  double rot_angle(const Refl& refl) const {
+    double z = refl.zd - starting_frame + 1;
+    return starting_angle + oscillation_range * z;
+  }
+
+  // Returns rotation_axis scaled to unit length; fails if its length is 0.
+  // it's already normalized, but just in case normalize it again
+  Vec3 get_rotation_axis() const {
+    double length = rotation_axis.length();
+    if (length == 0)
+      fail("unknown rotation axis");
+    return rotation_axis / length;
+  }
+
+  // Returns incident_beam_dir scaled to unit length; fails if its length is 0.
+  // I'm not sure if always |incident_beam_dir| == 1/wavelength
+  Vec3 get_s0_direction() const {
+    double length = incident_beam_dir.length();
+    if (length == 0)
+      fail("unknown incident beam direction");
+    return incident_beam_dir / length;
+  }
+
+  // false if any of the three rows of cell_axes has only zeros
+  bool has_cell_axes() const {
+    for (int i = 0; i < 3; ++i)
+      if (cell_axes[i][0] == 0 && cell_axes[i][1] == 0 && cell_axes[i][2] == 0)
+        return false;
+    return true;
+  }
+
+  /// Return transition matrix from "Cambridge" frame to XDS frame.
+  /// x_xds = M x_cam
+  Mat33 calculate_conversion_from_cambridge() const {
+    // Cambridge z direction is along the principal rotation axis
+    Vec3 z = get_rotation_axis();
+    // Cambridge x direction is along beam
+    Vec3 x = get_s0_direction();
+    Vec3 y = z.cross(x).normalized();
+    // beam and rotation axis may not be orthogonal
+    x = y.cross(z).normalized();
+    return Mat33::from_columns(x, y, z);
+  }
+
+  // Returns a matrix with three mutually perpendicular columns derived from
+  // the rows of cell_axes (a, b, c): the first is along b x c, the third is
+  // perpendicular to the first and to c x a, the second is their cross
+  // product. Fails if the cell axes are not known.
+  Mat33 get_orientation() const {
+    if (!has_cell_axes())
+      fail("unknown unit cell axes");
+    Vec3 a = cell_axes.row_copy(0);
+    Vec3 b = cell_axes.row_copy(1);
+    Vec3 c = cell_axes.row_copy(2);
+    Vec3 ar = b.cross(c).normalized();
+    Vec3 br = c.cross(a);
+    Vec3 cr = ar.cross(br).normalized();
+    br = cr.cross(ar);
+    return Mat33::from_columns(ar, br, cr);
+  }
+
+  /// \par p is degree of polarization from range (0,1), as used in XDS.
+  void apply_polarization_correction(double p, Vec3 normal);
+
+  /// \par overload is maximally allowed pixel value in a peak (MAXC).
+  /// Reflections with maxc greater than \par overload are removed from data.
+  void eliminate_overloads(double overload) {
+    vector_remove_if(data, [&](Refl& r) { return r.maxc > overload; });
+  }
+
+  /// \par batchmin lowest allowed batch number.
+  /// Reflections with zd less than batchmin-1 are removed from data.
+  void eliminate_batchmin(int batchmin) {
+    double minz = batchmin - 1;
+    vector_remove_if(data, [&](Refl& r) { return r.zd < minz; });
+  }
+};
+
+/// Read an XDS file through a FileStream opened in "rb" mode
+/// (for possibly gzipped files see read_xds_ascii()).
+inline XdsAscii read_xds_ascii_file(const std::string& path) {
+  FileStream file(path.c_str(), "rb");
+  XdsAscii xds;
+  xds.read_stream(file, path);
+  return xds;
+}
+
+/// read possibly gzipped file
+GEMMI_DLL XdsAscii read_xds_ascii(const std::string& path);
+
+} // namespace gemmi
+#endif
@@ -0,0 +1,189 @@
+// Copyright Global Phasing Ltd.
+
+#include <gemmi/gz.hpp>
+#include <cassert>
+#include <cstdio>       // fseek, ftell, fread
+#include <climits>      // INT_MAX
+#if USE_ZLIB_NG
+# define WITH_GZFILEOP 1
+# include <zlib-ng.h>
+# define GG(name) zng_ ## name
+#else
+# include <zlib.h>
+# define GG(name) name
+#endif
+#include <gemmi/fileutil.hpp> // file_open
+
+namespace gemmi {
+
+const char* const zlib_description =
+#if USE_ZLIB_NG
+  "zlib-ng " ZLIBNG_VERSION;
+#else
+  "zlib " ZLIB_VERSION;
+#endif
+
+// Estimates the uncompressed size of a gzip file from the 32-bit number
+// stored (least significant byte first) in the last 4 bytes of the file.
+// Throws if the size is not found or if it is suspicious.
+// The stored size is accepted if it lies between the compressed size minus
+// 100 bytes and 100x the compressed size; these limits are arbitrary.
+// **This function should not be relied upon.**
+// In particular, if the return value is >= 4GiB - it's only a guess.
+size_t estimate_uncompressed_size(const std::string& path) {
+  fileptr_t f = file_open(path.c_str(), "rb");
+  unsigned char buf[4];
+  if (std::fread(buf, 1, 2, f.get()) != 2)
+    sys_fail("Failed to read: " + path);
+  if (buf[0] != 0x1f || buf[1] != 0x8b)
+    fail("File not in the gzip format: " + path);
+  if (std::fseek(f.get(), -4, SEEK_END) != 0)
+    sys_fail("fseek() failed (empty file?): " + path);
+  long pos = std::ftell(f.get());
+  if (pos <= 0)
+    sys_fail("ftell() failed on " + path);
+  size_t gzipped_size = pos + 4;
+  if (std::fread(buf, 1, 4, f.get()) != 4)
+    sys_fail("Failed to read last 4 bytes of: " + path);
+  // shift unsigned values, so that the top bit never lands in a signed int
+  unsigned orig_size = (unsigned(buf[3]) << 24) | (unsigned(buf[2]) << 16) |
+                       (unsigned(buf[1]) << 8) | unsigned(buf[0]);
+  // Written as a subtraction, because orig_size + 100 wraps around
+  // in unsigned arithmetic when orig_size is close to 4 GiB.
+  bool too_small = gzipped_size > 100 && orig_size < gzipped_size - 100;
+  bool too_big = orig_size > 100 * gzipped_size;
+  if (too_small || too_big) {
+    // The size is stored as 32-bit number. If the original size exceeds 4GiB,
+    // the stored number is modulo 4 GiB. So we just guess...
+    constexpr size_t max_uint = 4294967295U;
+    if (gzipped_size > max_uint / 6)
+      return max_uint + (sizeof(size_t) > 4 ? orig_size : 0);
+    fail("Cannot determine uncompressed size of " + path +
+         "\nWould it be " + std::to_string(gzipped_size) + " -> " +
+         std::to_string(orig_size) + " bytes?");
+  }
+  return orig_size;
+}
+
+// Reads up to len bytes from file into buf and returns the number of bytes
+// actually read (less than len at the end of file or after an error).
+// Without zlib-ng, the request is passed to gzread() in chunks
+// of at most INT_MAX bytes.
+static size_t big_gzread(gzFile file, void* buf, size_t len) {
+#if USE_ZLIB_NG
+  return GG(gzfread)(buf, 1, len, file);
+#else
+  // In zlib >= 1.2.9 we could use gzfread()
+  size_t read_bytes = 0;
+  while (len > INT_MAX) {
+    int ret = gzread(file, buf, INT_MAX);
+    // gzread() returns -1 on error; adding it to size_t would wrap around
+    // and report a huge number of bytes to the caller.
+    if (ret > 0)
+      read_bytes += ret;
+    if (ret != INT_MAX)
+      return read_bytes;
+    len -= INT_MAX;
+    buf = (char*) buf + INT_MAX;
+  }
+  int ret = gzread(file, buf, (unsigned) len);
+  if (ret > 0)
+    read_bytes += ret;
+  return read_bytes;
+#endif
+}
+
+// Forwards to gzgets() and returns its result.
+char* GzStream::gets(char* line, int size) {
+  gzFile gz = (gzFile)f;
+  return GG(gzgets)(gz, line, size);
+}
+
+// Forwards to gzgetc() and returns its result.
+int GzStream::getc() {
+  gzFile gz = (gzFile)f;
+  return GG(gzgetc)(gz);
+}
+
+// Returns true only if all len bytes were read into buf.
+bool GzStream::read(void* buf, size_t len) {
+  const size_t got = big_gzread((gzFile)f, buf, len);
+  return got == len;
+}
+
+// Moves n bytes forward with gzseek(); returns false if gzseek() returned -1.
+bool GzStream::skip(size_t n) {
+  const auto new_pos = GG(gzseek)((gzFile)f, n, SEEK_CUR);
+  return new_pos != -1;
+}
+
+// Forwards to gztell(); the result is converted to long.
+long GzStream::tell() {
+  gzFile gz = (gzFile)f;
+  return GG(gztell)(gz);
+}
+
+// Returns everything that can still be read from the stream.
+// One character is read first to detect the end of file; the rest is read
+// in 512-byte chunks until a chunk comes back incomplete.
+std::string GzStream::read_rest() {
+  std::string rest;
+  const int first = getc();
+  if (first == EOF)
+    return rest;
+  rest += (char)first;
+  char chunk[512];
+  size_t got;
+  do {
+    got = big_gzread((gzFile)f, chunk, sizeof(chunk));
+    rest.append(chunk, got);
+  } while (got == sizeof(chunk));
+  return rest;
+}
+
+
+// Passes the path to BasicInput; the constructor body does nothing else.
+MaybeGzipped::MaybeGzipped(const std::string& path) : BasicInput(path) {}
+
+// Closes file_ if it is set.
+MaybeGzipped::~MaybeGzipped() {
+  if (!file_)
+    return;
+#if USE_ZLIB_NG || (ZLIB_VERNUM >= 0x1235)
+  GG(gzclose_r)((gzFile)file_);
+#else
+  gzclose((gzFile)file_);
+#endif
+}
+
+// Reads up to len bytes from file_ into buf and returns the number of bytes
+// read. When fewer bytes were read and it is not the end of file, gzerror()
+// is consulted and a non-zero error number results in a failure.
+size_t MaybeGzipped::gzread_checked(void* buf, size_t len) {
+  gzFile gz = (gzFile) file_;
+  const size_t n_read = big_gzread(gz, buf, len);
+  const bool incomplete = (n_read != len);
+  if (incomplete && !GG(gzeof)(gz)) {
+    int errnum = 0;
+    std::string err_str = GG(gzerror)(gz, &errnum);
+    if (errnum == Z_ERRNO)
+      sys_fail("failed to read " + path());
+    if (errnum != 0)
+      fail("Error reading " + path() + ": " + err_str);
+  }
+  if (n_read > len)  // should never happen
+    fail("Error reading " + path());
+  return n_read;
+}
+
+// Reads the whole content into a memory buffer.
+// If the input is not compressed, the call is forwarded to BasicInput.
+// Otherwise \par limit, when non-zero, is the number of bytes to read;
+// with limit == 0 the initial buffer size comes from
+// estimate_uncompressed_size() and the buffer is doubled as long as
+// there is more data to read.
+// Fails when the size exceeds 3221225471 bytes (one byte less than 3 GiB).
+CharArray MaybeGzipped::uncompress_into_buffer(size_t limit) {
+  if (!is_compressed())
+    return BasicInput::uncompress_into_buffer();
+  size_t size = (limit == 0 ? estimate_uncompressed_size(path()) : limit);
+  // the file stays open after this function; it is closed in the destructor
+  file_ = GG(gzopen)(path().c_str(), "rb");
+  if (!file_)
+    sys_fail("Failed to gzopen " + path());
+  if (size > 3221225471)
+    // if this exception is changed adjust prog/cif2mtz.cpp
+    fail("For now gz files above 3 GiB uncompressed are not supported.\n"
+         "To read " + path() + " first uncompress it.");
+  CharArray mem(size);
+  size_t read_bytes = gzread_checked(mem.data(), size);
+  // if the file is shorter than the size from header, adjust size
+  if (read_bytes < size) {
+    mem.set_size(read_bytes);  // should we call resize() here
+  } else if (limit == 0) { // read_bytes == size
+  // if the file is longer than the size from header, read in the rest
+    int next_char;
+    // One character is read to check if there is more data
+    // and then it is pushed back with gzungetc().
+    while (!GG(gzeof)((gzFile)file_) && (next_char = GG(gzgetc)((gzFile)file_)) != -1) {
+      if (mem.size() > 3221225471)
+        fail("For now gz files above 3 GiB uncompressed are not supported.\n"
+             "To read " + path() + " first uncompress it.");
+      GG(gzungetc)(next_char, (gzFile)file_);
+      size_t old_size = mem.size();
+      // double the buffer and try to fill the new half
+      mem.resize(2 * old_size);
+      size_t n = gzread_checked(mem.data() + old_size, old_size);
+      mem.set_size(old_size + n);
+    }
+  }
+  return mem;
+}
+
+// Returns a stream for reading the input: a GzStream on top of a file opened
+// with gzopen() if the input is compressed, otherwise whatever BasicInput
+// provides. The opened gzFile is kept in file_.
+std::unique_ptr<AnyStream> MaybeGzipped::create_stream() {
+  if (!is_compressed())
+    return BasicInput::create_stream();
+  file_ = GG(gzopen)(path().c_str(), "rb");
+  if (!file_)
+    sys_fail("Failed to gzopen " + path());
+#if ZLIB_VERNUM >= 0x1235
+  GG(gzbuffer)((gzFile)file_, 64*1024);
+#endif
+  return std::unique_ptr<AnyStream>(new GzStream(file_));
+}
+
+} // namespace gemmi
@@ -0,0 +1,991 @@
+// Copyright 2019-2023 Global Phasing Ltd.
+
+#include <gemmi/mtz.hpp>
+#include <cstring>            // for memcpy
+#include <algorithm>          // for stable_sort
+#include <gemmi/atof.hpp>     // for fast_atof
+#include <gemmi/atox.hpp>     // for simple_atoi, read_word
+#include <gemmi/gz.hpp>
+#include <gemmi/sprintf.hpp>
+
+namespace gemmi {
+
+namespace {
+
+// Returns phi shifted by a multiple of 360 into the range [0, 360).
+// Values already in this range are returned unchanged.
+double wrap_degrees(double phi) {
+  const bool in_range = (phi >= 0 && phi < 360.);
+  return in_range ? phi : phi - std::floor(phi / 360.) * 360.;
+}
+
+// Adds deg(shift) to phi, optionally negates the sum,
+// and stores the result wrapped by wrap_degrees() back in phi.
+void shift_phase(float& phi, double shift, bool negate=false) {
+  double shifted = phi + deg(shift);
+  if (negate)
+    shifted = -shifted;
+  phi = float(wrap_degrees(shifted));
+}
+
+// Apply phase shift to Hendrickson-Lattman coefficients HLA, HLB, HLC and HLD.
+// The shift is passed directly to std::sin and std::cos. With negate set,
+// the signs of the resulting b and d are flipped.
+void shift_hl_coefficients(float& a, float& b, float& c, float& d,
+                           double shift, bool negate=false) {
+  const double sin1 = std::sin(shift);
+  const double cos1 = std::cos(shift);
+  // double-angle formulas
+  const double sin2 = 2 * sin1 * cos1;
+  const double cos2 = sq(cos1)- sq(sin1);
+  // a sin(x+y) + b cos(x+y) = a sin(x) cos(y) - b sin(x) sin(y)
+  //                         + a cos(x) sin(y) + b cos(x) cos(y)
+  // All new values are computed before any of the arguments is overwritten.
+  const float new_a = float(a * cos1 - b * sin1);
+  float new_b = float(a * sin1 + b * cos1);
+  const float new_c = float(c * cos2 - d * sin2);
+  float new_d = float(c * sin2 + d * cos2);
+  if (negate) {
+    new_b = -new_b;
+    new_d = -new_d;
+  }
+  a = new_a;  // cos(phi)
+  b = new_b;  // sin(phi)
+  c = new_c;  // cos(2 phi)
+  d = new_d;  // sin(2 phi)
+}
+
+// Returns {min, max} of the values in [begin, end), skipping NaNs.
+// If the range is empty or has only NaNs, both returned values are NaN.
+// this function is generic because it was used in other places in the past
+template <typename T, typename FP=typename std::iterator_traits<T>::value_type>
+std::array<FP,2> calculate_min_max_disregarding_nans(T begin, T end) {
+  std::array<FP,2> result = {{NAN, NAN}};
+  // find the first value that is not NaN
+  T it = begin;
+  for (; it != end; ++it)
+    if (!std::isnan(*it))
+      break;
+  if (it == end)
+    return result;
+  result[0] = result[1] = *it;
+  // comparisons with NaN are false, so later NaNs change nothing
+  for (++it; it != end; ++it) {
+    if (*it < result[0])
+      result[0] = *it;
+    else if (*it > result[1])
+      result[1] = *it;
+  }
+  return result;
+}
+
+// Skips a word (a run of non-whitespace characters) and the whitespace that
+// follows it. Returns a pointer to the next word or to the terminating NUL.
+const char* skip_word_and_space(const char* line) {
+  // std::isspace() has undefined behavior for negative values other than EOF,
+  // so chars (which may be signed) are converted to unsigned char first.
+  while (*line != '\0' && !std::isspace(static_cast<unsigned char>(*line)))
+    ++line;
+  while (std::isspace(static_cast<unsigned char>(*line)))
+    ++line;
+  return line;
+}
+
+// Reads six consecutive numbers from line with fast_atof() and passes them,
+// in the order they were read, to the UnitCell constructor.
+UnitCell read_cell_parameters(const char* line) {
+  double par[6];
+  for (double& value : par)
+    value = fast_atof(line, &line);
+  return UnitCell(par[0], par[1], par[2], par[3], par[4], par[5]);
+}
+
+} // anonymous namespace
+
+// Returns cell parameters averaged over all batch headers
+// (the first 6 values of Batch::floats).
+// \par rmsd - if not null, it must point to 6 doubles. They are zeroed first
+// and, unless the function returns early, set to the root-mean-square
+// deviations of the parameters from the average.
+// The global cell is returned instead if any batch header has a non-positive
+// parameter or fewer than 6 floats, or if the average is almost equal to it.
+// Without batches, default-constructed UnitCellParameters are returned.
+UnitCellParameters Mtz::get_average_cell_from_batch_headers(double* rmsd) const {
+  if (rmsd)
+    for (int i = 0; i < 6; ++i)
+      rmsd[i] = 0.;
+  std::array<double, 6> avg = {0., 0., 0., 0., 0., 0.};
+  for (const Batch& batch : batches) {
+    // The size of floats is taken from the file when batch headers are read;
+    // too short array cannot be indexed below.
+    if (batch.floats.size() < 6)
+      return cell;
+    for (int i = 0; i < 6; ++i) {
+      // if batch headers are not set correctly, return global cell
+      if (batch.floats[i] <= 0)
+        return cell;
+      avg[i] += batch.floats[i];
+    }
+  }
+  // true only when there are no batches
+  if (avg[0] <= 0 || avg[1] <= 0 || avg[2] <= 0 ||
+      avg[3] <= 0 || avg[4] <= 0 || avg[5] <= 0)
+    return UnitCellParameters();
+  size_t n = batches.size();
+  for (int i = 0; i < 6; ++i)
+    avg[i] /= n;
+  if (rmsd) {
+    for (const Batch& batch : batches)
+      for (int i = 0; i < 6; ++i)
+        rmsd[i] += sq(avg[i] - batch.floats[i]);
+    for (int i = 0; i < 6; ++i)
+      rmsd[i] = std::sqrt(rmsd[i] / n);
+  }
+  // If average parameters are almost equal to the global cell, use the latter
+  // to avoid 32-bit precision artifacts (58.28 -> 58.279998).
+  if (UnitCellParameters(avg).approx(cell, 1e-4))
+    return cell;
+  return UnitCellParameters(avg);
+}
+
+// Returns {min, max} of the values returned by calculate_1_d2_double() for
+// all reflections, evaluated with the global cell and with those dataset
+// cells that differ from the global cell (and from the previously used
+// dataset cell). Fails if there is no data. If no cell could be used,
+// {0, 0} is returned.
+std::array<double,2> Mtz::calculate_min_max_1_d2() const {
+  // widens the range [lo, hi] with the values calculated in the unit cell uc
+  auto widen_range = [&](const UnitCell& uc, double& lo, double& hi) {
+    const size_t stride = columns.size();
+    for (size_t i = 0; i < data.size(); i += stride) {
+      const double inv_d2 = uc.calculate_1_d2_double(data[i+0], data[i+1], data[i+2]);
+      lo = std::min(lo, inv_d2);
+      hi = std::max(hi, inv_d2);
+    }
+  };
+  if (!has_data() || columns.size() < 3)
+    fail("No data.");
+  double lowest = INFINITY;
+  double highest = 0.;
+  if (cell.is_crystal() && cell.a > 0)
+    widen_range(cell, lowest, highest);
+  const UnitCell* last_used = nullptr;
+  for (const Dataset& ds : datasets) {
+    if (!ds.cell.is_crystal() || !(ds.cell.a > 0) || !(ds.cell != cell))
+      continue;
+    if (last_used && !(ds.cell != *last_used))
+      continue;
+    widen_range(ds.cell, lowest, highest);
+    last_used = &ds.cell;
+  }
+  // INFINITY left here means that no value was calculated
+  if (lowest == INFINITY)
+    lowest = 0;
+  return {{lowest, highest}};
+}
+
+// Reads the first 20 bytes of an MTZ stream: checks the 'MTZ ' signature,
+// determines the byte order and sets header_offset. Then skips 60 bytes.
+// Fails if 20 bytes cannot be read or if the signature is wrong.
+void Mtz::read_first_bytes(AnyStream& stream) {
+  char buf[20] = {0};
+
+  if (!stream.read(buf, 20))
+    fail("Could not read the MTZ file (is it empty?)");
+  if (buf[0] != 'M' || buf[1] != 'T' || buf[2] != 'Z' || buf[3] != ' ')
+    fail("Not an MTZ file - it does not start with 'MTZ '");
+
+  // Bytes 9-12 have so-called machine stamp:
+  // "The first 4 half-bytes represent the real, complex, integer and
+  // character formats".
+  // We don't try to handle all the combinations here, only the two most
+  // common: big endian (for all types) and little endian (for all types).
+  // BE is denoted by 1 and LE by 4.
+  // If we get a value different than 1 and 4 we assume the native byte order.
+  // (only the upper half-byte of buf[9] is examined here)
+  if ((buf[9] & 0xf0) == (is_little_endian() ? 0x10 : 0x40))
+    toggle_endianness();
+
+  // 32-bit header offset is stored in bytes 4-7 of buf
+  std::int32_t tmp_header_offset;
+  std::memcpy(&tmp_header_offset, buf + 4, 4);
+  if (!same_byte_order)
+    swap_four_bytes(&tmp_header_offset);
+
+  // -1 means that the offset is instead in the 8 bytes starting at buf[12]
+  if (tmp_header_offset == -1) {
+    std::memcpy(&header_offset, buf + 12, 8);
+    if (!same_byte_order) {
+      swap_eight_bytes(&header_offset);
+    }
+  } else {
+    header_offset = (int64_t) tmp_header_offset;
+  }
+  // 20 bytes read + 60 skipped = 80, where read_raw_data() expects to start.
+  // NOTE(review): the value returned by skip() is not checked here.
+  stream.skip(60);
+}
+
+// Reads 80-character header records from the stream until a record starting
+// with END (or until the stream cannot be read) and sets the corresponding
+// members. Records are recognized by their first 4 letters, case-insensitive.
+// \par save_headers - if not null, every record that was read (including
+// the END record) is appended to it as an 80-character string.
+// Fails if the number of COLU records differs from the number in the NCOL
+// record or if the presence of the BATCH record does not agree with
+// the number of batches from NCOL.
+void Mtz::read_main_headers(AnyStream& stream, std::vector<std::string>* save_headers) {
+  char line[81] = {0};
+  std::ptrdiff_t header_pos = 4 * std::ptrdiff_t(header_offset - 1);
+  // temporary check
+  // (a position of -1 returned by tell() is not treated as an error)
+  long cur_pos = stream.tell();
+  if (cur_pos != header_pos && cur_pos != -1)
+    fail(cat("wrong pos ", int(header_pos), "  ", int(stream.tell())));
+  int ncol = 0;
+  bool has_batch = false;
+  while (stream.read(line, 80)) {
+    if (save_headers)
+      save_headers->emplace_back(line, line+80);
+    if (ialpha3_id(line) == ialpha3_id("END"))
+      break;
+    // args points to what follows the keyword
+    const char* args = skip_word_and_space(line);
+    switch (ialpha4_id(line)) {
+      case ialpha4_id("VERS"):
+        version_stamp = rtrim_str(args);
+        break;
+      case ialpha4_id("TITL"):
+        title = rtrim_str(args);
+        break;
+      case ialpha4_id("NCOL"): {
+        // three numbers: columns, reflections, batches
+        ncol = simple_atoi(args, &args);
+        nreflections = simple_atoi(args, &args);
+        int nbatches = simple_atoi(args);
+        if (nbatches < 0 || nbatches > 10000000)  // sanity check
+          fail("Wrong NCOL header");
+        batches.resize(nbatches);
+        break;
+      }
+      case ialpha4_id("CELL"):
+        cell = read_cell_parameters(args);
+        break;
+      case ialpha4_id("SORT"):
+        for (int& n : sort_order)
+          n = simple_atoi(args, &args);
+        break;
+      case ialpha4_id("SYMI"): {
+        // NOTE(review): nsymop comes from the file and is passed to reserve()
+        // without a range check.
+        nsymop = simple_atoi(args, &args);
+        symops.reserve(nsymop);
+        simple_atoi(args, &args); // ignore number of primitive operations
+        args = skip_word_and_space(skip_blank(args)); // ignore lattice type
+        spacegroup_number = simple_atoi(args, &args);
+        args = skip_blank(args);
+        // the name is either a single word or a text in single quotes
+        if (*args != '\'')
+          spacegroup_name = read_word(args);
+        else if (const char* end = std::strchr(++args, '\''))
+          spacegroup_name.assign(args, end);
+        // ignore point group which is at the end of args
+        break;
+      }
+      case ialpha4_id("SYMM"):
+        symops.push_back(parse_triplet(args));
+        break;
+      case ialpha4_id("RESO"):
+        min_1_d2 = fast_atof(args, &args);
+        max_1_d2 = fast_atof(args, &args);
+        break;
+      case ialpha4_id("VALM"):
+        // a value starting with 'N' leaves valm unchanged
+        if (*args != 'N') {
+          const char* endptr;
+          float v = (float) fast_atof(args, &endptr);
+          if (*endptr == '\0' || is_space(*endptr))
+            valm = v;
+          else
+            logger.note("Unexpected VALM value: " + rtrim_str(args));
+        }
+        break;
+      case ialpha4_id("COLU"): {
+        // label, type (one character), min, max, dataset id
+        columns.emplace_back();
+        Column& col = columns.back();
+        col.label = read_word(args, &args);
+        col.type = read_word(args, &args)[0];
+        col.min_value = (float) fast_atof(args, &args);
+        col.max_value = (float) fast_atof(args, &args);
+        col.dataset_id = simple_atoi(args);
+        col.parent = this;
+        col.idx = columns.size() - 1;
+        break;
+      }
+      case ialpha4_id("COLS"):
+        // COLSRC is undocumented. CMTZ (libccp4) adds it after COLUMN:
+        // COLUMN IMEAN                          J       -300.600006              4619    1
+        // COLSRC IMEAN                          CREATED_07/08/2019_11:00:23              1
+        if (!columns.empty() && columns.back().label == read_word(args, &args))
+          columns.back().source = read_word(args);
+        else
+          logger.note("MTZ: COLSRC is not after matching COLUMN");
+        break;
+      case ialpha4_id("COLG"):
+        // Column group - not used.
+        break;
+      case ialpha4_id("NDIF"):
+        datasets.reserve(simple_atoi(args));
+        break;
+      case ialpha4_id("PROJ"):
+        // PROJ starts a new dataset; the records CRYS, DATA, DCEL and DWAV
+        // are expected to have the id of the most recently added dataset.
+        datasets.emplace_back();
+        datasets.back().id = simple_atoi(args, &args);
+        datasets.back().project_name = read_word(skip_word_and_space(args));
+        datasets.back().wavelength = 0.0;
+        break;
+      case ialpha4_id("CRYS"):
+        if (simple_atoi(args, &args) == last_dataset().id)
+          datasets.back().crystal_name = read_word(args);
+        else
+          logger.note("MTZ CRYSTAL line: unusual numbering.");
+        break;
+      case ialpha4_id("DATA"):
+        if (simple_atoi(args, &args) == last_dataset().id)
+          datasets.back().dataset_name = read_word(args);
+        else
+          logger.note("MTZ DATASET line: unusual numbering.");
+        break;
+      case ialpha4_id("DCEL"):
+        if (simple_atoi(args, &args) == last_dataset().id)
+          datasets.back().cell = read_cell_parameters(args);
+        else
+          logger.note("MTZ DCELL line: unusual numbering.");
+        break;
+      // case("DRES"): not in use yet
+      case ialpha4_id("DWAV"):
+        if (simple_atoi(args, &args) == last_dataset().id)
+          datasets.back().wavelength = fast_atof(args);
+        else
+          logger.note("MTZ DWAV line: unusual numbering.");
+        break;
+      // ialpha4_id() reads only the first 4 characters ("BATC")
+      case ialpha4_id("BATCH"):
+        // We take number of batches from the NCOL record and serial numbers
+        // from BH. This header could be used only to check consistency.
+        has_batch = true;
+        break;
+      default:
+        logger.note("Unknown header: " + rtrim_str(line));
+    }
+  }
+  if (ncol != (int) columns.size())
+    fail("Number of COLU records inconsistent with NCOL record.");
+  if (has_batch != !batches.empty())
+    fail("BATCH header inconsistent with NCOL record.");
+  // adjust data size, if necessary
+  // (data that was read earlier may be longer than columns x reflections)
+  if (!data.empty()) {
+    size_t expected_size = columns.size() * nreflections;
+    if (data.size() > expected_size)
+      data.resize(expected_size);
+    else if (data.size() < expected_size)
+      fail("internal error, wrong data size");
+  }
+}
+
+// Reads the 80-character records that follow the main headers, until
+// a record starting with MTZE or until the stream cannot be read:
+// - MTZH is followed by the number of history lines (0-30) and the lines,
+// - MTZB is followed by one batch header for each item in batches.
+// What remains in the stream is then stored in appended_text.
+// Fails on missing or inconsistent BH/BHCH records.
+void Mtz::read_history_and_batch_headers(AnyStream& stream) {
+  char buf[81] = {0};
+  int n_headers = 0;
+  while (stream.read(buf, 80) && ialpha4_id(buf) != ialpha4_id("MTZE")) {
+    if (n_headers != 0) {
+      const char* start = skip_blank(buf);
+      // The record ends at buf+80. (start+80 would point past the end
+      // of buf whenever leading blanks were skipped.)
+      const char* end = rtrim_cstr(start, buf+80);
+      history.emplace_back(start, end);
+      --n_headers;
+    } else if (ialpha4_id(buf) == ialpha4_id("MTZH")) {
+      n_headers = simple_atoi(skip_word_and_space(buf+4));
+      if (n_headers < 0 || n_headers > 30) {
+        logger.note("Wrong MTZ: number of headers should be between 0 and 30");
+        return;
+      }
+      history.reserve(n_headers);
+    } else if (ialpha4_id(buf) == ialpha4_id("MTZB")) {
+      for (Batch& batch : batches) {
+        stream.read(buf, 80);
+        if (ialpha3_id(buf) != ialpha3_id("BH "))
+          fail("Missing BH header");
+        const char* args = skip_blank(buf + 2);
+        batch.number = simple_atoi(args, &args);
+        int total_words = simple_atoi(args, &args);
+        int int_words = simple_atoi(args, &args);
+        int float_words = simple_atoi(args);
+        // Negative counts must be rejected, they would be converted
+        // to huge sizes in the resize() calls below.
+        if (int_words < 0 || float_words < 0 ||
+            total_words != int_words + float_words || total_words > 1000)
+          fail("Wrong BH header");
+        stream.read(buf, 80); // TITLE
+        // NOTE(review): trailing blanks are trimmed from the range that
+        // starts at buf+6, but the title is assigned from buf, so it keeps
+        // the first 6 characters of the record - confirm this is intended.
+        const char* end = rtrim_cstr(buf + 6, buf+76);
+        batch.title.assign(buf, end - buf);
+        // the words are stored as read from the stream, 4 bytes each
+        batch.ints.resize(int_words);
+        stream.read(batch.ints.data(), int_words * 4);
+        batch.floats.resize(float_words);
+        stream.read(batch.floats.data(), float_words * 4);
+        stream.read(buf, 80);
+        if (ialpha4_id(buf) != ialpha4_id("BHCH"))
+          fail("Missing BHCH header");
+        split_str_into_multi(buf + 5, " \t", batch.axes);
+      }
+    }
+  }
+  appended_text = stream.read_rest();
+}
+
+// Sets spacegroup from spacegroup_name (cell.alpha and cell.gamma are passed
+// to the lookup as well). If the name is not recognized, a note is logged
+// and nothing else is done. Otherwise, a note is logged if the CCP4 number
+// differs from spacegroup_number, and cell images are set up in the global
+// cell and in the cells of all datasets.
+void Mtz::setup_spacegroup() {
+  spacegroup = find_spacegroup_by_name(spacegroup_name, cell.alpha, cell.gamma);
+  if (spacegroup == nullptr) {
+    logger.note("MTZ: unrecognized spacegroup name: " + spacegroup_name);
+    return;
+  }
+  const bool number_matches = (spacegroup->ccp4 == spacegroup_number);
+  if (!number_matches)
+    logger.note("MTZ: inconsistent spacegroup name and number");
+  cell.set_cell_images_from_spacegroup(spacegroup);
+  for (Dataset& ds : datasets)
+    ds.cell.set_cell_images_from_spacegroup(spacegroup);
+}
+
+// Reads (or, if do_read is false, skips) the reflection data: all 4-byte
+// words between the first 20 words of the file and the headers, which start
+// at word header_offset (counted from 1).
+// The values are byte-swapped if the file has a different byte order.
+// we should be at byte 80
+void Mtz::read_raw_data(AnyStream& stream, bool do_read) {
+  // header_offset comes from the file; a value below 21 would make
+  // the number of words negative and, after conversion to size_t, huge.
+  if (header_offset < 21)
+    fail("Wrong MTZ file: invalid header offset");
+  size_t n = size_t(header_offset - 1 - 20);
+  if (!do_read) {
+    if (!stream.skip(4 * n))
+      fail("ignoring mtz data segment failed");
+    return;
+  }
+  data.resize(n);
+  if (!stream.read(data.data(), 4 * n))
+    fail("Error when reading MTZ data");
+  if (!same_byte_order)
+    for (float& f : data)
+      swap_four_bytes(&f);
+}
+
+// Reads an MTZ file from the stream, strictly front to back.
+// \par with_data - if false, the reflection data is skipped, not stored.
+void Mtz::read_stream(AnyStream& stream, bool with_data) {
+  read_first_bytes(stream);
+  // The older implementation of MTZ reading first read the headers,
+  // then the data. This required jumping to the headers at the end,
+  // then back to the beginning of the data (byte 80).
+  // The current implementation avoids calling seek(), allowing
+  // incremental reading of streams (stdin, gzipped files, etc).
+  read_raw_data(stream, with_data);
+  read_main_headers(stream, nullptr);
+  read_history_and_batch_headers(stream);
+  setup_spacegroup();
+  // if the headers defined no dataset, add one named HKL_base
+  // with id 0, the global cell and wavelength 0
+  if (datasets.empty())
+    datasets.push_back({0, "HKL_base", "HKL_base", "HKL_base", cell, 0.});
+}
+
+// Returns the offset in data of the first row with the given hkl,
+// or (size_t)-1 if it is not found. The search begins at the row
+// that contains the offset \par start. Fails if there is no data.
+// for probing/testing individual reflections, no need to optimize it
+size_t Mtz::find_offset_of_hkl(const Miller& hkl, size_t start) const {
+  if (!has_data() || columns.size() < 3)
+    fail("No data.");
+  const size_t width = columns.size();
+  // round start down to the beginning of its row
+  size_t n = start - start % width;
+  while (n + 2 < data.size()) {
+    if (get_hkl(n) == hkl)
+      return n;
+    n += width;
+  }
+  return (size_t)-1;
+}
+
+// Moves reflections that are outside of the reciprocal-space ASU into it
+// (merged data only; fails for unmerged data, does nothing if the space
+// group is not set). Apart from hkl, the values in these columns are updated:
+// - type P (phases) and type A (in groups of four) - shifted, possibly negated,
+// - pairs of plus/minus columns - swapped, and type D - sign changed,
+//   when isym is even and the reflection is not centric.
+// \par tnt_asu is passed to ReciprocalAsu.
+void Mtz::ensure_asu(bool tnt_asu) {
+  if (!is_merged())
+    fail("Mtz::ensure_asu() is for merged MTZ only");
+  if (!spacegroup)
+    return;
+  GroupOps gops = spacegroup->operations();
+  ReciprocalAsu asu(spacegroup, tnt_asu);
+  std::vector<int> phase_columns = positions_of_columns_with_type('P');
+  std::vector<int> abcd_columns = positions_of_columns_with_type('A');
+  std::vector<int> dano_columns = positions_of_columns_with_type('D');
+  std::vector<std::pair<int,int>> plus_minus_columns = positions_of_plus_minus_columns();
+  bool no_special_columns = phase_columns.empty() && abcd_columns.empty() &&
+                            plus_minus_columns.empty() && dano_columns.empty();
+  // if true, plus/minus and D columns are never changed below
+  bool centric = no_special_columns || gops.is_centrosymmetric();
+  for (size_t n = 0; n < data.size(); n += columns.size()) {
+    Miller hkl = get_hkl(n);
+    if (asu.is_in(hkl))
+      continue;
+    auto result = asu.to_asu(hkl, gops);
+    // cf. impl::move_to_asu() in asudata.hpp
+    set_hkl(n, result.first);
+    if (no_special_columns)
+      continue;
+    // isym numbers 2k-1 and 2k refer to the operation sym_ops[k-1];
+    // for even isym the values are negated
+    int isym = result.second;
+    if (!phase_columns.empty() || !abcd_columns.empty()) {
+      const Op& op = gops.sym_ops[(isym - 1) / 2];
+      double shift = op.phase_shift(hkl);
+      bool negate = (isym % 2 == 0);
+      for (int col : phase_columns)
+        shift_phase(data[n + col], shift, negate);
+      // type-A columns are processed four at a time; an incomplete
+      // group at the end is left unchanged
+      for (auto i = abcd_columns.begin(); i+3 < abcd_columns.end(); i += 4)
+        // we expect coefficients HLA, HLB, HLC and HLD - in this order
+        shift_hl_coefficients(data[n + *(i+0)], data[n + *(i+1)],
+                              data[n + *(i+2)], data[n + *(i+3)],
+                              shift, negate);
+    }
+    if (isym % 2 == 0 && !centric &&
+        // usually, centric reflections have empty F(-), so avoid swapping it
+        !gops.is_reflection_centric(hkl)) {
+      for (std::pair<int,int> cols : plus_minus_columns)
+        std::swap(data[n + cols.first], data[n + cols.second]);
+      for (int col : dano_columns)
+        data[n + col] = -data[n + col];
+    }
+  }
+}
+
+// Applies the reindexing operator \par op to Miller indices of all
+// reflections, then updates the space group and the unit cells (global,
+// in datasets and in batch headers).
+// Reflections that would get fractional indices are removed.
+// Fails if op has a translation or a negative determinant of rotation,
+// or if the new space group cannot be determined.
+void Mtz::reindex(const Op& op) {
+  if (op.tran != Op::Tran{0, 0, 0})
+    gemmi::fail("reindexing operator must not have a translation");
+  if (op.det_rot() < 0)
+    gemmi::fail("reindexing operator must preserve the hand of the axes");
+  switch_to_original_hkl();  // changes hkl for unmerged data only
+  Op xyz_op = op.as_xyz();
+  logger.mesg("Real space transformation: ", op.as_xyz().triplet());
+  bool row_removal = false;
+  // change Miller indices
+  for (size_t n = 0; n < data.size(); n += columns.size()) {
+    Miller hkl_den = op.apply_to_hkl_without_division(get_hkl(n));
+    Miller hkl = Op::divide_hkl_by_DEN(hkl_den);
+    // multiplying back gives the same numbers only if the division was exact
+    if (hkl[0] * Op::DEN == hkl_den[0] &&
+        hkl[1] * Op::DEN == hkl_den[1] &&
+        hkl[2] * Op::DEN == hkl_den[2]) {
+      set_hkl(n, hkl);
+    } else {  // fractional hkl - remove
+      row_removal = true;
+      data[n] = NAN;  // mark for removal
+    }
+  }
+
+  // remove reflections marked for removal
+  // (rows with NaN as the first value)
+  if (row_removal) {
+    int n_before = nreflections;
+    remove_rows_if([](const float* h) { return std::isnan(*h); });
+    logger.mesg("Reflections removed (because of fractional indices): ", n_before - nreflections);
+  }
+
+  switch_to_asu_hkl();  // revert switch_to_original_hkl() for unmerged data
+
+  // change space group
+  if (spacegroup) {
+    GroupOps gops = spacegroup->operations();
+    gops.change_basis_backward(xyz_op);
+    const SpaceGroup* new_sg = find_spacegroup_by_ops(gops);
+    if (!new_sg)
+      fail("reindexing: failed to determine new space group name");
+    if (new_sg != spacegroup) {
+      logger.mesg("Space group changed from ", spacegroup->xhm(), " to ", new_sg->xhm(), '.');
+      set_spacegroup(new_sg);
+    } else {
+      // NOTE(review): unlike in the messages above, there is no space after
+      // the colon here - check how the message looks in the output.
+      logger.mesg("Space group stays the same:", spacegroup->xhm(), '.');
+    }
+  }
+
+  // change unit cell parameters
+  cell = cell.changed_basis_backward(xyz_op, false);
+  for (Mtz::Dataset& ds : datasets)
+    ds.cell = ds.cell.changed_basis_backward(xyz_op, false);
+  for (Mtz::Batch& batch : batches)
+    batch.set_cell(batch.get_cell().changed_basis_backward(xyz_op, false));
+}
+
+// Appends symmetry-equivalent copies of reflections and changes the space
+// group to P1. Does nothing if the space group is not set or there is
+// no data. A copy is added for each symmetry operation (except the first)
+// that gives hkl different from the original and from the copies already
+// added for this reflection, also after negating all three indices.
+// In the copies, the columns of type P and type A (in groups of four)
+// are updated with the phase shift of the operation.
+void Mtz::expand_to_p1() {
+  if (!spacegroup || !has_data())
+    return;
+  std::vector<int> phase_columns = positions_of_columns_with_type('P');
+  std::vector<int> abcd_columns = positions_of_columns_with_type('A');
+  bool has_phases = (!phase_columns.empty() || !abcd_columns.empty());
+  GroupOps gops = spacegroup->operations();
+  // space for the maximal possible number of rows
+  data.reserve(gops.sym_ops.size() * data.size());
+  size_t orig_size = data.size();
+  std::vector<Miller> hkl_copies;
+  for (size_t n = 0; n < orig_size; n += columns.size()) {
+    hkl_copies.clear();
+    Miller hkl = get_hkl(n);
+    // no reallocations because of reserve() above
+    auto orig_iter = data.begin() + n;
+    for (auto op = gops.sym_ops.begin() + 1; op < gops.sym_ops.end(); ++op) {
+      Miller new_hkl = op->apply_to_hkl(hkl);
+      Op::Miller negated{{-new_hkl[0], -new_hkl[1], -new_hkl[2]}};
+      if (new_hkl != hkl && !in_vector(new_hkl, hkl_copies) &&
+          negated != hkl && !in_vector(negated, hkl_copies)) {
+        hkl_copies.push_back(new_hkl);
+        size_t offset = data.size();
+        // NOTE(review): the inserted range belongs to the same vector;
+        // this relies on insert() not reallocating (see reserve() above).
+        data.insert(data.end(), orig_iter, orig_iter + columns.size());
+        set_hkl(offset, new_hkl);
+        if (has_phases) {
+          double shift = op->phase_shift(hkl);
+          if (shift != 0) {
+            for (int col : phase_columns)
+              shift_phase(data[offset + col], shift);
+            for (auto i = abcd_columns.begin(); i+3 < abcd_columns.end(); i += 4)
+              // we expect coefficients HLA, HLB, HLC and HLD - in this order
+              shift_hl_coefficients(data[offset + *(i+0)], data[offset + *(i+1)],
+                                    data[offset + *(i+2)], data[offset + *(i+3)], shift);
+          }
+        }
+      }
+    }
+  }
+  nreflections = int(data.size() / columns.size());
+  // the appended rows are not in any particular order
+  sort_order = {{0, 0, 0, 0, 0}};
+  set_spacegroup(&get_spacegroup_p1());
+}
+
+/// Replaces the stored h,k,l of each row with indices obtained by applying
+/// the inverse of the symmetry operation encoded in the M/ISYM column.
+/// Returns true if the indices are (now) marked as switched, false if they
+/// already were or if a usable M/ISYM column is absent.
+/// Fails if the data has not been read.
+bool Mtz::switch_to_original_hkl() {
+  if (indices_switched_to_original)
+    return false;
+  if (!has_data())
+    fail("switch_to_original_hkl(): data not read yet");
+  // This function can be called before the data is populated
+  // to set indices_switched_to_original, which is not exposed in Python.
+  if (nreflections == 0) {
+    indices_switched_to_original = true;
+    return true;
+  }
+  const Column* misym_col = column_with_label("M/ISYM");
+  if (misym_col == nullptr || misym_col->type != 'Y' || misym_col->idx < 3)
+    return false;
+  const size_t misym_idx = misym_col->idx;
+  const size_t stride = columns.size();
+  // inverses of all symops, in the same order as symops
+  std::vector<Op> inverses;
+  inverses.reserve(symops.size());
+  for (const Op& symop : symops)
+    inverses.push_back(symop.inverse());
+  for (size_t row = 0; row + misym_idx < data.size(); row += stride) {
+    // the lowest 8 bits hold ISYM
+    const int isym = static_cast<int>(data[row + misym_idx]) & 0xFF;
+    const Op& inverse_op = inverses.at((isym - 1) / 2);
+    Miller hkl = inverse_op.apply_to_hkl(get_hkl(row));
+    // odd ISYM keeps the sign, even ISYM negates the index
+    const int sign = (isym % 2 != 0) ? 1 : -1;
+    data[row] = static_cast<float>(sign * hkl[0]);
+    data[row + 1] = static_cast<float>(sign * hkl[1]);
+    data[row + 2] = static_cast<float>(sign * hkl[2]);
+  }
+  indices_switched_to_original = true;
+  return true;
+}
+
+/// Reverts switch_to_original_hkl(): moves the index of each row with
+/// UnmergedHklMover and stores the returned ISYM in the low byte of M/ISYM.
+/// Returns false (changing nothing) if the indices are not currently switched
+/// or if the M/ISYM column or the space group is missing.
+/// Fails if the data has not been read.
+bool Mtz::switch_to_asu_hkl() {
+  if (!indices_switched_to_original)
+    return false;
+  if (!has_data())
+    fail("switch_to_asu_hkl(): data not read yet");
+  const Column* misym_col = column_with_label("M/ISYM");
+  const bool usable = misym_col != nullptr && misym_col->type == 'Y' &&
+                      misym_col->idx >= 3 && spacegroup;
+  if (!usable)
+    return false;
+  const size_t misym_idx = misym_col->idx;
+  const size_t stride = columns.size();
+  UnmergedHklMover mover(spacegroup);
+  for (size_t row = 0; row + misym_idx < data.size(); row += stride) {
+    Miller hkl = get_hkl(row);
+    const int isym = mover.move_to_asu(hkl);  // hkl is modified in place
+    set_hkl(row, hkl);
+    // keep the bits above the lowest byte, replace the lowest byte with isym
+    const int upper_bits = (int)data[row + misym_idx] & ~0xff;
+    data[row + misym_idx] = float(upper_bits | isym);
+  }
+  indices_switched_to_original = false;
+  return true;
+}
+
+/// Reads an MTZ file that may be gzip-compressed.
+/// A std::runtime_error raised while reading is re-raised through fail()
+/// with ": <path>" appended to the original message.
+void Mtz::read_file_gz(const std::string& path, bool with_data) {
+  try {
+    read_input(MaybeGzipped(path), with_data);
+  } catch (const std::runtime_error& err) {
+    // the path is appended to the error like in read_file()
+    // (but shouldn't the path go first?)
+    std::string message = err.what();
+    message += ": ";
+    message += path;
+    fail(message);
+  }
+}
+
+/// Returns row numbers (0-based) ordered by the values in the first
+/// `use_first` columns, compared column by column. The sort is stable:
+/// rows with equal keys keep their relative order. The data is not modified.
+/// Fails if there is no data or if use_first is not in [1, number of columns).
+std::vector<int> Mtz::sorted_row_indices(int use_first) const {
+  if (!has_data())
+    fail("No data.");
+  if (use_first <= 0 || use_first >= (int) columns.size())
+    fail("Wrong use_first arg in Mtz::sort.");
+  std::vector<int> indices(nreflections);
+  for (int i = 0; i != nreflections; ++i)
+    indices[i] = i;
+  // Offsets into `data` are computed in size_t: row * ncolumns can exceed
+  // the range of int for large files.
+  const size_t stride = columns.size();
+  const size_t n_keys = (size_t) use_first;
+  std::stable_sort(indices.begin(), indices.end(), [&](int i, int j) {
+    const size_t a = (size_t) i * stride;
+    const size_t b = (size_t) j * stride;
+    for (size_t n = 0; n < n_keys; ++n)
+      if (data[a+n] != data[b+n])
+        return data[a+n] < data[b+n];
+    return false;
+  });
+  return indices;
+}
+
+/// Sorts the rows by the first `use_first` columns (see sorted_row_indices())
+/// and records the sort keys in sort_order.
+/// Returns false if the rows were already in order (data is left untouched),
+/// true if the data has been reordered.
+bool Mtz::sort(int use_first) {
+  std::vector<int> indices = sorted_row_indices(use_first);
+  sort_order = {{0, 0, 0, 0, 0}};
+  // sort_order has a fixed number of slots; with more sort keys than slots
+  // only the leading ones are recorded (writing past the end would be UB)
+  const int n_recorded = std::min(use_first, (int) sort_order.size());
+  for (int i = 0; i < n_recorded; ++i)
+    sort_order[i] = i + 1;
+  if (std::is_sorted(indices.begin(), indices.end()))
+    return false;
+  std::vector<float> new_data(data.size());
+  size_t w = columns.size();
+  // copy whole rows into their new positions
+  for (size_t i = 0; i != indices.size(); ++i)
+    std::memcpy(&new_data[i * w], &data[indices[i] * w], w * sizeof(float));
+  data.swap(new_data);
+  return true;
+}
+
+/// Inserts a new column at position `pos` and returns a reference to it.
+/// \param dataset_id  negative value selects the id of the last dataset
+/// \param pos         negative value appends the column at the end
+/// \param expand_data if true, expand_data_rows(1, pos) is called so that
+///                    the data gets a slot for the new column
+/// Fails if there are no datasets or if pos is past the end.
+Mtz::Column& Mtz::add_column(const std::string& label, char type,
+                             int dataset_id, int pos, bool expand_data) {
+  if (datasets.empty())
+    fail("No datasets.");
+  if (dataset_id >= 0)
+    dataset(dataset_id);  // check that such a dataset exists
+  else
+    dataset_id = datasets.back().id;
+  const int n_cols = (int) columns.size();
+  if (pos > n_cols)
+    fail("Requested column position after the end.");
+  if (pos < 0)
+    pos = n_cols;
+  columns.emplace(columns.begin() + pos);
+  // columns after the inserted one moved by one position
+  for (size_t k = (size_t) pos + 1; k < columns.size(); ++k)
+    columns[k].idx++;
+  Column& added = columns[pos];
+  added.idx = pos;
+  added.parent = this;
+  added.label = label;
+  added.type = type;
+  added.dataset_id = dataset_id;
+  if (expand_data)
+    expand_data_rows(1, pos);
+  return added;
+}
+
+
+namespace {  // helper functions for copying, replacing and removing columns
+
+// Fails (with `msg` as the message prefix) unless mtz has data and
+// a column with 0-based index idx.
+void check_column(const Mtz& mtz, size_t idx, const char* msg) {
+  if (!mtz.has_data()) {
+    fail(msg, ": data not read yet");
+  }
+  const size_t n_columns = mtz.columns.size();
+  if (n_columns <= idx) {
+    fail(msg, ": no column with 0-based index ", std::to_string(idx));
+  }
+}
+
+// Checks that src_col (a column of mtz) is followed by trailing_cols.size()
+// columns and that their labels match trailing_cols; an empty string in
+// trailing_cols matches any label. Fails if the data of mtz was not read,
+// if there are too few columns, or if a label differs.
+void check_trailing_cols(const Mtz& mtz, const Mtz::Column& src_col,
+                         const std::vector<std::string>& trailing_cols) {
+  assert(src_col.parent == &mtz);
+  if (!mtz.has_data())
+    fail("data in source mtz not read yet");
+  if (src_col.idx + trailing_cols.size() >= mtz.columns.size())
+    fail("Not enough columns after " + src_col.label);
+  for (size_t i = 0; i < trailing_cols.size(); ++i) {
+    const std::string& expected = trailing_cols[i];
+    const std::string& found = mtz.columns[src_col.idx + i + 1].label;
+    // report the label that was actually found, not the label of src_col
+    if (!expected.empty() && expected != found)
+      fail("expected trailing column ", expected, ", found ", found);
+  }
+}
+
+// Overwrites 1 + trailing_cols.size() consecutive columns of mtz, starting
+// at dest_idx, with src_col and the columns that follow it: both the column
+// properties (type, label, min/max, source, dataset id) and the values.
+// If src_col belongs to another Mtz, values are copied only for rows whose
+// hkl is found in both files. No checks are made here.
+void do_replace_column(Mtz& mtz, size_t dest_idx, const Mtz::Column& src_col,
+                       const std::vector<std::string>& trailing_cols) {
+  const Mtz* src_mtz = src_col.parent;
+  const size_t src_idx = src_col.idx;
+  const size_t n_copied = trailing_cols.size() + 1;
+  // column properties
+  for (size_t i = 0; i < n_copied; ++i) {
+    const Mtz::Column& from = src_mtz->columns[src_idx + i];
+    Mtz::Column& to = mtz.columns[dest_idx + i];
+    to.type = from.type;
+    to.label = from.label;
+    to.min_value = from.min_value;
+    to.max_value = from.max_value;
+    to.source = from.source;
+    to.dataset_id = from.dataset_id;
+  }
+  // values
+  if (src_mtz == &mtz) {
+    // same file: rows correspond one-to-one
+    const size_t stride = mtz.columns.size();
+    for (size_t row = 0; row < mtz.data.size(); row += stride)
+      for (size_t i = 0; i < n_copied; ++i)
+        mtz.data[row + dest_idx + i] = mtz.data[row + src_idx + i];
+    return;
+  }
+  // different files: walk both lists of rows in sorted order and copy
+  // the values where the indices are equal (cf. for_matching_reflections())
+  const std::vector<int> dst_order = mtz.sorted_row_indices();
+  const std::vector<int> src_order = src_mtz->sorted_row_indices();
+  const size_t dst_stride = mtz.columns.size();
+  const size_t src_stride = src_mtz->columns.size();
+  size_t d = 0;
+  size_t s = 0;
+  while (d != dst_order.size() && s != src_order.size()) {
+    const size_t dst_offset = dst_order[d] * dst_stride;
+    const size_t src_offset = src_order[s] * src_stride;
+    Miller dst_hkl = mtz.get_hkl(dst_offset);
+    Miller src_hkl = src_mtz->get_hkl(src_offset);
+    if (dst_hkl == src_hkl) {
+      for (size_t i = 0; i < n_copied; ++i)
+        mtz.data[dst_offset + dest_idx + i] = src_mtz->data[src_offset + src_idx + i];
+      ++d;
+      ++s;
+    } else if (dst_hkl < src_hkl) {
+      ++d;
+    } else {
+      ++s;
+    }
+  }
+}
+
+} // anonymous namespace
+
+/// Replaces the column at dest_idx (and the trailing_cols.size() columns
+/// after it) with src_col and the columns that follow it in its Mtz.
+/// Returns a reference to the column at dest_idx.
+/// Fails if the source or destination columns are missing, if the trailing
+/// labels do not match, or if data was not read.
+Mtz::Column& Mtz::replace_column(size_t dest_idx, const Mtz::Column& src_col,
+                                 const std::vector<std::string>& trailing_cols) {
+  const Mtz& src_mtz = *src_col.parent;
+  check_trailing_cols(src_mtz, src_col, trailing_cols);
+  // the last destination column must exist, too
+  const size_t last_dest_idx = dest_idx + trailing_cols.size();
+  check_column(*this, last_dest_idx, "replace_column()");
+  do_replace_column(*this, dest_idx, src_col, trailing_cols);
+  return columns[dest_idx];
+}
+
+/// Inserts 1 + trailing_cols.size() new columns at dest_idx (negative
+/// dest_idx means: at the end) and fills them with src_col and the columns
+/// that follow it. src_col may belong to this or to another Mtz.
+/// Returns a reference to the first of the new columns.
+Mtz::Column& Mtz::copy_column(int dest_idx, const Mtz::Column& src_col,
+                              const std::vector<std::string>& trailing_cols) {
+  // check input consistency
+  if (!has_data())
+    fail("copy_column(): data not read yet");
+  check_trailing_cols(*src_col.parent, src_col, trailing_cols);
+  const int n_new = 1 + (int)trailing_cols.size();
+  if (dest_idx < 0)
+    dest_idx = (int) columns.size();
+  // If src_col is from this Mtz the reference may get invalidated when
+  // columns are added, so remember where the source will be afterwards.
+  int src_idx_after = -1;
+  if (src_col.parent == this) {
+    src_idx_after = (int) src_col.idx;
+    if (src_idx_after >= dest_idx)
+      src_idx_after += n_new;
+  }
+  // add new columns (without data), then make room for them in all rows
+  for (int pos = dest_idx; pos < dest_idx + n_new; ++pos)
+    add_column("", ' ', -1, pos, false);
+  expand_data_rows(1 + trailing_cols.size(), dest_idx);
+  // copy the data
+  const Column* source = &src_col;
+  if (src_idx_after >= 0)
+    source = &columns[src_idx_after];
+  // most of the work (hkl-based row matching and data copying) is done here:
+  do_replace_column(*this, dest_idx, *source, trailing_cols);
+  return columns[dest_idx];
+}
+
+/// Removes the column with 0-based index idx together with its values.
+/// Fails if the data was not read or if there is no such column.
+void Mtz::remove_column(size_t idx) {
+  check_column(*this, idx, "remove_column()");
+  auto next = columns.erase(columns.begin() + idx);
+  // columns that followed the removed one moved by one position
+  for (; next != columns.end(); ++next)
+    --next->idx;
+  const size_t new_width = columns.size();
+  vector_remove_column(data, new_width, idx);
+  assert(new_width * nreflections == data.size());
+}
+
+
+// Formats one header record into `buf`, pads it with spaces to 80 characters
+// and passes it to `write` (both names must be in scope at the point of use).
+#define WRITE(...) do { \
+    int len = snprintf_z(buf, 81, __VA_ARGS__); \
+    if (len < 80) \
+      std::memset(buf + len, ' ', 80 - len); \
+    if (write(buf, 80, 1) != 1) \
+      sys_fail("Writing MTZ file failed"); \
+  } while(0)
+
+/// Serializes the file (80-byte initial record, reflection data, header
+/// records, history, batch headers, appended text) through `write`.
+/// \param write callable with the signature of std::fwrite without the
+///              stream argument: write(ptr, size, nmemb) returning the number
+///              of items written.
+/// Fails if there is no data or no space group, or if `write` reports
+/// fewer items than requested.
+template<typename Write>
+void Mtz::write_to_stream(Write write) const {
+  // uses: data, spacegroup, nreflections, batches, cell, sort_order,
+  //       valm, columns, datasets, history
+  if (!has_data())
+    fail("Cannot write Mtz which has no data");
+  if (!spacegroup)
+    fail("Cannot write Mtz which has no space group");
+  char buf[81] = {'M', 'T', 'Z', ' ', '\0'};
+  // The header position is stored as int32; if it does not fit,
+  // -1 is stored there and the real value goes to the 64-bit field.
+  std::int64_t real_header_start = (int64_t) columns.size() * nreflections + 21;
+  std::int32_t header_start = (int32_t) real_header_start;
+  if (real_header_start > std::numeric_limits<int32_t>::max()) {
+    header_start = -1;
+  } else {
+    real_header_start = 0;
+  }
+  std::memcpy(buf + 4, &header_start, 4);
+  std::int32_t machst = is_little_endian() ? 0x00004144 : 0x11110000;
+  std::memcpy(buf + 8, &machst, 4);
+  std::memcpy(buf + 12, &real_header_start, 8);
+  if (write(buf, 80, 1) != 1 ||
+      write(data.data(), 4, data.size()) != data.size())
+    fail("Writing MTZ file failed");
+  WRITE("VERS MTZ:V1.1");
+  WRITE("TITLE %s", title.c_str());
+  WRITE("NCOL %8zu %12d %8zu", columns.size(), nreflections, batches.size());
+  if (cell.is_crystal())
+    WRITE("CELL  %9.4f %9.4f %9.4f %9.4f %9.4f %9.4f",
+          cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma);
+  WRITE("SORT  %3d %3d %3d %3d %3d", sort_order[0], sort_order[1],
+        sort_order[2], sort_order[3], sort_order[4]);
+  GroupOps ops = spacegroup->operations();
+  char lat_type = spacegroup->ccp4_lattice_type();
+  WRITE("SYMINF %3d %2d %c %5d %*s'%c%s' PG%s",
+        ops.order(),               // number of symmetry operations
+        (int) ops.sym_ops.size(),  // number of primitive operations
+        lat_type,                  // lattice type
+        spacegroup->ccp4,          // space group number
+        20 - (int) std::strlen(spacegroup->hm), "",
+        lat_type,                  // space group name (first letter)
+        spacegroup->hm + 1,        // space group name (the rest)
+        spacegroup->point_group_hm()); // point group name
+  // If we have symops that are the same as spacegroup->operations(),
+  // write symops to preserve the order of SYMM records.
+  if (!symops.empty() && ops.is_same_as(split_centering_vectors(symops)))
+    for (Op op : symops)
+      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
+  else
+    for (Op op : ops)
+      WRITE("SYMM %s", to_upper(op.triplet()).c_str());
+  auto reso = calculate_min_max_1_d2();
+  WRITE("RESO %-20.12f %-20.12f", reso[0], reso[1]);
+  if (std::isnan(valm))
+    WRITE("VALM NAN");
+  else
+    WRITE("VALM %f", valm);
+  // formats a number with 9 decimal places, truncated to at most 17 characters
+  auto format17 = [](float f) {
+    char buffer[18];
+    int len = snprintf_z(buffer, 18, "%.9f", f);
+    return std::string(buffer, len > 0 ? std::min(len, 17) : 0);
+  };
+  for (const Column& col : columns) {
+    auto minmax = calculate_min_max_disregarding_nans(col.begin(), col.end());
+    const char* label = !col.label.empty() ? col.label.c_str() : "_";
+    WRITE("COLUMN %-30s %c %17s %17s %4d",
+          label, col.type,
+          format17(minmax[0]).c_str(), format17(minmax[1]).c_str(),
+          col.dataset_id);
+    if (!col.source.empty())
+      WRITE("COLSRC %-30s %-36s  %4d", label, col.source.c_str(), col.dataset_id);
+  }
+  WRITE("NDIF %8zu", datasets.size());
+  for (const Dataset& ds : datasets) {
+    WRITE("PROJECT %7d %s", ds.id, ds.project_name.c_str());
+    WRITE("CRYSTAL %7d %s", ds.id, ds.crystal_name.c_str());
+    WRITE("DATASET %7d %s", ds.id, ds.dataset_name.c_str());
+    const UnitCell& uc = (ds.cell.is_crystal() && ds.cell.a > 0 ? ds.cell : cell);
+    WRITE("DCELL %9d %10.4f%10.4f%10.4f%10.4f%10.4f%10.4f",
+          ds.id, uc.a, uc.b, uc.c, uc.alpha, uc.beta, uc.gamma);
+    WRITE("DWAVEL %8d %10.5f", ds.id, ds.wavelength);
+  }
+  // BATCH records: the keyword followed by up to 12 batch numbers,
+  // 6 characters each (6 + 12*6 = 78 characters of the 80-character record).
+  // `pos` is the position in buf where the next number goes.
+  int pos = 0;
+  for (const Batch& batch : batches) {
+    if (pos == 0) {
+      std::memcpy(buf, "BATCH ", 6);  // NOLINT(bugprone-not-null-terminated-result)
+      pos = 6;
+    }
+    snprintf_z(buf + pos, 7, "%6d", batch.number);
+    pos += 6;  // advance past the number, so the padding below preserves it
+    if (pos > 72 || &batch == &batches.back()) {
+      std::memset(buf + pos, ' ', 80 - pos);
+      if (write(buf, 80, 1) != 1)
+        fail("Writing MTZ file failed");
+      pos = 0;
+    }
+  }
+  WRITE("END");
+  if (!history.empty()) {
+    // According to mtzformat.html the file can have only up to 30 history
+    // lines, but we don't enforce it here.
+    WRITE("MTZHIST %3zu", history.size());
+    for (const std::string& line : history)
+      WRITE("%s", line.c_str());
+  }
+  if (!batches.empty()) {
+    WRITE("MTZBATS");
+    for (const Batch& batch : batches) {
+      // keep the numbers the same as in files written by libccp4
+      WRITE("BH %8d %7zu %7zu %7zu",
+            batch.number, batch.ints.size() + batch.floats.size(),
+            batch.ints.size(), batch.floats.size());
+      WRITE("TITLE %.70s", batch.title.c_str());
+      if (batch.ints.size() != 29 || batch.floats.size() != 156)
+        fail("wrong size of binaries batch headers");
+      // binary part of the batch header; failures are reported like for
+      // all the other writes
+      if (write(batch.ints.data(), 4, batch.ints.size()) != batch.ints.size() ||
+          write(batch.floats.data(), 4, batch.floats.size()) != batch.floats.size())
+        fail("Writing MTZ file failed");
+      WRITE("BHCH  %7.7s %7.7s %7.7s",
+            batch.axes.size() > 0 ? batch.axes[0].c_str() : "",
+            batch.axes.size() > 1 ? batch.axes[1].c_str() : "",
+            batch.axes.size() > 2 ? batch.axes[2].c_str() : "");
+    }
+  }
+  WRITE("MTZENDOFHEADERS");
+  if (!appended_text.empty()) {
+    if (write(appended_text.data(), appended_text.size(), 1) != 1)
+      fail("Writing MTZ file failed");
+  }
+}
+
+#undef WRITE
+
+/// Writes the file to a C stream using std::fwrite.
+void Mtz::write_to_cstream(std::FILE* stream) const {
+  auto fwrite_to_stream = [stream](const void* src, size_t item_size, size_t item_count) {
+    return std::fwrite(src, item_size, item_count, stream);
+  };
+  write_to_stream(fwrite_to_stream);
+}
+
+/// Stores the serialized file in str (previous content is replaced).
+void Mtz::write_to_string(std::string& str) const {
+  // Two passes: the first one only determines the size, so the string is
+  // allocated once, without re-allocations and with minimal memory usage.
+  // It hasn't been benchmarked against a single-pass writing.
+  str.resize(size_to_write());
+  write_to_buffer(&str[0], str.size());
+}
+
+/// Writes the file to the given path (opened in "wb" mode).
+/// A std::runtime_error raised while writing is re-raised through fail()
+/// with ": <path>" appended to the original message.
+void Mtz::write_to_file(const std::string& path) const {
+  fileptr_t out = file_open(path.c_str(), "wb");
+  try {
+    write_to_cstream(out.get());
+  } catch (const std::runtime_error& err) {
+    std::string message = err.what();
+    message += ": ";
+    message += path;
+    fail(message);
+  }
+}
+
+/// Returns the number of bytes that write_to_stream() would output.
+/// Nothing is written: the sizes of all chunks are only summed up.
+size_t Mtz::size_to_write() const {
+  size_t total = 0;
+  auto count_only = [&total](const void*, size_t item_size, size_t item_count) {
+    total += item_size * item_count;
+    return item_count;  // pretend that everything was written
+  };
+  write_to_stream(count_only);
+  return total;
+}
+
+/// Serializes the file into the memory starting at buf.
+/// \param maxlen capacity of the buffer in bytes
+/// \return the number of bytes written
+/// Fails when the output would not fit in maxlen bytes; the check is made
+/// before each chunk is copied.
+size_t Mtz::write_to_buffer(char* buf, size_t maxlen) const {
+  size_t used = 0;
+  char* cursor = buf;
+  auto append = [&](const void* src, size_t item_size, size_t item_count) {
+    const size_t chunk = item_size * item_count;
+    used += chunk;
+    if (used > maxlen)
+      fail("Mtz::write_to_buffer: size too small");
+    std::memcpy(cursor, src, chunk);
+    cursor += chunk;
+    return item_count;
+  };
+  write_to_stream(append);
+  return used;
+}
+
+} // namespace gemmi
@@ -0,0 +1,68 @@
+// Copyright 2017 Global Phasing Ltd.
+
+#include <gemmi/sprintf.hpp>
+#include <stdarg.h>  // for va_list
+
+#ifdef USE_STD_SNPRINTF  // useful for benchmarking and testing only
+# include <cstdio>
+# include <algorithm> // for min
+#else
+# define STB_SPRINTF_IMPLEMENTATION
+# define STB_SPRINTF_STATIC
+# define STB_SPRINTF_NOUNALIGNED 1
+// Making functions from stb_sprintf static may trigger warnings.
+# if defined(__GNUC__)
+#  pragma GCC diagnostic ignored "-Wunused-function"
+# endif
+# if defined(__clang__)
+#  pragma clang diagnostic ignored "-Wunused-function"
+# endif
+
+// To use system stb_sprintf.h (not recommended, but some Linux distros
+// don't like bundled libraries) define GEMMI_USE_SYSTEM_STB or remove
+// third_party/stb_sprintf.h.
+# if defined(__has_include)
+#  if !__has_include("../third_party/stb_sprintf.h")
+#   define GEMMI_USE_SYSTEM_STB 1
+#  endif
+# endif
+# ifdef GEMMI_USE_SYSTEM_STB
+#  pragma message("Using system stb_sprintf.h, not the bundled one. It may not work.")
+#  include "stb/stb_sprintf.h"
+# else
+#  include "../third_party/stb_sprintf.h"
+# endif
+#endif  // USE_STD_SNPRINTF
+
+namespace gemmi {
+
+// We copy functions from sprintf.h only to have them declared with GEMMI_DLL.
+// Like sprintf(): formats into buf without any bound checking and returns
+// the value reported by the underlying implementation
+// (stb_sprintf, or std::vsprintf if USE_STD_SNPRINTF is defined).
+int sprintf_z(char *buf, char const *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+#ifdef USE_STD_SNPRINTF
+  const int written = std::vsprintf(buf, fmt, args);
+#else
+  const int written = STB_SPRINTF_DECORATE(vsprintfcb)(0, 0, buf, fmt, args);
+#endif
+  va_end(args);
+  return written;
+}
+
+// Like snprintf(): formats at most count-1 characters into buf and
+// zero-terminates the result. Returns the value reported by the underlying
+// implementation (stb_sprintf, or std::vsnprintf if USE_STD_SNPRINTF is
+// defined).
+int snprintf_z(char *buf, int count, char const *fmt, ...) {
+  int result;
+  va_list va;
+  va_start(va, fmt);
+#ifdef USE_STD_SNPRINTF
+  result = std::vsnprintf(buf, count, fmt, va);
+  // stbsp_snprintf always returns a zero-terminated string.
+  // Here, nothing can be stored when count is 0, and a negative result
+  // (error) must not be used as an index - terminate at the start instead.
+  if (count > 0)
+    buf[result < 0 ? 0 : std::min(result, count-1)] = '\0';
+#else
+  result = STB_SPRINTF_DECORATE(vsnprintf)(buf, count, fmt, va);
+#endif
+  va_end(va);
+  return result;
+}
+
+}  // namespace gemmi
@@ -0,0 +1,306 @@
+// Copyright 2023 Global Phasing Ltd.
+
+#include <gemmi/xds_ascii.hpp>
+#include <gemmi/atof.hpp>      // for fast_from_chars
+#include <gemmi/atox.hpp>      // for skip_blank, read_word
+#include <gemmi/util.hpp>      // for trim_str
+#include <gemmi/gz.hpp>
+#include <gemmi/math.hpp>
+
+namespace gemmi {
+
+/// For each iset, computes from the reflections that refer to it:
+/// reflection_count, frame_number_min, frame_number_max and frame_count
+/// (the number of distinct frame numbers). For an iset without reflections
+/// frame_count is left unchanged.
+void XdsAscii::gather_iset_statistics() {
+  for (Iset& iset : isets) {
+    iset.frame_number_min = INT_MAX;
+    iset.frame_number_max = 0;
+    // start counting from zero, so the result doesn't depend on the previous
+    // value (and calling this function again doesn't accumulate the counts)
+    iset.reflection_count = 0;
+    for (const XdsAscii::Refl& refl : data)
+      if (refl.iset == iset.id) {
+        ++iset.reflection_count;
+        int frame = refl.frame();
+        iset.frame_number_min = std::min(iset.frame_number_min, frame);
+        iset.frame_number_max = std::max(iset.frame_number_max, frame);
+      }
+    // no reflections (or no frame number above 0) - nothing more to count
+    if (iset.frame_number_min > iset.frame_number_max)
+      continue;
+    // one flag per frame number in the [min, max] range
+    std::vector<uint8_t> frames(iset.frame_number_max - iset.frame_number_min + 1);
+    for (const XdsAscii::Refl& refl : data)
+      if (refl.iset == iset.id)
+        frames[refl.frame() - iset.frame_number_min] = 1;
+    iset.frame_count = 0;
+    for (uint8_t f : frames)
+      iset.frame_count += f;
+  }
+}
+
+/// Rescales iobs, sigma and rlp of all reflections, replacing the
+/// polarization correction for unpolarized beam with one for fraction \a p.
+/// Based on Phil Evans' notes and the literature, see:
+/// https://github.com/project-gemmi/gemmi/discussions/248
+/// \param p is defined as in XDS (p=0.5 for unpolarized beam).
+/// \param normal polarization plane normal; it doesn't need to be normalized
+///        or exactly orthogonal to the beam, but must not be (anti)parallel
+///        to it (within 5 degrees).
+/// Fails if the unit cell axes are not known.
+void XdsAscii::apply_polarization_correction(double p, Vec3 normal) {
+  if (!has_cell_axes())
+    fail("unknown unit cell axes");
+  Mat33 UB = cell_axes.inverse();
+  Vec3 rot_axis = get_rotation_axis();
+  Vec3 s0_dir = get_s0_direction();
+  normal = normal.normalized();
+  // The polarization normal is expected to be approx. orthogonal to the beam.
+  // dot() is the same as cos_angle() for normalized vectors.
+  // The absolute value is used because the correction below doesn't depend
+  // on the sign of the normal: a normal anti-parallel to the beam is as
+  // degenerate as a parallel one.
+  if (std::fabs(normal.dot(s0_dir)) > std::cos(rad(5.0)))
+    fail("polarization normal is far from orthogonal to the incident beam");
+  // make normal exactly orthogonal to the beam
+  normal = s0_dir.cross(normal).cross(s0_dir).normalized();
+  // wavevector
+  Vec3 s0 = s0_dir / wavelength;
+  double s0_m2 = 1. / s0.length_sq();  // s0^-2
+
+  for (Refl& refl : data) {
+    double phi = rad(rot_angle(refl));
+    Vec3 h(refl.hkl[0], refl.hkl[1], refl.hkl[2]);
+    Vec3 r0 = UB.multiply(h);
+    Vec3 r = rotate_about_axis(r0, rot_axis, phi);
+    Vec3 s = s0 + r;
+#if 0
+    double two_theta = s0.angle(s);
+    // 2d sin(theta) = lambda
+    double bragg_angle = std::asin(wavelength / (2 * unit_cell.calculate_d(refl.hkl)));
+    printf("(%d %d %d) two-theta %g %g\n",
+           refl.hkl[0], refl.hkl[1], refl.hkl[2], deg(two_theta), deg(2 * bragg_angle));
+#endif
+    // we should have |s| == |s0|, but just in case calculate it separately
+    double s_m2 = 1. / s.length_sq();
+    // 1 + cos^2(2theta) = 2 * correction for unpolarized beam
+    double t = 1 + sq(s.dot(s0)) * s_m2 * s0_m2;
+    double polariz_factor = (1 - 2*p) * (1 - sq(normal.dot(s)) * s_m2) + p * t;
+    // We assume that the XDS file has polarization correction applied,
+    // but for non-polarized beam. So we multiply intensities by P0=t/2
+    // and divide by a hopefully more accurate polarization factor.
+    double mult = 0.5 * t / polariz_factor;
+    refl.iobs *= mult;
+    refl.sigma *= mult;
+    refl.rlp *= mult;
+  }
+}
+
+namespace {
+
+// Checks if string `a` starts with the string literal `b`.
+// On success, *endptr is set to the first character of `a` after the prefix;
+// otherwise *endptr is left untouched.
+template<size_t N>
+bool starts_with_ptr(const char* a, const char (&b)[N], const char** endptr) {
+  constexpr size_t prefix_len = N - 1;  // N includes the terminating zero
+  const bool matched = std::strncmp(a, b, prefix_len) == 0;
+  if (matched)
+    *endptr = a + prefix_len;
+  return matched;
+}
+
+// The same as starts_with_ptr(), but applied to `a` after skip_blank().
+template<size_t N>
+bool starts_with_ptr_b(const char* a, const char (&b)[N], const char** endptr) {
+  const char* after_blanks = skip_blank(a);
+  return starts_with_ptr<N>(after_blanks, b, endptr);
+}
+
+// Parses a floating-point number from [start, end) into val.
+// Returns the pointer reported by the parser in its result.
+// On error fails with a message that includes `line`.
+inline const char* parse_number_into(const char* start, const char* end,
+                                     double& val, const char* line) {
+  const auto parsed = fast_from_chars(start, end, val);
+  const bool ok = (parsed.ec == std::errc());
+  if (!ok)
+    fail("failed to parse a number in:\n", line);
+  return parsed.ptr;
+}
+
+// Parses N consecutive numbers from [start, end) into a C array.
+// Fails (with `line` in the message) if any of them cannot be parsed.
+template<size_t N>
+void parse_numbers_into_array(const char* start, const char* end,
+                              double (&arr)[N], const char* line) {
+  const char* cursor = start;
+  for (size_t i = 0; i < N; ++i)
+    cursor = parse_number_into(cursor, end, arr[i], line);
+}
+
+// Parses N consecutive numbers from [start, end) into std::array.
+// Fails (with `line` in the message) if any of them cannot be parsed.
+template<size_t N>
+void parse_numbers_into_array(const char* start, const char* end,
+                              std::array<double,N>& arr, const char* line) {
+  const char* cursor = start;
+  for (size_t i = 0; i < N; ++i)
+    cursor = parse_number_into(cursor, end, arr[i], line);
+}
+
+// Parses three consecutive numbers from [start, end) into vec.x, vec.y and
+// vec.z. Fails (with `line` in the message) if any of them cannot be parsed.
+void parse_numbers_into_vec3(const char* start, const char* end,
+                             Vec3& vec, const char* line) {
+  const char* cursor = parse_number_into(start, end, vec.x, line);
+  cursor = parse_number_into(cursor, end, vec.y, line);
+  parse_number_into(cursor, end, vec.z, line);
+}
+
+
+} // anonymous namespace
+
+/// Reads an XDS_ASCII or INTEGRATE.HKL file from line_reader.
+/// Lines starting with '!' are header items; recognized ones are stored in
+/// members of this object. All other lines are parsed as reflections and
+/// appended to `data`. Returns when !END_OF_DATA is read.
+/// \param source name of the input, stored in source_path and used in
+///               error messages
+/// Fails if the input is empty, if the first line is not one of the two
+/// expected headers, if a recognized item or a data line cannot be parsed,
+/// or if the input ends before !END_OF_DATA.
+void XdsAscii::read_stream(AnyStream& line_reader, const std::string& source) {
+  source_path = source;
+  // default (INTEGRATE.HKL); may be changed below, when the item
+  // NUMBER_OF_ITEMS_IN_EACH_DATA_RECORD is read
+  read_columns = 12;
+  char line[256];
+  size_t len0 = line_reader.copy_line(line, 255);
+  if (len0 == 0)
+    fail("empty file");
+  // 1-based position of the ISET column in data lines, 0 = not declared
+  int iset_col = 0;
+  const char xds_ascii_header[] = "!FORMAT=XDS_ASCII    MERGE=";
+  // first character of the MERGE= value; stays '\0' if the first line
+  // doesn't start with xds_ascii_header
+  char xds_ascii_type = '\0';
+  if (starts_with(line, xds_ascii_header)) {
+    size_t n = sizeof(xds_ascii_header)-1;
+    xds_ascii_type = line[n];
+    // !FORMAT=XDS_ASCII    MERGE=FALSE    FRIEDEL'S_LAW=
+    // (the check below matches only if the MERGE= value is 5 characters
+    // wide; line[50] is then the first character after FRIEDEL'S_LAW=)
+    if (strncmp(line + n + 5, "    FRIEDEL'S_LAW=", 18) == 0)
+      friedels_law = line[50];
+  }
+  if (!xds_ascii_type && !starts_with(line, "!OUTPUT_FILE=INTEGRATE.HKL"))
+    fail("not an XDS_ASCII nor INTEGRATE.HKL file: " + source_path);
+  // points to the text after the recognized keyword (right-hand side)
+  const char* rhs;
+  while (size_t len = line_reader.copy_line(line, 255)) {
+    if (line[0] == '!') {
+      // header item; unrecognized items are silently skipped
+      if (starts_with_ptr(line+1, "Generated by ", &rhs)) {
+        generated_by = read_word(rhs, &rhs);
+        version_str = trim_str(rhs);
+      } else if (starts_with_ptr(line+1, "SPACE_GROUP_NUMBER=", &rhs)) {
+        spacegroup_number = simple_atoi(rhs);
+      } else if (starts_with_ptr(line+1, "UNIT_CELL_", &rhs)) {
+        if (starts_with_ptr(rhs, "CONSTANTS=", &rhs)) {  // UNIT_CELL_CONSTANTS=
+          parse_numbers_into_array(rhs, line+len, cell_constants, line);
+        } else if (starts_with_ptr(rhs, "A-AXIS=", &rhs)) { // UNIT_CELL_A-AXIS=
+          parse_numbers_into_array(rhs, line+len, cell_axes.a[0], line);
+        } else if (starts_with_ptr(rhs, "B-AXIS=", &rhs)) { // UNIT_CELL_B-AXIS=
+          parse_numbers_into_array(rhs, line+len, cell_axes.a[1], line);
+        } else if (starts_with_ptr(rhs, "C-AXIS=", &rhs)) { // UNIT_CELL_C-AXIS=
+          parse_numbers_into_array(rhs, line+len, cell_axes.a[2], line);
+        }
+      } else if (starts_with_ptr(line+1, "REFLECTING_RANGE_E.S.D.=", &rhs)) {
+        auto result = fast_from_chars(rhs, line+len, reflecting_range_esd);
+        if (result.ec != std::errc())
+          fail("failed to parse mosaicity:\n", line);
+      } else if (starts_with_ptr(line+1, "X-RAY_WAVELENGTH=", &rhs)) {
+        auto result = fast_from_chars(rhs, line+len, wavelength);
+        if (result.ec != std::errc())
+          fail("failed to parse wavelength:\n", line);
+      } else if (starts_with_ptr(line+1, "INCIDENT_BEAM_DIRECTION=", &rhs)) {
+        parse_numbers_into_vec3(rhs, line+len, incident_beam_dir, line);
+      } else if (starts_with_ptr(line+1, "OSCILLATION_RANGE=", &rhs)) {
+        auto result = fast_from_chars(rhs, line+len, oscillation_range);
+        if (result.ec != std::errc())
+          fail("failed to parse:\n", line);
+      } else if (starts_with_ptr(line+1, "ROTATION_AXIS=", &rhs)) {
+        parse_numbers_into_vec3(rhs, line+len, rotation_axis, line);
+      } else if (starts_with_ptr(line+1, "STARTING_ANGLE=", &rhs)) {
+        auto result = fast_from_chars(rhs, line+len, starting_angle);
+        if (result.ec != std::errc())
+          fail("failed to parse:\n", line);
+      } else if (starts_with_ptr(line+1, "STARTING_FRAME=", &rhs)) {
+        starting_frame = simple_atoi(rhs);
+      } else if (starts_with_ptr(line+1, " ISET= ", &rhs)) {
+        // per-iset item: the iset id followed by one KEYWORD=value
+        const char* endptr;
+        int id = simple_atoi(rhs, &endptr);
+        XdsAscii::Iset& iset = find_or_add_iset(id);
+        endptr = skip_blank(endptr);
+        if (starts_with_ptr(endptr, "INPUT_FILE=", &rhs)) {
+          iset.input_file = read_word(rhs);
+        } else if (starts_with_ptr(endptr, "X-RAY_WAVELENGTH=", &rhs)) {
+          double w;
+          auto result = fast_from_chars(rhs, line+len, w);
+          if (result.ec != std::errc())
+            fail("failed to parse iset wavelength:\n", line);
+          iset.wavelength = w;
+        } else if (starts_with_ptr(endptr, "UNIT_CELL_CONSTANTS=", &rhs)) {
+          parse_numbers_into_array(rhs, line+len, iset.cell_constants, line);
+        }
+      } else if (starts_with_ptr(line+1, "NX=", &rhs)) {
+        // NX= may be followed, in the same line, by NY=, QX= and QY=
+        const char* endptr;
+        nx = simple_atoi(rhs, &endptr);
+        if (starts_with_ptr_b(endptr, "NY=", &rhs))
+          ny = simple_atoi(rhs, &endptr);
+        if (starts_with_ptr_b(endptr, "QX=", &rhs))
+          endptr = parse_number_into(rhs, line+len, qx, line);
+        if (starts_with_ptr_b(endptr, "QY=", &rhs))
+          parse_number_into(rhs, line+len, qy, line);
+      } else if (starts_with_ptr(line+1, "ORGX=", &rhs)) {
+        // ORGX= may be followed, in the same line, by ORGY= and
+        // DETECTOR_DISTANCE=
+        const char* endptr = parse_number_into(rhs, line+len, orgx, line);
+        if (starts_with_ptr_b(endptr, "ORGY=", &rhs))
+          endptr = parse_number_into(rhs, line+len, orgy, line);
+        if (starts_with_ptr_b(endptr, "DETECTOR_DISTANCE=", &rhs))
+          parse_number_into(rhs, line+len, detector_distance, line);
+      } else if (starts_with_ptr(line+1, "NUMBER_OF_ITEMS_IN_EACH_DATA_RECORD=", &rhs)) {
+        int num = simple_atoi(rhs);
+        // INTEGRATE.HKL has read_columns=12, as set above
+        if (xds_ascii_type == 'T')  // merged file
+          read_columns = 5;
+        else if (generated_by == "XSCALE")
+          read_columns = 8;
+        else if (generated_by == "CORRECT")
+          read_columns = 11;
+        // check if the columns are what they always are
+        if (num < read_columns)
+          fail("expected ", std::to_string(read_columns), "+ columns, got:\n", line);
+        // The following line(s) of the header, which describe the columns,
+        // are consumed here.
+        if (generated_by == "INTEGRATE") {
+          line_reader.copy_line(line, 52);
+          if (!starts_with(line, "!H,K,L,IOBS,SIGMA,XCAL,YCAL,ZCAL,RLP,PEAK,CORR,MAXC"))
+            fail("unexpected column order in INTEGRATE.HKL");
+        } else {
+          const char* expected_columns[12] = {
+            "H=1", "K=2", "L=3", "IOBS=4", "SIGMA(IOBS)=5",
+            "XD=6", "YD=7", "ZD=8", "RLP=9", "PEAK=10", "CORR=11", "MAXC=12"
+          };
+          // one !ITEM_ line is expected for each of the columns to be read
+          for (int i = 0; i < read_columns; ++i) {
+            const char* col = expected_columns[i];
+            line_reader.copy_line(line, 42);
+            if (std::strncmp(line, "!ITEM_", 6) != 0 ||
+                std::strncmp(line+6, col, std::strlen(col)) != 0)
+              fail("column !ITEM_" + std::string(col), " not found.");
+          }
+        }
+      } else if (starts_with_ptr(line+1, "ITEM_ISET=", &rhs)) {
+        iset_col = simple_atoi(rhs);
+      } else if (starts_with(line+1, "END_OF_DATA")) {
+        // end of the file: if no iset was added while reading the header,
+        // add a single one (constructed from 1) with the global wavelength
+        if (isets.empty()) {
+          isets.emplace_back(1);
+          isets.back().wavelength = wavelength;
+        }
+        // refl.iset is checked as a 1-based number against the number of isets
+        for (XdsAscii::Refl& refl : data)
+          if (size_t(refl.iset - 1) >= isets.size())
+            fail("unexpected ITEM_ISET " + std::to_string(refl.iset));
+        return;
+      }
+    } else {
+      // data line: h k l followed by read_columns-3 floating-point numbers
+      data.emplace_back();
+      XdsAscii::Refl& r = data.back();
+      const char* p = line;
+      for (int i = 0; i < 3; ++i)
+        r.hkl[i] = simple_atoi(p, &p);
+      // NOTE: `result` is overwritten by each conversion; only the status
+      // of the last one is examined (after the nested ifs below).
+      auto result = fast_from_chars(p, line+len, r.iobs); // 4
+      result = fast_from_chars(result.ptr, line+len, r.sigma); // 5
+      if (read_columns >= 8) {
+        result = fast_from_chars(result.ptr, line+len, r.xd); // 6
+        result = fast_from_chars(result.ptr, line+len, r.yd); // 7
+        result = fast_from_chars(result.ptr, line+len, r.zd); // 8
+        if (read_columns >= 11) {
+          result = fast_from_chars(result.ptr, line+len, r.rlp); // 9
+          result = fast_from_chars(result.ptr, line+len, r.peak); // 10
+          result = fast_from_chars(result.ptr, line+len, r.corr); // 11
+          if (read_columns >= 12) {
+            result = fast_from_chars(result.ptr, line+len, r.maxc); // 12
+          } else {
+            r.maxc = 0;  // 12
+          }
+        } else {
+          r.rlp = r.peak = r.corr = r.maxc = 0;  // 9-11
+        }
+      } else {
+        r.xd = r.yd = r.zd = 0;  // 6-8
+      }
+      if (result.ec != std::errc())
+        fail("failed to parse data line:\n", line);
+      if (iset_col >= read_columns) {
+        // skip the columns between the last one read and the ISET column
+        const char* iset_ptr = result.ptr;
+        for (int j = read_columns+1; j < iset_col; ++j)
+          iset_ptr = skip_word(skip_blank(iset_ptr));
+        r.iset = simple_atoi(iset_ptr);
+      }
+    }
+  }
+  // the input ended without !END_OF_DATA
+  fail("incorrect or unfinished file: " + source_path);
+}
+
+// Reads an XDS_ASCII / INTEGRATE.HKL file (which may be gzipped) from the
+// given path and returns the parsed content.
+XdsAscii read_xds_ascii(const std::string& path) {
+  XdsAscii parsed;
+  parsed.read_input(MaybeGzipped(path));
+  return parsed;
+}
+
+}  // namespace gemmi
@@ -1,2 +0,0 @@
-ADD_LIBRARY(gemmi STATIC symmetry.cpp gemmi/symmetry.hpp gemmi/fail.hpp)
-TARGET_INCLUDE_DIRECTORIES(gemmi PUBLIC .)