Files
Jungfraujoch/tools/jfjoch_recompress.cpp
T
leonarski_f and Claude Opus 4.8 f261adf42a
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 12m47s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 13m3s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 13m36s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 13m32s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 13m43s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 8m12s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 11m18s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m26s
Build Packages / Generate python client (push) Successful in 15s
Build Packages / build:rpm (rocky8) (push) Successful in 12m46s
Build Packages / Create release (push) Skipped
Build Packages / Build documentation (push) Successful in 45s
Build Packages / build:rpm (rocky9) (push) Successful in 13m24s
Build Packages / DIALS test (push) Successful in 13m10s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 6m58s
Build Packages / XDS test (neggia plugin) (push) Successful in 5m53s
Build Packages / Unit tests (push) Successful in 59m15s
Build Packages / build:windows (push) Successful in 18m58s
tools: add jfjoch_recompress (bitshuffle/LZ4 -> bitshuffle/zstd)
New offline tool that re-compresses /entry/data/data of a _data_NNNNNN.h5 file
from bitshuffle/LZ4 to the standard bitshuffle/Zstd HDF5 filter. Every other
object (groups, datasets, attributes, the dataset's own attributes,
dims/dtype/chunking/block size) is reproduced unchanged.

It writes a fresh file - only /entry/data/data is re-encoded, every other object
is H5Ocopy'd verbatim - which then atomically replaces the original via rename().
This needs no h5repack (the new file has no leftover space) and is crash-safe
(the original is opened read-only until the rename). Frames are streamed one at a
time through the registered bitshuffle filter (decompress LZ4, compress Zstd), so
it is dtype-agnostic and never holds the whole dataset in memory.

Output is read by the standard bitshuffle+zstd HDF5 plugin (verified against the
hdf5plugin/DIALS libh5bshuf.so, which links libzstd and supports the zstd mode).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 15:24:30 +02:00

188 lines
8.8 KiB
C++

// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
// Re-compress /entry/data/data of a JUNGFRAUJOCH _data_NNNNNN.h5 file from bitshuffle/LZ4 to
// bitshuffle/Zstd (standard bitshuffle HDF5 filter, mode ZSTD). Every other object in the file
// (groups, datasets, attributes, the dataset's own attributes, dims/dtype/chunking/block size) is
// reproduced unchanged.
//
// The result is written to a fresh temporary file (only /entry/data/data is re-encoded; every other
// object is H5Ocopy'd verbatim, preserving its data, filters and attributes) which then atomically
// replaces the original. This needs no h5repack - the new file has no leftover space - and the
// original is opened read-only until the final rename, so a crash leaves it intact.
//
// Frames are streamed one at a time through the registered bitshuffle filter (decompress LZ4 on
// read, compress Zstd on write), so it is dtype-agnostic and never holds the whole dataset in memory.
#include <cstdio>
#include <cstdint>
#include <string>
#include <vector>
#include <hdf5.h>
#include "../writer/HDF5Objects.h" // RegisterHDF5Filter()
#include "../compression/bitshuffle/bshuf_h5filter.h" // BSHUF_H5FILTER, BSHUF_H5_COMPRESS_LZ4/ZSTD
namespace {
// Absolute HDF5 path of the dataset this tool re-encodes; it is the path opened, created and
// named in diagnostics throughout this file.
constexpr char DATA[] = "/entry/data/data";
// --- copy all attributes of one object verbatim ---
// Context passed to copy_attr() through the H5Aiterate2 op_data pointer: the object receiving
// the copies.
struct AttrCopy { hid_t dst; };

// H5Aiterate2 callback: re-creates attribute `name` of `src_obj` on AttrCopy::dst, using the
// attribute's own datatype and dataspace for the read, the create and the write.
//
// Returns 0 on success. On any HDF5 failure it prints the attribute name to stderr and returns
// -1, which stops the iteration (same convention as copy_child()). All handles opened here are
// closed on every path.
herr_t copy_attr(hid_t src_obj, const char *name, const H5A_info_t *, void *op) {
    const hid_t dst = static_cast<AttrCopy *>(op)->dst;

    const hid_t attr = H5Aopen(src_obj, name, H5P_DEFAULT);
    const hid_t type = attr < 0 ? H5I_INVALID_HID : H5Aget_type(attr);
    const hid_t space = attr < 0 ? H5I_INVALID_HID : H5Aget_space(attr);
    hid_t copy = H5I_INVALID_HID;
    bool ok = false;

    if (type >= 0 && space >= 0) {
        const hssize_t npoints = H5Sget_simple_extent_npoints(space);
        const size_t elem_size = H5Tget_size(type); // 0 signals failure
        if (npoints >= 0 && elem_size > 0) {
            std::vector<uint8_t> buf(static_cast<size_t>(npoints) * elem_size);
            // An attribute with zero elements has nothing to transfer: only create it.
            const bool has_data = !buf.empty();
            const bool read_ok = !has_data || H5Aread(attr, type, buf.data()) >= 0;
            if (read_ok) {
                copy = H5Acreate2(dst, name, type, space, H5P_DEFAULT, H5P_DEFAULT);
                ok = copy >= 0 && (!has_data || H5Awrite(copy, type, buf.data()) >= 0);
            }
            // Variable-length and reference elements point at memory the library allocated
            // during H5Aread; hand it back (a no-op for plain fixed-size types).
            if (has_data && read_ok) H5Treclaim(type, space, H5P_DEFAULT, buf.data());
        }
    }

    if (copy >= 0 && H5Aclose(copy) < 0) ok = false;
    if (space >= 0) H5Sclose(space);
    if (type >= 0) H5Tclose(type);
    if (attr >= 0) H5Aclose(attr);

    if (!ok) {
        fprintf(stderr, " attribute copy failed for %s\n", name);
        return -1;
    }
    return 0;
}
void copy_all_attrs(hid_t src_obj, hid_t dst_obj) {
AttrCopy ctx{dst_obj};
H5Aiterate2(src_obj, H5_INDEX_NAME, H5_ITER_NATIVE, nullptr, copy_attr, &ctx);
}
// --- H5Ocopy every child of a group verbatim, except one name ---
// State shared with copy_child() through the H5Literate2 op_data pointer.
// Member order matters: the struct is filled by positional aggregate initialization.
struct ChildCopy {
    hid_t src_file;      // file the objects are read from
    hid_t dst_file;      // file the objects are copied into
    std::string group;   // path of the group being iterated
    std::string except;  // link name to leave out
    herr_t rc;           // becomes -1 once an H5Ocopy has failed
};

// H5Literate2 callback: copies the link `name` of ChildCopy::group from the source file to the
// same path in the destination file, unless it is the excluded name.
// Returns 0 for the excluded link, otherwise ChildCopy::rc (negative after a failed copy).
herr_t copy_child(hid_t, const char *name, const H5L_info2_t *, void *op) {
    ChildCopy &ctx = *static_cast<ChildCopy *>(op);
    if (ctx.except != name) {
        const std::string path = ctx.group + "/" + name;
        const herr_t status =
            H5Ocopy(ctx.src_file, path.c_str(), ctx.dst_file, path.c_str(), H5P_DEFAULT, H5P_DEFAULT);
        if (status < 0) {
            fprintf(stderr, " H5Ocopy failed for %s\n", path.c_str());
            ctx.rc = -1;
        }
        return ctx.rc;
    }
    return 0;
}
// Copies every link of `group` from src_file to the same path in dst_file via copy_child(),
// skipping the link called `except`.
//
// Returns 0 on success and -1 on failure. Besides a failed H5Ocopy, a group that cannot be
// opened or iterated also counts as failure, so an empty result is never reported as success.
herr_t copy_children_except(hid_t src_file, hid_t dst_file, const char *group, const char *except) {
    const hid_t src_group = H5Gopen2(src_file, group, H5P_DEFAULT);
    if (src_group < 0) {
        fprintf(stderr, " cannot open group %s\n", group);
        return -1;
    }

    ChildCopy ctx{src_file, dst_file, group, except, 0};
    const herr_t iter_status =
        H5Literate2(src_group, H5_INDEX_NAME, H5_ITER_NATIVE, nullptr, copy_child, &ctx);
    H5Gclose(src_group);

    if (ctx.rc < 0) return ctx.rc;  // copy_child already printed the failing path
    if (iter_status < 0) {
        fprintf(stderr, " cannot iterate over group %s\n", group);
        return -1;
    }
    return 0;
}
// Re-encode /entry/data/data (LZ4) from src into dst (Zstd). dst:/entry/data must already exist.
//
// The destination dataset is created with the source's datatype, dataspace and chunk shape and
// a bitshuffle filter configured as {block_size, BSHUF_H5_COMPRESS_ZSTD}. Data is moved one
// index of dimension 0 at a time through a single frame-sized buffer, and the source dataset's
// attributes are copied afterwards.
//
// Returns 0 on success, -1 on failure (a message is printed to stderr). Every handle opened
// here is closed on all paths.
int recompress_data(hid_t src_file, hid_t dst_file, unsigned block_size) {
    const hid_t s_dset = H5Dopen2(src_file, DATA, H5P_DEFAULT);
    if (s_dset < 0) {
        fprintf(stderr, " cannot open %s\n", DATA);
        return -1;
    }
    const hid_t space = H5Dget_space(s_dset);
    const hid_t tid = H5Dget_type(s_dset);
    const hid_t s_dcpl = H5Dget_create_plist(s_dset);
    hid_t dcpl = H5I_INVALID_HID;
    hid_t d_dset = H5I_INVALID_HID;
    hid_t memspace = H5I_INVALID_HID;

    // Closes whatever is open; reports whether the destination dataset closed cleanly.
    const auto release = [&]() -> bool {
        bool dst_closed = true;
        if (memspace >= 0) H5Sclose(memspace);
        if (d_dset >= 0 && H5Dclose(d_dset) < 0) dst_closed = false;
        if (dcpl >= 0) H5Pclose(dcpl);
        if (s_dcpl >= 0) H5Pclose(s_dcpl);
        if (tid >= 0) H5Tclose(tid);
        if (space >= 0) H5Sclose(space);
        H5Dclose(s_dset);
        return dst_closed;
    };
    const auto fail = [&](const char *what) -> int {
        fprintf(stderr, " %s\n", what);
        release();
        return -1;
    };

    if (space < 0 || tid < 0 || s_dcpl < 0)
        return fail("cannot query dataspace, datatype or creation properties");

    const int nd = H5Sget_simple_extent_ndims(space);
    // Frames are addressed through dimension 0, so at least one dimension is required
    // (this also rejects the negative value returned on failure).
    if (nd < 1) return fail("dataset has no frame dimension");

    std::vector<hsize_t> dims(static_cast<size_t>(nd));
    std::vector<hsize_t> chunk(static_cast<size_t>(nd));
    if (H5Sget_simple_extent_dims(space, dims.data(), nullptr) < 0 ||
        H5Pget_chunk(s_dcpl, nd, chunk.data()) < 0)
        return fail("cannot read dataset dimensions or chunk shape");

    const unsigned cd_zstd[2] = { block_size, static_cast<unsigned>(BSHUF_H5_COMPRESS_ZSTD) };
    dcpl = H5Pcreate(H5P_DATASET_CREATE);
    // A property list whose filter could not be set must not be used: the dataset would be
    // created without the Zstd filter.
    if (dcpl < 0 ||
        H5Pset_chunk(dcpl, nd, chunk.data()) < 0 ||
        H5Pset_filter(dcpl, BSHUF_H5FILTER, H5Z_FLAG_MANDATORY, 2, cd_zstd) < 0 ||
        H5Pset_fill_time(dcpl, H5D_FILL_TIME_NEVER) < 0 ||
        H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_INCR) < 0)
        return fail("cannot set up Zstd dataset creation properties");

    d_dset = H5Dcreate2(dst_file, DATA, tid, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
    if (d_dset < 0)
        return fail("cannot create Zstd dataset (is the bitshuffle filter built with Zstd?)");

    const hsize_t nframes = dims[0];
    size_t frame_elems = 1;
    for (int d = 1; d < nd; d++) frame_elems *= dims[d];
    std::vector<uint8_t> buf(frame_elems * H5Tget_size(tid));

    // Memory dataspace and hyperslab count both describe exactly one frame.
    std::vector<hsize_t> mdim(dims);
    mdim[0] = 1;
    memspace = H5Screate_simple(nd, mdim.data(), nullptr);
    if (memspace < 0) return fail("cannot create per-frame memory dataspace");

    std::vector<hsize_t> start(static_cast<size_t>(nd), 0);
    int rc = 0;
    for (hsize_t i = 0; i < nframes; i++) {
        start[0] = i;
        // The same file selection is used for the read and the write, so frame i lands at index i.
        if (H5Sselect_hyperslab(space, H5S_SELECT_SET, start.data(), nullptr, mdim.data(), nullptr) < 0 ||
            H5Dread(s_dset, tid, memspace, space, H5P_DEFAULT, buf.data()) < 0 ||
            H5Dwrite(d_dset, tid, memspace, space, H5P_DEFAULT, buf.data()) < 0) {
            fprintf(stderr, " frame %llu read/write failed\n", static_cast<unsigned long long>(i));
            rc = -1;
            break;
        }
    }

    copy_all_attrs(s_dset, d_dset);

    // A destination dataset that does not close cleanly must not be reported as converted.
    if (!release() && rc == 0) {
        fprintf(stderr, " cannot close Zstd dataset\n");
        rc = -1;
    }
    return rc;
}
int convert(const char *path) {
hid_t src = H5Fopen(path, H5F_ACC_RDONLY, H5P_DEFAULT);
if (src < 0) { fprintf(stderr, "%s: cannot open\n", path); return 1; }
if (H5Lexists(src, DATA, H5P_DEFAULT) <= 0) {
fprintf(stderr, "%s: no %s dataset; skipping\n", path, DATA); H5Fclose(src); return 1; }
// Inspect the source filter; only act on bitshuffle/LZ4.
hid_t s_dset = H5Dopen2(src, DATA, H5P_DEFAULT);
hid_t s_dcpl = H5Dget_create_plist(s_dset);
unsigned flags; size_t nelmts = 16; unsigned cd[16]; char fname[80];
bool ok = H5Pget_filter_by_id2(s_dcpl, BSHUF_H5FILTER, &flags, &nelmts, cd, sizeof(fname), fname, nullptr) >= 0
&& nelmts >= 5;
unsigned block_size = ok ? cd[3] : 0, mode = ok ? cd[4] : 0;
H5Pclose(s_dcpl); H5Dclose(s_dset);
if (!ok) { fprintf(stderr, "%s: %s is not bitshuffle-compressed; skipping\n", path, DATA); H5Fclose(src); return 1; }
if (mode == (unsigned) BSHUF_H5_COMPRESS_ZSTD) {
fprintf(stderr, "%s: %s already bitshuffle/zstd; nothing to do\n", path, DATA); H5Fclose(src); return 0; }
if (mode != (unsigned) BSHUF_H5_COMPRESS_LZ4) {
fprintf(stderr, "%s: unexpected bitshuffle mode %u; skipping\n", path, mode); H5Fclose(src); return 1; }
std::string tmp = std::string(path) + ".jfjoch_recompress.tmp";
hid_t dst = H5Fcreate(tmp.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
if (dst < 0) { fprintf(stderr, "%s: cannot create %s\n", path, tmp.c_str()); H5Fclose(src); return 1; }
int rc = 0;
{
// root attributes + root children other than /entry
hid_t s_root = H5Gopen2(src, "/", H5P_DEFAULT), d_root = H5Gopen2(dst, "/", H5P_DEFAULT);
copy_all_attrs(s_root, d_root);
H5Gclose(s_root); H5Gclose(d_root);
rc |= copy_children_except(src, dst, "/", "entry");
// /entry: recreate group + attrs, copy children other than "data"
hid_t s_e = H5Gopen2(src, "/entry", H5P_DEFAULT);
hid_t d_e = H5Gcreate2(dst, "/entry", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
copy_all_attrs(s_e, d_e); H5Gclose(s_e); H5Gclose(d_e);
rc |= copy_children_except(src, dst, "/entry", "data");
// /entry/data: recreate group + attrs, copy children other than the "data" dataset
hid_t s_d = H5Gopen2(src, "/entry/data", H5P_DEFAULT);
hid_t d_d = H5Gcreate2(dst, "/entry/data", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
copy_all_attrs(s_d, d_d); H5Gclose(s_d); H5Gclose(d_d);
rc |= copy_children_except(src, dst, "/entry/data", "data");
// /entry/data/data: re-encode LZ4 -> Zstd
rc |= recompress_data(src, dst, block_size);
}
H5Fclose(dst); H5Fclose(src);
if (rc == 0) {
if (std::rename(tmp.c_str(), path) != 0) {
perror("rename"); std::remove(tmp.c_str()); return 1; }
printf("%s: %s re-compressed bitshuffle/LZ4 -> bitshuffle/zstd\n", path, DATA);
return 0;
}
std::remove(tmp.c_str()); // leave the original untouched on error
fprintf(stderr, "%s: conversion failed; original left unchanged\n", path);
return 1;
}
}
// Entry point: runs convert() on every path given on the command line.
// Exit status is the bitwise OR of the per-file results, or 1 with a usage message when no
// path was given.
int main(int argc, char **argv) {
    if (argc >= 2) {
        RegisterHDF5Filter();
        // Turn off HDF5's automatic error printing; failures are reported from return codes.
        H5Eset_auto2(H5E_DEFAULT, nullptr, nullptr);

        int status = 0;
        int idx = 1;
        while (idx < argc) {
            status |= convert(argv[idx]);
            ++idx;
        }
        return status;
    }

    fprintf(stderr, "Usage: %s <data_NNNNNN.h5> [more.h5 ...]\n"
                    " Re-compress /entry/data/data from bitshuffle/LZ4 to bitshuffle/zstd,\n"
                    " writing a fresh file that atomically replaces the original.\n",
            argv[0]);
    return 1;
}