Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 12m47s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 13m3s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 13m36s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 13m32s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 13m43s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 8m12s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 11m18s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m26s
Build Packages / Generate python client (push) Successful in 15s
Build Packages / build:rpm (rocky8) (push) Successful in 12m46s
Build Packages / Create release (push) Skipped
Build Packages / Build documentation (push) Successful in 45s
Build Packages / build:rpm (rocky9) (push) Successful in 13m24s
Build Packages / DIALS test (push) Successful in 13m10s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 6m58s
Build Packages / XDS test (neggia plugin) (push) Successful in 5m53s
Build Packages / Unit tests (push) Successful in 59m15s
Build Packages / build:windows (push) Successful in 18m58s
New offline tool that re-compresses /entry/data/data of a _data_NNNNNN.h5 file from bitshuffle/LZ4 to the standard bitshuffle/Zstd HDF5 filter. Every other object (groups, datasets, attributes, the dataset's own attributes, dims/dtype/chunking/block size) is reproduced unchanged. It writes a fresh file - only /entry/data/data is re-encoded, every other object is H5Ocopy'd verbatim - which then atomically replaces the original via rename(). This needs no h5repack (the new file has no leftover space) and is crash-safe (the original is opened read-only until the rename). Frames are streamed one at a time through the registered bitshuffle filter (decompress LZ4, compress Zstd), so it is dtype-agnostic and never holds the whole dataset in memory. Output is read by the standard bitshuffle+zstd HDF5 plugin (verified against the hdf5plugin/DIALS libh5bshuf.so, which links libzstd and supports the zstd mode). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
188 lines
8.8 KiB
C++
188 lines
8.8 KiB
C++
// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
// Re-compress /entry/data/data of a JUNGFRAUJOCH _data_NNNNNN.h5 file from bitshuffle/LZ4 to
|
|
// bitshuffle/Zstd (standard bitshuffle HDF5 filter, mode ZSTD). Every other object in the file
|
|
// (groups, datasets, attributes, the dataset's own attributes, dims/dtype/chunking/block size) is
|
|
// reproduced unchanged.
|
|
//
|
|
// The result is written to a fresh temporary file (only /entry/data/data is re-encoded; every other
|
|
// object is H5Ocopy'd verbatim, preserving its data, filters and attributes) which then atomically
|
|
// replaces the original. This needs no h5repack - the new file has no leftover space - and the
|
|
// original is opened read-only until the final rename, so a crash leaves it intact.
|
|
//
|
|
// Frames are streamed one at a time through the registered bitshuffle filter (decompress LZ4 on
|
|
// read, compress Zstd on write), so it is dtype-agnostic and never holds the whole dataset in memory.
|
|
|
|
#include <cstdio>
|
|
#include <cstdint>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <hdf5.h>
|
|
#include "../writer/HDF5Objects.h" // RegisterHDF5Filter()
|
|
#include "../compression/bitshuffle/bshuf_h5filter.h" // BSHUF_H5FILTER, BSHUF_H5_COMPRESS_LZ4/ZSTD
|
|
|
|
namespace {
|
|
constexpr char DATA[] = "/entry/data/data";
|
|
|
|
// --- copy all attributes of one object verbatim ---
|
|
struct AttrCopy { hid_t dst; };
|
|
herr_t copy_attr(hid_t src_obj, const char *name, const H5A_info_t *, void *op) {
|
|
hid_t dst = static_cast<AttrCopy *>(op)->dst;
|
|
hid_t a = H5Aopen(src_obj, name, H5P_DEFAULT);
|
|
hid_t t = H5Aget_type(a), s = H5Aget_space(a);
|
|
std::vector<uint8_t> buf((size_t) H5Sget_simple_extent_npoints(s) * H5Tget_size(t));
|
|
H5Aread(a, t, buf.data());
|
|
hid_t a2 = H5Acreate2(dst, name, t, s, H5P_DEFAULT, H5P_DEFAULT);
|
|
H5Awrite(a2, t, buf.data());
|
|
H5Aclose(a2); H5Sclose(s); H5Tclose(t); H5Aclose(a);
|
|
return 0;
|
|
}
|
|
void copy_all_attrs(hid_t src_obj, hid_t dst_obj) {
|
|
AttrCopy ctx{dst_obj};
|
|
H5Aiterate2(src_obj, H5_INDEX_NAME, H5_ITER_NATIVE, nullptr, copy_attr, &ctx);
|
|
}
|
|
|
|
// --- H5Ocopy every child of a group verbatim, except one name ---
|
|
struct ChildCopy { hid_t src_file, dst_file; std::string group, except; herr_t rc; };
|
|
herr_t copy_child(hid_t, const char *name, const H5L_info2_t *, void *op) {
|
|
auto *c = static_cast<ChildCopy *>(op);
|
|
if (c->except == name) return 0;
|
|
std::string p = c->group + "/" + name;
|
|
if (H5Ocopy(c->src_file, p.c_str(), c->dst_file, p.c_str(), H5P_DEFAULT, H5P_DEFAULT) < 0) {
|
|
fprintf(stderr, " H5Ocopy failed for %s\n", p.c_str());
|
|
c->rc = -1;
|
|
}
|
|
return c->rc;
|
|
}
|
|
herr_t copy_children_except(hid_t src_file, hid_t dst_file, const char *group, const char *except) {
|
|
hid_t s = H5Gopen2(src_file, group, H5P_DEFAULT);
|
|
ChildCopy ctx{src_file, dst_file, group, except, 0};
|
|
H5Literate2(s, H5_INDEX_NAME, H5_ITER_NATIVE, nullptr, copy_child, &ctx);
|
|
H5Gclose(s);
|
|
return ctx.rc;
|
|
}
|
|
|
|
// Re-encode /entry/data/data (LZ4) from src into dst (Zstd). dst:/entry/data must already exist.
|
|
int recompress_data(hid_t src_file, hid_t dst_file, unsigned block_size) {
|
|
hid_t s_dset = H5Dopen2(src_file, DATA, H5P_DEFAULT);
|
|
hid_t space = H5Dget_space(s_dset);
|
|
hid_t tid = H5Dget_type(s_dset);
|
|
hid_t s_dcpl = H5Dget_create_plist(s_dset);
|
|
const int nd = H5Sget_simple_extent_ndims(space);
|
|
std::vector<hsize_t> dims(nd); H5Sget_simple_extent_dims(space, dims.data(), nullptr);
|
|
std::vector<hsize_t> chunk(nd); H5Pget_chunk(s_dcpl, nd, chunk.data());
|
|
|
|
unsigned cd_zstd[2] = { block_size, (unsigned) BSHUF_H5_COMPRESS_ZSTD };
|
|
hid_t dcpl = H5Pcreate(H5P_DATASET_CREATE);
|
|
H5Pset_chunk(dcpl, nd, chunk.data());
|
|
H5Pset_filter(dcpl, BSHUF_H5FILTER, H5Z_FLAG_MANDATORY, 2, cd_zstd);
|
|
H5Pset_fill_time(dcpl, H5D_FILL_TIME_NEVER);
|
|
H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_INCR);
|
|
hid_t d_dset = H5Dcreate2(dst_file, DATA, tid, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
|
|
if (d_dset < 0) {
|
|
fprintf(stderr, " cannot create Zstd dataset (is the bitshuffle filter built with Zstd?)\n");
|
|
H5Pclose(dcpl); H5Pclose(s_dcpl); H5Tclose(tid); H5Sclose(space); H5Dclose(s_dset);
|
|
return -1;
|
|
}
|
|
|
|
const hsize_t nframes = dims[0];
|
|
size_t frame_elems = 1; for (int d = 1; d < nd; d++) frame_elems *= dims[d];
|
|
std::vector<uint8_t> buf(frame_elems * H5Tget_size(tid));
|
|
std::vector<hsize_t> mdim(dims); mdim[0] = 1;
|
|
hid_t memspace = H5Screate_simple(nd, mdim.data(), nullptr);
|
|
std::vector<hsize_t> start(nd, 0);
|
|
int rc = 0;
|
|
for (hsize_t i = 0; i < nframes; i++) {
|
|
start[0] = i;
|
|
H5Sselect_hyperslab(space, H5S_SELECT_SET, start.data(), nullptr, mdim.data(), nullptr);
|
|
if (H5Dread(s_dset, tid, memspace, space, H5P_DEFAULT, buf.data()) < 0 ||
|
|
H5Dwrite(d_dset, tid, memspace, space, H5P_DEFAULT, buf.data()) < 0) {
|
|
fprintf(stderr, " frame %llu read/write failed\n", (unsigned long long) i); rc = -1; break;
|
|
}
|
|
}
|
|
copy_all_attrs(s_dset, d_dset);
|
|
|
|
H5Sclose(memspace); H5Dclose(d_dset); H5Pclose(dcpl);
|
|
H5Pclose(s_dcpl); H5Tclose(tid); H5Sclose(space); H5Dclose(s_dset);
|
|
return rc;
|
|
}
|
|
|
|
int convert(const char *path) {
|
|
hid_t src = H5Fopen(path, H5F_ACC_RDONLY, H5P_DEFAULT);
|
|
if (src < 0) { fprintf(stderr, "%s: cannot open\n", path); return 1; }
|
|
if (H5Lexists(src, DATA, H5P_DEFAULT) <= 0) {
|
|
fprintf(stderr, "%s: no %s dataset; skipping\n", path, DATA); H5Fclose(src); return 1; }
|
|
|
|
// Inspect the source filter; only act on bitshuffle/LZ4.
|
|
hid_t s_dset = H5Dopen2(src, DATA, H5P_DEFAULT);
|
|
hid_t s_dcpl = H5Dget_create_plist(s_dset);
|
|
unsigned flags; size_t nelmts = 16; unsigned cd[16]; char fname[80];
|
|
bool ok = H5Pget_filter_by_id2(s_dcpl, BSHUF_H5FILTER, &flags, &nelmts, cd, sizeof(fname), fname, nullptr) >= 0
|
|
&& nelmts >= 5;
|
|
unsigned block_size = ok ? cd[3] : 0, mode = ok ? cd[4] : 0;
|
|
H5Pclose(s_dcpl); H5Dclose(s_dset);
|
|
if (!ok) { fprintf(stderr, "%s: %s is not bitshuffle-compressed; skipping\n", path, DATA); H5Fclose(src); return 1; }
|
|
if (mode == (unsigned) BSHUF_H5_COMPRESS_ZSTD) {
|
|
fprintf(stderr, "%s: %s already bitshuffle/zstd; nothing to do\n", path, DATA); H5Fclose(src); return 0; }
|
|
if (mode != (unsigned) BSHUF_H5_COMPRESS_LZ4) {
|
|
fprintf(stderr, "%s: unexpected bitshuffle mode %u; skipping\n", path, mode); H5Fclose(src); return 1; }
|
|
|
|
std::string tmp = std::string(path) + ".jfjoch_recompress.tmp";
|
|
hid_t dst = H5Fcreate(tmp.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
|
|
if (dst < 0) { fprintf(stderr, "%s: cannot create %s\n", path, tmp.c_str()); H5Fclose(src); return 1; }
|
|
|
|
int rc = 0;
|
|
{
|
|
// root attributes + root children other than /entry
|
|
hid_t s_root = H5Gopen2(src, "/", H5P_DEFAULT), d_root = H5Gopen2(dst, "/", H5P_DEFAULT);
|
|
copy_all_attrs(s_root, d_root);
|
|
H5Gclose(s_root); H5Gclose(d_root);
|
|
rc |= copy_children_except(src, dst, "/", "entry");
|
|
|
|
// /entry: recreate group + attrs, copy children other than "data"
|
|
hid_t s_e = H5Gopen2(src, "/entry", H5P_DEFAULT);
|
|
hid_t d_e = H5Gcreate2(dst, "/entry", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
|
copy_all_attrs(s_e, d_e); H5Gclose(s_e); H5Gclose(d_e);
|
|
rc |= copy_children_except(src, dst, "/entry", "data");
|
|
|
|
// /entry/data: recreate group + attrs, copy children other than the "data" dataset
|
|
hid_t s_d = H5Gopen2(src, "/entry/data", H5P_DEFAULT);
|
|
hid_t d_d = H5Gcreate2(dst, "/entry/data", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
|
copy_all_attrs(s_d, d_d); H5Gclose(s_d); H5Gclose(d_d);
|
|
rc |= copy_children_except(src, dst, "/entry/data", "data");
|
|
|
|
// /entry/data/data: re-encode LZ4 -> Zstd
|
|
rc |= recompress_data(src, dst, block_size);
|
|
}
|
|
|
|
H5Fclose(dst); H5Fclose(src);
|
|
|
|
if (rc == 0) {
|
|
if (std::rename(tmp.c_str(), path) != 0) {
|
|
perror("rename"); std::remove(tmp.c_str()); return 1; }
|
|
printf("%s: %s re-compressed bitshuffle/LZ4 -> bitshuffle/zstd\n", path, DATA);
|
|
return 0;
|
|
}
|
|
std::remove(tmp.c_str()); // leave the original untouched on error
|
|
fprintf(stderr, "%s: conversion failed; original left unchanged\n", path);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
if (argc < 2) {
|
|
fprintf(stderr, "Usage: %s <data_NNNNNN.h5> [more.h5 ...]\n"
|
|
" Re-compress /entry/data/data from bitshuffle/LZ4 to bitshuffle/zstd,\n"
|
|
" writing a fresh file that atomically replaces the original.\n",
|
|
argv[0]);
|
|
return 1;
|
|
}
|
|
RegisterHDF5Filter();
|
|
H5Eset_auto2(H5E_DEFAULT, nullptr, nullptr); // we check return codes ourselves
|
|
int rc = 0;
|
|
for (int i = 1; i < argc; i++) rc |= convert(argv[i]);
|
|
return rc;
|
|
}
|