Files
Jungfraujoch/compression/JFJochZstdHuffCompressor.cpp
T
leonarski_fandClaude Opus 4.8 7e7a73062c Compression: add BSHUF_ZSTD_RLE_HUFF (RLE runs + Huffman literals)
New CompressionAlgorithm that emits a standard Zstandard frame: zero/0xFF runs
become RLE_Blocks (like BSHUF_ZSTD_RLE) and literal regions become
Compressed_Blocks with per-block adaptive Huffman literals and no sequences
(Number_of_Sequences=0). Short runs are absorbed into the literal stream;
incompressible literals fall back to Raw_Blocks so the worst case stays within
ZSTD_compressBound.

The Huffman tree + bitstream are produced by zstd's own HUF_compress{1,4}X_repeat
(the same calls ZSTD_compressLiterals uses); only the frame/block/literals-section
framing is hand-written, with comments citing zstd_compression_format.md so it can
be checked clause by clause. Output decodes with stock ZSTD_decompress, so no
reader changes are needed (decode routes like BSHUF_ZSTD).

On sparse diffraction this gives ~12% smaller files than bitshuffle/LZ4 at about
the same end-to-end speed, sitting between LZ4 and full ZSTD; for maximum ratio
use BSHUF_ZSTD. Robust on any input: tests round-trip pure zeros, Poisson(10),
Mersenne-Twister noise (checked against the size bound), an extreme-sparsity mask,
and a real lyso image through stock ZSTD_decompress.

API: exposed as "bszstd_rlehuf"; regenerate the Python/TS clients (update_version.sh)
to surface the new value there.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 14:41:46 +02:00

165 lines
8.7 KiB
C++

// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
// This file hand-assembles a standard Zstandard frame so it can be decoded by stock
// ZSTD_decompress. Every structure below follows the Zstandard compression format spec:
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
// Comments cite that document's section names so the framing can be checked clause by clause.
// Only the framing is hand-written; the Huffman tree + bitstream inside each Compressed_Block's
// Literals_Section are produced by zstd's own HUF_compress*X_repeat (so those bytes are trusted).
#include "JFJochZstdHuffCompressor.h"
#include <cstring>
#include <algorithm>
#include <zstd.h>
extern "C" {
#include <common/huf.h>
size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize,
HUF_CElt *hufTable, HUF_repeat *repeat, int flags);
}
namespace {
// --- "Zstandard frames" / "Frame_Header" ---
constexpr uint32_t MAGIC_NUMBER = 0xFD2FB528u;
// Frame_Header_Descriptor: Frame_Content_Size_flag = 2 (4-byte size), Single_Segment_flag = 1
// (=> no Window_Descriptor; Window_Size = Frame_Content_Size), no checksum, no dictionary.
constexpr uint8_t FRAME_DESCRIPTOR = 0xA0;
// --- "Blocks": Block_Type field of Block_Header ---
constexpr uint32_t BLOCK_RAW = 0;
constexpr uint32_t BLOCK_RLE = 1;
constexpr uint32_t BLOCK_COMPRESSED = 2;
// --- "Literals_Section_Header": Literals_Block_Type field ---
constexpr uint8_t LITERALS_COMPRESSED = 2; // carries a Huffman_Tree_Description
constexpr uint8_t LITERALS_TREELESS = 3; // reuses the tree from a previous Compressed block
constexpr size_t CHUNK = 65536; // cap each block's regenerated size well under Block_Maximum_Size (128 KiB)
constexpr size_t RUN_MIN = 64; // runs shorter than this are absorbed into the literal stream
constexpr unsigned LIT_LOG = 11; // Huffman table-log limit for literals, per the format
}
JFJochZstdHuffCompressor::JFJochZstdHuffCompressor()
: ctable(HUF_CTABLE_SIZE_ST(255)), entwksp(HUF_WORKSPACE_SIZE_U64) {}
// append the low nbytes of v, little-endian (Zstd integer fields are little-endian)
void JFJochZstdHuffCompressor::put_le(uint64_t v, int nbytes) {
for (int i = 0; i < nbytes; i++) out.push_back((uint8_t)(v >> (8 * i)));
}
// Block_Header (3 bytes, little-endian): [ Last_Block:1 | Block_Type:2 | Block_Size:21 ].
// Last_Block (bit 0) is left 0 here and OR-ed onto the final block afterwards. Returns the
// offset of the header so that final block can be marked.
size_t JFJochZstdHuffCompressor::blk_hdr(uint32_t type, uint32_t size) {
size_t off = out.size();
put_le((uint64_t)(type << 1) | ((uint64_t)size << 3), 3);
return off;
}
// RLE_Block: Block_Size is the number of repeats (the regenerated size); Block_Content is the
// single repeated byte. Split into <=CHUNK pieces so Block_Size stays under Block_Maximum_Size.
void JFJochZstdHuffCompressor::emit_run(uint8_t value, size_t nbytes, size_t &last_off) {
for (size_t off = 0; off < nbytes; off += CHUNK) {
size_t c = std::min(CHUNK, nbytes - off);
last_off = blk_hdr(BLOCK_RLE, (uint32_t)c);
out.push_back(value);
}
}
// Emit one Compressed_Block whose Literals_Section holds n Huffman-coded literal bytes and whose
// Sequences_Section is empty (Number_of_Sequences = 0). Falls back to a Raw_Block if Huffman does
// not help. The Huffman payload (tree + streams, or streams only when reusing) is produced by zstd.
void JFJochZstdHuffCompressor::emit_lit_chunk(const uint8_t *lits, size_t n, size_t &last_off) {
HUF_CElt *ct = reinterpret_cast<HUF_CElt *>(ctable.data());
HUF_repeat rep = (HUF_repeat)repeat_state;
// Size_Format also selects the stream count: format 0 = 1 stream, formats 1/2/3 = 4 streams.
// Use a single stream only for small inputs (matches zstd's own ZSTD_compressLiterals heuristic).
const bool single = n < 256;
const size_t wkspSize = entwksp.size() * sizeof(uint64_t);
size_t hs = single
? HUF_compress1X_repeat(hufbuf.data(), hufbuf.size(), lits, n, 255, LIT_LOG, entwksp.data(), wkspSize, ct, &rep, 0)
: HUF_compress4X_repeat(hufbuf.data(), hufbuf.size(), lits, n, 255, LIT_LOG, entwksp.data(), wkspSize, ct, &rep, 0);
// Literals_Section_Header, read as a little-endian value, is laid out as
// [ Literals_Block_Type:2 | Size_Format:2 | Regenerated_Size | Compressed_Size ].
// Size_Format (chosen from Regenerated_Size n) sets the header length and field widths:
// n < 1 KiB -> 3-byte header, 10-bit fields (format 0 single-stream, else 1)
// n < 16 KiB -> 4-byte header, 14-bit fields (format 2)
// else -> 5-byte header, 18-bit fields (format 3)
// Regenerated_Size starts at bit 4; Compressed_Size starts right after it, at bit (4 + width).
int size_format, lh_bytes, regen_width;
if (n < 1024) { size_format = single ? 0 : 1; lh_bytes = 3; regen_width = 10; }
else if (n < 16384) { size_format = 2; lh_bytes = 4; regen_width = 14; }
else { size_format = 3; lh_bytes = 5; regen_width = 18; }
// hs == 1 is HUF's "single-symbol alphabet" signal (not a usable Huffman section); only keep
// the Huffman block when it is well-formed and actually smaller than a Raw_Block.
if (hs > 1 && !HUF_isError(hs) && (size_t)(lh_bytes + 1) + hs < n) {
repeat_state = HUF_repeat_check; // the table now in ct is valid -> reuse it next time
// HUF_compress*_repeat leaves *repeat != none only when it reused the previous table.
uint8_t lit_type = (rep != HUF_repeat_none) ? LITERALS_TREELESS : LITERALS_COMPRESSED;
uint64_t hdr = (uint64_t)lit_type | ((uint64_t)size_format << 2)
| ((uint64_t)n << 4) | ((uint64_t)hs << (4 + regen_width));
last_off = blk_hdr(BLOCK_COMPRESSED, (uint32_t)(lh_bytes + hs + 1));
put_le(hdr, lh_bytes); // Literals_Section_Header
out.insert(out.end(), hufbuf.data(), hufbuf.data() + hs); // Huffman tree + stream(s)
out.push_back(0x00); // Sequences_Section: Number_of_Sequences = 0
} else {
repeat_state = HUF_repeat_none; // discard any table HUF rebuilt but we did not emit
last_off = blk_hdr(BLOCK_RAW, (uint32_t)n);
out.insert(out.end(), lits, lits + n);
}
}
// src = one bitshuffled block (src_size bytes, a multiple of 8). Emits a complete Zstd frame.
size_t JFJochZstdHuffCompressor::Compress(uint8_t *dst, const uint64_t *src, size_t src_size) {
const size_t W = src_size / 8;
out.clear(); literals.clear(); segs.clear();
repeat_state = HUF_repeat_none;
if (hufbuf.size() < src_size + 1024) hufbuf.resize(src_size + 1024);
// Pass 1: classify 8-byte words into runs (0x00 / 0xFF) and literals. Runs >= RUN_MIN become
// RLE_Blocks; shorter runs are folded into the literal stream (where 0x00 costs ~1 Huffman bit).
size_t lit_start = 0;
auto close_lit = [&]() { if (literals.size() > lit_start) {
segs.push_back({2, literals.size() - lit_start, lit_start}); lit_start = literals.size(); } };
for (size_t i = 0; i < W; ) {
if (src[i] == 0 || src[i] == UINT64_MAX) {
uint64_t val = src[i]; size_t j = i; while (j < W && src[j] == val) j++;
size_t bytes = (j - i) * 8; uint8_t v = val ? 0xFF : 0x00;
if (bytes >= RUN_MIN) { close_lit(); segs.push_back({(uint8_t)(val ? 1 : 0), bytes, 0}); }
else literals.insert(literals.end(), bytes, v);
i = j;
} else {
const uint8_t *wb = reinterpret_cast<const uint8_t *>(&src[i]);
literals.insert(literals.end(), wb, wb + 8);
i++;
}
}
close_lit();
// Frame_Header: Magic_Number + Frame_Header_Descriptor + Frame_Content_Size (4 bytes).
put_le(MAGIC_NUMBER, 4);
out.push_back(FRAME_DESCRIPTOR);
put_le(src_size, 4);
// Pass 2: emit the blocks in order.
size_t last_off = 0; bool emitted = false;
for (const Seg &s : segs) {
emitted = true;
if (s.type == 2)
for (size_t off = 0; off < s.bytes; off += CHUNK)
emit_lit_chunk(literals.data() + s.lit_off + off, std::min(CHUNK, s.bytes - off), last_off);
else
emit_run(s.type == 1 ? 0xFF : 0x00, s.bytes, last_off);
}
if (!emitted) last_off = blk_hdr(BLOCK_RAW, 0); // empty input -> a single empty block
out[last_off] |= 1; // set Last_Block on the final block
memcpy(dst, out.data(), out.size());
return out.size();
}