Jungfraujoch/compression/JFJochZstdHuffCompressor.cpp

// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

// This file hand-assembles a standard Zstandard frame so it can be decoded by stock
// ZSTD_decompress. Every structure below follows the Zstandard compression format spec:
//   https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
// Comments cite that document's section names so the framing can be checked clause by clause.
// Only the framing is hand-written; the Huffman tree + bitstream inside each Compressed_Block's
// Literals_Section are produced by zstd's own HUF_compress*X_repeat (so those bytes are trusted).

#include "JFJochZstdHuffCompressor.h"

#include <cstring>
#include <algorithm>
#include <zstd.h>

extern "C" {
#include <common/huf.h>
size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize,
                             unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize,
                             HUF_CElt *hufTable, HUF_repeat *repeat, int flags);
}

namespace {
// --- "Zstandard frames" / "Frame_Header" ---
constexpr uint32_t MAGIC_NUMBER     = 0xFD2FB528u;
// Frame_Header_Descriptor: Frame_Content_Size_flag = 2 (4-byte size), Single_Segment_flag = 1
// (=> no Window_Descriptor; Window_Size = Frame_Content_Size), no checksum, no dictionary.
constexpr uint8_t  FRAME_DESCRIPTOR = 0xA0;

// --- "Blocks": Block_Type field of Block_Header ---
constexpr uint32_t BLOCK_RAW        = 0;
constexpr uint32_t BLOCK_RLE        = 1;
constexpr uint32_t BLOCK_COMPRESSED = 2;

// --- "Literals_Section_Header": Literals_Block_Type field ---
constexpr uint8_t  LITERALS_COMPRESSED = 2;   // carries a Huffman_Tree_Description
constexpr uint8_t  LITERALS_TREELESS   = 3;   // reuses the tree from a previous Compressed block

constexpr size_t   CHUNK   = 65536;  // cap each block's regenerated size well under Block_Maximum_Size (128 KiB)
constexpr size_t   RUN_MIN = 64;     // runs shorter than this are absorbed into the literal stream
constexpr unsigned LIT_LOG = 11;     // Huffman table-log limit for literals, per the format
}

JFJochZstdHuffCompressor::JFJochZstdHuffCompressor()
    : ctable(HUF_CTABLE_SIZE_ST(255)), entwksp(HUF_WORKSPACE_SIZE_U64) {}

// append the low nbytes of v, little-endian (Zstd integer fields are little-endian)
void JFJochZstdHuffCompressor::put_le(uint64_t v, int nbytes) {
    for (int i = 0; i < nbytes; i++) out.push_back((uint8_t)(v >> (8 * i)));
}

// Block_Header (3 bytes, little-endian): [ Last_Block:1 | Block_Type:2 | Block_Size:21 ].
// Last_Block (bit 0) is left 0 here and OR-ed onto the final block afterwards. Returns the
// offset of the header so that final block can be marked.
size_t JFJochZstdHuffCompressor::blk_hdr(uint32_t type, uint32_t size) {
    size_t off = out.size();
    put_le((uint64_t)(type << 1) | ((uint64_t)size << 3), 3);
    return off;
}

// RLE_Block: Block_Size is the number of repeats (the regenerated size); Block_Content is the
// single repeated byte. Split into <=CHUNK pieces so Block_Size stays under Block_Maximum_Size.
void JFJochZstdHuffCompressor::emit_run(uint8_t value, size_t nbytes, size_t &last_off) {
    for (size_t off = 0; off < nbytes; off += CHUNK) {
        size_t c = std::min(CHUNK, nbytes - off);
        last_off = blk_hdr(BLOCK_RLE, (uint32_t)c);
        out.push_back(value);
    }
}

// Emit one Compressed_Block whose Literals_Section holds n Huffman-coded literal bytes and whose
// Sequences_Section is empty (Number_of_Sequences = 0). Falls back to a Raw_Block if Huffman does
// not help. The Huffman payload (tree + streams, or streams only when reusing) is produced by zstd.
void JFJochZstdHuffCompressor::emit_lit_chunk(const uint8_t *lits, size_t n, size_t &last_off) {
    HUF_CElt *ct = reinterpret_cast<HUF_CElt *>(ctable.data());
    HUF_repeat rep = (HUF_repeat)repeat_state;
    // Size_Format also selects the stream count: format 0 = 1 stream, formats 1/2/3 = 4 streams.
    // Use a single stream only for small inputs (matches zstd's own ZSTD_compressLiterals heuristic).
    const bool single = n < 256;
    const size_t wkspSize = entwksp.size() * sizeof(uint64_t);
    size_t hs = single
        ? HUF_compress1X_repeat(hufbuf.data(), hufbuf.size(), lits, n, 255, LIT_LOG, entwksp.data(), wkspSize, ct, &rep, 0)
        : HUF_compress4X_repeat(hufbuf.data(), hufbuf.size(), lits, n, 255, LIT_LOG, entwksp.data(), wkspSize, ct, &rep, 0);

    // Literals_Section_Header, read as a little-endian value, is laid out as
    //   [ Literals_Block_Type:2 | Size_Format:2 | Regenerated_Size | Compressed_Size ].
    // Size_Format (chosen from Regenerated_Size n) sets the header length and field widths:
    //   n < 1 KiB  -> 3-byte header, 10-bit fields  (format 0 single-stream, else 1)
    //   n < 16 KiB -> 4-byte header, 14-bit fields  (format 2)
    //   else       -> 5-byte header, 18-bit fields  (format 3)
    // Regenerated_Size starts at bit 4; Compressed_Size starts right after it, at bit (4 + width).
    int size_format, lh_bytes, regen_width;
    if      (n < 1024)  { size_format = single ? 0 : 1; lh_bytes = 3; regen_width = 10; }
    else if (n < 16384) { size_format = 2;               lh_bytes = 4; regen_width = 14; }
    else                { size_format = 3;               lh_bytes = 5; regen_width = 18; }

    // hs == 1 is HUF's "single-symbol alphabet" signal (not a usable Huffman section); only keep
    // the Huffman block when it is well-formed and actually smaller than a Raw_Block.
    if (hs > 1 && !HUF_isError(hs) && (size_t)(lh_bytes + 1) + hs < n) {
        repeat_state = HUF_repeat_check;   // the table now in ct is valid -> reuse it next time
        // HUF_compress*_repeat leaves *repeat != none only when it reused the previous table.
        uint8_t lit_type = (rep != HUF_repeat_none) ? LITERALS_TREELESS : LITERALS_COMPRESSED;
        uint64_t hdr = (uint64_t)lit_type | ((uint64_t)size_format << 2)
                       | ((uint64_t)n << 4) | ((uint64_t)hs << (4 + regen_width));
        last_off = blk_hdr(BLOCK_COMPRESSED, (uint32_t)(lh_bytes + hs + 1));
        put_le(hdr, lh_bytes);                                  // Literals_Section_Header
        out.insert(out.end(), hufbuf.data(), hufbuf.data() + hs); // Huffman tree + stream(s)
        out.push_back(0x00);                                   // Sequences_Section: Number_of_Sequences = 0
    } else {
        repeat_state = HUF_repeat_none;    // discard any table HUF rebuilt but we did not emit
        last_off = blk_hdr(BLOCK_RAW, (uint32_t)n);
        out.insert(out.end(), lits, lits + n);
    }
}

// src = one bitshuffled block (src_size bytes, a multiple of 8). Emits a complete Zstd frame.
size_t JFJochZstdHuffCompressor::Compress(uint8_t *dst, const uint64_t *src, size_t src_size) {
    const size_t W = src_size / 8;
    out.clear(); literals.clear(); segs.clear();
    repeat_state = HUF_repeat_none;
    if (hufbuf.size() < src_size + 1024) hufbuf.resize(src_size + 1024);

    // Pass 1: classify 8-byte words into runs (0x00 / 0xFF) and literals. Runs >= RUN_MIN become
    // RLE_Blocks; shorter runs are folded into the literal stream (where 0x00 costs ~1 Huffman bit).
    size_t lit_start = 0;
    auto close_lit = [&]() { if (literals.size() > lit_start) {
        segs.push_back({2, literals.size() - lit_start, lit_start}); lit_start = literals.size(); } };
    for (size_t i = 0; i < W; ) {
        if (src[i] == 0 || src[i] == UINT64_MAX) {
            uint64_t val = src[i]; size_t j = i; while (j < W && src[j] == val) j++;
            size_t bytes = (j - i) * 8; uint8_t v = val ? 0xFF : 0x00;
            if (bytes >= RUN_MIN) { close_lit(); segs.push_back({(uint8_t)(val ? 1 : 0), bytes, 0}); }
            else literals.insert(literals.end(), bytes, v);
            i = j;
        } else {
            const uint8_t *wb = reinterpret_cast<const uint8_t *>(&src[i]);
            literals.insert(literals.end(), wb, wb + 8);
            i++;
        }
    }
    close_lit();

    // Frame_Header: Magic_Number + Frame_Header_Descriptor + Frame_Content_Size (4 bytes).
    put_le(MAGIC_NUMBER, 4);
    out.push_back(FRAME_DESCRIPTOR);
    put_le(src_size, 4);

    // Pass 2: emit the blocks in order.
    size_t last_off = 0; bool emitted = false;
    for (const Seg &s : segs) {
        emitted = true;
        if (s.type == 2)
            for (size_t off = 0; off < s.bytes; off += CHUNK)
                emit_lit_chunk(literals.data() + s.lit_off + off, std::min(CHUNK, s.bytes - off), last_off);
        else
            emit_run(s.type == 1 ? 0xFF : 0x00, s.bytes, last_off);
    }
    if (!emitted) last_off = blk_hdr(BLOCK_RAW, 0); // empty input -> a single empty block
    out[last_off] |= 1;                             // set Last_Block on the final block

    memcpy(dst, out.data(), out.size());
    return out.size();
}