Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 14m4s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 14m42s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky8) (push) Successful in 14m50s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m0s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m34s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 15m42s
Build Packages / XDS test (durin plugin) (push) Successful in 8m25s
Build Packages / Generate python client (push) Successful in 31s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 8m32s
Build Packages / Create release (push) Skipped
Build Packages / XDS test (neggia plugin) (push) Successful in 8m17s
Build Packages / Build documentation (push) Successful in 59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 11m12s
Build Packages / build:rpm (rocky9) (push) Successful in 12m52s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 12m22s
Build Packages / DIALS test (push) Successful in 13m0s
Build Packages / Unit tests (push) Successful in 59m47s
Build Packages / build:windows (push) Failing after 12m51s
On sparse lyso frames the larger block improves compression ratio across all bshuf algorithms (16-bit data): ZSTD 8.58 -> 9.30, LZ4 7.38 -> 7.58, RLE 6.82 -> 6.90. 16384 captures most of the gain available from even larger blocks (ZSTD tops out ~9.55 at 65536) while staying close to the cache sweet spot: the cheap codecs (LZ4, RLE) peak in throughput once a block's working set fits L1d (~4096 elem here), so very large blocks trade real throughput for diminishing ratio - and that penalty is worse on the Xeon Gold/Platinum production hosts (smaller private L2, shared-L3 contention under many parallel compression threads). The block size is stored per-dataset in the bitshuffle HDF5 filter params, so existing readers (XDS/Neggia/Durin/CrystFEL) stay compatible. Move the per-block bitshuffle scratch off the inline member array onto a lazily-sized heap vector, like tmp_space, so the block size no longer bloats every stack-allocated compressor (incl. the transient ones in CBORStream2Serializer). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
136 lines
5.6 KiB
C++
136 lines
5.6 KiB
C++
// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "JFJochCompressor.h"
|
|
|
|
#include <stdexcept>
|
|
#include <cstring>
|
|
#include <bitshuffle/bitshuffle_internals.h>
|
|
#include <bitshuffle_hperf/bitshuffle.h>
|
|
#include <zstd.h>
|
|
#include <lz4/lz4.h>
|
|
|
|
#include "../common/JFJochException.h"
|
|
|
|
// Hand-written declaration with C linkage. Compress() below uses it to store the total
// uncompressed byte count at the very start of the output stream.
// NOTE(review): judging by the name this writes `num` to `buf` in big-endian byte order and
// is presumably defined in the bitshuffle C sources without being exported by the headers
// included above - confirm before relying on either.
extern "C" void bshuf_write_uint64_BE(void *buf, uint64_t num);
|
|
|
|
// Worst-case size of one compressed block, including its 4-byte length prefix. Mirrors the
|
|
// per-block term of MaxCompressedSize(), so a dest sized to MaxCompressedSize() never fails.
|
|
static size_t MaxCompressedBlockSize(CompressionAlgorithm algorithm, size_t src_size) {
|
|
switch (algorithm) {
|
|
case CompressionAlgorithm::BSHUF_LZ4:
|
|
return LZ4_compressBound(src_size) + 4;
|
|
case CompressionAlgorithm::BSHUF_ZSTD:
|
|
case CompressionAlgorithm::BSHUF_ZSTD_RLE:
|
|
return ZSTD_compressBound(src_size) + 4;
|
|
default:
|
|
return src_size + 4;
|
|
}
|
|
}
|
|
|
|
// Creates a compressor that encodes with `in_algorithm`. The constructor only records the
// algorithm choice; it does no other work.
JFJochBitShuffleCompressor::JFJochBitShuffleCompressor(CompressionAlgorithm in_algorithm)
        : algorithm(in_algorithm) {}
|
|
|
|
size_t JFJochBitShuffleCompressor::CompressBlock(char *dest, const char *source, size_t nelements, size_t elem_size) {
|
|
// Assert nelements < block_size
|
|
const char *src_ptr;
|
|
int64_t bshuf_ret = bitshuf_encode_block(tmp_space.data(), source, scratch.data(), nelements, elem_size);
|
|
if (bshuf_ret < 0)
|
|
throw JFJochException(JFJochExceptionCategory::Compression, "bshuf_trans_bit_elem error");
|
|
src_ptr = tmp_space.data();
|
|
|
|
size_t compressed_size;
|
|
size_t src_size = nelements * elem_size;
|
|
|
|
switch (algorithm) {
|
|
case CompressionAlgorithm::BSHUF_LZ4:
|
|
compressed_size = LZ4_compress_default(src_ptr, dest + 4, src_size, LZ4_compressBound(src_size));
|
|
break;
|
|
case CompressionAlgorithm::BSHUF_ZSTD:
|
|
compressed_size = ZSTD_compress(dest + 4, ZSTD_compressBound(src_size), src_ptr, src_size, 0);
|
|
if (ZSTD_isError(compressed_size))
|
|
throw(JFJochException(JFJochExceptionCategory::Compression, ZSTD_getErrorName(compressed_size)));
|
|
break;
|
|
case CompressionAlgorithm::BSHUF_ZSTD_RLE:
|
|
try {
|
|
compressed_size = zstd_compressor.Compress(((uint8_t *) dest) + 4, (uint64_t *) src_ptr,
|
|
src_size, src_size);
|
|
} catch (const std::runtime_error &e) {
|
|
throw JFJochException(JFJochExceptionCategory::ZSTDCompressionError, e.what());
|
|
}
|
|
break;
|
|
default:
|
|
throw JFJochException(JFJochExceptionCategory::Compression, "Algorithm not supported");
|
|
}
|
|
|
|
bshuf_write_uint32_BE(dest, compressed_size);
|
|
|
|
return compressed_size + 4;
|
|
}
|
|
|
|
std::vector<uint8_t> JFJochBitShuffleCompressor::Compress(const void *source, size_t nelements, size_t elem_size) {
|
|
std::vector<uint8_t> tmp(MaxCompressedSize(algorithm, nelements, elem_size));
|
|
size_t tmp_size = Compress(tmp.data(), tmp.size(), source, nelements, elem_size);
|
|
tmp.resize(tmp_size);
|
|
return tmp;
|
|
}
|
|
|
|
int64_t JFJochBitShuffleCompressor::Compress(void *dest, size_t dest_size, const void *source, size_t nelements, size_t elem_size) {
|
|
auto c_dest = (char *) dest;
|
|
auto c_source = (char *) source;
|
|
|
|
static_assert(DefaultBlockSize % BSHUF_BLOCKED_MULT == 0, "Block size must be multiple of 8");
|
|
|
|
if (algorithm == CompressionAlgorithm::NO_COMPRESSION) {
|
|
// Trivial case if no compression - copy content
|
|
if (nelements * elem_size > dest_size)
|
|
return -1;
|
|
memcpy(dest, source, nelements * elem_size);
|
|
return nelements * elem_size;
|
|
}
|
|
|
|
if (dest_size < 12)
|
|
return -1;
|
|
|
|
bshuf_write_uint64_BE(c_dest, nelements * elem_size);
|
|
bshuf_write_uint32_BE(c_dest + 8, DefaultBlockSize * elem_size);
|
|
|
|
if (tmp_space.size() < DefaultBlockSize * elem_size)
|
|
tmp_space.resize(DefaultBlockSize * elem_size);
|
|
if (scratch.size() < DefaultBlockSize * elem_size)
|
|
scratch.resize(DefaultBlockSize * elem_size);
|
|
|
|
size_t num_full_blocks = nelements / DefaultBlockSize;
|
|
size_t reminder_size = nelements - num_full_blocks * DefaultBlockSize;
|
|
size_t compressed_size = 12;
|
|
|
|
// Blocks are small relative to the image, so before each one we just check that the
|
|
// remaining space still covers that block's worst case, and bail out (-1) if not.
|
|
for (int i = 0; i < num_full_blocks; i++) {
|
|
if (compressed_size + MaxCompressedBlockSize(algorithm, DefaultBlockSize * elem_size) > dest_size)
|
|
return -1;
|
|
compressed_size += CompressBlock(c_dest + compressed_size,
|
|
c_source + i * DefaultBlockSize * elem_size, DefaultBlockSize, elem_size);
|
|
}
|
|
|
|
size_t last_block_size = reminder_size - reminder_size % BSHUF_BLOCKED_MULT;
|
|
if (last_block_size > 0) {
|
|
if (compressed_size + MaxCompressedBlockSize(algorithm, last_block_size * elem_size) > dest_size)
|
|
return -1;
|
|
compressed_size += CompressBlock(c_dest + compressed_size,
|
|
c_source + num_full_blocks * DefaultBlockSize * elem_size, last_block_size, elem_size);
|
|
}
|
|
|
|
size_t leftover_bytes = (reminder_size % BSHUF_BLOCKED_MULT) * elem_size;
|
|
if (leftover_bytes > 0) {
|
|
if (compressed_size + leftover_bytes > dest_size)
|
|
return -1;
|
|
memcpy(c_dest + compressed_size, c_source + (num_full_blocks * DefaultBlockSize + last_block_size) * elem_size, leftover_bytes);
|
|
compressed_size += leftover_bytes;
|
|
}
|
|
return compressed_size;
|
|
}
|