Update bitshuffle/lz4 code + embed bshuf_h5filter code into the library

This commit is contained in:
2026-04-08 10:16:46 +02:00
parent 2bcd3074cc
commit 69470cd374
12 changed files with 3949 additions and 1303 deletions
+2 -13
View File
@@ -13,14 +13,7 @@ CFLAGS=-DH5_USE_110_API -Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=
# include https://github.com/kiyo-masui/bitshuffle (to handle
# e.g. bitshuffle compressed pixel_masks)
use_BITSHUFFLE =
ifeq ($(use_BITSHUFFLE),)
else
BITSHUFFLE_SRC_DIR = ../bitshuffle-master/src/
BITSHUFFLE_INC_DIR = ../bitshuffle-master/src/
BITSHUFFLE_OBJS = $(BUILD_DIR)/bshuf_h5filter.o
CFLAGS += -DUSE_BITSHUFFLE -I$(BITSHUFFLE_INC_DIR)
endif
CFLAGS += -DUSE_BITSHUFFLE
.PHONY: plugin
plugin: $(BUILD_DIR)/durin-plugin.so
@@ -38,10 +31,6 @@ $(BUILD_DIR)/test_plugin: $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_g
mkdir -p $(BUILD_DIR)
gfortran -O -g -fopenmp -ldl $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90 -o $@ -J$(BUILD_DIR)
$(BUILD_DIR)/bshuf_h5filter.o: $(BITSHUFFLE_SRC_DIR)/bshuf_h5filter.c
mkdir -p $(BUILD_DIR)
$(CC) $(CFLAGS) -c $< -o $@
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c
mkdir -p $(BUILD_DIR)
$(CC) $(CFLAGS) -c $< -o $@
@@ -51,7 +40,7 @@ $(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c
$(CC) $(CFLAGS) -c $< -o $@
$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \
$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o $(BSLZ4_BUILD_DIR)/bshuf_h5filter.o
mkdir -p $(BUILD_DIR)
ar rcs $@ $^
+139 -25
View File
@@ -14,23 +14,22 @@
#include "bitshuffle_internals.h"
#include "lz4.h"
#ifdef ZSTD_SUPPORT
#include "zstd.h"
#endif
#include <stdio.h>
#include <string.h>
#define BSHUF_LZ4_DECOMPRESS_FAST
// Macros.
#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \
free(buf); return count - 1000; }
/* Bitshuffle and compress a single block. */
int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
const size_t size, const size_t elem_size) {
const size_t size, const size_t elem_size, const int option) {
int64_t nbytes, count;
void *tmp_buf_bshuf;
@@ -42,7 +41,8 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
tmp_buf_bshuf = malloc(size * elem_size);
if (tmp_buf_bshuf == NULL) return -1;
tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
int dst_capacity = LZ4_compressBound(size * elem_size);
tmp_buf_lz4 = malloc(dst_capacity);
if (tmp_buf_lz4 == NULL){
free(tmp_buf_bshuf);
return -1;
@@ -58,7 +58,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
free(tmp_buf_bshuf);
return count;
}
nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
nbytes = LZ4_compress_default((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size, dst_capacity);
free(tmp_buf_bshuf);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);
@@ -76,7 +76,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
/* Decompress and bitunshuffle a single block. */
int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
const size_t size, const size_t elem_size) {
const size_t size, const size_t elem_size, const int option) {
int64_t nbytes, count;
void *out, *tmp_buf;
@@ -96,14 +96,6 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
#ifdef BSHUF_LZ4_DECOMPRESS_FAST
nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
if (nbytes != nbytes_from_header) {
free(tmp_buf);
return -91;
}
#else
nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
size * elem_size);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
@@ -112,7 +104,7 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
return -91;
}
nbytes = nbytes_from_header;
#endif
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
nbytes += 4;
@@ -121,6 +113,92 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
return nbytes;
}
#ifdef ZSTD_SUPPORT
/* Bitshuffle and compress a single block. */
int64_t bshuf_compress_zstd_block(ioc_chain *C_ptr, \
const size_t size, const size_t elem_size, const int comp_lvl) {
int64_t nbytes, count;
void *tmp_buf_bshuf;
void *tmp_buf_zstd;
size_t this_iter;
const void *in;
void *out;
tmp_buf_bshuf = malloc(size * elem_size);
if (tmp_buf_bshuf == NULL) return -1;
size_t tmp_buf_zstd_size = ZSTD_compressBound(size * elem_size);
tmp_buf_zstd = malloc(tmp_buf_zstd_size);
if (tmp_buf_zstd == NULL){
free(tmp_buf_bshuf);
return -1;
}
in = ioc_get_in(C_ptr, &this_iter);
ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
if (count < 0) {
free(tmp_buf_zstd);
free(tmp_buf_bshuf);
return count;
}
nbytes = ZSTD_compress(tmp_buf_zstd, tmp_buf_zstd_size, (const void*)tmp_buf_bshuf, size * elem_size, comp_lvl);
free(tmp_buf_bshuf);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_zstd);
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
bshuf_write_uint32_BE(out, nbytes);
memcpy((char *) out + 4, tmp_buf_zstd, nbytes);
free(tmp_buf_zstd);
return nbytes + 4;
}
/* Decompress and bitunshuffle a single block. */
int64_t bshuf_decompress_zstd_block(ioc_chain *C_ptr,
const size_t size, const size_t elem_size, const int option) {
int64_t nbytes, count;
void *out, *tmp_buf;
const void *in;
size_t this_iter;
int32_t nbytes_from_header;
in = ioc_get_in(C_ptr, &this_iter);
nbytes_from_header = bshuf_read_uint32_BE(in);
ioc_set_next_in(C_ptr, &this_iter,
(void*) ((char*) in + nbytes_from_header + 4));
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter,
(void *) ((char *) out + size * elem_size));
tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
nbytes = ZSTD_decompress(tmp_buf, size * elem_size, (void *)((char *) in + 4), nbytes_from_header);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
if (nbytes != size * elem_size) {
free(tmp_buf);
return -91;
}
nbytes = nbytes_from_header;
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
nbytes += 4;
free(tmp_buf);
return nbytes;
}
#endif // ZSTD_SUPPORT
/* ---- Public functions ----
*
@@ -138,13 +216,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
}
if (block_size % BSHUF_BLOCKED_MULT) return -81;
// Note that each block gets a 4 byte header.
// Size of full blocks.
bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
// Size of partial blocks, if any.
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
// Size of uncompressed data not fitting into any blocks.
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
return bound;
}
@@ -153,13 +231,49 @@ size_t bshuf_compress_lz4_bound(const size_t size,
int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
elem_size, block_size);
elem_size, block_size, 0/*option*/);
}
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
elem_size, block_size);
elem_size, block_size, 0/*option*/);
}
#ifdef ZSTD_SUPPORT
size_t bshuf_compress_zstd_bound(const size_t size,
const size_t elem_size, size_t block_size) {
size_t bound, leftover;
if (block_size == 0) {
block_size = bshuf_default_block_size(elem_size);
}
if (block_size % BSHUF_BLOCKED_MULT) return -81;
// Note that each block gets a 4 byte header.
// Size of full blocks.
bound = (ZSTD_compressBound(block_size * elem_size) + 4) * (size / block_size);
// Size of partial blocks, if any.
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
if (leftover) bound += ZSTD_compressBound(leftover * elem_size) + 4;
// Size of uncompressed data not fitting into any blocks.
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
return bound;
}
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size, const int comp_lvl) {
return bshuf_blocked_wrap_fun(&bshuf_compress_zstd_block, in, out, size,
elem_size, block_size, comp_lvl);
}
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_decompress_zstd_block, in, out, size,
elem_size, block_size, 0/*option*/);
}
#endif // ZSTD_SUPPORT
+89 -7
View File
@@ -35,6 +35,10 @@
extern "C" {
#endif
/*
* ---- LZ4 Interface ----
*/
/* ---- bshuf_compress_lz4_bound ----
*
* Bound on size of data compressed with *bshuf_compress_lz4*.
@@ -94,11 +98,6 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
* must patch the parameters used to compress the data.
*
* NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
* LZ4_decompress_fast from LZ4, which does not protect against maliciously
* formed datasets. By modifying the compressed data, this function could be
* coerced into leaving the boundaries of the input buffer.
*
* Parameters
* ----------
* in : input buffer
@@ -116,8 +115,91 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
/*
* ---- ZSTD Interface ----
*/
#ifdef ZSTD_SUPPORT
/* ---- bshuf_compress_zstd_bound ----
*
* Bound on size of data compressed with *bshuf_compress_zstd*.
*
* Parameters
* ----------
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* Bound on compressed data size.
*
*/
size_t bshuf_compress_zstd_bound(const size_t size,
const size_t elem_size, size_t block_size);
/* ---- bshuf_compress_zstd ----
*
* Bitshuffled and compress the data using zstd.
*
* Transpose within elements, in blocks of data of *block_size* elements then
* compress the blocks using ZSTD. In the output buffer, each block is prefixed
* by a 4 byte integer giving the compressed size of that block.
*
* Output buffer must be large enough to hold the compressed data. This could
* be in principle substantially larger than the input buffer. Use the routine
* *bshuf_compress_zstd_bound* to get an upper limit.
*
* Parameters
* ----------
* in : input buffer, must be of size * elem_size bytes
* out : output buffer, must be large enough to hold data.
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
* comp_lvl : compression level applied
*
* Returns
* -------
* number of bytes used in output buffer, negative error-code if failed.
*
*/
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size, const size_t
elem_size, size_t block_size, const int comp_lvl);
/* ---- bshuf_decompress_zstd ----
*
* Undo compression and bitshuffling.
*
* Decompress data then un-bitshuffle it in blocks of *block_size* elements.
*
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
* must patch the parameters used to compress the data.
*
* Parameters
* ----------
* in : input buffer
* out : output buffer, must be of size * elem_size bytes
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* number of bytes consumed in *input* buffer, negative error-code if failed.
*
*/
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
#endif // ZSTD_SUPPORT
#ifdef __cplusplus
}
} // extern "C"
#endif
#endif
#endif // BITSHUFFLE_H
+742 -39
View File
@@ -16,30 +16,55 @@
#include <string.h>
#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
#define USEAVX512
#endif
#if defined(__AVX2__) && defined (__SSE2__)
#define USEAVX2
#endif
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(NO_WARN_X86_INTRINSICS)
#define USESSE2
#endif
#if defined(__ARM_NEON__) || (__ARM_NEON)
#ifdef __aarch64__
#define USEARMNEON
#endif
#endif
// Conditional includes for SSE2 and AVX2.
#ifdef USEAVX2
#include <immintrin.h>
#elif defined USESSE2
#include <emmintrin.h>
#elif defined USEARMNEON
#include <arm_neon.h>
#endif
#if defined(_OPENMP) && defined(_MSC_VER)
typedef int64_t omp_size_t;
#else
typedef size_t omp_size_t;
#endif
// Macros.
#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* ---- Functions indicating compile time instruction set. ---- */
int bshuf_using_NEON(void) {
#ifdef USEARMNEON
return 1;
#else
return 0;
#endif
}
int bshuf_using_SSE2(void) {
#ifdef USESSE2
return 1;
@@ -58,6 +83,14 @@ int bshuf_using_AVX2(void) {
}
int bshuf_using_AVX512(void) {
#ifdef USEAVX512
return 1;
#else
return 0;
#endif
}
/* ---- Worker code not requiring special instruction sets. ----
*
* The following code does not use any x86 specific vectorized instructions
@@ -131,8 +164,8 @@ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t
CHECK_MULT_EIGHT(start);
if (size > start) {
// ii loop separated into 2 loops so the compiler can unroll
// the inner one.
for (ii = start; ii + 7 < size; ii += 8) {
for (jj = 0; jj < elem_size; jj++) {
for (kk = 0; kk < 8; kk++) {
@@ -348,10 +381,516 @@ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size
}
/* ---- Worker code that uses Arm NEON ----
*
* The following code makes use of the Arm NEON instruction set.
* NEON technology is the implementation of the ARM Advanced Single
* Instruction Multiple Data (SIMD) extension.
* The NEON unit is the component of the processor that executes SIMD instructions.
* It is also called the NEON Media Processing Engine (MPE).
*
*/
#ifdef USEARMNEON
/* Transpose bytes within elements for 16 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b = (const char*) in;
char *out_b = (char*) out;
int8x16_t a0, b0, a1, b1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 2*ii + 0*16);
b0 = vld1q_s8(in_b + 2*ii + 1*16);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
vst1q_s8(out_b + 0*size + ii, a0);
vst1q_s8(out_b + 1*size + ii, b0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 2,
size - size % 16);
}
/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b;
char *out_b;
in_b = (const char*) in;
out_b = (char*) out;
int8x16_t a0, b0, c0, d0, a1, b1, c1, d1;
int64x2_t a2, b2, c2, d2;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 4*ii + 0*16);
b0 = vld1q_s8(in_b + 4*ii + 1*16);
c0 = vld1q_s8(in_b + 4*ii + 2*16);
d0 = vld1q_s8(in_b + 4*ii + 3*16);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
c1 = vzip1q_s8(c0, d0);
d1 = vzip2q_s8(c0, d0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
c0 = vzip1q_s8(c1, d1);
d0 = vzip2q_s8(c1, d1);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
c1 = vzip1q_s8(c0, d0);
d1 = vzip2q_s8(c0, d0);
a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
vst1q_s64((int64_t *) (out_b + 0*size + ii), a2);
vst1q_s64((int64_t *) (out_b + 1*size + ii), b2);
vst1q_s64((int64_t *) (out_b + 2*size + ii), c2);
vst1q_s64((int64_t *) (out_b + 3*size + ii), d2);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 4,
size - size % 16);
}
/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
size_t ii;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 8*ii + 0*16);
b0 = vld1q_s8(in_b + 8*ii + 1*16);
c0 = vld1q_s8(in_b + 8*ii + 2*16);
d0 = vld1q_s8(in_b + 8*ii + 3*16);
e0 = vld1q_s8(in_b + 8*ii + 4*16);
f0 = vld1q_s8(in_b + 8*ii + 5*16);
g0 = vld1q_s8(in_b + 8*ii + 6*16);
h0 = vld1q_s8(in_b + 8*ii + 7*16);
a1 = vzip1q_s8 (a0, b0);
b1 = vzip2q_s8 (a0, b0);
c1 = vzip1q_s8 (c0, d0);
d1 = vzip2q_s8 (c0, d0);
e1 = vzip1q_s8 (e0, f0);
f1 = vzip2q_s8 (e0, f0);
g1 = vzip1q_s8 (g0, h0);
h1 = vzip2q_s8 (g0, h0);
a0 = vzip1q_s8 (a1, b1);
b0 = vzip2q_s8 (a1, b1);
c0 = vzip1q_s8 (c1, d1);
d0 = vzip2q_s8 (c1, d1);
e0 = vzip1q_s8 (e1, f1);
f0 = vzip2q_s8 (e1, f1);
g0 = vzip1q_s8 (g1, h1);
h0 = vzip2q_s8 (g1, h1);
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
vst1q_s8(out_b + 0*size + ii, a0);
vst1q_s8(out_b + 1*size + ii, b0);
vst1q_s8(out_b + 2*size + ii, c0);
vst1q_s8(out_b + 3*size + ii, d0);
vst1q_s8(out_b + 4*size + ii, e0);
vst1q_s8(out_b + 5*size + ii, f0);
vst1q_s8(out_b + 6*size + ii, g0);
vst1q_s8(out_b + 7*size + ii, h0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 8,
size - size % 16);
}
/* Transpose bytes within elements using best NEON algorithm available. */
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
// Trivial cases: power of 2 bytes.
switch (elem_size) {
case 1:
count = bshuf_copy(in, out, size, elem_size);
return count;
case 2:
count = bshuf_trans_byte_elem_NEON_16(in, out, size);
return count;
case 4:
count = bshuf_trans_byte_elem_NEON_32(in, out, size);
return count;
case 8:
count = bshuf_trans_byte_elem_NEON_64(in, out, size);
return count;
}
// Worst case: odd number of bytes. Turns out that this is faster for
// (odd * 2) byte elements as well (hence % 4).
if (elem_size % 4) {
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
return count;
}
// Multiple of power of 2: transpose hierarchically.
{
size_t nchunk_elem;
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
if ((elem_size % 8) == 0) {
nchunk_elem = elem_size / 8;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
} else if ((elem_size % 4) == 0) {
nchunk_elem = elem_size / 4;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
} else {
// Not used since scalar algorithm is faster.
nchunk_elem = elem_size / 2;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
}
free(tmp_buf);
return count;
}
}
/* Creates a mask made up of the most significant
* bit of each byte of 'input'
*/
int32_t move_byte_mask_neon(uint8x16_t input) {
return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3)
| (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7)
| (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
| (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
);
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
uint16_t* out_ui16;
int64_t count;
size_t nbyte = elem_size * size;
CHECK_MULT_EIGHT(nbyte);
int16x8_t xmm;
int32_t bt;
for (ii = 0; ii + 15 < nbyte; ii += 16) {
xmm = vld1q_s16((int16_t *) (in_b + ii));
for (kk = 0; kk < 8; kk++) {
bt = move_byte_mask_neon((uint8x16_t) xmm);
xmm = vshlq_n_s16(xmm, 1);
out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_ui16 = bt;
}
}
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
nbyte - nbyte % 16);
return count;
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
* the bytes. */
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, jj;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
CHECK_MULT_EIGHT(size);
size_t nrows = 8 * elem_size;
size_t nbyte_row = size / 8;
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
for (ii = 0; ii + 7 < nrows; ii += 8) {
for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj);
b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj);
c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj);
d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj);
e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj);
f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj);
g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj);
h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj);
a1 = vzip1q_s8(a0, b0);
b1 = vzip1q_s8(c0, d0);
c1 = vzip1q_s8(e0, f0);
d1 = vzip1q_s8(g0, h0);
e1 = vzip2q_s8(a0, b0);
f1 = vzip2q_s8(c0, d0);
g1 = vzip2q_s8(e0, f0);
h1 = vzip2q_s8(g0, h0);
a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
as = (int64x1_t *) &a1;
bs = (int64x1_t *) &b1;
cs = (int64x1_t *) &c1;
ds = (int64x1_t *) &d1;
es = (int64x1_t *) &e1;
fs = (int64x1_t *) &f1;
gs = (int64x1_t *) &g1;
hs = (int64x1_t *) &h1;
vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as);
vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1));
vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs);
vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1));
vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs);
vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1));
vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds);
vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1));
vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es);
vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1));
vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs);
vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1));
vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs);
vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1));
vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs);
vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1));
}
for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
}
}
return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that such that it is
// in_buf = out_buf safe.
const char* in_b = (const char*) in;
uint16_t* out_ui16 = (uint16_t*) out;
size_t ii, jj, kk;
size_t nbyte = elem_size * size;
int16x8_t xmm;
int32_t bt;
if (elem_size % 2) {
bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
} else {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
ii += 8 * elem_size) {
for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
for (kk = 0; kk < 8; kk++) {
bt = move_byte_mask_neon((uint8x16_t) xmm);
xmm = vshlq_n_s16(xmm, 1);
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
out_ui16[ind / 2] = bt;
}
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
#else // #ifdef USEARMNEON
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
#endif
/* ---- Worker code that uses SSE2 ----
*
* The following code makes use of the SSE2 instruction set and specialized
* 16 byte registers. The SSE2 instructions are present on modern x86
* 16 byte registers. The SSE2 instructions are present on modern x86
* processors. The first Intel processor microarchitecture supporting SSE2 was
* Pentium 4 (2000).
*
@@ -512,7 +1051,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
int64_t count;
// Trivial cases: power of 2 bytes.
switch (elem_size) {
case 1:
count = bshuf_copy(in, out, size, elem_size);
@@ -528,14 +1067,14 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
return count;
}
// Worst case: odd number of bytes. Turns out that this is faster for
// (odd * 2) byte elements as well (hence % 4).
if (elem_size % 4) {
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
return count;
}
// Multiple of power of 2: transpose hierarchically.
{
size_t nchunk_elem;
void* tmp_buf = malloc(size * elem_size);
@@ -554,7 +1093,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
} else {
// Not used since scalar algorithm is faster.
nchunk_elem = elem_size / 2;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
@@ -687,8 +1226,8 @@ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size
g1 = _mm_unpacklo_epi32(g0, h0);
h1 = _mm_unpackhi_epi32(g0, h0);
// We don't have a storeh instruction for integers, so interpret
// as a float. Have a storel (_mm_storel_epi64).
as = (__m128 *) &a1;
bs = (__m128 *) &b1;
cs = (__m128 *) &c1;
@@ -737,8 +1276,8 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that such that it is
// in_buf = out_buf safe.
const char* in_b = (const char*) in;
uint16_t* out_ui16 = (uint16_t*) out;
@@ -788,7 +1327,7 @@ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
return count;
}
#else
#else // #ifdef USESSE2
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
@@ -842,7 +1381,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
}
#endif
#endif // #ifdef USESSE2
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
@@ -857,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
*/
#ifdef USEAVX2
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
@@ -1014,8 +1552,8 @@ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that such that it is
// in_buf = out_buf safe.
const char* in_b = (const char*) in;
char* out_b = (char*) out;
@@ -1065,7 +1603,7 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
}
#else
#else // #ifdef USEAVX2
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
@@ -1096,19 +1634,179 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
return -12;
}
#endif
#endif // #ifdef USEAVX2
#ifdef USEAVX512
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
size_t nbyte = elem_size * size;
int64_t count;
int64_t* out_i64;
__m512i zmm;
__mmask64 bt;
if (nbyte >= 64) {
const __m512i mask = _mm512_set1_epi8(0);
for (ii = 0; ii + 63 < nbyte; ii += 64) {
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
for (kk = 0; kk < 8; kk++) {
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
zmm = _mm512_slli_epi16(zmm, 1);
out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_i64 = (int64_t)bt;
}
}
}
__m256i ymm;
int32_t bt32;
int32_t* out_i32;
size_t start = nbyte - nbyte % 64;
for (ii = start; ii + 31 < nbyte; ii += 32) {
ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
for (kk = 0; kk < 8; kk++) {
bt32 = _mm256_movemask_epi8(ymm);
ymm = _mm256_slli_epi16(ymm, 1);
out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_i32 = bt32;
}
}
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
nbyte - nbyte % 64 % 32);
return count;
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that such that it is
// in_buf = out_buf safe.
const char* in_b = (const char*) in;
char* out_b = (char*) out;
size_t ii, jj, kk;
size_t nbyte = elem_size * size;
__m512i zmm;
__mmask64 bt;
if (elem_size % 8) {
return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
} else {
const __m512i mask = _mm512_set1_epi8(0);
for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
ii += 8 * elem_size) {
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
for (kk = 0; kk < 8; kk++) {
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
zmm = _mm512_slli_epi16(zmm, 1);
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
* (int64_t *) &out_b[ind] = bt;
}
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
#else // #ifdef USEAVX512
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -14;
}
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -14;
}
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -14;
}
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -14;
}
#endif
/* ---- Drivers selecting best instruction set at compile time. ---- */
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
#ifdef USEAVX2
#ifdef USEAVX512
count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
#elif defined USEAVX2
count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
#elif defined(USESSE2)
count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
#elif defined(USEARMNEON)
count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size);
#else
count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
#endif
@@ -1116,14 +1814,18 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
}
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
#ifdef USEAVX2
#ifdef USEAVX512
count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
#elif defined USEAVX2
count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
#elif defined(USESSE2)
count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
#elif defined(USEARMNEON)
count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size);
#else
count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
#endif
@@ -1136,9 +1838,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
/* Wrap a function for processing a single block to process an entire buffer in
* parallel. */
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
const size_t size, const size_t elem_size, size_t block_size) {
const size_t size, const size_t elem_size, size_t block_size, const int option) {
size_t ii;
omp_size_t ii = 0;
int64_t err = 0;
int64_t count, cum_count=0;
size_t last_block_size;
@@ -1161,8 +1863,8 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
#pragma omp parallel for schedule(dynamic, 1) \
private(count) reduction(+ : cum_count)
#endif
for (ii = 0; ii < size / block_size; ii ++) {
count = fun(&C, block_size, elem_size);
for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) {
count = fun(&C, block_size, elem_size, option);
if (count < 0) err = count;
cum_count += count;
}
@@ -1170,7 +1872,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
last_block_size = size % block_size;
last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
if (last_block_size) {
count = fun(&C, last_block_size, elem_size);
count = fun(&C, last_block_size, elem_size, option);
if (count < 0) err = count;
cum_count += count;
}
@@ -1178,6 +1880,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
if (err < 0) return err;
leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
//this_iter;
last_in = (char *) ioc_get_in(&C, &this_iter);
ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
last_out = (char *) ioc_get_out(&C, &this_iter);
@@ -1193,7 +1896,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
/* Bitshuffle a single block. */
int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
const size_t size, const size_t elem_size) {
const size_t size, const size_t elem_size, const int option) {
size_t this_iter;
const void *in;
@@ -1201,7 +1904,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
int64_t count;
in = ioc_get_in(C_ptr, &this_iter);
ioc_set_next_in(C_ptr, &this_iter,
(void*) ((char*) in + size * elem_size));
@@ -1216,7 +1919,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
/* Bitunshuffle a single block. */
int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
const size_t size, const size_t elem_size) {
const size_t size, const size_t elem_size, const int option) {
size_t this_iter;
@@ -1296,11 +1999,11 @@ uint32_t bshuf_read_uint32_BE(const void* buf) {
*/
size_t bshuf_default_block_size(const size_t elem_size) {
// This function needs to be absolutely stable between versions.
// Otherwise encoded data will not be decodable.
size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
// Ensure it is a required multiple.
block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
}
@@ -1310,7 +2013,7 @@ int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
elem_size, block_size);
elem_size, block_size, 0/*option*/);
}
@@ -1318,7 +2021,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
elem_size, block_size);
elem_size, block_size, 0/*option*/);
}
+33 -8
View File
@@ -18,6 +18,8 @@
* -1 : Failed to allocate memory.
* -11 : Missing SSE.
* -12 : Missing AVX.
* -13 : Missing Arm Neon.
* -14 : Missing AVX512.
* -80 : Input size not a multiple of 8.
* -81 : block_size not multiple of 8.
* -91 : Decompression error, wrong number of bytes processed.
@@ -28,15 +30,14 @@
#ifndef BITSHUFFLE_CORE_H
#define BITSHUFFLE_CORE_H
// We assume GNU g++ defining `__cplusplus` has stdint.h
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
#include <stdint.h>
#else
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef signed short int16_t;
typedef unsigned int uint32_t;
typedef signed int int32_t;
typedef signed int int32_t;
typedef unsigned long long uint64_t;
typedef long long int64_t;
#endif
@@ -44,11 +45,11 @@
#include <stdlib.h>
// These are usually set in the setup.py.
#ifndef BSHUF_VERSION_MAJOR
#define BSHUF_VERSION_MAJOR 0
#define BSHUF_VERSION_MINOR 3
#define BSHUF_VERSION_POINT 4
#define BSHUF_VERSION_MINOR 4
#define BSHUF_VERSION_POINT 0
#endif
#ifdef __cplusplus
@@ -67,6 +68,18 @@ extern "C" {
int bshuf_using_SSE2(void);
/* ---- bshuf_using_NEON ----
*
* Whether routines where compiled with the NEON instruction set.
*
* Returns
* -------
* 1 if using NEON, 0 otherwise.
*
*/
int bshuf_using_NEON(void);
/* ---- bshuf_using_AVX2 ----
*
* Whether routines where compiled with the AVX2 instruction set.
@@ -79,6 +92,18 @@ int bshuf_using_SSE2(void);
int bshuf_using_AVX2(void);
/* ---- bshuf_using_AVX512 ----
*
* Whether routines where compiled with the AVX512 instruction set.
*
* Returns
* -------
* 1 if using AVX512, 0 otherwise.
*
*/
int bshuf_using_AVX512(void);
/* ---- bshuf_default_block_size ----
*
* The default block size as function of element size.
@@ -151,7 +176,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
#ifdef __cplusplus
}
} // extern "C"
#endif
#endif
#endif // BITSHUFFLE_CORE_H
+18 -6
View File
@@ -13,19 +13,31 @@
#ifndef BITSHUFFLE_INTERNALS_H
#define BITSHUFFLE_INTERNALS_H
// We assume GNU g++ defining `__cplusplus` has stdint.h
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
#include <stdint.h>
#else
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef signed int int32_t;
typedef unsigned long long uint64_t;
typedef long long int64_t;
#endif
#include <stdlib.h>
#include "iochain.h"
// Constants.
#ifndef BSHUF_MIN_RECOMMEND_BLOCK
#define BSHUF_MIN_RECOMMEND_BLOCK 128
#define BSHUF_BLOCKED_MULT 8
#define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this.
#define BSHUF_TARGET_BLOCK_SIZE_B 8192
#endif
// Macros.
#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
@@ -49,15 +61,15 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
/* Function definition for worker functions that process a single block. */
typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
const size_t size, const size_t elem_size);
const size_t size, const size_t elem_size, const int option);
/* Wrap a function for processing a single block to process an entire buffer in
* parallel. */
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
const size_t size, const size_t elem_size, size_t block_size);
const size_t size, const size_t elem_size, size_t block_size, const int option);
#ifdef __cplusplus
}
} // extern "C"
#endif
#endif
#endif // BITSHUFFLE_INTERNALS_H
+260
View File
@@ -0,0 +1,260 @@
/*
* Bitshuffle HDF5 filter
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include "bitshuffle.h"
#include "bshuf_h5filter.h"
#define PUSH_ERR(func, minor, str) \
H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
// Prototypes from bitshuffle.c
void bshuf_write_uint64_BE(void* buf, uint64_t num);
uint64_t bshuf_read_uint64_BE(void* buf);
void bshuf_write_uint32_BE(void* buf, uint32_t num);
uint32_t bshuf_read_uint32_BE(const void* buf);
// Only called on compression, not on reverse.
herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
herr_t r;
size_t ii;
unsigned int elem_size;
unsigned int flags;
size_t nelements = 8;
size_t nelem_max = 11;
unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
char msg[80];
r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
tmp_values, 0, NULL, NULL);
if(r<0) return -1;
// First 3 slots reserved. Move any passed options to higher addresses.
for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
values[ii + 3] = tmp_values[ii];
}
nelements = 3 + nelements;
values[0] = BSHUF_VERSION_MAJOR;
values[1] = BSHUF_VERSION_MINOR;
elem_size = H5Tget_size(type);
if(elem_size <= 0) {
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
"Invalid element size.");
return -1;
}
values[2] = elem_size;
// Validate user supplied arguments.
if (nelements > 3) {
if (values[3] % 8 || values[3] < 0) {
sprintf(msg, "Error in bitshuffle. Invalid block size: %d.",
values[3]);
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
return -1;
}
}
if (nelements > 4) {
switch (values[4]) {
case 0:
break;
case BSHUF_H5_COMPRESS_LZ4:
break;
#ifdef ZSTD_SUPPORT
case BSHUF_H5_COMPRESS_ZSTD:
break;
#endif
default:
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
"Invalid bitshuffle compression.");
}
}
r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
if(r<0) return -1;
return 1;
}
size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
const unsigned int cd_values[], size_t nbytes,
size_t *buf_size, void **buf) {
size_t size, elem_size;
int err = -1;
char msg[80];
size_t block_size = 0;
size_t buf_size_out, nbytes_uncomp, nbytes_out;
char* in_buf = *buf;
void *out_buf;
if (cd_nelmts < 3) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Not enough parameters.");
return 0;
}
elem_size = cd_values[2];
#ifdef ZSTD_SUPPORT
const int comp_lvl = cd_values[5];
#endif
// User specified block size.
if (cd_nelmts > 3) block_size = cd_values[3];
if (block_size == 0) block_size = bshuf_default_block_size(elem_size);
#ifndef ZSTD_SUPPORT
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"ZSTD compression filter chosen but ZSTD support not installed.");
return 0;
}
#endif
// Compression in addition to bitshuffle.
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
if (flags & H5Z_FLAG_REVERSE) {
// First eight bytes is the number of bytes in the output buffer,
// little endian.
nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
// Override the block size with the one read from the header.
block_size = bshuf_read_uint32_BE((const char*) in_buf + 8) / elem_size;
// Skip over the header.
in_buf += 12;
buf_size_out = nbytes_uncomp;
} else {
nbytes_uncomp = nbytes;
// Pick which compressions library to use
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size,
elem_size, block_size) + 12;
}
#ifdef ZSTD_SUPPORT
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
buf_size_out = bshuf_compress_zstd_bound(nbytes_uncomp / elem_size,
elem_size, block_size) + 12;
}
#endif
}
} else {
nbytes_uncomp = nbytes;
buf_size_out = nbytes;
}
// TODO, remove this restriction by memcopying the extra.
if (nbytes_uncomp % elem_size) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Non integer number of elements.");
return 0;
}
size = nbytes_uncomp / elem_size;
out_buf = malloc(buf_size_out);
if (out_buf == NULL) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Could not allocate output buffer.");
return 0;
}
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
if (flags & H5Z_FLAG_REVERSE) {
// Bit unshuffle/decompress.
// Pick which compressions library to use
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size, block_size);
}
#ifdef ZSTD_SUPPORT
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
err = bshuf_decompress_zstd(in_buf, out_buf, size, elem_size, block_size);
}
#endif
nbytes_out = nbytes_uncomp;
} else {
// Bit shuffle/compress.
// Write the header, described in
// http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
// Technically we should be using signed integers instead of
// unsigned ones, however for valid inputs (positive numbers) these
// have the same representation.
bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
bshuf_write_uint32_BE((char*) out_buf + 8, block_size * elem_size);
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
elem_size, block_size);
}
#ifdef ZSTD_SUPPORT
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
err = bshuf_compress_zstd(in_buf, (char*) out_buf + 12, size,
elem_size, block_size, comp_lvl);
}
#endif
nbytes_out = err + 12;
}
} else {
if (flags & H5Z_FLAG_REVERSE) {
// Bit unshuffle.
err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
block_size); } else {
// Bit shuffle.
err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
block_size); } nbytes_out = nbytes; }
//printf("nb_in %d, nb_uncomp %d, nb_out %d, buf_out %d, block %d\n",
//nbytes, nbytes_uncomp, nbytes_out, buf_size_out, block_size);
if (err < 0) {
sprintf(msg, "Error in bitshuffle with error code %d.", err);
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
free(out_buf);
return 0;
} else {
free(*buf);
*buf = out_buf;
*buf_size = buf_size_out;
return nbytes_out;
}
}
H5Z_class_t bshuf_H5Filter[1] = {{
H5Z_CLASS_T_VERS,
(H5Z_filter_t)(BSHUF_H5FILTER),
1, 1,
"bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
NULL,
(H5Z_set_local_func_t)(bshuf_h5_set_local),
(H5Z_func_t)(bshuf_h5_filter)
}};
int bshuf_register_h5filter(void){
int retval;
retval = H5Zregister(bshuf_H5Filter);
if(retval<0){
PUSH_ERR("bshuf_register_h5filter",
H5E_CANTREGISTER, "Can't register bitshuffle filter");
}
return retval;
}
+67
View File
@@ -0,0 +1,67 @@
/*
* Bitshuffle HDF5 filter
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*
* Header File
*
* Filter Options
* --------------
* block_size (option slot 0) : integer (optional)
* What block size to use (in elements not bytes). Default is 0,
* for which bitshuffle will pick a block size with a target of 8kb.
* Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
* Whether to apply LZ4 compression to the data after bitshuffling.
* This is much faster than applying compression as a second filter
* because it is done when the small block of data is already in the
* L1 cache.
*
* For LZ4 compression, the compressed format of the data is the same as
* for the normal LZ4 filter described in
* http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
*
*/
#ifndef BSHUF_H5FILTER_H
#define BSHUF_H5FILTER_H
#ifdef __cplusplus
extern "C" {
#endif
#define H5Z_class_t_vers 2
#include "hdf5.h"
#define BSHUF_H5FILTER 32008
#define BSHUF_H5_COMPRESS_LZ4 2
#define BSHUF_H5_COMPRESS_ZSTD 3
extern H5Z_class_t bshuf_H5Filter[1];
/* ---- bshuf_register_h5filter ----
*
* Register the bitshuffle HDF5 filter within the HDF5 library.
*
* Call this before using the bitshuffle HDF5 filter from C unless
* using dynamically loaded filters.
*
*/
int bshuf_register_h5filter(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // BSHUF_H5FILTER_H
+4 -4
View File
@@ -1,5 +1,5 @@
/*
* IOchain - Distribute a chain of dependant IO events amoung threads.
* IOchain - Distribute a chain of dependent IO events among threads.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
@@ -81,9 +81,9 @@ void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
#ifdef _OPENMP
omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
// *in_pl[this_iter]* lock released at the end of the iteration to avoid being
// overtaken by previous threads and having *out_pl[this_iter]* corrupted.
// Especially worried about thread 0, iteration 0.
omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
#endif
}
+3 -3
View File
@@ -1,5 +1,5 @@
/*
* IOchain - Distribute a chain of dependant IO events amoung threads.
* IOchain - Distribute a chain of dependent IO events among threads.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
@@ -25,7 +25,7 @@
* Usage
* -----
* - Call `ioc_init` in serial block.
* - Each thread should create a local variable *size_t this_iter* and
* - Each thread should create a local variable *size_t this_iter* and
* pass its address to all function calls. Its value will be set
* inside the functions and is used to identify the thread.
* - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
@@ -90,5 +90,5 @@ void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
void * ioc_get_out(ioc_chain *C, size_t *this_iter);
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
#endif
#endif // IOCHAIN_H
+1921 -941
View File
File diff suppressed because it is too large Load Diff
+671 -257
View File
File diff suppressed because it is too large Load Diff