Update bitshuffle/lz4 code + embed bshuf_h5filter code into the library
This commit is contained in:
@@ -13,14 +13,7 @@ CFLAGS=-DH5_USE_110_API -Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=
|
||||
|
||||
# include https://github.com/kiyo-masui/bitshuffle (to handle
|
||||
# e.g. bitshuffle compressed pixel_masks)
|
||||
use_BITSHUFFLE =
|
||||
ifeq ($(use_BITSHUFFLE),)
|
||||
else
|
||||
BITSHUFFLE_SRC_DIR = ../bitshuffle-master/src/
|
||||
BITSHUFFLE_INC_DIR = ../bitshuffle-master/src/
|
||||
BITSHUFFLE_OBJS = $(BUILD_DIR)/bshuf_h5filter.o
|
||||
CFLAGS += -DUSE_BITSHUFFLE -I$(BITSHUFFLE_INC_DIR)
|
||||
endif
|
||||
CFLAGS += -DUSE_BITSHUFFLE
|
||||
|
||||
.PHONY: plugin
|
||||
plugin: $(BUILD_DIR)/durin-plugin.so
|
||||
@@ -38,10 +31,6 @@ $(BUILD_DIR)/test_plugin: $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_g
|
||||
mkdir -p $(BUILD_DIR)
|
||||
gfortran -O -g -fopenmp -ldl $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90 -o $@ -J$(BUILD_DIR)
|
||||
|
||||
$(BUILD_DIR)/bshuf_h5filter.o: $(BITSHUFFLE_SRC_DIR)/bshuf_h5filter.c
|
||||
mkdir -p $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c
|
||||
mkdir -p $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
@@ -51,7 +40,7 @@ $(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \
|
||||
$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
|
||||
$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o $(BSLZ4_BUILD_DIR)/bshuf_h5filter.o
|
||||
mkdir -p $(BUILD_DIR)
|
||||
ar rcs $@ $^
|
||||
|
||||
|
||||
+139
-25
@@ -14,23 +14,22 @@
|
||||
#include "bitshuffle_internals.h"
|
||||
#include "lz4.h"
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
#include "zstd.h"
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
|
||||
|
||||
#define BSHUF_LZ4_DECOMPRESS_FAST
|
||||
|
||||
|
||||
|
||||
// Macros.
|
||||
#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \
|
||||
free(buf); return count - 1000; }
|
||||
|
||||
|
||||
/* Bitshuffle and compress a single block. */
|
||||
int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size) {
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *tmp_buf_bshuf;
|
||||
@@ -42,7 +41,8 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
tmp_buf_bshuf = malloc(size * elem_size);
|
||||
if (tmp_buf_bshuf == NULL) return -1;
|
||||
|
||||
tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
|
||||
int dst_capacity = LZ4_compressBound(size * elem_size);
|
||||
tmp_buf_lz4 = malloc(dst_capacity);
|
||||
if (tmp_buf_lz4 == NULL){
|
||||
free(tmp_buf_bshuf);
|
||||
return -1;
|
||||
@@ -58,7 +58,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
free(tmp_buf_bshuf);
|
||||
return count;
|
||||
}
|
||||
nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
|
||||
nbytes = LZ4_compress_default((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size, dst_capacity);
|
||||
free(tmp_buf_bshuf);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);
|
||||
|
||||
@@ -76,7 +76,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
|
||||
/* Decompress and bitunshuffle a single block. */
|
||||
int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
const size_t size, const size_t elem_size) {
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *out, *tmp_buf;
|
||||
@@ -96,14 +96,6 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
#ifdef BSHUF_LZ4_DECOMPRESS_FAST
|
||||
nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
if (nbytes != nbytes_from_header) {
|
||||
free(tmp_buf);
|
||||
return -91;
|
||||
}
|
||||
#else
|
||||
nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
|
||||
size * elem_size);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
@@ -112,7 +104,7 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
return -91;
|
||||
}
|
||||
nbytes = nbytes_from_header;
|
||||
#endif
|
||||
|
||||
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
nbytes += 4;
|
||||
@@ -121,6 +113,92 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
/* Bitshuffle and compress a single block. */
|
||||
int64_t bshuf_compress_zstd_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size, const int comp_lvl) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *tmp_buf_bshuf;
|
||||
void *tmp_buf_zstd;
|
||||
size_t this_iter;
|
||||
const void *in;
|
||||
void *out;
|
||||
|
||||
tmp_buf_bshuf = malloc(size * elem_size);
|
||||
if (tmp_buf_bshuf == NULL) return -1;
|
||||
|
||||
size_t tmp_buf_zstd_size = ZSTD_compressBound(size * elem_size);
|
||||
tmp_buf_zstd = malloc(tmp_buf_zstd_size);
|
||||
if (tmp_buf_zstd == NULL){
|
||||
free(tmp_buf_bshuf);
|
||||
return -1;
|
||||
}
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
|
||||
|
||||
count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
|
||||
if (count < 0) {
|
||||
free(tmp_buf_zstd);
|
||||
free(tmp_buf_bshuf);
|
||||
return count;
|
||||
}
|
||||
nbytes = ZSTD_compress(tmp_buf_zstd, tmp_buf_zstd_size, (const void*)tmp_buf_bshuf, size * elem_size, comp_lvl);
|
||||
free(tmp_buf_bshuf);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_zstd);
|
||||
|
||||
out = ioc_get_out(C_ptr, &this_iter);
|
||||
ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
|
||||
|
||||
bshuf_write_uint32_BE(out, nbytes);
|
||||
memcpy((char *) out + 4, tmp_buf_zstd, nbytes);
|
||||
|
||||
free(tmp_buf_zstd);
|
||||
|
||||
return nbytes + 4;
|
||||
}
|
||||
|
||||
|
||||
/* Decompress and bitunshuffle a single block. */
|
||||
int64_t bshuf_decompress_zstd_block(ioc_chain *C_ptr,
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *out, *tmp_buf;
|
||||
const void *in;
|
||||
size_t this_iter;
|
||||
int32_t nbytes_from_header;
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
nbytes_from_header = bshuf_read_uint32_BE(in);
|
||||
ioc_set_next_in(C_ptr, &this_iter,
|
||||
(void*) ((char*) in + nbytes_from_header + 4));
|
||||
|
||||
out = ioc_get_out(C_ptr, &this_iter);
|
||||
ioc_set_next_out(C_ptr, &this_iter,
|
||||
(void *) ((char *) out + size * elem_size));
|
||||
|
||||
tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
nbytes = ZSTD_decompress(tmp_buf, size * elem_size, (void *)((char *) in + 4), nbytes_from_header);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
if (nbytes != size * elem_size) {
|
||||
free(tmp_buf);
|
||||
return -91;
|
||||
}
|
||||
|
||||
nbytes = nbytes_from_header;
|
||||
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
nbytes += 4;
|
||||
|
||||
free(tmp_buf);
|
||||
return nbytes;
|
||||
}
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
|
||||
/* ---- Public functions ----
|
||||
*
|
||||
@@ -138,13 +216,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
|
||||
}
|
||||
if (block_size % BSHUF_BLOCKED_MULT) return -81;
|
||||
|
||||
|
||||
|
||||
// Note that each block gets a 4 byte header.
|
||||
// Size of full blocks.
|
||||
bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
|
||||
|
||||
// Size of partial blocks, if any.
|
||||
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
|
||||
|
||||
// Size of uncompressed data not fitting into any blocks.
|
||||
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
|
||||
return bound;
|
||||
}
|
||||
@@ -153,13 +231,49 @@ size_t bshuf_compress_lz4_bound(const size_t size,
|
||||
int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
|
||||
elem_size, block_size);
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
|
||||
elem_size, block_size);
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
size_t bshuf_compress_zstd_bound(const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
size_t bound, leftover;
|
||||
|
||||
if (block_size == 0) {
|
||||
block_size = bshuf_default_block_size(elem_size);
|
||||
}
|
||||
if (block_size % BSHUF_BLOCKED_MULT) return -81;
|
||||
|
||||
// Note that each block gets a 4 byte header.
|
||||
// Size of full blocks.
|
||||
bound = (ZSTD_compressBound(block_size * elem_size) + 4) * (size / block_size);
|
||||
// Size of partial blocks, if any.
|
||||
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
if (leftover) bound += ZSTD_compressBound(leftover * elem_size) + 4;
|
||||
// Size of uncompressed data not fitting into any blocks.
|
||||
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
|
||||
return bound;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size, const int comp_lvl) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_compress_zstd_block, in, out, size,
|
||||
elem_size, block_size, comp_lvl);
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_decompress_zstd_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
+89
-7
@@ -35,6 +35,10 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ---- LZ4 Interface ----
|
||||
*/
|
||||
|
||||
/* ---- bshuf_compress_lz4_bound ----
|
||||
*
|
||||
* Bound on size of data compressed with *bshuf_compress_lz4*.
|
||||
@@ -94,11 +98,6 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
|
||||
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
|
||||
* must patch the parameters used to compress the data.
|
||||
*
|
||||
* NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
|
||||
* LZ4_decompress_fast from LZ4, which does not protect against maliciously
|
||||
* formed datasets. By modifying the compressed data, this function could be
|
||||
* coerced into leaving the boundaries of the input buffer.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer
|
||||
@@ -116,8 +115,91 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
|
||||
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
/*
|
||||
* ---- ZSTD Interface ----
|
||||
*/
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
/* ---- bshuf_compress_zstd_bound ----
|
||||
*
|
||||
* Bound on size of data compressed with *bshuf_compress_zstd*.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* Bound on compressed data size.
|
||||
*
|
||||
*/
|
||||
size_t bshuf_compress_zstd_bound(const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
/* ---- bshuf_compress_zstd ----
|
||||
*
|
||||
* Bitshuffled and compress the data using zstd.
|
||||
*
|
||||
* Transpose within elements, in blocks of data of *block_size* elements then
|
||||
* compress the blocks using ZSTD. In the output buffer, each block is prefixed
|
||||
* by a 4 byte integer giving the compressed size of that block.
|
||||
*
|
||||
* Output buffer must be large enough to hold the compressed data. This could
|
||||
* be in principle substantially larger than the input buffer. Use the routine
|
||||
* *bshuf_compress_zstd_bound* to get an upper limit.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer, must be of size * elem_size bytes
|
||||
* out : output buffer, must be large enough to hold data.
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
* comp_lvl : compression level applied
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* number of bytes used in output buffer, negative error-code if failed.
|
||||
*
|
||||
*/
|
||||
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size, const size_t
|
||||
elem_size, size_t block_size, const int comp_lvl);
|
||||
|
||||
|
||||
/* ---- bshuf_decompress_zstd ----
|
||||
*
|
||||
* Undo compression and bitshuffling.
|
||||
*
|
||||
* Decompress data then un-bitshuffle it in blocks of *block_size* elements.
|
||||
*
|
||||
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
|
||||
* must patch the parameters used to compress the data.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer
|
||||
* out : output buffer, must be of size * elem_size bytes
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* number of bytes consumed in *input* buffer, negative error-code if failed.
|
||||
*
|
||||
*/
|
||||
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BITSHUFFLE_H
|
||||
|
||||
+742
-39
@@ -16,30 +16,55 @@
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
|
||||
#define USEAVX512
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__) && defined (__SSE2__)
|
||||
#define USEAVX2
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#if defined(__SSE2__) || defined(NO_WARN_X86_INTRINSICS)
|
||||
#define USESSE2
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__) || (__ARM_NEON)
|
||||
#ifdef __aarch64__
|
||||
#define USEARMNEON
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Conditional includes for SSE2 and AVX2.
|
||||
#ifdef USEAVX2
|
||||
#include <immintrin.h>
|
||||
#elif defined USESSE2
|
||||
#include <emmintrin.h>
|
||||
#elif defined USEARMNEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(_OPENMP) && defined(_MSC_VER)
|
||||
typedef int64_t omp_size_t;
|
||||
#else
|
||||
typedef size_t omp_size_t;
|
||||
#endif
|
||||
|
||||
|
||||
// Macros.
|
||||
#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
|
||||
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
|
||||
|
||||
|
||||
/* ---- Functions indicating compile time instruction set. ---- */
|
||||
|
||||
int bshuf_using_NEON(void) {
|
||||
#ifdef USEARMNEON
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int bshuf_using_SSE2(void) {
|
||||
#ifdef USESSE2
|
||||
return 1;
|
||||
@@ -58,6 +83,14 @@ int bshuf_using_AVX2(void) {
|
||||
}
|
||||
|
||||
|
||||
int bshuf_using_AVX512(void) {
|
||||
#ifdef USEAVX512
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---- Worker code not requiring special instruction sets. ----
|
||||
*
|
||||
* The following code does not use any x86 specific vectorized instructions
|
||||
@@ -131,8 +164,8 @@ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t
|
||||
CHECK_MULT_EIGHT(start);
|
||||
|
||||
if (size > start) {
|
||||
|
||||
|
||||
// ii loop separated into 2 loops so the compiler can unroll
|
||||
// the inner one.
|
||||
for (ii = start; ii + 7 < size; ii += 8) {
|
||||
for (jj = 0; jj < elem_size; jj++) {
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
@@ -348,10 +381,516 @@ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size
|
||||
}
|
||||
|
||||
|
||||
/* ---- Worker code that uses Arm NEON ----
|
||||
*
|
||||
* The following code makes use of the Arm NEON instruction set.
|
||||
* NEON technology is the implementation of the ARM Advanced Single
|
||||
* Instruction Multiple Data (SIMD) extension.
|
||||
* The NEON unit is the component of the processor that executes SIMD instructions.
|
||||
* It is also called the NEON Media Processing Engine (MPE).
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef USEARMNEON
|
||||
|
||||
/* Transpose bytes within elements for 16 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char *in_b = (const char*) in;
|
||||
char *out_b = (char*) out;
|
||||
int8x16_t a0, b0, a1, b1;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 2*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 2*ii + 1*16);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
|
||||
vst1q_s8(out_b + 0*size + ii, a0);
|
||||
vst1q_s8(out_b + 1*size + ii, b0);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 2,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements for 32 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char *in_b;
|
||||
char *out_b;
|
||||
in_b = (const char*) in;
|
||||
out_b = (char*) out;
|
||||
int8x16_t a0, b0, c0, d0, a1, b1, c1, d1;
|
||||
int64x2_t a2, b2, c2, d2;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 4*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 4*ii + 1*16);
|
||||
c0 = vld1q_s8(in_b + 4*ii + 2*16);
|
||||
d0 = vld1q_s8(in_b + 4*ii + 3*16);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
c1 = vzip1q_s8(c0, d0);
|
||||
d1 = vzip2q_s8(c0, d0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
c0 = vzip1q_s8(c1, d1);
|
||||
d0 = vzip2q_s8(c1, d1);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
c1 = vzip1q_s8(c0, d0);
|
||||
d1 = vzip2q_s8(c0, d0);
|
||||
|
||||
a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
|
||||
b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
|
||||
c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
|
||||
d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
|
||||
|
||||
vst1q_s64((int64_t *) (out_b + 0*size + ii), a2);
|
||||
vst1q_s64((int64_t *) (out_b + 1*size + ii), b2);
|
||||
vst1q_s64((int64_t *) (out_b + 2*size + ii), c2);
|
||||
vst1q_s64((int64_t *) (out_b + 3*size + ii), d2);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 4,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements for 64 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
|
||||
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 8*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 8*ii + 1*16);
|
||||
c0 = vld1q_s8(in_b + 8*ii + 2*16);
|
||||
d0 = vld1q_s8(in_b + 8*ii + 3*16);
|
||||
e0 = vld1q_s8(in_b + 8*ii + 4*16);
|
||||
f0 = vld1q_s8(in_b + 8*ii + 5*16);
|
||||
g0 = vld1q_s8(in_b + 8*ii + 6*16);
|
||||
h0 = vld1q_s8(in_b + 8*ii + 7*16);
|
||||
|
||||
a1 = vzip1q_s8 (a0, b0);
|
||||
b1 = vzip2q_s8 (a0, b0);
|
||||
c1 = vzip1q_s8 (c0, d0);
|
||||
d1 = vzip2q_s8 (c0, d0);
|
||||
e1 = vzip1q_s8 (e0, f0);
|
||||
f1 = vzip2q_s8 (e0, f0);
|
||||
g1 = vzip1q_s8 (g0, h0);
|
||||
h1 = vzip2q_s8 (g0, h0);
|
||||
|
||||
a0 = vzip1q_s8 (a1, b1);
|
||||
b0 = vzip2q_s8 (a1, b1);
|
||||
c0 = vzip1q_s8 (c1, d1);
|
||||
d0 = vzip2q_s8 (c1, d1);
|
||||
e0 = vzip1q_s8 (e1, f1);
|
||||
f0 = vzip2q_s8 (e1, f1);
|
||||
g0 = vzip1q_s8 (g1, h1);
|
||||
h0 = vzip2q_s8 (g1, h1);
|
||||
|
||||
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
|
||||
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
|
||||
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
|
||||
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
|
||||
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
|
||||
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
|
||||
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
|
||||
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
|
||||
|
||||
a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
|
||||
b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
|
||||
c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
|
||||
d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
|
||||
e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
|
||||
f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
|
||||
g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
|
||||
h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
|
||||
|
||||
vst1q_s8(out_b + 0*size + ii, a0);
|
||||
vst1q_s8(out_b + 1*size + ii, b0);
|
||||
vst1q_s8(out_b + 2*size + ii, c0);
|
||||
vst1q_s8(out_b + 3*size + ii, d0);
|
||||
vst1q_s8(out_b + 4*size + ii, e0);
|
||||
vst1q_s8(out_b + 5*size + ii, f0);
|
||||
vst1q_s8(out_b + 6*size + ii, g0);
|
||||
vst1q_s8(out_b + 7*size + ii, h0);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 8,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements using best NEON algorithm available. */
|
||||
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
// Trivial cases: power of 2 bytes.
|
||||
switch (elem_size) {
|
||||
case 1:
|
||||
count = bshuf_copy(in, out, size, elem_size);
|
||||
return count;
|
||||
case 2:
|
||||
count = bshuf_trans_byte_elem_NEON_16(in, out, size);
|
||||
return count;
|
||||
case 4:
|
||||
count = bshuf_trans_byte_elem_NEON_32(in, out, size);
|
||||
return count;
|
||||
case 8:
|
||||
count = bshuf_trans_byte_elem_NEON_64(in, out, size);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Worst case: odd number of bytes. Turns out that this is faster for
|
||||
// (odd * 2) byte elements as well (hence % 4).
|
||||
if (elem_size % 4) {
|
||||
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Multiple of power of 2: transpose hierarchically.
|
||||
{
|
||||
size_t nchunk_elem;
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
if ((elem_size % 8) == 0) {
|
||||
nchunk_elem = elem_size / 8;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
|
||||
count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
|
||||
} else if ((elem_size % 4) == 0) {
|
||||
nchunk_elem = elem_size / 4;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
|
||||
count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
|
||||
} else {
|
||||
// Not used since scalar algorithm is faster.
|
||||
nchunk_elem = elem_size / 2;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
|
||||
count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
|
||||
}
|
||||
|
||||
free(tmp_buf);
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Creates a mask made up of the most significant
|
||||
* bit of each byte of 'input'
|
||||
*/
|
||||
int32_t move_byte_mask_neon(uint8x16_t input) {
|
||||
|
||||
return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3)
|
||||
| (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7)
|
||||
| (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
|
||||
| (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
|
||||
);
|
||||
}
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, kk;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
uint16_t* out_ui16;
|
||||
|
||||
int64_t count;
|
||||
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
CHECK_MULT_EIGHT(nbyte);
|
||||
|
||||
int16x8_t xmm;
|
||||
int32_t bt;
|
||||
|
||||
for (ii = 0; ii + 15 < nbyte; ii += 16) {
|
||||
xmm = vld1q_s16((int16_t *) (in_b + ii));
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = move_byte_mask_neon((uint8x16_t) xmm);
|
||||
xmm = vshlq_n_s16(xmm, 1);
|
||||
out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_ui16 = bt;
|
||||
}
|
||||
}
|
||||
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
|
||||
nbyte - nbyte % 16);
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bits within elements. */
|
||||
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* For data organized into a row for each bit (8 * elem_size rows), transpose
|
||||
* the bytes. */
|
||||
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, jj;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
size_t nrows = 8 * elem_size;
|
||||
size_t nbyte_row = size / 8;
|
||||
|
||||
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
|
||||
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
|
||||
int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
|
||||
|
||||
for (ii = 0; ii + 7 < nrows; ii += 8) {
|
||||
for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
|
||||
a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj);
|
||||
b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj);
|
||||
c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj);
|
||||
d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj);
|
||||
e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj);
|
||||
f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj);
|
||||
g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj);
|
||||
h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip1q_s8(c0, d0);
|
||||
c1 = vzip1q_s8(e0, f0);
|
||||
d1 = vzip1q_s8(g0, h0);
|
||||
e1 = vzip2q_s8(a0, b0);
|
||||
f1 = vzip2q_s8(c0, d0);
|
||||
g1 = vzip2q_s8(e0, f0);
|
||||
h1 = vzip2q_s8(g0, h0);
|
||||
|
||||
a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
|
||||
b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
|
||||
c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
|
||||
d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
|
||||
e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
|
||||
f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
|
||||
g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
|
||||
h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
|
||||
|
||||
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
|
||||
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
|
||||
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
|
||||
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
|
||||
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
|
||||
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
|
||||
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
|
||||
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
|
||||
|
||||
as = (int64x1_t *) &a1;
|
||||
bs = (int64x1_t *) &b1;
|
||||
cs = (int64x1_t *) &c1;
|
||||
ds = (int64x1_t *) &d1;
|
||||
es = (int64x1_t *) &e1;
|
||||
fs = (int64x1_t *) &f1;
|
||||
gs = (int64x1_t *) &g1;
|
||||
hs = (int64x1_t *) &h1;
|
||||
|
||||
vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1));
|
||||
}
|
||||
for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
|
||||
out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
|
||||
}
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
|
||||
/* Shuffle bits within the bytes of eight element blocks. */
|
||||
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
uint16_t* out_ui16 = (uint16_t*) out;
|
||||
|
||||
size_t ii, jj, kk;
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
int16x8_t xmm;
|
||||
int32_t bt;
|
||||
|
||||
if (elem_size % 2) {
|
||||
bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
|
||||
} else {
|
||||
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
|
||||
ii += 8 * elem_size) {
|
||||
for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
|
||||
xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = move_byte_mask_neon((uint8x16_t) xmm);
|
||||
xmm = vshlq_n_s16(xmm, 1);
|
||||
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
|
||||
out_ui16[ind / 2] = bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
|
||||
/* Untranspose bits within elements. */
|
||||
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
#else // #ifdef USEARMNEON
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* ---- Worker code that uses SSE2 ----
|
||||
*
|
||||
* The following code makes use of the SSE2 instruction set and specialized
|
||||
* 16 byte registers. The SSE2 instructions are present on modern x86
|
||||
* 16 byte registers. The SSE2 instructions are present on modern x86
|
||||
* processors. The first Intel processor microarchitecture supporting SSE2 was
|
||||
* Pentium 4 (2000).
|
||||
*
|
||||
@@ -512,7 +1051,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
|
||||
int64_t count;
|
||||
|
||||
|
||||
// Trivial cases: power of 2 bytes.
|
||||
switch (elem_size) {
|
||||
case 1:
|
||||
count = bshuf_copy(in, out, size, elem_size);
|
||||
@@ -528,14 +1067,14 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Worst case: odd number of bytes. Turns out that this is faster for
|
||||
// (odd * 2) byte elements as well (hence % 4).
|
||||
if (elem_size % 4) {
|
||||
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
// Multiple of power of 2: transpose hierarchically.
|
||||
{
|
||||
size_t nchunk_elem;
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
@@ -554,7 +1093,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
|
||||
} else {
|
||||
|
||||
// Not used since scalar algorithm is faster.
|
||||
nchunk_elem = elem_size / 2;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
|
||||
count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
|
||||
@@ -687,8 +1226,8 @@ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size
|
||||
g1 = _mm_unpacklo_epi32(g0, h0);
|
||||
h1 = _mm_unpackhi_epi32(g0, h0);
|
||||
|
||||
|
||||
|
||||
// We don't have a storeh instruction for integers, so interpret
|
||||
// as a float. Have a storel (_mm_storel_epi64).
|
||||
as = (__m128 *) &a1;
|
||||
bs = (__m128 *) &b1;
|
||||
cs = (__m128 *) &c1;
|
||||
@@ -737,8 +1276,8 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
uint16_t* out_ui16 = (uint16_t*) out;
|
||||
|
||||
@@ -788,7 +1327,7 @@ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
|
||||
return count;
|
||||
}
|
||||
|
||||
#else
|
||||
#else // #ifdef USESSE2
|
||||
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
|
||||
@@ -842,7 +1381,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
#endif // #ifdef USESSE2
|
||||
|
||||
|
||||
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
|
||||
@@ -857,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
*/
|
||||
|
||||
#ifdef USEAVX2
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
@@ -1014,8 +1552,8 @@ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
@@ -1065,7 +1603,7 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
#else // #ifdef USEAVX2
|
||||
|
||||
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
@@ -1096,19 +1634,179 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
|
||||
return -12;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // #ifdef USEAVX2
|
||||
|
||||
#ifdef USEAVX512
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, kk;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
size_t nbyte = elem_size * size;
|
||||
int64_t count;
|
||||
|
||||
int64_t* out_i64;
|
||||
__m512i zmm;
|
||||
__mmask64 bt;
|
||||
if (nbyte >= 64) {
|
||||
const __m512i mask = _mm512_set1_epi8(0);
|
||||
|
||||
for (ii = 0; ii + 63 < nbyte; ii += 64) {
|
||||
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
|
||||
zmm = _mm512_slli_epi16(zmm, 1);
|
||||
out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_i64 = (int64_t)bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__m256i ymm;
|
||||
int32_t bt32;
|
||||
int32_t* out_i32;
|
||||
size_t start = nbyte - nbyte % 64;
|
||||
for (ii = start; ii + 31 < nbyte; ii += 32) {
|
||||
ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt32 = _mm256_movemask_epi8(ymm);
|
||||
ymm = _mm256_slli_epi16(ymm, 1);
|
||||
out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_i32 = bt32;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
|
||||
nbyte - nbyte % 64 % 32);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bits within elements. */
|
||||
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
|
||||
}
|
||||
|
||||
/* Shuffle bits within the bytes of eight element blocks. */
|
||||
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
size_t ii, jj, kk;
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
__m512i zmm;
|
||||
__mmask64 bt;
|
||||
|
||||
if (elem_size % 8) {
|
||||
return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
|
||||
} else {
|
||||
const __m512i mask = _mm512_set1_epi8(0);
|
||||
for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
|
||||
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
|
||||
ii += 8 * elem_size) {
|
||||
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
|
||||
zmm = _mm512_slli_epi16(zmm, 1);
|
||||
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
|
||||
* (int64_t *) &out_b[ind] = bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
/* Untranspose bits within elements. */
|
||||
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
return count;
|
||||
}
|
||||
|
||||
#else // #ifdef USEAVX512
|
||||
|
||||
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
return -14;
|
||||
}
|
||||
|
||||
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
|
||||
}
|
||||
|
||||
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
}
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ---- Drivers selecting best instruction set at compile time. ---- */
|
||||
|
||||
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
#ifdef USEAVX2
|
||||
#ifdef USEAVX512
|
||||
count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
|
||||
#elif defined USEAVX2
|
||||
count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
|
||||
#elif defined(USESSE2)
|
||||
count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
|
||||
#elif defined(USEARMNEON)
|
||||
count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size);
|
||||
#else
|
||||
count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
|
||||
#endif
|
||||
@@ -1116,14 +1814,18 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
#ifdef USEAVX2
|
||||
#ifdef USEAVX512
|
||||
count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
|
||||
#elif defined USEAVX2
|
||||
count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
|
||||
#elif defined(USESSE2)
|
||||
count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
|
||||
#elif defined(USEARMNEON)
|
||||
count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size);
|
||||
#else
|
||||
count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
|
||||
#endif
|
||||
@@ -1136,9 +1838,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
/* Wrap a function for processing a single block to process an entire buffer in
|
||||
* parallel. */
|
||||
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
|
||||
const size_t size, const size_t elem_size, size_t block_size) {
|
||||
const size_t size, const size_t elem_size, size_t block_size, const int option) {
|
||||
|
||||
size_t ii;
|
||||
omp_size_t ii = 0;
|
||||
int64_t err = 0;
|
||||
int64_t count, cum_count=0;
|
||||
size_t last_block_size;
|
||||
@@ -1161,8 +1863,8 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
#pragma omp parallel for schedule(dynamic, 1) \
|
||||
private(count) reduction(+ : cum_count)
|
||||
#endif
|
||||
for (ii = 0; ii < size / block_size; ii ++) {
|
||||
count = fun(&C, block_size, elem_size);
|
||||
for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) {
|
||||
count = fun(&C, block_size, elem_size, option);
|
||||
if (count < 0) err = count;
|
||||
cum_count += count;
|
||||
}
|
||||
@@ -1170,7 +1872,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
last_block_size = size % block_size;
|
||||
last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
|
||||
if (last_block_size) {
|
||||
count = fun(&C, last_block_size, elem_size);
|
||||
count = fun(&C, last_block_size, elem_size, option);
|
||||
if (count < 0) err = count;
|
||||
cum_count += count;
|
||||
}
|
||||
@@ -1178,6 +1880,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
if (err < 0) return err;
|
||||
|
||||
leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
|
||||
//this_iter;
|
||||
last_in = (char *) ioc_get_in(&C, &this_iter);
|
||||
ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
|
||||
last_out = (char *) ioc_get_out(&C, &this_iter);
|
||||
@@ -1193,7 +1896,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
|
||||
/* Bitshuffle a single block. */
|
||||
int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size) {
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
size_t this_iter;
|
||||
const void *in;
|
||||
@@ -1201,7 +1904,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
int64_t count;
|
||||
|
||||
|
||||
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
ioc_set_next_in(C_ptr, &this_iter,
|
||||
(void*) ((char*) in + size * elem_size));
|
||||
@@ -1216,7 +1919,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
|
||||
/* Bitunshuffle a single block. */
|
||||
int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
|
||||
const size_t size, const size_t elem_size) {
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
|
||||
size_t this_iter;
|
||||
@@ -1296,11 +1999,11 @@ uint32_t bshuf_read_uint32_BE(const void* buf) {
|
||||
*/
|
||||
|
||||
size_t bshuf_default_block_size(const size_t elem_size) {
|
||||
|
||||
|
||||
// This function needs to be absolutely stable between versions.
|
||||
// Otherwise encoded data will not be decodable.
|
||||
|
||||
size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
|
||||
|
||||
// Ensure it is a required multiple.
|
||||
block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
|
||||
}
|
||||
@@ -1310,7 +2013,7 @@ int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
|
||||
elem_size, block_size);
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
|
||||
|
||||
@@ -1318,7 +2021,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
|
||||
elem_size, block_size);
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
* -1 : Failed to allocate memory.
|
||||
* -11 : Missing SSE.
|
||||
* -12 : Missing AVX.
|
||||
* -13 : Missing Arm Neon.
|
||||
* -14 : Missing AVX512.
|
||||
* -80 : Input size not a multiple of 8.
|
||||
* -81 : block_size not multiple of 8.
|
||||
* -91 : Decompression error, wrong number of bytes processed.
|
||||
@@ -28,15 +30,14 @@
|
||||
#ifndef BITSHUFFLE_CORE_H
|
||||
#define BITSHUFFLE_CORE_H
|
||||
|
||||
|
||||
// We assume GNU g++ defining `__cplusplus` has stdint.h
|
||||
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
|
||||
#include <stdint.h>
|
||||
#else
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef signed short int16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef long long int64_t;
|
||||
#endif
|
||||
@@ -44,11 +45,11 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
// These are usually set in the setup.py.
|
||||
#ifndef BSHUF_VERSION_MAJOR
|
||||
#define BSHUF_VERSION_MAJOR 0
|
||||
#define BSHUF_VERSION_MINOR 3
|
||||
#define BSHUF_VERSION_POINT 4
|
||||
#define BSHUF_VERSION_MINOR 4
|
||||
#define BSHUF_VERSION_POINT 0
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -67,6 +68,18 @@ extern "C" {
|
||||
int bshuf_using_SSE2(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_NEON ----
|
||||
*
|
||||
* Whether routines where compiled with the NEON instruction set.
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* 1 if using NEON, 0 otherwise.
|
||||
*
|
||||
*/
|
||||
int bshuf_using_NEON(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_AVX2 ----
|
||||
*
|
||||
* Whether routines where compiled with the AVX2 instruction set.
|
||||
@@ -79,6 +92,18 @@ int bshuf_using_SSE2(void);
|
||||
int bshuf_using_AVX2(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_AVX512 ----
|
||||
*
|
||||
* Whether routines where compiled with the AVX512 instruction set.
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* 1 if using AVX512, 0 otherwise.
|
||||
*
|
||||
*/
|
||||
int bshuf_using_AVX512(void);
|
||||
|
||||
|
||||
/* ---- bshuf_default_block_size ----
|
||||
*
|
||||
* The default block size as function of element size.
|
||||
@@ -151,7 +176,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BITSHUFFLE_CORE_H
|
||||
|
||||
@@ -13,19 +13,31 @@
|
||||
#ifndef BITSHUFFLE_INTERNALS_H
|
||||
#define BITSHUFFLE_INTERNALS_H
|
||||
|
||||
// We assume GNU g++ defining `__cplusplus` has stdint.h
|
||||
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
|
||||
#include <stdint.h>
|
||||
#else
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef long long int64_t;
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "iochain.h"
|
||||
|
||||
|
||||
// Constants.
|
||||
#ifndef BSHUF_MIN_RECOMMEND_BLOCK
|
||||
#define BSHUF_MIN_RECOMMEND_BLOCK 128
|
||||
#define BSHUF_BLOCKED_MULT 8
|
||||
#define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this.
|
||||
#define BSHUF_TARGET_BLOCK_SIZE_B 8192
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// Macros.
|
||||
#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
|
||||
|
||||
|
||||
@@ -49,15 +61,15 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
|
||||
/* Function definition for worker functions that process a single block. */
|
||||
typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
|
||||
const size_t size, const size_t elem_size);
|
||||
const size_t size, const size_t elem_size, const int option);
|
||||
|
||||
/* Wrap a function for processing a single block to process an entire buffer in
|
||||
* parallel. */
|
||||
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
const size_t size, const size_t elem_size, size_t block_size);
|
||||
const size_t size, const size_t elem_size, size_t block_size, const int option);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // BITSHUFFLE_INTERNALS_H
|
||||
|
||||
@@ -0,0 +1,260 @@
|
||||
/*
|
||||
* Bitshuffle HDF5 filter
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
* Website: http://www.github.com/kiyo-masui/bitshuffle
|
||||
* Created: 2014
|
||||
*
|
||||
* See LICENSE file for details about copyright and rights to use.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "bitshuffle.h"
|
||||
#include "bshuf_h5filter.h"
|
||||
|
||||
|
||||
#define PUSH_ERR(func, minor, str) \
|
||||
H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
|
||||
|
||||
|
||||
// Prototypes from bitshuffle.c
|
||||
void bshuf_write_uint64_BE(void* buf, uint64_t num);
|
||||
uint64_t bshuf_read_uint64_BE(void* buf);
|
||||
void bshuf_write_uint32_BE(void* buf, uint32_t num);
|
||||
uint32_t bshuf_read_uint32_BE(const void* buf);
|
||||
|
||||
|
||||
// Only called on compression, not on reverse.
|
||||
herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
|
||||
|
||||
herr_t r;
|
||||
size_t ii;
|
||||
|
||||
unsigned int elem_size;
|
||||
|
||||
unsigned int flags;
|
||||
size_t nelements = 8;
|
||||
size_t nelem_max = 11;
|
||||
unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
|
||||
unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
|
||||
char msg[80];
|
||||
|
||||
r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
|
||||
tmp_values, 0, NULL, NULL);
|
||||
if(r<0) return -1;
|
||||
|
||||
// First 3 slots reserved. Move any passed options to higher addresses.
|
||||
for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
|
||||
values[ii + 3] = tmp_values[ii];
|
||||
}
|
||||
|
||||
nelements = 3 + nelements;
|
||||
|
||||
values[0] = BSHUF_VERSION_MAJOR;
|
||||
values[1] = BSHUF_VERSION_MINOR;
|
||||
|
||||
elem_size = H5Tget_size(type);
|
||||
if(elem_size <= 0) {
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
|
||||
"Invalid element size.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
values[2] = elem_size;
|
||||
|
||||
// Validate user supplied arguments.
|
||||
if (nelements > 3) {
|
||||
if (values[3] % 8 || values[3] < 0) {
|
||||
sprintf(msg, "Error in bitshuffle. Invalid block size: %d.",
|
||||
values[3]);
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (nelements > 4) {
|
||||
switch (values[4]) {
|
||||
case 0:
|
||||
break;
|
||||
case BSHUF_H5_COMPRESS_LZ4:
|
||||
break;
|
||||
#ifdef ZSTD_SUPPORT
|
||||
case BSHUF_H5_COMPRESS_ZSTD:
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
|
||||
"Invalid bitshuffle compression.");
|
||||
}
|
||||
}
|
||||
|
||||
r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
|
||||
if(r<0) return -1;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
|
||||
const unsigned int cd_values[], size_t nbytes,
|
||||
size_t *buf_size, void **buf) {
|
||||
|
||||
size_t size, elem_size;
|
||||
int err = -1;
|
||||
char msg[80];
|
||||
size_t block_size = 0;
|
||||
size_t buf_size_out, nbytes_uncomp, nbytes_out;
|
||||
char* in_buf = *buf;
|
||||
void *out_buf;
|
||||
|
||||
if (cd_nelmts < 3) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Not enough parameters.");
|
||||
return 0;
|
||||
}
|
||||
elem_size = cd_values[2];
|
||||
#ifdef ZSTD_SUPPORT
|
||||
const int comp_lvl = cd_values[5];
|
||||
#endif
|
||||
|
||||
// User specified block size.
|
||||
if (cd_nelmts > 3) block_size = cd_values[3];
|
||||
|
||||
if (block_size == 0) block_size = bshuf_default_block_size(elem_size);
|
||||
|
||||
#ifndef ZSTD_SUPPORT
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"ZSTD compression filter chosen but ZSTD support not installed.");
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Compression in addition to bitshuffle.
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// First eight bytes is the number of bytes in the output buffer,
|
||||
// little endian.
|
||||
nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
|
||||
// Override the block size with the one read from the header.
|
||||
block_size = bshuf_read_uint32_BE((const char*) in_buf + 8) / elem_size;
|
||||
// Skip over the header.
|
||||
in_buf += 12;
|
||||
buf_size_out = nbytes_uncomp;
|
||||
} else {
|
||||
nbytes_uncomp = nbytes;
|
||||
// Pick which compressions library to use
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size,
|
||||
elem_size, block_size) + 12;
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
buf_size_out = bshuf_compress_zstd_bound(nbytes_uncomp / elem_size,
|
||||
elem_size, block_size) + 12;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
nbytes_uncomp = nbytes;
|
||||
buf_size_out = nbytes;
|
||||
}
|
||||
|
||||
// TODO, remove this restriction by memcopying the extra.
|
||||
if (nbytes_uncomp % elem_size) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Non integer number of elements.");
|
||||
return 0;
|
||||
}
|
||||
size = nbytes_uncomp / elem_size;
|
||||
|
||||
out_buf = malloc(buf_size_out);
|
||||
if (out_buf == NULL) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Could not allocate output buffer.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// Bit unshuffle/decompress.
|
||||
// Pick which compressions library to use
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size, block_size);
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
err = bshuf_decompress_zstd(in_buf, out_buf, size, elem_size, block_size);
|
||||
}
|
||||
#endif
|
||||
nbytes_out = nbytes_uncomp;
|
||||
} else {
|
||||
// Bit shuffle/compress.
|
||||
// Write the header, described in
|
||||
// http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
|
||||
// Technically we should be using signed integers instead of
|
||||
// unsigned ones, however for valid inputs (positive numbers) these
|
||||
// have the same representation.
|
||||
bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
|
||||
bshuf_write_uint32_BE((char*) out_buf + 8, block_size * elem_size);
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
|
||||
elem_size, block_size);
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
err = bshuf_compress_zstd(in_buf, (char*) out_buf + 12, size,
|
||||
elem_size, block_size, comp_lvl);
|
||||
}
|
||||
#endif
|
||||
nbytes_out = err + 12;
|
||||
}
|
||||
} else {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// Bit unshuffle.
|
||||
err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
|
||||
block_size); } else {
|
||||
// Bit shuffle.
|
||||
err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
|
||||
block_size); } nbytes_out = nbytes; }
|
||||
//printf("nb_in %d, nb_uncomp %d, nb_out %d, buf_out %d, block %d\n",
|
||||
//nbytes, nbytes_uncomp, nbytes_out, buf_size_out, block_size);
|
||||
|
||||
if (err < 0) {
|
||||
sprintf(msg, "Error in bitshuffle with error code %d.", err);
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
|
||||
free(out_buf);
|
||||
return 0;
|
||||
} else {
|
||||
free(*buf);
|
||||
*buf = out_buf;
|
||||
*buf_size = buf_size_out;
|
||||
|
||||
return nbytes_out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
H5Z_class_t bshuf_H5Filter[1] = {{
|
||||
H5Z_CLASS_T_VERS,
|
||||
(H5Z_filter_t)(BSHUF_H5FILTER),
|
||||
1, 1,
|
||||
"bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
|
||||
NULL,
|
||||
(H5Z_set_local_func_t)(bshuf_h5_set_local),
|
||||
(H5Z_func_t)(bshuf_h5_filter)
|
||||
}};
|
||||
|
||||
|
||||
int bshuf_register_h5filter(void){
|
||||
|
||||
int retval;
|
||||
|
||||
retval = H5Zregister(bshuf_H5Filter);
|
||||
if(retval<0){
|
||||
PUSH_ERR("bshuf_register_h5filter",
|
||||
H5E_CANTREGISTER, "Can't register bitshuffle filter");
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Bitshuffle HDF5 filter
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
* Website: http://www.github.com/kiyo-masui/bitshuffle
|
||||
* Created: 2014
|
||||
*
|
||||
* See LICENSE file for details about copyright and rights to use.
|
||||
*
|
||||
*
|
||||
* Header File
|
||||
*
|
||||
* Filter Options
|
||||
* --------------
|
||||
* block_size (option slot 0) : integer (optional)
|
||||
* What block size to use (in elements not bytes). Default is 0,
|
||||
* for which bitshuffle will pick a block size with a target of 8kb.
|
||||
* Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
|
||||
* Whether to apply LZ4 compression to the data after bitshuffling.
|
||||
* This is much faster than applying compression as a second filter
|
||||
* because it is done when the small block of data is already in the
|
||||
* L1 cache.
|
||||
*
|
||||
* For LZ4 compression, the compressed format of the data is the same as
|
||||
* for the normal LZ4 filter described in
|
||||
* http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BSHUF_H5FILTER_H
|
||||
#define BSHUF_H5FILTER_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define H5Z_class_t_vers 2
|
||||
#include "hdf5.h"
|
||||
|
||||
|
||||
#define BSHUF_H5FILTER 32008
|
||||
|
||||
|
||||
#define BSHUF_H5_COMPRESS_LZ4 2
|
||||
#define BSHUF_H5_COMPRESS_ZSTD 3
|
||||
|
||||
|
||||
extern H5Z_class_t bshuf_H5Filter[1];
|
||||
|
||||
|
||||
/* ---- bshuf_register_h5filter ----
|
||||
*
|
||||
* Register the bitshuffle HDF5 filter within the HDF5 library.
|
||||
*
|
||||
* Call this before using the bitshuffle HDF5 filter from C unless
|
||||
* using dynamically loaded filters.
|
||||
*
|
||||
*/
|
||||
int bshuf_register_h5filter(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // BSHUF_H5FILTER_H
|
||||
+4
-4
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* IOchain - Distribute a chain of dependant IO events amoung threads.
|
||||
* IOchain - Distribute a chain of dependent IO events among threads.
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
@@ -81,9 +81,9 @@ void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
|
||||
C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
|
||||
#ifdef _OPENMP
|
||||
omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
|
||||
|
||||
|
||||
|
||||
// *in_pl[this_iter]* lock released at the end of the iteration to avoid being
|
||||
// overtaken by previous threads and having *out_pl[this_iter]* corrupted.
|
||||
// Especially worried about thread 0, iteration 0.
|
||||
omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
|
||||
#endif
|
||||
}
|
||||
|
||||
+3
-3
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* IOchain - Distribute a chain of dependant IO events amoung threads.
|
||||
* IOchain - Distribute a chain of dependent IO events among threads.
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
@@ -25,7 +25,7 @@
|
||||
* Usage
|
||||
* -----
|
||||
* - Call `ioc_init` in serial block.
|
||||
* - Each thread should create a local variable *size_t this_iter* and
|
||||
* - Each thread should create a local variable *size_t this_iter* and
|
||||
* pass its address to all function calls. Its value will be set
|
||||
* inside the functions and is used to identify the thread.
|
||||
* - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
|
||||
@@ -90,5 +90,5 @@ void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
|
||||
void * ioc_get_out(ioc_chain *C, size_t *this_iter);
|
||||
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
|
||||
|
||||
#endif
|
||||
#endif // IOCHAIN_H
|
||||
|
||||
|
||||
+1921
-941
File diff suppressed because it is too large
Load Diff
+671
-257
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user