diff --git a/Cargo.toml b/Cargo.toml index ef35ea0..34bc19e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ members = ["retrieval", "httpret", "err", "disk"] [profile.release] -opt-level = 0 -overflow-checks = true -debug = 2 -debug-assertions = true +#opt-level = 0 +#overflow-checks = true +#debug = 2 +#debug-assertions = true diff --git a/bitshuffle/Cargo.toml b/bitshuffle/Cargo.toml new file mode 100644 index 0000000..b359472 --- /dev/null +++ b/bitshuffle/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "bitshuffle" +version = "0.0.1-a.0" +authors = ["Dominik Werder "] +edition = "2018" + +[dependencies] +libc = "0.2.92" + +[build-dependencies] +cc = "1.0.67" diff --git a/bitshuffle/build.rs b/bitshuffle/build.rs new file mode 100644 index 0000000..46a164c --- /dev/null +++ b/bitshuffle/build.rs @@ -0,0 +1,9 @@ +fn main() { + cc::Build::new() + .file("src/bitshuffle.c") + .file("src/bitshuffle_core.c") + .file("src/iochain.c") + .file("src/lz4.c") + .include("src") + .compile("bitshufbundled"); +} diff --git a/bitshuffle/src/bitshuffle.c b/bitshuffle/src/bitshuffle.c new file mode 100644 index 0000000..30037be --- /dev/null +++ b/bitshuffle/src/bitshuffle.c @@ -0,0 +1,164 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + */ + +#include "bitshuffle.h" +#include "bitshuffle_core.h" +#include "bitshuffle_internals.h" +#include "lz4.h" + +#include +#include + + +// Constants. +// Use fast decompression instead of safe decompression for LZ4. +#define BSHUF_LZ4_DECOMPRESS_FAST + + +// Macros. +#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \ + free(buf); return count - 1000; } + + +/* Bitshuffle and compress a single block. */ +int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \ + const size_t size, const size_t elem_size) { + + int64_t nbytes, count; + void *tmp_buf_bshuf; + void *tmp_buf_lz4; + size_t this_iter; + const void *in; + void *out; + + tmp_buf_bshuf = malloc(size * elem_size); + if (tmp_buf_bshuf == NULL) return -1; + + tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size)); + if (tmp_buf_lz4 == NULL){ + free(tmp_buf_bshuf); + return -1; + } + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size)); + + count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size); + if (count < 0) { + free(tmp_buf_lz4); + free(tmp_buf_bshuf); + return count; + } + nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size); + free(tmp_buf_bshuf); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4); + + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4)); + + bshuf_write_uint32_BE(out, nbytes); + memcpy((char *) out + 4, tmp_buf_lz4, nbytes); + + free(tmp_buf_lz4); + + return nbytes + 4; +} + + +/* Decompress and bitunshuffle a single block. 
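+ * Reads the 4-byte big-endian length header written by bshuf_compress_lz4_block,
+ * LZ4-decompresses the payload into a scratch buffer, then bit-untransposes it into
+ * the output obtained from the I/O chain.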
*/ +int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr, + const size_t size, const size_t elem_size) { + + int64_t nbytes, count; + void *out, *tmp_buf; + const void *in; + size_t this_iter; + int32_t nbytes_from_header; + + in = ioc_get_in(C_ptr, &this_iter); + nbytes_from_header = bshuf_read_uint32_BE(in); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + nbytes_from_header + 4)); + + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + +#ifdef BSHUF_LZ4_DECOMPRESS_FAST + nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf); + if (nbytes != nbytes_from_header) { + free(tmp_buf); + return -91; + } +#else + nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header, + size * elem_size); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf); + if (nbytes != size * elem_size) { + free(tmp_buf); + return -91; + } + nbytes = nbytes_from_header; +#endif + count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + nbytes += 4; + + free(tmp_buf); + return nbytes; +} + + +/* ---- Public functions ---- + * + * See header file for description and usage. + * + */ + +size_t bshuf_compress_lz4_bound(const size_t size, + const size_t elem_size, size_t block_size) { + + size_t bound, leftover; + + if (block_size == 0) { + block_size = bshuf_default_block_size(elem_size); + } + if (block_size % BSHUF_BLOCKED_MULT) return -81; + + // Note that each block gets a 4 byte header. + // Size of full blocks. + bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size); + // Size of partial blocks, if any. + leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT; + if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4; + // Size of uncompressed data not fitting into any blocks. + bound += (size % BSHUF_BLOCKED_MULT) * elem_size; + return bound; +} + + +int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size, + elem_size, block_size); +} + + +int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size, + elem_size, block_size); +} diff --git a/bitshuffle/src/bitshuffle.h b/bitshuffle/src/bitshuffle.h new file mode 100644 index 0000000..3df95f4 --- /dev/null +++ b/bitshuffle/src/bitshuffle.h @@ -0,0 +1,123 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Worker routines return an int64_t which is the number of bytes processed + * if positive or an error code if negative. + * + * Error codes: + * -1 : Failed to allocate memory. + * -11 : Missing SSE. + * -12 : Missing AVX. + * -80 : Input size not a multiple of 8. + * -81 : block_size not multiple of 8. + * -91 : Decompression error, wrong number of bytes processed. + * -1YYY : Error internal to compression routine with error code -YYY. 
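+ * -13 : Missing Arm Neon.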
+ */ + + +#ifndef BITSHUFFLE_H +#define BITSHUFFLE_H + +#include +#include "bitshuffle_core.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---- bshuf_compress_lz4_bound ---- + * + * Bound on size of data compressed with *bshuf_compress_lz4*. + * + * Parameters + * ---------- + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * Bound on compressed data size. + * + */ +size_t bshuf_compress_lz4_bound(const size_t size, + const size_t elem_size, size_t block_size); + + +/* ---- bshuf_compress_lz4 ---- + * + * Bitshuffled and compress the data using LZ4. + * + * Transpose within elements, in blocks of data of *block_size* elements then + * compress the blocks using LZ4. In the output buffer, each block is prefixed + * by a 4 byte integer giving the compressed size of that block. + * + * Output buffer must be large enough to hold the compressed data. This could + * be in principle substantially larger than the input buffer. Use the routine + * *bshuf_compress_lz4_bound* to get an upper limit. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be large enough to hold data. + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes used in output buffer, negative error-code if failed. + * + */ +int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t + elem_size, size_t block_size); + + +/* ---- bshuf_decompress_lz4 ---- + * + * Undo compression and bitshuffling. + * + * Decompress data then un-bitshuffle it in blocks of *block_size* elements. + * + * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* + * must patch the parameters used to compress the data. + * + * NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function + * LZ4_decompress_fast from LZ4, which does not protect against maliciously + * formed datasets. By modifying the compressed data, this function could be + * coerced into leaving the boundaries of the input buffer. + * + * Parameters + * ---------- + * in : input buffer + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes consumed in *input* buffer, negative error-code if failed. + * + */ +int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // BITSHUFFLE_H diff --git a/bitshuffle/src/bitshuffle_core.c b/bitshuffle/src/bitshuffle_core.c new file mode 100644 index 0000000..d7dba08 --- /dev/null +++ b/bitshuffle/src/bitshuffle_core.c @@ -0,0 +1,1862 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. 
+ * + */ + +#include "bitshuffle_core.h" +#include "bitshuffle_internals.h" + +#include +#include + + +#if defined(__AVX2__) && defined (__SSE2__) +#define USEAVX2 +#endif + +#if defined(__SSE2__) +#define USESSE2 +#endif + +#if defined(__ARM_NEON__) || (__ARM_NEON) +#define USEARMNEON +#endif + +// Conditional includes for SSE2 and AVX2. +#ifdef USEAVX2 +#include +#elif defined USESSE2 +#include +#elif defined USEARMNEON +#include +#endif + +#if defined(_OPENMP) && defined(_MSC_VER) +typedef int64_t omp_size_t; +#else +typedef size_t omp_size_t; +#endif + +// Macros. +#define CHECK_MULT_EIGHT(n) if (n % 8) return -80; +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + + +/* ---- Functions indicating compile time instruction set. ---- */ + +int bshuf_using_NEON(void) { +#ifdef USEARMNEON + return 1; +#else + return 0; +#endif +} + + +int bshuf_using_SSE2(void) { +#ifdef USESSE2 + return 1; +#else + return 0; +#endif +} + + +int bshuf_using_AVX2(void) { +#ifdef USEAVX2 + return 1; +#else + return 0; +#endif +} + + +/* ---- Worker code not requiring special instruction sets. ---- + * + * The following code does not use any x86 specific vectorized instructions + * and should compile on any machine + * + */ + +/* Transpose 8x8 bit array packed into a single quadword *x*. + * *t* is workspace. */ +#define TRANS_BIT_8X8(x, t) { \ + t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL; \ + x = x ^ t ^ (t << 7); \ + t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL; \ + x = x ^ t ^ (t << 14); \ + t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL; \ + x = x ^ t ^ (t << 28); \ + } + +/* Transpose 8x8 bit array along the diagonal from upper right + to lower left */ +#define TRANS_BIT_8X8_BE(x, t) { \ + t = (x ^ (x >> 9)) & 0x0055005500550055LL; \ + x = x ^ t ^ (t << 9); \ + t = (x ^ (x >> 18)) & 0x0000333300003333LL; \ + x = x ^ t ^ (t << 18); \ + t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL; \ + x = x ^ t ^ (t << 36); \ + } + +/* Transpose of an array of arbitrarily typed elements. */ +#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) { \ + size_t ii, jj, kk; \ + const type_t* in_type = (const type_t*) in; \ + type_t* out_type = (type_t*) out; \ + for(ii = 0; ii + 7 < lda; ii += 8) { \ + for(jj = 0; jj < ldb; jj++) { \ + for(kk = 0; kk < 8; kk++) { \ + out_type[jj*lda + ii + kk] = \ + in_type[ii*ldb + kk * ldb + jj]; \ + } \ + } \ + } \ + for(ii = lda - lda % 8; ii < lda; ii ++) { \ + for(jj = 0; jj < ldb; jj++) { \ + out_type[jj*lda + ii] = in_type[ii*ldb + jj]; \ + } \ + } \ + } + + +/* Memory copy with bshuf call signature. For testing and profiling. */ +int64_t bshuf_copy(const void* in, void* out, const size_t size, + const size_t elem_size) { + + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + memcpy(out_b, in_b, size * elem_size); + return size * elem_size; +} + + +/* Transpose bytes within elements, starting partway through input. */ +int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size, + const size_t elem_size, const size_t start) { + + size_t ii, jj, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(start); + + if (size > start) { + // ii loop separated into 2 loops so the compiler can unroll + // the inner one. 
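+ // Layout: out[jj*size + ii] receives byte jj of element ii, so all bytes at a
+ // given position within the elements end up contiguous.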
+ for (ii = start; ii + 7 < size; ii += 8) { + for (jj = 0; jj < elem_size; jj++) { + for (kk = 0; kk < 8; kk++) { + out_b[jj * size + ii + kk] + = in_b[ii * elem_size + kk * elem_size + jj]; + } + } + } + for (ii = size - size % 8; ii < size; ii ++) { + for (jj = 0; jj < elem_size; jj++) { + out_b[jj * size + ii] = in_b[ii * elem_size + jj]; + } + } + } + return size * elem_size; +} + + +/* Transpose bytes within elements. */ +int64_t bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0); +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size, + const size_t elem_size, const size_t start_byte) { + + const uint64_t* in_b = (const uint64_t*) in; + uint8_t* out_b = (uint8_t*) out; + + uint64_t x, t; + + size_t ii, kk; + size_t nbyte = elem_size * size; + size_t nbyte_bitrow = nbyte / 8; + + uint64_t e=1; + const int little_endian = *(uint8_t *) &e == 1; + const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow; + const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow; + + CHECK_MULT_EIGHT(nbyte); + CHECK_MULT_EIGHT(start_byte); + + for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) { + x = in_b[ii]; + if (little_endian) { + TRANS_BIT_8X8(x, t); + } else { + TRANS_BIT_8X8_BE(x, t); + } + for (kk = 0; kk < 8; kk ++) { + out_b[bit_row_offset + kk * bit_row_skip + ii] = x; + x = x >> 8; + } + } + return size * elem_size; +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0); +} + + +/* General transpose of an array, optimized for large element sizes. */ +int64_t bshuf_trans_elem(const void* in, void* out, const size_t lda, + const size_t ldb, const size_t elem_size) { + + size_t ii, jj; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + for(ii = 0; ii < lda; ii++) { + for(jj = 0; jj < ldb; jj++) { + memcpy(&out_b[(jj*lda + ii) * elem_size], + &in_b[(ii*ldb + jj) * elem_size], elem_size); + } + } + return lda * ldb * elem_size; +} + + +/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */ +int64_t bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t nbyte_bitrow = size / 8; + + CHECK_MULT_EIGHT(size); + + return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow); +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + void *tmp_buf; + + CHECK_MULT_EIGHT(size); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. 
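+ * Scalar version: out[ii*8*elem_size + jj*8 + kk] = in[(jj*8 + kk)*nbyte_row + ii],
+ * gathering one byte from every bit-row for each input column ii.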
*/ +int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + size_t ii, jj, kk, nbyte_row; + const char *in_b; + char *out_b; + + + in_b = (const char*) in; + out_b = (char*) out; + + nbyte_row = size / 8; + + CHECK_MULT_EIGHT(size); + + for (jj = 0; jj < elem_size; jj++) { + for (ii = 0; ii < nbyte_row; ii++) { + for (kk = 0; kk < 8; kk++) { + out_b[ii * 8 * elem_size + jj * 8 + kk] = \ + in_b[(jj * 8 + kk) * nbyte_row + ii]; + } + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_scal(const void* in, void* out, \ + const size_t size, const size_t elem_size) { + + const char *in_b; + char *out_b; + uint64_t x, t; + size_t ii, jj, kk; + size_t nbyte, out_index; + + uint64_t e=1; + const int little_endian = *(uint8_t *) &e == 1; + const size_t elem_skip = little_endian ? elem_size : -elem_size; + const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size; + + CHECK_MULT_EIGHT(size); + + in_b = (const char*) in; + out_b = (char*) out; + + nbyte = elem_size * size; + + for (jj = 0; jj < 8 * elem_size; jj += 8) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { + x = *((uint64_t*) &in_b[ii + jj]); + if (little_endian) { + TRANS_BIT_8X8(x, t); + } else { + TRANS_BIT_8X8_BE(x, t); + } + for (kk = 0; kk < 8; kk++) { + out_index = ii + jj / 8 + elem_offset + kk * elem_skip; + *((uint8_t*) &out_b[out_index]) = x; + x = x >> 8; + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + void *tmp_buf; + + CHECK_MULT_EIGHT(size); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* ---- Worker code that uses Arm NEON ---- + * + * The following code makes use of the Arm NEON instruction set. + * NEON technology is the implementation of the ARM Advanced Single + * Instruction Multiple Data (SIMD) extension. + * The NEON unit is the component of the processor that executes SIMD instructions. + * It is also called the NEON Media Processing Engine (MPE). + * + */ + +#ifdef USEARMNEON + +/* Transpose bytes within elements for 16 bit elements. */ +int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b = (const char*) in; + char *out_b = (char*) out; + int8x16_t a0, b0, a1, b1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = vld1q_s8(in_b + 2*ii + 0*16); + b0 = vld1q_s8(in_b + 2*ii + 1*16); + + a1 = vzip1q_s8(a0, b0); + b1 = vzip2q_s8(a0, b0); + + a0 = vzip1q_s8(a1, b1); + b0 = vzip2q_s8(a1, b1); + + a1 = vzip1q_s8(a0, b0); + b1 = vzip2q_s8(a0, b0); + + a0 = vzip1q_s8(a1, b1); + b0 = vzip2q_s8(a1, b1); + + vst1q_s8(out_b + 0*size + ii, a0); + vst1q_s8(out_b + 1*size + ii, b0); + } + + return bshuf_trans_byte_elem_remainder(in, out, size, 2, + size - size % 16); +} + + +/* Transpose bytes within elements for 32 bit elements. 
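+ * Processes 16 elements (64 bytes) per iteration with NEON zip instructions; the
+ * tail is finished by bshuf_trans_byte_elem_remainder.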
*/ +int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b; + char *out_b; + in_b = (const char*) in; + out_b = (char*) out; + int8x16_t a0, b0, c0, d0, a1, b1, c1, d1; + int64x2_t a2, b2, c2, d2; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = vld1q_s8(in_b + 4*ii + 0*16); + b0 = vld1q_s8(in_b + 4*ii + 1*16); + c0 = vld1q_s8(in_b + 4*ii + 2*16); + d0 = vld1q_s8(in_b + 4*ii + 3*16); + + a1 = vzip1q_s8(a0, b0); + b1 = vzip2q_s8(a0, b0); + c1 = vzip1q_s8(c0, d0); + d1 = vzip2q_s8(c0, d0); + + a0 = vzip1q_s8(a1, b1); + b0 = vzip2q_s8(a1, b1); + c0 = vzip1q_s8(c1, d1); + d0 = vzip2q_s8(c1, d1); + + a1 = vzip1q_s8(a0, b0); + b1 = vzip2q_s8(a0, b0); + c1 = vzip1q_s8(c0, d0); + d1 = vzip2q_s8(c0, d0); + + a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1)); + b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1)); + c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1)); + d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1)); + + vst1q_s64((int64_t *) (out_b + 0*size + ii), a2); + vst1q_s64((int64_t *) (out_b + 1*size + ii), b2); + vst1q_s64((int64_t *) (out_b + 2*size + ii), c2); + vst1q_s64((int64_t *) (out_b + 3*size + ii), d2); + } + + return bshuf_trans_byte_elem_remainder(in, out, size, 4, + size - size % 16); +} + + +/* Transpose bytes within elements for 64 bit elements. */ +int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) { + + size_t ii; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + int8x16_t a0, b0, c0, d0, e0, f0, g0, h0; + int8x16_t a1, b1, c1, d1, e1, f1, g1, h1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = vld1q_s8(in_b + 8*ii + 0*16); + b0 = vld1q_s8(in_b + 8*ii + 1*16); + c0 = vld1q_s8(in_b + 8*ii + 2*16); + d0 = vld1q_s8(in_b + 8*ii + 3*16); + e0 = vld1q_s8(in_b + 8*ii + 4*16); + f0 = vld1q_s8(in_b + 8*ii + 5*16); + g0 = vld1q_s8(in_b + 8*ii + 6*16); + h0 = vld1q_s8(in_b + 8*ii + 7*16); + + a1 = vzip1q_s8 (a0, b0); + b1 = vzip2q_s8 (a0, b0); + c1 = vzip1q_s8 (c0, d0); + d1 = vzip2q_s8 (c0, d0); + e1 = vzip1q_s8 (e0, f0); + f1 = vzip2q_s8 (e0, f0); + g1 = vzip1q_s8 (g0, h0); + h1 = vzip2q_s8 (g0, h0); + + a0 = vzip1q_s8 (a1, b1); + b0 = vzip2q_s8 (a1, b1); + c0 = vzip1q_s8 (c1, d1); + d0 = vzip2q_s8 (c1, d1); + e0 = vzip1q_s8 (e1, f1); + f0 = vzip2q_s8 (e1, f1); + g0 = vzip1q_s8 (g1, h1); + h0 = vzip2q_s8 (g1, h1); + + a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0)); + b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0)); + c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0)); + d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0)); + e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0)); + f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0)); + g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0)); + h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0)); + + a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1)); + b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1)); + c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1)); + d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1)); + e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), 
vreinterpretq_s64_s8 (g1)); + f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1)); + g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1)); + h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1)); + + vst1q_s8(out_b + 0*size + ii, a0); + vst1q_s8(out_b + 1*size + ii, b0); + vst1q_s8(out_b + 2*size + ii, c0); + vst1q_s8(out_b + 3*size + ii, d0); + vst1q_s8(out_b + 4*size + ii, e0); + vst1q_s8(out_b + 5*size + ii, f0); + vst1q_s8(out_b + 6*size + ii, g0); + vst1q_s8(out_b + 7*size + ii, h0); + } + + return bshuf_trans_byte_elem_remainder(in, out, size, 8, + size - size % 16); +} + + +/* Transpose bytes within elements using best NEON algorithm available. */ +int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + // Trivial cases: power of 2 bytes. + switch (elem_size) { + case 1: + count = bshuf_copy(in, out, size, elem_size); + return count; + case 2: + count = bshuf_trans_byte_elem_NEON_16(in, out, size); + return count; + case 4: + count = bshuf_trans_byte_elem_NEON_32(in, out, size); + return count; + case 8: + count = bshuf_trans_byte_elem_NEON_64(in, out, size); + return count; + } + + // Worst case: odd number of bytes. Turns out that this is faster for + // (odd * 2) byte elements as well (hence % 4). + if (elem_size % 4) { + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + return count; + } + + // Multiple of power of 2: transpose hierarchically. + { + size_t nchunk_elem; + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + if ((elem_size % 8) == 0) { + nchunk_elem = elem_size / 8; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); + count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); + } else if ((elem_size % 4) == 0) { + nchunk_elem = elem_size / 4; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); + count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); + } else { + // Not used since scalar algorithm is faster. + nchunk_elem = elem_size / 2; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); + count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); + } + + free(tmp_buf); + return count; + } +} + + +/* Creates a mask made up of the most significant + * bit of each byte of 'input' + */ +int32_t move_byte_mask_neon(uint8x16_t input) { + + return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3) + | (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7) + | (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11) + | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15) + ); +} + +/* Transpose bits within bytes. 
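+ * NEON version: move_byte_mask_neon gathers the most significant bit of each of 16
+ * bytes into a 16-bit mask, and shifting the vector left by one exposes the next
+ * bit plane on each of the eight passes.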
*/ +int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + uint16_t* out_ui16; + + int64_t count; + + size_t nbyte = elem_size * size; + + CHECK_MULT_EIGHT(nbyte); + + int16x8_t xmm; + int32_t bt; + + for (ii = 0; ii + 15 < nbyte; ii += 16) { + xmm = vld1q_s16((int16_t *) (in_b + ii)); + for (kk = 0; kk < 8; kk++) { + bt = move_byte_mask_neon((uint8x16_t) xmm); + xmm = vshlq_n_s16(xmm, 1); + out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_ui16 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 16); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. */ +int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, jj; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(size); + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + + int8x16_t a0, b0, c0, d0, e0, f0, g0, h0; + int8x16_t a1, b1, c1, d1, e1, f1, g1, h1; + int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; + + for (ii = 0; ii + 7 < nrows; ii += 8) { + for (jj = 0; jj + 15 < nbyte_row; jj += 16) { + a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj); + b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj); + c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj); + d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj); + e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj); + f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj); + g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj); + h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj); + + a1 = vzip1q_s8(a0, b0); + b1 = vzip1q_s8(c0, d0); + c1 = vzip1q_s8(e0, f0); + d1 = vzip1q_s8(g0, h0); + e1 = vzip2q_s8(a0, b0); + f1 = vzip2q_s8(c0, d0); + g1 = vzip2q_s8(e0, f0); + h1 = vzip2q_s8(g0, h0); + + a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1)); + b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1)); + c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1)); + d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1)); + e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1)); + f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1)); + g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1)); + h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1)); + + a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0)); + b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0)); + c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0)); + d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 
(c0), vreinterpretq_s32_s8 (d0)); + e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0)); + f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0)); + g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0)); + h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0)); + + as = (int64x1_t *) &a1; + bs = (int64x1_t *) &b1; + cs = (int64x1_t *) &c1; + ds = (int64x1_t *) &d1; + es = (int64x1_t *) &e1; + fs = (int64x1_t *) &f1; + gs = (int64x1_t *) &g1; + hs = (int64x1_t *) &h1; + + vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as); + vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1)); + vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs); + vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1)); + vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs); + vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1)); + vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds); + vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1)); + vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es); + vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1)); + vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs); + vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1)); + vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs); + vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1)); + vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs); + vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1)); + } + for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; + out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; + out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; + out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; + out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; + out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; + out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; + out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + // With a bit of care, this could be written such that such that it is + // in_buf = out_buf safe. + const char* in_b = (const char*) in; + uint16_t* out_ui16 = (uint16_t*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + int16x8_t xmm; + int32_t bt; + + if (elem_size % 2) { + bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); + } else { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { + xmm = vld1q_s16((int16_t *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = move_byte_mask_neon((uint8x16_t) xmm); + xmm = vshlq_n_s16(xmm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + out_ui16[ind / 2] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. 
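+ * Inverse of bshuf_trans_bit_elem_NEON: transpose the bytes out of bit-row order,
+ * then shuffle the bits back within each block of eight elements.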
*/ +int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + +#else // #ifdef USEARMNEON + +int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) { + return -13; +} + + +int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) { + return -13; +} + + +int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) { + return -13; +} + + +int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -13; +} + + +#endif + + + + + +/* ---- Worker code that uses SSE2 ---- + * + * The following code makes use of the SSE2 instruction set and specialized + * 16 byte registers. The SSE2 instructions are present on modern x86 + * processors. The first Intel processor microarchitecture supporting SSE2 was + * Pentium 4 (2000). + * + */ + +#ifdef USESSE2 + +/* Transpose bytes within elements for 16 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b = (const char*) in; + char *out_b = (char*) out; + __m128i a0, b0, a1, b1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 2, + size - size % 16); +} + + +/* Transpose bytes within elements for 32 bit elements. 
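+ * Loads four 16-byte vectors (16 elements) per iteration and interleaves them with
+ * unpack instructions; leftover elements go through the scalar remainder routine.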
*/ +int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b; + char *out_b; + in_b = (const char*) in; + out_b = (char*) out; + __m128i a0, b0, c0, d0, a1, b1, c1, d1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi64(a1, c1); + b0 = _mm_unpackhi_epi64(a1, c1); + c0 = _mm_unpacklo_epi64(b1, d1); + d0 = _mm_unpackhi_epi64(b1, d1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 4, + size - size % 16); +} + + +/* Transpose bytes within elements for 64 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { + + size_t ii; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); + e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); + f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); + g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); + h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + e1 = _mm_unpacklo_epi8(e0, f0); + f1 = _mm_unpackhi_epi8(e0, f0); + g1 = _mm_unpacklo_epi8(g0, h0); + h1 = _mm_unpackhi_epi8(g0, h0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + e0 = _mm_unpacklo_epi8(e1, f1); + f0 = _mm_unpackhi_epi8(e1, f1); + g0 = _mm_unpacklo_epi8(g1, h1); + h0 = _mm_unpackhi_epi8(g1, h1); + + a1 = _mm_unpacklo_epi32(a0, c0); + b1 = _mm_unpackhi_epi32(a0, c0); + c1 = _mm_unpacklo_epi32(b0, d0); + d1 = _mm_unpackhi_epi32(b0, d0); + e1 = _mm_unpacklo_epi32(e0, g0); + f1 = _mm_unpackhi_epi32(e0, g0); + g1 = _mm_unpacklo_epi32(f0, h0); + h1 = _mm_unpackhi_epi32(f0, h0); + + a0 = _mm_unpacklo_epi64(a1, e1); + b0 = _mm_unpackhi_epi64(a1, e1); + c0 = _mm_unpacklo_epi64(b1, f1); + d0 = _mm_unpackhi_epi64(b1, f1); + e0 = _mm_unpacklo_epi64(c1, g1); + f0 = _mm_unpackhi_epi64(c1, g1); + g0 = _mm_unpacklo_epi64(d1, h1); + h0 = _mm_unpackhi_epi64(d1, h1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + _mm_storeu_si128((__m128i *) 
&out_b[4*size + ii], e0); + _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); + _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); + _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 8, + size - size % 16); +} + + +/* Transpose bytes within elements using best SSE algorithm available. */ +int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + // Trivial cases: power of 2 bytes. + switch (elem_size) { + case 1: + count = bshuf_copy(in, out, size, elem_size); + return count; + case 2: + count = bshuf_trans_byte_elem_SSE_16(in, out, size); + return count; + case 4: + count = bshuf_trans_byte_elem_SSE_32(in, out, size); + return count; + case 8: + count = bshuf_trans_byte_elem_SSE_64(in, out, size); + return count; + } + + // Worst case: odd number of bytes. Turns out that this is faster for + // (odd * 2) byte elements as well (hence % 4). + if (elem_size % 4) { + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + return count; + } + + // Multiple of power of 2: transpose hierarchically. + { + size_t nchunk_elem; + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + if ((elem_size % 8) == 0) { + nchunk_elem = elem_size / 8; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); + count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); + } else if ((elem_size % 4) == 0) { + nchunk_elem = elem_size / 4; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); + count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); + } else { + // Not used since scalar algorithm is faster. + nchunk_elem = elem_size / 2; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); + count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); + } + + free(tmp_buf); + return count; + } +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + uint16_t* out_ui16; + + int64_t count; + + size_t nbyte = elem_size * size; + + CHECK_MULT_EIGHT(nbyte); + + __m128i xmm; + int32_t bt; + + for (ii = 0; ii + 15 < nbyte; ii += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_ui16 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 16); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. 
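+ * SSE2 version: transposes 8 rows by 16 columns per iteration using byte/word/dword
+ * unpacks; remaining columns are copied by the scalar loop at the end.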
*/ +int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, jj; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(size); + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; + + for (ii = 0; ii + 7 < nrows; ii += 8) { + for (jj = 0; jj + 15 < nbyte_row; jj += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]); + b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]); + c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]); + d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]); + e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]); + f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]); + g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]); + h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]); + + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpacklo_epi8(c0, d0); + c1 = _mm_unpacklo_epi8(e0, f0); + d1 = _mm_unpacklo_epi8(g0, h0); + e1 = _mm_unpackhi_epi8(a0, b0); + f1 = _mm_unpackhi_epi8(c0, d0); + g1 = _mm_unpackhi_epi8(e0, f0); + h1 = _mm_unpackhi_epi8(g0, h0); + + + a0 = _mm_unpacklo_epi16(a1, b1); + b0 = _mm_unpacklo_epi16(c1, d1); + c0 = _mm_unpackhi_epi16(a1, b1); + d0 = _mm_unpackhi_epi16(c1, d1); + + e0 = _mm_unpacklo_epi16(e1, f1); + f0 = _mm_unpacklo_epi16(g1, h1); + g0 = _mm_unpackhi_epi16(e1, f1); + h0 = _mm_unpackhi_epi16(g1, h1); + + + a1 = _mm_unpacklo_epi32(a0, b0); + b1 = _mm_unpackhi_epi32(a0, b0); + + c1 = _mm_unpacklo_epi32(c0, d0); + d1 = _mm_unpackhi_epi32(c0, d0); + + e1 = _mm_unpacklo_epi32(e0, f0); + f1 = _mm_unpackhi_epi32(e0, f0); + + g1 = _mm_unpacklo_epi32(g0, h0); + h1 = _mm_unpackhi_epi32(g0, h0); + + // We don't have a storeh instruction for integers, so interpret + // as a float. Have a storel (_mm_storel_epi64). 
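+ // _mm_storel_pi and _mm_storeh_pi each store 8 bytes (the low or high half of
+ // the 128-bit register), i.e. the eight row bytes of one output column.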
+ as = (__m128 *) &a1; + bs = (__m128 *) &b1; + cs = (__m128 *) &c1; + ds = (__m128 *) &d1; + es = (__m128 *) &e1; + fs = (__m128 *) &f1; + gs = (__m128 *) &g1; + hs = (__m128 *) &h1; + + _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as); + _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs); + _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs); + _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds); + _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es); + _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs); + _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs); + _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs); + + _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as); + _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds); + _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es); + _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs); + } + for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; + out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; + out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; + out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; + out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; + out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; + out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; + out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + // With a bit of care, this could be written such that such that it is + // in_buf = out_buf safe. + const char* in_b = (const char*) in; + uint16_t* out_ui16 = (uint16_t*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + __m128i xmm; + int32_t bt; + + if (elem_size % 2) { + bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); + } else { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + out_ui16[ind / 2] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. 
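+ * Inverse of bshuf_trans_bit_elem_SSE: byte-bitrow transpose followed by the
+ * eight-element bit shuffle.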
*/ +int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + +#else // #ifdef USESSE2 + + +int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +#endif // #ifdef USESSE2 + + +/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */ + +/* ---- Worker code that uses AVX2 ---- + * + * The following code makes use of the AVX2 instruction set and specialized + * 32 byte registers. The AVX2 instructions are present on newer x86 + * processors. The first Intel processor microarchitecture supporting AVX2 was + * Haswell (2013). + * + */ + +#ifdef USEAVX2 + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + int32_t* out_i32; + + size_t nbyte = elem_size * size; + + int64_t count; + + __m256i ymm; + int32_t bt; + + for (ii = 0; ii + 31 < nbyte; ii += 32) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_i32 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 32); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. 
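+ * AVX2 version: requires elem_size to be a multiple of 4 and otherwise falls back
+ * to the SSE2 routine.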
*/ +int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t hh, ii, jj, kk, mm; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(size); + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + + if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, + elem_size); + + __m256i ymm_0[8]; + __m256i ymm_1[8]; + __m256i ymm_storeage[8][4]; + + for (jj = 0; jj + 31 < nbyte_row; jj += 32) { + for (ii = 0; ii + 3 < elem_size; ii += 4) { + for (hh = 0; hh < 4; hh ++) { + + for (kk = 0; kk < 8; kk ++){ + ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ + (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 2; kk ++){ + for (mm = 0; mm < 2; mm ++){ + ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + } + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 8; kk ++){ + ymm_storeage[kk][hh] = ymm_1[kk]; + } + } + + for (mm = 0; mm < 8; mm ++) { + + for (kk = 0; kk < 4; kk ++){ + ymm_0[kk] = ymm_storeage[mm][kk]; + } + + ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); + ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); + ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); + ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); + + ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); + ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); + ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); + ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); + + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); + } + } + } + for (ii = 0; ii < nrows; ii ++ ) { + for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + // With a bit of care, this could be written such that such that it is + // in_buf = out_buf safe. 
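+ // Each _mm256_movemask_epi8 below gathers the most significant bit of 32 bytes
+ // into one 32-bit bit plane; shifting the vector left by one exposes the next.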
+ const char* in_b = (const char*) in; + char* out_b = (char*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + __m256i ymm; + int32_t bt; + + if (elem_size % 4) { + return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size); + } else { + for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + * (int32_t *) &out_b[ind] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size); + + free(tmp_buf); + return count; +} + + +#else // #ifdef USEAVX2 + +int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + +#endif // #ifdef USEAVX2 + + +/* ---- Drivers selecting best instruction set at compile time. ---- */ + +int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; +#ifdef USEAVX2 + count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size); +#elif defined(USESSE2) + count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size); +#elif defined(USEARMNEON) + count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size); +#else + count = bshuf_trans_bit_elem_scal(in, out, size, elem_size); +#endif + return count; +} + + +int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; +#ifdef USEAVX2 + count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size); +#elif defined(USESSE2) + count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size); +#elif defined(USEARMNEON) + count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size); +#else + count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size); +#endif + return count; +} + + +/* ---- Wrappers for implementing blocking ---- */ + +/* Wrap a function for processing a single block to process an entire buffer in + * parallel. 
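+ * Full blocks are processed first (in parallel when OpenMP is enabled), then a final
+ * partial block rounded down to a multiple of BSHUF_BLOCKED_MULT, and any remaining
+ * tail bytes are copied through unchanged.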
*/ +int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \ + const size_t size, const size_t elem_size, size_t block_size) { + + omp_size_t ii = 0; + int64_t err = 0; + int64_t count, cum_count=0; + size_t last_block_size; + size_t leftover_bytes; + size_t this_iter; + char *last_in; + char *last_out; + + + ioc_chain C; + ioc_init(&C, in, out); + + + if (block_size == 0) { + block_size = bshuf_default_block_size(elem_size); + } + if (block_size % BSHUF_BLOCKED_MULT) return -81; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(dynamic, 1) \ + private(count) reduction(+ : cum_count) +#endif + for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) { + count = fun(&C, block_size, elem_size); + if (count < 0) err = count; + cum_count += count; + } + + last_block_size = size % block_size; + last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT; + if (last_block_size) { + count = fun(&C, last_block_size, elem_size); + if (count < 0) err = count; + cum_count += count; + } + + if (err < 0) return err; + + leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size; + //this_iter; + last_in = (char *) ioc_get_in(&C, &this_iter); + ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes)); + last_out = (char *) ioc_get_out(&C, &this_iter); + ioc_set_next_out(&C, &this_iter, (void *) (last_out + leftover_bytes)); + + memcpy(last_out, last_in, leftover_bytes); + + ioc_destroy(&C); + + return cum_count + leftover_bytes; +} + + +/* Bitshuffle a single block. */ +int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \ + const size_t size, const size_t elem_size) { + + size_t this_iter; + const void *in; + void *out; + int64_t count; + + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + size * elem_size)); + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + count = bshuf_trans_bit_elem(in, out, size, elem_size); + return count; +} + + +/* Bitunshuffle a single block. */ +int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \ + const size_t size, const size_t elem_size) { + + + size_t this_iter; + const void *in; + void *out; + int64_t count; + + + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + size * elem_size)); + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + count = bshuf_untrans_bit_elem(in, out, size, elem_size); + return count; +} + + +/* Write a 64 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint64_BE(void* buf, uint64_t num) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint64_t pow28 = 1 << 8; + for (ii = 7; ii >= 0; ii--) { + b[ii] = num % pow28; + num = num / pow28; + } +} + + +/* Read a 64 bit unsigned integer from a buffer big endian order. */ +uint64_t bshuf_read_uint64_BE(void* buf) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint64_t num = 0, pow28 = 1 << 8, cp = 1; + for (ii = 7; ii >= 0; ii--) { + num += b[ii] * cp; + cp *= pow28; + } + return num; +} + + +/* Write a 32 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint32_BE(void* buf, uint32_t num) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint32_t pow28 = 1 << 8; + for (ii = 3; ii >= 0; ii--) { + b[ii] = num % pow28; + num = num / pow28; + } +} + + +/* Read a 32 bit unsigned integer from a buffer big endian order. 
*/ +uint32_t bshuf_read_uint32_BE(const void* buf) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint32_t num = 0, pow28 = 1 << 8, cp = 1; + for (ii = 3; ii >= 0; ii--) { + num += b[ii] * cp; + cp *= pow28; + } + return num; +} + + +/* ---- Public functions ---- + * + * See header file for description and usage. + * + */ + +size_t bshuf_default_block_size(const size_t elem_size) { + // This function needs to be absolutely stable between versions. + // Otherwise encoded data will not be decodable. + + size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size; + // Ensure it is a required multiple. + block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT; + return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK); +} + + +int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + + return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size, + elem_size, block_size); +} + + +int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + + return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size, + elem_size, block_size); +} + + +#undef TRANS_BIT_8X8 +#undef TRANS_ELEM_TYPE +#undef MAX +#undef CHECK_MULT_EIGHT +#undef CHECK_ERR_FREE + +#undef USESSE2 +#undef USEAVX2 diff --git a/bitshuffle/src/bitshuffle_core.h b/bitshuffle/src/bitshuffle_core.h new file mode 100644 index 0000000..0f0fc9c --- /dev/null +++ b/bitshuffle/src/bitshuffle_core.h @@ -0,0 +1,159 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Worker routines return an int64_t which is the number of bytes processed + * if positive or an error code if negative. + * + * Error codes: + * -1 : Failed to allocate memory. + * -11 : Missing SSE. + * -12 : Missing AVX. + * -13 : Missing Arm Neon. + * -80 : Input size not a multiple of 8. + * -81 : block_size not multiple of 8. + * -91 : Decompression error, wrong number of bytes processed. + * -1YYY : Error internal to compression routine with error code -YYY. + */ + + +#ifndef BITSHUFFLE_CORE_H +#define BITSHUFFLE_CORE_H + +#include +#if 0 +// We assume GNU g++ defining `__cplusplus` has stdint.h +#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus) +#else + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned long long uint64_t; + typedef long long int64_t; +#endif +#endif + +#include + + +// These are usually set in the setup.py. +#ifndef BSHUF_VERSION_MAJOR +#define BSHUF_VERSION_MAJOR 0 +#define BSHUF_VERSION_MINOR 3 +#define BSHUF_VERSION_POINT 5 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* --- bshuf_using_SSE2 ---- + * + * Whether routines where compiled with the SSE2 instruction set. + * + * Returns + * ------- + * 1 if using SSE2, 0 otherwise. + * + */ +int bshuf_using_SSE2(void); + + +/* ---- bshuf_using_AVX2 ---- + * + * Whether routines where compiled with the AVX2 instruction set. + * + * Returns + * ------- + * 1 if using AVX2, 0 otherwise. + * + */ +int bshuf_using_AVX2(void); + + +/* ---- bshuf_default_block_size ---- + * + * The default block size as function of element size. 
+ * + * This is the block size used by the blocked routines (any routine + * taking a *block_size* argument) when the block_size is not provided + * (zero is passed). + * + * The results of this routine are guaranteed to be stable such that + * shuffled/compressed data can always be decompressed. + * + * Parameters + * ---------- + * elem_size : element size of data to be shuffled/compressed. + * + */ +size_t bshuf_default_block_size(const size_t elem_size); + + +/* ---- bshuf_bitshuffle ---- + * + * Bitshuffle the data. + * + * Transpose the bits within elements, in blocks of *block_size* + * elements. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Do transpose in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes processed, negative error-code if failed. + * + */ +int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + + +/* ---- bshuf_bitunshuffle ---- + * + * Unshuffle bitshuffled data. + * + * Untranspose the bits within elements, in blocks of *block_size* + * elements. + * + * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* + * must match the parameters used to shuffle the data. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Do transpose in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes processed, negative error-code if failed. + * + */ +int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // BITSHUFFLE_CORE_H diff --git a/bitshuffle/src/bitshuffle_internals.h b/bitshuffle/src/bitshuffle_internals.h new file mode 100644 index 0000000..e7372f2 --- /dev/null +++ b/bitshuffle/src/bitshuffle_internals.h @@ -0,0 +1,77 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + */ + + +#ifndef BITSHUFFLE_INTERNALS_H +#define BITSHUFFLE_INTERNALS_H + +#include +#if 0 +// We assume GNU g++ defining `__cplusplus` has stdint.h +#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus) +#else + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned long long uint64_t; + typedef long long int64_t; +#endif +#endif + +#include +#include "iochain.h" + + +// Constants. +#ifndef BSHUF_MIN_RECOMMEND_BLOCK +#define BSHUF_MIN_RECOMMEND_BLOCK 128 +#define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this. +#define BSHUF_TARGET_BLOCK_SIZE_B 8192 +#endif + + +// Macros. 
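A minimal sketch of how the public API documented above in `bitshuffle_core.h` is meant to be called — illustrative only, with made-up buffer sizes; passing `block_size = 0` lets `bshuf_default_block_size` pick the block size, and the element count must be a multiple of 8 (otherwise the routines return -80):

```c
/* Illustrative round trip through bshuf_bitshuffle / bshuf_bitunshuffle.
 * Assumes bitshuffle_core.h is on the include path; values are arbitrary. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "bitshuffle_core.h"

int main(void) {
    enum { N = 1024 };                       /* element count, multiple of 8 */
    uint32_t src[N], shuffled[N], restored[N];
    for (size_t i = 0; i < N; i++) src[i] = (uint32_t)(i * 7);

    /* block_size = 0 -> bshuf_default_block_size(elem_size) */
    int64_t n = bshuf_bitshuffle(src, shuffled, N, sizeof(uint32_t), 0);
    if (n < 0) { fprintf(stderr, "bitshuffle failed: %lld\n", (long long)n); return 1; }

    n = bshuf_bitunshuffle(shuffled, restored, N, sizeof(uint32_t), 0);
    if (n < 0) { fprintf(stderr, "bitunshuffle failed: %lld\n", (long long)n); return 1; }

    /* Both calls return size * elem_size bytes processed on success. */
    printf("round trip %s\n", memcmp(src, restored, sizeof src) == 0 ? "ok" : "mismatch");
    return 0;
}
```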
+#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; } + + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---- Utility functions for internal use only ---- */ + +int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size); + +/* Read a 32 bit unsigned integer from a buffer big endian order. */ +uint32_t bshuf_read_uint32_BE(const void* buf); + +/* Write a 32 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint32_BE(void* buf, uint32_t num); + +int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size); + +/* Function definition for worker functions that process a single block. */ +typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr, + const size_t size, const size_t elem_size); + +/* Wrap a function for processing a single block to process an entire buffer in + * parallel. */ +int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, + const size_t size, const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // BITSHUFFLE_INTERNALS_H diff --git a/bitshuffle/src/iochain.c b/bitshuffle/src/iochain.c new file mode 100644 index 0000000..3f2a4d4 --- /dev/null +++ b/bitshuffle/src/iochain.c @@ -0,0 +1,89 @@ +/* + * IOchain - Distribute a chain of dependant IO events amoung threads. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + */ + +#include +#include "iochain.h" + + +void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0) { +#ifdef _OPENMP + omp_init_lock(&C->next_lock); + for (size_t ii = 0; ii < IOC_SIZE; ii ++) { + omp_init_lock(&(C->in_pl[ii].lock)); + omp_init_lock(&(C->out_pl[ii].lock)); + } +#endif + C->next = 0; + C->in_pl[0].ptr = in_ptr_0; + C->out_pl[0].ptr = out_ptr_0; +} + + +void ioc_destroy(ioc_chain *C) { +#ifdef _OPENMP + omp_destroy_lock(&C->next_lock); + for (size_t ii = 0; ii < IOC_SIZE; ii ++) { + omp_destroy_lock(&(C->in_pl[ii].lock)); + omp_destroy_lock(&(C->out_pl[ii].lock)); + } +#endif +} + + +const void * ioc_get_in(ioc_chain *C, size_t *this_iter) { +#ifdef _OPENMP + omp_set_lock(&C->next_lock); + #pragma omp flush +#endif + *this_iter = C->next; + C->next ++; +#ifdef _OPENMP + omp_set_lock(&(C->in_pl[*this_iter % IOC_SIZE].lock)); + omp_set_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock)); + omp_set_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock)); + omp_unset_lock(&C->next_lock); +#endif + return C->in_pl[*this_iter % IOC_SIZE].ptr; +} + + +void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr) { + C->in_pl[(*this_iter + 1) % IOC_SIZE].ptr = in_ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock)); +#endif +} + + +void * ioc_get_out(ioc_chain *C, size_t *this_iter) { +#ifdef _OPENMP + omp_set_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock)); + #pragma omp flush +#endif + void *out_ptr = C->out_pl[*this_iter % IOC_SIZE].ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock)); +#endif + return out_ptr; +} + + +void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) { + C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock)); + // *in_pl[this_iter]* lock released at the end of the iteration to avoid being + // 
overtaken by previous threads and having *out_pl[this_iter]* corrupted. + // Especially worried about thread 0, iteration 0. + omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock)); +#endif +} diff --git a/bitshuffle/src/iochain.h b/bitshuffle/src/iochain.h new file mode 100644 index 0000000..f738e6e --- /dev/null +++ b/bitshuffle/src/iochain.h @@ -0,0 +1,93 @@ +/* + * IOchain - Distribute a chain of dependant IO events amoung threads. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Similar in concept to a queue. Each task includes reading an input + * and writing output, but the location of the input/output (the pointers) + * depend on the previous item in the chain. + * + * This is designed for parallelizing blocked compression/decompression IO, + * where the destination of a compressed block depends on the compressed size + * of all previous blocks. + * + * Implemented with OpenMP locks. + * + * + * Usage + * ----- + * - Call `ioc_init` in serial block. + * - Each thread should create a local variable *size_t this_iter* and + * pass its address to all function calls. Its value will be set + * inside the functions and is used to identify the thread. + * - Each thread must call each of the `ioc_get*` and `ioc_set*` methods + * exactly once per iteration, starting with `ioc_get_in` and ending + * with `ioc_set_next_out`. + * - The order (`ioc_get_in`, `ioc_set_next_in`, *work*, `ioc_get_out`, + * `ioc_set_next_out`, *work*) is most efficient. + * - Have each thread call `ioc_end_pop`. + * - `ioc_get_in` is blocked until the previous entry's + * `ioc_set_next_in` is called. + * - `ioc_get_out` is blocked until the previous entry's + * `ioc_set_next_out` is called. + * - There are no blocks on the very first iteration. + * - Call `ioc_destroy` in serial block. + * - Safe for num_threads >= IOC_SIZE (but less efficient). 
+ * + */ + + +#ifndef IOCHAIN_H +#define IOCHAIN_H + + +#include +#ifdef _OPENMP +#include +#endif + + +#define IOC_SIZE 33 + + +typedef struct ioc_ptr_and_lock { +#ifdef _OPENMP + omp_lock_t lock; +#endif + void *ptr; +} ptr_and_lock; + +typedef struct ioc_const_ptr_and_lock { +#ifdef _OPENMP + omp_lock_t lock; +#endif + const void *ptr; +} const_ptr_and_lock; + + +typedef struct ioc_chain { +#ifdef _OPENMP + omp_lock_t next_lock; +#endif + size_t next; + const_ptr_and_lock in_pl[IOC_SIZE]; + ptr_and_lock out_pl[IOC_SIZE]; +} ioc_chain; + + +void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0); +void ioc_destroy(ioc_chain *C); +const void * ioc_get_in(ioc_chain *C, size_t *this_iter); +void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr); +void * ioc_get_out(ioc_chain *C, size_t *this_iter); +void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr); + +#endif // IOCHAIN_H diff --git a/bitshuffle/src/lib.rs b/bitshuffle/src/lib.rs new file mode 100644 index 0000000..776df72 --- /dev/null +++ b/bitshuffle/src/lib.rs @@ -0,0 +1,30 @@ +use libc::{size_t}; + +extern { + pub fn bshuf_compress_lz4(inp: *const u8, out: *const u8, size: size_t, elem_size: size_t, block_size: size_t) -> i64; + pub fn bshuf_decompress_lz4(inp: *const u8, out: *const u8, size: size_t, elem_size: size_t, block_size: size_t) -> i64; +} + +pub fn bitshuffle_compress(inp: &[u8], out: &mut [u8], size: usize, elem_size: usize, block_size: usize) -> Result { + unsafe { + let n = bshuf_compress_lz4(inp.as_ptr(), out.as_mut_ptr(), size, elem_size, block_size); + if n >= 0 { + Ok(n as usize) + } + else { + Err(n as isize) + } + } +} + +pub fn bitshuffle_decompress(inp: &[u8], out: &mut [u8], size: usize, elem_size: usize, block_size: usize) -> Result { + unsafe { + let n = bshuf_decompress_lz4(inp.as_ptr(), out.as_mut_ptr(), size, elem_size, block_size); + if n >= 0 { + Ok(n as usize) + } + else { + Err(n as isize) + } + } +} diff --git a/bitshuffle/src/lz4.c b/bitshuffle/src/lz4.c new file mode 100644 index 0000000..1b5194c --- /dev/null +++ b/bitshuffle/src/lz4.c @@ -0,0 +1,1515 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +/************************************** +* Tuning parameters +**************************************/ +/* + * HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#define HEAPMODE 0 + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/************************************** +* CPU Feature Detection +**************************************/ +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** +* Includes +**************************************/ +#include "lz4.h" + + +/************************************** +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#else +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ + +/* LZ4_GCC_VERSION is defined into lz4.h */ +#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Reading and writing into memory 
+**************************************/ +#define STEPSIZE sizeof(size_t) + +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val16; + memcpy(&val16, memPtr, 2); + return val16; +} + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) + { + return LZ4_read16(memPtr); + } + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) + { + memcpy(memPtr, &value, 2); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val32; + memcpy(&val32, memPtr, 4); + return val32; +} + +static U64 LZ4_read64(const void* memPtr) +{ + U64 val64; + memcpy(&val64, memPtr, 8); + return val64; +} + +static size_t LZ4_read_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_read64(p); + else + return (size_t)LZ4_read32(p); +} + + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } + +static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } + +/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* e = (BYTE*)dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } + else /* Big Endian CPU */ + { + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + 
unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/************************************** +* Local Structures and types +**************************************/ +typedef struct { + U32 hashTable[HASH_SIZE_U32]; + U32 currentOffset; + U32 initCheck; + const BYTE* dictionary; + BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ + U32 dictSize; +} LZ4_stream_t_internal; + +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/************************************** +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + + +/******************************** +* Compression functions +********************************/ + +static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static const U64 prime5bytes = 889523592379ULL; +static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? 
LZ4_HASHLOG+1 : LZ4_HASHLOG; + const U32 hashMask = (1<> (40 - hashLog)) & hashMask; +} + +static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) +{ + if (LZ4_64bits()) + return LZ4_hashSequence64(sequence, tableType); + return LZ4_hashSequence((U32)sequence, tableType); +} + +static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +FORCE_INLINE int LZ4_compress_generic( + void* const ctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; + + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - dictPtr->dictSize; + const BYTE* const dictionary = dictPtr->dictionary; + const BYTE* const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta=0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSize> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + if 
(dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; /* Check output limit */ + if (litLength>=RUN_MASK) + { + int len = (int)litLength-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchLength; + if (ip==limit) + { + unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchLength += more; + ip += more; + } + } + else + { + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchLength; + } + + if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) + return 0; /* Check output limit */ + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } + if (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + LZ4_putPosition(ip, ctx, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + const size_t lastRun = (size_t)(iend - anchor); + if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; /* Check output limit */ + if (lastRun >= RUN_MASK) + { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } + else + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* ctxPtr = &ctx; +#endif + + int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/******************************** +* destSize variant +********************************/ + +static int LZ4_compress_destSize_generic( + void* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtr> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) + { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) + { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) + { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH); + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) + { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) + { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ + { + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } + else + { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? 
byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (HEAPMODE) + void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + void* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/******************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(size_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) + { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) + { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ + { + /* rescale hash table */ + U32 delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) + { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = 
dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + + result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/******************************* +* Decompression functions +*******************************/ +/* + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is essential this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest if dict == noDict */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; + const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + + /* Main Loop */ + while (1) + { + unsigned token; + size_t length; + const BYTE* match; + + /* get literal length */ + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s; + do + { + s = *ip++; + length += s; + } + while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + match = cpy - LZ4_readLE16(ip); ip+=2; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) + { + unsigned s; + do + { + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + s = *ip++; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) + { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) + { + /* match can be copied as a single segment from external dictionary */ + match = dictEnd - (lowPrefix-match); + memmove(op, match, length); op += length; + } + else + { + /* match encompass external dictionary and current segment */ + size_t copySize = (size_t)(lowPrefix-match); + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + 
copySize = length - copySize; + if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ + { + BYTE* const endOfMatch = op + copySize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } + else + { + memcpy(op, lowPrefix, copySize); + op += copySize; + } + } + continue; + } + + /* copy repeated sequence */ + cpy = op + length; + if (unlikely((op-match)<8)) + { + const size_t dec64 = dec64table[op-match]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[op-match]; + LZ4_copy4(op+4, match); + op += 8; match -= dec64; + } else { LZ4_copy8(op, match); op+=8; match+=8; } + + if (unlikely(cpy>oend-12)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ + if (op < oend-8) + { + LZ4_wildCopy(op, match, oend-8); + match += (oend-8) - op; + op = oend-8; + } + while (opprefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work 
the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) + { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + + +/*************************************************** +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); + lz4ds->bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/bitshuffle/src/lz4.h b/bitshuffle/src/lz4.h new file mode 100644 index 0000000..3e74002 --- /dev/null +++ b/bitshuffle/src/lz4.h @@ -0,0 +1,360 @@ +/* + LZ4 - Fast LZ compression algorithm + Header File + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +/* + * lz4.h provides block compression functions, and gives full buffer control to programmer. + * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), + * and can let the library handle its own memory, please use lz4frame.h instead. +*/ + +/************************************** +* Version +**************************************/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) +int LZ4_versionNumber (void); + +/************************************** +* Tuning parameter +**************************************/ +/* + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#define LZ4_MEMORY_USAGE 14 + + +/************************************** +* Simple Functions +**************************************/ + +int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); +int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); + +/* +LZ4_compress_default() : + Compresses 'sourceSize' bytes from buffer 'source' + into already allocated 'dest' buffer of size 'maxDestSize'. + Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'source' into a more limited 'dest' budget, + compression stops *immediately*, and the function result is zero. + As a consequence, 'dest' content is not valid. + This function never writes outside 'dest' buffer, nor read outside 'source' buffer. + sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) + return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) + or 0 if compression fails + +LZ4_decompress_safe() : + compressedSize : is the precise full size of the compressed block. + maxDecompressedSize : is the size of destination buffer, which must be already allocated. + return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) + If destination buffer is not large enough, decoding will stop and output an error code (<0). 
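+
+  A minimal round-trip sketch of the two simple functions above (a sketch only:
+  `src`/`srcSize` stand for the caller's input buffer and are not part of this header;
+  includes and error handling are elided):
+
+      int   bound = LZ4_compressBound(srcSize);
+      char* comp  = (char*) malloc(bound);
+      char* back  = (char*) malloc(srcSize);
+      int csize = LZ4_compress_default(src, comp, srcSize, bound);   // 0 => output did not fit
+      int dsize = LZ4_decompress_safe(comp, back, csize, srcSize);   // <0 => malformed input
+      // on success dsize == srcSize and back[] matches src[]
+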
+ If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits, including malicious data packets. + It never writes outside output buffer, nor reads outside input buffer. +*/ + + +/************************************** +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +int LZ4_compressBound(int inputSize); + +/* +LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows to select an "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. +*/ +int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_fast_extState() : + Same compression function, just using an externally allocated memory space to store compression state. + Use LZ4_sizeofState() to know how much memory must be allocated, + and allocate it on 8-bytes boundaries (using malloc() typically). + Then, provide it as 'void* state' to compression function. +*/ +int LZ4_sizeofState(void); +int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_destSize() : + Reverse the logic, by compressing as much data as possible from 'source' buffer + into already allocated buffer 'dest' of size 'targetDestSize'. + This function either compresses the entire 'source' content into 'dest' if it's large enough, + or fill 'dest' buffer completely with as much data as possible from 'source'. + *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. + New value is necessarily <= old value. + return : Nb bytes written into 'dest' (necessarily <= targetDestSize) + or 0 if compression fails +*/ +int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); + + +/* +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function fully respect memory boundaries for properly formed compressed data. 
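+
+  For example (a sketch; `comp` is assumed to hold one compressed block whose decompressed
+  size is known from a surrounding container format to be `originalSize` bytes):
+
+      int readBytes = LZ4_decompress_fast(comp, out, originalSize);
+      // readBytes is the number of compressed bytes consumed, or negative on malformed data
+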
+ It is a bit faster than LZ4_decompress_safe(). + However, it does not provide any protection against intentionally modified data stream (malicious input). + Use this function in trusted environment only (data to decode comes from a trusted source). +*/ +int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'compressedSize' at position 'source' + into destination buffer 'dest' of size 'maxDecompressedSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ +int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); + + +/*********************************************** +* Streaming Compression Functions +***********************************************/ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) +/* + * LZ4_stream_t + * information structure to track an LZ4 stream. + * important : init this structure content before first use ! + * note : only allocated directly the structure if you are statically linking LZ4 + * If you are using liblz4 as a DLL, please use below construction methods instead. + */ +typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; + +/* + * LZ4_resetStream + * Use this function to init an allocated LZ4_stream_t structure + */ +void LZ4_resetStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_createStream will allocate and initialize an LZ4_stream_t structure + * LZ4_freeStream releases its memory. + * In the context of a DLL (liblz4), please use these methods rather than the static struct. + * They are more future proof, in case of a change of LZ4_stream_t size. + */ +LZ4_stream_t* LZ4_createStream(void); +int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_loadDict + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/* + * LZ4_compress_fast_continue + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still be present and unmodified ! + * 'dst' buffer must be already allocated. + * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. 
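+
+  A double-buffered streaming sketch (the two input buffers alternate so the previous
+  block stays resident; `read_block`/`write_block` are application placeholders, not part
+  of this API):
+
+      LZ4_stream_t* s = LZ4_createStream();
+      char inbuf[2][64 * 1024];
+      char outbuf[LZ4_COMPRESSBOUND(64 * 1024)];
+      int idx = 0, n;
+      while ((n = read_block(inbuf[idx], (int)sizeof inbuf[idx])) > 0) {
+          int c = LZ4_compress_fast_continue(s, inbuf[idx], outbuf, n, (int)sizeof outbuf, 1);
+          if (c <= 0) break;            // 0 => compressed data did not fit into outbuf
+          write_block(outbuf, c);
+          idx ^= 1;                     // leave the previous input block untouched
+      }
+      LZ4_freeStream(s);
+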
+ */ +int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); + +/* + * LZ4_saveDict + * If previously compressed data block is not guaranteed to remain available at its memory location + * save it into a safer place (char* safeBuffer) + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error + */ +int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); + + +/************************************************ +* Streaming Decompression Functions +************************************************/ + +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; +/* + * LZ4_streamDecode_t + * information structure to track an LZ4 stream. + * init this structure content using LZ4_setStreamDecode or memset() before first use ! + * + * In the context of a DLL (liblz4) please prefer usage of construction methods below. + * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. + * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure + * LZ4_freeStreamDecode releases its memory. + */ +LZ4_streamDecode_t* LZ4_createStreamDecode(void); +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/* + * LZ4_setStreamDecode + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) + In the case of a ring buffers, decoding buffer must be either : + - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) + In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). + - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including small ones ( < 64 KB). + - _At least_ 64 KB + 8 bytes + maxBlockSize. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including larger than decoding buffer. 
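+
+  A matching decode-side sketch for the first case above (same two 64 KB buffers and the
+  same alternation as on the encoding side; `read_block` is assumed to deliver exactly one
+  compressed block per call, since block framing is left to the application):
+
+      LZ4_streamDecode_t* sd = LZ4_createStreamDecode();
+      char decbuf[2][64 * 1024];
+      char compbuf[LZ4_COMPRESSBOUND(64 * 1024)];
+      int idx = 0, csize;
+      LZ4_setStreamDecode(sd, NULL, 0);             // start with no dictionary
+      while ((csize = read_block(compbuf, (int)sizeof compbuf)) > 0) {
+          int d = LZ4_decompress_safe_continue(sd, compbuf, decbuf[idx], csize, (int)sizeof decbuf[idx]);
+          if (d < 0) break;                         // malformed input
+          write_block(decbuf[idx], d);
+          idx ^= 1;                                 // previous decoded block stays in place
+      }
+      LZ4_freeStreamDecode(sd);
+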
+ Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, + and indicate where it is saved using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as + a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() + They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. +*/ +int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + + +/************************************** +* Obsolete Functions +**************************************/ +/* Deprecate Warnings */ +/* Should these warnings messages be a problem, + it is generally possible to disable them, + with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual for example. + You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ +#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ + +/* Obsolete compression functions */ +/* These functions are planned to start generate warnings by r131 approximately */ +int LZ4_compress (const char* source, char* dest, int sourceSize); +int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +/* These function names are completely deprecated and must no longer be used. + They are only provided here for compatibility with older programs. + - LZ4_uncompress is the same as LZ4_decompress_fast + - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe + These function prototypes are now disabled; uncomment them only if you really need them. 
+ It is highly recommended to stop using these prototypes and migrate to maintained ones */ +/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ +/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + + +#if defined (__cplusplus) +} +#endif diff --git a/disk/Cargo.toml b/disk/Cargo.toml index 48f1dc3..74aaa9a 100644 --- a/disk/Cargo.toml +++ b/disk/Cargo.toml @@ -17,3 +17,4 @@ async-stream = "0.3.0" hex = "0.4.3" err = { path = "../err" } netpod = { path = "../netpod" } +bitshuffle = { path = "../bitshuffle" } diff --git a/disk/src/lib.rs b/disk/src/lib.rs index db09563..b515fe2 100644 --- a/disk/src/lib.rs +++ b/disk/src/lib.rs @@ -11,6 +11,7 @@ use futures_util::future::FusedFuture; use futures_util::{pin_mut, StreamExt}; use bytes::{Bytes, BytesMut, BufMut, Buf}; use std::path::PathBuf; +use bitshuffle::bitshuffle_decompress; pub async fn read_test_1(query: &netpod::AggQuerySingleChannel) -> Result { @@ -293,13 +294,14 @@ fn open_files(query: &netpod::AggQuerySingleChannel) -> async_channel::Receiver< let mut query = query.clone(); tokio::spawn(async move { let tb0 = query.timebin; - for i1 in 0..16 { + for i1 in 0..query.tb_file_count { query.timebin = tb0 + i1; let path = datapath(&query); let fileres = tokio::fs::OpenOptions::new() .read(true) - .open(path) + .open(&path) .await; + info!("opened file {:?} {:?}", &path, &fileres); match fileres { Ok(k) => { match chtx.send(Ok(k)).await { @@ -377,6 +379,7 @@ pub struct EventChunker { had_channel: bool, polled: u32, state: DataFileState, + tmpbuf: Vec, } enum DataFileState { @@ -394,6 +397,7 @@ impl EventChunker { had_channel: false, polled: 0, state: DataFileState::FileHeader, + tmpbuf: vec![0; 1024 * 1024 * 4], } } @@ -403,12 +407,15 @@ impl EventChunker { // what I've consumed from the buffer // how many bytes I need min to make progress let mut ret = EventFull::dummy(); - let mut need_min = 0; + let mut need_min = 0 as u32; use byteorder::{BE, ReadBytesExt}; //info!("parse_buf rb {}", buf.len()); - let mut i1 = 0; + //let mut i1 = 0; loop { //info!("parse_buf LOOP {}", i1); + if (buf.len() as u32) < need_min { + break; + } match self.state { DataFileState::FileHeader => { assert!(buf.len() >= 6, "logic"); @@ -419,7 +426,7 @@ impl EventChunker { assert!(len > 0 && len < 128, "unexpected data file header"); let totlen = len as usize + 2; if buf.len() < totlen { - info!("parse_buf not enough A"); + info!("parse_buf not enough A totlen {}", totlen); need_min = totlen as u32; break; } @@ -435,29 +442,88 @@ impl EventChunker { } } DataFileState::Event => { - assert!(buf.len() >= 4, "logic"); let mut sl = std::io::Cursor::new(buf.as_ref()); let len = 
sl.read_i32::().unwrap(); //info!("event len {}", len); - if (buf.len() as u32) < len as u32 { - // TODO gather stats about this + if (buf.len() as u32) < 20 { + // TODO gather stats about how often we find not enough input //info!("parse_buf not enough B"); need_min = len as u32; break; } + else if (buf.len() as u32) < len as u32 { + // TODO this is just for testing + let mut sl = std::io::Cursor::new(buf.as_ref()); + sl.read_i32::().unwrap(); + sl.read_i64::().unwrap(); + let ts = sl.read_i64::().unwrap(); + //info!("parse_buf not enough C len {} have {} ts {}", len, buf.len(), ts); + need_min = len as u32; + break; + } else { let mut sl = std::io::Cursor::new(buf.as_ref()); let len1b = sl.read_i32::().unwrap(); + assert!(len == len1b); sl.read_i64::().unwrap(); let ts = sl.read_i64::().unwrap(); let pulse = sl.read_i64::().unwrap(); - //info!("len {} len1b {} ts {} pulse {}", len, len1b, ts, pulse); - need_min = 4; + sl.read_i64::().unwrap(); + let _status = sl.read_i8().unwrap(); + let _severity = sl.read_i8().unwrap(); + let _optional = sl.read_i32::().unwrap(); + assert!(_status == 0); + assert!(_severity == 0); + assert!(_optional == -1); + let type_flags = sl.read_u8().unwrap(); + let type_index = sl.read_u8().unwrap(); + assert!(type_index <= 13); + let is_compressed = type_flags & 0x80 != 0; + let is_array = type_flags & 0x40 != 0; + let is_big_endian = type_flags & 0x20 != 0; + let is_shaped = type_flags & 0x10 != 0; + let compression_method = if is_compressed { + sl.read_u8().unwrap() + } + else { + 0 + }; + let shape_dim = if is_shaped { + sl.read_u8().unwrap() + } + else { + 0 + }; + assert!(compression_method <= 0); + assert!(!is_shaped || (shape_dim >= 1 && shape_dim <= 2)); + let mut shape_lens = [0, 0, 0, 0]; + for i1 in 0..shape_dim { + shape_lens[i1 as usize] = sl.read_u8().unwrap(); + } + if true && is_compressed { + //info!("event ts {} is_compressed {}", ts, is_compressed); + let value_bytes = sl.read_u64::().unwrap(); + let block_size = sl.read_u32::().unwrap(); + let p1 = sl.position() as u32; + let k1 = len as u32 - p1 - 4; + assert!(value_bytes < 1024 * 256); + assert!(block_size == 1024 * 8); + let value_bytes = value_bytes; + let inp = [0; 16]; + let type_size = type_size(type_index); + let ele_count = value_bytes / type_size as u64; + let ele_size = type_size; + //info!("try decompress value_bytes {} ele_size {} ele_count {} type_index {}", value_bytes, ele_size, ele_count, type_index); + let c1 = bitshuffle_decompress(&buf.as_ref()[p1 as usize..], &mut self.tmpbuf, ele_count as usize, ele_size as usize, 0); + //info!("decompress result: {:?}", c1); + assert!(c1.unwrap() as u32 == k1); + } buf.advance(len as usize); + need_min = 4; } } } - i1 += 1; + //i1 += 1; } Ok(ParseResult { events: ret, @@ -467,6 +533,26 @@ impl EventChunker { } +fn type_size(ix: u8) -> u32 { + match ix { + 0 => 1, + 1 => 1, + 2 => 1, + 3 => 1, + 4 => 2, + 5 => 2, + 6 => 2, + 7 => 4, + 8 => 4, + 9 => 8, + 10 => 8, + 11 => 4, + 12 => 8, + 13 => 1, + _ => panic!("logic") + } +} + struct ParseResult { events: EventFull, need_min: u32, @@ -477,7 +563,7 @@ impl Stream for EventChunker { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { self.polled += 1; - if self.polled >= 20000 { + if self.polled >= 2000000 { warn!("EventChunker poll limit reached"); return Poll::Ready(None); } @@ -493,6 +579,11 @@ impl Stream for EventChunker { // TODO gather stats about this: //info!("parse_buf returned {} leftover bytes to me", buf.len()); self.inp.put_back(buf); + + } + if 
res.need_min > 8000 { + warn!("spurious EventChunker asks for need_min {}", res.need_min); + panic!(); } self.inp.set_need_min(res.need_min); Poll::Ready(Some(Ok(res.events))) @@ -544,6 +635,7 @@ impl NeedMinBuffer { pub fn put_back(&mut self, buf: BytesMut) { assert!(self.left.is_none()); + self.left = Some(buf); } pub fn set_need_min(&mut self, need_min: u32) { @@ -562,14 +654,17 @@ impl Stream for NeedMinBuffer { pin_mut!(g); let z = match g.poll_next(cx) { Poll::Ready(Some(Ok(buf))) => { + //info!("NeedMin got buf len {}", buf.len()); match self.left.take() { Some(mut left) => { left.unsplit(buf); let buf = left; if buf.len() as u32 >= self.need_min { + //info!("with left ready len {} need_min {}", buf.len(), self.need_min); Poll::Ready(Some(Ok(buf))) } else { + //info!("with left not enough len {} need_min {}", buf.len(), self.need_min); self.left.replace(buf); again = true; Poll::Pending @@ -577,9 +672,11 @@ impl Stream for NeedMinBuffer { } None => { if buf.len() as u32 >= self.need_min { + //info!("simply ready len {} need_min {}", buf.len(), self.need_min); Poll::Ready(Some(Ok(buf))) } else { + //info!("no previous leftover, need more len {} need_min {}", buf.len(), self.need_min); self.left.replace(buf); again = true; Poll::Pending diff --git a/netpod/src/lib.rs b/netpod/src/lib.rs index 58f5700..6692d4a 100644 --- a/netpod/src/lib.rs +++ b/netpod/src/lib.rs @@ -31,6 +31,7 @@ pub struct AggQuerySingleChannel { pub split: u32, pub tbsize: u32, pub buffer_size: u32, + pub tb_file_count: u32, } pub struct BodyStream { diff --git a/retrieval/src/bin/retrieval.rs b/retrieval/src/bin/retrieval.rs index 23faeff..cefb3cc 100644 --- a/retrieval/src/bin/retrieval.rs +++ b/retrieval/src/bin/retrieval.rs @@ -56,15 +56,16 @@ fn simple_fetch() { let t1 = chrono::Utc::now(); let query = netpod::AggQuerySingleChannel { ksprefix: "daq_swissfel".into(), - keyspace: 2, + keyspace: 3, channel: netpod::Channel { - name: "S10BC01-DBAM070:EOM1_T1".into(), + name: "S10BC01-DBAM070:BAM_CH1_NORM".into(), backend: "sf-databuffer".into(), }, - timebin: 18700, + timebin: 18719, + tb_file_count: 1, split: 12, tbsize: 1000 * 60 * 60 * 24, - buffer_size: 1024 * 16, + buffer_size: 1024 * 8, }; let query_string = serde_json::to_string(&query).unwrap(); let _host = tokio::spawn(httpret::host(8360));