diff --git a/Makefile b/Makefile index 9222a9a..ae5825a 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,12 @@ SRC_DIR = ./src TEST_DIR = ./test INC_DIR = $(SRC_DIR) +BSLZ4_SRC_DIR = ./bslz4/src +BSLZ4_BUILD_DIR = ./bslz4/build +BSLZ4_INC_DIR = $(BSLZ4_SRC_DIR) + CC=h5cc -CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR) +CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=c89 .PHONY: all all: plugin example test_plugin @@ -26,14 +30,26 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c mkdir -p $(BUILD_DIR) $(CC) $(CFLAGS) -c $< -o $@ -$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o +$(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c + mkdir -p $(BSLZ4_BUILD_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \ +$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o + mkdir -p $(BUILD_DIR) + ar rcs $@ $^ + +$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \ +$(BUILD_DIR)/bslz4.a mkdir -p $(BUILD_DIR) $(CC) $(CFLAGS) -shared $^ -o $(BUILD_DIR)/durin-plugin.so -$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o +$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \ +$(BUILD_DIR)/bslz4.a mkdir -p $(BUILD_DIR) $(CC) $(CFLAGS) $^ -o $(BUILD_DIR)/example .PHONY: clean clean: rm -r $(BUILD_DIR) + rm -r $(BSLZ4_BUILD_DIR) diff --git a/bslz4/LICENSE b/bslz4/LICENSE new file mode 100644 index 0000000..1365ed6 --- /dev/null +++ b/bslz4/LICENSE @@ -0,0 +1,21 @@ +Bitshuffle - Filter for improving compression of typed binary data. + +Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/bslz4/src/bitshuffle.c b/bslz4/src/bitshuffle.c new file mode 100644 index 0000000..3b8815b --- /dev/null +++ b/bslz4/src/bitshuffle.c @@ -0,0 +1,165 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + */ + +#include "bitshuffle.h" +#include "bitshuffle_core.h" +#include "bitshuffle_internals.h" +#include "lz4.h" + +#include +#include + + + + +#define BSHUF_LZ4_DECOMPRESS_FAST + + + +#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \ + free(buf); return count - 1000; } + + +/* Bitshuffle and compress a single block. */ +int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \ + const size_t size, const size_t elem_size) { + + int64_t nbytes, count; + void *tmp_buf_bshuf; + void *tmp_buf_lz4; + size_t this_iter; + const void *in; + void *out; + + tmp_buf_bshuf = malloc(size * elem_size); + if (tmp_buf_bshuf == NULL) return -1; + + tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size)); + if (tmp_buf_lz4 == NULL){ + free(tmp_buf_bshuf); + return -1; + } + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size)); + + count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size); + if (count < 0) { + free(tmp_buf_lz4); + free(tmp_buf_bshuf); + return count; + } + nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size); + free(tmp_buf_bshuf); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4); + + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4)); + + bshuf_write_uint32_BE(out, nbytes); + memcpy((char *) out + 4, tmp_buf_lz4, nbytes); + + free(tmp_buf_lz4); + + return nbytes + 4; +} + + +/* Decompress and bitunshuffle a single block. */ +int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr, + const size_t size, const size_t elem_size) { + + int64_t nbytes, count; + void *out, *tmp_buf; + const void *in; + size_t this_iter; + int32_t nbytes_from_header; + + in = ioc_get_in(C_ptr, &this_iter); + nbytes_from_header = bshuf_read_uint32_BE(in); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + nbytes_from_header + 4)); + + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + +#ifdef BSHUF_LZ4_DECOMPRESS_FAST + nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf); + if (nbytes != nbytes_from_header) { + free(tmp_buf); + return -91; + } +#else + nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header, + size * elem_size); + CHECK_ERR_FREE_LZ(nbytes, tmp_buf); + if (nbytes != size * elem_size) { + free(tmp_buf); + return -91; + } + nbytes = nbytes_from_header; +#endif + count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + nbytes += 4; + + free(tmp_buf); + return nbytes; +} + + +/* ---- Public functions ---- + * + * See header file for description and usage. + * + */ + +size_t bshuf_compress_lz4_bound(const size_t size, + const size_t elem_size, size_t block_size) { + + size_t bound, leftover; + + if (block_size == 0) { + block_size = bshuf_default_block_size(elem_size); + } + if (block_size % BSHUF_BLOCKED_MULT) return -81; + + + + bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size); + + leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT; + if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4; + + bound += (size % BSHUF_BLOCKED_MULT) * elem_size; + return bound; +} + + +int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size, + elem_size, block_size); +} + + +int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size, + elem_size, block_size); +} + diff --git a/bslz4/src/bitshuffle.h b/bslz4/src/bitshuffle.h new file mode 100644 index 0000000..dc87e9e --- /dev/null +++ b/bslz4/src/bitshuffle.h @@ -0,0 +1,123 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Worker routines return an int64_t which is the number of bytes processed + * if positive or an error code if negative. + * + * Error codes: + * -1 : Failed to allocate memory. + * -11 : Missing SSE. + * -12 : Missing AVX. + * -80 : Input size not a multiple of 8. + * -81 : block_size not multiple of 8. + * -91 : Decompression error, wrong number of bytes processed. + * -1YYY : Error internal to compression routine with error code -YYY. + */ + + +#ifndef BITSHUFFLE_H +#define BITSHUFFLE_H + +#include +#include "bitshuffle_core.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---- bshuf_compress_lz4_bound ---- + * + * Bound on size of data compressed with *bshuf_compress_lz4*. + * + * Parameters + * ---------- + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * Bound on compressed data size. + * + */ +size_t bshuf_compress_lz4_bound(const size_t size, + const size_t elem_size, size_t block_size); + + +/* ---- bshuf_compress_lz4 ---- + * + * Bitshuffled and compress the data using LZ4. + * + * Transpose within elements, in blocks of data of *block_size* elements then + * compress the blocks using LZ4. In the output buffer, each block is prefixed + * by a 4 byte integer giving the compressed size of that block. + * + * Output buffer must be large enough to hold the compressed data. This could + * be in principle substantially larger than the input buffer. Use the routine + * *bshuf_compress_lz4_bound* to get an upper limit. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be large enough to hold data. + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes used in output buffer, negative error-code if failed. + * + */ +int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t + elem_size, size_t block_size); + + +/* ---- bshuf_decompress_lz4 ---- + * + * Undo compression and bitshuffling. + * + * Decompress data then un-bitshuffle it in blocks of *block_size* elements. + * + * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* + * must patch the parameters used to compress the data. + * + * NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function + * LZ4_decompress_fast from LZ4, which does not protect against maliciously + * formed datasets. By modifying the compressed data, this function could be + * coerced into leaving the boundaries of the input buffer. + * + * Parameters + * ---------- + * in : input buffer + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Process in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes consumed in *input* buffer, negative error-code if failed. + * + */ +int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bslz4/src/bitshuffle_core.c b/bslz4/src/bitshuffle_core.c new file mode 100644 index 0000000..1a184d6 --- /dev/null +++ b/bslz4/src/bitshuffle_core.c @@ -0,0 +1,1332 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + */ + +#include "bitshuffle_core.h" +#include "bitshuffle_internals.h" + +#include +#include + + +#if defined(__AVX2__) && defined (__SSE2__) +#define USEAVX2 +#endif + +#if defined(__SSE2__) +#define USESSE2 +#endif + + + +#ifdef USEAVX2 +#include +#elif defined USESSE2 +#include +#endif + + + +#define CHECK_MULT_EIGHT(n) if (n % 8) return -80; +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + + +/* ---- Functions indicating compile time instruction set. ---- */ + +int bshuf_using_SSE2(void) { +#ifdef USESSE2 + return 1; +#else + return 0; +#endif +} + + +int bshuf_using_AVX2(void) { +#ifdef USEAVX2 + return 1; +#else + return 0; +#endif +} + + +/* ---- Worker code not requiring special instruction sets. ---- + * + * The following code does not use any x86 specific vectorized instructions + * and should compile on any machine + * + */ + +/* Transpose 8x8 bit array packed into a single quadword *x*. + * *t* is workspace. */ +#define TRANS_BIT_8X8(x, t) { \ + t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL; \ + x = x ^ t ^ (t << 7); \ + t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL; \ + x = x ^ t ^ (t << 14); \ + t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL; \ + x = x ^ t ^ (t << 28); \ + } + +/* Transpose 8x8 bit array along the diagonal from upper right + to lower left */ +#define TRANS_BIT_8X8_BE(x, t) { \ + t = (x ^ (x >> 9)) & 0x0055005500550055LL; \ + x = x ^ t ^ (t << 9); \ + t = (x ^ (x >> 18)) & 0x0000333300003333LL; \ + x = x ^ t ^ (t << 18); \ + t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL; \ + x = x ^ t ^ (t << 36); \ + } + +/* Transpose of an array of arbitrarily typed elements. */ +#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) { \ + size_t ii, jj, kk; \ + const type_t* in_type = (const type_t*) in; \ + type_t* out_type = (type_t*) out; \ + for(ii = 0; ii + 7 < lda; ii += 8) { \ + for(jj = 0; jj < ldb; jj++) { \ + for(kk = 0; kk < 8; kk++) { \ + out_type[jj*lda + ii + kk] = \ + in_type[ii*ldb + kk * ldb + jj]; \ + } \ + } \ + } \ + for(ii = lda - lda % 8; ii < lda; ii ++) { \ + for(jj = 0; jj < ldb; jj++) { \ + out_type[jj*lda + ii] = in_type[ii*ldb + jj]; \ + } \ + } \ + } + + +/* Memory copy with bshuf call signature. For testing and profiling. */ +int64_t bshuf_copy(const void* in, void* out, const size_t size, + const size_t elem_size) { + + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + memcpy(out_b, in_b, size * elem_size); + return size * elem_size; +} + + +/* Transpose bytes within elements, starting partway through input. */ +int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size, + const size_t elem_size, const size_t start) { + + size_t ii, jj, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(start); + + if (size > start) { + + + for (ii = start; ii + 7 < size; ii += 8) { + for (jj = 0; jj < elem_size; jj++) { + for (kk = 0; kk < 8; kk++) { + out_b[jj * size + ii + kk] + = in_b[ii * elem_size + kk * elem_size + jj]; + } + } + } + for (ii = size - size % 8; ii < size; ii ++) { + for (jj = 0; jj < elem_size; jj++) { + out_b[jj * size + ii] = in_b[ii * elem_size + jj]; + } + } + } + return size * elem_size; +} + + +/* Transpose bytes within elements. */ +int64_t bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0); +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size, + const size_t elem_size, const size_t start_byte) { + + const uint64_t* in_b = (const uint64_t*) in; + uint8_t* out_b = (uint8_t*) out; + + uint64_t x, t; + + size_t ii, kk; + size_t nbyte = elem_size * size; + size_t nbyte_bitrow = nbyte / 8; + + uint64_t e=1; + const int little_endian = *(uint8_t *) &e == 1; + const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow; + const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow; + + CHECK_MULT_EIGHT(nbyte); + CHECK_MULT_EIGHT(start_byte); + + for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) { + x = in_b[ii]; + if (little_endian) { + TRANS_BIT_8X8(x, t); + } else { + TRANS_BIT_8X8_BE(x, t); + } + for (kk = 0; kk < 8; kk ++) { + out_b[bit_row_offset + kk * bit_row_skip + ii] = x; + x = x >> 8; + } + } + return size * elem_size; +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0); +} + + +/* General transpose of an array, optimized for large element sizes. */ +int64_t bshuf_trans_elem(const void* in, void* out, const size_t lda, + const size_t ldb, const size_t elem_size) { + + size_t ii, jj; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + for(ii = 0; ii < lda; ii++) { + for(jj = 0; jj < ldb; jj++) { + memcpy(&out_b[(jj*lda + ii) * elem_size], + &in_b[(ii*ldb + jj) * elem_size], elem_size); + } + } + return lda * ldb * elem_size; +} + + +/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */ +int64_t bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t nbyte_bitrow = size / 8; + + CHECK_MULT_EIGHT(size); + + return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow); +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + void *tmp_buf; + + CHECK_MULT_EIGHT(size); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. */ +int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + size_t ii, jj, kk, nbyte_row; + const char *in_b; + char *out_b; + + + in_b = (const char*) in; + out_b = (char*) out; + + nbyte_row = size / 8; + + CHECK_MULT_EIGHT(size); + + for (jj = 0; jj < elem_size; jj++) { + for (ii = 0; ii < nbyte_row; ii++) { + for (kk = 0; kk < 8; kk++) { + out_b[ii * 8 * elem_size + jj * 8 + kk] = \ + in_b[(jj * 8 + kk) * nbyte_row + ii]; + } + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_scal(const void* in, void* out, \ + const size_t size, const size_t elem_size) { + + const char *in_b; + char *out_b; + uint64_t x, t; + size_t ii, jj, kk; + size_t nbyte, out_index; + + uint64_t e=1; + const int little_endian = *(uint8_t *) &e == 1; + const size_t elem_skip = little_endian ? elem_size : -elem_size; + const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size; + + CHECK_MULT_EIGHT(size); + + in_b = (const char*) in; + out_b = (char*) out; + + nbyte = elem_size * size; + + for (jj = 0; jj < 8 * elem_size; jj += 8) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) { + x = *((uint64_t*) &in_b[ii + jj]); + if (little_endian) { + TRANS_BIT_8X8(x, t); + } else { + TRANS_BIT_8X8_BE(x, t); + } + for (kk = 0; kk < 8; kk++) { + out_index = ii + jj / 8 + elem_offset + kk * elem_skip; + *((uint8_t*) &out_b[out_index]) = x; + x = x >> 8; + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + void *tmp_buf; + + CHECK_MULT_EIGHT(size); + + tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* ---- Worker code that uses SSE2 ---- + * + * The following code makes use of the SSE2 instruction set and specialized + * 16 byte registers. The SSE2 instructions are present on modern x86 + * processors. The first Intel processor microarchitecture supporting SSE2 was + * Pentium 4 (2000). + * + */ + +#ifdef USESSE2 + +/* Transpose bytes within elements for 16 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b = (const char*) in; + char *out_b = (char*) out; + __m128i a0, b0, a1, b1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 2, + size - size % 16); +} + + +/* Transpose bytes within elements for 32 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { + + size_t ii; + const char *in_b; + char *out_b; + in_b = (const char*) in; + out_b = (char*) out; + __m128i a0, b0, c0, d0, a1, b1, c1, d1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + + a0 = _mm_unpacklo_epi64(a1, c1); + b0 = _mm_unpackhi_epi64(a1, c1); + c0 = _mm_unpacklo_epi64(b1, d1); + d0 = _mm_unpackhi_epi64(b1, d1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 4, + size - size % 16); +} + + +/* Transpose bytes within elements for 64 bit elements. */ +int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { + + size_t ii; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + + for (ii=0; ii + 15 < size; ii += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); + b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); + c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); + d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); + e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); + f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); + g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); + h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpackhi_epi8(a0, b0); + c1 = _mm_unpacklo_epi8(c0, d0); + d1 = _mm_unpackhi_epi8(c0, d0); + e1 = _mm_unpacklo_epi8(e0, f0); + f1 = _mm_unpackhi_epi8(e0, f0); + g1 = _mm_unpacklo_epi8(g0, h0); + h1 = _mm_unpackhi_epi8(g0, h0); + + a0 = _mm_unpacklo_epi8(a1, b1); + b0 = _mm_unpackhi_epi8(a1, b1); + c0 = _mm_unpacklo_epi8(c1, d1); + d0 = _mm_unpackhi_epi8(c1, d1); + e0 = _mm_unpacklo_epi8(e1, f1); + f0 = _mm_unpackhi_epi8(e1, f1); + g0 = _mm_unpacklo_epi8(g1, h1); + h0 = _mm_unpackhi_epi8(g1, h1); + + a1 = _mm_unpacklo_epi32(a0, c0); + b1 = _mm_unpackhi_epi32(a0, c0); + c1 = _mm_unpacklo_epi32(b0, d0); + d1 = _mm_unpackhi_epi32(b0, d0); + e1 = _mm_unpacklo_epi32(e0, g0); + f1 = _mm_unpackhi_epi32(e0, g0); + g1 = _mm_unpacklo_epi32(f0, h0); + h1 = _mm_unpackhi_epi32(f0, h0); + + a0 = _mm_unpacklo_epi64(a1, e1); + b0 = _mm_unpackhi_epi64(a1, e1); + c0 = _mm_unpacklo_epi64(b1, f1); + d0 = _mm_unpackhi_epi64(b1, f1); + e0 = _mm_unpacklo_epi64(c1, g1); + f0 = _mm_unpackhi_epi64(c1, g1); + g0 = _mm_unpacklo_epi64(d1, h1); + h0 = _mm_unpackhi_epi64(d1, h1); + + _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); + _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); + _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); + _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); + _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); + _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); + _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); + _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); + } + return bshuf_trans_byte_elem_remainder(in, out, size, 8, + size - size % 16); +} + + +/* Transpose bytes within elements using best SSE algorithm available. */ +int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + + switch (elem_size) { + case 1: + count = bshuf_copy(in, out, size, elem_size); + return count; + case 2: + count = bshuf_trans_byte_elem_SSE_16(in, out, size); + return count; + case 4: + count = bshuf_trans_byte_elem_SSE_32(in, out, size); + return count; + case 8: + count = bshuf_trans_byte_elem_SSE_64(in, out, size); + return count; + } + + + + if (elem_size % 4) { + count = bshuf_trans_byte_elem_scal(in, out, size, elem_size); + return count; + } + + + { + size_t nchunk_elem; + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + if ((elem_size % 8) == 0) { + nchunk_elem = elem_size / 8; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t); + count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size); + } else if ((elem_size % 4) == 0) { + nchunk_elem = elem_size / 4; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t); + count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size); + } else { + + nchunk_elem = elem_size / 2; + TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t); + count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf, + size * nchunk_elem); + bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size); + } + + free(tmp_buf); + return count; + } +} + + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + uint16_t* out_ui16; + + int64_t count; + + size_t nbyte = elem_size * size; + + CHECK_MULT_EIGHT(nbyte); + + __m128i xmm; + int32_t bt; + + for (ii = 0; ii + 15 < nbyte; ii += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_ui16 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 16); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. */ +int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, jj; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(size); + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + + __m128i a0, b0, c0, d0, e0, f0, g0, h0; + __m128i a1, b1, c1, d1, e1, f1, g1, h1; + __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; + + for (ii = 0; ii + 7 < nrows; ii += 8) { + for (jj = 0; jj + 15 < nbyte_row; jj += 16) { + a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]); + b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]); + c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]); + d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]); + e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]); + f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]); + g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]); + h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]); + + + a1 = _mm_unpacklo_epi8(a0, b0); + b1 = _mm_unpacklo_epi8(c0, d0); + c1 = _mm_unpacklo_epi8(e0, f0); + d1 = _mm_unpacklo_epi8(g0, h0); + e1 = _mm_unpackhi_epi8(a0, b0); + f1 = _mm_unpackhi_epi8(c0, d0); + g1 = _mm_unpackhi_epi8(e0, f0); + h1 = _mm_unpackhi_epi8(g0, h0); + + + a0 = _mm_unpacklo_epi16(a1, b1); + b0 = _mm_unpacklo_epi16(c1, d1); + c0 = _mm_unpackhi_epi16(a1, b1); + d0 = _mm_unpackhi_epi16(c1, d1); + + e0 = _mm_unpacklo_epi16(e1, f1); + f0 = _mm_unpacklo_epi16(g1, h1); + g0 = _mm_unpackhi_epi16(e1, f1); + h0 = _mm_unpackhi_epi16(g1, h1); + + + a1 = _mm_unpacklo_epi32(a0, b0); + b1 = _mm_unpackhi_epi32(a0, b0); + + c1 = _mm_unpacklo_epi32(c0, d0); + d1 = _mm_unpackhi_epi32(c0, d0); + + e1 = _mm_unpacklo_epi32(e0, f0); + f1 = _mm_unpackhi_epi32(e0, f0); + + g1 = _mm_unpacklo_epi32(g0, h0); + h1 = _mm_unpackhi_epi32(g0, h0); + + + + as = (__m128 *) &a1; + bs = (__m128 *) &b1; + cs = (__m128 *) &c1; + ds = (__m128 *) &d1; + es = (__m128 *) &e1; + fs = (__m128 *) &f1; + gs = (__m128 *) &g1; + hs = (__m128 *) &h1; + + _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as); + _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs); + _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs); + _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds); + _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es); + _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs); + _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs); + _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs); + + _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as); + _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds); + _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es); + _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs); + _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs); + } + for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; + out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; + out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; + out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; + out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; + out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; + out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; + out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + + + const char* in_b = (const char*) in; + uint16_t* out_ui16 = (uint16_t*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + __m128i xmm; + int32_t bt; + + if (elem_size % 2) { + bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); + } else { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { + xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + out_ui16[ind / 2] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + +#else + + +int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) { + return -11; +} + + +int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -11; +} + + +#endif + + +/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */ + +/* ---- Worker code that uses AVX2 ---- + * + * The following code makes use of the AVX2 instruction set and specialized + * 32 byte registers. The AVX2 instructions are present on newer x86 + * processors. The first Intel processor microarchitecture supporting AVX2 was + * Haswell (2013). + * + */ + +#ifdef USEAVX2 + +/* Transpose bits within bytes. */ +int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + int32_t* out_i32; + + size_t nbyte = elem_size * size; + + int64_t count; + + __m256i ymm; + int32_t bt; + + for (ii = 0; ii + 31 < nbyte; ii += 32) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_i32 = bt; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, + nbyte - nbyte % 32); + return count; +} + + +/* Transpose bits within elements. */ +int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size); + + free(tmp_buf); + + return count; +} + + +/* For data organized into a row for each bit (8 * elem_size rows), transpose + * the bytes. */ +int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + size_t hh, ii, jj, kk, mm; + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + CHECK_MULT_EIGHT(size); + + size_t nrows = 8 * elem_size; + size_t nbyte_row = size / 8; + + if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size, + elem_size); + + __m256i ymm_0[8]; + __m256i ymm_1[8]; + __m256i ymm_storeage[8][4]; + + for (jj = 0; jj + 31 < nbyte_row; jj += 32) { + for (ii = 0; ii + 3 < elem_size; ii += 4) { + for (hh = 0; hh < 4; hh ++) { + + for (kk = 0; kk < 8; kk ++){ + ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[ + (ii * 8 + hh * 8 + kk) * nbyte_row + jj]); + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 2; kk ++){ + for (mm = 0; mm < 2; mm ++){ + ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16( + ymm_1[kk * 4 + mm * 2], + ymm_1[kk * 4 + mm * 2 + 1]); + } + } + + for (kk = 0; kk < 4; kk ++){ + ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2], + ymm_0[kk * 2 + 1]); + } + + for (kk = 0; kk < 8; kk ++){ + ymm_storeage[kk][hh] = ymm_1[kk]; + } + } + + for (mm = 0; mm < 8; mm ++) { + + for (kk = 0; kk < 4; kk ++){ + ymm_0[kk] = ymm_storeage[mm][kk]; + } + + ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]); + ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]); + ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]); + ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]); + + ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32); + ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32); + ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49); + ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49); + + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]); + _mm256_storeu_si256((__m256i *) &out_b[ + (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]); + } + } + } + for (ii = 0; ii < nrows; ii ++ ) { + for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) { + out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj]; + } + } + return size * elem_size; +} + + +/* Shuffle bits within the bytes of eight element blocks. */ +int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + CHECK_MULT_EIGHT(size); + + + + const char* in_b = (const char*) in; + char* out_b = (char*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + + __m256i ymm; + int32_t bt; + + if (elem_size % 4) { + return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size); + } else { + for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) { + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]); + for (kk = 0; kk < 8; kk++) { + bt = _mm256_movemask_epi8(ymm); + ymm = _mm256_slli_epi16(ymm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); + * (int32_t *) &out_b[ind] = bt; + } + } + } + } + return size * elem_size; +} + + +/* Untranspose bits within elements. */ +int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; + + CHECK_MULT_EIGHT(size); + + void* tmp_buf = malloc(size * elem_size); + if (tmp_buf == NULL) return -1; + + count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size); + CHECK_ERR_FREE(count, tmp_buf); + count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size); + + free(tmp_buf); + return count; +} + + +#else + +int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + + +int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size, + const size_t elem_size) { + return -12; +} + +#endif + + +/* ---- Drivers selecting best instruction set at compile time. ---- */ + +int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; +#ifdef USEAVX2 + count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size); +#elif defined(USESSE2) + count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size); +#else + count = bshuf_trans_bit_elem_scal(in, out, size, elem_size); +#endif + return count; +} + + +int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size) { + + int64_t count; +#ifdef USEAVX2 + count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size); +#elif defined(USESSE2) + count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size); +#else + count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size); +#endif + return count; +} + + +/* ---- Wrappers for implementing blocking ---- */ + +/* Wrap a function for processing a single block to process an entire buffer in + * parallel. */ +int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \ + const size_t size, const size_t elem_size, size_t block_size) { + + size_t ii; + int64_t err = 0; + int64_t count, cum_count=0; + size_t last_block_size; + size_t leftover_bytes; + size_t this_iter; + char *last_in; + char *last_out; + + + ioc_chain C; + ioc_init(&C, in, out); + + + if (block_size == 0) { + block_size = bshuf_default_block_size(elem_size); + } + if (block_size % BSHUF_BLOCKED_MULT) return -81; + +#if defined(_OPENMP) + #pragma omp parallel for schedule(dynamic, 1) \ + private(count) reduction(+ : cum_count) +#endif + for (ii = 0; ii < size / block_size; ii ++) { + count = fun(&C, block_size, elem_size); + if (count < 0) err = count; + cum_count += count; + } + + last_block_size = size % block_size; + last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT; + if (last_block_size) { + count = fun(&C, last_block_size, elem_size); + if (count < 0) err = count; + cum_count += count; + } + + if (err < 0) return err; + + leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size; + last_in = (char *) ioc_get_in(&C, &this_iter); + ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes)); + last_out = (char *) ioc_get_out(&C, &this_iter); + ioc_set_next_out(&C, &this_iter, (void *) (last_out + leftover_bytes)); + + memcpy(last_out, last_in, leftover_bytes); + + ioc_destroy(&C); + + return cum_count + leftover_bytes; +} + + +/* Bitshuffle a single block. */ +int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \ + const size_t size, const size_t elem_size) { + + size_t this_iter; + const void *in; + void *out; + int64_t count; + + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + size * elem_size)); + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + count = bshuf_trans_bit_elem(in, out, size, elem_size); + return count; +} + + +/* Bitunshuffle a single block. */ +int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \ + const size_t size, const size_t elem_size) { + + + size_t this_iter; + const void *in; + void *out; + int64_t count; + + + + + in = ioc_get_in(C_ptr, &this_iter); + ioc_set_next_in(C_ptr, &this_iter, + (void*) ((char*) in + size * elem_size)); + out = ioc_get_out(C_ptr, &this_iter); + ioc_set_next_out(C_ptr, &this_iter, + (void *) ((char *) out + size * elem_size)); + + count = bshuf_untrans_bit_elem(in, out, size, elem_size); + return count; +} + + +/* Write a 64 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint64_BE(void* buf, uint64_t num) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint64_t pow28 = 1 << 8; + for (ii = 7; ii >= 0; ii--) { + b[ii] = num % pow28; + num = num / pow28; + } +} + + +/* Read a 64 bit unsigned integer from a buffer big endian order. */ +uint64_t bshuf_read_uint64_BE(void* buf) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint64_t num = 0, pow28 = 1 << 8, cp = 1; + for (ii = 7; ii >= 0; ii--) { + num += b[ii] * cp; + cp *= pow28; + } + return num; +} + + +/* Write a 32 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint32_BE(void* buf, uint32_t num) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint32_t pow28 = 1 << 8; + for (ii = 3; ii >= 0; ii--) { + b[ii] = num % pow28; + num = num / pow28; + } +} + + +/* Read a 32 bit unsigned integer from a buffer big endian order. */ +uint32_t bshuf_read_uint32_BE(const void* buf) { + int ii; + uint8_t* b = (uint8_t*) buf; + uint32_t num = 0, pow28 = 1 << 8, cp = 1; + for (ii = 3; ii >= 0; ii--) { + num += b[ii] * cp; + cp *= pow28; + } + return num; +} + + +/* ---- Public functions ---- + * + * See header file for description and usage. + * + */ + +size_t bshuf_default_block_size(const size_t elem_size) { + + + + size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size; + + block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT; + return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK); +} + + +int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + + return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size, + elem_size, block_size); +} + + +int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size) { + + return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size, + elem_size, block_size); +} + + +#undef TRANS_BIT_8X8 +#undef TRANS_ELEM_TYPE +#undef MAX +#undef CHECK_MULT_EIGHT +#undef CHECK_ERR_FREE + +#undef USESSE2 +#undef USEAVX2 diff --git a/bslz4/src/bitshuffle_core.h b/bslz4/src/bitshuffle_core.h new file mode 100644 index 0000000..8996d91 --- /dev/null +++ b/bslz4/src/bitshuffle_core.h @@ -0,0 +1,157 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Worker routines return an int64_t which is the number of bytes processed + * if positive or an error code if negative. + * + * Error codes: + * -1 : Failed to allocate memory. + * -11 : Missing SSE. + * -12 : Missing AVX. + * -80 : Input size not a multiple of 8. + * -81 : block_size not multiple of 8. + * -91 : Decompression error, wrong number of bytes processed. + * -1YYY : Error internal to compression routine with error code -YYY. + */ + + +#ifndef BITSHUFFLE_CORE_H +#define BITSHUFFLE_CORE_H + + +#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus) +#include +#else + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef signed short int16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned long long uint64_t; + typedef long long int64_t; +#endif + +#include + + + +#ifndef BSHUF_VERSION_MAJOR +#define BSHUF_VERSION_MAJOR 0 +#define BSHUF_VERSION_MINOR 3 +#define BSHUF_VERSION_POINT 4 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* --- bshuf_using_SSE2 ---- + * + * Whether routines where compiled with the SSE2 instruction set. + * + * Returns + * ------- + * 1 if using SSE2, 0 otherwise. + * + */ +int bshuf_using_SSE2(void); + + +/* ---- bshuf_using_AVX2 ---- + * + * Whether routines where compiled with the AVX2 instruction set. + * + * Returns + * ------- + * 1 if using AVX2, 0 otherwise. + * + */ +int bshuf_using_AVX2(void); + + +/* ---- bshuf_default_block_size ---- + * + * The default block size as function of element size. + * + * This is the block size used by the blocked routines (any routine + * taking a *block_size* argument) when the block_size is not provided + * (zero is passed). + * + * The results of this routine are guaranteed to be stable such that + * shuffled/compressed data can always be decompressed. + * + * Parameters + * ---------- + * elem_size : element size of data to be shuffled/compressed. + * + */ +size_t bshuf_default_block_size(const size_t elem_size); + + +/* ---- bshuf_bitshuffle ---- + * + * Bitshuffle the data. + * + * Transpose the bits within elements, in blocks of *block_size* + * elements. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Do transpose in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes processed, negative error-code if failed. + * + */ +int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + + +/* ---- bshuf_bitunshuffle ---- + * + * Unshuffle bitshuffled data. + * + * Untranspose the bits within elements, in blocks of *block_size* + * elements. + * + * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size* + * must match the parameters used to shuffle the data. + * + * Parameters + * ---------- + * in : input buffer, must be of size * elem_size bytes + * out : output buffer, must be of size * elem_size bytes + * size : number of elements in input + * elem_size : element size of typed data + * block_size : Do transpose in blocks of this many elements. Pass 0 to + * select automatically (recommended). + * + * Returns + * ------- + * number of bytes processed, negative error-code if failed. + * + */ +int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size, + const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bslz4/src/bitshuffle_internals.h b/bslz4/src/bitshuffle_internals.h new file mode 100644 index 0000000..7b5b8a5 --- /dev/null +++ b/bslz4/src/bitshuffle_internals.h @@ -0,0 +1,63 @@ +/* + * Bitshuffle - Filter for improving compression of typed binary data. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + */ + + +#ifndef BITSHUFFLE_INTERNALS_H +#define BITSHUFFLE_INTERNALS_H + + +#include +#include "iochain.h" + + +#ifndef BSHUF_MIN_RECOMMEND_BLOCK +#define BSHUF_MIN_RECOMMEND_BLOCK 128 +#define BSHUF_BLOCKED_MULT 8 +#define BSHUF_TARGET_BLOCK_SIZE_B 8192 +#endif + + + +#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; } + + +#ifdef __cplusplus +extern "C" { +#endif + +/* ---- Utility functions for internal use only ---- */ + +int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size); + +/* Read a 32 bit unsigned integer from a buffer big endian order. */ +uint32_t bshuf_read_uint32_BE(const void* buf); + +/* Write a 32 bit unsigned integer to a buffer in big endian order. */ +void bshuf_write_uint32_BE(void* buf, uint32_t num); + +int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, + const size_t elem_size); + +/* Function definition for worker functions that process a single block. */ +typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr, + const size_t size, const size_t elem_size); + +/* Wrap a function for processing a single block to process an entire buffer in + * parallel. */ +int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, + const size_t size, const size_t elem_size, size_t block_size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bslz4/src/iochain.c b/bslz4/src/iochain.c new file mode 100644 index 0000000..5c5eddd --- /dev/null +++ b/bslz4/src/iochain.c @@ -0,0 +1,90 @@ +/* + * IOchain - Distribute a chain of dependant IO events amoung threads. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + */ + +#include +#include "iochain.h" + + +void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0) { +#ifdef _OPENMP + omp_init_lock(&C->next_lock); + for (size_t ii = 0; ii < IOC_SIZE; ii ++) { + omp_init_lock(&(C->in_pl[ii].lock)); + omp_init_lock(&(C->out_pl[ii].lock)); + } +#endif + C->next = 0; + C->in_pl[0].ptr = in_ptr_0; + C->out_pl[0].ptr = out_ptr_0; +} + + +void ioc_destroy(ioc_chain *C) { +#ifdef _OPENMP + omp_destroy_lock(&C->next_lock); + for (size_t ii = 0; ii < IOC_SIZE; ii ++) { + omp_destroy_lock(&(C->in_pl[ii].lock)); + omp_destroy_lock(&(C->out_pl[ii].lock)); + } +#endif +} + + +const void * ioc_get_in(ioc_chain *C, size_t *this_iter) { +#ifdef _OPENMP + omp_set_lock(&C->next_lock); + #pragma omp flush +#endif + *this_iter = C->next; + C->next ++; +#ifdef _OPENMP + omp_set_lock(&(C->in_pl[*this_iter % IOC_SIZE].lock)); + omp_set_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock)); + omp_set_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock)); + omp_unset_lock(&C->next_lock); +#endif + return C->in_pl[*this_iter % IOC_SIZE].ptr; +} + + +void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr) { + C->in_pl[(*this_iter + 1) % IOC_SIZE].ptr = in_ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock)); +#endif +} + + +void * ioc_get_out(ioc_chain *C, size_t *this_iter) { +#ifdef _OPENMP + omp_set_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock)); + #pragma omp flush +#endif + void *out_ptr = C->out_pl[*this_iter % IOC_SIZE].ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock)); +#endif + return out_ptr; +} + + +void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) { + C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr; +#ifdef _OPENMP + omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock)); + + + + omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock)); +#endif +} + diff --git a/bslz4/src/iochain.h b/bslz4/src/iochain.h new file mode 100644 index 0000000..6a398ff --- /dev/null +++ b/bslz4/src/iochain.h @@ -0,0 +1,94 @@ +/* + * IOchain - Distribute a chain of dependant IO events amoung threads. + * + * This file is part of Bitshuffle + * Author: Kiyoshi Masui + * Website: http://www.github.com/kiyo-masui/bitshuffle + * Created: 2014 + * + * See LICENSE file for details about copyright and rights to use. + * + * + * Header File + * + * Similar in concept to a queue. Each task includes reading an input + * and writing output, but the location of the input/output (the pointers) + * depend on the previous item in the chain. + * + * This is designed for parallelizing blocked compression/decompression IO, + * where the destination of a compressed block depends on the compressed size + * of all previous blocks. + * + * Implemented with OpenMP locks. + * + * + * Usage + * ----- + * - Call `ioc_init` in serial block. + * - Each thread should create a local variable *size_t this_iter* and + * pass its address to all function calls. Its value will be set + * inside the functions and is used to identify the thread. + * - Each thread must call each of the `ioc_get*` and `ioc_set*` methods + * exactly once per iteration, starting with `ioc_get_in` and ending + * with `ioc_set_next_out`. + * - The order (`ioc_get_in`, `ioc_set_next_in`, *work*, `ioc_get_out`, + * `ioc_set_next_out`, *work*) is most efficient. + * - Have each thread call `ioc_end_pop`. + * - `ioc_get_in` is blocked until the previous entry's + * `ioc_set_next_in` is called. + * - `ioc_get_out` is blocked until the previous entry's + * `ioc_set_next_out` is called. + * - There are no blocks on the very first iteration. + * - Call `ioc_destroy` in serial block. + * - Safe for num_threads >= IOC_SIZE (but less efficient). + * + */ + + +#ifndef IOCHAIN_H +#define IOCHAIN_H + + +#include +#ifdef _OPENMP +#include +#endif + + +#define IOC_SIZE 33 + + +typedef struct ioc_ptr_and_lock { +#ifdef _OPENMP + omp_lock_t lock; +#endif + void *ptr; +} ptr_and_lock; + +typedef struct ioc_const_ptr_and_lock { +#ifdef _OPENMP + omp_lock_t lock; +#endif + const void *ptr; +} const_ptr_and_lock; + + +typedef struct ioc_chain { +#ifdef _OPENMP + omp_lock_t next_lock; +#endif + size_t next; + const_ptr_and_lock in_pl[IOC_SIZE]; + ptr_and_lock out_pl[IOC_SIZE]; +} ioc_chain; + + +void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0); +void ioc_destroy(ioc_chain *C); +const void * ioc_get_in(ioc_chain *C, size_t *this_iter); +void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr); +void * ioc_get_out(ioc_chain *C, size_t *this_iter); +void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr); + +#endif + diff --git a/bslz4/src/lz4.c b/bslz4/src/lz4.c new file mode 100644 index 0000000..31f489d --- /dev/null +++ b/bslz4/src/lz4.c @@ -0,0 +1,1515 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +/************************************** +* Tuning parameters +**************************************/ +/* + * HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#define HEAPMODE 0 + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/************************************** +* CPU Feature Detection +**************************************/ +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** +* Includes +**************************************/ +#include "lz4.h" + + +/************************************** +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#else +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ + +/* LZ4_GCC_VERSION is defined into lz4.h */ +#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Reading and writing into memory +**************************************/ +#define STEPSIZE sizeof(size_t) + +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val16; + memcpy(&val16, memPtr, 2); + return val16; +} + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) + { + return LZ4_read16(memPtr); + } + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) + { + memcpy(memPtr, &value, 2); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val32; + memcpy(&val32, memPtr, 4); + return val32; +} + +static U64 LZ4_read64(const void* memPtr) +{ + U64 val64; + memcpy(&val64, memPtr, 8); + return val64; +} + +static size_t LZ4_read_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_read64(p); + else + return (size_t)LZ4_read32(p); +} + + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } + +static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } + +/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* e = (BYTE*)dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } + else /* Big Endian CPU */ + { + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/************************************** +* Local Structures and types +**************************************/ +typedef struct { + U32 hashTable[HASH_SIZE_U32]; + U32 currentOffset; + U32 initCheck; + const BYTE* dictionary; + BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ + U32 dictSize; +} LZ4_stream_t_internal; + +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/************************************** +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + + +/******************************** +* Compression functions +********************************/ + +static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static const U64 prime5bytes = 889523592379ULL; +static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + const U32 hashMask = (1<> (40 - hashLog)) & hashMask; +} + +static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) +{ + if (LZ4_64bits()) + return LZ4_hashSequence64(sequence, tableType); + return LZ4_hashSequence((U32)sequence, tableType); +} + +static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +FORCE_INLINE int LZ4_compress_generic( + void* const ctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; + + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - dictPtr->dictSize; + const BYTE* const dictionary = dictPtr->dictionary; + const BYTE* const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta=0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - dictPtr->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSize> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; /* Check output limit */ + if (litLength>=RUN_MASK) + { + int len = (int)litLength-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchLength; + if (ip==limit) + { + unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchLength += more; + ip += more; + } + } + else + { + matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchLength; + } + + if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) + return 0; /* Check output limit */ + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } + if (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + if (dict==usingExtDict) + { + if (match<(const BYTE*)source) + { + refDelta = dictDelta; + lowLimit = dictionary; + } + else + { + refDelta = 0; + lowLimit = (const BYTE*)source; + } + } + LZ4_putPosition(ip, ctx, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + const size_t lastRun = (size_t)(iend - anchor); + if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; /* Check output limit */ + if (lastRun >= RUN_MASK) + { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } + else + { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* ctxPtr = &ctx; +#endif + + int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/******************************** +* destSize variant +********************************/ + +static int LZ4_compress_destSize_generic( + void* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtr> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + { + /* Encode Literal length */ + unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) + { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) + { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) + { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) + { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) + { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) + { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } + else + { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ + { + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } + else + { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (HEAPMODE) + void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + void* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/******************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(size_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) + { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) + { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ + { + /* rescale hash table */ + U32 delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { + const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) + { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + + result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/******************************* +* Decompression functions +*******************************/ +/* + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is essential this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest if dict == noDict */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; + const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + + /* Main Loop */ + while (1) + { + unsigned token; + size_t length; + const BYTE* match; + + /* get literal length */ + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s; + do + { + s = *ip++; + length += s; + } + while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + match = cpy - LZ4_readLE16(ip); ip+=2; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) + { + unsigned s; + do + { + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + s = *ip++; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) + { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) + { + /* match can be copied as a single segment from external dictionary */ + match = dictEnd - (lowPrefix-match); + memmove(op, match, length); op += length; + } + else + { + /* match encompass external dictionary and current segment */ + size_t copySize = (size_t)(lowPrefix-match); + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + copySize = length - copySize; + if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ + { + BYTE* const endOfMatch = op + copySize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } + else + { + memcpy(op, lowPrefix, copySize); + op += copySize; + } + } + continue; + } + + /* copy repeated sequence */ + cpy = op + length; + if (unlikely((op-match)<8)) + { + const size_t dec64 = dec64table[op-match]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[op-match]; + LZ4_copy4(op+4, match); + op += 8; match -= dec64; + } else { LZ4_copy8(op, match); op+=8; match+=8; } + + if (unlikely(cpy>oend-12)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ + if (op < oend-8) + { + LZ4_wildCopy(op, match, oend-8); + match += (oend-8) - op; + op = oend-8; + } + while (opprefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) + { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } + else + { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) + { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + + +/*************************************************** +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); + lz4ds->bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + diff --git a/bslz4/src/lz4.h b/bslz4/src/lz4.h new file mode 100644 index 0000000..3e74002 --- /dev/null +++ b/bslz4/src/lz4.h @@ -0,0 +1,360 @@ +/* + LZ4 - Fast LZ compression algorithm + Header File + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +/* + * lz4.h provides block compression functions, and gives full buffer control to programmer. + * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), + * and can let the library handle its own memory, please use lz4frame.h instead. +*/ + +/************************************** +* Version +**************************************/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) +int LZ4_versionNumber (void); + +/************************************** +* Tuning parameter +**************************************/ +/* + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#define LZ4_MEMORY_USAGE 14 + + +/************************************** +* Simple Functions +**************************************/ + +int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); +int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); + +/* +LZ4_compress_default() : + Compresses 'sourceSize' bytes from buffer 'source' + into already allocated 'dest' buffer of size 'maxDestSize'. + Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'source' into a more limited 'dest' budget, + compression stops *immediately*, and the function result is zero. + As a consequence, 'dest' content is not valid. + This function never writes outside 'dest' buffer, nor read outside 'source' buffer. + sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) + return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) + or 0 if compression fails + +LZ4_decompress_safe() : + compressedSize : is the precise full size of the compressed block. + maxDecompressedSize : is the size of destination buffer, which must be already allocated. + return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) + If destination buffer is not large enough, decoding will stop and output an error code (<0). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits, including malicious data packets. + It never writes outside output buffer, nor reads outside input buffer. +*/ + + +/************************************** +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +int LZ4_compressBound(int inputSize); + +/* +LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows to select an "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. +*/ +int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_fast_extState() : + Same compression function, just using an externally allocated memory space to store compression state. + Use LZ4_sizeofState() to know how much memory must be allocated, + and allocate it on 8-bytes boundaries (using malloc() typically). + Then, provide it as 'void* state' to compression function. +*/ +int LZ4_sizeofState(void); +int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); + + +/* +LZ4_compress_destSize() : + Reverse the logic, by compressing as much data as possible from 'source' buffer + into already allocated buffer 'dest' of size 'targetDestSize'. + This function either compresses the entire 'source' content into 'dest' if it's large enough, + or fill 'dest' buffer completely with as much data as possible from 'source'. + *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. + New value is necessarily <= old value. + return : Nb bytes written into 'dest' (necessarily <= targetDestSize) + or 0 if compression fails +*/ +int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); + + +/* +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function fully respect memory boundaries for properly formed compressed data. + It is a bit faster than LZ4_decompress_safe(). + However, it does not provide any protection against intentionally modified data stream (malicious input). + Use this function in trusted environment only (data to decode comes from a trusted source). +*/ +int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'compressedSize' at position 'source' + into destination buffer 'dest' of size 'maxDecompressedSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ +int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); + + +/*********************************************** +* Streaming Compression Functions +***********************************************/ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) +/* + * LZ4_stream_t + * information structure to track an LZ4 stream. + * important : init this structure content before first use ! + * note : only allocated directly the structure if you are statically linking LZ4 + * If you are using liblz4 as a DLL, please use below construction methods instead. + */ +typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; + +/* + * LZ4_resetStream + * Use this function to init an allocated LZ4_stream_t structure + */ +void LZ4_resetStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_createStream will allocate and initialize an LZ4_stream_t structure + * LZ4_freeStream releases its memory. + * In the context of a DLL (liblz4), please use these methods rather than the static struct. + * They are more future proof, in case of a change of LZ4_stream_t size. + */ +LZ4_stream_t* LZ4_createStream(void); +int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/* + * LZ4_loadDict + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/* + * LZ4_compress_fast_continue + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still be present and unmodified ! + * 'dst' buffer must be already allocated. + * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. + */ +int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); + +/* + * LZ4_saveDict + * If previously compressed data block is not guaranteed to remain available at its memory location + * save it into a safer place (char* safeBuffer) + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error + */ +int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); + + +/************************************************ +* Streaming Decompression Functions +************************************************/ + +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; +/* + * LZ4_streamDecode_t + * information structure to track an LZ4 stream. + * init this structure content using LZ4_setStreamDecode or memset() before first use ! + * + * In the context of a DLL (liblz4) please prefer usage of construction methods below. + * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. + * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure + * LZ4_freeStreamDecode releases its memory. + */ +LZ4_streamDecode_t* LZ4_createStreamDecode(void); +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/* + * LZ4_setStreamDecode + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) + In the case of a ring buffers, decoding buffer must be either : + - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) + In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). + - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including small ones ( < 64 KB). + - _At least_ 64 KB + 8 bytes + maxBlockSize. + In which case, encoding and decoding buffers do not need to be synchronized, + and encoding ring buffer can have any size, including larger than decoding buffer. + Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, + and indicate where it is saved using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as + a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() + They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. +*/ +int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + + +/************************************** +* Obsolete Functions +**************************************/ +/* Deprecate Warnings */ +/* Should these warnings messages be a problem, + it is generally possible to disable them, + with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual for example. + You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ +#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_DEPRECATE_WARNING_DEFBLOCK +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ + +/* Obsolete compression functions */ +/* These functions are planned to start generate warnings by r131 approximately */ +int LZ4_compress (const char* source, char* dest, int sourceSize); +int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ +/* These function names are completely deprecated and must no longer be used. + They are only provided here for compatibility with older programs. + - LZ4_uncompress is the same as LZ4_decompress_fast + - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe + These function prototypes are now disabled; uncomment them only if you really need them. + It is highly recommended to stop using these prototypes and migrate to maintained ones */ +/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ +/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + + +#if defined (__cplusplus) +} +#endif diff --git a/src/file.c b/src/file.c index 38180c7..20ca72b 100644 --- a/src/file.c +++ b/src/file.c @@ -5,29 +5,15 @@ #include +#include #include #include #include #include #include "file.h" #include "err.h" +#include "filters.h" -hid_t h5_int_type_from_width(const int width) { - if (width == sizeof(char)) { - return H5T_NATIVE_SCHAR; - } else if (width == sizeof(short)) { - return H5T_NATIVE_SHORT; - } else if (width == sizeof(int)) { - return H5T_NATIVE_INT; - } else if (width == sizeof(long)) { - return H5T_NATIVE_LONG; - } else if (width == sizeof(long long)) { - return H5T_NATIVE_LLONG; - } else { - /* TODO: error */ - return -1; - } -} void clear_det_visit_objects(struct det_visit_objects_t *objects) { if (objects->nxdata) { @@ -41,18 +27,24 @@ void clear_det_visit_objects(struct det_visit_objects_t *objects) { } -void free_nxs_data_description(struct data_description_t *desc) { - if (desc->extra) free(desc->extra); /* should just be NULL */ - desc->extra = NULL; +void free_ds_desc(struct ds_desc_t *desc) { + H5Gclose(desc->det_g_id); + H5Gclose(desc->data_g_id); + free(desc); } +void free_nxs_desc(struct ds_desc_t *desc) { + free_ds_desc(desc); +} -void free_eiger_data_description(struct data_description_t *desc) { - if (!desc->extra) return; - struct eiger_data_description_t *extra = desc->extra; - if (extra->block_sizes) free(extra->block_sizes); - free(extra); - desc->extra = NULL; +void free_eiger_desc(struct ds_desc_t *desc) { + struct eiger_ds_desc_t *e_desc = (struct eiger_ds_desc_t *) desc; + free(e_desc->block_sizes); + free_ds_desc(desc); +} + +void free_opt_eiger_desc(struct ds_desc_t *desc) { + free_eiger_desc(desc); } @@ -80,13 +72,12 @@ double scale_from_units(const char* unit_string) { } } -int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties) { +int get_nxs_dataset_dims(struct ds_desc_t *desc) { hid_t g_id, ds_id, s_id, t_id; int retval = 0; int ndims = 0; int width = 0; - hsize_t dims[3] = {0}; - g_id = desc->data_group_id;; + g_id = desc->data_g_id;; ds_id = H5Dopen2(g_id, "data", H5P_DEFAULT); if (ds_id <= 0) { @@ -115,12 +106,11 @@ int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_p ERROR_JUMP(-1, close_space, message); } - if (H5Sget_simple_extent_dims(s_id, dims, NULL) < 0) { + if (H5Sget_simple_extent_dims(s_id, desc->dims, NULL) < 0) { ERROR_JUMP(-1, close_space, "Error getting dataset dimensions"); } - memcpy(properties->dims, dims, 3 * sizeof(*dims)); - properties->data_width = width; + desc->data_width = width; close_space: H5Sclose(s_id); @@ -132,10 +122,18 @@ done: return retval; } -int get_frame(hid_t g_id, const char* name, hsize_t *frame_idx, hsize_t *frame_size, int data_width, void *buffer) { +int get_frame_simple(const struct ds_desc_t *desc, + const char *name, + const hsize_t *frame_idx, + const hsize_t *frame_size, + void *buffer) { + int retval = 0; herr_t err = 0; - hid_t ds_id, s_id, ms_id, t_id; + hid_t g_id, ds_id, s_id, ms_id, t_id; + + g_id = desc->data_g_id; + ds_id = H5Dopen2(g_id, name, H5P_DEFAULT); if (ds_id <= 0) { char message[64]; @@ -146,6 +144,10 @@ int get_frame(hid_t g_id, const char* name, hsize_t *frame_idx, hsize_t *frame_s if (s_id <= 0) { ERROR_JUMP(-1, close_dataset, "Error getting dataspace"); } + t_id = H5Dget_type(ds_id); + if (t_id <= 0) { + ERROR_JUMP(-1, close_type, "Error retrieving datatype"); + } err = H5Sselect_hyperslab(s_id, H5S_SELECT_SET, frame_idx, NULL, frame_size, NULL); if (err < 0) { ERROR_JUMP(-1, close_space, "Error seleting hyperslab"); @@ -155,12 +157,6 @@ int get_frame(hid_t g_id, const char* name, hsize_t *frame_idx, hsize_t *frame_s ERROR_JUMP(-1, close_space, "Could not create dataspace"); } - t_id = h5_int_type_from_width(data_width); - if (t_id < 0) { - char message[64]; - sprintf(message, "Could not infer signed integer from width %d", data_width); - ERROR_JUMP(-1, close_mspace, message); - } err = H5Dread(ds_id, t_id, ms_id, s_id, H5P_DEFAULT, buffer); if (err < 0) { ERROR_JUMP(-1, close_mspace, "Error reading dataset"); @@ -170,29 +166,110 @@ close_mspace: H5Sclose(ms_id); close_space: H5Sclose(s_id); +close_type: + H5Tclose(t_id); close_dataset: H5Dclose(ds_id); done: return retval; } + +int get_frame_from_chunk(const struct ds_desc_t *desc, + const char *ds_name, + const hsize_t *frame_idx, + const hsize_t *frame_size, + void *buffer) { + + hid_t d_id = 0; + hsize_t c_offset[3] = {frame_idx[0], 0, 0}; + uint32_t c_filter_mask = 0; + hsize_t c_bytes; + void *c_buffer = NULL; + const struct opt_eiger_ds_desc_t *o_eiger_desc = (struct opt_eiger_ds_desc_t *) desc; + int retval = 0; + + if (frame_idx[1] != 0 || frame_idx[2] != 0) { + char message[64]; + sprintf(message, "Require frame selection starts at [n, 0, 0], not [n, %llu, %llu]", + frame_idx[1], frame_idx[2]); + ERROR_JUMP(-1, done, message); + } + + d_id = H5Dopen(desc->data_g_id, ds_name, H5P_DEFAULT); + if (d_id < 0) { + char message[64]; + sprintf(message, "Error opening dataset %.32s", ds_name); + ERROR_JUMP(-1, done, message); + } + + + if (H5Dget_chunk_storage_size(d_id, c_offset, &c_bytes) < 0) { + char message[96]; + sprintf(message, "Error reading chunk size from %.32s for frame %llu", ds_name, frame_idx[0]); + ERROR_JUMP(-1, done, message); + } + if (c_bytes == 0) { + char message[96]; + sprintf(message, "Target chunk %llu has zero size for dataset %.32s", frame_idx[0], ds_name); + ERROR_JUMP(-1, done, message); + } + + if (o_eiger_desc->bs_applied) { + c_buffer = malloc(c_bytes); + if (!c_buffer) { + char message[128]; + sprintf(message, "Unable to allocate chunk buffer for dataset %.32s - frame %llu, size %llu bytes", + ds_name, frame_idx[0], c_bytes); + ERROR_JUMP(-1, done, message); + } + } else { + c_buffer = buffer; + } + + if (H5DOread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) < 0) { + char message[128]; + sprintf(message, "Error reading chunk %llu from dataset %.32s - size %llu bytes", + frame_idx[0], ds_name, c_bytes); + ERROR_JUMP(-1, done, message); + } + + if (o_eiger_desc->bs_applied) { + if (bslz4_decompress( + o_eiger_desc->bs_params, + c_bytes, + c_buffer, + desc->data_width * frame_size[1] * frame_size[2], + buffer) < 0) { + char message[128]; + sprintf(message, "Error processing chunk %llu from %.32s with bitshuffle_lz4", + frame_idx[0], ds_name); + ERROR_JUMP(-1, done, message); + } + } + +done: + if (c_buffer && (c_buffer != buffer)) free(c_buffer); + if (d_id) H5Dclose(d_id); + return retval; +} + + int get_nxs_frame( - const struct data_description_t *desc, - const struct dataset_properties_t *ds_prop, - int n, - int data_width, + const struct ds_desc_t *desc, + const int n, void *buffer) { /* detector data are the two inner most indices */ /* TODO: handle ndims > 3 and select appropriately */ int retval = 0; hsize_t frame_idx[3] = {n, 0, 0}; - hsize_t frame_size[3] = {1, ds_prop->dims[1], ds_prop->dims[2]}; - if (n < 0 || n >= ds_prop->dims[0]) { + hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]}; + if (n < 0 || n >= desc->dims[0]) { char message[64]; - sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) ds_prop->dims[0] - 1); + sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) desc->dims[0] - 1); ERROR_JUMP(-1, done, message); } - retval = get_frame(desc->data_group_id, "data", frame_idx, frame_size, data_width, buffer); + retval = get_frame_simple(desc, "data", frame_idx, frame_size, buffer); if (retval < 0) { ERROR_JUMP(retval, done, ""); } @@ -202,22 +279,20 @@ done: int get_dectris_eiger_frame( - const struct data_description_t *desc, - const struct dataset_properties_t *ds_prop, + const struct ds_desc_t *desc, int n, - int data_width, void *buffer) { int retval = 0; int block, frame_count, idx; - struct eiger_data_description_t *eiger_desc = desc->extra; + struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t*) desc; char data_name[16] = {0}; hsize_t frame_idx[3] = {0, 0, 0}; - hsize_t frame_size[3] = {1, ds_prop->dims[1], ds_prop->dims[2]}; + hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]}; - if (n < 0 || n >= ds_prop->dims[0]) { + if (n < 0 || n >= desc->dims[0]) { char message[64]; - sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) ds_prop->dims[0] - 1); + sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) desc->dims[0] - 1); ERROR_JUMP(-1, done, message); } @@ -228,7 +303,7 @@ int get_dectris_eiger_frame( idx = n - (frame_count - eiger_desc->block_sizes[block]); /* index in current block */ frame_idx[0] = idx; sprintf(data_name, "data_%06d", block + 1); - retval = get_frame(desc->data_group_id, data_name, frame_idx, frame_size, data_width, buffer); + retval = eiger_desc->frame_func(desc, data_name, frame_idx, frame_size, buffer); if (retval < 0) { ERROR_JUMP(retval, done, ""); } @@ -237,7 +312,7 @@ done: } -int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties) { +int get_dectris_eiger_dataset_dims(struct ds_desc_t *desc) { int retval = 0; int n_datas = 0; int n = 0; @@ -246,11 +321,12 @@ int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct char ds_name[16] = {0}; /* 12 chars in "data_xxxxxx\0" */ int *frame_counts = NULL; hsize_t dims[3] = {0}; + struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t*) desc; /* datasets are "data_%06d % n" - need to determine how many of these there are and what the ranges are */ sprintf(ds_name, "data_%06d", n_datas + 1); - while (H5Lexists(desc->data_group_id, ds_name, H5P_DEFAULT) > 0) { + while (H5Lexists(desc->data_g_id, ds_name, H5P_DEFAULT) > 0) { sprintf(ds_name, "data_%06d", ++n_datas + 1); } @@ -260,7 +336,7 @@ int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct hid_t ds_id, t_id, s_id; hsize_t block_dims[3] = {0}; sprintf(ds_name, "data_%06d", n + 1); - ds_id = H5Dopen2(desc->data_group_id, ds_name, H5P_DEFAULT); + ds_id = H5Dopen2(desc->data_g_id, ds_name, H5P_DEFAULT); if (ds_id < 0) { char message[64]; sprintf("Unable to open dataset %.16s", ds_name); @@ -309,10 +385,10 @@ loop_end: if (retval < 0) { free(frame_counts); } else { - memcpy(properties->dims, dims, 3 * sizeof(*dims)); - properties->data_width = data_width; - ((struct eiger_data_description_t *) desc->extra)->n_data_blocks = n_datas; - ((struct eiger_data_description_t *) desc->extra)->block_sizes = frame_counts; + memcpy(desc->dims, dims, 3 * sizeof(*dims)); + desc->data_width = data_width; + eiger_desc->n_data_blocks = n_datas; + eiger_desc->block_sizes = frame_counts; } return retval; } @@ -424,12 +500,12 @@ done: } -int get_nxs_pixel_info(const struct data_description_t *desc, double *x_size, double *y_size) { +int get_nxs_pixel_info(const struct ds_desc_t *desc, double *x_size, double *y_size) { int retval = 0; - if (read_pixel_info(desc->det_group_id, "x_pixel_size", x_size) < 0) { + if (read_pixel_info(desc->det_g_id, "x_pixel_size", x_size) < 0) { ERROR_JUMP(-1, done, ""); } - if (read_pixel_info(desc->det_group_id, "y_pixel_size", y_size) < 0) { + if (read_pixel_info(desc->det_g_id, "y_pixel_size", y_size) < 0) { ERROR_JUMP(-1, done, ""); } done: @@ -437,12 +513,12 @@ done: } -int get_dectris_eiger_pixel_info(const struct data_description_t *desc, double *x_size, double *y_size) { +int get_dectris_eiger_pixel_info(const struct ds_desc_t *desc, double *x_size, double *y_size) { int retval = 0; - if (read_pixel_info(desc->det_group_id, "detectorSpecific/x_pixel_size", x_size) < 0) { + if (read_pixel_info(desc->det_g_id, "detectorSpecific/x_pixel_size", x_size) < 0) { ERROR_JUMP(-1, done, ""); } - if (read_pixel_info(desc->det_group_id, "detectorSpecific/y_pixel_size", y_size) < 0) { + if (read_pixel_info(desc->det_g_id, "detectorSpecific/y_pixel_size", y_size) < 0) { ERROR_JUMP(-1, done, ""); } done: @@ -450,12 +526,12 @@ done: } -int get_nxs_pixel_mask(const struct data_description_t *desc, int *buffer) { +int get_nxs_pixel_mask(const struct ds_desc_t *desc, int *buffer) { int retval = 0; hid_t ds_id; herr_t err = 0; - ds_id = H5Dopen2(desc->det_group_id, "pixel_mask", H5P_DEFAULT); + ds_id = H5Dopen2(desc->det_g_id, "pixel_mask", H5P_DEFAULT); if (ds_id < 0) { ERROR_JUMP(-1, done, "Error opening pixel_mask dataset"); } @@ -472,12 +548,12 @@ done: } -int get_dectris_eiger_pixel_mask(const struct data_description_t *desc, int *buffer) { +int get_dectris_eiger_pixel_mask(const struct ds_desc_t *desc, int *buffer) { int retval = 0; hid_t ds_id; herr_t err = 0; - ds_id = H5Dopen2(desc->det_group_id, "detectorSpecific/pixel_mask", H5P_DEFAULT); + ds_id = H5Dopen2(desc->det_g_id, "detectorSpecific/pixel_mask", H5P_DEFAULT); if (ds_id < 0) { ERROR_JUMP(-1, done, "Error opening detectorSpecific/pixel_mask"); } @@ -602,85 +678,227 @@ done: return retval; } +int check_for_chunk_read( + hid_t g_id, + const char* ds_name, + struct opt_eiger_ds_desc_t *desc) { -int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_objects_t *visit_result) { int retval = 0; - data_desc->det_group_id = visit_result->nxdetector; + int n_filters; + hsize_t cdims[3]; + hid_t ds_id, dcpl, s_id; + unsigned int filter_flags, filter_config; + char filter_name[16]; + size_t name_len = 16; + size_t cd_nelems = BS_H5_N_PARAMS; + hsize_t dims[3]; + H5Z_filter_t filter; + + dcpl = 0; + s_id = 0; + ds_id = 0; + + ds_id = H5Dopen2(g_id, ds_name, H5P_DEFAULT); + if (ds_id < 0) { + char message[64]; + sprintf(message, "Error opening dataset %.32s", ds_name); + ERROR_JUMP(-1, done, message) + } + + s_id = H5Dget_space(ds_id); + if (s_id < 0) { + char message[64]; + sprintf(message, "Error opening dataspace for %.32s", ds_name); + ERROR_JUMP(-1, done, message); + } + + if (3 != H5Sget_simple_extent_ndims(s_id)) { + goto done; + } + + if (H5Sget_simple_extent_dims(s_id, dims, NULL) < 0) { + char message[80]; + sprintf(message, "Error retriving dataset dimensions for %.32s", ds_name); + ERROR_JUMP(-1, done, message); + } + + dcpl = H5Dget_create_plist(ds_id); + if (dcpl < 0) { + ERROR_JUMP(-1, done, "Error getting dataset creation property list"); + } + + /* check the chunk layout matches the layout we expect of + * [1, frame_size_y, frame_size_x] (1 frame == 1 chunk) */ + int cndims = H5Pget_chunk(dcpl, 3, cdims); + if (cndims != 3) { + goto done; + } + if (cdims[0] != 1 || cdims[1] != dims[1] || cdims[2] != dims[2]) { + goto done; + } + + /* check for potential filters - only the bitshuffle filter is supported */ + n_filters = H5Pget_nfilters(dcpl); + if (n_filters < 0) { + ERROR_JUMP(-1, done, "Error retrieving number of filters on dataset"); + } else if (n_filters > 1) { + goto done; + } + + if (n_filters == 1) { + filter = H5Pget_filter2(dcpl, 0, &filter_flags, + &cd_nelems, desc->bs_params, + name_len, filter_name, + &filter_config); + if (filter < 0) { + ERROR_JUMP(-1, done, "Error retrieving filter information"); + } + if (filter != BS_H5_FILTER_ID) { + goto done; + } + if (cd_nelems > BS_H5_N_PARAMS) { + char message[128]; + sprintf(message, + "More than expected number of parameters to bitshuffle filter - expected %d, was %lu", + BS_H5_N_PARAMS, cd_nelems); + ERROR_JUMP(-1, done, message); + } + desc->bs_applied = 1; + } else { + desc->bs_applied = 0; + } + + retval = 1; + +done: + if (dcpl) H5Pclose(dcpl); + if (s_id) H5Sclose(s_id); + if (ds_id) H5Dclose(ds_id); + return retval; +} + +int create_dataset_descriptor(struct ds_desc_t **desc, struct det_visit_objects_t *visit_result) { + int retval = 0; + hid_t g_id, ds_id; + int (*pxl_func)(const struct ds_desc_t*, double*, double*); + int (*pxl_mask_func)(const struct ds_desc_t*, int*); + int (*ds_prop_func)(struct ds_desc_t*); + int (*frame_func)(const struct ds_desc_t*, int, void*); + void (*free_func)(struct ds_desc_t*); + struct ds_desc_t *output; + + g_id = visit_result->nxdetector; /* determine the pixel information location */ - if (H5Lexists(data_desc->det_group_id, "x_pixel_size", H5P_DEFAULT) > 0 && - H5Lexists(data_desc->det_group_id, "y_pixel_size", H5P_DEFAULT)) { - data_desc->get_pixel_properties = &get_nxs_pixel_info; - } else if (H5Lexists(data_desc->det_group_id, "detectorSpecific", H5P_DEFAULT) > 0 && - H5Lexists(data_desc->det_group_id, "detectorSpecific/x_pixel_size", H5P_DEFAULT) > 0 && - H5Lexists(data_desc->det_group_id, "detectorSpecific/y_pixel_size", H5P_DEFAULT) > 0) { - data_desc->get_pixel_properties = &get_dectris_eiger_pixel_info; + if (H5Lexists(g_id, "x_pixel_size", H5P_DEFAULT) > 0 && + H5Lexists(g_id, "y_pixel_size", H5P_DEFAULT)) { + pxl_func = &get_nxs_pixel_info; + } else if (H5Lexists(g_id, "detectorSpecific", H5P_DEFAULT) > 0 && + H5Lexists(g_id, "detectorSpecific/x_pixel_size", H5P_DEFAULT) > 0 && + H5Lexists(g_id, "detectorSpecific/y_pixel_size", H5P_DEFAULT) > 0) { + pxl_func = &get_dectris_eiger_pixel_info; } else { - data_desc->get_pixel_properties = NULL; ERROR_JUMP(-1, done, "Could not locate x_pixel_size and y_pixel_size"); } /* determine pixel mask location */ - if (H5Lexists(data_desc->det_group_id, "pixel_mask", H5P_DEFAULT) > 0) { - data_desc->get_pixel_mask = &get_nxs_pixel_mask; - } else if (H5Lexists(data_desc->det_group_id, "detectorSpecific", H5P_DEFAULT) > 0 && - H5Lexists(data_desc->det_group_id, "detectorSpecific/pixel_mask", H5P_DEFAULT) > 0) { - data_desc->get_pixel_mask = &get_dectris_eiger_pixel_mask; + if (H5Lexists(g_id, "pixel_mask", H5P_DEFAULT) > 0) { + pxl_mask_func = &get_nxs_pixel_mask; + } else if (H5Lexists(g_id, "detectorSpecific", H5P_DEFAULT) > 0 && + H5Lexists(g_id, "detectorSpecific/pixel_mask", H5P_DEFAULT) > 0) { + pxl_mask_func = &get_dectris_eiger_pixel_mask; } else { - data_desc->get_pixel_mask = NULL; ERROR_JUMP(-1, done, "Could not locate pixel_mask"); } /* determine where the data is stored and what strategy to use */ /* we select the "dectris-eiger" strategy if both are valid due to - * potential confusion with the sizes of a virtual dataset (and possible - * failure opening it if the library version is not up to date) + * potential confusion with the sizes of a virtual dataset, possible failure + * opening if we're using an old library version, and the potential to use the + * optimised chunk read strategy */ if (H5Lexists(visit_result->nxdetector, "data_000001", H5P_DEFAULT) > 0) { - data_desc->data_group_id = visit_result->nxdetector; - data_desc->get_data_properties = &get_dectris_eiger_dataset_dims; - data_desc->get_data_frame = &get_dectris_eiger_frame; + ds_id = visit_result->nxdetector; + ds_prop_func = &get_dectris_eiger_dataset_dims; + frame_func = &get_dectris_eiger_frame; } else if (H5Lexists(visit_result->nxdetector, "data", H5P_DEFAULT) > 0) { - data_desc->data_group_id = visit_result->nxdetector; - data_desc->get_data_properties = &get_nxs_dataset_dims; - data_desc->get_data_frame = &get_nxs_frame; + ds_id = visit_result->nxdetector; + ds_prop_func = &get_nxs_dataset_dims; + frame_func = &get_nxs_frame; } else if (H5Lexists(visit_result->nxdata, "data_000001", H5P_DEFAULT) > 0) { - data_desc->data_group_id = visit_result->nxdata; - data_desc->get_data_properties = &get_dectris_eiger_dataset_dims; - data_desc->get_data_frame = &get_dectris_eiger_frame; + ds_id = visit_result->nxdata; + ds_prop_func = &get_dectris_eiger_dataset_dims; + frame_func = &get_dectris_eiger_frame; } else if (H5Lexists(visit_result->nxdata, "data", H5P_DEFAULT) > 0) { - data_desc->data_group_id = visit_result->nxdata; - data_desc->get_data_properties = &get_nxs_dataset_dims; - data_desc->get_data_frame = &get_nxs_frame; + ds_id = visit_result->nxdata; + ds_prop_func = &get_nxs_dataset_dims; + frame_func = &get_nxs_frame; } else { - data_desc->data_group_id = 0; - data_desc->get_data_properties = NULL; - data_desc->get_data_frame = NULL; ERROR_JUMP(-1, done, "Could not locate detector dataset"); } - if (data_desc->get_data_properties == &get_dectris_eiger_dataset_dims) { - /* setup the "extra eiger info" struct */ - struct eiger_data_description_t *eiger_desc = malloc(sizeof(*eiger_desc)); + + if (ds_prop_func == &get_dectris_eiger_dataset_dims) { + + /* setup the "extra info" structs */ + struct eiger_ds_desc_t *eiger_desc; + struct opt_eiger_ds_desc_t *o_eiger_desc; + + eiger_desc = malloc(sizeof(*eiger_desc)); if (!eiger_desc) { ERROR_JUMP(-1, done, "Memory error creating data description for Eiger"); } memset(eiger_desc, 0, sizeof(*eiger_desc)); - data_desc->extra = eiger_desc; - data_desc->free_extra = free_eiger_data_description; + eiger_desc->frame_func = &get_frame_simple; + + o_eiger_desc = malloc(sizeof(*o_eiger_desc)); + if (!o_eiger_desc) { + free(eiger_desc); + ERROR_JUMP(-1, done, "Memory error creating data description for optimised Eiger"); + } + o_eiger_desc->base.frame_func = &get_frame_from_chunk; + + /* check if we can perform the optimised chunk read */ + retval = check_for_chunk_read(ds_id, "data_000001", o_eiger_desc); + if (retval < 0) { + free(o_eiger_desc); + free(eiger_desc); + ERROR_JUMP(-1, done, ""); + } + if (retval) { + free(eiger_desc); + *(struct opt_eiger_ds_desc_t**) desc = o_eiger_desc; + free_func = &free_opt_eiger_desc; + } else { + free(o_eiger_desc); + *(struct eiger_ds_desc_t**) desc = eiger_desc; + free_func = &free_eiger_desc; + } + } else { - data_desc->free_extra = free_nxs_data_description; + *desc = malloc(sizeof(struct nxs_ds_desc_t)); + free_func = &free_nxs_desc; } + output = *((struct ds_desc_t **) desc); + output->det_g_id = g_id; + output->data_g_id = ds_id; + output->get_pixel_properties = pxl_func; + output->get_pixel_mask = pxl_mask_func; + output->get_data_frame = frame_func; + output->free_desc = free_func; + + ds_prop_func(output); + done: return retval; } -int extract_detector_info( +int get_detector_info( const hid_t fid, - struct data_description_t *data_desc, - struct dataset_properties_t *dataset_prop) { + struct ds_desc_t **desc) { + int retval = 0; herr_t err = 0; struct det_visit_objects_t objects = {0}; @@ -696,12 +914,11 @@ int extract_detector_info( fprintf(stderr, "WARNING: Could not locate an NXdetector entry\n"); } - if ((retval = fill_data_descriptor(data_desc, &objects)) < 0) { + if ((retval = create_dataset_descriptor(desc, &objects)) < 0) { ERROR_JUMP(retval, done, ""); }; - if ((retval = data_desc->get_data_properties(data_desc, dataset_prop)) < 0) { - ERROR_JUMP(retval, done, ""); - } + + done: return retval; } diff --git a/src/file.h b/src/file.h index 9e97116..a7e0ae9 100644 --- a/src/file.h +++ b/src/file.h @@ -9,43 +9,42 @@ #include #include "err.h" +#include "filters.h" -struct dataset_properties_t { - int data_width; + +struct ds_desc_t { + hid_t det_g_id; + hid_t data_g_id; hsize_t dims[3]; + int data_width; + int (*get_pixel_properties)(const struct ds_desc_t*, double*, double*); + int (*get_pixel_mask)(const struct ds_desc_t*, int*); + int (*get_data_frame)(const struct ds_desc_t*, const int, void*); + void (*free_desc)(struct ds_desc_t*); }; -struct data_description_t { - hid_t det_group_id; - hid_t data_group_id; - int (*get_pixel_properties)(const struct data_description_t*, double*, double*); - int (*get_pixel_mask)(const struct data_description_t*, int*); - int (*get_data_properties)(const struct data_description_t*, struct dataset_properties_t*); - int (*get_data_frame)(const struct data_description_t*, const struct dataset_properties_t*, int, int, void*); - void *extra; - void (*free_extra)(struct data_description_t*); +struct nxs_ds_desc_t { + struct ds_desc_t base; }; -void free_nxs_data_description(struct data_description_t *desc); - -struct eiger_data_description_t { +struct eiger_ds_desc_t { + struct ds_desc_t base; int n_data_blocks; int *block_sizes; + int (*frame_func)(const struct ds_desc_t*, const char*, const hsize_t*, const hsize_t*, void*); }; -void free_eiger_data_description(struct data_description_t *desc); +struct opt_eiger_ds_desc_t { + struct eiger_ds_desc_t base; + int bs_applied; + unsigned int bs_params[BS_H5_N_PARAMS]; +}; + +int get_detector_info(const hid_t fid, struct ds_desc_t **desc); struct det_visit_objects_t { hid_t nxdata; hid_t nxdetector; }; -void clear_det_visit_objects(struct det_visit_objects_t *objects); - -int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties); - -int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_objects_t *visit_result); - -int extract_detector_info(const hid_t fid, struct data_description_t *data_desc, struct dataset_properties_t *ds_prop); - #endif /* NXS_XDS_FILE_H */ diff --git a/src/filters.c b/src/filters.c new file mode 100644 index 0000000..49369df --- /dev/null +++ b/src/filters.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Diamond Light Source Ltd. + * Author: Charles Mita + */ + +#include +#include "filters.h" +#include "err.h" +#include "bitshuffle.h" + +/* Required prototypes from bitshuffle.c but not included in header */ +uint64_t bshuf_read_uint64_BE(const void *buffer); +uint32_t bshuf_read_uint32_BE(const void *buffer); + + +/* + * Derived from the h5 filter code from the bitshuffle project (not included here) + */ +int bslz4_decompress( + const unsigned int* bs_params, + size_t in_size, + void *in_buffer, + size_t out_size, + void *out_buffer) { + + int retval = 0; + size_t size, elem_size, block_size, u_bytes; + + elem_size = bs_params[2]; + u_bytes = bshuf_read_uint64_BE(in_buffer); + + if (u_bytes != out_size) { + char message[64]; + sprintf(message, "Decompressed chunk is %lu bytes, expected %lu", u_bytes, out_size); + ERROR_JUMP(-1, done, message); + } + + block_size = bshuf_read_uint32_BE((const char *) in_buffer + 8) / elem_size; + if (!block_size) { + ERROR_JUMP(-1, done, "Read block bitshuffle lz4 block size as 0"); + } + /* skip over header */ + in_buffer += 12; + size = u_bytes / elem_size; + + if (bs_params[4] == BS_H5_PARAM_LZ4_COMPRESS) { + if (bshuf_decompress_lz4(in_buffer, out_buffer, size, elem_size, block_size) < 0) { + ERROR_JUMP(-1, done, "Error performing bitshuffle_lz4 decompression"); + } + } else { + if (bshuf_bitunshuffle(in_buffer, out_buffer, size, elem_size, block_size) < 0) { + ERROR_JUMP(-1, done, "Error performing bit unshuffle"); + } + } + +done: + return retval; +} diff --git a/src/filters.h b/src/filters.h new file mode 100644 index 0000000..caa9a37 --- /dev/null +++ b/src/filters.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018 Diamond Light Source Ltd. + * Author: Charles Mita + */ + +#ifndef NXS_XDS_FILTER_H +#define NXS_XDS_FILTER_H + +#define BS_H5_N_PARAMS 5 +#define BS_H5_FILTER_ID 32008 +#define BS_H5_PARAM_LZ4_COMPRESS 2 + + + +int bslz4_decompress( + const unsigned int* bs_params, + size_t in_size, + void *in_buffer, + size_t out_size, + void *out_buffer); + +#endif /* NXS_XDS_FILTER_H */ diff --git a/src/plugin.c b/src/plugin.c index 44d618c..ef5be54 100644 --- a/src/plugin.c +++ b/src/plugin.c @@ -7,6 +7,7 @@ #include #include #include "file.h" +#include "filters.h" #include "plugin.h" @@ -15,9 +16,40 @@ #define ERROR_OUTPUT stderr + +/* mask bits loosely based on what Neggia does and what NeXus says should be done */ +/* basically - anything in the low byte (& 0xFF) means "ignore this" */ +/* Neggia usses the value -2 if bit 1, 2 or 3 are set */ +#define COPY_AND_MASK(in, out, size, mask) \ +{ \ + int i; \ + if (mask) { \ + for (i = 0; i < size; ++i) { \ + out[i] = in[i]; \ + if (mask[i] & 0xFF) out[i] = -1; \ + if (mask[i] & 30) out[i] = -2; \ + } \ + } else { \ + for (i = 0; i < size; i++) { \ + out[i] = in[i]; \ + } \ + } \ +} + +#define APPLY_MASK(buffer, mask, size) \ +{ \ + int i; \ + if (mask) { \ + for (i = 0; i < size; ++i) { \ + if (mask[i] & 0xFF) buffer[i] = -1; \ + if (mask[i] & 30) buffer[i] = -2; \ + } \ + } \ +} + + static hid_t file_id = 0; -static struct data_description_t data_desc = {0}; -static struct dataset_properties_t ds_prop = {0}; +static struct ds_desc_t *data_desc = NULL; static int *mask_buffer = NULL; @@ -29,21 +61,35 @@ void fill_info_array(int info[1024]) { info[4] = VERSION_TIMESTAMP; } -void apply_mask(int *data, int *mask, int size) { - int *dptr, *mptr; - dptr = data; - mptr = mask; - while (dptr < data + size && mptr < mask + size) { - /* mask bits loosely based on what Neggia does and what NeXus says should be done */ - /* basically - anything in the low byte (& 0xFF) means "ignore this" */ - if (*mptr & 0x01) *dptr = -1; - if (*mptr & 0xFE) *dptr = -2; - dptr++; - mptr++; +int convert_to_int_and_mask(void *in_buffer, int d_width, int *out_buffer, int length, int *mask) { + /* transfer data to output buffer, performing data conversion as required */ + int retval = 0; + /* TODO: decide how conversion of data should work */ + /* Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */ + if (d_width == sizeof(signed char)) { + signed char *in = in_buffer; + COPY_AND_MASK(in, out_buffer, length, mask); + } else if (d_width == sizeof(short)) { + short *in = in_buffer; + COPY_AND_MASK(in, out_buffer, length, mask); + } else if (d_width == sizeof(int)) { + int *in = in_buffer; + COPY_AND_MASK(in, out_buffer, length, mask); + } else if (d_width == sizeof(long int)) { + long int *in = in_buffer; + COPY_AND_MASK(in, out_buffer, length, mask); + } else if (d_width == sizeof(long long int)) { + long long int *in = in_buffer; + COPY_AND_MASK(in, out_buffer, length, mask); + } else { + char message[128]; + sprintf(message, "Unsupported conversion of data width %d to %ld (int)", d_width, sizeof(int)); + ERROR_JUMP(-1, done, message); } +done: + return retval; } - #ifdef __cplusplus extern "C" { #endif @@ -74,14 +120,14 @@ void plugin_open( } reset_error_stack(); - retval = extract_detector_info(file_id, &data_desc, &ds_prop); + retval = get_detector_info(file_id, &data_desc); if (retval < 0) { ERROR_JUMP(-4, done, ""); } - mask_buffer = malloc(ds_prop.dims[1] * ds_prop.dims[2] * sizeof(int)); + mask_buffer = malloc(data_desc->dims[1] * data_desc->dims[2] * sizeof(int)); if (mask_buffer) { - retval = data_desc.get_pixel_mask(&data_desc, mask_buffer); + retval = data_desc->get_pixel_mask(data_desc, mask_buffer); if (retval < 0) { fprintf(ERROR_OUTPUT, "WARNING: Could not read pixel mask - no masking will be applied\n"); dump_error_stack(ERROR_OUTPUT); @@ -94,6 +140,10 @@ void plugin_open( done: *error_flag = retval; if (retval < 0) { + if ((data_desc) && (data_desc->free_desc)) { + data_desc->free_desc(data_desc); + data_desc = NULL; + } dump_error_stack(ERROR_OUTPUT); } } @@ -112,15 +162,15 @@ void plugin_get_header( reset_error_stack(); fill_info_array(info); - err = data_desc.get_pixel_properties(&data_desc, &x_pixel_size, &y_pixel_size); + err = data_desc->get_pixel_properties(data_desc, &x_pixel_size, &y_pixel_size); if (err < 0) { ERROR_JUMP(err, done, "Failed to retrieve pixel information"); } - *nx = ds_prop.dims[2]; - *ny = ds_prop.dims[1]; - *nbytes = ds_prop.data_width; - *number_of_frames = ds_prop.dims[0]; + *nx = data_desc->dims[2]; + *ny = data_desc->dims[1]; + *nbytes = data_desc->data_width; + *number_of_frames = data_desc->dims[0]; *qx = (float) x_pixel_size; *qy = (float) y_pixel_size; @@ -138,25 +188,36 @@ void plugin_get_data( int *data_array, int info[1024], int *error_flag) { - int retval = 0, ij; + + int retval = 0; + int frame_size_px = data_desc->dims[1] * data_desc->dims[2]; reset_error_stack(); fill_info_array(info); - if (data_desc.get_data_frame(&data_desc, &ds_prop, (*frame_number) - 1, sizeof(int), data_array) < 0) { + + void *buffer = NULL; + if (sizeof(*data_array) == data_desc->data_width) { + buffer = data_array; + } else { + buffer = malloc(data_desc->data_width * frame_size_px); + if (!buffer) { + ERROR_JUMP(-1, done, "Unable to allocate data buffer"); + } + } + + if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, buffer) < 0) { char message[64] = {0}; sprintf(message, "Failed to retrieve data for frame %d", *frame_number); ERROR_JUMP(-2, done, message); } - // nasty hack - for (ij = 0; ij < ds_prop.dims[1] * ds_prop.dims[2]; ij++) { - if (data_array[ij] == 0xffff) { - data_array[ij] = -2; - } - } - - - if (mask_buffer) { - apply_mask(data_array, mask_buffer, ds_prop.dims[1] * ds_prop.dims[2]); + if (buffer != data_array) { + if (convert_to_int_and_mask(buffer, data_desc->data_width, data_array, frame_size_px, mask_buffer) < 0) { + char message[64]; + sprintf(message, "Error converting data for frame %d", *frame_number); + ERROR_JUMP(-2, done, message); + } + } else { + APPLY_MASK(data_array, mask_buffer, frame_size_px); } done: @@ -164,6 +225,7 @@ done: if (retval < 0) { dump_error_stack(ERROR_OUTPUT); } + if (buffer && (buffer != data_array)) free(buffer); } @@ -177,7 +239,10 @@ void plugin_close(int *error_flag) { file_id = 0; if (mask_buffer) free(mask_buffer); - if (data_desc.free_extra) data_desc.free_extra(&data_desc); + if (data_desc->free_desc) { + data_desc->free_desc(data_desc); + data_desc = NULL; + } if (H5close() < 0) { *error_flag = -1; } diff --git a/src/test.c b/src/test.c index ec6d669..68a7ef9 100644 --- a/src/test.c +++ b/src/test.c @@ -4,20 +4,22 @@ #include "file.h" #include "err.h" - -void apply_mask(int *data, int *mask, int size) { - int *dptr, *mptr; - dptr = data; - mptr = mask; - while (dptr < data + size && mptr < mask + size) { - if (*mptr & 0x01) *dptr = -1; - if (*mptr & 0xFE) *dptr = -2; - dptr++; - mptr++; - } +#define COPY_AND_MASK(in, out, size, mask) \ +{ \ + int i; \ + if (mask) { \ + for (i = 0; i < size; ++i) { \ + out[i] = in[i]; \ + if (mask[i] & 0xFE) out[i] = -2; \ + if (mask[i] & 0x01) out[i] = -1; \ + } \ + } else { \ + for (i = 0; i < size; i++) { \ + out[i] = in[i]; \ + } \ + } \ } - int parse_args(int argc, char **argv, char **file_name, int *frame_idx) { int retval = 0; if (argc == 2) { @@ -38,13 +40,13 @@ int main(int argc, char **argv) { int err = 0; int retval = 0; char *test_file = ""; - struct data_description_t desc = {0}; - struct dataset_properties_t prop = {0}; + struct ds_desc_t *desc; int dims[3] = {0}; hid_t fid = 0; int frame_idx = 0; int *mask = NULL; int *data = NULL; + void *buffer = NULL; init_error_handling(); if (init_h5_error_handling() < 0) { @@ -58,13 +60,13 @@ int main(int argc, char **argv) { fid = H5Fopen(test_file, H5F_ACC_RDONLY, H5P_DEFAULT); if (fid < 0) ERROR_JUMP(-1, done, "Error opening file"); - err = extract_detector_info(fid, &desc, &prop); + err = get_detector_info(fid, &desc); if (err < 0) { ERROR_JUMP(err, done, ""); } - dims[0] = prop.dims[0]; - dims[1] = prop.dims[1]; - dims[2] = prop.dims[2]; + dims[0] = desc->dims[0]; + dims[1] = desc->dims[1]; + dims[2] = desc->dims[2]; printf("Dims: %d, %d, %d\n", dims[0], dims[1], dims[2]); @@ -72,18 +74,42 @@ int main(int argc, char **argv) { if (!mask) { ERROR_JUMP(err, done, "Failed to allocate space for pixel mask"); } - err = desc.get_pixel_mask(&desc, mask); + err = desc->get_pixel_mask(desc, mask); if (err < 0) { ERROR_JUMP(err, done, ""); } data = malloc(dims[1] * dims[2] * sizeof(*data)); - err = desc.get_data_frame(&desc, &prop, frame_idx, sizeof(*data), data); + if (sizeof(*data) != desc->data_width) { + buffer = malloc(dims[1] * dims[2] * desc->data_width); + } else { + buffer = data; + } + + err = desc->get_data_frame(desc, frame_idx, buffer); if (err < 0) { ERROR_JUMP(err, done, ""); } - apply_mask(data, mask, dims[1] * dims[2]); + if (buffer != data) { + if (desc->data_width == sizeof(signed char)) { + signed char *in = buffer; + COPY_AND_MASK(in, data, dims[1] * dims[2], mask); + } else if (desc->data_width == sizeof(short)) { + short *in = buffer; + COPY_AND_MASK(in, data, dims[1] * dims[2], mask); + } else if (desc->data_width == sizeof(int)) { + int *in = buffer; + COPY_AND_MASK(in, data, dims[1] * dims[2], mask); + } else if (desc->data_width == sizeof(long int)) { + long int *in = buffer; + COPY_AND_MASK(in, data, dims[1] * dims[2], mask); + } else if (desc->data_width == sizeof(long long int)) { + long long int *in = buffer; + COPY_AND_MASK(in, data, dims[1] * dims[2], mask); + } + } + { int i, j; int max_i = 30; @@ -101,6 +127,7 @@ int main(int argc, char **argv) { done: if (fid > 0) H5Fclose(fid); if (data) free(data); + if (buffer && (data != buffer)) free(buffer); if (mask) free(mask); if (retval != 0) dump_error_stack(stderr); return retval;