From 369948795be056fbe02c0ddfc8780e0437803f71 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Wed, 15 Aug 2018 16:45:09 +0100
Subject: [PATCH 1/8] Add the bitshuffle-lz4 code and add to the build.

Source for the bitshuffle code is:
https://github.com/kiyo-masui/bitshuffle

Release Tag: 0.3.4

Commit at time of copy:
9ffba9de83036a91d345fa2f62fcaedf55f54c5f

The LZF and HDF5 plugin parts are not included as they are
not required.
---
 Makefile                         |   19 +-
 bslz4/LICENSE                    |   21 +
 bslz4/src/bitshuffle.c           |  165 ++++
 bslz4/src/bitshuffle.h           |  123 +++
 bslz4/src/bitshuffle_core.c      | 1333 ++++++++++++++++++++++++++
 bslz4/src/bitshuffle_core.h      |  156 +++
 bslz4/src/bitshuffle_internals.h |   75 ++
 bslz4/src/iochain.c              |   90 ++
 bslz4/src/iochain.h              |   94 ++
 bslz4/src/lz4.c                  | 1516 ++++++++++++++++++++++++++++++
 bslz4/src/lz4.h                  |  360 +++++++
 11 files changed, 3949 insertions(+), 3 deletions(-)
 create mode 100644 bslz4/LICENSE
 create mode 100644 bslz4/src/bitshuffle.c
 create mode 100644 bslz4/src/bitshuffle.h
 create mode 100644 bslz4/src/bitshuffle_core.c
 create mode 100644 bslz4/src/bitshuffle_core.h
 create mode 100644 bslz4/src/bitshuffle_internals.h
 create mode 100644 bslz4/src/iochain.c
 create mode 100644 bslz4/src/iochain.h
 create mode 100644 bslz4/src/lz4.c
 create mode 100644 bslz4/src/lz4.h

diff --git a/Makefile b/Makefile
index 9222a9a..9e80aa4 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,12 @@ SRC_DIR = ./src
 TEST_DIR = ./test
 INC_DIR = $(SRC_DIR)
 
+BSLZ4_SRC_DIR = ./bslz4/src
+BSLZ4_BUILD_DIR = ./bslz4/build
+BSLZ4_INC_DIR = $(BSLZ4_SRC_DIR)
+
 CC=h5cc
-CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR)
+CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR)
 
 .PHONY: all
 all: plugin example test_plugin
@@ -26,11 +30,20 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c
 	mkdir -p $(BUILD_DIR)
 	$(CC) $(CFLAGS) -c $< -o $@
 
-$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o
+$(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c
+	mkdir -p $(BSLZ4_BUILD_DIR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \
+$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
+	mkdir -p $(BUILD_DIR)
+	ar rcs $@ $^
+
+$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/bslz4.a
 	mkdir -p $(BUILD_DIR)
 	$(CC) $(CFLAGS) -shared $^ -o $(BUILD_DIR)/durin-plugin.so
 
-$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o
+$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/bslz4.a
 	mkdir -p $(BUILD_DIR)
 	$(CC) $(CFLAGS) $^ -o $(BUILD_DIR)/example
 
diff --git a/bslz4/LICENSE b/bslz4/LICENSE
new file mode 100644
index 0000000..1365ed6
--- /dev/null
+++ b/bslz4/LICENSE
@@ -0,0 +1,21 @@
+Bitshuffle - Filter for improving compression of typed binary data.
+
+Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/bslz4/src/bitshuffle.c b/bslz4/src/bitshuffle.c
new file mode 100644
index 0000000..54ff045
--- /dev/null
+++ b/bslz4/src/bitshuffle.c
@@ -0,0 +1,165 @@
+/*
+ * Bitshuffle - Filter for improving compression of typed binary data.
+ *
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ */
+
+#include "bitshuffle.h"
+#include "bitshuffle_core.h"
+#include "bitshuffle_internals.h"
+#include "lz4.h"
+
+#include <stdio.h>
+#include <string.h>
+
+
+// Constants.
+// Use fast decompression instead of safe decompression for LZ4.
+#define BSHUF_LZ4_DECOMPRESS_FAST
+
+
+// Macros.
+#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) {                      \
+    free(buf); return count - 1000; }
+
+
+/* Bitshuffle and compress a single block. */
+int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
+        const size_t size, const size_t elem_size) {
+
+    int64_t nbytes, count;
+    void *tmp_buf_bshuf;
+    void *tmp_buf_lz4;
+    size_t this_iter;
+    const void *in;
+    void *out;
+
+    tmp_buf_bshuf = malloc(size * elem_size);
+    if (tmp_buf_bshuf == NULL) return -1;
+
+    tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
+    if (tmp_buf_lz4 == NULL){
+        free(tmp_buf_bshuf);
+        return -1;
+    }
+
+
+    in = ioc_get_in(C_ptr, &this_iter);
+    ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
+
+    count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
+    if (count < 0) {
+        free(tmp_buf_lz4);
+        free(tmp_buf_bshuf);
+        return count;
+    }
+    nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
+    free(tmp_buf_bshuf);
+    CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);
+
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
+
+    bshuf_write_uint32_BE(out, nbytes);
+    memcpy((char *) out + 4, tmp_buf_lz4, nbytes);
+
+    free(tmp_buf_lz4);
+
+    return nbytes + 4;
+}
+
+
+/* Decompress and bitunshuffle a single block. */
+int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
+        const size_t size, const size_t elem_size) {
+
+    int64_t nbytes, count;
+    void *out, *tmp_buf;
+    const void *in;
+    size_t this_iter;
+    int32_t nbytes_from_header;
+
+    in = ioc_get_in(C_ptr, &this_iter);
+    nbytes_from_header = bshuf_read_uint32_BE(in);
+    ioc_set_next_in(C_ptr, &this_iter,
+            (void*) ((char*) in + nbytes_from_header + 4));
+
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter,
+            (void *) ((char *) out + size * elem_size));
+
+    tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+#ifdef BSHUF_LZ4_DECOMPRESS_FAST
+    nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
+    CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
+    if (nbytes != nbytes_from_header) {
+        free(tmp_buf);
+        return -91;
+    }
+#else
+    nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
+                                 size * elem_size);
+    CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
+    if (nbytes != size * elem_size) {
+        free(tmp_buf);
+        return -91;
+    }
+    nbytes = nbytes_from_header;
+#endif
+    count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    nbytes += 4;
+
+    free(tmp_buf);
+    return nbytes;
+}
+
+
+/* ---- Public functions ----
+ *
+ * See header file for description and usage.
+ *
+ */
+
+size_t bshuf_compress_lz4_bound(const size_t size,
+        const size_t elem_size, size_t block_size) {
+
+    size_t bound, leftover;
+
+    if (block_size == 0) {
+        block_size = bshuf_default_block_size(elem_size);
+    }
+    if (block_size % BSHUF_BLOCKED_MULT) return -81;
+
+    // Note that each block gets a 4 byte header.
+    // Size of full blocks.
+    bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
+    // Size of partial blocks, if any.
+    leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
+    if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
+    // Size of uncompressed data not fitting into any blocks.
+    bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
+    return bound;
+}
+
+
+int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size) {
+    return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
+            elem_size, block_size);
+}
+
+
+int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size) {
+    return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
+            elem_size, block_size);
+}
+
diff --git a/bslz4/src/bitshuffle.h b/bslz4/src/bitshuffle.h
new file mode 100644
index 0000000..3df95f4
--- /dev/null
+++ b/bslz4/src/bitshuffle.h
@@ -0,0 +1,123 @@
+/*
+ * Bitshuffle - Filter for improving compression of typed binary data.
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ *
+ * Header File
+ *
+ * Worker routines return an int64_t which is the number of bytes processed
+ * if positive or an error code if negative.
+ *
+ * Error codes:
+ *      -1    : Failed to allocate memory.
+ *      -11   : Missing SSE.
+ *      -12   : Missing AVX.
+ *      -80   : Input size not a multiple of 8.
+ *      -81   : block_size not multiple of 8.
+ *      -91   : Decompression error, wrong number of bytes processed.
+ *      -1YYY : Error internal to compression routine with error code -YYY.
+ */
+
+
+#ifndef BITSHUFFLE_H
+#define BITSHUFFLE_H
+
+#include <stdlib.h>
+#include "bitshuffle_core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ---- bshuf_compress_lz4_bound ----
+ *
+ * Bound on size of data compressed with *bshuf_compress_lz4*.
+ *
+ * Parameters
+ * ----------
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  Bound on compressed data size.
+ *
+ */
+size_t bshuf_compress_lz4_bound(const size_t size,
+        const size_t elem_size, size_t block_size);
+
+
+/* ---- bshuf_compress_lz4 ----
+ *
+ * Bitshuffled and compress the data using LZ4.
+ *
+ * Transpose within elements, in blocks of data of *block_size* elements then
+ * compress the blocks using LZ4.  In the output buffer, each block is prefixed
+ * by a 4 byte integer giving the compressed size of that block.
+ *
+ * Output buffer must be large enough to hold the compressed data.  This could
+ * be in principle substantially larger than the input buffer.  Use the routine
+ * *bshuf_compress_lz4_bound* to get an upper limit.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be large enough to hold data.
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  number of bytes used in output buffer, negative error-code if failed.
+ *
+ */
+int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t
+        elem_size, size_t block_size);
+
+
+/* ---- bshuf_decompress_lz4 ----
+ *
+ * Undo compression and bitshuffling.
+ *
+ * Decompress data then un-bitshuffle it in blocks of *block_size* elements.
+ *
+ * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
+ * must patch the parameters used to compress the data.
+ *
+ * NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
+ * LZ4_decompress_fast from LZ4, which does not protect against maliciously
+ * formed datasets. By modifying the compressed data, this function could be
+ * coerced into leaving the boundaries of the input buffer.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  number of bytes consumed in *input* buffer, negative error-code if failed.
+ *
+ */
+int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif  // BITSHUFFLE_H
diff --git a/bslz4/src/bitshuffle_core.c b/bslz4/src/bitshuffle_core.c
new file mode 100644
index 0000000..583e4fe
--- /dev/null
+++ b/bslz4/src/bitshuffle_core.c
@@ -0,0 +1,1333 @@
+/*
+ * Bitshuffle - Filter for improving compression of typed binary data.
+ *
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ */
+
+#include "bitshuffle_core.h"
+#include "bitshuffle_internals.h"
+
+#include <stdio.h>
+#include <string.h>
+
+
+#if defined(__AVX2__) && defined (__SSE2__)
+#define USEAVX2
+#endif
+
+#if defined(__SSE2__)
+#define USESSE2
+#endif
+
+
+// Conditional includes for SSE2 and AVX2.
+#ifdef USEAVX2
+#include <immintrin.h>
+#elif defined USESSE2
+#include <emmintrin.h>
+#endif
+
+
+// Macros.
+#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
+
+/* ---- Functions indicating compile time instruction set. ---- */
+
+int bshuf_using_SSE2(void) {
+#ifdef USESSE2
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+int bshuf_using_AVX2(void) {
+#ifdef USEAVX2
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+/* ---- Worker code not requiring special instruction sets. ----
+ *
+ * The following code does not use any x86 specific vectorized instructions
+ * and should compile on any machine
+ *
+ */
+
+/* Transpose 8x8 bit array packed into a single quadword *x*.
+ * *t* is workspace. */
+#define TRANS_BIT_8X8(x, t) {                                               \
+        t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL;                          \
+        x = x ^ t ^ (t << 7);                                               \
+        t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL;                         \
+        x = x ^ t ^ (t << 14);                                              \
+        t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL;                         \
+        x = x ^ t ^ (t << 28);                                              \
+    }
+
+/* Transpose 8x8 bit array along the diagonal from upper right
+   to lower left */
+#define TRANS_BIT_8X8_BE(x, t) {                                            \
+        t = (x ^ (x >> 9)) & 0x0055005500550055LL;                          \
+        x = x ^ t ^ (t << 9);                                               \
+        t = (x ^ (x >> 18)) & 0x0000333300003333LL;                         \
+        x = x ^ t ^ (t << 18);                                              \
+        t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL;                         \
+        x = x ^ t ^ (t << 36);                                              \
+    }
+
+/* Transpose of an array of arbitrarily typed elements. */
+#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) {                        \
+        size_t ii, jj, kk;                                                  \
+        const type_t* in_type = (const type_t*) in;                                 \
+        type_t* out_type = (type_t*) out;                                   \
+        for(ii = 0; ii + 7 < lda; ii += 8) {                                \
+            for(jj = 0; jj < ldb; jj++) {                                   \
+                for(kk = 0; kk < 8; kk++) {                                 \
+                    out_type[jj*lda + ii + kk] =                            \
+                        in_type[ii*ldb + kk * ldb + jj];                    \
+                }                                                           \
+            }                                                               \
+        }                                                                   \
+        for(ii = lda - lda % 8; ii < lda; ii ++) {                          \
+            for(jj = 0; jj < ldb; jj++) {                                   \
+                out_type[jj*lda + ii] = in_type[ii*ldb + jj];                            \
+            }                                                               \
+        }                                                                   \
+    }
+
+
+/* Memory copy with bshuf call signature. For testing and profiling. */
+int64_t bshuf_copy(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    memcpy(out_b, in_b, size * elem_size);
+    return size * elem_size;
+}
+
+
+/* Transpose bytes within elements, starting partway through input. */
+int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size,
+         const size_t elem_size, const size_t start) {
+
+    size_t ii, jj, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    CHECK_MULT_EIGHT(start);
+
+    if (size > start) {
+        // ii loop separated into 2 loops so the compiler can unroll
+        // the inner one.
+        for (ii = start; ii + 7 < size; ii += 8) {
+            for (jj = 0; jj < elem_size; jj++) {
+                for (kk = 0; kk < 8; kk++) {
+                    out_b[jj * size + ii + kk]
+                        = in_b[ii * elem_size + kk * elem_size + jj];
+                }
+            }
+        }
+        for (ii = size - size % 8; ii < size; ii ++) {
+            for (jj = 0; jj < elem_size; jj++) {
+                out_b[jj * size + ii] = in_b[ii * elem_size + jj];
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Transpose bytes within elements. */
+int64_t bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0);
+}
+
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size,
+         const size_t elem_size, const size_t start_byte) {
+
+    const uint64_t* in_b = (const uint64_t*) in;
+    uint8_t* out_b = (uint8_t*) out;
+
+    uint64_t x, t;
+
+    size_t ii, kk;
+    size_t nbyte = elem_size * size;
+    size_t nbyte_bitrow = nbyte / 8;
+
+    uint64_t e=1;
+    const int little_endian = *(uint8_t *) &e == 1;
+    const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow;
+    const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow;
+
+    CHECK_MULT_EIGHT(nbyte);
+    CHECK_MULT_EIGHT(start_byte);
+
+    for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) {
+        x = in_b[ii];
+        if (little_endian) {
+            TRANS_BIT_8X8(x, t);
+        } else {
+            TRANS_BIT_8X8_BE(x, t);
+        }
+        for (kk = 0; kk < 8; kk ++) {
+            out_b[bit_row_offset + kk * bit_row_skip + ii] = x;
+            x = x >> 8;
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0);
+}
+
+
+/* General transpose of an array, optimized for large element sizes. */
+int64_t bshuf_trans_elem(const void* in, void* out, const size_t lda,
+        const size_t ldb, const size_t elem_size) {
+
+    size_t ii, jj;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    for(ii = 0; ii < lda; ii++) {
+        for(jj = 0; jj < ldb; jj++) {
+            memcpy(&out_b[(jj*lda + ii) * elem_size],
+                   &in_b[(ii*ldb + jj) * elem_size], elem_size);
+        }
+    }
+    return lda * ldb * elem_size;
+}
+
+
+/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */
+int64_t bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t nbyte_bitrow = size / 8;
+
+    CHECK_MULT_EIGHT(size);
+
+    return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow);
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+    void *tmp_buf;
+
+    CHECK_MULT_EIGHT(size);
+
+    tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+
+/* For data organized into a row for each bit (8 * elem_size rows), transpose
+ * the bytes. */
+int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    size_t ii, jj, kk, nbyte_row;
+    const char *in_b;
+    char *out_b;
+
+
+    in_b = (const char*) in;
+    out_b = (char*) out;
+
+    nbyte_row = size / 8;
+
+    CHECK_MULT_EIGHT(size);
+
+    for (jj = 0; jj < elem_size; jj++) {
+        for (ii = 0; ii < nbyte_row; ii++) {
+            for (kk = 0; kk < 8; kk++) {
+                out_b[ii * 8 * elem_size + jj * 8 + kk] = \
+                        in_b[(jj * 8 + kk) * nbyte_row + ii];
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_scal(const void* in, void* out, \
+        const size_t size, const size_t elem_size) {
+
+    const char *in_b;
+    char *out_b;
+    uint64_t x, t;
+    size_t ii, jj, kk;
+    size_t nbyte, out_index;
+
+    uint64_t e=1;
+    const int little_endian = *(uint8_t *) &e == 1;
+    const size_t elem_skip = little_endian ? elem_size : -elem_size;
+    const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size;
+
+    CHECK_MULT_EIGHT(size);
+
+    in_b = (const char*) in;
+    out_b = (char*) out;
+
+    nbyte = elem_size * size;
+
+    for (jj = 0; jj < 8 * elem_size; jj += 8) {
+        for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
+            x = *((uint64_t*) &in_b[ii + jj]);
+            if (little_endian) {
+                TRANS_BIT_8X8(x, t);
+            } else {
+                TRANS_BIT_8X8_BE(x, t);
+            }
+            for (kk = 0; kk < 8; kk++) {
+                out_index = ii + jj / 8 + elem_offset + kk * elem_skip;
+                *((uint8_t*) &out_b[out_index]) = x;
+                x = x >> 8;
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+    void *tmp_buf;
+
+    CHECK_MULT_EIGHT(size);
+
+    tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+
+/* ---- Worker code that uses SSE2 ----
+ *
+ * The following code makes use of the SSE2 instruction set and specialized
+ * 16 byte registers. The SSE2 instructions are present on modern x86 
+ * processors. The first Intel processor microarchitecture supporting SSE2 was
+ * Pentium 4 (2000).
+ *
+ */
+
+#ifdef USESSE2
+
+/* Transpose bytes within elements for 16 bit elements. */
+int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char *in_b = (const char*) in;
+    char *out_b = (char*) out;
+    __m128i a0, b0, a1, b1;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
+        b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);
+
+        a1 = _mm_unpacklo_epi8(a0, b0);
+        b1 = _mm_unpackhi_epi8(a0, b0);
+
+        a0 = _mm_unpacklo_epi8(a1, b1);
+        b0 = _mm_unpackhi_epi8(a1, b1);
+
+        a1 = _mm_unpacklo_epi8(a0, b0);
+        b1 = _mm_unpackhi_epi8(a0, b0);
+
+        a0 = _mm_unpacklo_epi8(a1, b1);
+        b0 = _mm_unpackhi_epi8(a1, b1);
+
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
+    }
+    return bshuf_trans_byte_elem_remainder(in, out, size, 2,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements for 32 bit elements. */
+int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char *in_b;
+    char *out_b;
+    in_b = (const char*) in;
+    out_b = (char*) out;
+    __m128i a0, b0, c0, d0, a1, b1, c1, d1;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
+        b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
+        c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
+        d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);
+
+        a1 = _mm_unpacklo_epi8(a0, b0);
+        b1 = _mm_unpackhi_epi8(a0, b0);
+        c1 = _mm_unpacklo_epi8(c0, d0);
+        d1 = _mm_unpackhi_epi8(c0, d0);
+
+        a0 = _mm_unpacklo_epi8(a1, b1);
+        b0 = _mm_unpackhi_epi8(a1, b1);
+        c0 = _mm_unpacklo_epi8(c1, d1);
+        d0 = _mm_unpackhi_epi8(c1, d1);
+
+        a1 = _mm_unpacklo_epi8(a0, b0);
+        b1 = _mm_unpackhi_epi8(a0, b0);
+        c1 = _mm_unpacklo_epi8(c0, d0);
+        d1 = _mm_unpackhi_epi8(c0, d0);
+
+        a0 = _mm_unpacklo_epi64(a1, c1);
+        b0 = _mm_unpackhi_epi64(a1, c1);
+        c0 = _mm_unpacklo_epi64(b1, d1);
+        d0 = _mm_unpackhi_epi64(b1, d1);
+
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
+        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
+        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
+    }
+    return bshuf_trans_byte_elem_remainder(in, out, size, 4,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements for 64 bit elements. */
+int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
+    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
+        b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
+        c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
+        d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
+        e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
+        f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
+        g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
+        h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);
+
+        a1 = _mm_unpacklo_epi8(a0, b0);
+        b1 = _mm_unpackhi_epi8(a0, b0);
+        c1 = _mm_unpacklo_epi8(c0, d0);
+        d1 = _mm_unpackhi_epi8(c0, d0);
+        e1 = _mm_unpacklo_epi8(e0, f0);
+        f1 = _mm_unpackhi_epi8(e0, f0);
+        g1 = _mm_unpacklo_epi8(g0, h0);
+        h1 = _mm_unpackhi_epi8(g0, h0);
+
+        a0 = _mm_unpacklo_epi8(a1, b1);
+        b0 = _mm_unpackhi_epi8(a1, b1);
+        c0 = _mm_unpacklo_epi8(c1, d1);
+        d0 = _mm_unpackhi_epi8(c1, d1);
+        e0 = _mm_unpacklo_epi8(e1, f1);
+        f0 = _mm_unpackhi_epi8(e1, f1);
+        g0 = _mm_unpacklo_epi8(g1, h1);
+        h0 = _mm_unpackhi_epi8(g1, h1);
+
+        a1 = _mm_unpacklo_epi32(a0, c0);
+        b1 = _mm_unpackhi_epi32(a0, c0);
+        c1 = _mm_unpacklo_epi32(b0, d0);
+        d1 = _mm_unpackhi_epi32(b0, d0);
+        e1 = _mm_unpacklo_epi32(e0, g0);
+        f1 = _mm_unpackhi_epi32(e0, g0);
+        g1 = _mm_unpacklo_epi32(f0, h0);
+        h1 = _mm_unpackhi_epi32(f0, h0);
+
+        a0 = _mm_unpacklo_epi64(a1, e1);
+        b0 = _mm_unpackhi_epi64(a1, e1);
+        c0 = _mm_unpacklo_epi64(b1, f1);
+        d0 = _mm_unpackhi_epi64(b1, f1);
+        e0 = _mm_unpacklo_epi64(c1, g1);
+        f0 = _mm_unpackhi_epi64(c1, g1);
+        g0 = _mm_unpacklo_epi64(d1, h1);
+        h0 = _mm_unpackhi_epi64(d1, h1);
+
+        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
+        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
+        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
+        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
+        _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
+        _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
+        _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
+        _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
+    }
+    return bshuf_trans_byte_elem_remainder(in, out, size, 8,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements using best SSE algorithm available. */
+int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    // Trivial cases: power of 2 bytes.
+    switch (elem_size) {
+        case 1:
+            count = bshuf_copy(in, out, size, elem_size);
+            return count;
+        case 2:
+            count = bshuf_trans_byte_elem_SSE_16(in, out, size);
+            return count;
+        case 4:
+            count = bshuf_trans_byte_elem_SSE_32(in, out, size);
+            return count;
+        case 8:
+            count = bshuf_trans_byte_elem_SSE_64(in, out, size);
+            return count;
+    }
+
+    // Worst case: odd number of bytes. Turns out that this is faster for
+    // (odd * 2) byte elements as well (hence % 4).
+    if (elem_size % 4) {
+        count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
+        return count;
+    }
+
+    // Multiple of power of 2: transpose hierarchically.
+    {
+        size_t nchunk_elem;
+        void* tmp_buf = malloc(size * elem_size);
+        if (tmp_buf == NULL) return -1;
+
+        if ((elem_size % 8) == 0) {
+            nchunk_elem = elem_size / 8;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
+            count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
+        } else if ((elem_size % 4) == 0) {
+            nchunk_elem = elem_size / 4;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
+            count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
+        } else {
+            // Not used since scalar algorithm is faster.
+            nchunk_elem = elem_size / 2;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
+            count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
+        }
+
+        free(tmp_buf);
+        return count;
+    }
+}
+
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    uint16_t* out_ui16;
+
+    int64_t count;
+
+    size_t nbyte = elem_size * size;
+
+    CHECK_MULT_EIGHT(nbyte);
+
+    __m128i xmm;
+    int32_t bt;
+
+    for (ii = 0; ii + 15 < nbyte; ii += 16) {
+        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
+        for (kk = 0; kk < 8; kk++) {
+            bt = _mm_movemask_epi8(xmm);
+            xmm = _mm_slli_epi16(xmm, 1);
+            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_ui16 = bt;
+        }
+    }
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 16);
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+
+/* For data organized into a row for each bit (8 * elem_size rows), transpose
+ * the bytes. */
+int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, jj;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    CHECK_MULT_EIGHT(size);
+
+    size_t nrows = 8 * elem_size;
+    size_t nbyte_row = size / 8;
+
+    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
+    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
+    __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
+
+    for (ii = 0; ii + 7 < nrows; ii += 8) {
+        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
+            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
+            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
+            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
+            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
+            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
+            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
+            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
+            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);
+
+
+            a1 = _mm_unpacklo_epi8(a0, b0);
+            b1 = _mm_unpacklo_epi8(c0, d0);
+            c1 = _mm_unpacklo_epi8(e0, f0);
+            d1 = _mm_unpacklo_epi8(g0, h0);
+            e1 = _mm_unpackhi_epi8(a0, b0);
+            f1 = _mm_unpackhi_epi8(c0, d0);
+            g1 = _mm_unpackhi_epi8(e0, f0);
+            h1 = _mm_unpackhi_epi8(g0, h0);
+
+
+            a0 = _mm_unpacklo_epi16(a1, b1);
+            b0 = _mm_unpacklo_epi16(c1, d1);
+            c0 = _mm_unpackhi_epi16(a1, b1);
+            d0 = _mm_unpackhi_epi16(c1, d1);
+
+            e0 = _mm_unpacklo_epi16(e1, f1);
+            f0 = _mm_unpacklo_epi16(g1, h1);
+            g0 = _mm_unpackhi_epi16(e1, f1);
+            h0 = _mm_unpackhi_epi16(g1, h1);
+
+
+            a1 = _mm_unpacklo_epi32(a0, b0);
+            b1 = _mm_unpackhi_epi32(a0, b0);
+
+            c1 = _mm_unpacklo_epi32(c0, d0);
+            d1 = _mm_unpackhi_epi32(c0, d0);
+
+            e1 = _mm_unpacklo_epi32(e0, f0);
+            f1 = _mm_unpackhi_epi32(e0, f0);
+
+            g1 = _mm_unpacklo_epi32(g0, h0);
+            h1 = _mm_unpackhi_epi32(g0, h0);
+
+            // We don't have a storeh instruction for integers, so interpret
+            // as a float. Have a storel (_mm_storel_epi64).
+            as = (__m128 *) &a1;
+            bs = (__m128 *) &b1;
+            cs = (__m128 *) &c1;
+            ds = (__m128 *) &d1;
+            es = (__m128 *) &e1;
+            fs = (__m128 *) &f1;
+            gs = (__m128 *) &g1;
+            hs = (__m128 *) &h1;
+
+            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
+            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);
+
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
+            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
+        }
+        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
+            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    uint16_t* out_ui16 = (uint16_t*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    __m128i xmm;
+    int32_t bt;
+
+    if (elem_size % 2) {
+        bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
+    } else {
+        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                ii += 8 * elem_size) {
+            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
+                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = _mm_movemask_epi8(xmm);
+                    xmm = _mm_slli_epi16(xmm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    out_ui16[ind / 2] = bt;
+                }
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+#else // #ifdef USESSE2
+
+
+int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
+    return -11;
+}
+
+
+int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
+    return -11;
+}
+
+
+int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -11;
+}
+
+
+#endif // #ifdef USESSE2
+
+
+/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
+
+/* ---- Worker code that uses AVX2 ----
+ *
+ * The following code makes use of the AVX2 instruction set and specialized
+ * 32 byte registers. The AVX2 instructions are present on newer x86
+ * processors. The first Intel processor microarchitecture supporting AVX2 was
+ * Haswell (2013).
+ *
+ */
+
+#ifdef USEAVX2
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    int32_t* out_i32;
+
+    size_t nbyte = elem_size * size;
+
+    int64_t count;
+
+    __m256i ymm;
+    int32_t bt;
+
+    for (ii = 0; ii + 31 < nbyte; ii += 32) {
+        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
+        for (kk = 0; kk < 8; kk++) {
+            bt = _mm256_movemask_epi8(ymm);
+            ymm = _mm256_slli_epi16(ymm, 1);
+            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_i32 = bt;
+        }
+    }
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 32);
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+
+/* For data organized into a row for each bit (8 * elem_size rows), transpose
+ * the bytes. */
+int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t hh, ii, jj, kk, mm;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    CHECK_MULT_EIGHT(size);
+
+    size_t nrows = 8 * elem_size;
+    size_t nbyte_row = size / 8;
+
+    if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size,
+            elem_size);
+
+    __m256i ymm_0[8];
+    __m256i ymm_1[8];
+    __m256i ymm_storeage[8][4];
+
+    for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
+        for (ii = 0; ii + 3 < elem_size; ii += 4) {
+            for (hh = 0; hh < 4; hh ++) {
+
+                for (kk = 0; kk < 8; kk ++){
+                    ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
+                            (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
+                }
+
+                for (kk = 0; kk < 4; kk ++){
+                    ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
+                            ymm_0[kk * 2 + 1]);
+                    ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
+                            ymm_0[kk * 2 + 1]);
+                }
+
+                for (kk = 0; kk < 2; kk ++){
+                    for (mm = 0; mm < 2; mm ++){
+                        ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
+                                ymm_1[kk * 4 + mm * 2],
+                                ymm_1[kk * 4 + mm * 2 + 1]);
+                        ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
+                                ymm_1[kk * 4 + mm * 2],
+                                ymm_1[kk * 4 + mm * 2 + 1]);
+                    }
+                }
+
+                for (kk = 0; kk < 4; kk ++){
+                    ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
+                            ymm_0[kk * 2 + 1]);
+                    ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
+                            ymm_0[kk * 2 + 1]);
+                }
+
+                for (kk = 0; kk < 8; kk ++){
+                    ymm_storeage[kk][hh] = ymm_1[kk];
+                }
+            }
+
+            for (mm = 0; mm < 8; mm ++) {
+
+                for (kk = 0; kk < 4; kk ++){
+                    ymm_0[kk] = ymm_storeage[mm][kk];
+                }
+
+                ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
+                ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
+                ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
+                ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);
+
+                ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
+                ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
+                ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
+                ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);
+
+                _mm256_storeu_si256((__m256i *) &out_b[
+                        (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
+                _mm256_storeu_si256((__m256i *) &out_b[
+                        (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
+                _mm256_storeu_si256((__m256i *) &out_b[
+                        (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
+                _mm256_storeu_si256((__m256i *) &out_b[
+                        (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
+            }
+        }
+    }
+    for (ii = 0; ii < nrows; ii ++ ) {
+        for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
+            out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    __m256i ymm;
+    int32_t bt;
+
+    if (elem_size % 4) {
+        return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
+    } else {
+        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
+            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                    ii += 8 * elem_size) {
+                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = _mm256_movemask_epi8(ymm);
+                    ymm = _mm256_slli_epi16(ymm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    * (int32_t *) &out_b[ind] = bt;
+                }
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+    return count;
+}
+
+
+#else // #ifdef USEAVX2
+
+int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -12;
+}
+
+
+int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -12;
+}
+
+
+int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -12;
+}
+
+
+int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -12;
+}
+
+
+int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -12;
+}
+
+#endif // #ifdef USEAVX2
+
+
+/* ---- Drivers selecting best instruction set at compile time. ---- */
+
+int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, 
+        const size_t elem_size) {
+
+    int64_t count;
+#ifdef USEAVX2
+    count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
+#elif defined(USESSE2)
+    count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
+#else
+    count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
+#endif
+    return count;
+}
+
+
+int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, 
+        const size_t elem_size) {
+
+    int64_t count;
+#ifdef USEAVX2
+    count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
+#elif defined(USESSE2)
+    count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
+#else
+    count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
+#endif
+    return count;
+}
+
+
+/* ---- Wrappers for implementing blocking ---- */
+
+/* Wrap a function for processing a single block to process an entire buffer in
+ * parallel. */
+int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
+        const size_t size, const size_t elem_size, size_t block_size) {
+
+    size_t ii;
+    int64_t err = 0;
+    int64_t count, cum_count=0;
+    size_t last_block_size;
+    size_t leftover_bytes;
+    size_t this_iter;
+    char *last_in;
+    char *last_out;
+
+
+    ioc_chain C;
+    ioc_init(&C, in, out);
+
+
+    if (block_size == 0) {
+        block_size = bshuf_default_block_size(elem_size);
+    }
+    if (block_size % BSHUF_BLOCKED_MULT) return -81;
+
+#if defined(_OPENMP)
+    #pragma omp parallel for schedule(dynamic, 1) \
+            private(count) reduction(+ : cum_count)
+#endif
+    for (ii = 0; ii < size / block_size; ii ++) {
+        count = fun(&C, block_size, elem_size);
+        if (count < 0) err = count;
+        cum_count += count;
+    }
+
+    last_block_size = size % block_size;
+    last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
+    if (last_block_size) {
+        count = fun(&C, last_block_size, elem_size);
+        if (count < 0) err = count;
+        cum_count += count;
+    }
+
+    if (err < 0) return err;
+
+    leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
+    //this_iter;
+    last_in = (char *) ioc_get_in(&C, &this_iter);
+    ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
+    last_out = (char *) ioc_get_out(&C, &this_iter);
+    ioc_set_next_out(&C, &this_iter, (void *) (last_out + leftover_bytes));
+
+    memcpy(last_out, last_in, leftover_bytes);
+
+    ioc_destroy(&C);
+
+    return cum_count + leftover_bytes;
+}
+
+
+/* Bitshuffle a single block. */
+int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
+        const size_t size, const size_t elem_size) {
+
+    size_t this_iter;
+    const void *in;
+    void *out;
+    int64_t count;
+
+
+    
+    in = ioc_get_in(C_ptr, &this_iter);
+    ioc_set_next_in(C_ptr, &this_iter,
+            (void*) ((char*) in + size * elem_size));
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter,
+            (void *) ((char *) out + size * elem_size));
+
+    count = bshuf_trans_bit_elem(in, out, size, elem_size);
+    return count;
+}
+
+
+/* Bitunshuffle a single block. */
+int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
+        const size_t size, const size_t elem_size) {
+
+
+    size_t this_iter;
+    const void *in;
+    void *out;
+    int64_t count;
+
+
+
+
+    in = ioc_get_in(C_ptr, &this_iter);
+    ioc_set_next_in(C_ptr, &this_iter,
+            (void*) ((char*) in + size * elem_size));
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter,
+            (void *) ((char *) out + size * elem_size));
+
+    count = bshuf_untrans_bit_elem(in, out, size, elem_size);
+    return count;
+}
+
+
+/* Write a 64 bit unsigned integer to a buffer in big endian order. */
+void bshuf_write_uint64_BE(void* buf, uint64_t num) {
+    int ii;
+    uint8_t* b = (uint8_t*) buf;
+    uint64_t pow28 = 1 << 8;
+    for (ii = 7; ii >= 0; ii--) {
+        b[ii] = num % pow28;
+        num = num / pow28;
+    }
+}
+
+
+/* Read a 64 bit unsigned integer from a buffer big endian order. */
+uint64_t bshuf_read_uint64_BE(void* buf) {
+    int ii;
+    uint8_t* b = (uint8_t*) buf;
+    uint64_t num = 0, pow28 = 1 << 8, cp = 1;
+    for (ii = 7; ii >= 0; ii--) {
+        num += b[ii] * cp;
+        cp *= pow28;
+    }
+    return num;
+}
+
+
+/* Write a 32 bit unsigned integer to a buffer in big endian order. */
+void bshuf_write_uint32_BE(void* buf, uint32_t num) {
+    int ii;
+    uint8_t* b = (uint8_t*) buf;
+    uint32_t pow28 = 1 << 8;
+    for (ii = 3; ii >= 0; ii--) {
+        b[ii] = num % pow28;
+        num = num / pow28;
+    }
+}
+
+
+/* Read a 32 bit unsigned integer from a buffer big endian order. */
+uint32_t bshuf_read_uint32_BE(const void* buf) {
+    int ii;
+    uint8_t* b = (uint8_t*) buf;
+    uint32_t num = 0, pow28 = 1 << 8, cp = 1;
+    for (ii = 3; ii >= 0; ii--) {
+        num += b[ii] * cp;
+        cp *= pow28;
+    }
+    return num;
+}
+
+
+/* ---- Public functions ----
+ *
+ * See header file for description and usage.
+ *
+ */
+
+size_t bshuf_default_block_size(const size_t elem_size) {
+    // This function needs to be absolutely stable between versions.
+    // Otherwise encoded data will not be decodable.
+
+    size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
+    // Ensure it is a required multiple.
+    block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
+    return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
+}
+
+
+int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size) {
+
+    return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
+            elem_size, block_size);
+}
+
+
+int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size) {
+
+    return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
+            elem_size, block_size);
+}
+
+
+#undef TRANS_BIT_8X8
+#undef TRANS_ELEM_TYPE
+#undef MAX
+#undef CHECK_MULT_EIGHT
+#undef CHECK_ERR_FREE
+
+#undef USESSE2
+#undef USEAVX2
diff --git a/bslz4/src/bitshuffle_core.h b/bslz4/src/bitshuffle_core.h
new file mode 100644
index 0000000..4516ef4
--- /dev/null
+++ b/bslz4/src/bitshuffle_core.h
@@ -0,0 +1,156 @@
+/*
+ * Bitshuffle - Filter for improving compression of typed binary data.
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ *
+ * Header File
+ *
+ * Worker routines return an int64_t which is the number of bytes processed
+ * if positive or an error code if negative.
+ *
+ * Error codes:
+ *      -1    : Failed to allocate memory.
+ *      -11   : Missing SSE.
+ *      -12   : Missing AVX.
+ *      -80   : Input size not a multiple of 8.
+ *      -81   : block_size not multiple of 8.
+ *      -91   : Decompression error, wrong number of bytes processed.
+ *      -1YYY : Error internal to compression routine with error code -YYY.
+ */
+
+
+#ifndef BITSHUFFLE_CORE_H
+#define BITSHUFFLE_CORE_H
+
+// We assume GNU g++ defining `__cplusplus` has stdint.h
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
+#include <stdint.h>
+#else
+  typedef unsigned char       uint8_t;
+  typedef unsigned short      uint16_t;
+  typedef unsigned int        uint32_t;
+  typedef   signed int        int32_t;
+  typedef unsigned long long  uint64_t;
+  typedef long long           int64_t;
+#endif
+
+#include <stdlib.h>
+
+
+// These are usually set in the setup.py.
+#ifndef BSHUF_VERSION_MAJOR
+#define BSHUF_VERSION_MAJOR 0
+#define BSHUF_VERSION_MINOR 3
+#define BSHUF_VERSION_POINT 4
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --- bshuf_using_SSE2 ----
+ *
+ * Whether routines where compiled with the SSE2 instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using SSE2, 0 otherwise.
+ *
+ */
+int bshuf_using_SSE2(void);
+
+
+/* ---- bshuf_using_AVX2 ----
+ *
+ * Whether routines where compiled with the AVX2 instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using AVX2, 0 otherwise.
+ *
+ */
+int bshuf_using_AVX2(void);
+
+
+/* ---- bshuf_default_block_size ----
+ *
+ * The default block size as function of element size.
+ *
+ * This is the block size used by the blocked routines (any routine
+ * taking a *block_size* argument) when the block_size is not provided
+ * (zero is passed).
+ *
+ * The results of this routine are guaranteed to be stable such that
+ * shuffled/compressed data can always be decompressed.
+ *
+ * Parameters
+ * ----------
+ *  elem_size : element size of data to be shuffled/compressed.
+ *
+ */
+size_t bshuf_default_block_size(const size_t elem_size);
+
+
+/* ---- bshuf_bitshuffle ----
+ *
+ * Bitshuffle the data.
+ *
+ * Transpose the bits within elements, in blocks of *block_size*
+ * elements.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Do transpose in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  number of bytes processed, negative error-code if failed.
+ *
+ */
+int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size);
+
+
+/* ---- bshuf_bitunshuffle ----
+ *
+ * Unshuffle bitshuffled data.
+ *
+ * Untranspose the bits within elements, in blocks of *block_size*
+ * elements.
+ *
+ * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
+ * must match the parameters used to shuffle the data.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Do transpose in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  number of bytes processed, negative error-code if failed.
+ *
+ */
+int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif  // BITSHUFFLE_CORE_H
diff --git a/bslz4/src/bitshuffle_internals.h b/bslz4/src/bitshuffle_internals.h
new file mode 100644
index 0000000..e039925
--- /dev/null
+++ b/bslz4/src/bitshuffle_internals.h
@@ -0,0 +1,75 @@
+/*
+ * Bitshuffle - Filter for improving compression of typed binary data.
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ */
+
+
+#ifndef BITSHUFFLE_INTERNALS_H
+#define BITSHUFFLE_INTERNALS_H
+
+// We assume GNU g++ defining `__cplusplus` has stdint.h
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
+#include <stdint.h>
+#else
+  typedef unsigned char       uint8_t;
+  typedef unsigned short      uint16_t;
+  typedef unsigned int        uint32_t;
+  typedef   signed int        int32_t;
+  typedef unsigned long long  uint64_t;
+  typedef long long           int64_t;
+#endif
+
+#include <stdlib.h>
+#include "iochain.h"
+
+
+// Constants.
+#ifndef BSHUF_MIN_RECOMMEND_BLOCK
+#define BSHUF_MIN_RECOMMEND_BLOCK 128
+#define BSHUF_BLOCKED_MULT 8    // Block sizes must be multiple of this.
+#define BSHUF_TARGET_BLOCK_SIZE_B 8192
+#endif
+
+
+// Macros.
+#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ---- Utility functions for internal use only ---- */
+
+int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
+        const size_t elem_size);
+
+/* Read a 32 bit unsigned integer from a buffer big endian order. */
+uint32_t bshuf_read_uint32_BE(const void* buf);
+
+/* Write a 32 bit unsigned integer to a buffer in big endian order. */
+void bshuf_write_uint32_BE(void* buf, uint32_t num);
+
+int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
+        const size_t elem_size);
+
+/* Function definition for worker functions that process a single block. */
+typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
+        const size_t size, const size_t elem_size);
+
+/* Wrap a function for processing a single block to process an entire buffer in
+ * parallel. */
+int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
+        const size_t size, const size_t elem_size, size_t block_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif  // BITSHUFFLE_INTERNALS_H
diff --git a/bslz4/src/iochain.c b/bslz4/src/iochain.c
new file mode 100644
index 0000000..baa9729
--- /dev/null
+++ b/bslz4/src/iochain.c
@@ -0,0 +1,90 @@
+/*
+ * IOchain - Distribute a chain of dependant IO events amoung threads.
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ */
+
+#include <stdlib.h>
+#include "iochain.h"
+
+
+void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0) {
+#ifdef _OPENMP
+    omp_init_lock(&C->next_lock);
+    for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
+        omp_init_lock(&(C->in_pl[ii].lock));
+        omp_init_lock(&(C->out_pl[ii].lock));
+    }
+#endif
+    C->next = 0;
+    C->in_pl[0].ptr = in_ptr_0;
+    C->out_pl[0].ptr = out_ptr_0;
+}
+
+
+void ioc_destroy(ioc_chain *C) {
+#ifdef _OPENMP
+    omp_destroy_lock(&C->next_lock);
+    for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
+        omp_destroy_lock(&(C->in_pl[ii].lock));
+        omp_destroy_lock(&(C->out_pl[ii].lock));
+    }
+#endif
+}
+
+
+const void * ioc_get_in(ioc_chain *C, size_t *this_iter) {
+#ifdef _OPENMP
+    omp_set_lock(&C->next_lock);
+    #pragma omp flush
+#endif
+    *this_iter = C->next;
+    C->next ++;
+#ifdef _OPENMP
+    omp_set_lock(&(C->in_pl[*this_iter % IOC_SIZE].lock));
+    omp_set_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
+    omp_set_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
+    omp_unset_lock(&C->next_lock);
+#endif
+    return C->in_pl[*this_iter % IOC_SIZE].ptr;
+}
+
+
+void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr) {
+    C->in_pl[(*this_iter + 1) % IOC_SIZE].ptr = in_ptr;
+#ifdef _OPENMP
+    omp_unset_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
+#endif
+}
+
+
+void * ioc_get_out(ioc_chain *C, size_t *this_iter) {
+#ifdef _OPENMP
+    omp_set_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
+    #pragma omp flush
+#endif
+    void *out_ptr = C->out_pl[*this_iter % IOC_SIZE].ptr;
+#ifdef _OPENMP
+    omp_unset_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
+#endif
+    return out_ptr;
+}
+
+
+void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
+    C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
+#ifdef _OPENMP
+    omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
+    // *in_pl[this_iter]* lock released at the end of the iteration to avoid being
+    // overtaken by previous threads and having *out_pl[this_iter]* corrupted.
+    // Especially worried about thread 0, iteration 0.
+    omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
+#endif
+}
+
diff --git a/bslz4/src/iochain.h b/bslz4/src/iochain.h
new file mode 100644
index 0000000..4e225d1
--- /dev/null
+++ b/bslz4/src/iochain.h
@@ -0,0 +1,94 @@
+/*
+ * IOchain - Distribute a chain of dependant IO events amoung threads.
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ *
+ * Header File
+ *
+ * Similar in concept to a queue. Each task includes reading an input
+ * and writing output, but the location of the input/output (the pointers)
+ * depend on the previous item in the chain.
+ *
+ * This is designed for parallelizing blocked compression/decompression IO,
+ * where the destination of a compressed block depends on the compressed size
+ * of all previous blocks.
+ *
+ * Implemented with OpenMP locks.
+ *
+ *
+ * Usage
+ * -----
+ *  - Call `ioc_init` in serial block.
+ *  - Each thread should create a local variable *size_t this_iter* and 
+ *    pass its address to all function calls. Its value will be set
+ *    inside the functions and is used to identify the thread.
+ *  - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
+ *    exactly once per iteration, starting with `ioc_get_in` and ending
+ *    with `ioc_set_next_out`.
+ *  - The order (`ioc_get_in`, `ioc_set_next_in`, *work*, `ioc_get_out`,
+ *    `ioc_set_next_out`, *work*) is most efficient.
+ *  - Have each thread call `ioc_end_pop`.
+ *  - `ioc_get_in` is blocked until the previous entry's
+ *    `ioc_set_next_in` is called.
+ *  - `ioc_get_out` is blocked until the previous entry's
+ *    `ioc_set_next_out` is called.
+ *  - There are no blocks on the very first iteration.
+ *  - Call `ioc_destroy` in serial block.
+ *  - Safe for num_threads >= IOC_SIZE (but less efficient).
+ *
+ */
+
+
+#ifndef IOCHAIN_H
+#define IOCHAIN_H
+
+
+#include <stdlib.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+
+#define IOC_SIZE 33
+
+
+typedef struct ioc_ptr_and_lock {
+#ifdef _OPENMP
+    omp_lock_t lock;
+#endif
+    void *ptr;
+} ptr_and_lock;
+
+typedef struct ioc_const_ptr_and_lock {
+#ifdef _OPENMP
+    omp_lock_t lock;
+#endif
+    const void *ptr;
+} const_ptr_and_lock;
+
+
+typedef struct ioc_chain {
+#ifdef _OPENMP
+    omp_lock_t next_lock;
+#endif
+    size_t next;
+    const_ptr_and_lock in_pl[IOC_SIZE];
+    ptr_and_lock out_pl[IOC_SIZE];
+} ioc_chain;
+
+
+void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0);
+void ioc_destroy(ioc_chain *C);
+const void * ioc_get_in(ioc_chain *C, size_t *this_iter);
+void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
+void * ioc_get_out(ioc_chain *C, size_t *this_iter);
+void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
+
+#endif  // IOCHAIN_H
+
diff --git a/bslz4/src/lz4.c b/bslz4/src/lz4.c
new file mode 100644
index 0000000..08cf6b5
--- /dev/null
+++ b/bslz4/src/lz4.c
@@ -0,0 +1,1516 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Copyright (C) 2011-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/Cyan4973/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+
+/**************************************
+*  Tuning parameters
+**************************************/
+/*
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()).
+ */
+#define HEAPMODE 0
+
+/*
+ * ACCELERATION_DEFAULT :
+ * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0
+ */
+#define ACCELERATION_DEFAULT 1
+
+
+/**************************************
+*  CPU Feature Detection
+**************************************/
+/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system or compiler does not support hardware bit count
+ */
+#if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
+#  define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+
+/**************************************
+*  Includes
+**************************************/
+#include "lz4.h"
+
+
+/**************************************
+*  Compiler Options
+**************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4293)        /* disable: C4293: too large shift (32-bits) */
+#else
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+#    if defined(__GNUC__) || defined(__clang__)
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif   /* __STDC_VERSION__ */
+#endif  /* _MSC_VER */
+
+/* LZ4_GCC_VERSION is defined into lz4.h */
+#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#  define expect(expr,value)    (__builtin_expect ((expr),(value)) )
+#else
+#  define expect(expr,value)    (expr)
+#endif
+
+#define likely(expr)     expect((expr) != 0, 1)
+#define unlikely(expr)   expect((expr) != 0, 0)
+
+
+/**************************************
+*  Memory routines
+**************************************/
+#include <stdlib.h>   /* malloc, calloc, free */
+#define ALLOCATOR(n,s) calloc(n,s)
+#define FREEMEM        free
+#include <string.h>   /* memset, memcpy */
+#define MEM_INIT       memset
+
+
+/**************************************
+*  Basic Types
+**************************************/
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+#endif
+
+
+/**************************************
+*  Reading and writing into memory
+**************************************/
+#define STEPSIZE sizeof(size_t)
+
+static unsigned LZ4_64bits(void) { return sizeof(void*)==8; }
+
+static unsigned LZ4_isLittleEndian(void)
+{
+    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+
+static U16 LZ4_read16(const void* memPtr)
+{
+    U16 val16;
+    memcpy(&val16, memPtr, 2);
+    return val16;
+}
+
+static U16 LZ4_readLE16(const void* memPtr)
+{
+    if (LZ4_isLittleEndian())
+    {
+        return LZ4_read16(memPtr);
+    }
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)((U16)p[0] + (p[1]<<8));
+    }
+}
+
+static void LZ4_writeLE16(void* memPtr, U16 value)
+{
+    if (LZ4_isLittleEndian())
+    {
+        memcpy(memPtr, &value, 2);
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE) value;
+        p[1] = (BYTE)(value>>8);
+    }
+}
+
+static U32 LZ4_read32(const void* memPtr)
+{
+    U32 val32;
+    memcpy(&val32, memPtr, 4);
+    return val32;
+}
+
+static U64 LZ4_read64(const void* memPtr)
+{
+    U64 val64;
+    memcpy(&val64, memPtr, 8);
+    return val64;
+}
+
+static size_t LZ4_read_ARCH(const void* p)
+{
+    if (LZ4_64bits())
+        return (size_t)LZ4_read64(p);
+    else
+        return (size_t)LZ4_read32(p);
+}
+
+
+static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); }
+
+static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); }
+
+/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
+static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+    BYTE* d = (BYTE*)dstPtr;
+    const BYTE* s = (const BYTE*)srcPtr;
+    BYTE* e = (BYTE*)dstEnd;
+    do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
+}
+
+
+/**************************************
+*  Common Constants
+**************************************/
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+static const int LZ4_minLength = (MFLIMIT+1);
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS  4
+#define ML_MASK  ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/**************************************
+*  Common Utils
+**************************************/
+#define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/**************************************
+*  Common functions
+**************************************/
+static unsigned LZ4_NbCommonBytes (register size_t val)
+{
+    if (LZ4_isLittleEndian())
+    {
+        if (LZ4_64bits())
+        {
+#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (int)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        }
+        else /* 32 bits */
+        {
+#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r;
+            _BitScanForward( &r, (U32)val );
+            return (int)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    }
+    else   /* Big Endian CPU */
+    {
+        if (LZ4_64bits())
+        {
+#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_clzll((U64)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        }
+        else /* 32 bits */
+        {
+#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        }
+    }
+}
+
+static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
+{
+    const BYTE* const pStart = pIn;
+
+    while (likely(pIn<pInLimit-(STEPSIZE-1)))
+    {
+        size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+        if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
+        pIn += LZ4_NbCommonBytes(diff);
+        return (unsigned)(pIn - pStart);
+    }
+
+    if (LZ4_64bits()) if ((pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (unsigned)(pIn - pStart);
+}
+
+
+#ifndef LZ4_COMMONDEFS_ONLY
+/**************************************
+*  Local Constants
+**************************************/
+#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
+#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
+#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)       /* required as macro for static allocation */
+
+static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1));
+static const U32 LZ4_skipTrigger = 6;  /* Increase this value ==> compression run slower on incompressible data */
+
+
+/**************************************
+*  Local Structures and types
+**************************************/
+typedef struct {
+    U32 hashTable[HASH_SIZE_U32];
+    U32 currentOffset;
+    U32 initCheck;
+    const BYTE* dictionary;
+    BYTE* bufferStart;   /* obsolete, used for slideInputBuffer */
+    U32 dictSize;
+} LZ4_stream_t_internal;
+
+typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+/**************************************
+*  Local Utils
+**************************************/
+int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
+int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
+int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+
+
+
+/********************************
+*  Compression functions
+********************************/
+
+static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType)
+{
+    if (tableType == byU16)
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
+    else
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
+}
+
+static const U64 prime5bytes = 889523592379ULL;
+static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType)
+{
+    const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;
+    const U32 hashMask = (1<<hashLog) - 1;
+    return ((sequence * prime5bytes) >> (40 - hashLog)) & hashMask;
+}
+
+static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType)
+{
+    if (LZ4_64bits())
+        return LZ4_hashSequence64(sequence, tableType);
+    return LZ4_hashSequence((U32)sequence, tableType);
+}
+
+static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); }
+
+static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase)
+{
+    switch (tableType)
+    {
+    case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; }
+    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; }
+    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; }
+    }
+}
+
+static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
+    if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
+    { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; }   /* default, to ensure a return */
+}
+
+static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
+FORCE_INLINE int LZ4_compress_generic(
+                 void* const ctx,
+                 const char* const source,
+                 char* const dest,
+                 const int inputSize,
+                 const int maxOutputSize,
+                 const limitedOutput_directive outputLimited,
+                 const tableType_t tableType,
+                 const dict_directive dict,
+                 const dictIssue_directive dictIssue,
+                 const U32 acceleration)
+{
+    LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
+
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* base;
+    const BYTE* lowLimit;
+    const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
+    const BYTE* const dictionary = dictPtr->dictionary;
+    const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
+    const size_t dictDelta = dictEnd - (const BYTE*)source;
+    const BYTE* anchor = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const olimit = op + maxOutputSize;
+
+    U32 forwardH;
+    size_t refDelta=0;
+
+    /* Init conditions */
+    if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0;   /* Unsupported input size, too large (or negative) */
+    switch(dict)
+    {
+    case noDict:
+    default:
+        base = (const BYTE*)source;
+        lowLimit = (const BYTE*)source;
+        break;
+    case withPrefix64k:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source - dictPtr->dictSize;
+        break;
+    case usingExtDict:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source;
+        break;
+    }
+    if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (inputSize<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = acceleration << LZ4_skipTrigger;
+
+            /* Find a match */
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit)) goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                if (dict==usingExtDict)
+                {
+                    if (match<(const BYTE*)source)
+                    {
+                        refDelta = dictDelta;
+                        lowLimit = dictionary;
+                    }
+                    else
+                    {
+                        refDelta = 0;
+                        lowLimit = (const BYTE*)source;
+                    }
+                }
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0)
+                || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
+                return 0;   /* Check output limit */
+            if (litLength>=RUN_MASK)
+            {
+                int len = (int)litLength-RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op+=litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            unsigned matchLength;
+
+            if ((dict==usingExtDict) && (lowLimit==dictionary))
+            {
+                const BYTE* limit;
+                match += refDelta;
+                limit = ip + (dictEnd-match);
+                if (limit > matchlimit) limit = matchlimit;
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
+                ip += MINMATCH + matchLength;
+                if (ip==limit)
+                {
+                    unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
+                    matchLength += more;
+                    ip += more;
+                }
+            }
+            else
+            {
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+                ip += MINMATCH + matchLength;
+            }
+
+            if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
+                return 0;    /* Check output limit */
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
+                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of chunk */
+        if (ip > mflimit) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        if (dict==usingExtDict)
+        {
+            if (match<(const BYTE*)source)
+            {
+                refDelta = dictDelta;
+                lowLimit = dictionary;
+            }
+            else
+            {
+                refDelta = 0;
+                lowLimit = (const BYTE*)source;
+            }
+        }
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1)
+            && (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        const size_t lastRun = (size_t)(iend - anchor);
+        if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
+            return 0;   /* Check output limit */
+        if (lastRun >= RUN_MASK)
+        {
+            size_t accumulator = lastRun - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRun<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRun);
+        op += lastRun;
+    }
+
+    /* End */
+    return (int) (((char*)op)-dest);
+}
+
+
+int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    if (maxOutputSize >= LZ4_compressBound(inputSize))
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16,                        noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+    else
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+}
+
+
+int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+#if (HEAPMODE)
+    void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctx;
+    void* ctxPtr = &ctx;
+#endif
+
+    int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
+
+#if (HEAPMODE)
+    FREEMEM(ctxPtr);
+#endif
+    return result;
+}
+
+
+int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1);
+}
+
+
+/* hidden debug function */
+/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */
+int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t ctx;
+
+    LZ4_resetStream(&ctx);
+
+    if (inputSize < LZ4_64Klimit)
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
+    else
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+}
+
+
+/********************************
+*  destSize variant
+********************************/
+
+static int LZ4_compress_destSize_generic(
+                       void* const ctx,
+                 const char* const src,
+                       char* const dst,
+                       int*  const srcSizePtr,
+                 const int targetDstSize,
+                 const tableType_t tableType)
+{
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* base = (const BYTE*) src;
+    const BYTE* lowLimit = (const BYTE*) src;
+    const BYTE* anchor = ip;
+    const BYTE* const iend = ip + *srcSizePtr;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dst;
+    BYTE* const oend = op + targetDstSize;
+    BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */;
+    BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */);
+    BYTE* const oMaxSeq = oMaxLit - 1 /* token */;
+
+    U32 forwardH;
+
+
+    /* Init conditions */
+    if (targetDstSize < 1) return 0;                                     /* Impossible to store anything */
+    if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0;            /* Unsupported input size, too large (or negative) */
+    if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (*srcSizePtr<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    *srcSizePtr = 0;
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = 1 << LZ4_skipTrigger;
+
+            /* Find a match */
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit))
+                    goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if (op + ((litLength+240)/255) + litLength > oMaxLit)
+            {
+                /* Not enough space for a last match */
+                op--;
+                goto _last_literals;
+            }
+            if (litLength>=RUN_MASK)
+            {
+                unsigned len = litLength - RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op += litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            size_t matchLength;
+
+            matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+
+            if (op + ((matchLength+240)/255) > oMaxMatch)
+            {
+                /* Match description too long : reduce it */
+                matchLength = (15-1) + (oMaxMatch-op) * 255;
+            }
+            //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
+            ip += MINMATCH + matchLength;
+
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                while (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of block */
+        if (ip > mflimit) break;
+        if (op > oMaxSeq) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        size_t lastRunSize = (size_t)(iend - anchor);
+        if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend)
+        {
+            /* adapt lastRunSize to fill 'dst' */
+            lastRunSize  = (oend-op) - 1;
+            lastRunSize -= (lastRunSize+240)/255;
+        }
+        ip = anchor + lastRunSize;
+
+        if (lastRunSize >= RUN_MASK)
+        {
+            size_t accumulator = lastRunSize - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRunSize<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRunSize);
+        op += lastRunSize;
+    }
+
+    /* End */
+    *srcSizePtr = (int) (((const char*)ip)-src);
+    return (int) (((char*)op)-dst);
+}
+
+
+static int LZ4_compress_destSize_extState (void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+
+    if (targetDstSize >= LZ4_compressBound(*srcSizePtr))   /* compression success is guaranteed */
+    {
+        return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1);
+    }
+    else
+    {
+        if (*srcSizePtr < LZ4_64Klimit)
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16);
+        else
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr);
+    }
+}
+
+
+int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctxBody;
+    void* ctx = &ctxBody;
+#endif
+
+    int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+
+
+/********************************
+*  Streaming functions
+********************************/
+
+LZ4_stream_t* LZ4_createStream(void)
+{
+    LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));    /* A compilation error here means LZ4_STREAMSIZE is not large enough */
+    LZ4_resetStream(lz4s);
+    return lz4s;
+}
+
+void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
+{
+    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
+}
+
+int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return (0);
+}
+
+
+#define HASH_UNIT sizeof(size_t)
+int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* p = (const BYTE*)dictionary;
+    const BYTE* const dictEnd = p + dictSize;
+    const BYTE* base;
+
+    if ((dict->initCheck) || (dict->currentOffset > 1 GB))  /* Uninitialized structure, or reuse overflow */
+        LZ4_resetStream(LZ4_dict);
+
+    if (dictSize < (int)HASH_UNIT)
+    {
+        dict->dictionary = NULL;
+        dict->dictSize = 0;
+        return 0;
+    }
+
+    if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
+    dict->currentOffset += 64 KB;
+    base = p - dict->currentOffset;
+    dict->dictionary = p;
+    dict->dictSize = (U32)(dictEnd - p);
+    dict->currentOffset += dict->dictSize;
+
+    while (p <= dictEnd-HASH_UNIT)
+    {
+        LZ4_putPosition(p, dict->hashTable, byU32, base);
+        p+=3;
+    }
+
+    return dict->dictSize;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
+{
+    if ((LZ4_dict->currentOffset > 0x80000000) ||
+        ((size_t)LZ4_dict->currentOffset > (size_t)src))   /* address space overflow */
+    {
+        /* rescale hash table */
+        U32 delta = LZ4_dict->currentOffset - 64 KB;
+        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+        int i;
+        for (i=0; i<HASH_SIZE_U32; i++)
+        {
+            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+            else LZ4_dict->hashTable[i] -= delta;
+        }
+        LZ4_dict->currentOffset = 64 KB;
+        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+    }
+}
+
+
+int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = (const BYTE*) source;
+    if (streamPtr->initCheck) return 0;   /* Uninitialized structure detected */
+    if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
+    LZ4_renormDictT(streamPtr, smallest);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    /* Check overlapping input/dictionary space */
+    {
+        const BYTE* sourceEnd = (const BYTE*) source + inputSize;
+        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
+        {
+            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
+        }
+    }
+
+    /* prefix mode : source data follows dictionary */
+    if (dictEnd == (const BYTE*)source)
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration);
+        streamPtr->dictSize += (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+
+    /* external dictionary mode */
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration);
+        streamPtr->dictionary = (const BYTE*)source;
+        streamPtr->dictSize = (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+}
+
+
+/* Hidden debug function, to force external dictionary mode */
+int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
+    int result;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = dictEnd;
+    if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
+    LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
+
+    result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1);
+
+    streamPtr->dictionary = (const BYTE*)source;
+    streamPtr->dictSize = (U32)inputSize;
+    streamPtr->currentOffset += (U32)inputSize;
+
+    return result;
+}
+
+
+int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
+
+    if ((U32)dictSize > 64 KB) dictSize = 64 KB;   /* useless to define a dictionary > 64 KB */
+    if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
+
+    memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
+
+    dict->dictionary = (const BYTE*)safeBuffer;
+    dict->dictSize = (U32)dictSize;
+
+    return dictSize;
+}
+
+
+
+/*******************************
+*  Decompression functions
+*******************************/
+/*
+ * This generic decompression function cover all use cases.
+ * It shall be instantiated several times, using different sets of directives
+ * Note that it is essential this generic function is really inlined,
+ * in order to remove useless branches during compilation optimization.
+ */
+FORCE_INLINE int LZ4_decompress_generic(
+                 const char* const source,
+                 char* const dest,
+                 int inputSize,
+                 int outputSize,         /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
+
+                 int endOnInput,         /* endOnOutputSize, endOnInputSize */
+                 int partialDecoding,    /* full, partial */
+                 int targetOutputSize,   /* only used if partialDecoding==partial */
+                 int dict,               /* noDict, withPrefix64k, usingExtDict */
+                 const BYTE* const lowPrefix,  /* == dest if dict == noDict */
+                 const BYTE* const dictStart,  /* only if dict==usingExtDict */
+                 const size_t dictSize         /* note : = 0 if noDict */
+                 )
+{
+    /* Local Variables */
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const oend = op + outputSize;
+    BYTE* cpy;
+    BYTE* oexit = op + targetOutputSize;
+    const BYTE* const lowLimit = lowPrefix - dictSize;
+
+    const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
+    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+    const int safeDecode = (endOnInput==endOnInputSize);
+    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+
+
+    /* Special cases */
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;                         /* targetOutputSize too high => decode everything */
+    if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1;  /* Empty output buffer */
+    if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
+
+
+    /* Main Loop */
+    while (1)
+    {
+        unsigned token;
+        size_t length;
+        const BYTE* match;
+
+        /* get literal length */
+        token = *ip++;
+        if ((length=(token>>ML_BITS)) == RUN_MASK)
+        {
+            unsigned s;
+            do
+            {
+                s = *ip++;
+                length += s;
+            }
+            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
+            if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
+        }
+
+        /* copy literals */
+        cpy = op+length;
+        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+        {
+            if (partialDecoding)
+            {
+                if (cpy > oend) goto _output_error;                           /* Error : write attempt beyond end of output buffer */
+                if ((endOnInput) && (ip+length > iend)) goto _output_error;   /* Error : read attempt beyond end of input buffer */
+            }
+            else
+            {
+                if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
+                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
+            }
+            memcpy(op, ip, length);
+            ip += length;
+            op += length;
+            break;     /* Necessarily EOF, due to parsing restrictions */
+        }
+        LZ4_wildCopy(op, ip, cpy);
+        ip += length; op = cpy;
+
+        /* get offset */
+        match = cpy - LZ4_readLE16(ip); ip+=2;
+        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+
+        /* get matchlength */
+        length = token & ML_MASK;
+        if (length == ML_MASK)
+        {
+            unsigned s;
+            do
+            {
+                if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
+                s = *ip++;
+                length += s;
+            } while (s==255);
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error;   /* overflow detection */
+        }
+        length += MINMATCH;
+
+        /* check external dictionary */
+        if ((dict==usingExtDict) && (match < lowPrefix))
+        {
+            if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error;   /* doesn't respect parsing restriction */
+
+            if (length <= (size_t)(lowPrefix-match))
+            {
+                /* match can be copied as a single segment from external dictionary */
+                match = dictEnd - (lowPrefix-match);
+                memmove(op, match, length); op += length;
+            }
+            else
+            {
+                /* match encompass external dictionary and current segment */
+                size_t copySize = (size_t)(lowPrefix-match);
+                memcpy(op, dictEnd - copySize, copySize);
+                op += copySize;
+                copySize = length - copySize;
+                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
+                {
+                    BYTE* const endOfMatch = op + copySize;
+                    const BYTE* copyFrom = lowPrefix;
+                    while (op < endOfMatch) *op++ = *copyFrom++;
+                }
+                else
+                {
+                    memcpy(op, lowPrefix, copySize);
+                    op += copySize;
+                }
+            }
+            continue;
+        }
+
+        /* copy repeated sequence */
+        cpy = op + length;
+        if (unlikely((op-match)<8))
+        {
+            const size_t dec64 = dec64table[op-match];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[op-match];
+            LZ4_copy4(op+4, match);
+            op += 8; match -= dec64;
+        } else { LZ4_copy8(op, match); op+=8; match+=8; }
+
+        if (unlikely(cpy>oend-12))
+        {
+            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals */
+            if (op < oend-8)
+            {
+                LZ4_wildCopy(op, match, oend-8);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<cpy) *op++ = *match++;
+        }
+        else
+            LZ4_wildCopy(op, match, cpy);
+        op=cpy;   /* correction */
+    }
+
+    /* end of decoding */
+    if (endOnInput)
+       return (int) (((char*)op)-dest);     /* Nb of output bytes decoded */
+    else
+       return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
+
+    /* Overflow error detected */
+_output_error:
+    return (int) (-(((const char*)ip)-source))-1;
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
+}
+
+
+/* streaming decompression functions */
+
+typedef struct
+{
+    const BYTE* externalDict;
+    size_t extDictSize;
+    const BYTE* prefixEnd;
+    size_t prefixSize;
+} LZ4_streamDecode_t_internal;
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode()
+ * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void)
+{
+    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t));
+    return lz4s;
+}
+
+int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return 0;
+}
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary
+ * This function is not necessary if previous data is still available where it was decoded.
+ * Loading a size of 0 is allowed (same effect as no dictionary).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    lz4sd->prefixSize = (size_t) dictSize;
+    lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
+    lz4sd->externalDict = NULL;
+    lz4sd->extDictSize  = 0;
+    return 1;
+}
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks must still be available at the memory position where they were decoded.
+    If it's not possible, save the relevant part of decoded data into a safe buffer,
+    and indicate where it stands using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += result;
+        lz4sd->prefixEnd  += result;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = result;
+        lz4sd->prefixEnd  = (BYTE*)dest + result;
+    }
+
+    return result;
+}
+
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += originalSize;
+        lz4sd->prefixEnd  += originalSize;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = originalSize;
+        lz4sd->prefixEnd  = (BYTE*)dest + originalSize;
+    }
+
+    return result;
+}
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as "_continue" ones,
+    the dictionary must be explicitly provided within parameters
+*/
+
+FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
+{
+    if (dictSize==0)
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0);
+    if (dictStart+dictSize == dest)
+    {
+        if (dictSize >= (int)(64 KB - 1))
+            return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0);
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0);
+    }
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
+}
+
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
+}
+
+/* debug function */
+int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+
+/***************************************************
+*  Obsolete Functions
+***************************************************/
+/* obsolete compression functions */
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); }
+int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); }
+int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); }
+int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); }
+int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); }
+int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); }
+
+/*
+These function names are deprecated and should no longer be used.
+They are only provided here for compatibility with older user programs.
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
+- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
+*/
+int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+
+/* Obsolete Streaming functions */
+
+int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+
+static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base)
+{
+    MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
+    lz4ds->bufferStart = base;
+}
+
+int LZ4_resetStreamState(void* state, char* inputBuffer)
+{
+    if ((((size_t)state) & 3) != 0) return 1;   /* Error : pointer is not aligned on 4-bytes boundary */
+    LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer);
+    return 0;
+}
+
+void* LZ4_create (char* inputBuffer)
+{
+    void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer);
+    return lz4ds;
+}
+
+char* LZ4_slideInputBuffer (void* LZ4_Data)
+{
+    LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data;
+    int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB);
+    return (char*)(ctx->bufferStart + dictSize);
+}
+
+/* Obsolete streaming decompression functions */
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+#endif   /* LZ4_COMMONDEFS_ONLY */
+
diff --git a/bslz4/src/lz4.h b/bslz4/src/lz4.h
new file mode 100644
index 0000000..3e74002
--- /dev/null
+++ b/bslz4/src/lz4.h
@@ -0,0 +1,360 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Header File
+   Copyright (C) 2011-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/Cyan4973/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * lz4.h provides block compression functions, and gives full buffer control to programmer.
+ * If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
+ * and can let the library handle its own memory, please use lz4frame.h instead.
+*/
+
+/**************************************
+*  Version
+**************************************/
+#define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
+#define LZ4_VERSION_MINOR    7    /* for new (non-breaking) interface capabilities */
+#define LZ4_VERSION_RELEASE  1    /* for tweaks, bug-fixes, or development */
+#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+int LZ4_versionNumber (void);
+
+/**************************************
+*  Tuning parameter
+**************************************/
+/*
+ * LZ4_MEMORY_USAGE :
+ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+ * Increasing memory usage improves compression ratio
+ * Reduced memory usage can improve speed, due to cache effect
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+ */
+#define LZ4_MEMORY_USAGE 14
+
+
+/**************************************
+*  Simple Functions
+**************************************/
+
+int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
+int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+
+/*
+LZ4_compress_default() :
+    Compresses 'sourceSize' bytes from buffer 'source'
+    into already allocated 'dest' buffer of size 'maxDestSize'.
+    Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
+    It also runs faster, so it's a recommended setting.
+    If the function cannot compress 'source' into a more limited 'dest' budget,
+    compression stops *immediately*, and the function result is zero.
+    As a consequence, 'dest' content is not valid.
+    This function never writes outside 'dest' buffer, nor read outside 'source' buffer.
+        sourceSize  : Max supported value is LZ4_MAX_INPUT_VALUE
+        maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
+        return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize)
+              or 0 if compression fails
+
+LZ4_decompress_safe() :
+    compressedSize : is the precise full size of the compressed block.
+    maxDecompressedSize : is the size of destination buffer, which must be already allocated.
+    return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
+             If destination buffer is not large enough, decoding will stop and output an error code (<0).
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function is protected against buffer overflow exploits, including malicious data packets.
+             It never writes outside output buffer, nor reads outside input buffer.
+*/
+
+
+/**************************************
+*  Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*
+LZ4_compressBound() :
+    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
+    This function is primarily useful for memory allocation purposes (destination buffer size).
+    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+    Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize)
+        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+        return : maximum output size in a "worst case" scenario
+              or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+int LZ4_compressBound(int inputSize);
+
+/*
+LZ4_compress_fast() :
+    Same as LZ4_compress_default(), but allows to select an "acceleration" factor.
+    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
+    An acceleration value of "1" is the same as regular LZ4_compress_default()
+    Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
+*/
+int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_fast_extState() :
+    Same compression function, just using an externally allocated memory space to store compression state.
+    Use LZ4_sizeofState() to know how much memory must be allocated,
+    and allocate it on 8-bytes boundaries (using malloc() typically).
+    Then, provide it as 'void* state' to compression function.
+*/
+int LZ4_sizeofState(void);
+int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_destSize() :
+    Reverse the logic, by compressing as much data as possible from 'source' buffer
+    into already allocated buffer 'dest' of size 'targetDestSize'.
+    This function either compresses the entire 'source' content into 'dest' if it's large enough,
+    or fill 'dest' buffer completely with as much data as possible from 'source'.
+        *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'.
+                         New value is necessarily <= old value.
+        return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
+              or 0 if compression fails
+*/
+int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
+
+
+/*
+LZ4_decompress_fast() :
+    originalSize : is the original and therefore uncompressed size
+    return : the number of bytes read from the source buffer (in other words, the compressed size)
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
+    note : This function fully respect memory boundaries for properly formed compressed data.
+           It is a bit faster than LZ4_decompress_safe().
+           However, it does not provide any protection against intentionally modified data stream (malicious input).
+           Use this function in trusted environment only (data to decode comes from a trusted source).
+*/
+int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
+
+/*
+LZ4_decompress_safe_partial() :
+    This function decompress a compressed block of size 'compressedSize' at position 'source'
+    into destination buffer 'dest' of size 'maxDecompressedSize'.
+    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+    reducing decompression time.
+    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
+       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+             Always control how many bytes were decoded.
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
+
+
+/***********************************************
+*  Streaming Compression Functions
+***********************************************/
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
+#define LZ4_STREAMSIZE     (LZ4_STREAMSIZE_U64 * sizeof(long long))
+/*
+ * LZ4_stream_t
+ * information structure to track an LZ4 stream.
+ * important : init this structure content before first use !
+ * note : only allocated directly the structure if you are statically linking LZ4
+ *        If you are using liblz4 as a DLL, please use below construction methods instead.
+ */
+typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
+
+/*
+ * LZ4_resetStream
+ * Use this function to init an allocated LZ4_stream_t structure
+ */
+void LZ4_resetStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_createStream will allocate and initialize an LZ4_stream_t structure
+ * LZ4_freeStream releases its memory.
+ * In the context of a DLL (liblz4), please use these methods rather than the static struct.
+ * They are more future proof, in case of a change of LZ4_stream_t size.
+ */
+LZ4_stream_t* LZ4_createStream(void);
+int           LZ4_freeStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_loadDict
+ * Use this function to load a static dictionary into LZ4_stream.
+ * Any previous data will be forgotten, only 'dictionary' will remain in memory.
+ * Loading a size of 0 is allowed.
+ * Return : dictionary size, in bytes (necessarily <= 64 KB)
+ */
+int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*
+ * LZ4_compress_fast_continue
+ * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
+ * Important : Previous data blocks are assumed to still be present and unmodified !
+ * 'dst' buffer must be already allocated.
+ * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+ * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
+ */
+int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
+
+/*
+ * LZ4_saveDict
+ * If previously compressed data block is not guaranteed to remain available at its memory location
+ * save it into a safer place (char* safeBuffer)
+ * Note : you don't need to call LZ4_loadDict() afterwards,
+ *        dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
+ * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
+ */
+int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
+
+
+/************************************************
+*  Streaming Decompression Functions
+************************************************/
+
+#define LZ4_STREAMDECODESIZE_U64  4
+#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
+typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
+/*
+ * LZ4_streamDecode_t
+ * information structure to track an LZ4 stream.
+ * init this structure content using LZ4_setStreamDecode or memset() before first use !
+ *
+ * In the context of a DLL (liblz4) please prefer usage of construction methods below.
+ * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
+ * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
+ * LZ4_freeStreamDecode releases its memory.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary.
+ * Setting a size of 0 is allowed (same effect as reset).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
+    In the case of a ring buffers, decoding buffer must be either :
+    - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
+      In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
+    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+      maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including small ones ( < 64 KB).
+    - _At least_ 64 KB + 8 bytes + maxBlockSize.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including larger than decoding buffer.
+    Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
+    and indicate where it is saved using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as
+    a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
+    They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
+*/
+int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
+int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
+
+
+
+/**************************************
+*  Obsolete Functions
+**************************************/
+/* Deprecate Warnings */
+/* Should these warnings messages be a problem,
+   it is generally possible to disable them,
+   with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual for example.
+   You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
+#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
+#  define LZ4_DEPRECATE_WARNING_DEFBLOCK
+#  define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (LZ4_GCC_VERSION >= 301)
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define LZ4_DEPRECATED(message) __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
+#    define LZ4_DEPRECATED(message)
+#  endif
+#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
+
+/* Obsolete compression functions */
+/* These functions are planned to start generate warnings by r131 approximately */
+int LZ4_compress               (const char* source, char* dest, int sourceSize);
+int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
+int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_compress_continue                (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_continue  (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/* Obsolete decompression functions */
+/* These function names are completely deprecated and must no longer be used.
+   They are only provided here for compatibility with older programs.
+    - LZ4_uncompress is the same as LZ4_decompress_fast
+    - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
+   These function prototypes are now disabled; uncomment them only if you really need them.
+   It is highly recommended to stop using these prototypes and migrate to maintained ones */
+/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
+/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
+
+/* Obsolete streaming functions; use new streaming interface whenever possible */
+LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_createStream() instead") int   LZ4_sizeofStreamState(void);
+LZ4_DEPRECATED("use LZ4_resetStream() instead")  int   LZ4_resetStreamState(void* state, char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_saveDict() instead")     char* LZ4_slideInputBuffer (void* state);
+
+/* Obsolete streaming decoding functions */
+LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
+LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
+
+
+#if defined (__cplusplus)
+}
+#endif

From 38660b17fdba1611ea55fe411263e9c4d3b2f9c1 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Fri, 17 Aug 2018 11:02:36 +0100
Subject: [PATCH 2/8] Fixes to allow bitshuffle-lz4 to compile with -std=c89

Removes all "//" style comments, adds a missing typedef
for "int16_t" (although these are not checked for correct width)
and removes duplicated "intX_t" typedefs.

It would be preferable to detect GCC (with its extensions) and use
its definitions of fixed-with integers if not compiling with C99.
---
 bslz4/src/bitshuffle.c           | 14 ++++-----
 bslz4/src/bitshuffle.h           |  4 +--
 bslz4/src/bitshuffle_core.c      | 53 ++++++++++++++++----------------
 bslz4/src/bitshuffle_core.h      | 11 ++++---
 bslz4/src/bitshuffle_internals.h | 20 +++---------
 bslz4/src/iochain.c              |  6 ++--
 bslz4/src/iochain.h              |  4 +--
 bslz4/src/lz4.c                  |  1 -
 8 files changed, 50 insertions(+), 63 deletions(-)

diff --git a/bslz4/src/bitshuffle.c b/bslz4/src/bitshuffle.c
index 54ff045..3b8815b 100644
--- a/bslz4/src/bitshuffle.c
+++ b/bslz4/src/bitshuffle.c
@@ -18,12 +18,12 @@
 #include <string.h>
 
 
-// Constants.
-// Use fast decompression instead of safe decompression for LZ4.
+
+
 #define BSHUF_LZ4_DECOMPRESS_FAST
 
 
-// Macros.
+
 #define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) {                      \
     free(buf); return count - 1000; }
 
@@ -138,13 +138,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
     }
     if (block_size % BSHUF_BLOCKED_MULT) return -81;
 
-    // Note that each block gets a 4 byte header.
-    // Size of full blocks.
+
+
     bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
-    // Size of partial blocks, if any.
+
     leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
     if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
-    // Size of uncompressed data not fitting into any blocks.
+
     bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
     return bound;
 }
diff --git a/bslz4/src/bitshuffle.h b/bslz4/src/bitshuffle.h
index 3df95f4..dc87e9e 100644
--- a/bslz4/src/bitshuffle.h
+++ b/bslz4/src/bitshuffle.h
@@ -117,7 +117,7 @@ int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
         const size_t elem_size, size_t block_size);
 
 #ifdef __cplusplus
-} // extern "C"
+}
 #endif
 
-#endif  // BITSHUFFLE_H
+#endif
diff --git a/bslz4/src/bitshuffle_core.c b/bslz4/src/bitshuffle_core.c
index 583e4fe..1a184d6 100644
--- a/bslz4/src/bitshuffle_core.c
+++ b/bslz4/src/bitshuffle_core.c
@@ -25,7 +25,7 @@
 #endif
 
 
-// Conditional includes for SSE2 and AVX2.
+
 #ifdef USEAVX2
 #include <immintrin.h>
 #elif defined USESSE2
@@ -33,7 +33,7 @@
 #endif
 
 
-// Macros.
+
 #define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 
@@ -131,8 +131,8 @@ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t
     CHECK_MULT_EIGHT(start);
 
     if (size > start) {
-        // ii loop separated into 2 loops so the compiler can unroll
-        // the inner one.
+
+
         for (ii = start; ii + 7 < size; ii += 8) {
             for (jj = 0; jj < elem_size; jj++) {
                 for (kk = 0; kk < 8; kk++) {
@@ -351,7 +351,7 @@ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size
 /* ---- Worker code that uses SSE2 ----
  *
  * The following code makes use of the SSE2 instruction set and specialized
- * 16 byte registers. The SSE2 instructions are present on modern x86 
+ * 16 byte registers. The SSE2 instructions are present on modern x86
  * processors. The first Intel processor microarchitecture supporting SSE2 was
  * Pentium 4 (2000).
  *
@@ -512,7 +512,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
 
     int64_t count;
 
-    // Trivial cases: power of 2 bytes.
+
     switch (elem_size) {
         case 1:
             count = bshuf_copy(in, out, size, elem_size);
@@ -528,14 +528,14 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
             return count;
     }
 
-    // Worst case: odd number of bytes. Turns out that this is faster for
-    // (odd * 2) byte elements as well (hence % 4).
+
+
     if (elem_size % 4) {
         count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
         return count;
     }
 
-    // Multiple of power of 2: transpose hierarchically.
+
     {
         size_t nchunk_elem;
         void* tmp_buf = malloc(size * elem_size);
@@ -554,7 +554,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
                     size * nchunk_elem);
             bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
         } else {
-            // Not used since scalar algorithm is faster.
+
             nchunk_elem = elem_size / 2;
             TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
             count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
@@ -687,8 +687,8 @@ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size
             g1 = _mm_unpacklo_epi32(g0, h0);
             h1 = _mm_unpackhi_epi32(g0, h0);
 
-            // We don't have a storeh instruction for integers, so interpret
-            // as a float. Have a storel (_mm_storel_epi64).
+
+
             as = (__m128 *) &a1;
             bs = (__m128 *) &b1;
             cs = (__m128 *) &c1;
@@ -737,8 +737,8 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
 
     CHECK_MULT_EIGHT(size);
 
-    // With a bit of care, this could be written such that such that it is
-    // in_buf = out_buf safe.
+
+
     const char* in_b = (const char*) in;
     uint16_t* out_ui16 = (uint16_t*) out;
 
@@ -788,7 +788,7 @@ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
     return count;
 }
 
-#else // #ifdef USESSE2
+#else
 
 
 int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
@@ -842,7 +842,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
 }
 
 
-#endif // #ifdef USESSE2
+#endif
 
 
 /* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
@@ -1014,8 +1014,8 @@ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t
 
     CHECK_MULT_EIGHT(size);
 
-    // With a bit of care, this could be written such that such that it is
-    // in_buf = out_buf safe.
+
+
     const char* in_b = (const char*) in;
     char* out_b = (char*) out;
 
@@ -1065,7 +1065,7 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
 }
 
 
-#else // #ifdef USEAVX2
+#else
 
 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
          const size_t elem_size) {
@@ -1096,12 +1096,12 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
     return -12;
 }
 
-#endif // #ifdef USEAVX2
+#endif
 
 
 /* ---- Drivers selecting best instruction set at compile time. ---- */
 
-int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, 
+int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
@@ -1116,7 +1116,7 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
 }
 
 
-int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, 
+int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
         const size_t elem_size) {
 
     int64_t count;
@@ -1178,7 +1178,6 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
     if (err < 0) return err;
 
     leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
-    //this_iter;
     last_in = (char *) ioc_get_in(&C, &this_iter);
     ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
     last_out = (char *) ioc_get_out(&C, &this_iter);
@@ -1202,7 +1201,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
     int64_t count;
 
 
-    
+
     in = ioc_get_in(C_ptr, &this_iter);
     ioc_set_next_in(C_ptr, &this_iter,
             (void*) ((char*) in + size * elem_size));
@@ -1297,11 +1296,11 @@ uint32_t bshuf_read_uint32_BE(const void* buf) {
  */
 
 size_t bshuf_default_block_size(const size_t elem_size) {
-    // This function needs to be absolutely stable between versions.
-    // Otherwise encoded data will not be decodable.
+
+
 
     size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
-    // Ensure it is a required multiple.
+
     block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
     return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
 }
diff --git a/bslz4/src/bitshuffle_core.h b/bslz4/src/bitshuffle_core.h
index 4516ef4..8996d91 100644
--- a/bslz4/src/bitshuffle_core.h
+++ b/bslz4/src/bitshuffle_core.h
@@ -28,14 +28,15 @@
 #ifndef BITSHUFFLE_CORE_H
 #define BITSHUFFLE_CORE_H
 
-// We assume GNU g++ defining `__cplusplus` has stdint.h
+
 #if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
 #include <stdint.h>
 #else
   typedef unsigned char       uint8_t;
   typedef unsigned short      uint16_t;
+  typedef signed short        int16_t;
   typedef unsigned int        uint32_t;
-  typedef   signed int        int32_t;
+  typedef signed int          int32_t;
   typedef unsigned long long  uint64_t;
   typedef long long           int64_t;
 #endif
@@ -43,7 +44,7 @@
 #include <stdlib.h>
 
 
-// These are usually set in the setup.py.
+
 #ifndef BSHUF_VERSION_MAJOR
 #define BSHUF_VERSION_MAJOR 0
 #define BSHUF_VERSION_MINOR 3
@@ -150,7 +151,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
         const size_t elem_size, size_t block_size);
 
 #ifdef __cplusplus
-} // extern "C"
+}
 #endif
 
-#endif  // BITSHUFFLE_CORE_H
+#endif
diff --git a/bslz4/src/bitshuffle_internals.h b/bslz4/src/bitshuffle_internals.h
index e039925..7b5b8a5 100644
--- a/bslz4/src/bitshuffle_internals.h
+++ b/bslz4/src/bitshuffle_internals.h
@@ -13,31 +13,19 @@
 #ifndef BITSHUFFLE_INTERNALS_H
 #define BITSHUFFLE_INTERNALS_H
 
-// We assume GNU g++ defining `__cplusplus` has stdint.h
-#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
-#include <stdint.h>
-#else
-  typedef unsigned char       uint8_t;
-  typedef unsigned short      uint16_t;
-  typedef unsigned int        uint32_t;
-  typedef   signed int        int32_t;
-  typedef unsigned long long  uint64_t;
-  typedef long long           int64_t;
-#endif
 
 #include <stdlib.h>
 #include "iochain.h"
 
 
-// Constants.
 #ifndef BSHUF_MIN_RECOMMEND_BLOCK
 #define BSHUF_MIN_RECOMMEND_BLOCK 128
-#define BSHUF_BLOCKED_MULT 8    // Block sizes must be multiple of this.
+#define BSHUF_BLOCKED_MULT 8
 #define BSHUF_TARGET_BLOCK_SIZE_B 8192
 #endif
 
 
-// Macros.
+
 #define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
 
 
@@ -69,7 +57,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
         const size_t size, const size_t elem_size, size_t block_size);
 
 #ifdef __cplusplus
-} // extern "C"
+}
 #endif
 
-#endif  // BITSHUFFLE_INTERNALS_H
+#endif
diff --git a/bslz4/src/iochain.c b/bslz4/src/iochain.c
index baa9729..5c5eddd 100644
--- a/bslz4/src/iochain.c
+++ b/bslz4/src/iochain.c
@@ -81,9 +81,9 @@ void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
     C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
 #ifdef _OPENMP
     omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
-    // *in_pl[this_iter]* lock released at the end of the iteration to avoid being
-    // overtaken by previous threads and having *out_pl[this_iter]* corrupted.
-    // Especially worried about thread 0, iteration 0.
+
+
+
     omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
 #endif
 }
diff --git a/bslz4/src/iochain.h b/bslz4/src/iochain.h
index 4e225d1..6a398ff 100644
--- a/bslz4/src/iochain.h
+++ b/bslz4/src/iochain.h
@@ -25,7 +25,7 @@
  * Usage
  * -----
  *  - Call `ioc_init` in serial block.
- *  - Each thread should create a local variable *size_t this_iter* and 
+ *  - Each thread should create a local variable *size_t this_iter* and
  *    pass its address to all function calls. Its value will be set
  *    inside the functions and is used to identify the thread.
  *  - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
@@ -90,5 +90,5 @@ void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
 void * ioc_get_out(ioc_chain *C, size_t *this_iter);
 void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
 
-#endif  // IOCHAIN_H
+#endif
 
diff --git a/bslz4/src/lz4.c b/bslz4/src/lz4.c
index 08cf6b5..31f489d 100644
--- a/bslz4/src/lz4.c
+++ b/bslz4/src/lz4.c
@@ -825,7 +825,6 @@ _next_match:
                 /* Match description too long : reduce it */
                 matchLength = (15-1) + (oMaxMatch-op) * 255;
             }
-            //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
             ip += MINMATCH + matchLength;
 
             if (matchLength>=ML_MASK)

From ace6a4671734d068e01352785b84d235debeec48 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Thu, 16 Aug 2018 17:35:27 +0100
Subject: [PATCH 3/8] Implement direct-chunk reading to read data when possible

The HDF5 library was made thread-safe via excessive locking, so does
not gain much from reads being parallelized.

By using the H5DOread_chunk function (introduced in HDF5 1.10.2)
we reduce the time spent the library, improving performance for
when XDS uses multiple threads to process data.

The decompression and type conversions have to be done manually
however, and this is only used in a limited case.
---
 Makefile      |   7 +-
 src/file.c    | 315 ++++++++++++++++++++++++++++++++++++++++++++++++--
 src/file.h    |  10 ++
 src/filters.c |  58 ++++++++++
 src/filters.h |  31 +++++
 5 files changed, 411 insertions(+), 10 deletions(-)
 create mode 100644 src/filters.c
 create mode 100644 src/filters.h

diff --git a/Makefile b/Makefile
index 9e80aa4..8fd50ff 100644
--- a/Makefile
+++ b/Makefile
@@ -39,14 +39,17 @@ $(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
 	mkdir -p $(BUILD_DIR)
 	ar rcs $@ $^
 
-$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/bslz4.a
+$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
+$(BUILD_DIR)/bslz4.a
 	mkdir -p $(BUILD_DIR)
 	$(CC) $(CFLAGS) -shared $^ -o $(BUILD_DIR)/durin-plugin.so
 
-$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/bslz4.a
+$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
+$(BUILD_DIR)/bslz4.a
 	mkdir -p $(BUILD_DIR)
 	$(CC) $(CFLAGS) $^ -o $(BUILD_DIR)/example
 
 .PHONY: clean
 clean:
 	rm -r $(BUILD_DIR)
+	rm -r $(BSLZ4_BUILD_DIR)
diff --git a/src/file.c b/src/file.c
index 38180c7..cc78f59 100644
--- a/src/file.c
+++ b/src/file.c
@@ -5,12 +5,14 @@
 
 
 #include <hdf5.h>
+#include <hdf5_hl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>
 #include "file.h"
 #include "err.h"
+#include "filters.h"
 
 hid_t h5_int_type_from_width(const int width) {
 	if (width == sizeof(char)) {
@@ -55,6 +57,11 @@ void free_eiger_data_description(struct data_description_t *desc) {
 	desc->extra = NULL;
 }
 
+void free_optimised_eiger_data_description(struct data_description_t *desc) {
+	/* no extra memory is allocated for the optimised struct definition, so call the original */
+	free_eiger_data_description(desc);
+}
+
 
 double scale_from_units(const char* unit_string) {
 	if (strcasecmp("m", unit_string) == 0 ||
@@ -132,10 +139,19 @@ done:
 	return retval;
 }
 
-int get_frame(hid_t g_id, const char* name, hsize_t *frame_idx, hsize_t *frame_size, int data_width, void *buffer) {
+int get_frame_simple(const struct data_description_t *desc,
+		const char *name,
+		hsize_t *frame_idx,
+		hsize_t *frame_size,
+		int data_width,
+		void *buffer) {
+
 	int retval = 0;
 	herr_t err = 0;
-	hid_t ds_id, s_id, ms_id, t_id;
+	hid_t g_id, ds_id, s_id, ms_id, t_id;
+
+	g_id = desc->data_group_id;
+
 	ds_id = H5Dopen2(g_id, name, H5P_DEFAULT);
 	if (ds_id <= 0) {
 		char message[64];
@@ -176,6 +192,162 @@ done:
 	return retval;
 }
 
+
+int get_frame_from_chunk(const struct data_description_t *desc,
+		const char *ds_name,
+		hsize_t *frame_idx,
+		hsize_t *frame_size,
+		int requested_data_width,
+		void *buffer) {
+
+	hid_t d_id = 0;
+	hid_t t_id = 0;
+	hsize_t c_offset[3] = {frame_idx[0], 0, 0};
+	uint32_t c_filter_mask = 0;
+	hsize_t c_bytes;
+	void *c_buffer = NULL;
+	void *u_buffer = NULL;
+	const struct optimised_eiger_data_description_t *eiger_desc = desc->extra;
+	int retval = 0;
+	size_t raw_data_width = 0;
+
+	if (frame_idx[1] != 0 || frame_idx[2] != 0) {
+		char message[64];
+		sprintf(message, "Require frame selection starts at [n, 0, 0], not [n, %llu, %llu]",
+				frame_idx[1], frame_idx[2]);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	d_id = H5Dopen(desc->data_group_id, ds_name, H5P_DEFAULT);
+	if (d_id < 0) {
+		char message[64];
+		sprintf(message, "Error opening dataset %.32s", ds_name);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	/* TODO: pass down the dataset's data_width so we don't need to do this */
+	t_id = H5Dget_type(d_id);
+	if (t_id < 0) {
+		ERROR_JUMP(-1, done, "Error getting datatype");
+	}
+
+	raw_data_width = H5Tget_size(t_id);
+
+	if (H5Dget_chunk_storage_size(d_id, c_offset, &c_bytes) < 0) {
+		char message[96];
+		sprintf(message, "Error reading chunk size from %.32s for frame %llu", ds_name, frame_idx[0]);
+		ERROR_JUMP(-1, done, message);
+	}
+	if (c_bytes == 0) {
+		char message[96];
+		sprintf(message, "Target chunk %llu has zero size for dataset %.32s", frame_idx[0], ds_name);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	c_buffer = malloc(c_bytes);
+	if (!c_buffer) {
+		char message[128];
+		sprintf(message, "Unable to allocate chunk buffer for dataset %.32s - frame %llu, size %llu bytes",
+				ds_name, frame_idx[0], c_bytes);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	if (H5DOread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) < 0) {
+		char message[128];
+		sprintf(message, "Error reading chunk %llu from dataset %.32s - size %llu bytes",
+				frame_idx[0], ds_name, c_bytes);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	if (raw_data_width == requested_data_width) {
+		/* can output straight to the target buffer */
+		u_buffer = buffer;
+	} else {
+		/* must perform a type conversion, so require a second buffer */
+		u_buffer = malloc(raw_data_width * frame_size[1] * frame_size[2]);
+		if (!u_buffer) {
+			ERROR_JUMP(-1, done, "Unable to allocate output buffer for bslz4 decompression");
+		}
+	}
+
+	if (bslz4_decompress(
+				eiger_desc->bs_filter_params,
+				c_bytes,
+				c_buffer,
+				raw_data_width * frame_size[1] * frame_size[2],
+				u_buffer) < 0) {
+		char message[128];
+		sprintf(message, "Error processing chunk %llu from %.32s with bitshuffle_lz4",
+				frame_idx[0], ds_name);
+		ERROR_JUMP(-1, done, message);
+	}
+
+
+	if (u_buffer != buffer) {
+		int could_convert = 1;
+		/* transfer data to output buffer, performing data conversion as required */
+		/* TODO: decide how conversion of data should work
+		 * Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */
+		if (requested_data_width == sizeof(int8_t)) {
+			if (raw_data_width == sizeof(int16_t)) {
+				CONVERT_BUFFER(u_buffer, int16_t, buffer, int8_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int32_t)) {
+				CONVERT_BUFFER(u_buffer, int32_t, buffer, int8_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int64_t)) {
+				CONVERT_BUFFER(u_buffer, int64_t, buffer, int8_t, frame_size[1] * frame_size[2]);
+			} else {
+				could_convert = 0;
+			}
+		} else if (requested_data_width == sizeof(int16_t)) {
+			if (raw_data_width == sizeof(int8_t)) {
+				CONVERT_BUFFER(u_buffer, int8_t, buffer, int16_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int32_t)) {
+				CONVERT_BUFFER(u_buffer, int32_t, buffer, int16_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int64_t)) {
+				CONVERT_BUFFER(u_buffer, int64_t, buffer, int16_t, frame_size[1] * frame_size[2]);
+			} else {
+				could_convert = 0;
+			}
+		} else if (requested_data_width == sizeof(int32_t)) {
+			if (raw_data_width == sizeof(int8_t)) {
+				CONVERT_BUFFER(u_buffer, int8_t, buffer, int32_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int16_t)) {
+				CONVERT_BUFFER(u_buffer, int16_t, buffer, int32_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int64_t)) {
+				CONVERT_BUFFER(u_buffer, int64_t, buffer, int32_t, frame_size[1] * frame_size[2]);
+			} else {
+				could_convert = 0;
+			}
+		} else if (requested_data_width == sizeof(int64_t)) {
+			if (raw_data_width == sizeof(int8_t)) {
+				CONVERT_BUFFER(u_buffer, int8_t, buffer, int64_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int16_t)) {
+				CONVERT_BUFFER(u_buffer, int16_t, buffer, int64_t, frame_size[1] * frame_size[2]);
+			} else if (raw_data_width == sizeof(int32_t)) {
+				CONVERT_BUFFER(u_buffer, int32_t, buffer, int64_t, frame_size[1] * frame_size[2]);
+			} else {
+				could_convert = 0;
+			}
+		} else {
+			could_convert = 0;
+		}
+		if (!could_convert) {
+			char message[128];
+			sprintf(message, "Unsupported conversion of data width %lu to %d for %.32s",
+					raw_data_width, requested_data_width, ds_name);
+			ERROR_JUMP(-1, done, message);
+		}
+	}
+
+done:
+	if (c_buffer) free(c_buffer);
+	if (u_buffer && (u_buffer != buffer)) free(u_buffer);
+	if (t_id) H5Tclose(t_id);
+	if (d_id) H5Dclose(d_id);
+	return retval;
+}
+
+
 int get_nxs_frame(
 		const struct data_description_t *desc,
 		const struct dataset_properties_t *ds_prop,
@@ -192,7 +364,7 @@ int get_nxs_frame(
 		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) ds_prop->dims[0] - 1);
 		ERROR_JUMP(-1, done, message);
 	}
-	retval = get_frame(desc->data_group_id, "data", frame_idx, frame_size, data_width, buffer);
+	retval = get_frame_simple(desc, "data", frame_idx, frame_size, data_width, buffer);
 	if (retval < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
@@ -228,7 +400,7 @@ int get_dectris_eiger_frame(
 	idx = n - (frame_count - eiger_desc->block_sizes[block]); /* index in current block */
 	frame_idx[0] = idx;
 	sprintf(data_name, "data_%06d", block + 1);
-	retval = get_frame(desc->data_group_id, data_name, frame_idx, frame_size, data_width, buffer);
+	retval = eiger_desc->raw_frame_func(desc, data_name, frame_idx, frame_size, data_width, buffer);
 	if (retval < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
@@ -602,6 +774,103 @@ done:
 	return retval;
 }
 
+int check_for_chunk_read(
+		hid_t g_id,
+		const char* ds_name,
+		struct optimised_eiger_data_description_t *desc) {
+
+	int retval = 0;
+	int n_filters;
+	hsize_t cdims[3];
+	hid_t ds_id, dcpl, s_id;
+	unsigned int filter_flags, filter_config;
+	char filter_name[16];
+	size_t name_len = 16;
+	size_t cd_nelems = BS_H5_N_PARAMS;
+	hsize_t dims[3];
+	H5Z_filter_t filter;
+
+	dcpl = 0;
+	s_id = 0;
+	ds_id = 0;
+
+	ds_id = H5Dopen2(g_id, ds_name, H5P_DEFAULT);
+	if (ds_id < 0) {
+		char message[64];
+		sprintf(message, "Error opening dataset %.32s", ds_name);
+		ERROR_JUMP(-1, done, message)
+	}
+
+	s_id = H5Dget_space(ds_id);
+	if (s_id < 0) {
+		char message[64];
+		sprintf(message, "Error opening dataspace for %.32s", ds_name);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	if (3 != H5Sget_simple_extent_ndims(s_id)) {
+		goto done;
+	}
+
+	if (H5Sget_simple_extent_dims(s_id, dims, NULL) < 0) {
+		char message[80];
+		sprintf(message, "Error retriving dataset dimensions for %.32s", ds_name);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	dcpl = H5Dget_create_plist(ds_id);
+	if (dcpl < 0) {
+		ERROR_JUMP(-1, done, "Error getting dataset creation property list");
+	}
+
+	/* check the chunk layout matches the layout we expect of
+	 * [1, frame_size_y, frame_size_x] (1 frame == 1 chunk) */
+	int cndims = H5Pget_chunk(dcpl, 3, cdims);
+	if (cndims != 3) {
+		goto done;
+	}
+	if (cdims[0] != 1 || cdims[1] != dims[1] || cdims[2] != dims[2]) {
+		goto done;
+	}
+
+	/* check for potential filters - only the bitshuffle filter is supported */
+	n_filters = H5Pget_nfilters(dcpl);
+	if (n_filters < 0) {
+		ERROR_JUMP(-1, done, "Error retrieving number of filters on dataset");
+	} else if (n_filters > 1) {
+		goto done;
+	}
+
+	if (n_filters == 1) {
+		filter = H5Pget_filter2(dcpl, 0, &filter_flags,
+				&cd_nelems, desc->bs_filter_params,
+				name_len, filter_name,
+				&filter_config);
+		if (filter < 0) {
+			ERROR_JUMP(-1, done, "Error retrieving filter information");
+		}
+		if (filter != BS_H5_FILTER_ID) {
+			goto done;
+		}
+		if (cd_nelems > BS_H5_N_PARAMS) {
+			char message[128];
+			sprintf(message,
+					"More than expected number of parameters to bitshuffle filter - expected %d, was %lu",
+					BS_H5_N_PARAMS, cd_nelems);
+			ERROR_JUMP(-1, done, message);
+		}
+	} else {
+		desc->n_filter_params = 0;
+	}
+
+	retval = 1;
+
+done:
+	if (dcpl) H5Pclose(dcpl);
+	if (s_id) H5Sclose(s_id);
+	if (ds_id) H5Dclose(ds_id);
+	return retval;
+}
 
 int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_objects_t *visit_result) {
 	int retval = 0;
@@ -660,14 +929,42 @@ int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_
 	}
 
 	if (data_desc->get_data_properties == &get_dectris_eiger_dataset_dims) {
-		/* setup the "extra eiger info" struct */
-		struct eiger_data_description_t *eiger_desc = malloc(sizeof(*eiger_desc));
+
+		/* setup the "extra info" structs */
+		struct eiger_data_description_t *eiger_desc;
+		struct optimised_eiger_data_description_t *o_eiger_desc;
+
+		eiger_desc = malloc(sizeof(*eiger_desc));
 		if (!eiger_desc) {
 			ERROR_JUMP(-1, done, "Memory error creating data description for Eiger");
 		}
 		memset(eiger_desc, 0, sizeof(*eiger_desc));
-		data_desc->extra = eiger_desc;
-		data_desc->free_extra = free_eiger_data_description;
+		eiger_desc->raw_frame_func = &get_frame_simple;
+
+		o_eiger_desc = malloc(sizeof(*o_eiger_desc));
+		if (!o_eiger_desc) {
+			free(eiger_desc);
+			ERROR_JUMP(-1, done, "Memory error creating data description for optimised Eiger");
+		}
+		o_eiger_desc->base.raw_frame_func = &get_frame_from_chunk;
+
+		/* check if we can perform the optimised chunk read */
+		retval = check_for_chunk_read(data_desc->data_group_id, "data_000001", o_eiger_desc);
+		if (retval < 0) {
+			free(o_eiger_desc);
+			free(eiger_desc);
+			ERROR_JUMP(-1, done, "");
+		}
+		if (retval) {
+			free(eiger_desc);
+			data_desc->extra = o_eiger_desc;
+			data_desc->free_extra = free_optimised_eiger_data_description;
+		} else {
+			free(o_eiger_desc);
+			data_desc->extra = eiger_desc;
+			data_desc->free_extra = free_eiger_data_description;
+		}
+
 	} else {
 		data_desc->free_extra = free_nxs_data_description;
 	}
@@ -702,6 +999,8 @@ int extract_detector_info(
 	if ((retval = data_desc->get_data_properties(data_desc, dataset_prop)) < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
+
+
 done:
 	return retval;
 }
diff --git a/src/file.h b/src/file.h
index 9e97116..0733ef6 100644
--- a/src/file.h
+++ b/src/file.h
@@ -9,6 +9,7 @@
 
 #include <hdf5.h>
 #include "err.h"
+#include "filters.h"
 
 struct dataset_properties_t {
 	int data_width;
@@ -31,10 +32,19 @@ void free_nxs_data_description(struct data_description_t *desc);
 struct eiger_data_description_t {
 	int n_data_blocks;
 	int *block_sizes;
+	int (*raw_frame_func)(const struct data_description_t*, const char*, hsize_t*, hsize_t*, int, void*);
 };
 
 void free_eiger_data_description(struct data_description_t *desc);
 
+struct optimised_eiger_data_description_t {
+	struct eiger_data_description_t base;
+	int n_filter_params;
+	unsigned int bs_filter_params[BS_H5_N_PARAMS];
+};
+
+void free_optimised_eiger_data_description(struct data_description_t *desc);
+
 struct det_visit_objects_t {
 	hid_t nxdata;
 	hid_t nxdetector;
diff --git a/src/filters.c b/src/filters.c
new file mode 100644
index 0000000..49369df
--- /dev/null
+++ b/src/filters.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Diamond Light Source Ltd.
+ * Author: Charles Mita
+ */
+
+#include <stdio.h>
+#include "filters.h"
+#include "err.h"
+#include "bitshuffle.h"
+
+/* Required prototypes from bitshuffle.c but not included in header */
+uint64_t bshuf_read_uint64_BE(const void *buffer);
+uint32_t bshuf_read_uint32_BE(const void *buffer);
+
+
+/*
+ * Derived from the h5 filter code from the bitshuffle project (not included here)
+ */
+int bslz4_decompress(
+		const unsigned int* bs_params,
+		size_t in_size,
+		void *in_buffer,
+		size_t out_size,
+		void *out_buffer) {
+
+	int retval = 0;
+	size_t size, elem_size, block_size, u_bytes;
+
+	elem_size = bs_params[2];
+	u_bytes = bshuf_read_uint64_BE(in_buffer);
+
+	if (u_bytes != out_size) {
+		char message[64];
+		sprintf(message, "Decompressed chunk is %lu bytes, expected %lu", u_bytes, out_size);
+		ERROR_JUMP(-1, done, message);
+	}
+
+	block_size = bshuf_read_uint32_BE((const char *) in_buffer + 8) / elem_size;
+	if (!block_size) {
+		ERROR_JUMP(-1, done, "Read block bitshuffle lz4 block size as 0");
+	}
+	/* skip over header */
+	in_buffer += 12;
+	size = u_bytes / elem_size;
+
+	if (bs_params[4] == BS_H5_PARAM_LZ4_COMPRESS) {
+		if (bshuf_decompress_lz4(in_buffer, out_buffer, size, elem_size, block_size) < 0) {
+			ERROR_JUMP(-1, done, "Error performing bitshuffle_lz4 decompression");
+		}
+	} else {
+		if (bshuf_bitunshuffle(in_buffer, out_buffer, size, elem_size, block_size) < 0) {
+			ERROR_JUMP(-1, done, "Error performing bit unshuffle");
+		}
+	}
+
+done:
+	return retval;
+}
diff --git a/src/filters.h b/src/filters.h
new file mode 100644
index 0000000..7996783
--- /dev/null
+++ b/src/filters.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018 Diamond Light Source Ltd.
+ * Author: Charles Mita
+ */
+
+#ifndef NXS_XDS_FILTER_H
+#define NXS_XDS_FILTER_H
+
+#define BS_H5_N_PARAMS 5
+#define BS_H5_FILTER_ID 32008
+#define BS_H5_PARAM_LZ4_COMPRESS 2
+
+
+/* Perform type conversion during buffer copy */
+#define CONVERT_BUFFER(in, t_in, out, t_out, size) \
+{ \
+    t_in *pin = in; \
+    t_out *pout = out; \
+    t_in *end = pin + size; \
+    while (pin < end) *pout++ = (t_out) *pin++; \
+}
+
+
+int bslz4_decompress(
+		const unsigned int* bs_params,
+		size_t in_size,
+		void *in_buffer,
+		size_t out_size,
+		void *out_buffer);
+
+#endif /* NXS_XDS_FILTER_H */

From d1b25e18b847c8930555e82bbbacf761e0d73af9 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Fri, 17 Aug 2018 15:13:58 +0100
Subject: [PATCH 4/8] Refactor description structs

---
 src/file.c   | 245 +++++++++++++++++++++++++++------------------------
 src/file.h   |  49 ++++-------
 src/plugin.c |  32 ++++---
 src/test.c   |  15 ++--
 4 files changed, 174 insertions(+), 167 deletions(-)

diff --git a/src/file.c b/src/file.c
index cc78f59..58e495e 100644
--- a/src/file.c
+++ b/src/file.c
@@ -43,25 +43,27 @@ void clear_det_visit_objects(struct det_visit_objects_t *objects) {
 }
 
 
-void free_nxs_data_description(struct data_description_t *desc) {
-	if (desc->extra) free(desc->extra); /* should just be NULL */
-	desc->extra = NULL;
+void free_ds_desc(struct ds_desc_t *desc) {
+	H5Gclose(desc->det_g_id);
+	H5Gclose(desc->data_g_id);
+	free(desc);
 }
 
-
-void free_eiger_data_description(struct data_description_t *desc) {
-	if (!desc->extra) return;
-	struct eiger_data_description_t *extra = desc->extra;
-	if (extra->block_sizes) free(extra->block_sizes);
-	free(extra);
-	desc->extra = NULL;
+void free_nxs_desc(struct ds_desc_t *desc) {
+	free_ds_desc(desc);
 }
 
-void free_optimised_eiger_data_description(struct data_description_t *desc) {
-	/* no extra memory is allocated for the optimised struct definition, so call the original */
-	free_eiger_data_description(desc);
+void free_eiger_desc(struct ds_desc_t *desc) {
+	struct eiger_ds_desc_t *e_desc = (struct eiger_ds_desc_t *) desc;
+	free(e_desc->block_sizes);
+	free_ds_desc(desc);
 }
 
+void free_opt_eiger_desc(struct ds_desc_t *desc) {
+	free_eiger_desc(desc);
+}
+
+
 
 double scale_from_units(const char* unit_string) {
 	if (strcasecmp("m", unit_string) == 0 ||
@@ -87,13 +89,12 @@ double scale_from_units(const char* unit_string) {
 	}
 }
 
-int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties) {
+int get_nxs_dataset_dims(struct ds_desc_t *desc) {
 	hid_t g_id, ds_id, s_id, t_id;
 	int retval = 0;
 	int ndims = 0;
 	int width = 0;
-	hsize_t dims[3] = {0};
-	g_id = desc->data_group_id;;
+	g_id = desc->data_g_id;;
 
 	ds_id = H5Dopen2(g_id, "data", H5P_DEFAULT);
 	if (ds_id <= 0) {
@@ -122,12 +123,11 @@ int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_p
 		ERROR_JUMP(-1, close_space, message);
 	}
 
-	if (H5Sget_simple_extent_dims(s_id, dims, NULL) < 0) {
+	if (H5Sget_simple_extent_dims(s_id, desc->dims, NULL) < 0) {
 		ERROR_JUMP(-1, close_space, "Error getting dataset dimensions");
 	}
 
-	memcpy(properties->dims, dims, 3 * sizeof(*dims));
-	properties->data_width = width;
+	desc->data_width = width;
 
 close_space:
 	H5Sclose(s_id);
@@ -139,18 +139,18 @@ done:
 	return retval;
 }
 
-int get_frame_simple(const struct data_description_t *desc,
+int get_frame_simple(const struct ds_desc_t *desc,
 		const char *name,
-		hsize_t *frame_idx,
-		hsize_t *frame_size,
-		int data_width,
+		const hsize_t *frame_idx,
+		const hsize_t *frame_size,
+		const int data_width,
 		void *buffer) {
 
 	int retval = 0;
 	herr_t err = 0;
 	hid_t g_id, ds_id, s_id, ms_id, t_id;
 
-	g_id = desc->data_group_id;
+	g_id = desc->data_g_id;
 
 	ds_id = H5Dopen2(g_id, name, H5P_DEFAULT);
 	if (ds_id <= 0) {
@@ -193,11 +193,11 @@ done:
 }
 
 
-int get_frame_from_chunk(const struct data_description_t *desc,
+int get_frame_from_chunk(const struct ds_desc_t *desc,
 		const char *ds_name,
-		hsize_t *frame_idx,
-		hsize_t *frame_size,
-		int requested_data_width,
+		const hsize_t *frame_idx,
+		const hsize_t *frame_size,
+		const int requested_data_width,
 		void *buffer) {
 
 	hid_t d_id = 0;
@@ -207,7 +207,7 @@ int get_frame_from_chunk(const struct data_description_t *desc,
 	hsize_t c_bytes;
 	void *c_buffer = NULL;
 	void *u_buffer = NULL;
-	const struct optimised_eiger_data_description_t *eiger_desc = desc->extra;
+	const struct opt_eiger_ds_desc_t *o_eiger_desc = (struct opt_eiger_ds_desc_t *) desc;
 	int retval = 0;
 	size_t raw_data_width = 0;
 
@@ -218,7 +218,7 @@ int get_frame_from_chunk(const struct data_description_t *desc,
 		ERROR_JUMP(-1, done, message);
 	}
 
-	d_id = H5Dopen(desc->data_group_id, ds_name, H5P_DEFAULT);
+	d_id = H5Dopen(desc->data_g_id, ds_name, H5P_DEFAULT);
 	if (d_id < 0) {
 		char message[64];
 		sprintf(message, "Error opening dataset %.32s", ds_name);
@@ -271,7 +271,7 @@ int get_frame_from_chunk(const struct data_description_t *desc,
 	}
 
 	if (bslz4_decompress(
-				eiger_desc->bs_filter_params,
+				o_eiger_desc->bs_params,
 				c_bytes,
 				c_buffer,
 				raw_data_width * frame_size[1] * frame_size[2],
@@ -349,19 +349,18 @@ done:
 
 
 int get_nxs_frame(
-		const struct data_description_t *desc,
-		const struct dataset_properties_t *ds_prop,
-		int n,
-		int data_width,
+		const struct ds_desc_t *desc,
+		const int n,
+		const int data_width,
 		void *buffer) {
 	/* detector data are the two inner most indices */
 	/* TODO: handle ndims > 3 and select appropriately */
 	int retval = 0;
 	hsize_t frame_idx[3] = {n, 0, 0};
-	hsize_t frame_size[3] = {1, ds_prop->dims[1], ds_prop->dims[2]};
-	if (n < 0 || n >= ds_prop->dims[0]) {
+	hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]};
+	if (n < 0 || n >= desc->dims[0]) {
 		char message[64];
-		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) ds_prop->dims[0] - 1);
+		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) desc->dims[0] - 1);
 		ERROR_JUMP(-1, done, message);
 	}
 	retval = get_frame_simple(desc, "data", frame_idx, frame_size, data_width, buffer);
@@ -374,22 +373,21 @@ done:
 
 
 int get_dectris_eiger_frame(
-		const struct data_description_t *desc,
-		const struct dataset_properties_t *ds_prop,
+		const struct ds_desc_t *desc,
 		int n,
 		int data_width,
 		void *buffer) {
 
 	int retval = 0;
 	int block, frame_count, idx;
-	struct eiger_data_description_t *eiger_desc = desc->extra;
+	struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t*) desc;
 	char data_name[16] = {0};
 	hsize_t frame_idx[3] = {0, 0, 0};
-	hsize_t frame_size[3] = {1, ds_prop->dims[1], ds_prop->dims[2]};
+	hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]};
 
-	if (n < 0 || n >= ds_prop->dims[0]) {
+	if (n < 0 || n >= desc->dims[0]) {
 		char message[64];
-		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) ds_prop->dims[0] - 1);
+		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) desc->dims[0] - 1);
 		ERROR_JUMP(-1, done, message);
 	}
 
@@ -400,7 +398,7 @@ int get_dectris_eiger_frame(
 	idx = n - (frame_count - eiger_desc->block_sizes[block]); /* index in current block */
 	frame_idx[0] = idx;
 	sprintf(data_name, "data_%06d", block + 1);
-	retval = eiger_desc->raw_frame_func(desc, data_name, frame_idx, frame_size, data_width, buffer);
+	retval = eiger_desc->frame_func(desc, data_name, frame_idx, frame_size, data_width, buffer);
 	if (retval < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
@@ -409,7 +407,7 @@ done:
 }
 
 
-int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties) {
+int get_dectris_eiger_dataset_dims(struct ds_desc_t *desc) {
 	int retval = 0;
 	int n_datas = 0;
 	int n = 0;
@@ -418,11 +416,12 @@ int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct
 	char ds_name[16] = {0}; /* 12 chars in "data_xxxxxx\0" */
 	int *frame_counts = NULL;
 	hsize_t dims[3] = {0};
+	struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t*) desc;
 
 	/* datasets are "data_%06d % n" - need to determine how many of these there are and what the ranges are */
 
 	sprintf(ds_name, "data_%06d", n_datas + 1);
-	while (H5Lexists(desc->data_group_id, ds_name, H5P_DEFAULT) > 0) {
+	while (H5Lexists(desc->data_g_id, ds_name, H5P_DEFAULT) > 0) {
 		sprintf(ds_name, "data_%06d", ++n_datas + 1);
 	}
 
@@ -432,7 +431,7 @@ int get_dectris_eiger_dataset_dims(const struct data_description_t *desc, struct
 		hid_t ds_id, t_id, s_id;
 		hsize_t block_dims[3] = {0};
 		sprintf(ds_name, "data_%06d", n + 1);
-		ds_id = H5Dopen2(desc->data_group_id, ds_name, H5P_DEFAULT);
+		ds_id = H5Dopen2(desc->data_g_id, ds_name, H5P_DEFAULT);
 		if (ds_id < 0) {
 			char message[64];
 			sprintf("Unable to open dataset %.16s", ds_name);
@@ -481,10 +480,10 @@ loop_end:
 	if (retval < 0) {
 		free(frame_counts);
 	} else {
-		memcpy(properties->dims, dims, 3 * sizeof(*dims));
-		properties->data_width = data_width;
-		((struct eiger_data_description_t *) desc->extra)->n_data_blocks = n_datas;
-		((struct eiger_data_description_t *) desc->extra)->block_sizes = frame_counts;
+		memcpy(desc->dims, dims, 3 * sizeof(*dims));
+		desc->data_width = data_width;
+		eiger_desc->n_data_blocks = n_datas;
+		eiger_desc->block_sizes = frame_counts;
 	}
 	return retval;
 }
@@ -596,12 +595,12 @@ done:
 }
 
 
-int get_nxs_pixel_info(const struct data_description_t *desc, double *x_size, double *y_size) {
+int get_nxs_pixel_info(const struct ds_desc_t *desc, double *x_size, double *y_size) {
 	int retval = 0;
-	if (read_pixel_info(desc->det_group_id, "x_pixel_size", x_size) < 0) {
+	if (read_pixel_info(desc->det_g_id, "x_pixel_size", x_size) < 0) {
 		ERROR_JUMP(-1, done, "");
 	}
-	if (read_pixel_info(desc->det_group_id, "y_pixel_size", y_size) < 0) {
+	if (read_pixel_info(desc->det_g_id, "y_pixel_size", y_size) < 0) {
 		ERROR_JUMP(-1, done, "");
 	}
 done:
@@ -609,12 +608,12 @@ done:
 }
 
 
-int get_dectris_eiger_pixel_info(const struct data_description_t *desc, double *x_size, double *y_size) {
+int get_dectris_eiger_pixel_info(const struct ds_desc_t *desc, double *x_size, double *y_size) {
 	int retval = 0;
-	if (read_pixel_info(desc->det_group_id, "detectorSpecific/x_pixel_size", x_size) < 0) {
+	if (read_pixel_info(desc->det_g_id, "detectorSpecific/x_pixel_size", x_size) < 0) {
 		ERROR_JUMP(-1, done, "");
 	}
-	if (read_pixel_info(desc->det_group_id, "detectorSpecific/y_pixel_size", y_size) < 0) {
+	if (read_pixel_info(desc->det_g_id, "detectorSpecific/y_pixel_size", y_size) < 0) {
 		ERROR_JUMP(-1, done, "");
 	}
 done:
@@ -622,12 +621,12 @@ done:
 }
 
 
-int get_nxs_pixel_mask(const struct data_description_t *desc, int *buffer) {
+int get_nxs_pixel_mask(const struct ds_desc_t *desc, int *buffer) {
 	int retval = 0;
 	hid_t ds_id;
 	herr_t err = 0;
 
-	ds_id = H5Dopen2(desc->det_group_id, "pixel_mask", H5P_DEFAULT);
+	ds_id = H5Dopen2(desc->det_g_id, "pixel_mask", H5P_DEFAULT);
 	if (ds_id < 0) {
 		ERROR_JUMP(-1, done, "Error opening pixel_mask dataset");
 	}
@@ -644,12 +643,12 @@ done:
 }
 
 
-int get_dectris_eiger_pixel_mask(const struct data_description_t *desc, int *buffer) {
+int get_dectris_eiger_pixel_mask(const struct ds_desc_t *desc, int *buffer) {
 	int retval = 0;
 	hid_t ds_id;
 	herr_t err = 0;
 
-	ds_id = H5Dopen2(desc->det_group_id, "detectorSpecific/pixel_mask", H5P_DEFAULT);
+	ds_id = H5Dopen2(desc->det_g_id, "detectorSpecific/pixel_mask", H5P_DEFAULT);
 	if (ds_id < 0) {
 		ERROR_JUMP(-1, done, "Error opening detectorSpecific/pixel_mask");
 	}
@@ -777,7 +776,7 @@ done:
 int check_for_chunk_read(
 		hid_t g_id,
 		const char* ds_name,
-		struct optimised_eiger_data_description_t *desc) {
+		struct opt_eiger_ds_desc_t *desc) {
 
 	int retval = 0;
 	int n_filters;
@@ -843,7 +842,7 @@ int check_for_chunk_read(
 
 	if (n_filters == 1) {
 		filter = H5Pget_filter2(dcpl, 0, &filter_flags,
-				&cd_nelems, desc->bs_filter_params,
+				&cd_nelems, desc->bs_params,
 				name_len, filter_name,
 				&filter_config);
 		if (filter < 0) {
@@ -859,8 +858,9 @@ int check_for_chunk_read(
 					BS_H5_N_PARAMS, cd_nelems);
 			ERROR_JUMP(-1, done, message);
 		}
+		desc->bs_applied = 1;
 	} else {
-		desc->n_filter_params = 0;
+		desc->bs_applied = 0;
 	}
 
 	retval = 1;
@@ -872,84 +872,89 @@ done:
 	return retval;
 }
 
-int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_objects_t *visit_result) {
+int create_dataset_descriptor(struct ds_desc_t **desc, struct det_visit_objects_t *visit_result) {
 	int retval = 0;
-	data_desc->det_group_id = visit_result->nxdetector;
+	hid_t g_id, ds_id;
+	int (*pxl_func)(const struct ds_desc_t*, double*, double*);
+	int (*pxl_mask_func)(const struct ds_desc_t*, int*);
+	int (*ds_prop_func)(struct ds_desc_t*);
+	int (*frame_func)(const struct ds_desc_t*, int, int, void*);
+	void (*free_func)(struct ds_desc_t*);
+	struct ds_desc_t *output;
+
+	g_id = visit_result->nxdetector;
 
 	/* determine the pixel information location */
-	if (H5Lexists(data_desc->det_group_id, "x_pixel_size", H5P_DEFAULT) > 0 &&
-			H5Lexists(data_desc->det_group_id, "y_pixel_size", H5P_DEFAULT)) {
-		data_desc->get_pixel_properties = &get_nxs_pixel_info;
-	} else if (H5Lexists(data_desc->det_group_id, "detectorSpecific", H5P_DEFAULT) > 0 &&
-			H5Lexists(data_desc->det_group_id, "detectorSpecific/x_pixel_size", H5P_DEFAULT) > 0 &&
-			H5Lexists(data_desc->det_group_id, "detectorSpecific/y_pixel_size", H5P_DEFAULT) > 0) {
-		data_desc->get_pixel_properties = &get_dectris_eiger_pixel_info;
+	if (H5Lexists(g_id, "x_pixel_size", H5P_DEFAULT) > 0 &&
+			H5Lexists(g_id, "y_pixel_size", H5P_DEFAULT)) {
+		pxl_func = &get_nxs_pixel_info;
+	} else if (H5Lexists(g_id, "detectorSpecific", H5P_DEFAULT) > 0 &&
+			H5Lexists(g_id, "detectorSpecific/x_pixel_size", H5P_DEFAULT) > 0 &&
+			H5Lexists(g_id, "detectorSpecific/y_pixel_size", H5P_DEFAULT) > 0) {
+		pxl_func = &get_dectris_eiger_pixel_info;
 	} else {
-		data_desc->get_pixel_properties = NULL;
 		ERROR_JUMP(-1, done, "Could not locate x_pixel_size and y_pixel_size");
 	}
 
 	/* determine pixel mask location */
-	if (H5Lexists(data_desc->det_group_id, "pixel_mask", H5P_DEFAULT) > 0) {
-		data_desc->get_pixel_mask = &get_nxs_pixel_mask;
-	} else if (H5Lexists(data_desc->det_group_id, "detectorSpecific", H5P_DEFAULT) > 0 &&
-			H5Lexists(data_desc->det_group_id, "detectorSpecific/pixel_mask", H5P_DEFAULT) > 0) {
-		data_desc->get_pixel_mask = &get_dectris_eiger_pixel_mask;
+	if (H5Lexists(g_id, "pixel_mask", H5P_DEFAULT) > 0) {
+		pxl_mask_func = &get_nxs_pixel_mask;
+	} else if (H5Lexists(g_id, "detectorSpecific", H5P_DEFAULT) > 0 &&
+			H5Lexists(g_id, "detectorSpecific/pixel_mask", H5P_DEFAULT) > 0) {
+		pxl_mask_func = &get_dectris_eiger_pixel_mask;
 	} else {
-		data_desc->get_pixel_mask = NULL;
 		ERROR_JUMP(-1, done, "Could not locate pixel_mask");
 	}
 
 	/* determine where the data is stored and what strategy to use */
 	/* we select the "dectris-eiger" strategy if both are valid due to
-	 * potential confusion with the sizes of a virtual dataset (and possible
-	 * failure opening it if the library version is not up to date)
+	 * potential confusion with the sizes of a virtual dataset, possible failure
+	 * opening if we're using an old library version, and the potential to use the
+	 * optimised chunk read strategy
 	 */
 	if (H5Lexists(visit_result->nxdetector, "data_000001", H5P_DEFAULT) > 0) {
-		data_desc->data_group_id = visit_result->nxdetector;
-		data_desc->get_data_properties = &get_dectris_eiger_dataset_dims;
-		data_desc->get_data_frame = &get_dectris_eiger_frame;
+		ds_id = visit_result->nxdetector;
+		ds_prop_func = &get_dectris_eiger_dataset_dims;
+		frame_func = &get_dectris_eiger_frame;
 	} else if (H5Lexists(visit_result->nxdetector, "data", H5P_DEFAULT) > 0) {
-		data_desc->data_group_id = visit_result->nxdetector;
-		data_desc->get_data_properties = &get_nxs_dataset_dims;
-		data_desc->get_data_frame = &get_nxs_frame;
+		ds_id = visit_result->nxdetector;
+		ds_prop_func = &get_nxs_dataset_dims;
+		frame_func = &get_nxs_frame;
 	} else if (H5Lexists(visit_result->nxdata, "data_000001", H5P_DEFAULT) > 0) {
-		data_desc->data_group_id = visit_result->nxdata;
-		data_desc->get_data_properties = &get_dectris_eiger_dataset_dims;
-		data_desc->get_data_frame = &get_dectris_eiger_frame;
+		ds_id = visit_result->nxdata;
+		ds_prop_func = &get_dectris_eiger_dataset_dims;
+		frame_func = &get_dectris_eiger_frame;
 	} else if (H5Lexists(visit_result->nxdata, "data", H5P_DEFAULT) > 0) {
-		data_desc->data_group_id = visit_result->nxdata;
-		data_desc->get_data_properties = &get_nxs_dataset_dims;
-		data_desc->get_data_frame = &get_nxs_frame;
+		ds_id = visit_result->nxdata;
+		ds_prop_func = &get_nxs_dataset_dims;
+		frame_func = &get_nxs_frame;
 	} else {
-		data_desc->data_group_id = 0;
-		data_desc->get_data_properties = NULL;
-		data_desc->get_data_frame = NULL;
 		ERROR_JUMP(-1, done, "Could not locate detector dataset");
 	}
 
-	if (data_desc->get_data_properties == &get_dectris_eiger_dataset_dims) {
+
+	if (ds_prop_func == &get_dectris_eiger_dataset_dims) {
 
 		/* setup the "extra info" structs */
-		struct eiger_data_description_t *eiger_desc;
-		struct optimised_eiger_data_description_t *o_eiger_desc;
+		struct eiger_ds_desc_t *eiger_desc;
+		struct opt_eiger_ds_desc_t *o_eiger_desc;
 
 		eiger_desc = malloc(sizeof(*eiger_desc));
 		if (!eiger_desc) {
 			ERROR_JUMP(-1, done, "Memory error creating data description for Eiger");
 		}
 		memset(eiger_desc, 0, sizeof(*eiger_desc));
-		eiger_desc->raw_frame_func = &get_frame_simple;
+		eiger_desc->frame_func = &get_frame_simple;
 
 		o_eiger_desc = malloc(sizeof(*o_eiger_desc));
 		if (!o_eiger_desc) {
 			free(eiger_desc);
 			ERROR_JUMP(-1, done, "Memory error creating data description for optimised Eiger");
 		}
-		o_eiger_desc->base.raw_frame_func = &get_frame_from_chunk;
+		o_eiger_desc->base.frame_func = &get_frame_from_chunk;
 
 		/* check if we can perform the optimised chunk read */
-		retval = check_for_chunk_read(data_desc->data_group_id, "data_000001", o_eiger_desc);
+		retval = check_for_chunk_read(ds_id, "data_000001", o_eiger_desc);
 		if (retval < 0) {
 			free(o_eiger_desc);
 			free(eiger_desc);
@@ -957,27 +962,38 @@ int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_
 		}
 		if (retval) {
 			free(eiger_desc);
-			data_desc->extra = o_eiger_desc;
-			data_desc->free_extra = free_optimised_eiger_data_description;
+			*(struct opt_eiger_ds_desc_t**) desc = o_eiger_desc;
+			free_func = &free_opt_eiger_desc;
 		} else {
 			free(o_eiger_desc);
-			data_desc->extra = eiger_desc;
-			data_desc->free_extra = free_eiger_data_description;
+			*(struct eiger_ds_desc_t**) desc = eiger_desc;
+			free_func = &free_eiger_desc;
 		}
 
 	} else {
-		data_desc->free_extra = free_nxs_data_description;
+		*desc = malloc(sizeof(struct nxs_ds_desc_t));
+		free_func = &free_nxs_desc;
 	}
 
+	output = *((struct ds_desc_t **) desc);
+	output->det_g_id = g_id;
+	output->data_g_id = ds_id;
+	output->get_pixel_properties = pxl_func;
+	output->get_pixel_mask = pxl_mask_func;
+	output->get_data_frame = frame_func;
+	output->free_desc = free_func;
+
+	ds_prop_func(output);
+
 done:
 	return retval;
 }
 
 
-int extract_detector_info(
+int get_detector_info(
 		const hid_t fid,
-		struct data_description_t *data_desc,
-		struct dataset_properties_t *dataset_prop) {
+		struct ds_desc_t **desc) {
+
 	int retval = 0;
 	herr_t err = 0;
 	struct det_visit_objects_t objects = {0};
@@ -993,12 +1009,9 @@ int extract_detector_info(
 		fprintf(stderr, "WARNING: Could not locate an NXdetector entry\n");
 	}
 
-	if ((retval = fill_data_descriptor(data_desc, &objects)) < 0) {
+	if ((retval = create_dataset_descriptor(desc, &objects)) < 0) {
 		ERROR_JUMP(retval, done, "");
 	};
-	if ((retval = data_desc->get_data_properties(data_desc, dataset_prop)) < 0) {
-		ERROR_JUMP(retval, done, "");
-	}
 
 
 done:
diff --git a/src/file.h b/src/file.h
index 0733ef6..1025365 100644
--- a/src/file.h
+++ b/src/file.h
@@ -11,51 +11,40 @@
 #include "err.h"
 #include "filters.h"
 
-struct dataset_properties_t {
-	int data_width;
+
+struct ds_desc_t {
+	hid_t det_g_id;
+	hid_t data_g_id;
 	hsize_t dims[3];
+	int data_width;
+	int (*get_pixel_properties)(const struct ds_desc_t*, double*, double*);
+	int (*get_pixel_mask)(const struct ds_desc_t*, int*);
+	int (*get_data_frame)(const struct ds_desc_t*, const int, const int, void*);
+	void (*free_desc)(struct ds_desc_t*);
 };
 
-struct data_description_t {
-	hid_t det_group_id;
-	hid_t data_group_id;
-	int (*get_pixel_properties)(const struct data_description_t*, double*, double*);
-	int (*get_pixel_mask)(const struct data_description_t*, int*);
-	int (*get_data_properties)(const struct data_description_t*, struct dataset_properties_t*);
-	int (*get_data_frame)(const struct data_description_t*, const struct dataset_properties_t*, int, int, void*);
-	void *extra;
-	void (*free_extra)(struct data_description_t*);
+struct nxs_ds_desc_t {
+	struct ds_desc_t base;
 };
 
-void free_nxs_data_description(struct data_description_t *desc);
-
-struct eiger_data_description_t {
+struct eiger_ds_desc_t {
+	struct ds_desc_t base;
 	int n_data_blocks;
 	int *block_sizes;
-	int (*raw_frame_func)(const struct data_description_t*, const char*, hsize_t*, hsize_t*, int, void*);
+	int (*frame_func)(const struct ds_desc_t*, const char*, const hsize_t*, const hsize_t*, const int, void*);
 };
 
-void free_eiger_data_description(struct data_description_t *desc);
-
-struct optimised_eiger_data_description_t {
-	struct eiger_data_description_t base;
-	int n_filter_params;
-	unsigned int bs_filter_params[BS_H5_N_PARAMS];
+struct opt_eiger_ds_desc_t {
+	struct eiger_ds_desc_t base;
+	int bs_applied;
+	unsigned int bs_params[BS_H5_N_PARAMS];
 };
 
-void free_optimised_eiger_data_description(struct data_description_t *desc);
+int get_detector_info(const hid_t fid, struct ds_desc_t **desc);
 
 struct det_visit_objects_t {
 	hid_t nxdata;
 	hid_t nxdetector;
 };
 
-void clear_det_visit_objects(struct det_visit_objects_t *objects);
-
-int get_nxs_dataset_dims(const struct data_description_t *desc, struct dataset_properties_t *properties);
-
-int fill_data_descriptor(struct data_description_t *data_desc, struct det_visit_objects_t *visit_result);
-
-int extract_detector_info(const hid_t fid, struct data_description_t *data_desc, struct dataset_properties_t *ds_prop);
-
 #endif /* NXS_XDS_FILE_H */
diff --git a/src/plugin.c b/src/plugin.c
index 83ea552..c66f014 100644
--- a/src/plugin.c
+++ b/src/plugin.c
@@ -16,8 +16,7 @@
 
 
 static hid_t file_id = 0;
-static struct data_description_t data_desc = {0};
-static struct dataset_properties_t ds_prop = {0};
+static struct ds_desc_t *data_desc = NULL;
 static int *mask_buffer = NULL;
 
 
@@ -74,14 +73,14 @@ void plugin_open(
 	}
 
 	reset_error_stack();
-	retval = extract_detector_info(file_id, &data_desc, &ds_prop);
+	retval = get_detector_info(file_id, &data_desc);
 	if (retval < 0) {
 		ERROR_JUMP(-4, done, "");
 	}
 
-	mask_buffer = malloc(ds_prop.dims[1] * ds_prop.dims[2] * sizeof(int));
+	mask_buffer = malloc(data_desc->dims[1] * data_desc->dims[2] * sizeof(int));
 	if (mask_buffer) {
-		retval = data_desc.get_pixel_mask(&data_desc, mask_buffer);
+		retval = data_desc->get_pixel_mask(data_desc, mask_buffer);
 		if (retval < 0) {
 			fprintf(ERROR_OUTPUT, "WARNING: Could not read pixel mask - no masking will be applied\n");
 			dump_error_stack(ERROR_OUTPUT);
@@ -94,6 +93,10 @@ void plugin_open(
 done:
 	*error_flag = retval;
 	if (retval < 0) {
+		if ((data_desc) && (data_desc->free_desc)) {
+			data_desc->free_desc(data_desc);
+			data_desc = NULL;
+		}
 		dump_error_stack(ERROR_OUTPUT);
 	}
 }
@@ -112,15 +115,15 @@ void plugin_get_header(
 	reset_error_stack();
 	fill_info_array(info);
 
-	err = data_desc.get_pixel_properties(&data_desc, &x_pixel_size, &y_pixel_size);
+	err = data_desc->get_pixel_properties(data_desc, &x_pixel_size, &y_pixel_size);
 	if (err < 0) {
 		ERROR_JUMP(err, done, "Failed to retrieve pixel information");
 	}
 
-	*nx = ds_prop.dims[2];
-	*ny = ds_prop.dims[1];
-	*nbytes = ds_prop.data_width;
-	*number_of_frames = ds_prop.dims[0];
+	*nx = data_desc->dims[2];
+	*ny = data_desc->dims[1];
+	*nbytes = data_desc->data_width;
+	*number_of_frames = data_desc->dims[0];
 	*qx = (float) x_pixel_size;
 	*qy = (float) y_pixel_size;
 
@@ -141,13 +144,13 @@ void plugin_get_data(
 	int retval = 0;
 	reset_error_stack();
 	fill_info_array(info);
-	if (data_desc.get_data_frame(&data_desc, &ds_prop, (*frame_number) - 1, sizeof(int), data_array) < 0) {
+	if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, sizeof(int), data_array) < 0) {
 		char message[64] = {0};
 		sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
 		ERROR_JUMP(-2, done, message);
 	}
 	if (mask_buffer) {
-		apply_mask(data_array, mask_buffer, ds_prop.dims[1] * ds_prop.dims[2]);
+		apply_mask(data_array, mask_buffer, data_desc->dims[1] * data_desc->dims[2]);
 	}
 
 done:
@@ -168,7 +171,10 @@ void plugin_close(int *error_flag) {
 	file_id = 0;
 
 	if (mask_buffer) free(mask_buffer);
-	if (data_desc.free_extra) data_desc.free_extra(&data_desc);
+	if (data_desc->free_desc) {
+		data_desc->free_desc(data_desc);
+		data_desc = NULL;
+	}
 	if (H5close() < 0) {
 		*error_flag = -1;
 	}
diff --git a/src/test.c b/src/test.c
index ec6d669..db5d141 100644
--- a/src/test.c
+++ b/src/test.c
@@ -38,8 +38,7 @@ int main(int argc, char **argv) {
 	int err = 0;
 	int retval = 0;
 	char *test_file = "";
-	struct data_description_t desc = {0};
-	struct dataset_properties_t prop = {0};
+	struct ds_desc_t *desc;
 	int dims[3] = {0};
 	hid_t fid = 0;
 	int frame_idx = 0;
@@ -58,13 +57,13 @@ int main(int argc, char **argv) {
 	fid = H5Fopen(test_file, H5F_ACC_RDONLY, H5P_DEFAULT);
 	if (fid < 0) ERROR_JUMP(-1, done, "Error opening file");
 
-	err = extract_detector_info(fid, &desc, &prop);
+	err = get_detector_info(fid, &desc);
 	if (err < 0) {
 		ERROR_JUMP(err, done, "");
 	}
-	dims[0] = prop.dims[0];
-	dims[1] = prop.dims[1];
-	dims[2] = prop.dims[2];
+	dims[0] = desc->dims[0];
+	dims[1] = desc->dims[1];
+	dims[2] = desc->dims[2];
 
 	printf("Dims: %d, %d, %d\n", dims[0], dims[1], dims[2]);
 
@@ -72,13 +71,13 @@ int main(int argc, char **argv) {
 	if (!mask) {
 		ERROR_JUMP(err, done, "Failed to allocate space for pixel mask");
 	}
-	err = desc.get_pixel_mask(&desc, mask);
+	err = desc->get_pixel_mask(desc, mask);
 	if (err < 0) {
 		ERROR_JUMP(err, done, "");
 	}
 
 	data = malloc(dims[1] * dims[2] * sizeof(*data));
-	err = desc.get_data_frame(&desc, &prop, frame_idx, sizeof(*data), data);
+	err = desc->get_data_frame(desc, frame_idx, sizeof(*data), data);
 	if (err < 0) {
 		ERROR_JUMP(err, done, "");
 	}

From f6ba8eb2aae946342852a30565757862fa2130ea Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Fri, 17 Aug 2018 17:34:54 +0100
Subject: [PATCH 5/8] Use the dataset's datatype for H5Dread, convert manually
 later.

Hopefully this results in less work for the HDF5 library (which would
not be done in parallel) and ensures the conversion is consistent
across all data retrieval strategies.
---
 src/file.c   | 123 +++++----------------------------------------------
 src/file.h   |   4 +-
 src/plugin.c |  49 +++++++++++++++++++-
 src/test.c   |  24 +++++++++-
 4 files changed, 84 insertions(+), 116 deletions(-)

diff --git a/src/file.c b/src/file.c
index 58e495e..018061f 100644
--- a/src/file.c
+++ b/src/file.c
@@ -14,22 +14,6 @@
 #include "err.h"
 #include "filters.h"
 
-hid_t h5_int_type_from_width(const int width) {
-	if (width == sizeof(char)) {
-		return H5T_NATIVE_SCHAR;
-	} else if (width == sizeof(short)) {
-		return H5T_NATIVE_SHORT;
-	} else if (width == sizeof(int)) {
-		return H5T_NATIVE_INT;
-	} else if (width == sizeof(long)) {
-		return H5T_NATIVE_LONG;
-	} else if (width == sizeof(long long)) {
-		return H5T_NATIVE_LLONG;
-	} else {
-		/* TODO: error */
-		return -1;
-	}
-}
 
 void clear_det_visit_objects(struct det_visit_objects_t *objects) {
 	if (objects->nxdata) {
@@ -64,7 +48,6 @@ void free_opt_eiger_desc(struct ds_desc_t *desc) {
 }
 
 
-
 double scale_from_units(const char* unit_string) {
 	if (strcasecmp("m", unit_string) == 0 ||
 			strcasecmp("metres", unit_string) == 0 ||
@@ -143,7 +126,6 @@ int get_frame_simple(const struct ds_desc_t *desc,
 		const char *name,
 		const hsize_t *frame_idx,
 		const hsize_t *frame_size,
-		const int data_width,
 		void *buffer) {
 
 	int retval = 0;
@@ -162,6 +144,10 @@ int get_frame_simple(const struct ds_desc_t *desc,
 	if (s_id <= 0) {
 		ERROR_JUMP(-1, close_dataset, "Error getting dataspace");
 	}
+	t_id = H5Dget_type(ds_id);
+	if (t_id <= 0) {
+		ERROR_JUMP(-1, close_type, "Error retrieving datatype");
+	}
 	err = H5Sselect_hyperslab(s_id, H5S_SELECT_SET, frame_idx, NULL, frame_size, NULL);
 	if (err < 0) {
 		ERROR_JUMP(-1, close_space, "Error seleting hyperslab");
@@ -171,12 +157,6 @@ int get_frame_simple(const struct ds_desc_t *desc,
 		ERROR_JUMP(-1, close_space, "Could not create dataspace");
 	}
 
-	t_id = h5_int_type_from_width(data_width);
-	if (t_id < 0) {
-		char message[64];
-		sprintf(message, "Could not infer signed integer from width %d", data_width);
-		ERROR_JUMP(-1, close_mspace, message);
-	}
 	err = H5Dread(ds_id, t_id, ms_id, s_id, H5P_DEFAULT, buffer);
 	if (err < 0) {
 		ERROR_JUMP(-1, close_mspace, "Error reading dataset");
@@ -186,6 +166,8 @@ close_mspace:
 	H5Sclose(ms_id);
 close_space:
 	H5Sclose(s_id);
+close_type:
+	H5Tclose(t_id);
 close_dataset:
 	H5Dclose(ds_id);
 done:
@@ -197,19 +179,15 @@ int get_frame_from_chunk(const struct ds_desc_t *desc,
 		const char *ds_name,
 		const hsize_t *frame_idx,
 		const hsize_t *frame_size,
-		const int requested_data_width,
 		void *buffer) {
 
 	hid_t d_id = 0;
-	hid_t t_id = 0;
 	hsize_t c_offset[3] = {frame_idx[0], 0, 0};
 	uint32_t c_filter_mask = 0;
 	hsize_t c_bytes;
 	void *c_buffer = NULL;
-	void *u_buffer = NULL;
 	const struct opt_eiger_ds_desc_t *o_eiger_desc = (struct opt_eiger_ds_desc_t *) desc;
 	int retval = 0;
-	size_t raw_data_width = 0;
 
 	if (frame_idx[1] != 0 || frame_idx[2] != 0) {
 		char message[64];
@@ -225,13 +203,6 @@ int get_frame_from_chunk(const struct ds_desc_t *desc,
 		ERROR_JUMP(-1, done, message);
 	}
 
-	/* TODO: pass down the dataset's data_width so we don't need to do this */
-	t_id = H5Dget_type(d_id);
-	if (t_id < 0) {
-		ERROR_JUMP(-1, done, "Error getting datatype");
-	}
-
-	raw_data_width = H5Tget_size(t_id);
 
 	if (H5Dget_chunk_storage_size(d_id, c_offset, &c_bytes) < 0) {
 		char message[96];
@@ -259,90 +230,20 @@ int get_frame_from_chunk(const struct ds_desc_t *desc,
 		ERROR_JUMP(-1, done, message);
 	}
 
-	if (raw_data_width == requested_data_width) {
-		/* can output straight to the target buffer */
-		u_buffer = buffer;
-	} else {
-		/* must perform a type conversion, so require a second buffer */
-		u_buffer = malloc(raw_data_width * frame_size[1] * frame_size[2]);
-		if (!u_buffer) {
-			ERROR_JUMP(-1, done, "Unable to allocate output buffer for bslz4 decompression");
-		}
-	}
-
 	if (bslz4_decompress(
 				o_eiger_desc->bs_params,
 				c_bytes,
 				c_buffer,
-				raw_data_width * frame_size[1] * frame_size[2],
-				u_buffer) < 0) {
+				desc->data_width * frame_size[1] * frame_size[2],
+				buffer) < 0) {
 		char message[128];
 		sprintf(message, "Error processing chunk %llu from %.32s with bitshuffle_lz4",
 				frame_idx[0], ds_name);
 		ERROR_JUMP(-1, done, message);
 	}
 
-
-	if (u_buffer != buffer) {
-		int could_convert = 1;
-		/* transfer data to output buffer, performing data conversion as required */
-		/* TODO: decide how conversion of data should work
-		 * Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */
-		if (requested_data_width == sizeof(int8_t)) {
-			if (raw_data_width == sizeof(int16_t)) {
-				CONVERT_BUFFER(u_buffer, int16_t, buffer, int8_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int32_t)) {
-				CONVERT_BUFFER(u_buffer, int32_t, buffer, int8_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int64_t)) {
-				CONVERT_BUFFER(u_buffer, int64_t, buffer, int8_t, frame_size[1] * frame_size[2]);
-			} else {
-				could_convert = 0;
-			}
-		} else if (requested_data_width == sizeof(int16_t)) {
-			if (raw_data_width == sizeof(int8_t)) {
-				CONVERT_BUFFER(u_buffer, int8_t, buffer, int16_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int32_t)) {
-				CONVERT_BUFFER(u_buffer, int32_t, buffer, int16_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int64_t)) {
-				CONVERT_BUFFER(u_buffer, int64_t, buffer, int16_t, frame_size[1] * frame_size[2]);
-			} else {
-				could_convert = 0;
-			}
-		} else if (requested_data_width == sizeof(int32_t)) {
-			if (raw_data_width == sizeof(int8_t)) {
-				CONVERT_BUFFER(u_buffer, int8_t, buffer, int32_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int16_t)) {
-				CONVERT_BUFFER(u_buffer, int16_t, buffer, int32_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int64_t)) {
-				CONVERT_BUFFER(u_buffer, int64_t, buffer, int32_t, frame_size[1] * frame_size[2]);
-			} else {
-				could_convert = 0;
-			}
-		} else if (requested_data_width == sizeof(int64_t)) {
-			if (raw_data_width == sizeof(int8_t)) {
-				CONVERT_BUFFER(u_buffer, int8_t, buffer, int64_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int16_t)) {
-				CONVERT_BUFFER(u_buffer, int16_t, buffer, int64_t, frame_size[1] * frame_size[2]);
-			} else if (raw_data_width == sizeof(int32_t)) {
-				CONVERT_BUFFER(u_buffer, int32_t, buffer, int64_t, frame_size[1] * frame_size[2]);
-			} else {
-				could_convert = 0;
-			}
-		} else {
-			could_convert = 0;
-		}
-		if (!could_convert) {
-			char message[128];
-			sprintf(message, "Unsupported conversion of data width %lu to %d for %.32s",
-					raw_data_width, requested_data_width, ds_name);
-			ERROR_JUMP(-1, done, message);
-		}
-	}
-
 done:
 	if (c_buffer) free(c_buffer);
-	if (u_buffer && (u_buffer != buffer)) free(u_buffer);
-	if (t_id) H5Tclose(t_id);
 	if (d_id) H5Dclose(d_id);
 	return retval;
 }
@@ -351,7 +252,6 @@ done:
 int get_nxs_frame(
 		const struct ds_desc_t *desc,
 		const int n,
-		const int data_width,
 		void *buffer) {
 	/* detector data are the two inner most indices */
 	/* TODO: handle ndims > 3 and select appropriately */
@@ -363,7 +263,7 @@ int get_nxs_frame(
 		sprintf(message, "Selected frame %d is out of range valid range [0, %d]", n, (int) desc->dims[0] - 1);
 		ERROR_JUMP(-1, done, message);
 	}
-	retval = get_frame_simple(desc, "data", frame_idx, frame_size, data_width, buffer);
+	retval = get_frame_simple(desc, "data", frame_idx, frame_size, buffer);
 	if (retval < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
@@ -375,7 +275,6 @@ done:
 int get_dectris_eiger_frame(
 		const struct ds_desc_t *desc,
 		int n,
-		int data_width,
 		void *buffer) {
 
 	int retval = 0;
@@ -398,7 +297,7 @@ int get_dectris_eiger_frame(
 	idx = n - (frame_count - eiger_desc->block_sizes[block]); /* index in current block */
 	frame_idx[0] = idx;
 	sprintf(data_name, "data_%06d", block + 1);
-	retval = eiger_desc->frame_func(desc, data_name, frame_idx, frame_size, data_width, buffer);
+	retval = eiger_desc->frame_func(desc, data_name, frame_idx, frame_size, buffer);
 	if (retval < 0) {
 		ERROR_JUMP(retval, done, "");
 	}
@@ -878,7 +777,7 @@ int create_dataset_descriptor(struct ds_desc_t **desc, struct det_visit_objects_
 	int (*pxl_func)(const struct ds_desc_t*, double*, double*);
 	int (*pxl_mask_func)(const struct ds_desc_t*, int*);
 	int (*ds_prop_func)(struct ds_desc_t*);
-	int (*frame_func)(const struct ds_desc_t*, int, int, void*);
+	int (*frame_func)(const struct ds_desc_t*, int, void*);
 	void (*free_func)(struct ds_desc_t*);
 	struct ds_desc_t *output;
 
diff --git a/src/file.h b/src/file.h
index 1025365..a7e0ae9 100644
--- a/src/file.h
+++ b/src/file.h
@@ -19,7 +19,7 @@ struct ds_desc_t {
 	int data_width;
 	int (*get_pixel_properties)(const struct ds_desc_t*, double*, double*);
 	int (*get_pixel_mask)(const struct ds_desc_t*, int*);
-	int (*get_data_frame)(const struct ds_desc_t*, const int, const int, void*);
+	int (*get_data_frame)(const struct ds_desc_t*, const int, void*);
 	void (*free_desc)(struct ds_desc_t*);
 };
 
@@ -31,7 +31,7 @@ struct eiger_ds_desc_t {
 	struct ds_desc_t base;
 	int n_data_blocks;
 	int *block_sizes;
-	int (*frame_func)(const struct ds_desc_t*, const char*, const hsize_t*, const hsize_t*, const int, void*);
+	int (*frame_func)(const struct ds_desc_t*, const char*, const hsize_t*, const hsize_t*, void*);
 };
 
 struct opt_eiger_ds_desc_t {
diff --git a/src/plugin.c b/src/plugin.c
index c66f014..1989157 100644
--- a/src/plugin.c
+++ b/src/plugin.c
@@ -7,6 +7,7 @@
 #include <hdf5.h>
 #include <stdlib.h>
 #include "file.h"
+#include "filters.h"
 #include "plugin.h"
 
 
@@ -42,6 +43,29 @@ void apply_mask(int *data, int *mask, int size) {
 	}
 }
 
+int convert_data_to_int(void *in_buffer, int d_width, int *out_buffer, int length) {
+	/* transfer data to output buffer, performing data conversion as required */
+	int retval = 0;
+	/* TODO: decide how conversion of data should work
+	 * Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */
+	if (d_width == sizeof(signed char)) {
+		CONVERT_BUFFER(in_buffer, signed char, out_buffer, int, length);
+	} else if (d_width == sizeof(short)) {
+		CONVERT_BUFFER(in_buffer, short, out_buffer, int, length);
+	} else if (d_width == sizeof(int)) {
+		CONVERT_BUFFER(in_buffer, int, out_buffer, int, length);
+	} else if (d_width == sizeof(long int)) {
+		CONVERT_BUFFER(in_buffer, long int, out_buffer, int, length);
+	} else if (d_width == sizeof(long long int)) {
+		CONVERT_BUFFER(in_buffer, long long int, out_buffer, int, length);
+	} else {
+		char message[128];
+		sprintf(message, "Unsupported conversion of data width %d to %ld (int)", d_width, sizeof(int));
+		ERROR_JUMP(-1, done, message);
+	}
+done:
+	return retval;
+}
 
 #ifdef __cplusplus
 extern "C" {
@@ -141,14 +165,36 @@ void plugin_get_data(
 		int *data_array,
 		int info[1024],
 		int *error_flag) {
+
 	int retval = 0;
+	int frame_size_px = data_desc->dims[1] * data_desc->dims[2];
 	reset_error_stack();
 	fill_info_array(info);
-	if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, sizeof(int), data_array) < 0) {
+
+	void *buffer = NULL;
+	if (sizeof(*data_array) == data_desc->data_width) {
+		buffer = data_array;
+	} else {
+		buffer = malloc(data_desc->data_width * frame_size_px);
+		if (!buffer) {
+			ERROR_JUMP(-1, done, "Unable to allocate data buffer");
+		}
+	}
+
+	if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, buffer) < 0) {
 		char message[64] = {0};
 		sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
 		ERROR_JUMP(-2, done, message);
 	}
+
+	if (buffer != data_array) {
+		if (convert_data_to_int(buffer, data_desc->data_width, data_array, frame_size_px) < 0) {
+			char message[64];
+			sprintf(message, "Error converting data for frame %d", *frame_number);
+			ERROR_JUMP(-2, done, message);
+		}
+	}
+
 	if (mask_buffer) {
 		apply_mask(data_array, mask_buffer, data_desc->dims[1] * data_desc->dims[2]);
 	}
@@ -158,6 +204,7 @@ done:
 	if (retval < 0) {
 		dump_error_stack(ERROR_OUTPUT);
 	}
+	if (buffer && (buffer != data_array)) free(buffer);
 }
 
 
diff --git a/src/test.c b/src/test.c
index db5d141..b29b2ce 100644
--- a/src/test.c
+++ b/src/test.c
@@ -44,6 +44,7 @@ int main(int argc, char **argv) {
 	int frame_idx = 0;
 	int *mask = NULL;
 	int *data = NULL;
+	void *buffer = NULL;
 
 	init_error_handling();
 	if (init_h5_error_handling() < 0) {
@@ -77,11 +78,31 @@ int main(int argc, char **argv) {
 	}
 
 	data = malloc(dims[1] * dims[2] * sizeof(*data));
-	err = desc->get_data_frame(desc, frame_idx, sizeof(*data), data);
+	if (sizeof(*data) != desc->data_width) {
+		buffer = malloc(dims[1] * dims[2] * desc->data_width);
+	} else {
+		buffer = data;
+	}
+
+	err = desc->get_data_frame(desc, frame_idx, buffer);
 	if (err < 0) {
 		ERROR_JUMP(err, done, "");
 	}
 
+	if (buffer != data) {
+		if (desc->data_width == sizeof(signed char)) {
+			CONVERT_BUFFER(buffer, signed char, data, int, dims[1] * dims[2]);
+		} else if (desc->data_width == sizeof(short)) {
+			CONVERT_BUFFER(buffer, short, data, int, dims[1] * dims[2]);
+		} else if (desc->data_width == sizeof(int)) {
+			CONVERT_BUFFER(buffer, int, data, int, dims[1] * dims[2]);
+		} else if (desc->data_width == sizeof(long int)) {
+			CONVERT_BUFFER(buffer, long int, data, int, dims[1] * dims[2]);
+		} else if (desc->data_width == sizeof(long long int)) {
+			CONVERT_BUFFER(buffer, long long int, data, int, dims[1] * dims[2]);
+		}
+	}
+
 	apply_mask(data, mask, dims[1] * dims[2]);
 	{
 		int i, j;
@@ -100,6 +121,7 @@ int main(int argc, char **argv) {
 done:
 	if (fid > 0) H5Fclose(fid);
 	if (data) free(data);
+	if (buffer && (data != buffer)) free(buffer);
 	if (mask) free(mask);
 	if (retval != 0) dump_error_stack(stderr);
 	return retval;

From 5c0b6e83664e390cb8542d2b9ba3fdd92318d959 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Fri, 17 Aug 2018 17:50:54 +0100
Subject: [PATCH 6/8] Check if the bitshuffle filter was applied before
 decompressing

---
 src/file.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/file.c b/src/file.c
index 018061f..20ca72b 100644
--- a/src/file.c
+++ b/src/file.c
@@ -215,12 +215,16 @@ int get_frame_from_chunk(const struct ds_desc_t *desc,
 		ERROR_JUMP(-1, done, message);
 	}
 
-	c_buffer = malloc(c_bytes);
-	if (!c_buffer) {
-		char message[128];
-		sprintf(message, "Unable to allocate chunk buffer for dataset %.32s - frame %llu, size %llu bytes",
-				ds_name, frame_idx[0], c_bytes);
-		ERROR_JUMP(-1, done, message);
+	if (o_eiger_desc->bs_applied) {
+		c_buffer = malloc(c_bytes);
+		if (!c_buffer) {
+			char message[128];
+			sprintf(message, "Unable to allocate chunk buffer for dataset %.32s - frame %llu, size %llu bytes",
+					ds_name, frame_idx[0], c_bytes);
+			ERROR_JUMP(-1, done, message);
+		}
+	} else {
+		c_buffer = buffer;
 	}
 
 	if (H5DOread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) < 0) {
@@ -230,20 +234,22 @@ int get_frame_from_chunk(const struct ds_desc_t *desc,
 		ERROR_JUMP(-1, done, message);
 	}
 
-	if (bslz4_decompress(
-				o_eiger_desc->bs_params,
-				c_bytes,
-				c_buffer,
-				desc->data_width * frame_size[1] * frame_size[2],
-				buffer) < 0) {
-		char message[128];
-		sprintf(message, "Error processing chunk %llu from %.32s with bitshuffle_lz4",
-				frame_idx[0], ds_name);
-		ERROR_JUMP(-1, done, message);
+	if (o_eiger_desc->bs_applied) {
+		if (bslz4_decompress(
+					o_eiger_desc->bs_params,
+					c_bytes,
+					c_buffer,
+					desc->data_width * frame_size[1] * frame_size[2],
+					buffer) < 0) {
+			char message[128];
+			sprintf(message, "Error processing chunk %llu from %.32s with bitshuffle_lz4",
+					frame_idx[0], ds_name);
+			ERROR_JUMP(-1, done, message);
+		}
 	}
 
 done:
-	if (c_buffer) free(c_buffer);
+	if (c_buffer && (c_buffer != buffer)) free(c_buffer);
 	if (d_id) H5Dclose(d_id);
 	return retval;
 }

From d433e6b1d0a8bb38c5c799a9d695c37ac114a4a2 Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Mon, 20 Aug 2018 10:38:56 +0100
Subject: [PATCH 7/8] Apply mask at same time as data conversion

---
 src/filters.h |  9 -------
 src/plugin.c  | 75 ++++++++++++++++++++++++++++++++-------------------
 src/test.c    | 42 ++++++++++++++++-------------
 3 files changed, 72 insertions(+), 54 deletions(-)

diff --git a/src/filters.h b/src/filters.h
index 7996783..caa9a37 100644
--- a/src/filters.h
+++ b/src/filters.h
@@ -11,15 +11,6 @@
 #define BS_H5_PARAM_LZ4_COMPRESS 2
 
 
-/* Perform type conversion during buffer copy */
-#define CONVERT_BUFFER(in, t_in, out, t_out, size) \
-{ \
-    t_in *pin = in; \
-    t_out *pout = out; \
-    t_in *end = pin + size; \
-    while (pin < end) *pout++ = (t_out) *pin++; \
-}
-
 
 int bslz4_decompress(
 		const unsigned int* bs_params,
diff --git a/src/plugin.c b/src/plugin.c
index 1989157..ef5be54 100644
--- a/src/plugin.c
+++ b/src/plugin.c
@@ -16,6 +16,38 @@
 #define ERROR_OUTPUT stderr
 
 
+
+/* mask bits loosely based on what Neggia does and what NeXus says should be done */
+/* basically - anything in the low byte (& 0xFF) means "ignore this" */
+/* Neggia usses the value -2 if bit 1, 2 or 3 are set */
+#define COPY_AND_MASK(in, out, size, mask) \
+{ \
+	int i; \
+	if (mask) { \
+		for (i = 0; i < size; ++i) { \
+			out[i] = in[i]; \
+			if (mask[i] & 0xFF) out[i] = -1; \
+			if (mask[i] & 30) out[i] = -2; \
+		} \
+	} else { \
+		for (i = 0; i < size; i++) { \
+			out[i] = in[i]; \
+		} \
+	} \
+}
+
+#define APPLY_MASK(buffer, mask, size) \
+{ \
+	int i; \
+	if (mask) { \
+		for (i = 0; i < size; ++i) { \
+			if (mask[i] & 0xFF) buffer[i] = -1; \
+			if (mask[i] & 30) buffer[i] = -2; \
+		} \
+	} \
+}
+
+
 static hid_t file_id = 0;
 static struct ds_desc_t *data_desc = NULL;
 static int *mask_buffer = NULL;
@@ -29,35 +61,26 @@ void fill_info_array(int info[1024]) {
 	info[4] = VERSION_TIMESTAMP;
 }
 
-void apply_mask(int *data, int *mask, int size) {
-	int *dptr, *mptr;
-	dptr = data;
-	mptr = mask;
-	while (dptr < data + size && mptr < mask + size) {
-		/* mask bits loosely based on what Neggia does and what NeXus says should be done */
-		/* basically - anything in the low byte (& 0xFF) means "ignore this" */
-		if (*mptr & 0x01) *dptr = -1;
-		if (*mptr & 0xFE) *dptr = -2;
-		dptr++;
-		mptr++;
-	}
-}
-
-int convert_data_to_int(void *in_buffer, int d_width, int *out_buffer, int length) {
+int convert_to_int_and_mask(void *in_buffer, int d_width, int *out_buffer, int length, int *mask) {
 	/* transfer data to output buffer, performing data conversion as required */
 	int retval = 0;
-	/* TODO: decide how conversion of data should work
-	 * Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */
+	/* TODO: decide how conversion of data should work */
+	/* Should we sign extend? Neggia doesn't (casts from uint*), but may be more intuitive */
 	if (d_width == sizeof(signed char)) {
-		CONVERT_BUFFER(in_buffer, signed char, out_buffer, int, length);
+		signed char *in = in_buffer;
+		COPY_AND_MASK(in, out_buffer, length, mask);
 	} else if (d_width == sizeof(short)) {
-		CONVERT_BUFFER(in_buffer, short, out_buffer, int, length);
+		short *in = in_buffer;
+		COPY_AND_MASK(in, out_buffer, length, mask);
 	} else if (d_width == sizeof(int)) {
-		CONVERT_BUFFER(in_buffer, int, out_buffer, int, length);
+		int *in = in_buffer;
+		COPY_AND_MASK(in, out_buffer, length, mask);
 	} else if (d_width == sizeof(long int)) {
-		CONVERT_BUFFER(in_buffer, long int, out_buffer, int, length);
+		long int *in = in_buffer;
+		COPY_AND_MASK(in, out_buffer, length, mask);
 	} else if (d_width == sizeof(long long int)) {
-		CONVERT_BUFFER(in_buffer, long long int, out_buffer, int, length);
+		long long int *in = in_buffer;
+		COPY_AND_MASK(in, out_buffer, length, mask);
 	} else {
 		char message[128];
 		sprintf(message, "Unsupported conversion of data width %d to %ld (int)", d_width, sizeof(int));
@@ -188,15 +211,13 @@ void plugin_get_data(
 	}
 
 	if (buffer != data_array) {
-		if (convert_data_to_int(buffer, data_desc->data_width, data_array, frame_size_px) < 0) {
+		if (convert_to_int_and_mask(buffer, data_desc->data_width, data_array, frame_size_px, mask_buffer) < 0) {
 			char message[64];
 			sprintf(message, "Error converting data for frame %d", *frame_number);
 			ERROR_JUMP(-2, done, message);
 		}
-	}
-
-	if (mask_buffer) {
-		apply_mask(data_array, mask_buffer, data_desc->dims[1] * data_desc->dims[2]);
+	} else {
+		APPLY_MASK(data_array, mask_buffer, frame_size_px);
 	}
 
 done:
diff --git a/src/test.c b/src/test.c
index b29b2ce..68a7ef9 100644
--- a/src/test.c
+++ b/src/test.c
@@ -4,20 +4,22 @@
 #include "file.h"
 #include "err.h"
 
-
-void apply_mask(int *data, int *mask, int size) {
-	int *dptr, *mptr;
-	dptr = data;
-	mptr = mask;
-	while (dptr < data + size && mptr < mask + size) {
-		if (*mptr & 0x01) *dptr = -1;
-		if (*mptr & 0xFE) *dptr = -2;
-		dptr++;
-		mptr++;
-	}
+#define COPY_AND_MASK(in, out, size, mask) \
+{ \
+	int i; \
+	if (mask) { \
+		for (i = 0; i < size; ++i) { \
+			out[i] = in[i]; \
+			if (mask[i] & 0xFE) out[i] = -2; \
+			if (mask[i] & 0x01) out[i] = -1; \
+		} \
+	} else { \
+		for (i = 0; i < size; i++) { \
+			out[i] = in[i]; \
+		} \
+	} \
 }
 
-
 int parse_args(int argc, char **argv, char **file_name, int *frame_idx) {
 	int retval = 0;
 	if (argc == 2) {
@@ -91,19 +93,23 @@ int main(int argc, char **argv) {
 
 	if (buffer != data) {
 		if (desc->data_width == sizeof(signed char)) {
-			CONVERT_BUFFER(buffer, signed char, data, int, dims[1] * dims[2]);
+			signed char *in = buffer;
+			COPY_AND_MASK(in, data, dims[1] * dims[2], mask);
 		} else if (desc->data_width == sizeof(short)) {
-			CONVERT_BUFFER(buffer, short, data, int, dims[1] * dims[2]);
+			short *in = buffer;
+			COPY_AND_MASK(in, data, dims[1] * dims[2], mask);
 		} else if (desc->data_width == sizeof(int)) {
-			CONVERT_BUFFER(buffer, int, data, int, dims[1] * dims[2]);
+			int *in = buffer;
+			COPY_AND_MASK(in, data, dims[1] * dims[2], mask);
 		} else if (desc->data_width == sizeof(long int)) {
-			CONVERT_BUFFER(buffer, long int, data, int, dims[1] * dims[2]);
+			long int *in = buffer;
+			COPY_AND_MASK(in, data, dims[1] * dims[2], mask);
 		} else if (desc->data_width == sizeof(long long int)) {
-			CONVERT_BUFFER(buffer, long long int, data, int, dims[1] * dims[2]);
+			long long int *in = buffer;
+			COPY_AND_MASK(in, data, dims[1] * dims[2], mask);
 		}
 	}
 
-	apply_mask(data, mask, dims[1] * dims[2]);
 	{
 		int i, j;
 		int max_i = 30;

From 78d49aa161f9a4befd2f3f67d75d9c33e1ab695e Mon Sep 17 00:00:00 2001
From: Charles Mita <charles.mita@diamond.ac.uk>
Date: Tue, 9 Oct 2018 14:12:30 +0100
Subject: [PATCH 8/8] Add -std=c89 to compile flags

Helps with some of the fixed-width typedefs in the bitshuffle library.
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8fd50ff..ae5825a 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ BSLZ4_BUILD_DIR = ./bslz4/build
 BSLZ4_INC_DIR = $(BSLZ4_SRC_DIR)
 
 CC=h5cc
-CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR)
+CFLAGS=-Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=c89
 
 .PHONY: all
 all: plugin example test_plugin