Don't hide symbols in the shared library

README: Update
CI: Dedicated upload python script
2026-04-08 16:03:30 +02:00 · 2026-04-08 15:15:22 +02:00 · 2026-04-08 14:58:53 +02:00 · 2026-04-08 14:51:20 +02:00 · 2026-04-08 14:48:56 +02:00 · 2026-04-08 14:46:36 +02:00
20 changed files with 4624 additions and 1475 deletions
@@ -0,0 +1,35 @@
+name: Build Packages
+
+on:
+  push:
+    branches:
+      - '**'
+    tags:
+      - '**'
+
+jobs:
+  build-library:
+    name: Build
+    runs-on: durin
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build library
+        shell: bash
+        run: |
+          mkdir -p build
+          cd build
+          cmake -DCMAKE_BUILD_TYPE=Release ..
+          make -j
+
+      - name: Upload release assets to Gitea
+        if: github.ref_type == 'tag'
+        shell: bash
+        env:
+          GITEA_TOKEN: ${{ secrets.PIP_REPOSITORY_API_TOKEN }}
+        run: |
+          set -euo pipefail
+          python tools/gitea_release_upload.py \
+            "${{ github.server_url }}" \
+            "${{ github.repository }}" \
+            "${{ github.ref_name }}"
@@ -0,0 +1,56 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 3.19)
+
+PROJECT(durin VERSION 1.0.0 LANGUAGES C)
+
+include(FetchContent)
+
+SET(CMAKE_C_FLAGS_RELEASE "-O3")
+
+SET(CMAKE_C_STANDARD 99)
+SET(CMAKE_C_STANDARD_REQUIRED ON)
+SET(CMAKE_C_EXTENSIONS OFF)
+
+SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+SET(BUILD_SHARED_LIBS OFF)
+
+set(HDF5_USE_STATIC_LIBRARIES TRUE)
+
+SET(HDF5_BUILD_HL_LIB OFF)
+
+SET(HDF5_ENABLE_THREADSAFE ON)
+SET(HDF5_ENABLE_SZIP_SUPPORT OFF)
+SET(HDF5_ENABLE_SZIP_ENCODING OFF)
+SET(HDF5_BUILD_EXAMPLES OFF)
+SET(HDF5_BUILD_CPP_LIB OFF)
+SET(HDF5_ENABLE_Z_LIB_SUPPORT OFF)
+SET(HDF5_EXTERNALLY_CONFIGURED 1)
+
+INCLUDE_DIRECTORIES(bslz4/src)
+
+FetchContent_Declare(hdf5
+        URL https://github.com/HDFGroup/hdf5/releases/download/hdf5_1.14.6/hdf5-1.14.6.tar.gz
+        DOWNLOAD_EXTRACT_TIMESTAMP FALSE
+        EXCLUDE_FROM_ALL)
+
+FetchContent_MakeAvailable(hdf5)
+
+ADD_LIBRARY(durin-plugin SHARED
+        src/plugin.c src/plugin.h
+        src/err.c src/err.h
+        src/filters.c src/filters.h
+        src/file.c src/file.h bslz4/src/bitshuffle.c bslz4/src/bitshuffle.h
+        bslz4/src/bitshuffle_core.c bslz4/src/bitshuffle_core.h
+        bslz4/src/bitshuffle_internals.h
+        bslz4/src/bshuf_h5filter.c bslz4/src/bshuf_h5filter.h
+        bslz4/src/iochain.c bslz4/src/iochain.h
+        bslz4/src/lz4.c bslz4/src/lz4.h
+        )
+
+set_target_properties(durin-plugin PROPERTIES VERSION 1.0.0)
+
+TARGET_COMPILE_DEFINITIONS(durin-plugin PRIVATE
+        H5_USE_110_API
+        USE_BITSHUFFLE)
+
+TARGET_LINK_LIBRARIES(durin-plugin PRIVATE hdf5-static)
@@ -0,0 +1,43 @@
+FROM registry.access.redhat.com/ubi7/ubi
+
+ARG HDF5_TAG="hdf5_1.14.6"
+
+LABEL authors="Filip Leonarski"
+
+ARG CMAKE_VERSION=3.31.6
+ARG NODE_MAJOR=16
+
+RUN yum -y update && \
+    yum -y install \
+        gcc \
+        gcc-c++ \
+        git \
+        make \
+        tar \
+        gzip \
+        curl && \
+    yum clean all
+
+# Install a recent Node.js (NodeSource). Change NODE_MAJOR if you want another major version.
+RUN curl -fsSL https://rpm.nodesource.com/setup_${NODE_MAJOR}.x | bash - && \
+    yum -y install nodejs && \
+    yum clean all && \
+    node --version && npm --version && (corepack enable || true)
+
+RUN set -eux; \
+    arch="$(uname -m)"; \
+    case "$arch" in \
+      x86_64) cmake_arch="x86_64" ;; \
+      aarch64) cmake_arch="aarch64" ;; \
+      *) echo "Unsupported architecture: $arch"; exit 1 ;; \
+    esac; \
+    curl -L "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${cmake_arch}.tar.gz" \
+      -o /tmp/cmake.tar.gz; \
+    tar -xzf /tmp/cmake.tar.gz -C /opt; \
+    ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/cmake" /usr/local/bin/cmake; \
+    ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/ctest" /usr/local/bin/ctest; \
+    ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/cpack" /usr/local/bin/cpack; \
+    rm -f /tmp/cmake.tar.gz
+
+# Default entrypoint prints tool versions and hints.
+CMD ["/bin/bash", "-l"]
@@ -1,55 +0,0 @@
-BUILD_DIR ?= ./build
-SRC_DIR = ./src
-TEST_DIR = ./test
-INC_DIR = $(SRC_DIR)
-
-BSLZ4_SRC_DIR = ./bslz4/src
-BSLZ4_BUILD_DIR = ./bslz4/build
-BSLZ4_INC_DIR = $(BSLZ4_SRC_DIR)
-
-CC=h5cc
-CFLAGS=-DH5_USE_110_API -Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=c99 -shlib
-
-.PHONY: plugin
-plugin: $(BUILD_DIR)/durin-plugin.so
-
-.PHONY: all
-all: plugin example test_plugin
-
-.PHONY: example
-example: $(BUILD_DIR)/example
-
-.PHONY: test_plugin
-test_plugin: $(BUILD_DIR)/test_plugin
-
-$(BUILD_DIR)/test_plugin: $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90
-	mkdir -p $(BUILD_DIR)
-	gfortran -O -g -fopenmp -ldl $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90 -o $@ -J$(BUILD_DIR)
-
-$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c
-	mkdir -p $(BUILD_DIR)
-	$(CC) $(CFLAGS) -c $< -o $@
-
-$(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c
-	mkdir -p $(BSLZ4_BUILD_DIR)
-	$(CC) $(CFLAGS) -c $< -o $@
-
-$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \
-$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
-	mkdir -p $(BUILD_DIR)
-	ar rcs $@ $^
-
-$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
-$(BUILD_DIR)/bslz4.a
-	mkdir -p $(BUILD_DIR)
-	$(CC) $(CFLAGS) -shared -noshlib $^ -o $(BUILD_DIR)/durin-plugin.so
-
-$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
-$(BUILD_DIR)/bslz4.a
-	mkdir -p $(BUILD_DIR)
-	$(CC) $(CFLAGS) $^ -o $(BUILD_DIR)/example
-
-.PHONY: clean
-clean:
-	rm -r $(BUILD_DIR)
-	rm -r $(BSLZ4_BUILD_DIR)
@@ -8,14 +8,24 @@ See:
 * https://www.dectris.com/features/features-eiger-x/hdf5-and-nexus
 * https://strucbio.biologie.uni-konstanz.de/xdswiki

-## Get Durin
+## Paul Scherrer Institute fork
+This fork is maintained by Paul Scherrer Institute.
+The plugin is based on the code developed by the Diamond Light Source and modified by Global Phasing.
+Modifications from PSI side:
+* Using CMake for building the plugin
+* HDF5 is built as part of the CMake process
+* Bitshuffle/LZ4 is updated to the latest version
+* HDF5 filter is automatically registered for virtual dataset HDF5 files
+* Docker image to build the plugin on x86/RH7 is provided.
+* Generated versioned shared library, so the used version can be tracked.

-Latest release at https://github.com/DiamondLightSource/durin/releases/tag/2019v1
+## Get Durin
+Linux x86 version is automatically built on RHEL 7 and available from the Gitea release page.

 ## Usage
 In your XDS.INP add:
 ```
-LIB=[path to durin-plugin.so]
+LIB=[path to libdurin-plugin.so]
 NAME_TEMPLATE_OF_DATA_FRAMES=[data_path]/data_images_??????.h5
 ```
 XDS will instruct the plugin to load `[data_path]/data_images_master.h5` and this must be the
@@ -27,55 +37,25 @@ the master file contains an `NXdata` or `NXdetector` group with either a dataset
 series of datasets named `data_000001`, `data_000002`, etc.


-## Requirements
-* HDF5 Library (https://www.hdfgroup.org/downloads)
-
-
 ## Building
-
-### Building HDF5 library
-The HDF5 library used when building durin must have been compiled with specific switches enabled
-to allow the durin plugin to be built and used.
-
-Download the HDF5 source code (https://www.hdfgroup.org/downloads/hdf5/source-code) and extract
-to any directory (referred to as `/hdf5_dir`), and run the following commands.
+Requires CMake version 3.19 or later + GCC compiler. There is no need to build HDF5 separately.
+To build:
 ```
-cd /hdf5_dir
 mkdir build
 cd build
-export CFLAGS=-fPIC
-../configure --enable-threadsafe --enable-deprecated-symbols --enable-hl --enable-unsupported
-make
-make check
-make install
+cmake ..
+make -j
 ```
-The hdf5 tools and libraries should now be located in `/hdf5_dir/build/hdf5`
-
-For reference, the plugin requires the thread-safe switch and the optimised chunk read function.
-The chunk read function may be defined in the high level library instead of the regular library,
-depending on the exact HDF5 version downloaded (hence the --enable-deprecated-symbols _and_ --enable-hl).
-The unsupported flag enables building with both threadsafe and high-level enabled.
-
-
-### Building durin plugin
-The plugin makefile will use the "h5cc" compiler wrapper, provided by the HDF5 library, which
-must be on your PATH.
-Download or clone the plugin source code (https://github.com/DiamondLightSource/durin)
-into any directory (referred to as `/durin_dir`) and run the following commands.
-```
-cd /durin_dir
-PATH=/hdf5_dir/build/hdf5/bin:$PATH
-make
-```
-The plugin is located at `/durin_dir/build/durin-plugin.so` and should be added to the
-XDS.INP file as `LIB=/durin_dir/build/durin-plugin.so`
-

+The plugin is located at `build/libdurin-plugin.so` and should be added to the
+XDS.INP file as `LIB=<CURRENT_DIRECTORY>/build/libdurin-plugin.so`.
+Alternatively, versioned copy is also provided, e.g. `libdurin-plugin.so.1.0.0`, allowing to track 
+the current version of the plugin.

 ## Example XDS.INP
 ```
 DETECTOR=PILATUS MINIMUM_VALID_PIXEL_VALUE=0 OVERLOAD=4096
-LIB=/opt/durin/build/durin-plugin.so
+LIB=/opt/durin/build/libdurin-plugin.so
 SENSOR_THICKNESS= 0.450
 !SENSOR_MATERIAL / THICKNESS Si 0.450
 !SILICON= 3.953379
@@ -95,6 +75,3 @@ TRUSTED_REGION= 0.0 1.41
 DATA_RANGE= 1 600
 JOB=XYCORR INIT COLSPOT IDXREF DEFPIX INTEGRATE CORRECT
 ```
-
-N.B. the master file is needed, not the .nxs one which follows the
-standard.
@@ -14,23 +14,22 @@
 #include "bitshuffle_internals.h"
 #include "lz4.h"

+#ifdef ZSTD_SUPPORT
+#include "zstd.h"
+#endif
+
 #include <stdio.h>
 #include <string.h>


-
-
-#define BSHUF_LZ4_DECOMPRESS_FAST
-
-
-
+// Macros.
 #define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) {                      \
    free(buf); return count - 1000; }


 /* Bitshuffle and compress a single block. */
 int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
-        const size_t size, const size_t elem_size) {
+        const size_t size, const size_t elem_size, const int option) {

    int64_t nbytes, count;
    void *tmp_buf_bshuf;
@@ -42,7 +41,8 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
    tmp_buf_bshuf = malloc(size * elem_size);
    if (tmp_buf_bshuf == NULL) return -1;

-    tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
+    int dst_capacity = LZ4_compressBound(size * elem_size);
+    tmp_buf_lz4 = malloc(dst_capacity);
    if (tmp_buf_lz4 == NULL){
        free(tmp_buf_bshuf);
        return -1;
@@ -58,7 +58,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
        free(tmp_buf_bshuf);
        return count;
    }
-    nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
+    nbytes = LZ4_compress_default((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size, dst_capacity);
    free(tmp_buf_bshuf);
    CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);

@@ -76,7 +76,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \

 /* Decompress and bitunshuffle a single block. */
 int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
-        const size_t size, const size_t elem_size) {
+        const size_t size, const size_t elem_size, const int option) {

    int64_t nbytes, count;
    void *out, *tmp_buf;
@@ -96,14 +96,6 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
    tmp_buf = malloc(size * elem_size);
    if (tmp_buf == NULL) return -1;

-#ifdef BSHUF_LZ4_DECOMPRESS_FAST
-    nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
-    CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
-    if (nbytes != nbytes_from_header) {
-        free(tmp_buf);
-        return -91;
-    }
-#else
    nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
                                 size * elem_size);
    CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
@@ -112,7 +104,7 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
        return -91;
    }
    nbytes = nbytes_from_header;
-#endif
+
    count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
    CHECK_ERR_FREE(count, tmp_buf);
    nbytes += 4;
@@ -121,6 +113,92 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
    return nbytes;
 }

+#ifdef ZSTD_SUPPORT
+/* Bitshuffle and compress a single block. */
+int64_t bshuf_compress_zstd_block(ioc_chain *C_ptr, \
+        const size_t size, const size_t elem_size, const int comp_lvl) {
+
+    int64_t nbytes, count;
+    void *tmp_buf_bshuf;
+    void *tmp_buf_zstd;
+    size_t this_iter;
+    const void *in;
+    void *out;
+
+    tmp_buf_bshuf = malloc(size * elem_size);
+    if (tmp_buf_bshuf == NULL) return -1;
+
+    size_t tmp_buf_zstd_size = ZSTD_compressBound(size * elem_size);
+    tmp_buf_zstd = malloc(tmp_buf_zstd_size);
+    if (tmp_buf_zstd == NULL){
+        free(tmp_buf_bshuf);
+        return -1;
+    }
+
+    in = ioc_get_in(C_ptr, &this_iter);
+    ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
+
+    count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
+    if (count < 0) {
+        free(tmp_buf_zstd);
+        free(tmp_buf_bshuf);
+        return count;
+    }
+    nbytes = ZSTD_compress(tmp_buf_zstd, tmp_buf_zstd_size, (const void*)tmp_buf_bshuf,  size * elem_size, comp_lvl);
+    free(tmp_buf_bshuf);
+    CHECK_ERR_FREE_LZ(nbytes, tmp_buf_zstd);
+
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
+
+    bshuf_write_uint32_BE(out, nbytes);
+    memcpy((char *) out + 4, tmp_buf_zstd, nbytes);
+
+    free(tmp_buf_zstd);
+
+    return nbytes + 4;
+}
+
+
+/* Decompress and bitunshuffle a single block. */
+int64_t bshuf_decompress_zstd_block(ioc_chain *C_ptr,
+        const size_t size, const size_t elem_size, const int option) {
+
+    int64_t nbytes, count;
+    void *out, *tmp_buf;
+    const void *in;
+    size_t this_iter;
+    int32_t nbytes_from_header;
+
+    in = ioc_get_in(C_ptr, &this_iter);
+    nbytes_from_header = bshuf_read_uint32_BE(in);
+    ioc_set_next_in(C_ptr, &this_iter,
+            (void*) ((char*) in + nbytes_from_header + 4));
+
+    out = ioc_get_out(C_ptr, &this_iter);
+    ioc_set_next_out(C_ptr, &this_iter,
+            (void *) ((char *) out + size * elem_size));
+
+    tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    nbytes = ZSTD_decompress(tmp_buf, size * elem_size, (void *)((char *) in + 4), nbytes_from_header);
+    CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
+    if (nbytes != size * elem_size) {
+        free(tmp_buf);
+        return -91;
+    }
+
+    nbytes = nbytes_from_header;
+    count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    nbytes += 4;
+
+    free(tmp_buf);
+    return nbytes;
+}
+#endif // ZSTD_SUPPORT
+

 /* ---- Public functions ----
 *
@@ -138,13 +216,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
    }
    if (block_size % BSHUF_BLOCKED_MULT) return -81;

-
-
+    // Note that each block gets a 4 byte header.
+    // Size of full blocks.
    bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
-
+    // Size of partial blocks, if any.
    leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
    if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
-
+    // Size of uncompressed data not fitting into any blocks.
    bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
    return bound;
 }
@@ -153,13 +231,49 @@ size_t bshuf_compress_lz4_bound(const size_t size,
 int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {
    return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
-            elem_size, block_size);
+            elem_size, block_size, 0/*option*/);
 }


 int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {
    return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
-            elem_size, block_size);
+            elem_size, block_size, 0/*option*/);
 }

+#ifdef ZSTD_SUPPORT
+size_t bshuf_compress_zstd_bound(const size_t size,
+        const size_t elem_size, size_t block_size) {
+
+    size_t bound, leftover;
+
+    if (block_size == 0) {
+        block_size = bshuf_default_block_size(elem_size);
+    }
+    if (block_size % BSHUF_BLOCKED_MULT) return -81;
+
+    // Note that each block gets a 4 byte header.
+    // Size of full blocks.
+    bound = (ZSTD_compressBound(block_size * elem_size) + 4) * (size / block_size);
+    // Size of partial blocks, if any.
+    leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
+    if (leftover) bound += ZSTD_compressBound(leftover * elem_size) + 4;
+    // Size of uncompressed data not fitting into any blocks.
+    bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
+    return bound;
+}
+
+
+int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size, const int comp_lvl) {
+    return bshuf_blocked_wrap_fun(&bshuf_compress_zstd_block, in, out, size,
+            elem_size, block_size, comp_lvl);
+}
+
+
+int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size) {
+    return bshuf_blocked_wrap_fun(&bshuf_decompress_zstd_block, in, out, size,
+            elem_size, block_size, 0/*option*/);
+}
+#endif // ZSTD_SUPPORT
@@ -35,6 +35,10 @@
 extern "C" {
 #endif

+/*
+ * ---- LZ4 Interface ----
+ */
+
 /* ---- bshuf_compress_lz4_bound ----
 *
 * Bound on size of data compressed with *bshuf_compress_lz4*.
@@ -94,11 +98,6 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
 * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
 * must patch the parameters used to compress the data.
 *
- * NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
- * LZ4_decompress_fast from LZ4, which does not protect against maliciously
- * formed datasets. By modifying the compressed data, this function could be
- * coerced into leaving the boundaries of the input buffer.
- *
 * Parameters
 * ----------
 *  in : input buffer
@@ -116,8 +115,91 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
 int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size);

+/*
+ * ---- ZSTD Interface ----
+ */
+
+#ifdef ZSTD_SUPPORT
+/* ---- bshuf_compress_zstd_bound ----
+ *
+ * Bound on size of data compressed with *bshuf_compress_zstd*.
+ *
+ * Parameters
+ * ----------
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  Bound on compressed data size.
+ *
+ */
+size_t bshuf_compress_zstd_bound(const size_t size,
+        const size_t elem_size, size_t block_size);
+
+/* ---- bshuf_compress_zstd ----
+ *
+ * Bitshuffled and compress the data using zstd.
+ *
+ * Transpose within elements, in blocks of data of *block_size* elements then
+ * compress the blocks using ZSTD.  In the output buffer, each block is prefixed
+ * by a 4 byte integer giving the compressed size of that block.
+ *
+ * Output buffer must be large enough to hold the compressed data.  This could
+ * be in principle substantially larger than the input buffer.  Use the routine
+ * *bshuf_compress_zstd_bound* to get an upper limit.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer, must be of size * elem_size bytes
+ *  out : output buffer, must be large enough to hold data.
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *  comp_lvl : compression level applied
+ *
+ * Returns
+ * -------
+ *  number of bytes used in output buffer, negative error-code if failed.
+ *
+ */
+int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size, const size_t
+        elem_size, size_t block_size, const int comp_lvl);
+
+
+/* ---- bshuf_decompress_zstd ----
+ *
+ * Undo compression and bitshuffling.
+ *
+ * Decompress data then un-bitshuffle it in blocks of *block_size* elements.
+ *
+ * To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
+ * must patch the parameters used to compress the data.
+ *
+ * Parameters
+ * ----------
+ *  in : input buffer
+ *  out : output buffer, must be of size * elem_size bytes
+ *  size : number of elements in input
+ *  elem_size : element size of typed data
+ *  block_size : Process in blocks of this many elements. Pass 0 to
+ *  select automatically (recommended).
+ *
+ * Returns
+ * -------
+ *  number of bytes consumed in *input* buffer, negative error-code if failed.
+ *
+ */
+int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
+        const size_t elem_size, size_t block_size);
+
+#endif // ZSTD_SUPPORT
+
 #ifdef __cplusplus
-}
+} // extern "C"
 #endif

-#endif
+#endif  // BITSHUFFLE_H
@@ -16,30 +16,55 @@
 #include <string.h>


+#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
+#define USEAVX512
+#endif
+
 #if defined(__AVX2__) && defined (__SSE2__)
 #define USEAVX2
 #endif

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(NO_WARN_X86_INTRINSICS)
 #define USESSE2
 #endif

+#if defined(__ARM_NEON__) || (__ARM_NEON)
+#ifdef __aarch64__
+#define USEARMNEON
+#endif
+#endif

-
+// Conditional includes for SSE2 and AVX2.
 #ifdef USEAVX2
 #include <immintrin.h>
 #elif defined USESSE2
 #include <emmintrin.h>
+#elif defined USEARMNEON
+#include <arm_neon.h>
 #endif

+#if defined(_OPENMP) && defined(_MSC_VER)
+typedef int64_t omp_size_t;
+#else
+typedef size_t omp_size_t;
+#endif

-
+// Macros.
 #define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))


 /* ---- Functions indicating compile time instruction set. ---- */

+int bshuf_using_NEON(void) {
+#ifdef USEARMNEON
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
 int bshuf_using_SSE2(void) {
 #ifdef USESSE2
    return 1;
@@ -58,6 +83,14 @@ int bshuf_using_AVX2(void) {
 }


+int bshuf_using_AVX512(void) {
+#ifdef USEAVX512
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 /* ---- Worker code not requiring special instruction sets. ----
 *
 * The following code does not use any x86 specific vectorized instructions
@@ -131,8 +164,8 @@ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t
    CHECK_MULT_EIGHT(start);

    if (size > start) {
-
-
+        // ii loop separated into 2 loops so the compiler can unroll
+        // the inner one.
        for (ii = start; ii + 7 < size; ii += 8) {
            for (jj = 0; jj < elem_size; jj++) {
                for (kk = 0; kk < 8; kk++) {
@@ -348,10 +381,516 @@ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size
 }


+/* ---- Worker code that uses Arm NEON ----
+ *
+ * The following code makes use of the Arm NEON instruction set.
+ * NEON technology is the implementation of the ARM Advanced Single
+ * Instruction Multiple Data (SIMD) extension.
+ * The NEON unit is the component of the processor that executes SIMD instructions.
+ * It is also called the NEON Media Processing Engine (MPE).
+ *
+ */
+
+#ifdef USEARMNEON
+
+/* Transpose bytes within elements for 16 bit elements. */
+int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char *in_b = (const char*) in;
+    char *out_b = (char*) out;
+    int8x16_t a0, b0, a1, b1;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = vld1q_s8(in_b + 2*ii + 0*16);
+        b0 = vld1q_s8(in_b + 2*ii + 1*16);
+
+        a1 = vzip1q_s8(a0, b0);
+        b1 = vzip2q_s8(a0, b0);
+
+        a0 = vzip1q_s8(a1, b1);
+        b0 = vzip2q_s8(a1, b1);
+
+        a1 = vzip1q_s8(a0, b0);
+        b1 = vzip2q_s8(a0, b0);
+
+        a0 = vzip1q_s8(a1, b1);
+        b0 = vzip2q_s8(a1, b1);
+
+        vst1q_s8(out_b + 0*size + ii, a0);
+        vst1q_s8(out_b + 1*size + ii, b0);
+    }
+
+    return bshuf_trans_byte_elem_remainder(in, out, size, 2,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements for 32 bit elements. */
+int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char *in_b;
+    char *out_b;
+    in_b = (const char*) in;
+    out_b = (char*) out;
+    int8x16_t a0, b0, c0, d0, a1, b1, c1, d1;
+    int64x2_t a2, b2, c2, d2;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = vld1q_s8(in_b + 4*ii + 0*16);
+        b0 = vld1q_s8(in_b + 4*ii + 1*16);
+        c0 = vld1q_s8(in_b + 4*ii + 2*16);
+        d0 = vld1q_s8(in_b + 4*ii + 3*16);
+
+        a1 = vzip1q_s8(a0, b0);
+        b1 = vzip2q_s8(a0, b0);
+        c1 = vzip1q_s8(c0, d0);
+        d1 = vzip2q_s8(c0, d0);
+
+        a0 = vzip1q_s8(a1, b1);
+        b0 = vzip2q_s8(a1, b1);
+        c0 = vzip1q_s8(c1, d1);
+        d0 = vzip2q_s8(c1, d1);
+
+        a1 = vzip1q_s8(a0, b0);
+        b1 = vzip2q_s8(a0, b0);
+        c1 = vzip1q_s8(c0, d0);
+        d1 = vzip2q_s8(c0, d0);
+
+        a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
+        b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
+        c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
+        d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
+
+        vst1q_s64((int64_t *) (out_b + 0*size + ii), a2);
+        vst1q_s64((int64_t *) (out_b + 1*size + ii), b2);
+        vst1q_s64((int64_t *) (out_b + 2*size + ii), c2);
+        vst1q_s64((int64_t *) (out_b + 3*size + ii), d2);
+    }
+
+    return bshuf_trans_byte_elem_remainder(in, out, size, 4,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements for 64 bit elements. */
+int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
+
+    size_t ii;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
+    int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
+
+    for (ii=0; ii + 15 < size; ii += 16) {
+        a0 = vld1q_s8(in_b + 8*ii + 0*16);
+        b0 = vld1q_s8(in_b + 8*ii + 1*16);
+        c0 = vld1q_s8(in_b + 8*ii + 2*16);
+        d0 = vld1q_s8(in_b + 8*ii + 3*16);
+        e0 = vld1q_s8(in_b + 8*ii + 4*16);
+        f0 = vld1q_s8(in_b + 8*ii + 5*16);
+        g0 = vld1q_s8(in_b + 8*ii + 6*16);
+        h0 = vld1q_s8(in_b + 8*ii + 7*16);
+
+        a1 = vzip1q_s8 (a0, b0);
+        b1 = vzip2q_s8 (a0, b0);
+        c1 = vzip1q_s8 (c0, d0);
+        d1 = vzip2q_s8 (c0, d0);
+        e1 = vzip1q_s8 (e0, f0);
+        f1 = vzip2q_s8 (e0, f0);
+        g1 = vzip1q_s8 (g0, h0);
+        h1 = vzip2q_s8 (g0, h0);
+
+        a0 = vzip1q_s8 (a1, b1);
+        b0 = vzip2q_s8 (a1, b1);
+        c0 = vzip1q_s8 (c1, d1);
+        d0 = vzip2q_s8 (c1, d1);
+        e0 = vzip1q_s8 (e1, f1);
+        f0 = vzip2q_s8 (e1, f1);
+        g0 = vzip1q_s8 (g1, h1);
+        h0 = vzip2q_s8 (g1, h1);
+
+        a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
+        b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
+        c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
+        d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
+        e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
+        f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
+        g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
+        h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
+
+        a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
+        b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
+        c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
+        d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
+        e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
+        f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
+        g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
+        h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
+
+        vst1q_s8(out_b + 0*size + ii, a0);
+        vst1q_s8(out_b + 1*size + ii, b0);
+        vst1q_s8(out_b + 2*size + ii, c0);
+        vst1q_s8(out_b + 3*size + ii, d0);
+        vst1q_s8(out_b + 4*size + ii, e0);
+        vst1q_s8(out_b + 5*size + ii, f0);
+        vst1q_s8(out_b + 6*size + ii, g0);
+        vst1q_s8(out_b + 7*size + ii, h0);
+    }
+
+    return bshuf_trans_byte_elem_remainder(in, out, size, 8,
+            size - size % 16);
+}
+
+
+/* Transpose bytes within elements using best NEON algorithm available. */
+int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    // Trivial cases: power of 2 bytes.
+    switch (elem_size) {
+        case 1:
+            count = bshuf_copy(in, out, size, elem_size);
+            return count;
+        case 2:
+            count = bshuf_trans_byte_elem_NEON_16(in, out, size);
+            return count;
+        case 4:
+            count = bshuf_trans_byte_elem_NEON_32(in, out, size);
+            return count;
+        case 8:
+            count = bshuf_trans_byte_elem_NEON_64(in, out, size);
+            return count;
+    }
+
+    // Worst case: odd number of bytes. Turns out that this is faster for
+    // (odd * 2) byte elements as well (hence % 4).
+    if (elem_size % 4) {
+        count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
+        return count;
+    }
+
+    // Multiple of power of 2: transpose hierarchically.
+    {
+        size_t nchunk_elem;
+        void* tmp_buf = malloc(size * elem_size);
+        if (tmp_buf == NULL) return -1;
+
+        if ((elem_size % 8) == 0) {
+            nchunk_elem = elem_size / 8;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
+            count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
+        } else if ((elem_size % 4) == 0) {
+            nchunk_elem = elem_size / 4;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
+            count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
+        } else {
+            // Not used since scalar algorithm is faster.
+            nchunk_elem = elem_size / 2;
+            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
+            count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf,
+                    size * nchunk_elem);
+            bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
+        }
+
+        free(tmp_buf);
+        return count;
+    }
+}
+
+
+/* Creates a mask made up of the most significant
+ * bit of each byte of 'input'
+ */
+int32_t move_byte_mask_neon(uint8x16_t input) {
+
+    return (  ((input[0] & 0x80) >> 7)          | (((input[1] & 0x80) >> 7) << 1)   | (((input[2] & 0x80) >> 7) << 2)   | (((input[3] & 0x80) >> 7) << 3)
+            | (((input[4] & 0x80) >> 7) << 4)   | (((input[5] & 0x80) >> 7) << 5)   | (((input[6] & 0x80) >> 7) << 6)   | (((input[7] & 0x80) >> 7) << 7)
+            | (((input[8] & 0x80) >> 7) << 8)   | (((input[9] & 0x80) >> 7) << 9)   | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
+            | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
+           );
+}
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    uint16_t* out_ui16;
+
+    int64_t count;
+
+    size_t nbyte = elem_size * size;
+
+    CHECK_MULT_EIGHT(nbyte);
+
+    int16x8_t xmm;
+    int32_t bt;
+
+    for (ii = 0; ii + 15 < nbyte; ii += 16) {
+        xmm = vld1q_s16((int16_t *) (in_b + ii));
+        for (kk = 0; kk < 8; kk++) {
+            bt = move_byte_mask_neon((uint8x16_t) xmm);
+            xmm = vshlq_n_s16(xmm, 1);
+            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_ui16 = bt;
+        }
+    }
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 16);
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+
+/* For data organized into a row for each bit (8 * elem_size rows), transpose
+ * the bytes. */
+int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, jj;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    CHECK_MULT_EIGHT(size);
+
+    size_t nrows = 8 * elem_size;
+    size_t nbyte_row = size / 8;
+
+    int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
+    int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
+    int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
+
+    for (ii = 0; ii + 7 < nrows; ii += 8) {
+        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
+            a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj);
+            b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj);
+            c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj);
+            d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj);
+            e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj);
+            f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj);
+            g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj);
+            h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj);
+
+            a1 = vzip1q_s8(a0, b0);
+            b1 = vzip1q_s8(c0, d0);
+            c1 = vzip1q_s8(e0, f0);
+            d1 = vzip1q_s8(g0, h0);
+            e1 = vzip2q_s8(a0, b0);
+            f1 = vzip2q_s8(c0, d0);
+            g1 = vzip2q_s8(e0, f0);
+            h1 = vzip2q_s8(g0, h0);
+
+            a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
+            b0=  (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
+            c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
+            d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
+            e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
+            f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
+            g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
+            h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
+
+            a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
+            b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
+            c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
+            d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
+            e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
+            f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
+            g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
+            h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
+
+            as = (int64x1_t *) &a1;
+            bs = (int64x1_t *) &b1;
+            cs = (int64x1_t *) &c1;
+            ds = (int64x1_t *) &d1;
+            es = (int64x1_t *) &e1;
+            fs = (int64x1_t *) &f1;
+            gs = (int64x1_t *) &g1;
+            hs = (int64x1_t *) &h1;
+
+            vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as);
+            vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs);
+            vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs);
+            vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds);
+            vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es);
+            vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs);
+            vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs);
+            vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1));
+            vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs);
+            vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1));
+        }
+        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
+            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
+            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    uint16_t* out_ui16 = (uint16_t*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    int16x8_t xmm;
+    int32_t bt;
+
+    if (elem_size % 2) {
+        bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
+    } else {
+        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                ii += 8 * elem_size) {
+            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
+                xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = move_byte_mask_neon((uint8x16_t) xmm);
+                    xmm = vshlq_n_s16(xmm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    out_ui16[ind / 2] = bt;
+                }
+            }
+        }
+    }
+    return size * elem_size;
+}
+
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+}
+
+#else // #ifdef USEARMNEON
+
+int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
+    return -13;
+}
+
+
+int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
+    return -13;
+}
+
+
+int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -13;
+}
+
+
+#endif
+
+
+
+
+
 /* ---- Worker code that uses SSE2 ----
 *
 * The following code makes use of the SSE2 instruction set and specialized
- * 16 byte registers. The SSE2 instructions are present on modern x86
+ * 16 byte registers. The SSE2 instructions are present on modern x86 
 * processors. The first Intel processor microarchitecture supporting SSE2 was
 * Pentium 4 (2000).
 *
@@ -512,7 +1051,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,

    int64_t count;

-
+    // Trivial cases: power of 2 bytes.
    switch (elem_size) {
        case 1:
            count = bshuf_copy(in, out, size, elem_size);
@@ -528,14 +1067,14 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
            return count;
    }

-
-
+    // Worst case: odd number of bytes. Turns out that this is faster for
+    // (odd * 2) byte elements as well (hence % 4).
    if (elem_size % 4) {
        count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
        return count;
    }

-
+    // Multiple of power of 2: transpose hierarchically.
    {
        size_t nchunk_elem;
        void* tmp_buf = malloc(size * elem_size);
@@ -554,7 +1093,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
                    size * nchunk_elem);
            bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
        } else {
-
+            // Not used since scalar algorithm is faster.
            nchunk_elem = elem_size / 2;
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
            count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
@@ -687,8 +1226,8 @@ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size
            g1 = _mm_unpacklo_epi32(g0, h0);
            h1 = _mm_unpackhi_epi32(g0, h0);

-
-
+            // We don't have a storeh instruction for integers, so interpret
+            // as a float. Have a storel (_mm_storel_epi64).
            as = (__m128 *) &a1;
            bs = (__m128 *) &b1;
            cs = (__m128 *) &c1;
@@ -737,8 +1276,8 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t

    CHECK_MULT_EIGHT(size);

-
-
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
    const char* in_b = (const char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;

@@ -788,7 +1327,7 @@ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
    return count;
 }

-#else
+#else // #ifdef USESSE2


 int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
@@ -842,7 +1381,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
 }


-#endif
+#endif // #ifdef USESSE2


 /* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
@@ -857,7 +1396,6 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
 */

 #ifdef USEAVX2
-
 /* Transpose bits within bytes. */
 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
         const size_t elem_size) {
@@ -1014,8 +1552,8 @@ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t

    CHECK_MULT_EIGHT(size);

-
-
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
    const char* in_b = (const char*) in;
    char* out_b = (char*) out;

@@ -1065,7 +1603,7 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
 }


-#else
+#else // #ifdef USEAVX2

 int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
         const size_t elem_size) {
@@ -1096,19 +1634,179 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
    return -12;
 }

-#endif
+#endif // #ifdef USEAVX2

+#ifdef USEAVX512
+
+/* Transpose bits within bytes. */
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    size_t ii, kk;
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+    size_t nbyte = elem_size * size;
+    int64_t count;
+
+    int64_t* out_i64;
+    __m512i zmm;
+    __mmask64 bt;
+    if (nbyte >= 64) {
+        const __m512i mask = _mm512_set1_epi8(0);
+
+       for (ii = 0; ii + 63 < nbyte; ii += 64) {
+            zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
+            for (kk = 0; kk < 8; kk++) {
+                bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                zmm = _mm512_slli_epi16(zmm, 1);
+                out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+                *out_i64 = (int64_t)bt;
+            }
+        }
+    }
+
+    __m256i ymm;
+    int32_t bt32;
+    int32_t* out_i32;
+    size_t start = nbyte - nbyte % 64;
+    for (ii = start; ii + 31 < nbyte; ii += 32) {
+        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
+        for (kk = 0; kk < 8; kk++) {
+            bt32 = _mm256_movemask_epi8(ymm);
+            ymm = _mm256_slli_epi16(ymm, 1);
+            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
+            *out_i32 = bt32;
+        }
+    }
+
+
+    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
+            nbyte - nbyte % 64 % 32);
+
+    return count;
+}
+
+
+/* Transpose bits within elements. */
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+
+    return count;
+
+}
+
+/* Shuffle bits within the bytes of eight element blocks. */
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    CHECK_MULT_EIGHT(size);
+
+    // With a bit of care, this could be written such that such that it is
+    // in_buf = out_buf safe.
+    const char* in_b = (const char*) in;
+    char* out_b = (char*) out;
+
+    size_t ii, jj, kk;
+    size_t nbyte = elem_size * size;
+
+    __m512i zmm;
+    __mmask64 bt;
+
+    if (elem_size % 8) {
+        return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
+    } else {
+        const __m512i mask = _mm512_set1_epi8(0);
+        for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
+            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
+                    ii += 8 * elem_size) {
+                zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
+                for (kk = 0; kk < 8; kk++) {
+                    bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
+                    zmm = _mm512_slli_epi16(zmm, 1);
+                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
+                    * (int64_t *) &out_b[ind] = bt;
+                }
+            }
+        }
+
+    }
+    return size * elem_size;
+}
+
+/* Untranspose bits within elements. */
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    int64_t count;
+
+    CHECK_MULT_EIGHT(size);
+
+    void* tmp_buf = malloc(size * elem_size);
+    if (tmp_buf == NULL) return -1;
+
+    count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
+    CHECK_ERR_FREE(count, tmp_buf);
+    count =  bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
+
+    free(tmp_buf);
+    return count;
+}
+
+#else // #ifdef USEAVX512
+
+int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+
+    return -14;
+}
+
+int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+
+}
+
+int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
+         const size_t elem_size) {
+    return -14;
+}
+
+#endif

 /* ---- Drivers selecting best instruction set at compile time. ---- */

-int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
+int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size, 
        const size_t elem_size) {

    int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
    count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
    count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
+#elif defined(USEARMNEON)
+    count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size);
 #else
    count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
 #endif
@@ -1116,14 +1814,18 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
 }


-int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
+int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size, 
        const size_t elem_size) {

    int64_t count;
-#ifdef USEAVX2
+#ifdef USEAVX512
+    count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
+#elif defined USEAVX2
    count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
 #elif defined(USESSE2)
    count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
+#elif defined(USEARMNEON)
+    count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size);
 #else
    count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
 #endif
@@ -1136,9 +1838,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
 /* Wrap a function for processing a single block to process an entire buffer in
 * parallel. */
 int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
-        const size_t size, const size_t elem_size, size_t block_size) {
+        const size_t size, const size_t elem_size, size_t block_size, const int option) {

-    size_t ii;
+    omp_size_t ii = 0;
    int64_t err = 0;
    int64_t count, cum_count=0;
    size_t last_block_size;
@@ -1161,8 +1863,8 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
    #pragma omp parallel for schedule(dynamic, 1) \
            private(count) reduction(+ : cum_count)
 #endif
-    for (ii = 0; ii < size / block_size; ii ++) {
-        count = fun(&C, block_size, elem_size);
+    for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) {
+        count = fun(&C, block_size, elem_size, option);
        if (count < 0) err = count;
        cum_count += count;
    }
@@ -1170,7 +1872,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
    last_block_size = size % block_size;
    last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
    if (last_block_size) {
-        count = fun(&C, last_block_size, elem_size);
+        count = fun(&C, last_block_size, elem_size, option);
        if (count < 0) err = count;
        cum_count += count;
    }
@@ -1178,6 +1880,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
    if (err < 0) return err;

    leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
+    //this_iter;
    last_in = (char *) ioc_get_in(&C, &this_iter);
    ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
    last_out = (char *) ioc_get_out(&C, &this_iter);
@@ -1193,7 +1896,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,

 /* Bitshuffle a single block. */
 int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
-        const size_t size, const size_t elem_size) {
+        const size_t size, const size_t elem_size, const int option) {

    size_t this_iter;
    const void *in;
@@ -1201,7 +1904,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
    int64_t count;


-
+    
    in = ioc_get_in(C_ptr, &this_iter);
    ioc_set_next_in(C_ptr, &this_iter,
            (void*) ((char*) in + size * elem_size));
@@ -1216,7 +1919,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \

 /* Bitunshuffle a single block. */
 int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
-        const size_t size, const size_t elem_size) {
+        const size_t size, const size_t elem_size, const int option) {


    size_t this_iter;
@@ -1296,11 +1999,11 @@ uint32_t bshuf_read_uint32_BE(const void* buf) {
 */

 size_t bshuf_default_block_size(const size_t elem_size) {
-
-
+    // This function needs to be absolutely stable between versions.
+    // Otherwise encoded data will not be decodable.

    size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
-
+    // Ensure it is a required multiple.
    block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
    return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
 }
@@ -1310,7 +2013,7 @@ int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {

    return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
-            elem_size, block_size);
+            elem_size, block_size, 0/*option*/);
 }


@@ -1318,7 +2021,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size) {

    return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
-            elem_size, block_size);
+            elem_size, block_size, 0/*option*/);
 }


@@ -18,6 +18,8 @@
 *      -1    : Failed to allocate memory.
 *      -11   : Missing SSE.
 *      -12   : Missing AVX.
+ *      -13   : Missing Arm Neon.
+ *      -14   : Missing AVX512.
 *      -80   : Input size not a multiple of 8.
 *      -81   : block_size not multiple of 8.
 *      -91   : Decompression error, wrong number of bytes processed.
@@ -28,15 +30,14 @@
 #ifndef BITSHUFFLE_CORE_H
 #define BITSHUFFLE_CORE_H

-
+// We assume GNU g++ defining `__cplusplus` has stdint.h
 #if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
 #include <stdint.h>
 #else
  typedef unsigned char       uint8_t;
  typedef unsigned short      uint16_t;
-  typedef signed short        int16_t;
  typedef unsigned int        uint32_t;
-  typedef signed int          int32_t;
+  typedef   signed int        int32_t;
  typedef unsigned long long  uint64_t;
  typedef long long           int64_t;
 #endif
@@ -44,11 +45,11 @@
 #include <stdlib.h>


-
+// These are usually set in the setup.py.
 #ifndef BSHUF_VERSION_MAJOR
 #define BSHUF_VERSION_MAJOR 0
-#define BSHUF_VERSION_MINOR 3
-#define BSHUF_VERSION_POINT 4
+#define BSHUF_VERSION_MINOR 4
+#define BSHUF_VERSION_POINT 0
 #endif

 #ifdef __cplusplus
@@ -67,6 +68,18 @@ extern "C" {
 int bshuf_using_SSE2(void);


+/* ---- bshuf_using_NEON ----
+ *
+ * Whether routines where compiled with the NEON instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using NEON, 0 otherwise.
+ *
+ */
+int bshuf_using_NEON(void);
+
+
 /* ---- bshuf_using_AVX2 ----
 *
 * Whether routines where compiled with the AVX2 instruction set.
@@ -79,6 +92,18 @@ int bshuf_using_SSE2(void);
 int bshuf_using_AVX2(void);


+/* ---- bshuf_using_AVX512 ----
+ *
+ * Whether routines where compiled with the AVX512 instruction set.
+ *
+ * Returns
+ * -------
+ *  1 if using AVX512, 0 otherwise.
+ *
+ */
+int bshuf_using_AVX512(void);
+
+
 /* ---- bshuf_default_block_size ----
 *
 * The default block size as function of element size.
@@ -151,7 +176,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
        const size_t elem_size, size_t block_size);

 #ifdef __cplusplus
-}
+} // extern "C"
 #endif

-#endif
+#endif  // BITSHUFFLE_CORE_H
@@ -13,19 +13,31 @@
 #ifndef BITSHUFFLE_INTERNALS_H
 #define BITSHUFFLE_INTERNALS_H

+// We assume GNU g++ defining `__cplusplus` has stdint.h
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
+#include <stdint.h>
+#else
+  typedef unsigned char       uint8_t;
+  typedef unsigned short      uint16_t;
+  typedef unsigned int        uint32_t;
+  typedef   signed int        int32_t;
+  typedef unsigned long long  uint64_t;
+  typedef long long           int64_t;
+#endif

 #include <stdlib.h>
 #include "iochain.h"


+// Constants.
 #ifndef BSHUF_MIN_RECOMMEND_BLOCK
 #define BSHUF_MIN_RECOMMEND_BLOCK 128
-#define BSHUF_BLOCKED_MULT 8
+#define BSHUF_BLOCKED_MULT 8    // Block sizes must be multiple of this.
 #define BSHUF_TARGET_BLOCK_SIZE_B 8192
 #endif


-
+// Macros.
 #define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }


@@ -49,15 +61,15 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,

 /* Function definition for worker functions that process a single block. */
 typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
-        const size_t size, const size_t elem_size);
+        const size_t size, const size_t elem_size, const int option);

 /* Wrap a function for processing a single block to process an entire buffer in
 * parallel. */
 int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
-        const size_t size, const size_t elem_size, size_t block_size);
+        const size_t size, const size_t elem_size, size_t block_size, const int option);

 #ifdef __cplusplus
-}
+} // extern "C"
 #endif

-#endif
+#endif  // BITSHUFFLE_INTERNALS_H
@@ -0,0 +1,260 @@
+/*
+ * Bitshuffle HDF5 filter
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ */
+
+#include "bitshuffle.h"
+#include "bshuf_h5filter.h"
+
+
+#define PUSH_ERR(func, minor, str)                                      \
+    H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
+
+
+// Prototypes from bitshuffle.c
+void bshuf_write_uint64_BE(void* buf, uint64_t num);
+uint64_t bshuf_read_uint64_BE(void* buf);
+void bshuf_write_uint32_BE(void* buf, uint32_t num);
+uint32_t bshuf_read_uint32_BE(const void* buf);
+
+
+// Only called on compression, not on reverse.
+herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
+
+    herr_t r;
+    size_t ii;
+
+    unsigned int elem_size;
+
+    unsigned int flags;
+    size_t nelements = 8;
+    size_t nelem_max = 11;
+    unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
+    unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
+    char msg[80];
+
+    r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
+            tmp_values, 0, NULL, NULL);
+    if(r<0) return -1;
+
+    // First 3 slots reserved. Move any passed options to higher addresses.
+    for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
+        values[ii + 3] = tmp_values[ii];
+    }
+
+    nelements = 3 + nelements;
+
+    values[0] = BSHUF_VERSION_MAJOR;
+    values[1] = BSHUF_VERSION_MINOR;
+
+    elem_size = H5Tget_size(type);
+    if(elem_size <= 0) {
+        PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, 
+                "Invalid element size.");
+        return -1;
+    }
+
+    values[2] = elem_size;
+
+    // Validate user supplied arguments.
+    if (nelements > 3) {
+        if (values[3] % 8 || values[3] < 0) {
+            sprintf(msg, "Error in bitshuffle. Invalid block size: %d.",
+                    values[3]);
+            PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
+            return -1;
+        }
+    }
+    if (nelements > 4) {
+        switch (values[4]) {
+            case 0:
+                break;
+            case BSHUF_H5_COMPRESS_LZ4:
+                break;
+            #ifdef ZSTD_SUPPORT
+            case BSHUF_H5_COMPRESS_ZSTD:
+                break;
+            #endif
+            default:
+                PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, 
+                         "Invalid bitshuffle compression.");
+        }
+    }
+
+    r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
+    if(r<0) return -1;
+
+    return 1;
+}
+
+
+size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
+           const unsigned int cd_values[], size_t nbytes,
+           size_t *buf_size, void **buf) {
+
+    size_t size, elem_size;
+    int err = -1;
+    char msg[80];
+    size_t block_size = 0;
+    size_t buf_size_out, nbytes_uncomp, nbytes_out;
+    char* in_buf = *buf;
+    void *out_buf;
+
+    if (cd_nelmts < 3) {
+        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, 
+                "Not enough parameters.");
+        return 0;
+    }
+    elem_size = cd_values[2];
+#ifdef ZSTD_SUPPORT
+    const int comp_lvl = cd_values[5]; 
+#endif
+
+    // User specified block size.
+    if (cd_nelmts > 3) block_size = cd_values[3];
+
+    if (block_size == 0) block_size = bshuf_default_block_size(elem_size);
+    
+#ifndef ZSTD_SUPPORT
+    if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
+        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, 
+                "ZSTD compression filter chosen but ZSTD support not installed.");
+        return 0;
+    }
+#endif
+
+    // Compression in addition to bitshuffle.
+    if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
+        if (flags & H5Z_FLAG_REVERSE) {
+            // First eight bytes is the number of bytes in the output buffer,
+            // little endian.
+            nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
+            // Override the block size with the one read from the header.
+            block_size = bshuf_read_uint32_BE((const char*) in_buf + 8) / elem_size;
+            // Skip over the header.
+            in_buf += 12;
+            buf_size_out = nbytes_uncomp;
+        } else {
+            nbytes_uncomp = nbytes;
+            // Pick which compressions library to use
+            if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
+              buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size, 
+                  elem_size, block_size) + 12;
+            }
+#ifdef ZSTD_SUPPORT
+            else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
+              buf_size_out = bshuf_compress_zstd_bound(nbytes_uncomp / elem_size, 
+                  elem_size, block_size) + 12;
+            }
+#endif
+        }
+    } else {
+        nbytes_uncomp = nbytes;
+        buf_size_out = nbytes;
+    }
+
+    // TODO, remove this restriction by memcopying the extra.
+    if (nbytes_uncomp % elem_size) {
+        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, 
+                "Non integer number of elements.");
+        return 0;
+    }
+    size = nbytes_uncomp / elem_size;
+
+    out_buf = malloc(buf_size_out);
+    if (out_buf == NULL) {
+        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, 
+                "Could not allocate output buffer.");
+        return 0;
+    }
+
+    if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
+        if (flags & H5Z_FLAG_REVERSE) {
+            // Bit unshuffle/decompress.
+            // Pick which compressions library to use
+            if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
+              err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size, block_size);
+            }
+#ifdef ZSTD_SUPPORT
+            else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
+              err = bshuf_decompress_zstd(in_buf, out_buf, size, elem_size, block_size);
+            }
+#endif
+            nbytes_out = nbytes_uncomp;
+        } else {
+            // Bit shuffle/compress.
+            // Write the header, described in
+            // http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
+            // Technically we should be using signed integers instead of
+            // unsigned ones, however for valid inputs (positive numbers) these
+            // have the same representation.
+            bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
+            bshuf_write_uint32_BE((char*) out_buf + 8, block_size * elem_size);
+            if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
+                err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
+                        elem_size, block_size); 
+            }
+#ifdef ZSTD_SUPPORT
+            else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
+                err = bshuf_compress_zstd(in_buf, (char*) out_buf + 12, size,
+                        elem_size, block_size, comp_lvl); 
+            }
+#endif
+            nbytes_out = err + 12;
+        } 
+    } else {
+            if (flags & H5Z_FLAG_REVERSE) {
+            // Bit unshuffle.
+            err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
+                    block_size); } else {
+            // Bit shuffle.
+            err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
+                    block_size); } nbytes_out = nbytes; }
+    //printf("nb_in %d, nb_uncomp %d, nb_out %d, buf_out %d, block %d\n",
+    //nbytes, nbytes_uncomp, nbytes_out, buf_size_out, block_size);
+
+    if (err < 0) {
+        sprintf(msg, "Error in bitshuffle with error code %d.", err);
+        PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
+        free(out_buf);
+        return 0;
+    } else {
+        free(*buf);
+        *buf = out_buf;
+        *buf_size = buf_size_out;
+
+        return nbytes_out;
+    }
+}
+
+
+
+H5Z_class_t bshuf_H5Filter[1] = {{
+    H5Z_CLASS_T_VERS,
+    (H5Z_filter_t)(BSHUF_H5FILTER),
+    1, 1,
+    "bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
+    NULL,
+    (H5Z_set_local_func_t)(bshuf_h5_set_local),
+    (H5Z_func_t)(bshuf_h5_filter)
+}};
+
+
+int bshuf_register_h5filter(void){
+
+    int retval;
+
+    retval = H5Zregister(bshuf_H5Filter);
+    if(retval<0){
+        PUSH_ERR("bshuf_register_h5filter",
+                 H5E_CANTREGISTER, "Can't register bitshuffle filter");
+    }
+    return retval;
+}
@@ -0,0 +1,67 @@
+/*
+ * Bitshuffle HDF5 filter
+ *
+ * This file is part of Bitshuffle
+ * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
+ * Website: http://www.github.com/kiyo-masui/bitshuffle
+ * Created: 2014
+ *
+ * See LICENSE file for details about copyright and rights to use.
+ *
+ *
+ * Header File
+ *
+ * Filter Options
+ * --------------
+ *  block_size (option slot 0) : integer (optional)
+ *      What block size to use (in elements not bytes). Default is 0,
+ *      for which bitshuffle will pick a block size with a target of 8kb.
+ *  Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
+ *      Whether to apply LZ4 compression to the data after bitshuffling.
+ *      This is much faster than applying compression as a second filter
+ *      because it is done when the small block of data is already in the
+ *      L1 cache.
+ *
+ *      For LZ4 compression, the compressed format of the data is the same as
+ *      for the normal LZ4 filter described in
+ *      http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
+ *
+ */
+
+
+#ifndef BSHUF_H5FILTER_H
+#define BSHUF_H5FILTER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define H5Z_class_t_vers 2
+#include "hdf5.h"
+
+
+#define BSHUF_H5FILTER 32008
+
+
+#define BSHUF_H5_COMPRESS_LZ4 2
+#define BSHUF_H5_COMPRESS_ZSTD 3
+
+
+extern H5Z_class_t bshuf_H5Filter[1];
+
+
+/* ---- bshuf_register_h5filter ----
+ *
+ * Register the bitshuffle HDF5 filter within the HDF5 library.
+ *
+ * Call this before using the bitshuffle HDF5 filter from C unless
+ * using dynamically loaded filters.
+ *
+ */
+int bshuf_register_h5filter(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // BSHUF_H5FILTER_H
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
@@ -81,9 +81,9 @@ void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
    C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
 #ifdef _OPENMP
    omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
-
-
-
+    // *in_pl[this_iter]* lock released at the end of the iteration to avoid being
+    // overtaken by previous threads and having *out_pl[this_iter]* corrupted.
+    // Especially worried about thread 0, iteration 0.
    omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
 #endif
 }
@@ -1,5 +1,5 @@
 /*
- * IOchain - Distribute a chain of dependant IO events amoung threads.
+ * IOchain - Distribute a chain of dependent IO events among threads.
 *
 * This file is part of Bitshuffle
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
@@ -25,7 +25,7 @@
 * Usage
 * -----
 *  - Call `ioc_init` in serial block.
- *  - Each thread should create a local variable *size_t this_iter* and
+ *  - Each thread should create a local variable *size_t this_iter* and 
 *    pass its address to all function calls. Its value will be set
 *    inside the functions and is used to identify the thread.
 *  - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
@@ -90,5 +90,5 @@ void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
 void * ioc_get_out(ioc_chain *C, size_t *this_iter);
 void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);

-#endif
+#endif  // IOCHAIN_H

@@ -4,7 +4,6 @@
 */

 #include <hdf5.h>
-#include <hdf5_hl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -104,7 +103,13 @@ int get_nxs_dataset_dims(struct ds_desc_t *desc) {
    ERROR_JUMP(-1, close_space, "Error getting dataset dimensions");
  }

-  desc->data_width = width;
+  if ( H5Tequal(t_id,H5T_NATIVE_CHAR)>0 || H5Tequal(t_id,H5T_NATIVE_INT)>0 || H5Tequal(t_id,H5T_NATIVE_SHORT)>0 || H5Tequal(t_id,H5T_NATIVE_LONG)>0 || H5Tequal(t_id,H5T_NATIVE_LLONG)>0 ) {
+    // signed
+    desc->data_width = -width;
+  } else {
+    // unsigned
+    desc->data_width =  width;
+  }

 close_space:
  H5Sclose(s_id);
@@ -222,7 +227,7 @@ int get_frame_from_chunk(const struct ds_desc_t *desc, const char *ds_name,
    c_buffer = buffer;
  }

-  if (H5DOread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) <
+  if (H5Dread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) <
      0) {
    char message[128];
    sprintf(message,
@@ -233,7 +238,7 @@ int get_frame_from_chunk(const struct ds_desc_t *desc, const char *ds_name,

  if (o_eiger_desc->bs_applied) {
    if (bslz4_decompress(o_eiger_desc->bs_params, c_bytes, c_buffer,
-                         desc->data_width * frame_size[1] * frame_size[2],
+                         abs(desc->data_width) * frame_size[1] * frame_size[2],
                         buffer) < 0) {
      char message[128];
      sprintf(message,
@@ -251,10 +256,11 @@ done:
  return retval;
 }

-int get_nxs_frame(const struct ds_desc_t *desc, const int n, void *buffer) {
+int get_nxs_frame(const struct ds_desc_t *desc, const int nin, void *buffer) {
  /* detector data are the two inner most indices */
  /* TODO: handle ndims > 3 and select appropriately */
  int retval = 0;
+  int n = nin - desc->image_number_offset;
  hsize_t frame_idx[3] = {n, 0, 0};
  hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]};
  if (n < 0 || n >= desc->dims[0]) {
@@ -271,9 +277,10 @@ done:
  return retval;
 }

-int get_dectris_eiger_frame(const struct ds_desc_t *desc, int n, void *buffer) {
+int get_dectris_eiger_frame(const struct ds_desc_t *desc, int nin, void *buffer) {

  int retval = 0;
+  int n = nin - desc->image_number_offset;
  int block, frame_count, idx;
  struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t *)desc;
  char data_name[16] = {0};
@@ -349,6 +356,10 @@ int get_dectris_eiger_dataset_dims(struct ds_desc_t *desc) {
    if (data_width <= 0) {
      ERROR_JUMP(-1, close_space, "Unable to get type size");
    }
+    if ( H5Tequal(t_id,H5T_NATIVE_CHAR)>0 || H5Tequal(t_id,H5T_NATIVE_INT)>0 || H5Tequal(t_id,H5T_NATIVE_SHORT)>0 || H5Tequal(t_id,H5T_NATIVE_LONG)>0 || H5Tequal(t_id,H5T_NATIVE_LLONG)>0 ) {
+      // signed
+      data_width = -data_width;
+    }

    ndims = H5Sget_simple_extent_ndims(s_id);
    if (ndims != 3) {
@@ -561,9 +572,37 @@ int get_dectris_eiger_pixel_mask(const struct ds_desc_t *desc, int *buffer) {
    ERROR_JUMP(-1, done, "Error opening detectorSpecific/pixel_mask");
  }

+  // what if this is compressed?
+  hid_t dcpl    = H5Dget_create_plist(ds_id);
+  int n_filters = H5Pget_nfilters(dcpl);
+  H5Z_filter_t    filter_id;
+  if (n_filters>0) {
+    unsigned int    flags;
+    size_t          nelmts = 1;
+    unsigned int    values_out[1] = {99};
+    char            filter_name[80];
+    for ( int i_filt = 0; i_filt < n_filters; i_filt++) {
+      filter_id = H5Pget_filter(dcpl, i_filt, &flags, &nelmts, values_out, sizeof(filter_name), filter_name, NULL);
+      if (filter_id>=0) {
+        fprintf(stderr," filter #%d name =\"%s\"\n",(i_filt+1),filter_name);
+      }
+    }
+  }
+
+  int i0 = H5Zfilter_avail(BS_H5_FILTER_ID);
+
  err = H5Dread(ds_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, buffer);
  if (err < 0) {
-    ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask");
+    if (n_filters>0) {
+      ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask with filter(s)");
+    }
+    else {
+      ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask");
+    }
+  }
+
+  if (!i0 && H5Zfilter_avail(BS_H5_FILTER_ID)) {
+    fprintf(stderr," bitshuffle filter is available now since H5Dread (of pixel-mask) triggered loading of the filter.\n");
  }

 close_dataset:
@@ -594,9 +633,9 @@ herr_t det_visit_callback(hid_t root_id, const char *name,

  /* check for an "NX_class" attribute */
  {
-    int str_size = 0;
-    void *buffer = NULL;
-    hid_t a_id, t_id, mt_id;
+    char* buffer = (char*)malloc(1);
+    buffer[0] = '\0';
+    hid_t a_id, t_id;
    if (H5Aexists(g_id, "NX_class") <= 0) {
      /* not an error - just close group and allow continuation */
      retval = 0;
@@ -616,39 +655,31 @@ herr_t det_visit_callback(hid_t root_id, const char *name,
    if (t_id < 0) {
      ERROR_JUMP(-1, close_attr, "Error getting datatype");
    }
-    if (H5Tis_variable_str(t_id) > 0) {
-      str_size = -1;
-      buffer = malloc(sizeof(char *));
-    } else {
-      str_size = H5Tget_size(t_id);
-      buffer = malloc(str_size + 1);
+
+    H5A_info_t a_info;
+    herr_t err = H5Aget_info(a_id, &a_info);
+    if (err<0) {
+      ERROR_JUMP(-1, close_type,
+                 "Unable to get attribute info for NX_class");
    }
+    else {
+      if (a_info.cset != H5T_CSET_ASCII && a_info.cset != H5T_CSET_UTF8) {
+        fprintf(stderr," %s : NX_class attribute info cset = unknown with size %d\n",name,(int) a_info.data_size);
+      }
+    }
+
+    buffer = (char *)malloc(sizeof(char)*(H5Tget_size(t_id)+1));
    if (!buffer) {
      ERROR_JUMP(-1, close_type, "Error allocating string buffer");
    }

-    mt_id = H5Tcopy(H5T_C_S1);
-    if (mt_id < 0) {
-      ERROR_JUMP(-1, free_buffer, "Error creating HDF5 String datatype");
-    }
-    // set the target string type to be one longer than the recorded string
-    // in keeping with the malloc'd buffer - if this is already a null
-    // terminated string then we will just have two nulls, if it is not
-    // then we won't clobber the last char in the buffer with a null in the
-    // H5Aread call
-    if (H5Tset_size(mt_id, str_size == -1 ? H5T_VARIABLE : str_size + 1) < 0) {
-      char message[64];
-      sprintf(message, "Error setting string datatype to size %d", str_size);
-      ERROR_JUMP(-1, close_mtype, message);
-    }
-
-    if (H5Aread(a_id, mt_id, buffer) < 0) {
+    if (H5Aread(a_id, t_id, buffer) < 0) {
      char message[256];
      sprintf(
          message,
          "H5OVisit callback: Error reading NX_class attribute on group %.128s",
          name);
-      ERROR_JUMP(-1, close_mtype, message);
+      ERROR_JUMP(-1, free_buffer, message);
    }

    /* ensure the buffer is null terminated */
@@ -656,26 +687,16 @@ herr_t det_visit_callback(hid_t root_id, const char *name,

    /* test for NXdata or NXdetector */
    {
-      char *nxclass = str_size > 0 ? (char *)buffer : *((char **)buffer);
-      if (strcmp("NXdata", nxclass) == 0) {
+      if      (strcmp("NXdata", buffer) == 0) {
        hid_t out_id = H5Gopen(root_id, name, H5P_DEFAULT);
        output_data->nxdata = out_id;
-      } else if (strcmp("NXdetector", nxclass) == 0) {
+      }
+      else if (strcmp("NXdetector", buffer) == 0) {
        hid_t out_id = H5Gopen(root_id, name, H5P_DEFAULT);
        output_data->nxdetector = out_id;
      }
    }

-    if (str_size == -1) {
-      hsize_t dims[1] = {1};
-      hid_t s_id = H5Screate_simple(1, dims, NULL);
-      H5Sselect_all(s_id);
-      H5Dvlen_reclaim(mt_id, s_id, H5P_DEFAULT, buffer);
-      H5Sclose(s_id);
-    }
-
-  close_mtype:
-    H5Tclose(mt_id);
  free_buffer:
    free(buffer);
  close_type:
@@ -15,10 +15,12 @@ struct ds_desc_t {
  hid_t data_g_id;
  hsize_t dims[3];
  int data_width;
+  int image_number_offset;
  int (*get_pixel_properties)(const struct ds_desc_t *, double *, double *);
  int (*get_pixel_mask)(const struct ds_desc_t *, int *);
  int (*get_data_frame)(const struct ds_desc_t *, const int, void *);
  void (*free_desc)(struct ds_desc_t *);
+  int i2i[]; // array to hold a translation from the image number requested by XDS and the actual position in the HDF5 file
 };

 struct nxs_ds_desc_t {
@@ -5,11 +5,16 @@

 #include <hdf5.h>
 #include <stdlib.h>
+#include <string.h>

 #include "file.h"
 #include "filters.h"
 #include "plugin.h"

+#ifdef USE_BITSHUFFLE
+#include "bshuf_h5filter.h"
+#endif
+
 /* XDS does not provide an error callback facility, so just write to stderr
   for now - generally regarded as poor practice */
 #define ERROR_OUTPUT stderr
@@ -17,24 +22,41 @@
 /* mask bits loosely based on what Neggia does and what NeXus says should be
   done basically - anything in the low byte (& 0xFF) means "ignore this"
   Neggia uses the value -2 if bit 1, 2 or 3 are set */
-#define COPY_AND_MASK(in, out, size, mask)                                     \
+/* CV-GPhL-20210408: we want more control over the value non-masked
+   pixels should be set to. */
+#define COPY_AND_MASK(in, value, setValue, out, size, mask)                    \
  {                                                                            \
    int i;                                                                     \
-    if (mask) {                                                                \
-      for (i = 0; i < size; ++i) {                                             \
-        out[i] = in[i];                                                        \
-        if (mask[i] & 0xFF)                                                    \
-          out[i] = -1;                                                         \
-        if (mask[i] & 30)                                                      \
-          out[i] = -2;                                                         \
+    if (value!=0) {                                                            \
+      if (mask) {                                                              \
+        for (i = 0; i < size; ++i) {                                           \
+          out[i] = (in[i] == value) ? setValue : in[i];                        \
+          if (mask[i] & 0xFF)                                                  \
+            out[i] = -1;                                                       \
+          if (mask[i] & 30)                                                    \
+            out[i] = -2;                                                       \
+        }                                                                      \
+      } else {                                                                 \
+        for (i = 0; i < size; i++) {                                           \
+          out[i] = (in[i] == value) ? setValue : in[i];                        \
+        }                                                                      \
      }                                                                        \
    } else {                                                                   \
-      for (i = 0; i < size; i++) {                                             \
-        out[i] = in[i];                                                        \
+      if (mask) {                                                              \
+        for (i = 0; i < size; ++i) {                                           \
+          out[i] = in[i];                                                      \
+          if (mask[i] & 0xFF)                                                  \
+            out[i] = -1;                                                       \
+          if (mask[i] & 30)                                                    \
+            out[i] = -2;                                                       \
+        }                                                                      \
+      } else {                                                                 \
+        for (i = 0; i < size; i++) {                                           \
+          out[i] = in[i];                                                      \
+        }                                                                      \
      }                                                                        \
    }                                                                          \
  }
-
 #define APPLY_MASK(buffer, mask, size)                                         \
  {                                                                            \
    int i;                                                                     \
@@ -52,42 +74,200 @@ static hid_t file_id = 0;
 static struct ds_desc_t *data_desc = NULL;
 static int *mask_buffer = NULL;

+// CV-20240605: potentially provide a mapping from frame number (as
+//              requested by caller) to actual 2D slice within 3D data
+//              array.
+//
+//              This is defined by the environment variable
+//              DURIN_IMAGE2ORDINAL (see below).
+int *image2ordinal = NULL;
+int image2ordinal_debug = 0;
+int image2ordinal_imin = 0;
+int image2ordinal_imax = 0;
+
 void fill_info_array(int info[1024]) {
  info[0] = DLS_CUSTOMER_ID;
  info[1] = VERSION_MAJOR;
  info[2] = VERSION_MINOR;
  info[3] = VERSION_PATCH;
  info[4] = VERSION_TIMESTAMP;
+  info[5] = 0; // image number offset
+  info[6] = -1; // marked pixels not already in pixel_mask: reset to this value
+
+  char *cenv;
+  cenv = getenv("DURIN_IMAGE_NUMBER_OFFSET");
+  if (cenv!=NULL) {
+    info[5] = atoi(cenv);
+  }
+  cenv = getenv("DURIN_RESET_UNMASKED_PIXEL");
+  if (cenv!=NULL) {
+    info[6] = atoi(cenv);
+  }
+
+  cenv = getenv("DURIN_IMAGE2ORDINAL");
+  if (cenv!=NULL&&(!image2ordinal)) {
+
+    char *denv = getenv("DURIN_IMAGE2ORDINAL_DEBUG");
+    if (denv!=NULL) {
+      image2ordinal_debug=1;
+    }
+
+    // <ordinal_start>,<image_1_start>-<image_1_end>,<image_2_start>-<image_2_end>,..,<image_N_start>-<image_N_end>
+    if (image2ordinal_debug) printf("DURIN_IMAGE2ORDINAL = \"%s\"\n",cenv);
+
+    const char outer_delimiters[] = ",";
+    const char inner_delimiters[] = "-";
+
+    char *found;
+
+    char *outer_saveptr;
+    char *inner_saveptr;
+
+    int ordinal_start = 0;
+    int ordinal = 0;
+    int ntt = -1;
+    found = strtok_r(cenv,outer_delimiters, &outer_saveptr);
+    if (found!=NULL) {
+      int tt[1000][2];
+      while(found) {
+        if (ordinal_start==0) {
+          ordinal_start = atoi(found);
+          ordinal = ordinal_start - 1;
+        }
+        else {
+          char* s = strtok_r(found, inner_delimiters, &inner_saveptr);
+          if (s!=NULL) {
+            int i1 = atoi(s);
+            s = strtok_r(NULL,inner_delimiters, &inner_saveptr);
+            if (s!=NULL) {
+              int i2 = atoi(s);
+              ntt++;
+              if (ntt<=1000) {
+                tt[ntt][0] = i1;
+                tt[ntt][1] = i2;
+                for(int i=i1; i<=i2; ++i) {
+                  ordinal++;
+                  if (ordinal==1) {
+                    image2ordinal_imin=i;
+                    image2ordinal_imax=i;
+                  }
+                  else {
+                    if (i<image2ordinal_imin) image2ordinal_imin=i;
+                    if (i>image2ordinal_imax) image2ordinal_imax=i;
+                  }
+                }
+              }
+            }
+          }
+        }
+        found = strtok_r(NULL,outer_delimiters,&outer_saveptr);
+      }
+
+      if (ordinal_start>0) {
+        if (image2ordinal_debug) {
+          printf("ordinal_start, end = %d %d\n",ordinal_start, ordinal);
+          printf("imin, imax         = %d %d\n",image2ordinal_imin,image2ordinal_imax);
+        }
+
+        // allocate array to go from image number/id to ordinal:
+        image2ordinal = malloc((image2ordinal_imax-image2ordinal_imin+1) * sizeof(image2ordinal_imin));
+        int ordinal = ordinal_start - 1;
+        for(int i=0; i<=ntt; i++) {
+          for(int j=tt[i][0];j<=tt[i][1];j++) {
+            ordinal++;
+            //printf(" %d -> %d\n",ordinal,j);
+            image2ordinal[j] = ordinal;
+          }
+        }
+        if (image2ordinal&&image2ordinal_debug) {
+          for(int i=image2ordinal_imin; i<=image2ordinal_imax; i++) {
+            if (image2ordinal[i]>0) {
+              printf(" %d -> %d\n",i,image2ordinal[i]);
+            }
+          }
+        }
+      }
+    }
+  }
+
 }

-int convert_to_int_and_mask(void *in_buffer, int d_width, int *out_buffer,
+int convert_to_int_and_mask(void *in_buffer, int width, int setValue, int *out_buffer,
                            int length, int *mask) {
  /* transfer data to output buffer, performing data conversion as required */
  int retval = 0;
  /* TODO: decide how conversion of data should work */
  /* Should we sign extend? Neggia doesn't (casts from uint*), but may be more
   * intuitive */
-  if (d_width == sizeof(signed char)) {
-    signed char *in = in_buffer;
-    COPY_AND_MASK(in, out_buffer, length, mask);
-  } else if (d_width == sizeof(short)) {
-    short *in = in_buffer;
-    COPY_AND_MASK(in, out_buffer, length, mask);
-  } else if (d_width == sizeof(int)) {
-    int *in = in_buffer;
-    COPY_AND_MASK(in, out_buffer, length, mask);
-  } else if (d_width == sizeof(long int)) {
-    long int *in = in_buffer;
-    COPY_AND_MASK(in, out_buffer, length, mask);
-  } else if (d_width == sizeof(long long int)) {
-    long long int *in = in_buffer;
-    COPY_AND_MASK(in, out_buffer, length, mask);
-  } else {
-    char message[128];
-    sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
-            d_width, sizeof(int));
-    ERROR_JUMP(-1, done, message);
+
+  int d_width = abs(width);
+
+  // CV-20210407
+  //   Dealing with a signed data array: no extra check for marker
+  //   value needed (since data can already take advantage of the
+  //   negative data range). It is unclear though why/when data would
+  //   come in as a signed array in the first place ...
+  if (width<0) {
+    if (d_width == sizeof(signed char)) {
+      // 8-bit
+      signed char *in = in_buffer;
+      COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(short)) {
+      // 16-bit
+      short *in = in_buffer;
+      COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(int)) {
+      // 16-bit
+      int *in = in_buffer;
+      COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(long int)) {
+      // 32-bit
+      long int *in = in_buffer;
+      COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(long long int)) {
+      // 64-bit
+      long long int *in = in_buffer;
+      COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
+    } else {
+      char message[128];
+      sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
+              d_width, sizeof(int));
+      ERROR_JUMP(-1, done, message);
+    }
  }
+  // CV-20210407
+  //   Dealing with an unsigned data array: extra check for marker
+  //   value required (to handle overloaded pixels correctly if wanted
+  //   - see also DURIN_RESET_UNMASKED_PIXEL environment variable).
+  else {
+    if (d_width == sizeof(unsigned char)) {
+      // 8-bit
+      unsigned char *in = in_buffer;
+      COPY_AND_MASK(in, UINT8_MAX, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(unsigned short)) {
+      // 16-bit
+      unsigned short *in = in_buffer;
+      COPY_AND_MASK(in, UINT16_MAX, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(unsigned int)) {
+      // 16-bit
+      unsigned int *in = in_buffer;
+      COPY_AND_MASK(in, UINT16_MAX, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(unsigned long)) {
+      // 32-bit
+      unsigned long *in = in_buffer;
+      COPY_AND_MASK(in, UINT32_MAX, setValue, out_buffer, length, mask);
+    } else if (d_width == sizeof(unsigned long long)) {
+      // 64-bit
+      unsigned long long *in = in_buffer;
+      COPY_AND_MASK(in, UINT32_MAX, setValue, out_buffer, length, mask);
+    } else {
+      char message[128];
+      sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
+              d_width, sizeof(int));
+      ERROR_JUMP(-1, done, message);
+    }
+  }
+
 done:
  return retval;
 }
@@ -110,6 +290,12 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
    ERROR_JUMP(-2, done, "Failed to configure HDF5 error handling");
  }

+#ifdef USE_BITSHUFFLE
+  if (bshuf_register_h5filter() < 0 ) {
+    ERROR_JUMP(-2, done, "Failed to register bitshuffle filter");
+  }
+#endif
+
  fill_info_array(info);
  file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
  if (file_id < 0) {
@@ -124,6 +310,8 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
    ERROR_JUMP(-4, done, "");
  }

+  data_desc->image_number_offset = info[5];
+
  mask_buffer = malloc(data_desc->dims[1] * data_desc->dims[2] * sizeof(int));
  if (mask_buffer) {
    retval = data_desc->get_pixel_mask(data_desc, mask_buffer);
@@ -138,6 +326,10 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
  }
  retval = 0;

+#ifdef GPHL_COMPILE_DATE
+  fprintf(ERROR_OUTPUT, "\n XDS HDF5/Durin plugin %d.%d.%d (DLS, 2018-2023; GPhL, 2020-2024 - built %d)\n", info[1], info[2], info[3], GPHL_COMPILE_DATE);
+#endif
+
 done:
  *error_flag = retval;
  if (retval < 0) {
@@ -165,7 +357,7 @@ void plugin_get_header(int *nx, int *ny, int *nbytes, float *qx, float *qy,

  *nx = data_desc->dims[2];
  *ny = data_desc->dims[1];
-  *nbytes = data_desc->data_width;
+  *nbytes = abs(data_desc->data_width);
  *number_of_frames = data_desc->dims[0];
  *qx = (float)x_pixel_size;
  *qy = (float)y_pixel_size;
@@ -186,26 +378,47 @@ void plugin_get_data(int *frame_number, int *nx, int *ny, int *data_array,
  fill_info_array(info);

  void *buffer = NULL;
-  if (sizeof(*data_array) == data_desc->data_width) {
+  if (sizeof(*data_array) == abs(data_desc->data_width)) {
    buffer = data_array;
  } else {
-    buffer = malloc(data_desc->data_width * frame_size_px);
+    buffer = malloc(abs(data_desc->data_width) * frame_size_px);
    if (!buffer) {
      ERROR_JUMP(-1, done, "Unable to allocate data buffer");
    }
  }

-  if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, buffer) < 0) {
+  int ordinal = *frame_number;
+  if (image2ordinal) {
+    if (ordinal < image2ordinal_imin || ordinal>image2ordinal_imax) {
+      char message[64] = {0};
+      sprintf(message, "Failed to map frame %d to ordinals since outside of range %d - %d", ordinal,image2ordinal_imin,image2ordinal_imax);
+      ERROR_JUMP(-2, done, message);
+    }
+    ordinal = image2ordinal[ordinal];
+    if (ordinal!=*frame_number) {
+      if (image2ordinal_debug) printf("fetching data from ordinal %d for frame %d\n",ordinal,*frame_number);
+    }
+  }
+
+  if (data_desc->get_data_frame(data_desc, ordinal - 1, buffer) < 0) {
    char message[64] = {0};
-    sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
+    if (image2ordinal) {
+      sprintf(message, "Failed to retrieve data for frame %d (ordinal %d)", *frame_number, ordinal);
+    } else {
+      sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
+    }
    ERROR_JUMP(-2, done, message);
  }

  if (buffer != data_array) {
-    if (convert_to_int_and_mask(buffer, data_desc->data_width, data_array,
+    if (convert_to_int_and_mask(buffer, data_desc->data_width, info[6], data_array,
                                frame_size_px, mask_buffer) < 0) {
      char message[64];
-      sprintf(message, "Error converting data for frame %d", *frame_number);
+      if (image2ordinal) {
+        sprintf(message, "Error converting data for frame %d (ordinal %d)", *frame_number, ordinal);
+      } else {
+        sprintf(message, "Error converting data for frame %d", *frame_number);
+      }
      ERROR_JUMP(-2, done, message);
    }
  } else {
@@ -230,6 +443,11 @@ void plugin_close(int *error_flag) {
  }
  file_id = 0;

+  if (image2ordinal) {
+    free(image2ordinal);
+    image2ordinal = NULL;
+  }
+
  if (mask_buffer) {
    free(mask_buffer);
    mask_buffer = NULL;
@@ -0,0 +1,195 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+import json
+import mimetools
+import mimetypes
+import os
+import sys
+import urllib2
+
+
+def fail(msg, code=1):
+    sys.stderr.write("ERROR: %s\n" % msg)
+    sys.exit(code)
+
+
+def api_request(url, token, method="GET", json_data=None, content_type=None):
+    data = None
+    headers = {
+        "Authorization": "token %s" % token,
+    }
+
+    if json_data is not None:
+        data = json.dumps(json_data)
+        headers["Content-Type"] = "application/json"
+    elif content_type is not None:
+        headers["Content-Type"] = content_type
+
+    request = urllib2.Request(url, data=data, headers=headers)
+    request.get_method = lambda: method
+
+    try:
+        response = urllib2.urlopen(request)
+        body = response.read()
+        status = response.getcode()
+        return status, body
+    except urllib2.HTTPError as e:
+        return e.code, e.read()
+
+
+def encode_multipart_formdata(fields, files):
+    boundary = mimetools.choose_boundary()
+    crlf = "\r\n"
+    lines = []
+
+    for key, value in fields:
+        lines.append("--" + boundary)
+        lines.append('Content-Disposition: form-data; name="%s"' % key)
+        lines.append("")
+        lines.append(value)
+
+    for key, filename, content, content_type in files:
+        lines.append("--" + boundary)
+        lines.append(
+            'Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)
+        )
+        lines.append("Content-Type: %s" % content_type)
+        lines.append("")
+        lines.append(content)
+
+    lines.append("--" + boundary + "--")
+    lines.append("")
+    body = crlf.join(lines)
+    content_type = "multipart/form-data; boundary=%s" % boundary
+    return content_type, body
+
+
+def multipart_request(url, token, fields, files):
+    content_type, body = encode_multipart_formdata(fields, files)
+    headers = {
+        "Authorization": "token %s" % token,
+        "Content-Type": content_type,
+    }
+
+    request = urllib2.Request(url, data=body, headers=headers)
+    request.get_method = lambda: "POST"
+
+    try:
+        response = urllib2.urlopen(request)
+        return response.getcode(), response.read()
+    except urllib2.HTTPError as e:
+        return e.code, e.read()
+
+
+def get_release_by_tag(api_base, token, tag):
+    url = "%s/releases/tags/%s" % (api_base, tag)
+    status, body = api_request(url, token, method="GET")
+    if status == 200:
+        return json.loads(body)
+    if status == 404:
+        return None
+    fail("failed to fetch release for tag %s: HTTP %s\n%s" % (tag, status, body))
+
+
+def create_release(api_base, token, tag):
+    url = "%s/releases" % api_base
+    payload = {
+        "tag_name": tag,
+        "name": tag,
+        "draft": False,
+        "prerelease": False,
+    }
+    status, body = api_request(url, token, method="POST", json_data=payload)
+    if status not in (200, 201):
+        fail("failed to create release for tag %s: HTTP %s\n%s" % (tag, status, body))
+    return json.loads(body)
+
+
+def ensure_release(api_base, token, tag):
+    release = get_release_by_tag(api_base, token, tag)
+    if release is not None:
+        print("Release for tag %s already exists (id=%s)" % (tag, release.get("id")))
+        return release
+
+    print("Release for tag %s does not exist, creating it" % tag)
+    release = create_release(api_base, token, tag)
+    print("Created release id=%s" % release.get("id"))
+    return release
+
+
+def upload_asset(api_base, token, release_id, file_path):
+    if not os.path.isfile(file_path):
+        fail("file not found: %s" % file_path)
+
+    asset_name = os.path.basename(file_path)
+    mime_type = mimetypes.guess_type(asset_name)[0] or "application/octet-stream"
+
+    with open(file_path, "rb") as f:
+        content = f.read()
+
+    url = "%s/releases/%s/assets" % (api_base, release_id)
+    status, body = multipart_request(
+        url,
+        token,
+        fields=[("name", asset_name)],
+        files=[("attachment", asset_name, content, mime_type)],
+    )
+
+    if status not in (200, 201):
+        fail("failed to upload asset %s: HTTP %s\n%s" % (asset_name, status, body))
+
+    print("Uploaded asset: %s" % asset_name)
+
+
+def find_assets(build_dir):
+    names = []
+    for name in sorted(os.listdir(build_dir)):
+        if name.startswith("libdurin-plugin.so"):
+            full = os.path.join(build_dir, name)
+            if os.path.isfile(full):
+                names.append(full)
+    return names
+
+
+def main():
+    if len(sys.argv) != 4:
+        sys.stderr.write(
+            "Usage: %s <gitea-server> <owner/repo> <tag>\n" % sys.argv[0]
+        )
+        sys.stderr.write(
+            "Example: %s https://gitea.psi.ch mx/durin 1.0.0\n" % sys.argv[0]
+        )
+        sys.exit(1)
+
+    server = sys.argv[1].rstrip("/")
+    repo = sys.argv[2]
+    tag = sys.argv[3]
+
+    token = os.environ.get("GITEA_TOKEN")
+    if not token:
+        fail("GITEA_TOKEN environment variable is not set")
+
+    build_dir = "build"
+    if not os.path.isdir(build_dir):
+        fail("build directory not found: %s" % build_dir)
+
+    assets = find_assets(build_dir)
+    if not assets:
+        fail("no libdurin-plugin.so* files found in %s" % build_dir)
+
+    api_base = "%s/api/v1/repos/%s" % (server, repo)
+
+    release = ensure_release(api_base, token, tag)
+    release_id = release.get("id")
+    if not release_id:
+        fail("release id missing in API response")
+
+    for asset in assets:
+        upload_asset(api_base, token, release_id, asset)
+
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
Author	SHA1	Message	Date
leonarski_f	d39e9d7a99	Don't hide symbols in the shared library Build Packages / Build (push) Successful in 31s Details	2026-04-08 16:03:30 +02:00
leonarski_f	a3235390a4	README: Update Build Packages / Build (push) Successful in 33s Details	2026-04-08 15:15:22 +02:00
leonarski_f	215e9471ad	CI: Dedicated upload python script Build Packages / Build (push) Successful in 31s Details	2026-04-08 14:58:53 +02:00
leonarski_f	7fb6117be7	CI: Try again upload Build Packages / Build (push) Failing after 32s Details	2026-04-08 14:51:20 +02:00
leonarski_f	a87f4a56e4	CI: Try again upload Build Packages / Build (push) Failing after 29s Details	2026-04-08 14:48:56 +02:00
leonarski_f	942e3ae532	CI: Try again upload Build Packages / Build (push) Failing after 39s Details	2026-04-08 14:46:36 +02:00
leonarski_f	11c9b581f2	Fix TOKEN Build Packages / Build (push) Failing after 31s Details	2026-04-08 14:35:28 +02:00
leonarski_f	a7b81b7db0	Trying with tag and uploading release Build Packages / Build (push) Failing after 29s Details	2026-04-08 13:32:09 +02:00
leonarski_f	959636cbc3	Add version 1.0.0	2026-04-08 13:05:52 +02:00
leonarski_f	60135d9471	Include Node.js in the docker (needed for gitea)	2026-04-08 13:04:54 +02:00
leonarski_f	dd8bb91931	Add gitea process Build Packages / Build (push) Successful in 29s Details	2026-04-08 12:49:39 +02:00
leonarski_f	15e8781a74	Use CMake to build the plugin and HDF5 (internally)	2026-04-08 12:39:50 +02:00
leonarski_f	cfe032b731	Remove need for hdf5_hl.h (so no need for unsupported configuration of HDF5)	2026-04-08 12:38:52 +02:00
leonarski_f	6421de97bb	Improve Makefile	2026-04-08 10:32:20 +02:00
leonarski_f	69470cd374	Update bitshuffle/lz4 code + embed bshuf_h5filter code into the library	2026-04-08 10:16:46 +02:00
CV-GPhL	2bcd3074cc	Null terminate buffer in file.c Ensure the buffer is null terminated after allocation - otherwise correct results from strcmp below is undefined.	2026-02-04 11:20:21 +01:00
Clemens Vonrhein	cfe0e78319	loop over all filters registered when printing	2024-12-18 16:46:14 +00:00
Clemens Vonrhein	d7801cdb54	modified reading of NX_class attribute - to accommodate both DATATYPE H5T_STRING { STRSIZE 14; STRPAD H5T_STR_NULLTERM; CSET H5T_CSET_ASCII; CTYPE H5T_C_S1; } and DATATYPE H5T_STRING { STRSIZE H5T_VARIABLE; STRPAD H5T_STR_NULLTERM; CSET H5T_CSET_UTF8; CTYPE H5T_C_S1; }	2024-12-17 09:59:55 +00:00
Clemens Vonrhein	4cf18ceff4	Mechanism to read bitshuffle-compressed pixel-mask data - by loading/using the normal bitshuffle source from https://github.com/kiyo-masui/bitshuffle (see Makefile).	2024-12-16 15:05:38 +00:00
Clemens Vonrhein	44ba3cd278	Provide a mechanism to distinguish between an image number (as understood on the XDS side) and the image ordinal (as stored in the HDF data). The 2D data arrays are not necessarily at image_nr_low=1 ;-)	2024-12-16 14:57:32 +00:00
CV-GPhL	ca8806c912	Merge pull request #2 from CV-GPhL/CV-GPhL-patch-1 Update plugin.c to write compilation time if available/set	2024-03-14 16:43:22 +01:00
CV-GPhL	163d5bdfb1	Update plugin.c to write compilation time if available/set	2024-03-14 16:42:52 +01:00
CV-GPhL	ca2c0ee0e7	Merge pull request #1 from DiamondLightSource/master Catching up 20240223	2024-02-23 12:55:55 +01:00
Graeme Winter	808f5fbd42	Do not depend on the strings being NULL terminated (#29 ) Fixes #28 Instead make them NULL terminated by reading one longer into buffer	2023-08-09 09:42:16 +01:00
Clemens Vonrhein	77d2b84957	Better handling of (usually) unsigned data arrays when calling routine to convert to (signed) int and apply pixel mask. The environment variable DURIN_RESET_UNMASKED_PIXEL can now be used to set non-masked saturated pixels in order to process them correctly with e.g. XDS.	2021-04-08 14:46:48 +01:00
Clemens Vonrhein	0f27832f78	Resolved merge conflicts	2021-04-06 12:33:21 +01:00
Clemens Vonrhein	9e7d609f42	allow for image offset via DURIN_IMAGE_NUMBER_OFFSET environment variable	2020-06-12 15:57:14 +01:00