Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0e258cc8c9 | |||
| bec44c8691 |
@@ -1,35 +0,0 @@
|
||||
name: Build Packages
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- '**'
|
||||
tags:
|
||||
- '**'
|
||||
|
||||
jobs:
|
||||
build-library:
|
||||
name: Build
|
||||
runs-on: durin
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build library
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
|
||||
- name: Upload release assets to Gitea
|
||||
if: github.ref_type == 'tag'
|
||||
shell: bash
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.PIP_REPOSITORY_API_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
python tools/gitea_release_upload.py \
|
||||
"${{ github.server_url }}" \
|
||||
"${{ github.repository }}" \
|
||||
"${{ github.ref_name }}"
|
||||
@@ -1,56 +0,0 @@
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.19)
|
||||
|
||||
PROJECT(durin VERSION 1.0.0 LANGUAGES C)
|
||||
|
||||
include(FetchContent)
|
||||
|
||||
SET(CMAKE_C_FLAGS_RELEASE "-O3")
|
||||
|
||||
SET(CMAKE_C_STANDARD 99)
|
||||
SET(CMAKE_C_STANDARD_REQUIRED ON)
|
||||
SET(CMAKE_C_EXTENSIONS OFF)
|
||||
|
||||
SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
SET(BUILD_SHARED_LIBS OFF)
|
||||
|
||||
set(HDF5_USE_STATIC_LIBRARIES TRUE)
|
||||
|
||||
SET(HDF5_BUILD_HL_LIB OFF)
|
||||
|
||||
SET(HDF5_ENABLE_THREADSAFE ON)
|
||||
SET(HDF5_ENABLE_SZIP_SUPPORT OFF)
|
||||
SET(HDF5_ENABLE_SZIP_ENCODING OFF)
|
||||
SET(HDF5_BUILD_EXAMPLES OFF)
|
||||
SET(HDF5_BUILD_CPP_LIB OFF)
|
||||
SET(HDF5_ENABLE_Z_LIB_SUPPORT OFF)
|
||||
SET(HDF5_EXTERNALLY_CONFIGURED 1)
|
||||
|
||||
INCLUDE_DIRECTORIES(bslz4/src)
|
||||
|
||||
FetchContent_Declare(hdf5
|
||||
URL https://github.com/HDFGroup/hdf5/releases/download/hdf5_1.14.6/hdf5-1.14.6.tar.gz
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP FALSE
|
||||
EXCLUDE_FROM_ALL)
|
||||
|
||||
FetchContent_MakeAvailable(hdf5)
|
||||
|
||||
ADD_LIBRARY(durin-plugin SHARED
|
||||
src/plugin.c src/plugin.h
|
||||
src/err.c src/err.h
|
||||
src/filters.c src/filters.h
|
||||
src/file.c src/file.h bslz4/src/bitshuffle.c bslz4/src/bitshuffle.h
|
||||
bslz4/src/bitshuffle_core.c bslz4/src/bitshuffle_core.h
|
||||
bslz4/src/bitshuffle_internals.h
|
||||
bslz4/src/bshuf_h5filter.c bslz4/src/bshuf_h5filter.h
|
||||
bslz4/src/iochain.c bslz4/src/iochain.h
|
||||
bslz4/src/lz4.c bslz4/src/lz4.h
|
||||
)
|
||||
|
||||
set_target_properties(durin-plugin PROPERTIES VERSION 1.0.0)
|
||||
|
||||
TARGET_COMPILE_DEFINITIONS(durin-plugin PRIVATE
|
||||
H5_USE_110_API
|
||||
USE_BITSHUFFLE)
|
||||
|
||||
TARGET_LINK_LIBRARIES(durin-plugin PRIVATE hdf5-static)
|
||||
-43
@@ -1,43 +0,0 @@
|
||||
FROM registry.access.redhat.com/ubi7/ubi
|
||||
|
||||
ARG HDF5_TAG="hdf5_1.14.6"
|
||||
|
||||
LABEL authors="Filip Leonarski"
|
||||
|
||||
ARG CMAKE_VERSION=3.31.6
|
||||
ARG NODE_MAJOR=16
|
||||
|
||||
RUN yum -y update && \
|
||||
yum -y install \
|
||||
gcc \
|
||||
gcc-c++ \
|
||||
git \
|
||||
make \
|
||||
tar \
|
||||
gzip \
|
||||
curl && \
|
||||
yum clean all
|
||||
|
||||
# Install a recent Node.js (NodeSource). Change NODE_MAJOR if you want another major version.
|
||||
RUN curl -fsSL https://rpm.nodesource.com/setup_${NODE_MAJOR}.x | bash - && \
|
||||
yum -y install nodejs && \
|
||||
yum clean all && \
|
||||
node --version && npm --version && (corepack enable || true)
|
||||
|
||||
RUN set -eux; \
|
||||
arch="$(uname -m)"; \
|
||||
case "$arch" in \
|
||||
x86_64) cmake_arch="x86_64" ;; \
|
||||
aarch64) cmake_arch="aarch64" ;; \
|
||||
*) echo "Unsupported architecture: $arch"; exit 1 ;; \
|
||||
esac; \
|
||||
curl -L "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${cmake_arch}.tar.gz" \
|
||||
-o /tmp/cmake.tar.gz; \
|
||||
tar -xzf /tmp/cmake.tar.gz -C /opt; \
|
||||
ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/cmake" /usr/local/bin/cmake; \
|
||||
ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/ctest" /usr/local/bin/ctest; \
|
||||
ln -s "/opt/cmake-${CMAKE_VERSION}-linux-${cmake_arch}/bin/cpack" /usr/local/bin/cpack; \
|
||||
rm -f /tmp/cmake.tar.gz
|
||||
|
||||
# Default entrypoint prints tool versions and hints.
|
||||
CMD ["/bin/bash", "-l"]
|
||||
@@ -0,0 +1,55 @@
|
||||
BUILD_DIR ?= ./build
|
||||
SRC_DIR = ./src
|
||||
TEST_DIR = ./test
|
||||
INC_DIR = $(SRC_DIR)
|
||||
|
||||
BSLZ4_SRC_DIR = ./bslz4/src
|
||||
BSLZ4_BUILD_DIR = ./bslz4/build
|
||||
BSLZ4_INC_DIR = $(BSLZ4_SRC_DIR)
|
||||
|
||||
CC=h5cc
|
||||
CFLAGS=-DH5_USE_110_API -Wall -g -O2 -fpic -I$(INC_DIR) -I$(BSLZ4_INC_DIR) -std=c99 -shlib
|
||||
|
||||
.PHONY: plugin
|
||||
plugin: $(BUILD_DIR)/durin-plugin.so
|
||||
|
||||
.PHONY: all
|
||||
all: plugin example test_plugin
|
||||
|
||||
.PHONY: example
|
||||
example: $(BUILD_DIR)/example
|
||||
|
||||
.PHONY: test_plugin
|
||||
test_plugin: $(BUILD_DIR)/test_plugin
|
||||
|
||||
$(BUILD_DIR)/test_plugin: $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90
|
||||
mkdir -p $(BUILD_DIR)
|
||||
gfortran -O -g -fopenmp -ldl $(TEST_DIR)/generic_data_plugin.f90 $(TEST_DIR)/test_generic_host.f90 -o $@ -J$(BUILD_DIR)
|
||||
|
||||
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c
|
||||
mkdir -p $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(BSLZ4_BUILD_DIR)/%.o: $(BSLZ4_SRC_DIR)/%.c
|
||||
mkdir -p $(BSLZ4_BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(BUILD_DIR)/bslz4.a: $(BSLZ4_BUILD_DIR)/lz4.o $(BSLZ4_BUILD_DIR)/bitshuffle.o \
|
||||
$(BSLZ4_BUILD_DIR)/bitshuffle_core.o $(BSLZ4_BUILD_DIR)/iochain.o
|
||||
mkdir -p $(BUILD_DIR)
|
||||
ar rcs $@ $^
|
||||
|
||||
$(BUILD_DIR)/durin-plugin.so: $(BUILD_DIR)/plugin.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
|
||||
$(BUILD_DIR)/bslz4.a
|
||||
mkdir -p $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) -shared -noshlib $^ -o $(BUILD_DIR)/durin-plugin.so
|
||||
|
||||
$(BUILD_DIR)/example: $(BUILD_DIR)/test.o $(BUILD_DIR)/file.o $(BUILD_DIR)/err.o $(BUILD_DIR)/filters.o \
|
||||
$(BUILD_DIR)/bslz4.a
|
||||
mkdir -p $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) $^ -o $(BUILD_DIR)/example
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -r $(BUILD_DIR)
|
||||
rm -r $(BSLZ4_BUILD_DIR)
|
||||
@@ -8,24 +8,14 @@ See:
|
||||
* https://www.dectris.com/features/features-eiger-x/hdf5-and-nexus
|
||||
* https://strucbio.biologie.uni-konstanz.de/xdswiki
|
||||
|
||||
## Paul Scherrer Institute fork
|
||||
This fork is maintained by Paul Scherrer Institute.
|
||||
The plugin is based on the code developed by the Diamond Light Source and modified by Global Phasing.
|
||||
Modifications from PSI side:
|
||||
* Using CMake for building the plugin
|
||||
* HDF5 is built as part of the CMake process
|
||||
* Bitshuffle/LZ4 is updated to the latest version
|
||||
* HDF5 filter is automatically registered for virtual dataset HDF5 files
|
||||
* Docker image to build the plugin on x86/RH7 is provided.
|
||||
* Generated versioned shared library, so the used version can be tracked.
|
||||
|
||||
## Get Durin
|
||||
Linux x86 version is automatically built on RHEL 7 and available from the Gitea release page.
|
||||
|
||||
Latest release at https://github.com/DiamondLightSource/durin/releases/tag/2019v1
|
||||
|
||||
## Usage
|
||||
In your XDS.INP add:
|
||||
```
|
||||
LIB=[path to libdurin-plugin.so]
|
||||
LIB=[path to durin-plugin.so]
|
||||
NAME_TEMPLATE_OF_DATA_FRAMES=[data_path]/data_images_??????.h5
|
||||
```
|
||||
XDS will instruct the plugin to load `[data_path]/data_images_master.h5` and this must be the
|
||||
@@ -37,25 +27,55 @@ the master file contains an `NXdata` or `NXdetector` group with either a dataset
|
||||
series of datasets named `data_000001`, `data_000002`, etc.
|
||||
|
||||
|
||||
## Requirements
|
||||
* HDF5 Library (https://www.hdfgroup.org/downloads)
|
||||
|
||||
|
||||
## Building
|
||||
Requires CMake version 3.19 or later + GCC compiler. There is no need to build HDF5 separately.
|
||||
To build:
|
||||
|
||||
### Building HDF5 library
|
||||
The HDF5 library used when building durin must have been compiled with specific switches enabled
|
||||
to allow the durin plugin to be built and used.
|
||||
|
||||
Download the HDF5 source code (https://www.hdfgroup.org/downloads/hdf5/source-code) and extract
|
||||
to any directory (referred to as `/hdf5_dir`), and run the following commands.
|
||||
```
|
||||
cd /hdf5_dir
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j
|
||||
export CFLAGS=-fPIC
|
||||
../configure --enable-threadsafe --enable-deprecated-symbols --enable-hl --enable-unsupported
|
||||
make
|
||||
make check
|
||||
make install
|
||||
```
|
||||
The hdf5 tools and libraries should now be located in `/hdf5_dir/build/hdf5`
|
||||
|
||||
For reference, the plugin requires the thread-safe switch and the optimised chunk read function.
|
||||
The chunk read function may be defined in the high level library instead of the regular library,
|
||||
depending on the exact HDF5 version downloaded (hence the --enable-deprecated-symbols _and_ --enable-hl).
|
||||
The unsupported flag enables building with both threadsafe and high-level enabled.
|
||||
|
||||
|
||||
### Building durin plugin
|
||||
The plugin makefile will use the "h5cc" compiler wrapper, provided by the HDF5 library, which
|
||||
must be on your PATH.
|
||||
Download or clone the plugin source code (https://github.com/DiamondLightSource/durin)
|
||||
into any directory (referred to as `/durin_dir`) and run the following commands.
|
||||
```
|
||||
cd /durin_dir
|
||||
PATH=/hdf5_dir/build/hdf5/bin:$PATH
|
||||
make
|
||||
```
|
||||
The plugin is located at `/durin_dir/build/durin-plugin.so` and should be added to the
|
||||
XDS.INP file as `LIB=/durin_dir/build/durin-plugin.so`
|
||||
|
||||
|
||||
The plugin is located at `build/libdurin-plugin.so` and should be added to the
|
||||
XDS.INP file as `LIB=<CURRENT_DIRECTORY>/build/libdurin-plugin.so`.
|
||||
Alternatively, versioned copy is also provided, e.g. `libdurin-plugin.so.1.0.0`, allowing to track
|
||||
the current version of the plugin.
|
||||
|
||||
## Example XDS.INP
|
||||
```
|
||||
DETECTOR=PILATUS MINIMUM_VALID_PIXEL_VALUE=0 OVERLOAD=4096
|
||||
LIB=/opt/durin/build/libdurin-plugin.so
|
||||
LIB=/opt/durin/build/durin-plugin.so
|
||||
SENSOR_THICKNESS= 0.450
|
||||
!SENSOR_MATERIAL / THICKNESS Si 0.450
|
||||
!SILICON= 3.953379
|
||||
@@ -75,3 +95,6 @@ TRUSTED_REGION= 0.0 1.41
|
||||
DATA_RANGE= 1 600
|
||||
JOB=XYCORR INIT COLSPOT IDXREF DEFPIX INTEGRATE CORRECT
|
||||
```
|
||||
|
||||
N.B. the master file is needed, not the .nxs one which follows the
|
||||
standard.
|
||||
|
||||
+25
-139
@@ -14,22 +14,23 @@
|
||||
#include "bitshuffle_internals.h"
|
||||
#include "lz4.h"
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
#include "zstd.h"
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
// Macros.
|
||||
|
||||
|
||||
#define BSHUF_LZ4_DECOMPRESS_FAST
|
||||
|
||||
|
||||
|
||||
#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \
|
||||
free(buf); return count - 1000; }
|
||||
|
||||
|
||||
/* Bitshuffle and compress a single block. */
|
||||
int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
const size_t size, const size_t elem_size) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *tmp_buf_bshuf;
|
||||
@@ -41,8 +42,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
tmp_buf_bshuf = malloc(size * elem_size);
|
||||
if (tmp_buf_bshuf == NULL) return -1;
|
||||
|
||||
int dst_capacity = LZ4_compressBound(size * elem_size);
|
||||
tmp_buf_lz4 = malloc(dst_capacity);
|
||||
tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
|
||||
if (tmp_buf_lz4 == NULL){
|
||||
free(tmp_buf_bshuf);
|
||||
return -1;
|
||||
@@ -58,7 +58,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
free(tmp_buf_bshuf);
|
||||
return count;
|
||||
}
|
||||
nbytes = LZ4_compress_default((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size, dst_capacity);
|
||||
nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
|
||||
free(tmp_buf_bshuf);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);
|
||||
|
||||
@@ -76,7 +76,7 @@ int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
|
||||
|
||||
/* Decompress and bitunshuffle a single block. */
|
||||
int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
const size_t size, const size_t elem_size) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *out, *tmp_buf;
|
||||
@@ -96,6 +96,14 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
#ifdef BSHUF_LZ4_DECOMPRESS_FAST
|
||||
nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
if (nbytes != nbytes_from_header) {
|
||||
free(tmp_buf);
|
||||
return -91;
|
||||
}
|
||||
#else
|
||||
nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
|
||||
size * elem_size);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
@@ -104,7 +112,7 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
return -91;
|
||||
}
|
||||
nbytes = nbytes_from_header;
|
||||
|
||||
#endif
|
||||
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
nbytes += 4;
|
||||
@@ -113,92 +121,6 @@ int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
/* Bitshuffle and compress a single block. */
|
||||
int64_t bshuf_compress_zstd_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size, const int comp_lvl) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *tmp_buf_bshuf;
|
||||
void *tmp_buf_zstd;
|
||||
size_t this_iter;
|
||||
const void *in;
|
||||
void *out;
|
||||
|
||||
tmp_buf_bshuf = malloc(size * elem_size);
|
||||
if (tmp_buf_bshuf == NULL) return -1;
|
||||
|
||||
size_t tmp_buf_zstd_size = ZSTD_compressBound(size * elem_size);
|
||||
tmp_buf_zstd = malloc(tmp_buf_zstd_size);
|
||||
if (tmp_buf_zstd == NULL){
|
||||
free(tmp_buf_bshuf);
|
||||
return -1;
|
||||
}
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
|
||||
|
||||
count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
|
||||
if (count < 0) {
|
||||
free(tmp_buf_zstd);
|
||||
free(tmp_buf_bshuf);
|
||||
return count;
|
||||
}
|
||||
nbytes = ZSTD_compress(tmp_buf_zstd, tmp_buf_zstd_size, (const void*)tmp_buf_bshuf, size * elem_size, comp_lvl);
|
||||
free(tmp_buf_bshuf);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_zstd);
|
||||
|
||||
out = ioc_get_out(C_ptr, &this_iter);
|
||||
ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
|
||||
|
||||
bshuf_write_uint32_BE(out, nbytes);
|
||||
memcpy((char *) out + 4, tmp_buf_zstd, nbytes);
|
||||
|
||||
free(tmp_buf_zstd);
|
||||
|
||||
return nbytes + 4;
|
||||
}
|
||||
|
||||
|
||||
/* Decompress and bitunshuffle a single block. */
|
||||
int64_t bshuf_decompress_zstd_block(ioc_chain *C_ptr,
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
|
||||
int64_t nbytes, count;
|
||||
void *out, *tmp_buf;
|
||||
const void *in;
|
||||
size_t this_iter;
|
||||
int32_t nbytes_from_header;
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
nbytes_from_header = bshuf_read_uint32_BE(in);
|
||||
ioc_set_next_in(C_ptr, &this_iter,
|
||||
(void*) ((char*) in + nbytes_from_header + 4));
|
||||
|
||||
out = ioc_get_out(C_ptr, &this_iter);
|
||||
ioc_set_next_out(C_ptr, &this_iter,
|
||||
(void *) ((char *) out + size * elem_size));
|
||||
|
||||
tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
nbytes = ZSTD_decompress(tmp_buf, size * elem_size, (void *)((char *) in + 4), nbytes_from_header);
|
||||
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
|
||||
if (nbytes != size * elem_size) {
|
||||
free(tmp_buf);
|
||||
return -91;
|
||||
}
|
||||
|
||||
nbytes = nbytes_from_header;
|
||||
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
nbytes += 4;
|
||||
|
||||
free(tmp_buf);
|
||||
return nbytes;
|
||||
}
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
|
||||
/* ---- Public functions ----
|
||||
*
|
||||
@@ -216,13 +138,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
|
||||
}
|
||||
if (block_size % BSHUF_BLOCKED_MULT) return -81;
|
||||
|
||||
// Note that each block gets a 4 byte header.
|
||||
// Size of full blocks.
|
||||
|
||||
|
||||
bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
|
||||
// Size of partial blocks, if any.
|
||||
|
||||
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
|
||||
// Size of uncompressed data not fitting into any blocks.
|
||||
|
||||
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
|
||||
return bound;
|
||||
}
|
||||
@@ -231,49 +153,13 @@ size_t bshuf_compress_lz4_bound(const size_t size,
|
||||
int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
elem_size, block_size);
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
elem_size, block_size);
|
||||
}
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
size_t bshuf_compress_zstd_bound(const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
size_t bound, leftover;
|
||||
|
||||
if (block_size == 0) {
|
||||
block_size = bshuf_default_block_size(elem_size);
|
||||
}
|
||||
if (block_size % BSHUF_BLOCKED_MULT) return -81;
|
||||
|
||||
// Note that each block gets a 4 byte header.
|
||||
// Size of full blocks.
|
||||
bound = (ZSTD_compressBound(block_size * elem_size) + 4) * (size / block_size);
|
||||
// Size of partial blocks, if any.
|
||||
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
if (leftover) bound += ZSTD_compressBound(leftover * elem_size) + 4;
|
||||
// Size of uncompressed data not fitting into any blocks.
|
||||
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
|
||||
return bound;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size, const int comp_lvl) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_compress_zstd_block, in, out, size,
|
||||
elem_size, block_size, comp_lvl);
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
return bshuf_blocked_wrap_fun(&bshuf_decompress_zstd_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
}
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
+7
-89
@@ -35,10 +35,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ---- LZ4 Interface ----
|
||||
*/
|
||||
|
||||
/* ---- bshuf_compress_lz4_bound ----
|
||||
*
|
||||
* Bound on size of data compressed with *bshuf_compress_lz4*.
|
||||
@@ -98,6 +94,11 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
|
||||
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
|
||||
* must patch the parameters used to compress the data.
|
||||
*
|
||||
* NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
|
||||
* LZ4_decompress_fast from LZ4, which does not protect against maliciously
|
||||
* formed datasets. By modifying the compressed data, this function could be
|
||||
* coerced into leaving the boundaries of the input buffer.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer
|
||||
@@ -115,91 +116,8 @@ int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const s
|
||||
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
/*
|
||||
* ---- ZSTD Interface ----
|
||||
*/
|
||||
|
||||
#ifdef ZSTD_SUPPORT
|
||||
/* ---- bshuf_compress_zstd_bound ----
|
||||
*
|
||||
* Bound on size of data compressed with *bshuf_compress_zstd*.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* Bound on compressed data size.
|
||||
*
|
||||
*/
|
||||
size_t bshuf_compress_zstd_bound(const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
/* ---- bshuf_compress_zstd ----
|
||||
*
|
||||
* Bitshuffled and compress the data using zstd.
|
||||
*
|
||||
* Transpose within elements, in blocks of data of *block_size* elements then
|
||||
* compress the blocks using ZSTD. In the output buffer, each block is prefixed
|
||||
* by a 4 byte integer giving the compressed size of that block.
|
||||
*
|
||||
* Output buffer must be large enough to hold the compressed data. This could
|
||||
* be in principle substantially larger than the input buffer. Use the routine
|
||||
* *bshuf_compress_zstd_bound* to get an upper limit.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer, must be of size * elem_size bytes
|
||||
* out : output buffer, must be large enough to hold data.
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
* comp_lvl : compression level applied
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* number of bytes used in output buffer, negative error-code if failed.
|
||||
*
|
||||
*/
|
||||
int64_t bshuf_compress_zstd(const void* in, void* out, const size_t size, const size_t
|
||||
elem_size, size_t block_size, const int comp_lvl);
|
||||
|
||||
|
||||
/* ---- bshuf_decompress_zstd ----
|
||||
*
|
||||
* Undo compression and bitshuffling.
|
||||
*
|
||||
* Decompress data then un-bitshuffle it in blocks of *block_size* elements.
|
||||
*
|
||||
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
|
||||
* must patch the parameters used to compress the data.
|
||||
*
|
||||
* Parameters
|
||||
* ----------
|
||||
* in : input buffer
|
||||
* out : output buffer, must be of size * elem_size bytes
|
||||
* size : number of elements in input
|
||||
* elem_size : element size of typed data
|
||||
* block_size : Process in blocks of this many elements. Pass 0 to
|
||||
* select automatically (recommended).
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* number of bytes consumed in *input* buffer, negative error-code if failed.
|
||||
*
|
||||
*/
|
||||
int64_t bshuf_decompress_zstd(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
#endif // ZSTD_SUPPORT
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BITSHUFFLE_H
|
||||
#endif
|
||||
|
||||
+39
-742
@@ -16,55 +16,30 @@
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined (__AVX512BW__) && defined(__AVX2__) && defined(__SSE2__)
|
||||
#define USEAVX512
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__) && defined (__SSE2__)
|
||||
#define USEAVX2
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(NO_WARN_X86_INTRINSICS)
|
||||
#if defined(__SSE2__)
|
||||
#define USESSE2
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__) || (__ARM_NEON)
|
||||
#ifdef __aarch64__
|
||||
#define USEARMNEON
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Conditional includes for SSE2 and AVX2.
|
||||
|
||||
#ifdef USEAVX2
|
||||
#include <immintrin.h>
|
||||
#elif defined USESSE2
|
||||
#include <emmintrin.h>
|
||||
#elif defined USEARMNEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(_OPENMP) && defined(_MSC_VER)
|
||||
typedef int64_t omp_size_t;
|
||||
#else
|
||||
typedef size_t omp_size_t;
|
||||
#endif
|
||||
|
||||
// Macros.
|
||||
|
||||
#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
|
||||
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
|
||||
|
||||
|
||||
/* ---- Functions indicating compile time instruction set. ---- */
|
||||
|
||||
int bshuf_using_NEON(void) {
|
||||
#ifdef USEARMNEON
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int bshuf_using_SSE2(void) {
|
||||
#ifdef USESSE2
|
||||
return 1;
|
||||
@@ -83,14 +58,6 @@ int bshuf_using_AVX2(void) {
|
||||
}
|
||||
|
||||
|
||||
int bshuf_using_AVX512(void) {
|
||||
#ifdef USEAVX512
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---- Worker code not requiring special instruction sets. ----
|
||||
*
|
||||
* The following code does not use any x86 specific vectorized instructions
|
||||
@@ -164,8 +131,8 @@ int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t
|
||||
CHECK_MULT_EIGHT(start);
|
||||
|
||||
if (size > start) {
|
||||
// ii loop separated into 2 loops so the compiler can unroll
|
||||
// the inner one.
|
||||
|
||||
|
||||
for (ii = start; ii + 7 < size; ii += 8) {
|
||||
for (jj = 0; jj < elem_size; jj++) {
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
@@ -381,516 +348,10 @@ int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size
|
||||
}
|
||||
|
||||
|
||||
/* ---- Worker code that uses Arm NEON ----
|
||||
*
|
||||
* The following code makes use of the Arm NEON instruction set.
|
||||
* NEON technology is the implementation of the ARM Advanced Single
|
||||
* Instruction Multiple Data (SIMD) extension.
|
||||
* The NEON unit is the component of the processor that executes SIMD instructions.
|
||||
* It is also called the NEON Media Processing Engine (MPE).
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef USEARMNEON
|
||||
|
||||
/* Transpose bytes within elements for 16 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char *in_b = (const char*) in;
|
||||
char *out_b = (char*) out;
|
||||
int8x16_t a0, b0, a1, b1;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 2*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 2*ii + 1*16);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
|
||||
vst1q_s8(out_b + 0*size + ii, a0);
|
||||
vst1q_s8(out_b + 1*size + ii, b0);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 2,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements for 32 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char *in_b;
|
||||
char *out_b;
|
||||
in_b = (const char*) in;
|
||||
out_b = (char*) out;
|
||||
int8x16_t a0, b0, c0, d0, a1, b1, c1, d1;
|
||||
int64x2_t a2, b2, c2, d2;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 4*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 4*ii + 1*16);
|
||||
c0 = vld1q_s8(in_b + 4*ii + 2*16);
|
||||
d0 = vld1q_s8(in_b + 4*ii + 3*16);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
c1 = vzip1q_s8(c0, d0);
|
||||
d1 = vzip2q_s8(c0, d0);
|
||||
|
||||
a0 = vzip1q_s8(a1, b1);
|
||||
b0 = vzip2q_s8(a1, b1);
|
||||
c0 = vzip1q_s8(c1, d1);
|
||||
d0 = vzip2q_s8(c1, d1);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip2q_s8(a0, b0);
|
||||
c1 = vzip1q_s8(c0, d0);
|
||||
d1 = vzip2q_s8(c0, d0);
|
||||
|
||||
a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
|
||||
b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
|
||||
c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
|
||||
d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
|
||||
|
||||
vst1q_s64((int64_t *) (out_b + 0*size + ii), a2);
|
||||
vst1q_s64((int64_t *) (out_b + 1*size + ii), b2);
|
||||
vst1q_s64((int64_t *) (out_b + 2*size + ii), c2);
|
||||
vst1q_s64((int64_t *) (out_b + 3*size + ii), d2);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 4,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements for 64 bit elements. */
|
||||
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
|
||||
|
||||
size_t ii;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
|
||||
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
|
||||
|
||||
for (ii=0; ii + 15 < size; ii += 16) {
|
||||
a0 = vld1q_s8(in_b + 8*ii + 0*16);
|
||||
b0 = vld1q_s8(in_b + 8*ii + 1*16);
|
||||
c0 = vld1q_s8(in_b + 8*ii + 2*16);
|
||||
d0 = vld1q_s8(in_b + 8*ii + 3*16);
|
||||
e0 = vld1q_s8(in_b + 8*ii + 4*16);
|
||||
f0 = vld1q_s8(in_b + 8*ii + 5*16);
|
||||
g0 = vld1q_s8(in_b + 8*ii + 6*16);
|
||||
h0 = vld1q_s8(in_b + 8*ii + 7*16);
|
||||
|
||||
a1 = vzip1q_s8 (a0, b0);
|
||||
b1 = vzip2q_s8 (a0, b0);
|
||||
c1 = vzip1q_s8 (c0, d0);
|
||||
d1 = vzip2q_s8 (c0, d0);
|
||||
e1 = vzip1q_s8 (e0, f0);
|
||||
f1 = vzip2q_s8 (e0, f0);
|
||||
g1 = vzip1q_s8 (g0, h0);
|
||||
h1 = vzip2q_s8 (g0, h0);
|
||||
|
||||
a0 = vzip1q_s8 (a1, b1);
|
||||
b0 = vzip2q_s8 (a1, b1);
|
||||
c0 = vzip1q_s8 (c1, d1);
|
||||
d0 = vzip2q_s8 (c1, d1);
|
||||
e0 = vzip1q_s8 (e1, f1);
|
||||
f0 = vzip2q_s8 (e1, f1);
|
||||
g0 = vzip1q_s8 (g1, h1);
|
||||
h0 = vzip2q_s8 (g1, h1);
|
||||
|
||||
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
|
||||
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
|
||||
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
|
||||
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
|
||||
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
|
||||
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
|
||||
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
|
||||
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
|
||||
|
||||
a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
|
||||
b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
|
||||
c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
|
||||
d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
|
||||
e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
|
||||
f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
|
||||
g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
|
||||
h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
|
||||
|
||||
vst1q_s8(out_b + 0*size + ii, a0);
|
||||
vst1q_s8(out_b + 1*size + ii, b0);
|
||||
vst1q_s8(out_b + 2*size + ii, c0);
|
||||
vst1q_s8(out_b + 3*size + ii, d0);
|
||||
vst1q_s8(out_b + 4*size + ii, e0);
|
||||
vst1q_s8(out_b + 5*size + ii, f0);
|
||||
vst1q_s8(out_b + 6*size + ii, g0);
|
||||
vst1q_s8(out_b + 7*size + ii, h0);
|
||||
}
|
||||
|
||||
return bshuf_trans_byte_elem_remainder(in, out, size, 8,
|
||||
size - size % 16);
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bytes within elements using best NEON algorithm available. */
|
||||
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
// Trivial cases: power of 2 bytes.
|
||||
switch (elem_size) {
|
||||
case 1:
|
||||
count = bshuf_copy(in, out, size, elem_size);
|
||||
return count;
|
||||
case 2:
|
||||
count = bshuf_trans_byte_elem_NEON_16(in, out, size);
|
||||
return count;
|
||||
case 4:
|
||||
count = bshuf_trans_byte_elem_NEON_32(in, out, size);
|
||||
return count;
|
||||
case 8:
|
||||
count = bshuf_trans_byte_elem_NEON_64(in, out, size);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Worst case: odd number of bytes. Turns out that this is faster for
|
||||
// (odd * 2) byte elements as well (hence % 4).
|
||||
if (elem_size % 4) {
|
||||
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Multiple of power of 2: transpose hierarchically.
|
||||
{
|
||||
size_t nchunk_elem;
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
if ((elem_size % 8) == 0) {
|
||||
nchunk_elem = elem_size / 8;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
|
||||
count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
|
||||
} else if ((elem_size % 4) == 0) {
|
||||
nchunk_elem = elem_size / 4;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
|
||||
count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
|
||||
} else {
|
||||
// Not used since scalar algorithm is faster.
|
||||
nchunk_elem = elem_size / 2;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
|
||||
count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
|
||||
}
|
||||
|
||||
free(tmp_buf);
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Creates a mask made up of the most significant
|
||||
* bit of each byte of 'input'
|
||||
*/
|
||||
int32_t move_byte_mask_neon(uint8x16_t input) {
|
||||
|
||||
return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3)
|
||||
| (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7)
|
||||
| (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
|
||||
| (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
|
||||
);
|
||||
}
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, kk;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
uint16_t* out_ui16;
|
||||
|
||||
int64_t count;
|
||||
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
CHECK_MULT_EIGHT(nbyte);
|
||||
|
||||
int16x8_t xmm;
|
||||
int32_t bt;
|
||||
|
||||
for (ii = 0; ii + 15 < nbyte; ii += 16) {
|
||||
xmm = vld1q_s16((int16_t *) (in_b + ii));
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = move_byte_mask_neon((uint8x16_t) xmm);
|
||||
xmm = vshlq_n_s16(xmm, 1);
|
||||
out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_ui16 = bt;
|
||||
}
|
||||
}
|
||||
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
|
||||
nbyte - nbyte % 16);
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bits within elements. */
|
||||
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* For data organized into a row for each bit (8 * elem_size rows), transpose
|
||||
* the bytes. */
|
||||
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, jj;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
size_t nrows = 8 * elem_size;
|
||||
size_t nbyte_row = size / 8;
|
||||
|
||||
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
|
||||
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
|
||||
int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
|
||||
|
||||
for (ii = 0; ii + 7 < nrows; ii += 8) {
|
||||
for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
|
||||
a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj);
|
||||
b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj);
|
||||
c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj);
|
||||
d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj);
|
||||
e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj);
|
||||
f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj);
|
||||
g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj);
|
||||
h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj);
|
||||
|
||||
a1 = vzip1q_s8(a0, b0);
|
||||
b1 = vzip1q_s8(c0, d0);
|
||||
c1 = vzip1q_s8(e0, f0);
|
||||
d1 = vzip1q_s8(g0, h0);
|
||||
e1 = vzip2q_s8(a0, b0);
|
||||
f1 = vzip2q_s8(c0, d0);
|
||||
g1 = vzip2q_s8(e0, f0);
|
||||
h1 = vzip2q_s8(g0, h0);
|
||||
|
||||
a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
|
||||
b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
|
||||
c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
|
||||
d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
|
||||
e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
|
||||
f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
|
||||
g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
|
||||
h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
|
||||
|
||||
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
|
||||
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
|
||||
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
|
||||
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
|
||||
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
|
||||
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
|
||||
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
|
||||
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
|
||||
|
||||
as = (int64x1_t *) &a1;
|
||||
bs = (int64x1_t *) &b1;
|
||||
cs = (int64x1_t *) &c1;
|
||||
ds = (int64x1_t *) &d1;
|
||||
es = (int64x1_t *) &e1;
|
||||
fs = (int64x1_t *) &f1;
|
||||
gs = (int64x1_t *) &g1;
|
||||
hs = (int64x1_t *) &h1;
|
||||
|
||||
vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1));
|
||||
vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs);
|
||||
vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1));
|
||||
}
|
||||
for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
|
||||
out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
|
||||
out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
|
||||
}
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
|
||||
/* Shuffle bits within the bytes of eight element blocks. */
|
||||
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
uint16_t* out_ui16 = (uint16_t*) out;
|
||||
|
||||
size_t ii, jj, kk;
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
int16x8_t xmm;
|
||||
int32_t bt;
|
||||
|
||||
if (elem_size % 2) {
|
||||
bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
|
||||
} else {
|
||||
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
|
||||
ii += 8 * elem_size) {
|
||||
for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
|
||||
xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = move_byte_mask_neon((uint8x16_t) xmm);
|
||||
xmm = vshlq_n_s16(xmm, 1);
|
||||
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
|
||||
out_ui16[ind / 2] = bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
|
||||
/* Untranspose bits within elements. */
|
||||
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
#else // #ifdef USEARMNEON
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -13;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* ---- Worker code that uses SSE2 ----
|
||||
*
|
||||
* The following code makes use of the SSE2 instruction set and specialized
|
||||
* 16 byte registers. The SSE2 instructions are present on modern x86
|
||||
* 16 byte registers. The SSE2 instructions are present on modern x86
|
||||
* processors. The first Intel processor microarchitecture supporting SSE2 was
|
||||
* Pentium 4 (2000).
|
||||
*
|
||||
@@ -1051,7 +512,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
|
||||
int64_t count;
|
||||
|
||||
// Trivial cases: power of 2 bytes.
|
||||
|
||||
switch (elem_size) {
|
||||
case 1:
|
||||
count = bshuf_copy(in, out, size, elem_size);
|
||||
@@ -1067,14 +528,14 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
return count;
|
||||
}
|
||||
|
||||
// Worst case: odd number of bytes. Turns out that this is faster for
|
||||
// (odd * 2) byte elements as well (hence % 4).
|
||||
|
||||
|
||||
if (elem_size % 4) {
|
||||
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
|
||||
return count;
|
||||
}
|
||||
|
||||
// Multiple of power of 2: transpose hierarchically.
|
||||
|
||||
{
|
||||
size_t nchunk_elem;
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
@@ -1093,7 +554,7 @@ int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
|
||||
size * nchunk_elem);
|
||||
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
|
||||
} else {
|
||||
// Not used since scalar algorithm is faster.
|
||||
|
||||
nchunk_elem = elem_size / 2;
|
||||
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
|
||||
count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
|
||||
@@ -1226,8 +687,8 @@ int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size
|
||||
g1 = _mm_unpacklo_epi32(g0, h0);
|
||||
h1 = _mm_unpackhi_epi32(g0, h0);
|
||||
|
||||
// We don't have a storeh instruction for integers, so interpret
|
||||
// as a float. Have a storel (_mm_storel_epi64).
|
||||
|
||||
|
||||
as = (__m128 *) &a1;
|
||||
bs = (__m128 *) &b1;
|
||||
cs = (__m128 *) &c1;
|
||||
@@ -1276,8 +737,8 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
|
||||
|
||||
const char* in_b = (const char*) in;
|
||||
uint16_t* out_ui16 = (uint16_t*) out;
|
||||
|
||||
@@ -1327,7 +788,7 @@ int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
|
||||
return count;
|
||||
}
|
||||
|
||||
#else // #ifdef USESSE2
|
||||
#else
|
||||
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
|
||||
@@ -1381,7 +842,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
}
|
||||
|
||||
|
||||
#endif // #ifdef USESSE2
|
||||
#endif
|
||||
|
||||
|
||||
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
|
||||
@@ -1396,6 +857,7 @@ int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t
|
||||
*/
|
||||
|
||||
#ifdef USEAVX2
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
@@ -1552,8 +1014,8 @@ int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
|
||||
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
@@ -1603,7 +1065,7 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
|
||||
}
|
||||
|
||||
|
||||
#else // #ifdef USEAVX2
|
||||
#else
|
||||
|
||||
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
@@ -1634,179 +1096,19 @@ int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
|
||||
return -12;
|
||||
}
|
||||
|
||||
#endif // #ifdef USEAVX2
|
||||
|
||||
#ifdef USEAVX512
|
||||
|
||||
/* Transpose bits within bytes. */
|
||||
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
size_t ii, kk;
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
size_t nbyte = elem_size * size;
|
||||
int64_t count;
|
||||
|
||||
int64_t* out_i64;
|
||||
__m512i zmm;
|
||||
__mmask64 bt;
|
||||
if (nbyte >= 64) {
|
||||
const __m512i mask = _mm512_set1_epi8(0);
|
||||
|
||||
for (ii = 0; ii + 63 < nbyte; ii += 64) {
|
||||
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
|
||||
zmm = _mm512_slli_epi16(zmm, 1);
|
||||
out_i64 = (int64_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_i64 = (int64_t)bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__m256i ymm;
|
||||
int32_t bt32;
|
||||
int32_t* out_i32;
|
||||
size_t start = nbyte - nbyte % 64;
|
||||
for (ii = start; ii + 31 < nbyte; ii += 32) {
|
||||
ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt32 = _mm256_movemask_epi8(ymm);
|
||||
ymm = _mm256_slli_epi16(ymm, 1);
|
||||
out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
|
||||
*out_i32 = bt32;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
|
||||
nbyte - nbyte % 64 % 32);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Transpose bits within elements. */
|
||||
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bit_byte_AVX512(out, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
|
||||
return count;
|
||||
|
||||
}
|
||||
|
||||
/* Shuffle bits within the bytes of eight element blocks. */
|
||||
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
// With a bit of care, this could be written such that such that it is
|
||||
// in_buf = out_buf safe.
|
||||
const char* in_b = (const char*) in;
|
||||
char* out_b = (char*) out;
|
||||
|
||||
size_t ii, jj, kk;
|
||||
size_t nbyte = elem_size * size;
|
||||
|
||||
__m512i zmm;
|
||||
__mmask64 bt;
|
||||
|
||||
if (elem_size % 8) {
|
||||
return bshuf_shuffle_bit_eightelem_AVX(in, out, size, elem_size);
|
||||
} else {
|
||||
const __m512i mask = _mm512_set1_epi8(0);
|
||||
for (jj = 0; jj + 63 < 8 * elem_size; jj += 64) {
|
||||
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
|
||||
ii += 8 * elem_size) {
|
||||
zmm = _mm512_loadu_si512((__m512i *) &in_b[ii + jj]);
|
||||
for (kk = 0; kk < 8; kk++) {
|
||||
bt = _mm512_cmp_epi8_mask(zmm, mask, 1);
|
||||
zmm = _mm512_slli_epi16(zmm, 1);
|
||||
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
|
||||
* (int64_t *) &out_b[ind] = bt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return size * elem_size;
|
||||
}
|
||||
|
||||
/* Untranspose bits within elements. */
|
||||
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
|
||||
CHECK_MULT_EIGHT(size);
|
||||
|
||||
void* tmp_buf = malloc(size * elem_size);
|
||||
if (tmp_buf == NULL) return -1;
|
||||
|
||||
count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
|
||||
CHECK_ERR_FREE(count, tmp_buf);
|
||||
count = bshuf_shuffle_bit_eightelem_AVX512(tmp_buf, out, size, elem_size);
|
||||
|
||||
free(tmp_buf);
|
||||
return count;
|
||||
}
|
||||
|
||||
#else // #ifdef USEAVX512
|
||||
|
||||
int64_t bshuf_trans_bit_byte_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
return -14;
|
||||
}
|
||||
|
||||
int64_t bshuf_trans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
|
||||
}
|
||||
|
||||
int64_t bshuf_shuffle_bit_eightelem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
}
|
||||
|
||||
int64_t bshuf_untrans_bit_elem_AVX512(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
return -14;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* ---- Drivers selecting best instruction set at compile time. ---- */
|
||||
|
||||
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
#ifdef USEAVX512
|
||||
count = bshuf_trans_bit_elem_AVX512(in, out, size, elem_size);
|
||||
#elif defined USEAVX2
|
||||
#ifdef USEAVX2
|
||||
count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
|
||||
#elif defined(USESSE2)
|
||||
count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
|
||||
#elif defined(USEARMNEON)
|
||||
count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size);
|
||||
#else
|
||||
count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
|
||||
#endif
|
||||
@@ -1814,18 +1116,14 @@ int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
|
||||
}
|
||||
|
||||
|
||||
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size) {
|
||||
|
||||
int64_t count;
|
||||
#ifdef USEAVX512
|
||||
count = bshuf_untrans_bit_elem_AVX512(in, out, size, elem_size);
|
||||
#elif defined USEAVX2
|
||||
#ifdef USEAVX2
|
||||
count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
|
||||
#elif defined(USESSE2)
|
||||
count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
|
||||
#elif defined(USEARMNEON)
|
||||
count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size);
|
||||
#else
|
||||
count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
|
||||
#endif
|
||||
@@ -1838,9 +1136,9 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
/* Wrap a function for processing a single block to process an entire buffer in
|
||||
* parallel. */
|
||||
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
|
||||
const size_t size, const size_t elem_size, size_t block_size, const int option) {
|
||||
const size_t size, const size_t elem_size, size_t block_size) {
|
||||
|
||||
omp_size_t ii = 0;
|
||||
size_t ii;
|
||||
int64_t err = 0;
|
||||
int64_t count, cum_count=0;
|
||||
size_t last_block_size;
|
||||
@@ -1863,8 +1161,8 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
#pragma omp parallel for schedule(dynamic, 1) \
|
||||
private(count) reduction(+ : cum_count)
|
||||
#endif
|
||||
for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) {
|
||||
count = fun(&C, block_size, elem_size, option);
|
||||
for (ii = 0; ii < size / block_size; ii ++) {
|
||||
count = fun(&C, block_size, elem_size);
|
||||
if (count < 0) err = count;
|
||||
cum_count += count;
|
||||
}
|
||||
@@ -1872,7 +1170,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
last_block_size = size % block_size;
|
||||
last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
|
||||
if (last_block_size) {
|
||||
count = fun(&C, last_block_size, elem_size, option);
|
||||
count = fun(&C, last_block_size, elem_size);
|
||||
if (count < 0) err = count;
|
||||
cum_count += count;
|
||||
}
|
||||
@@ -1880,7 +1178,6 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
if (err < 0) return err;
|
||||
|
||||
leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
|
||||
//this_iter;
|
||||
last_in = (char *) ioc_get_in(&C, &this_iter);
|
||||
ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
|
||||
last_out = (char *) ioc_get_out(&C, &this_iter);
|
||||
@@ -1896,7 +1193,7 @@ int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
|
||||
/* Bitshuffle a single block. */
|
||||
int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
const size_t size, const size_t elem_size) {
|
||||
|
||||
size_t this_iter;
|
||||
const void *in;
|
||||
@@ -1904,7 +1201,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
int64_t count;
|
||||
|
||||
|
||||
|
||||
|
||||
in = ioc_get_in(C_ptr, &this_iter);
|
||||
ioc_set_next_in(C_ptr, &this_iter,
|
||||
(void*) ((char*) in + size * elem_size));
|
||||
@@ -1919,7 +1216,7 @@ int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
|
||||
|
||||
/* Bitunshuffle a single block. */
|
||||
int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
|
||||
const size_t size, const size_t elem_size, const int option) {
|
||||
const size_t size, const size_t elem_size) {
|
||||
|
||||
|
||||
size_t this_iter;
|
||||
@@ -1999,11 +1296,11 @@ uint32_t bshuf_read_uint32_BE(const void* buf) {
|
||||
*/
|
||||
|
||||
size_t bshuf_default_block_size(const size_t elem_size) {
|
||||
// This function needs to be absolutely stable between versions.
|
||||
// Otherwise encoded data will not be decodable.
|
||||
|
||||
|
||||
|
||||
size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
|
||||
// Ensure it is a required multiple.
|
||||
|
||||
block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
|
||||
return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
|
||||
}
|
||||
@@ -2013,7 +1310,7 @@ int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
elem_size, block_size);
|
||||
}
|
||||
|
||||
|
||||
@@ -2021,7 +1318,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size) {
|
||||
|
||||
return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
|
||||
elem_size, block_size, 0/*option*/);
|
||||
elem_size, block_size);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -18,8 +18,6 @@
|
||||
* -1 : Failed to allocate memory.
|
||||
* -11 : Missing SSE.
|
||||
* -12 : Missing AVX.
|
||||
* -13 : Missing Arm Neon.
|
||||
* -14 : Missing AVX512.
|
||||
* -80 : Input size not a multiple of 8.
|
||||
* -81 : block_size not multiple of 8.
|
||||
* -91 : Decompression error, wrong number of bytes processed.
|
||||
@@ -30,14 +28,15 @@
|
||||
#ifndef BITSHUFFLE_CORE_H
|
||||
#define BITSHUFFLE_CORE_H
|
||||
|
||||
// We assume GNU g++ defining `__cplusplus` has stdint.h
|
||||
|
||||
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
|
||||
#include <stdint.h>
|
||||
#else
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef signed short int16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef long long int64_t;
|
||||
#endif
|
||||
@@ -45,11 +44,11 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
// These are usually set in the setup.py.
|
||||
|
||||
#ifndef BSHUF_VERSION_MAJOR
|
||||
#define BSHUF_VERSION_MAJOR 0
|
||||
#define BSHUF_VERSION_MINOR 4
|
||||
#define BSHUF_VERSION_POINT 0
|
||||
#define BSHUF_VERSION_MINOR 3
|
||||
#define BSHUF_VERSION_POINT 4
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -68,18 +67,6 @@ extern "C" {
|
||||
int bshuf_using_SSE2(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_NEON ----
|
||||
*
|
||||
* Whether routines where compiled with the NEON instruction set.
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* 1 if using NEON, 0 otherwise.
|
||||
*
|
||||
*/
|
||||
int bshuf_using_NEON(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_AVX2 ----
|
||||
*
|
||||
* Whether routines where compiled with the AVX2 instruction set.
|
||||
@@ -92,18 +79,6 @@ int bshuf_using_NEON(void);
|
||||
int bshuf_using_AVX2(void);
|
||||
|
||||
|
||||
/* ---- bshuf_using_AVX512 ----
|
||||
*
|
||||
* Whether routines where compiled with the AVX512 instruction set.
|
||||
*
|
||||
* Returns
|
||||
* -------
|
||||
* 1 if using AVX512, 0 otherwise.
|
||||
*
|
||||
*/
|
||||
int bshuf_using_AVX512(void);
|
||||
|
||||
|
||||
/* ---- bshuf_default_block_size ----
|
||||
*
|
||||
* The default block size as function of element size.
|
||||
@@ -176,7 +151,7 @@ int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
|
||||
const size_t elem_size, size_t block_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BITSHUFFLE_CORE_H
|
||||
#endif
|
||||
|
||||
@@ -13,31 +13,19 @@
|
||||
#ifndef BITSHUFFLE_INTERNALS_H
|
||||
#define BITSHUFFLE_INTERNALS_H
|
||||
|
||||
// We assume GNU g++ defining `__cplusplus` has stdint.h
|
||||
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
|
||||
#include <stdint.h>
|
||||
#else
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef long long int64_t;
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "iochain.h"
|
||||
|
||||
|
||||
// Constants.
|
||||
#ifndef BSHUF_MIN_RECOMMEND_BLOCK
|
||||
#define BSHUF_MIN_RECOMMEND_BLOCK 128
|
||||
#define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this.
|
||||
#define BSHUF_BLOCKED_MULT 8
|
||||
#define BSHUF_TARGET_BLOCK_SIZE_B 8192
|
||||
#endif
|
||||
|
||||
|
||||
// Macros.
|
||||
|
||||
#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
|
||||
|
||||
|
||||
@@ -61,15 +49,15 @@ int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
|
||||
|
||||
/* Function definition for worker functions that process a single block. */
|
||||
typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
|
||||
const size_t size, const size_t elem_size, const int option);
|
||||
const size_t size, const size_t elem_size);
|
||||
|
||||
/* Wrap a function for processing a single block to process an entire buffer in
|
||||
* parallel. */
|
||||
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
|
||||
const size_t size, const size_t elem_size, size_t block_size, const int option);
|
||||
const size_t size, const size_t elem_size, size_t block_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BITSHUFFLE_INTERNALS_H
|
||||
#endif
|
||||
|
||||
@@ -1,260 +0,0 @@
|
||||
/*
|
||||
* Bitshuffle HDF5 filter
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
* Website: http://www.github.com/kiyo-masui/bitshuffle
|
||||
* Created: 2014
|
||||
*
|
||||
* See LICENSE file for details about copyright and rights to use.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "bitshuffle.h"
|
||||
#include "bshuf_h5filter.h"
|
||||
|
||||
|
||||
#define PUSH_ERR(func, minor, str) \
|
||||
H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
|
||||
|
||||
|
||||
// Prototypes from bitshuffle.c
|
||||
void bshuf_write_uint64_BE(void* buf, uint64_t num);
|
||||
uint64_t bshuf_read_uint64_BE(void* buf);
|
||||
void bshuf_write_uint32_BE(void* buf, uint32_t num);
|
||||
uint32_t bshuf_read_uint32_BE(const void* buf);
|
||||
|
||||
|
||||
// Only called on compression, not on reverse.
|
||||
herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
|
||||
|
||||
herr_t r;
|
||||
size_t ii;
|
||||
|
||||
unsigned int elem_size;
|
||||
|
||||
unsigned int flags;
|
||||
size_t nelements = 8;
|
||||
size_t nelem_max = 11;
|
||||
unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
|
||||
unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
|
||||
char msg[80];
|
||||
|
||||
r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
|
||||
tmp_values, 0, NULL, NULL);
|
||||
if(r<0) return -1;
|
||||
|
||||
// First 3 slots reserved. Move any passed options to higher addresses.
|
||||
for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
|
||||
values[ii + 3] = tmp_values[ii];
|
||||
}
|
||||
|
||||
nelements = 3 + nelements;
|
||||
|
||||
values[0] = BSHUF_VERSION_MAJOR;
|
||||
values[1] = BSHUF_VERSION_MINOR;
|
||||
|
||||
elem_size = H5Tget_size(type);
|
||||
if(elem_size <= 0) {
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
|
||||
"Invalid element size.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
values[2] = elem_size;
|
||||
|
||||
// Validate user supplied arguments.
|
||||
if (nelements > 3) {
|
||||
if (values[3] % 8 || values[3] < 0) {
|
||||
sprintf(msg, "Error in bitshuffle. Invalid block size: %d.",
|
||||
values[3]);
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (nelements > 4) {
|
||||
switch (values[4]) {
|
||||
case 0:
|
||||
break;
|
||||
case BSHUF_H5_COMPRESS_LZ4:
|
||||
break;
|
||||
#ifdef ZSTD_SUPPORT
|
||||
case BSHUF_H5_COMPRESS_ZSTD:
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
|
||||
"Invalid bitshuffle compression.");
|
||||
}
|
||||
}
|
||||
|
||||
r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
|
||||
if(r<0) return -1;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
|
||||
const unsigned int cd_values[], size_t nbytes,
|
||||
size_t *buf_size, void **buf) {
|
||||
|
||||
size_t size, elem_size;
|
||||
int err = -1;
|
||||
char msg[80];
|
||||
size_t block_size = 0;
|
||||
size_t buf_size_out, nbytes_uncomp, nbytes_out;
|
||||
char* in_buf = *buf;
|
||||
void *out_buf;
|
||||
|
||||
if (cd_nelmts < 3) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Not enough parameters.");
|
||||
return 0;
|
||||
}
|
||||
elem_size = cd_values[2];
|
||||
#ifdef ZSTD_SUPPORT
|
||||
const int comp_lvl = cd_values[5];
|
||||
#endif
|
||||
|
||||
// User specified block size.
|
||||
if (cd_nelmts > 3) block_size = cd_values[3];
|
||||
|
||||
if (block_size == 0) block_size = bshuf_default_block_size(elem_size);
|
||||
|
||||
#ifndef ZSTD_SUPPORT
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"ZSTD compression filter chosen but ZSTD support not installed.");
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Compression in addition to bitshuffle.
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// First eight bytes is the number of bytes in the output buffer,
|
||||
// little endian.
|
||||
nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
|
||||
// Override the block size with the one read from the header.
|
||||
block_size = bshuf_read_uint32_BE((const char*) in_buf + 8) / elem_size;
|
||||
// Skip over the header.
|
||||
in_buf += 12;
|
||||
buf_size_out = nbytes_uncomp;
|
||||
} else {
|
||||
nbytes_uncomp = nbytes;
|
||||
// Pick which compressions library to use
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size,
|
||||
elem_size, block_size) + 12;
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
buf_size_out = bshuf_compress_zstd_bound(nbytes_uncomp / elem_size,
|
||||
elem_size, block_size) + 12;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
nbytes_uncomp = nbytes;
|
||||
buf_size_out = nbytes;
|
||||
}
|
||||
|
||||
// TODO, remove this restriction by memcopying the extra.
|
||||
if (nbytes_uncomp % elem_size) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Non integer number of elements.");
|
||||
return 0;
|
||||
}
|
||||
size = nbytes_uncomp / elem_size;
|
||||
|
||||
out_buf = malloc(buf_size_out);
|
||||
if (out_buf == NULL) {
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
|
||||
"Could not allocate output buffer.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (cd_nelmts > 4 && (cd_values[4] == BSHUF_H5_COMPRESS_LZ4 || cd_values[4] == BSHUF_H5_COMPRESS_ZSTD)) {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// Bit unshuffle/decompress.
|
||||
// Pick which compressions library to use
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size, block_size);
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
err = bshuf_decompress_zstd(in_buf, out_buf, size, elem_size, block_size);
|
||||
}
|
||||
#endif
|
||||
nbytes_out = nbytes_uncomp;
|
||||
} else {
|
||||
// Bit shuffle/compress.
|
||||
// Write the header, described in
|
||||
// http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
|
||||
// Technically we should be using signed integers instead of
|
||||
// unsigned ones, however for valid inputs (positive numbers) these
|
||||
// have the same representation.
|
||||
bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
|
||||
bshuf_write_uint32_BE((char*) out_buf + 8, block_size * elem_size);
|
||||
if(cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
|
||||
err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
|
||||
elem_size, block_size);
|
||||
}
|
||||
#ifdef ZSTD_SUPPORT
|
||||
else if (cd_values[4] == BSHUF_H5_COMPRESS_ZSTD) {
|
||||
err = bshuf_compress_zstd(in_buf, (char*) out_buf + 12, size,
|
||||
elem_size, block_size, comp_lvl);
|
||||
}
|
||||
#endif
|
||||
nbytes_out = err + 12;
|
||||
}
|
||||
} else {
|
||||
if (flags & H5Z_FLAG_REVERSE) {
|
||||
// Bit unshuffle.
|
||||
err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
|
||||
block_size); } else {
|
||||
// Bit shuffle.
|
||||
err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
|
||||
block_size); } nbytes_out = nbytes; }
|
||||
//printf("nb_in %d, nb_uncomp %d, nb_out %d, buf_out %d, block %d\n",
|
||||
//nbytes, nbytes_uncomp, nbytes_out, buf_size_out, block_size);
|
||||
|
||||
if (err < 0) {
|
||||
sprintf(msg, "Error in bitshuffle with error code %d.", err);
|
||||
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
|
||||
free(out_buf);
|
||||
return 0;
|
||||
} else {
|
||||
free(*buf);
|
||||
*buf = out_buf;
|
||||
*buf_size = buf_size_out;
|
||||
|
||||
return nbytes_out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
H5Z_class_t bshuf_H5Filter[1] = {{
|
||||
H5Z_CLASS_T_VERS,
|
||||
(H5Z_filter_t)(BSHUF_H5FILTER),
|
||||
1, 1,
|
||||
"bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
|
||||
NULL,
|
||||
(H5Z_set_local_func_t)(bshuf_h5_set_local),
|
||||
(H5Z_func_t)(bshuf_h5_filter)
|
||||
}};
|
||||
|
||||
|
||||
int bshuf_register_h5filter(void){
|
||||
|
||||
int retval;
|
||||
|
||||
retval = H5Zregister(bshuf_H5Filter);
|
||||
if(retval<0){
|
||||
PUSH_ERR("bshuf_register_h5filter",
|
||||
H5E_CANTREGISTER, "Can't register bitshuffle filter");
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
@@ -1,67 +0,0 @@
|
||||
/*
|
||||
* Bitshuffle HDF5 filter
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
* Website: http://www.github.com/kiyo-masui/bitshuffle
|
||||
* Created: 2014
|
||||
*
|
||||
* See LICENSE file for details about copyright and rights to use.
|
||||
*
|
||||
*
|
||||
* Header File
|
||||
*
|
||||
* Filter Options
|
||||
* --------------
|
||||
* block_size (option slot 0) : integer (optional)
|
||||
* What block size to use (in elements not bytes). Default is 0,
|
||||
* for which bitshuffle will pick a block size with a target of 8kb.
|
||||
* Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
|
||||
* Whether to apply LZ4 compression to the data after bitshuffling.
|
||||
* This is much faster than applying compression as a second filter
|
||||
* because it is done when the small block of data is already in the
|
||||
* L1 cache.
|
||||
*
|
||||
* For LZ4 compression, the compressed format of the data is the same as
|
||||
* for the normal LZ4 filter described in
|
||||
* http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BSHUF_H5FILTER_H
|
||||
#define BSHUF_H5FILTER_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define H5Z_class_t_vers 2
|
||||
#include "hdf5.h"
|
||||
|
||||
|
||||
#define BSHUF_H5FILTER 32008
|
||||
|
||||
|
||||
#define BSHUF_H5_COMPRESS_LZ4 2
|
||||
#define BSHUF_H5_COMPRESS_ZSTD 3
|
||||
|
||||
|
||||
extern H5Z_class_t bshuf_H5Filter[1];
|
||||
|
||||
|
||||
/* ---- bshuf_register_h5filter ----
|
||||
*
|
||||
* Register the bitshuffle HDF5 filter within the HDF5 library.
|
||||
*
|
||||
* Call this before using the bitshuffle HDF5 filter from C unless
|
||||
* using dynamically loaded filters.
|
||||
*
|
||||
*/
|
||||
int bshuf_register_h5filter(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // BSHUF_H5FILTER_H
|
||||
+4
-4
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* IOchain - Distribute a chain of dependent IO events among threads.
|
||||
* IOchain - Distribute a chain of dependant IO events amoung threads.
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
@@ -81,9 +81,9 @@ void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
|
||||
C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
|
||||
#ifdef _OPENMP
|
||||
omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
|
||||
// *in_pl[this_iter]* lock released at the end of the iteration to avoid being
|
||||
// overtaken by previous threads and having *out_pl[this_iter]* corrupted.
|
||||
// Especially worried about thread 0, iteration 0.
|
||||
|
||||
|
||||
|
||||
omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
|
||||
#endif
|
||||
}
|
||||
|
||||
+3
-3
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* IOchain - Distribute a chain of dependent IO events among threads.
|
||||
* IOchain - Distribute a chain of dependant IO events amoung threads.
|
||||
*
|
||||
* This file is part of Bitshuffle
|
||||
* Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
|
||||
@@ -25,7 +25,7 @@
|
||||
* Usage
|
||||
* -----
|
||||
* - Call `ioc_init` in serial block.
|
||||
* - Each thread should create a local variable *size_t this_iter* and
|
||||
* - Each thread should create a local variable *size_t this_iter* and
|
||||
* pass its address to all function calls. Its value will be set
|
||||
* inside the functions and is used to identify the thread.
|
||||
* - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
|
||||
@@ -90,5 +90,5 @@ void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
|
||||
void * ioc_get_out(ioc_chain *C, size_t *this_iter);
|
||||
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
|
||||
|
||||
#endif // IOCHAIN_H
|
||||
#endif
|
||||
|
||||
|
||||
+917
-1897
File diff suppressed because it is too large
Load Diff
+250
-664
File diff suppressed because it is too large
Load Diff
+46
-67
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
|
||||
#include <hdf5.h>
|
||||
#include <hdf5_hl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@@ -103,13 +104,7 @@ int get_nxs_dataset_dims(struct ds_desc_t *desc) {
|
||||
ERROR_JUMP(-1, close_space, "Error getting dataset dimensions");
|
||||
}
|
||||
|
||||
if ( H5Tequal(t_id,H5T_NATIVE_CHAR)>0 || H5Tequal(t_id,H5T_NATIVE_INT)>0 || H5Tequal(t_id,H5T_NATIVE_SHORT)>0 || H5Tequal(t_id,H5T_NATIVE_LONG)>0 || H5Tequal(t_id,H5T_NATIVE_LLONG)>0 ) {
|
||||
// signed
|
||||
desc->data_width = -width;
|
||||
} else {
|
||||
// unsigned
|
||||
desc->data_width = width;
|
||||
}
|
||||
desc->data_width = width;
|
||||
|
||||
close_space:
|
||||
H5Sclose(s_id);
|
||||
@@ -227,7 +222,7 @@ int get_frame_from_chunk(const struct ds_desc_t *desc, const char *ds_name,
|
||||
c_buffer = buffer;
|
||||
}
|
||||
|
||||
if (H5Dread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) <
|
||||
if (H5DOread_chunk(d_id, H5P_DEFAULT, c_offset, &c_filter_mask, c_buffer) <
|
||||
0) {
|
||||
char message[128];
|
||||
sprintf(message,
|
||||
@@ -238,7 +233,7 @@ int get_frame_from_chunk(const struct ds_desc_t *desc, const char *ds_name,
|
||||
|
||||
if (o_eiger_desc->bs_applied) {
|
||||
if (bslz4_decompress(o_eiger_desc->bs_params, c_bytes, c_buffer,
|
||||
abs(desc->data_width) * frame_size[1] * frame_size[2],
|
||||
desc->data_width * frame_size[1] * frame_size[2],
|
||||
buffer) < 0) {
|
||||
char message[128];
|
||||
sprintf(message,
|
||||
@@ -256,11 +251,10 @@ done:
|
||||
return retval;
|
||||
}
|
||||
|
||||
int get_nxs_frame(const struct ds_desc_t *desc, const int nin, void *buffer) {
|
||||
int get_nxs_frame(const struct ds_desc_t *desc, const int n, void *buffer) {
|
||||
/* detector data are the two inner most indices */
|
||||
/* TODO: handle ndims > 3 and select appropriately */
|
||||
int retval = 0;
|
||||
int n = nin - desc->image_number_offset;
|
||||
hsize_t frame_idx[3] = {n, 0, 0};
|
||||
hsize_t frame_size[3] = {1, desc->dims[1], desc->dims[2]};
|
||||
if (n < 0 || n >= desc->dims[0]) {
|
||||
@@ -277,10 +271,9 @@ done:
|
||||
return retval;
|
||||
}
|
||||
|
||||
int get_dectris_eiger_frame(const struct ds_desc_t *desc, int nin, void *buffer) {
|
||||
int get_dectris_eiger_frame(const struct ds_desc_t *desc, int n, void *buffer) {
|
||||
|
||||
int retval = 0;
|
||||
int n = nin - desc->image_number_offset;
|
||||
int block, frame_count, idx;
|
||||
struct eiger_ds_desc_t *eiger_desc = (struct eiger_ds_desc_t *)desc;
|
||||
char data_name[16] = {0};
|
||||
@@ -356,10 +349,6 @@ int get_dectris_eiger_dataset_dims(struct ds_desc_t *desc) {
|
||||
if (data_width <= 0) {
|
||||
ERROR_JUMP(-1, close_space, "Unable to get type size");
|
||||
}
|
||||
if ( H5Tequal(t_id,H5T_NATIVE_CHAR)>0 || H5Tequal(t_id,H5T_NATIVE_INT)>0 || H5Tequal(t_id,H5T_NATIVE_SHORT)>0 || H5Tequal(t_id,H5T_NATIVE_LONG)>0 || H5Tequal(t_id,H5T_NATIVE_LLONG)>0 ) {
|
||||
// signed
|
||||
data_width = -data_width;
|
||||
}
|
||||
|
||||
ndims = H5Sget_simple_extent_ndims(s_id);
|
||||
if (ndims != 3) {
|
||||
@@ -572,37 +561,9 @@ int get_dectris_eiger_pixel_mask(const struct ds_desc_t *desc, int *buffer) {
|
||||
ERROR_JUMP(-1, done, "Error opening detectorSpecific/pixel_mask");
|
||||
}
|
||||
|
||||
// what if this is compressed?
|
||||
hid_t dcpl = H5Dget_create_plist(ds_id);
|
||||
int n_filters = H5Pget_nfilters(dcpl);
|
||||
H5Z_filter_t filter_id;
|
||||
if (n_filters>0) {
|
||||
unsigned int flags;
|
||||
size_t nelmts = 1;
|
||||
unsigned int values_out[1] = {99};
|
||||
char filter_name[80];
|
||||
for ( int i_filt = 0; i_filt < n_filters; i_filt++) {
|
||||
filter_id = H5Pget_filter(dcpl, i_filt, &flags, &nelmts, values_out, sizeof(filter_name), filter_name, NULL);
|
||||
if (filter_id>=0) {
|
||||
fprintf(stderr," filter #%d name =\"%s\"\n",(i_filt+1),filter_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int i0 = H5Zfilter_avail(BS_H5_FILTER_ID);
|
||||
|
||||
err = H5Dread(ds_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, buffer);
|
||||
if (err < 0) {
|
||||
if (n_filters>0) {
|
||||
ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask with filter(s)");
|
||||
}
|
||||
else {
|
||||
ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask");
|
||||
}
|
||||
}
|
||||
|
||||
if (!i0 && H5Zfilter_avail(BS_H5_FILTER_ID)) {
|
||||
fprintf(stderr," bitshuffle filter is available now since H5Dread (of pixel-mask) triggered loading of the filter.\n");
|
||||
ERROR_JUMP(-1, close_dataset, "Error reading detectorSpecific/pixel_mask");
|
||||
}
|
||||
|
||||
close_dataset:
|
||||
@@ -633,9 +594,9 @@ herr_t det_visit_callback(hid_t root_id, const char *name,
|
||||
|
||||
/* check for an "NX_class" attribute */
|
||||
{
|
||||
char* buffer = (char*)malloc(1);
|
||||
buffer[0] = '\0';
|
||||
hid_t a_id, t_id;
|
||||
int str_size = 0;
|
||||
void *buffer = NULL;
|
||||
hid_t a_id, t_id, mt_id;
|
||||
if (H5Aexists(g_id, "NX_class") <= 0) {
|
||||
/* not an error - just close group and allow continuation */
|
||||
retval = 0;
|
||||
@@ -655,31 +616,39 @@ herr_t det_visit_callback(hid_t root_id, const char *name,
|
||||
if (t_id < 0) {
|
||||
ERROR_JUMP(-1, close_attr, "Error getting datatype");
|
||||
}
|
||||
|
||||
H5A_info_t a_info;
|
||||
herr_t err = H5Aget_info(a_id, &a_info);
|
||||
if (err<0) {
|
||||
ERROR_JUMP(-1, close_type,
|
||||
"Unable to get attribute info for NX_class");
|
||||
if (H5Tis_variable_str(t_id) > 0) {
|
||||
str_size = -1;
|
||||
buffer = malloc(sizeof(char *));
|
||||
} else {
|
||||
str_size = H5Tget_size(t_id);
|
||||
buffer = malloc(str_size + 1);
|
||||
}
|
||||
else {
|
||||
if (a_info.cset != H5T_CSET_ASCII && a_info.cset != H5T_CSET_UTF8) {
|
||||
fprintf(stderr," %s : NX_class attribute info cset = unknown with size %d\n",name,(int) a_info.data_size);
|
||||
}
|
||||
}
|
||||
|
||||
buffer = (char *)malloc(sizeof(char)*(H5Tget_size(t_id)+1));
|
||||
if (!buffer) {
|
||||
ERROR_JUMP(-1, close_type, "Error allocating string buffer");
|
||||
}
|
||||
|
||||
if (H5Aread(a_id, t_id, buffer) < 0) {
|
||||
mt_id = H5Tcopy(H5T_C_S1);
|
||||
if (mt_id < 0) {
|
||||
ERROR_JUMP(-1, free_buffer, "Error creating HDF5 String datatype");
|
||||
}
|
||||
// set the target string type to be one longer than the recorded string
|
||||
// in keeping with the malloc'd buffer - if this is already a null
|
||||
// terminated string then we will just have two nulls, if it is not
|
||||
// then we won't clobber the last char in the buffer with a null in the
|
||||
// H5Aread call
|
||||
if (H5Tset_size(mt_id, str_size == -1 ? H5T_VARIABLE : str_size + 1) < 0) {
|
||||
char message[64];
|
||||
sprintf(message, "Error setting string datatype to size %d", str_size);
|
||||
ERROR_JUMP(-1, close_mtype, message);
|
||||
}
|
||||
|
||||
if (H5Aread(a_id, mt_id, buffer) < 0) {
|
||||
char message[256];
|
||||
sprintf(
|
||||
message,
|
||||
"H5OVisit callback: Error reading NX_class attribute on group %.128s",
|
||||
name);
|
||||
ERROR_JUMP(-1, free_buffer, message);
|
||||
ERROR_JUMP(-1, close_mtype, message);
|
||||
}
|
||||
|
||||
/* ensure the buffer is null terminated */
|
||||
@@ -687,16 +656,26 @@ herr_t det_visit_callback(hid_t root_id, const char *name,
|
||||
|
||||
/* test for NXdata or NXdetector */
|
||||
{
|
||||
if (strcmp("NXdata", buffer) == 0) {
|
||||
char *nxclass = str_size > 0 ? (char *)buffer : *((char **)buffer);
|
||||
if (strcmp("NXdata", nxclass) == 0) {
|
||||
hid_t out_id = H5Gopen(root_id, name, H5P_DEFAULT);
|
||||
output_data->nxdata = out_id;
|
||||
}
|
||||
else if (strcmp("NXdetector", buffer) == 0) {
|
||||
} else if (strcmp("NXdetector", nxclass) == 0) {
|
||||
hid_t out_id = H5Gopen(root_id, name, H5P_DEFAULT);
|
||||
output_data->nxdetector = out_id;
|
||||
}
|
||||
}
|
||||
|
||||
if (str_size == -1) {
|
||||
hsize_t dims[1] = {1};
|
||||
hid_t s_id = H5Screate_simple(1, dims, NULL);
|
||||
H5Sselect_all(s_id);
|
||||
H5Dvlen_reclaim(mt_id, s_id, H5P_DEFAULT, buffer);
|
||||
H5Sclose(s_id);
|
||||
}
|
||||
|
||||
close_mtype:
|
||||
H5Tclose(mt_id);
|
||||
free_buffer:
|
||||
free(buffer);
|
||||
close_type:
|
||||
|
||||
@@ -15,12 +15,10 @@ struct ds_desc_t {
|
||||
hid_t data_g_id;
|
||||
hsize_t dims[3];
|
||||
int data_width;
|
||||
int image_number_offset;
|
||||
int (*get_pixel_properties)(const struct ds_desc_t *, double *, double *);
|
||||
int (*get_pixel_mask)(const struct ds_desc_t *, int *);
|
||||
int (*get_data_frame)(const struct ds_desc_t *, const int, void *);
|
||||
void (*free_desc)(struct ds_desc_t *);
|
||||
int i2i[]; // array to hold a translation from the image number requested by XDS and the actual position in the HDF5 file
|
||||
};
|
||||
|
||||
struct nxs_ds_desc_t {
|
||||
|
||||
+39
-257
@@ -5,16 +5,11 @@
|
||||
|
||||
#include <hdf5.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "file.h"
|
||||
#include "filters.h"
|
||||
#include "plugin.h"
|
||||
|
||||
#ifdef USE_BITSHUFFLE
|
||||
#include "bshuf_h5filter.h"
|
||||
#endif
|
||||
|
||||
/* XDS does not provide an error callback facility, so just write to stderr
|
||||
for now - generally regarded as poor practice */
|
||||
#define ERROR_OUTPUT stderr
|
||||
@@ -22,41 +17,24 @@
|
||||
/* mask bits loosely based on what Neggia does and what NeXus says should be
|
||||
done basically - anything in the low byte (& 0xFF) means "ignore this"
|
||||
Neggia uses the value -2 if bit 1, 2 or 3 are set */
|
||||
/* CV-GPhL-20210408: we want more control over the value non-masked
|
||||
pixels should be set to. */
|
||||
#define COPY_AND_MASK(in, value, setValue, out, size, mask) \
|
||||
#define COPY_AND_MASK(in, out, size, mask) \
|
||||
{ \
|
||||
int i; \
|
||||
if (value!=0) { \
|
||||
if (mask) { \
|
||||
for (i = 0; i < size; ++i) { \
|
||||
out[i] = (in[i] == value) ? setValue : in[i]; \
|
||||
if (mask[i] & 0xFF) \
|
||||
out[i] = -1; \
|
||||
if (mask[i] & 30) \
|
||||
out[i] = -2; \
|
||||
} \
|
||||
} else { \
|
||||
for (i = 0; i < size; i++) { \
|
||||
out[i] = (in[i] == value) ? setValue : in[i]; \
|
||||
} \
|
||||
if (mask) { \
|
||||
for (i = 0; i < size; ++i) { \
|
||||
out[i] = in[i]; \
|
||||
if (mask[i] & 0xFF) \
|
||||
out[i] = -1; \
|
||||
if (mask[i] & 30) \
|
||||
out[i] = -2; \
|
||||
} \
|
||||
} else { \
|
||||
if (mask) { \
|
||||
for (i = 0; i < size; ++i) { \
|
||||
out[i] = in[i]; \
|
||||
if (mask[i] & 0xFF) \
|
||||
out[i] = -1; \
|
||||
if (mask[i] & 30) \
|
||||
out[i] = -2; \
|
||||
} \
|
||||
} else { \
|
||||
for (i = 0; i < size; i++) { \
|
||||
out[i] = in[i]; \
|
||||
} \
|
||||
for (i = 0; i < size; i++) { \
|
||||
out[i] = in[i]; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define APPLY_MASK(buffer, mask, size) \
|
||||
{ \
|
||||
int i; \
|
||||
@@ -74,200 +52,42 @@ static hid_t file_id = 0;
|
||||
static struct ds_desc_t *data_desc = NULL;
|
||||
static int *mask_buffer = NULL;
|
||||
|
||||
// CV-20240605: potentially provide a mapping from frame number (as
|
||||
// requested by caller) to actual 2D slice within 3D data
|
||||
// array.
|
||||
//
|
||||
// This is defined by the environment variable
|
||||
// DURIN_IMAGE2ORDINAL (see below).
|
||||
int *image2ordinal = NULL;
|
||||
int image2ordinal_debug = 0;
|
||||
int image2ordinal_imin = 0;
|
||||
int image2ordinal_imax = 0;
|
||||
|
||||
void fill_info_array(int info[1024]) {
|
||||
info[0] = DLS_CUSTOMER_ID;
|
||||
info[1] = VERSION_MAJOR;
|
||||
info[2] = VERSION_MINOR;
|
||||
info[3] = VERSION_PATCH;
|
||||
info[4] = VERSION_TIMESTAMP;
|
||||
info[5] = 0; // image number offset
|
||||
info[6] = -1; // marked pixels not already in pixel_mask: reset to this value
|
||||
|
||||
char *cenv;
|
||||
cenv = getenv("DURIN_IMAGE_NUMBER_OFFSET");
|
||||
if (cenv!=NULL) {
|
||||
info[5] = atoi(cenv);
|
||||
}
|
||||
cenv = getenv("DURIN_RESET_UNMASKED_PIXEL");
|
||||
if (cenv!=NULL) {
|
||||
info[6] = atoi(cenv);
|
||||
}
|
||||
|
||||
cenv = getenv("DURIN_IMAGE2ORDINAL");
|
||||
if (cenv!=NULL&&(!image2ordinal)) {
|
||||
|
||||
char *denv = getenv("DURIN_IMAGE2ORDINAL_DEBUG");
|
||||
if (denv!=NULL) {
|
||||
image2ordinal_debug=1;
|
||||
}
|
||||
|
||||
// <ordinal_start>,<image_1_start>-<image_1_end>,<image_2_start>-<image_2_end>,..,<image_N_start>-<image_N_end>
|
||||
if (image2ordinal_debug) printf("DURIN_IMAGE2ORDINAL = \"%s\"\n",cenv);
|
||||
|
||||
const char outer_delimiters[] = ",";
|
||||
const char inner_delimiters[] = "-";
|
||||
|
||||
char *found;
|
||||
|
||||
char *outer_saveptr;
|
||||
char *inner_saveptr;
|
||||
|
||||
int ordinal_start = 0;
|
||||
int ordinal = 0;
|
||||
int ntt = -1;
|
||||
found = strtok_r(cenv,outer_delimiters, &outer_saveptr);
|
||||
if (found!=NULL) {
|
||||
int tt[1000][2];
|
||||
while(found) {
|
||||
if (ordinal_start==0) {
|
||||
ordinal_start = atoi(found);
|
||||
ordinal = ordinal_start - 1;
|
||||
}
|
||||
else {
|
||||
char* s = strtok_r(found, inner_delimiters, &inner_saveptr);
|
||||
if (s!=NULL) {
|
||||
int i1 = atoi(s);
|
||||
s = strtok_r(NULL,inner_delimiters, &inner_saveptr);
|
||||
if (s!=NULL) {
|
||||
int i2 = atoi(s);
|
||||
ntt++;
|
||||
if (ntt<=1000) {
|
||||
tt[ntt][0] = i1;
|
||||
tt[ntt][1] = i2;
|
||||
for(int i=i1; i<=i2; ++i) {
|
||||
ordinal++;
|
||||
if (ordinal==1) {
|
||||
image2ordinal_imin=i;
|
||||
image2ordinal_imax=i;
|
||||
}
|
||||
else {
|
||||
if (i<image2ordinal_imin) image2ordinal_imin=i;
|
||||
if (i>image2ordinal_imax) image2ordinal_imax=i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
found = strtok_r(NULL,outer_delimiters,&outer_saveptr);
|
||||
}
|
||||
|
||||
if (ordinal_start>0) {
|
||||
if (image2ordinal_debug) {
|
||||
printf("ordinal_start, end = %d %d\n",ordinal_start, ordinal);
|
||||
printf("imin, imax = %d %d\n",image2ordinal_imin,image2ordinal_imax);
|
||||
}
|
||||
|
||||
// allocate array to go from image number/id to ordinal:
|
||||
image2ordinal = malloc((image2ordinal_imax-image2ordinal_imin+1) * sizeof(image2ordinal_imin));
|
||||
int ordinal = ordinal_start - 1;
|
||||
for(int i=0; i<=ntt; i++) {
|
||||
for(int j=tt[i][0];j<=tt[i][1];j++) {
|
||||
ordinal++;
|
||||
//printf(" %d -> %d\n",ordinal,j);
|
||||
image2ordinal[j] = ordinal;
|
||||
}
|
||||
}
|
||||
if (image2ordinal&&image2ordinal_debug) {
|
||||
for(int i=image2ordinal_imin; i<=image2ordinal_imax; i++) {
|
||||
if (image2ordinal[i]>0) {
|
||||
printf(" %d -> %d\n",i,image2ordinal[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int convert_to_int_and_mask(void *in_buffer, int width, int setValue, int *out_buffer,
|
||||
int convert_to_int_and_mask(void *in_buffer, int d_width, int *out_buffer,
|
||||
int length, int *mask) {
|
||||
/* transfer data to output buffer, performing data conversion as required */
|
||||
int retval = 0;
|
||||
/* TODO: decide how conversion of data should work */
|
||||
/* Should we sign extend? Neggia doesn't (casts from uint*), but may be more
|
||||
* intuitive */
|
||||
|
||||
int d_width = abs(width);
|
||||
|
||||
// CV-20210407
|
||||
// Dealing with a signed data array: no extra check for marker
|
||||
// value needed (since data can already take advantage of the
|
||||
// negative data range). It is unclear though why/when data would
|
||||
// come in as a signed array in the first place ...
|
||||
if (width<0) {
|
||||
if (d_width == sizeof(signed char)) {
|
||||
// 8-bit
|
||||
signed char *in = in_buffer;
|
||||
COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(short)) {
|
||||
// 16-bit
|
||||
short *in = in_buffer;
|
||||
COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(int)) {
|
||||
// 16-bit
|
||||
int *in = in_buffer;
|
||||
COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(long int)) {
|
||||
// 32-bit
|
||||
long int *in = in_buffer;
|
||||
COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(long long int)) {
|
||||
// 64-bit
|
||||
long long int *in = in_buffer;
|
||||
COPY_AND_MASK(in, 0, setValue, out_buffer, length, mask);
|
||||
} else {
|
||||
char message[128];
|
||||
sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
|
||||
d_width, sizeof(int));
|
||||
ERROR_JUMP(-1, done, message);
|
||||
}
|
||||
if (d_width == sizeof(signed char)) {
|
||||
signed char *in = in_buffer;
|
||||
COPY_AND_MASK(in, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(short)) {
|
||||
short *in = in_buffer;
|
||||
COPY_AND_MASK(in, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(int)) {
|
||||
int *in = in_buffer;
|
||||
COPY_AND_MASK(in, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(long int)) {
|
||||
long int *in = in_buffer;
|
||||
COPY_AND_MASK(in, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(long long int)) {
|
||||
long long int *in = in_buffer;
|
||||
COPY_AND_MASK(in, out_buffer, length, mask);
|
||||
} else {
|
||||
char message[128];
|
||||
sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
|
||||
d_width, sizeof(int));
|
||||
ERROR_JUMP(-1, done, message);
|
||||
}
|
||||
// CV-20210407
|
||||
// Dealing with an unsigned data array: extra check for marker
|
||||
// value required (to handle overloaded pixels correctly if wanted
|
||||
// - see also DURIN_RESET_UNMASKED_PIXEL environment variable).
|
||||
else {
|
||||
if (d_width == sizeof(unsigned char)) {
|
||||
// 8-bit
|
||||
unsigned char *in = in_buffer;
|
||||
COPY_AND_MASK(in, UINT8_MAX, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(unsigned short)) {
|
||||
// 16-bit
|
||||
unsigned short *in = in_buffer;
|
||||
COPY_AND_MASK(in, UINT16_MAX, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(unsigned int)) {
|
||||
// 16-bit
|
||||
unsigned int *in = in_buffer;
|
||||
COPY_AND_MASK(in, UINT16_MAX, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(unsigned long)) {
|
||||
// 32-bit
|
||||
unsigned long *in = in_buffer;
|
||||
COPY_AND_MASK(in, UINT32_MAX, setValue, out_buffer, length, mask);
|
||||
} else if (d_width == sizeof(unsigned long long)) {
|
||||
// 64-bit
|
||||
unsigned long long *in = in_buffer;
|
||||
COPY_AND_MASK(in, UINT32_MAX, setValue, out_buffer, length, mask);
|
||||
} else {
|
||||
char message[128];
|
||||
sprintf(message, "Unsupported conversion of data width %d to %ld (int)",
|
||||
d_width, sizeof(int));
|
||||
ERROR_JUMP(-1, done, message);
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
return retval;
|
||||
}
|
||||
@@ -290,12 +110,6 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
|
||||
ERROR_JUMP(-2, done, "Failed to configure HDF5 error handling");
|
||||
}
|
||||
|
||||
#ifdef USE_BITSHUFFLE
|
||||
if (bshuf_register_h5filter() < 0 ) {
|
||||
ERROR_JUMP(-2, done, "Failed to register bitshuffle filter");
|
||||
}
|
||||
#endif
|
||||
|
||||
fill_info_array(info);
|
||||
file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
|
||||
if (file_id < 0) {
|
||||
@@ -310,8 +124,6 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
|
||||
ERROR_JUMP(-4, done, "");
|
||||
}
|
||||
|
||||
data_desc->image_number_offset = info[5];
|
||||
|
||||
mask_buffer = malloc(data_desc->dims[1] * data_desc->dims[2] * sizeof(int));
|
||||
if (mask_buffer) {
|
||||
retval = data_desc->get_pixel_mask(data_desc, mask_buffer);
|
||||
@@ -326,10 +138,6 @@ void plugin_open(const char *filename, int info[1024], int *error_flag) {
|
||||
}
|
||||
retval = 0;
|
||||
|
||||
#ifdef GPHL_COMPILE_DATE
|
||||
fprintf(ERROR_OUTPUT, "\n XDS HDF5/Durin plugin %d.%d.%d (DLS, 2018-2023; GPhL, 2020-2024 - built %d)\n", info[1], info[2], info[3], GPHL_COMPILE_DATE);
|
||||
#endif
|
||||
|
||||
done:
|
||||
*error_flag = retval;
|
||||
if (retval < 0) {
|
||||
@@ -357,7 +165,7 @@ void plugin_get_header(int *nx, int *ny, int *nbytes, float *qx, float *qy,
|
||||
|
||||
*nx = data_desc->dims[2];
|
||||
*ny = data_desc->dims[1];
|
||||
*nbytes = abs(data_desc->data_width);
|
||||
*nbytes = data_desc->data_width;
|
||||
*number_of_frames = data_desc->dims[0];
|
||||
*qx = (float)x_pixel_size;
|
||||
*qy = (float)y_pixel_size;
|
||||
@@ -378,47 +186,26 @@ void plugin_get_data(int *frame_number, int *nx, int *ny, int *data_array,
|
||||
fill_info_array(info);
|
||||
|
||||
void *buffer = NULL;
|
||||
if (sizeof(*data_array) == abs(data_desc->data_width)) {
|
||||
if (sizeof(*data_array) == data_desc->data_width) {
|
||||
buffer = data_array;
|
||||
} else {
|
||||
buffer = malloc(abs(data_desc->data_width) * frame_size_px);
|
||||
buffer = malloc(data_desc->data_width * frame_size_px);
|
||||
if (!buffer) {
|
||||
ERROR_JUMP(-1, done, "Unable to allocate data buffer");
|
||||
}
|
||||
}
|
||||
|
||||
int ordinal = *frame_number;
|
||||
if (image2ordinal) {
|
||||
if (ordinal < image2ordinal_imin || ordinal>image2ordinal_imax) {
|
||||
char message[64] = {0};
|
||||
sprintf(message, "Failed to map frame %d to ordinals since outside of range %d - %d", ordinal,image2ordinal_imin,image2ordinal_imax);
|
||||
ERROR_JUMP(-2, done, message);
|
||||
}
|
||||
ordinal = image2ordinal[ordinal];
|
||||
if (ordinal!=*frame_number) {
|
||||
if (image2ordinal_debug) printf("fetching data from ordinal %d for frame %d\n",ordinal,*frame_number);
|
||||
}
|
||||
}
|
||||
|
||||
if (data_desc->get_data_frame(data_desc, ordinal - 1, buffer) < 0) {
|
||||
if (data_desc->get_data_frame(data_desc, (*frame_number) - 1, buffer) < 0) {
|
||||
char message[64] = {0};
|
||||
if (image2ordinal) {
|
||||
sprintf(message, "Failed to retrieve data for frame %d (ordinal %d)", *frame_number, ordinal);
|
||||
} else {
|
||||
sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
|
||||
}
|
||||
sprintf(message, "Failed to retrieve data for frame %d", *frame_number);
|
||||
ERROR_JUMP(-2, done, message);
|
||||
}
|
||||
|
||||
if (buffer != data_array) {
|
||||
if (convert_to_int_and_mask(buffer, data_desc->data_width, info[6], data_array,
|
||||
if (convert_to_int_and_mask(buffer, data_desc->data_width, data_array,
|
||||
frame_size_px, mask_buffer) < 0) {
|
||||
char message[64];
|
||||
if (image2ordinal) {
|
||||
sprintf(message, "Error converting data for frame %d (ordinal %d)", *frame_number, ordinal);
|
||||
} else {
|
||||
sprintf(message, "Error converting data for frame %d", *frame_number);
|
||||
}
|
||||
sprintf(message, "Error converting data for frame %d", *frame_number);
|
||||
ERROR_JUMP(-2, done, message);
|
||||
}
|
||||
} else {
|
||||
@@ -443,11 +230,6 @@ void plugin_close(int *error_flag) {
|
||||
}
|
||||
file_id = 0;
|
||||
|
||||
if (image2ordinal) {
|
||||
free(image2ordinal);
|
||||
image2ordinal = NULL;
|
||||
}
|
||||
|
||||
if (mask_buffer) {
|
||||
free(mask_buffer);
|
||||
mask_buffer = NULL;
|
||||
|
||||
@@ -1,195 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import mimetools
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
import urllib2
|
||||
|
||||
|
||||
def fail(msg, code=1):
|
||||
sys.stderr.write("ERROR: %s\n" % msg)
|
||||
sys.exit(code)
|
||||
|
||||
|
||||
def api_request(url, token, method="GET", json_data=None, content_type=None):
|
||||
data = None
|
||||
headers = {
|
||||
"Authorization": "token %s" % token,
|
||||
}
|
||||
|
||||
if json_data is not None:
|
||||
data = json.dumps(json_data)
|
||||
headers["Content-Type"] = "application/json"
|
||||
elif content_type is not None:
|
||||
headers["Content-Type"] = content_type
|
||||
|
||||
request = urllib2.Request(url, data=data, headers=headers)
|
||||
request.get_method = lambda: method
|
||||
|
||||
try:
|
||||
response = urllib2.urlopen(request)
|
||||
body = response.read()
|
||||
status = response.getcode()
|
||||
return status, body
|
||||
except urllib2.HTTPError as e:
|
||||
return e.code, e.read()
|
||||
|
||||
|
||||
def encode_multipart_formdata(fields, files):
|
||||
boundary = mimetools.choose_boundary()
|
||||
crlf = "\r\n"
|
||||
lines = []
|
||||
|
||||
for key, value in fields:
|
||||
lines.append("--" + boundary)
|
||||
lines.append('Content-Disposition: form-data; name="%s"' % key)
|
||||
lines.append("")
|
||||
lines.append(value)
|
||||
|
||||
for key, filename, content, content_type in files:
|
||||
lines.append("--" + boundary)
|
||||
lines.append(
|
||||
'Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)
|
||||
)
|
||||
lines.append("Content-Type: %s" % content_type)
|
||||
lines.append("")
|
||||
lines.append(content)
|
||||
|
||||
lines.append("--" + boundary + "--")
|
||||
lines.append("")
|
||||
body = crlf.join(lines)
|
||||
content_type = "multipart/form-data; boundary=%s" % boundary
|
||||
return content_type, body
|
||||
|
||||
|
||||
def multipart_request(url, token, fields, files):
|
||||
content_type, body = encode_multipart_formdata(fields, files)
|
||||
headers = {
|
||||
"Authorization": "token %s" % token,
|
||||
"Content-Type": content_type,
|
||||
}
|
||||
|
||||
request = urllib2.Request(url, data=body, headers=headers)
|
||||
request.get_method = lambda: "POST"
|
||||
|
||||
try:
|
||||
response = urllib2.urlopen(request)
|
||||
return response.getcode(), response.read()
|
||||
except urllib2.HTTPError as e:
|
||||
return e.code, e.read()
|
||||
|
||||
|
||||
def get_release_by_tag(api_base, token, tag):
|
||||
url = "%s/releases/tags/%s" % (api_base, tag)
|
||||
status, body = api_request(url, token, method="GET")
|
||||
if status == 200:
|
||||
return json.loads(body)
|
||||
if status == 404:
|
||||
return None
|
||||
fail("failed to fetch release for tag %s: HTTP %s\n%s" % (tag, status, body))
|
||||
|
||||
|
||||
def create_release(api_base, token, tag):
|
||||
url = "%s/releases" % api_base
|
||||
payload = {
|
||||
"tag_name": tag,
|
||||
"name": tag,
|
||||
"draft": False,
|
||||
"prerelease": False,
|
||||
}
|
||||
status, body = api_request(url, token, method="POST", json_data=payload)
|
||||
if status not in (200, 201):
|
||||
fail("failed to create release for tag %s: HTTP %s\n%s" % (tag, status, body))
|
||||
return json.loads(body)
|
||||
|
||||
|
||||
def ensure_release(api_base, token, tag):
|
||||
release = get_release_by_tag(api_base, token, tag)
|
||||
if release is not None:
|
||||
print("Release for tag %s already exists (id=%s)" % (tag, release.get("id")))
|
||||
return release
|
||||
|
||||
print("Release for tag %s does not exist, creating it" % tag)
|
||||
release = create_release(api_base, token, tag)
|
||||
print("Created release id=%s" % release.get("id"))
|
||||
return release
|
||||
|
||||
|
||||
def upload_asset(api_base, token, release_id, file_path):
|
||||
if not os.path.isfile(file_path):
|
||||
fail("file not found: %s" % file_path)
|
||||
|
||||
asset_name = os.path.basename(file_path)
|
||||
mime_type = mimetypes.guess_type(asset_name)[0] or "application/octet-stream"
|
||||
|
||||
with open(file_path, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
url = "%s/releases/%s/assets" % (api_base, release_id)
|
||||
status, body = multipart_request(
|
||||
url,
|
||||
token,
|
||||
fields=[("name", asset_name)],
|
||||
files=[("attachment", asset_name, content, mime_type)],
|
||||
)
|
||||
|
||||
if status not in (200, 201):
|
||||
fail("failed to upload asset %s: HTTP %s\n%s" % (asset_name, status, body))
|
||||
|
||||
print("Uploaded asset: %s" % asset_name)
|
||||
|
||||
|
||||
def find_assets(build_dir):
|
||||
names = []
|
||||
for name in sorted(os.listdir(build_dir)):
|
||||
if name.startswith("libdurin-plugin.so"):
|
||||
full = os.path.join(build_dir, name)
|
||||
if os.path.isfile(full):
|
||||
names.append(full)
|
||||
return names
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 4:
|
||||
sys.stderr.write(
|
||||
"Usage: %s <gitea-server> <owner/repo> <tag>\n" % sys.argv[0]
|
||||
)
|
||||
sys.stderr.write(
|
||||
"Example: %s https://gitea.psi.ch mx/durin 1.0.0\n" % sys.argv[0]
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
server = sys.argv[1].rstrip("/")
|
||||
repo = sys.argv[2]
|
||||
tag = sys.argv[3]
|
||||
|
||||
token = os.environ.get("GITEA_TOKEN")
|
||||
if not token:
|
||||
fail("GITEA_TOKEN environment variable is not set")
|
||||
|
||||
build_dir = "build"
|
||||
if not os.path.isdir(build_dir):
|
||||
fail("build directory not found: %s" % build_dir)
|
||||
|
||||
assets = find_assets(build_dir)
|
||||
if not assets:
|
||||
fail("no libdurin-plugin.so* files found in %s" % build_dir)
|
||||
|
||||
api_base = "%s/api/v1/repos/%s" % (server, repo)
|
||||
|
||||
release = ensure_release(api_base, token, tag)
|
||||
release_id = release.get("id")
|
||||
if not release_id:
|
||||
fail("release id missing in API response")
|
||||
|
||||
for asset in assets:
|
||||
upload_asset(api_base, token, release_id, asset)
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user