// SPDX-License-Identifier: MIT OR Apache-2.0
// Copyright (c) 2023 Kal Conley
#include "bitshuffle.h"

#include <assert.h>
#include <stdint.h>
#include <string.h>

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#include <immintrin.h>
#endif

#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

#if defined(__INTEL_COMPILER)
#pragma warning(disable : 177) // entity-kind "entity" was declared but never referenced
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wunused-function"
#endif

#if defined(_MSC_VER)
#pragma warning(disable : 4244) // conversion from 'type1' to 'type2', possible loss of data
#endif

#if defined(_MSC_VER)
#if defined(_M_IX86) && _M_IX86_FP == 2 || defined(_M_X64)
#ifndef __SSE2__
#define __SSE2__ 1
#endif
#endif
#endif

#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
        __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ || \
    defined(__BIG_ENDIAN__)
#error big endian not supported
#endif

#ifndef BITSHUF_USE_IFUNC
#if defined(__INTEL_COMPILER) || defined(__clang__) && __clang_major__ < 8
#define BITSHUF_USE_IFUNC 0 // GFNI not supported by compiler.
#endif
#endif
#ifndef BITSHUF_USE_IFUNC
#if (__has_attribute(ifunc) && __has_attribute(target) && __has_builtin(__builtin_cpu_init) && \
     __has_builtin(__builtin_cpu_is) && __has_builtin(__builtin_cpu_supports)) || \
    (defined(__GNUC__) && __GNUC__ >= 8)
#define BITSHUF_USE_IFUNC 1
#else
#define BITSHUF_USE_IFUNC 0
#endif
#endif

#define STRINGIZE(x) #x

#if __has_attribute(target) && !defined(__INTEL_COMPILER)
#define ATTRIBUTE_TARGET(x) __attribute__((target(x)))
#else
#define ATTRIBUTE_TARGET(x)
#endif

#if __has_attribute(always_inline) || defined(__GNUC__)
#define ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define ALWAYS_INLINE __forceinline
#else
#define ALWAYS_INLINE inline
#endif

#if __has_attribute(noinline) || defined(__GNUC__)
#define NO_INLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define NO_INLINE __declspec(noinline)
#else
#define NO_INLINE
#endif

#if __has_attribute(no_sanitize_address)
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#else
#define NO_SANITIZE_ADDRESS
#endif

#if __has_attribute(no_sanitize_memory)
#define NO_SANITIZE_MEMORY __attribute__((no_sanitize_memory))
#else
#define NO_SANITIZE_MEMORY
#endif

#if __has_attribute(no_sanitize_thread)
#define NO_SANITIZE_THREAD __attribute__((no_sanitize_thread))
#else
#define NO_SANITIZE_THREAD
#endif

#if __has_attribute(disable_sanitizer_instrumentation)
#define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation))
#else
#define DISABLE_SANITIZER_INSTRUMENTATION
#endif

#if __has_attribute(fallthrough)
#define FALLTHROUGH __attribute__((fallthrough))
#else
#define FALLTHROUGH
#endif

#if __has_builtin(__builtin_expect) || defined(__GNUC__)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define UNLIKELY(x) (x)
#endif

// clang-format off
#define IMPLEMENT_IFUNC(NAME, PARAMS) \
    __attribute__((ifunc(STRINGIZE(NAME##_resolver)))) \
    static void NAME PARAMS; \
    \
    DISABLE_SANITIZER_INSTRUMENTATION \
    NO_SANITIZE_ADDRESS NO_SANITIZE_MEMORY NO_SANITIZE_THREAD \
    static void (*NAME##_resolver(void))PARAMS
// clang-format on
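
// IMPLEMENT_IFUNC(NAME, PARAMS) declares NAME as a GNU indirect function
// (ifunc) bound at load time by NAME##_resolver, then opens the resolver's
// definition: the brace-enclosed body written at the use site (for example,
// bitshuf_trans_bit_ifunc below) runs once during relocation and returns a
// pointer to the implementation to bind. The sanitizer attributes are applied
// to the resolver because it executes before the sanitizer runtimes are
// initialized.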

#define IMPLEMENT_LOAD_FUNCTION(NAME, TYPE) \
    static ALWAYS_INLINE TYPE NAME(const void* mem_addr) { \
        TYPE a; \
        memcpy(&a, mem_addr, sizeof(a)); \
        return a; \
    }

#define IMPLEMENT_STORE_FUNCTION(NAME, TYPE) \
    static ALWAYS_INLINE void NAME(void* mem_addr, const TYPE a) { \
        memcpy(mem_addr, &a, sizeof(a)); \
    }

#if !defined(__SSE2__)
IMPLEMENT_LOAD_FUNCTION(LOAD_U64, uint64_t)
#endif
IMPLEMENT_STORE_FUNCTION(STORE_U8, uint8_t)
IMPLEMENT_STORE_FUNCTION(STORE_U64, uint64_t)

// Computes the transpose of an 8x8 bit matrix.
// Ref: "Hacker's Delight" 7-3 by Henry Warren.
static uint64_t transpose8(uint64_t x) {
    uint64_t t;
    t = (x ^ (x >> 7)) & 0x00aa00aa00aa00aa;
    x = (x ^ t ^ (t << 7));
    t = (x ^ (x >> 14)) & 0x0000cccc0000cccc;
    x = (x ^ t ^ (t << 14));
    t = (x ^ (x >> 28)) & 0x00000000f0f0f0f0;
    x = (x ^ t ^ (t << 28));
    return x;
}
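
// Illustration: transpose8 exchanges rows and columns of the 8x8 matrix whose
// row i is byte i of the argument, so bit j of input byte i becomes bit i of
// output byte j. For example, transpose8(0x00000000000000ff) ==
// 0x0101010101010101: the eight set bits of byte 0 end up as bit 0 of every
// output byte.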

#if !defined(__SSE2__)
NO_INLINE
static void bitshuf_trans_bit(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);
    size /= 8;

    for (size_t i = 0; i < size; i++) {
        const uint64_t a = LOAD_U64(&in[i * 8]);
        const uint64_t x = transpose8(a);
        STORE_U8(&out[0 * size + i], x);
        STORE_U8(&out[1 * size + i], x >> 8 * 1);
        STORE_U8(&out[2 * size + i], x >> 8 * 2);
        STORE_U8(&out[3 * size + i], x >> 8 * 3);
        STORE_U8(&out[4 * size + i], x >> 8 * 4);
        STORE_U8(&out[5 * size + i], x >> 8 * 5);
        STORE_U8(&out[6 * size + i], x >> 8 * 6);
        STORE_U8(&out[7 * size + i], x >> 8 * 7);
    }
}
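
// The loop above fixes the bit-plane layout produced by every trans_bit
// kernel in this file: with n = size / 8, plane b (b = 0..7) occupies
// out[b * n .. b * n + n - 1] and packs bit b of eight consecutive input
// bytes into one output byte, lowest input byte in bit 0. With 16 input
// bytes, for instance, out[0] collects bit 0 of in[0..7] and out[1] collects
// bit 0 of in[8..15].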

NO_INLINE
static void bitshuf_trans_byte(char* restrict out,
                               const char* restrict in,
                               size_t size,
                               size_t elem_size) {
    assert(size % 8 == 0);

    for (size_t i = 0; i < size; i += 8) {
        for (size_t j = 0; j < elem_size; j++) {
            for (size_t k = 0; k < 8; k++) {
                out[j * size + (i + k)] = in[(i + k) * elem_size + j];
            }
        }
    }
}

NO_INLINE
static void bitshuf_trans_byte_2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);
    size = size / 8 * 8;

    for (size_t i = 0; i < size; i++) {
        out[0 * size + i] = in[i * 2 + 0];
        out[1 * size + i] = in[i * 2 + 1];
    }
}

NO_INLINE
static void bitshuf_trans_byte_4(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);
    size = size / 8 * 8;

    for (size_t i = 0; i < size; i++) {
        out[0 * size + i] = in[i * 4 + 0];
        out[1 * size + i] = in[i * 4 + 1];
        out[2 * size + i] = in[i * 4 + 2];
        out[3 * size + i] = in[i * 4 + 3];
    }
}

static void bitshuf_trans_byte_8(char* restrict out, const char* restrict in, size_t size) {
    bitshuf_trans_byte(out, in, size, 8);
}
#endif

static void bitshuf_untrans_bit_tail(char* restrict out,
                                     const char* restrict in,
                                     size_t size,
                                     size_t index) {
    assert(size % 8 == 0);
    size /= 8;

    for (size_t i = index; i < size; i++) {
        const uint64_t a = (uint64_t)(uint8_t)in[0 * size + i] |
                           (uint64_t)(uint8_t)in[1 * size + i] << 8 * 1 |
                           (uint64_t)(uint8_t)in[2 * size + i] << 8 * 2 |
                           (uint64_t)(uint8_t)in[3 * size + i] << 8 * 3 |
                           (uint64_t)(uint8_t)in[4 * size + i] << 8 * 4 |
                           (uint64_t)(uint8_t)in[5 * size + i] << 8 * 5 |
                           (uint64_t)(uint8_t)in[6 * size + i] << 8 * 6 |
                           (uint64_t)(uint8_t)in[7 * size + i] << 8 * 7;
        STORE_U64(&out[i * 8], transpose8(a));
    }
}
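
// bitshuf_untrans_bit_tail inverts the bit-plane transform for columns
// index..size/8-1: it gathers one byte from each of the eight planes and
// re-interleaves them with transpose8 (its own inverse). The scalar
// bitshuf_untrans_bit below calls it with index 0; the SSE2/AVX2 kernels use
// it to finish the tail their vector loops leave behind.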

#if !defined(__SSE2__)
NO_INLINE
static void bitshuf_untrans_bit(char* restrict out, const char* restrict in, size_t size) {
    bitshuf_untrans_bit_tail(out, in, size, 0);
}

NO_INLINE
static void bitshuf_untrans_byte(char* restrict out,
                                 const char* restrict in,
                                 size_t size,
                                 size_t elem_size) {
    assert(size % 8 == 0);

    for (size_t i = 0; i < size; i += 8) {
        for (size_t j = 0; j < elem_size; j++) {
            for (size_t k = 0; k < 8; k++) {
                out[(i + k) * elem_size + j] = in[j * size + (i + k)];
            }
        }
    }
}

NO_INLINE
static void bitshuf_untrans_byte_2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);
    size = size / 8 * 8;

    for (size_t i = 0; i < size; i++) {
        out[i * 2 + 0] = in[0 * size + i];
        out[i * 2 + 1] = in[1 * size + i];
    }
}

NO_INLINE
static void bitshuf_untrans_byte_4(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);
    size = size / 8 * 8;

    for (size_t i = 0; i < size; i++) {
        out[i * 4 + 0] = in[0 * size + i];
        out[i * 4 + 1] = in[1 * size + i];
        out[i * 4 + 2] = in[2 * size + i];
        out[i * 4 + 3] = in[3 * size + i];
    }
}

static void bitshuf_untrans_byte_8(char* restrict out, const char* restrict in, size_t size) {
    bitshuf_untrans_byte(out, in, size, 8);
}
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

IMPLEMENT_LOAD_FUNCTION(LOAD_I64, int64_t)
IMPLEMENT_STORE_FUNCTION(STORE_U16, uint16_t)
IMPLEMENT_STORE_FUNCTION(STORE_U32, uint32_t)

#define MM256_SETR_M128I(lo, hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
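
// X(A) wraps A in an empty inline-asm with an "+x" (SSE register) constraint
// under clang, making the value opaque to the optimizer; presumably this
// keeps the add-to-self doubling chains in the kernels below from being
// rewritten. On other compilers it is a no-op.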
#if defined(__clang__)
#define X(A) \
    ({ \
        __asm__("" : "+x"(A)); \
        (A); \
    })
#else
#define X(A) (A)
#endif

#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni")
|
|
static void bitshuf_trans_bit_avx512vbmi_gfni(char* restrict out,
|
|
const char* restrict in,
|
|
size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m512i BSWAP64 = _mm512_set_epi64(
|
|
0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607,
|
|
0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607);
|
|
const __m512i C0 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3931292119110901, 0x3830282018100800);
|
|
const __m512i C1 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3b332b231b130b03, 0x3a322a221a120a02);
|
|
const __m512i C2 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3d352d251d150d05, 0x3c342c241c140c04);
|
|
const __m512i C3 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3f372f271f170f07, 0x3e362e261e160e06);
|
|
const __m512i I8 = _mm512_set1_epi64(0x8040201008040201);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
const __m512i a = _mm512_loadu_si512(&in[i * 8]);
|
|
const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00);
|
|
const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u));
|
|
const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u));
|
|
const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u));
|
|
const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u));
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], u0);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[2 * size + i], u1);
|
|
_mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8));
|
|
_mm_storel_epi64((__m128i*)&out[4 * size + i], u2);
|
|
_mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8));
|
|
_mm_storel_epi64((__m128i*)&out[6 * size + i], u3);
|
|
_mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8));
|
|
}
|
|
if (i < size) {
|
|
const __mmask8 k = (1U << (size - i)) - 1;
|
|
const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]);
|
|
const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00);
|
|
const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u));
|
|
const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u));
|
|
const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u));
|
|
const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u));
|
|
_mm_mask_storeu_epi8(&out[0 * size + i], k, u0);
|
|
_mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_srli_si128(u0, 8));
|
|
_mm_mask_storeu_epi8(&out[2 * size + i], k, u1);
|
|
_mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_srli_si128(u1, 8));
|
|
_mm_mask_storeu_epi8(&out[4 * size + i], k, u2);
|
|
_mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_srli_si128(u2, 8));
|
|
_mm_mask_storeu_epi8(&out[6 * size + i], k, u3);
|
|
_mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_srli_si128(u3, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC
|
|
IMPLEMENT_STORE_FUNCTION(STORE_MASK64, __mmask64)
|
|
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx512bw,avx512vl")
|
|
static void bitshuf_trans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m512i C0 = _mm512_set1_epi8(0x01);
|
|
const __m512i C1 = _mm512_set1_epi8(0x02);
|
|
const __m512i C2 = _mm512_set1_epi8(0x04);
|
|
const __m512i C3 = _mm512_set1_epi8(0x08);
|
|
const __m512i C4 = _mm512_set1_epi8(0x10);
|
|
const __m512i C5 = _mm512_set1_epi8(0x20);
|
|
const __m512i C6 = _mm512_set1_epi8(0x40);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
const __m512i a = _mm512_loadu_si512(&in[i * 8]);
|
|
STORE_MASK64(&out[0 * size + i], _mm512_test_epi8_mask(a, C0));
|
|
STORE_MASK64(&out[1 * size + i], _mm512_test_epi8_mask(a, C1));
|
|
STORE_MASK64(&out[2 * size + i], _mm512_test_epi8_mask(a, C2));
|
|
STORE_MASK64(&out[3 * size + i], _mm512_test_epi8_mask(a, C3));
|
|
STORE_MASK64(&out[4 * size + i], _mm512_test_epi8_mask(a, C4));
|
|
STORE_MASK64(&out[5 * size + i], _mm512_test_epi8_mask(a, C5));
|
|
STORE_MASK64(&out[6 * size + i], _mm512_test_epi8_mask(a, C6));
|
|
STORE_MASK64(&out[7 * size + i], _mm512_movepi8_mask(a));
|
|
}
|
|
if (i < size) {
|
|
const __mmask8 k = (1U << (size - i)) - 1;
|
|
const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]);
|
|
// clang-format off
|
|
_mm_mask_storeu_epi8(&out[0 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C0)));
|
|
_mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C1)));
|
|
_mm_mask_storeu_epi8(&out[2 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C2)));
|
|
_mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C3)));
|
|
_mm_mask_storeu_epi8(&out[4 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C4)));
|
|
_mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C5)));
|
|
_mm_mask_storeu_epi8(&out[6 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C6)));
|
|
_mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_set_epi64x(0, _mm512_movepi8_mask(a)));
|
|
// clang-format on
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx2")
|
|
static void bitshuf_trans_bit_avx2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
size_t i = 0;
|
|
for (; i + 4 <= size; i += 4) {
|
|
const __m256i a = _mm256_loadu_si256((const __m256i*)&in[i * 8]);
|
|
__m256i u;
|
|
STORE_U32(&out[7 * size + i], _mm256_movemask_epi8(u = a));
|
|
STORE_U32(&out[6 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[5 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[4 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[3 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[2 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[1 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u)));
|
|
STORE_U32(&out[0 * size + i], _mm256_movemask_epi8(_mm256_add_epi8(X(u), u)));
|
|
}
|
|
if (i + 2 <= size) {
|
|
const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]);
|
|
__m128i u;
|
|
STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a));
|
|
STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u)));
|
|
i += 2;
|
|
}
|
|
if (i < size) {
|
|
const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]);
|
|
__m128i u;
|
|
STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a));
|
|
STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u)));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_trans_bit_sse2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
size_t i = 0;
|
|
for (; i + 2 <= size; i += 2) {
|
|
const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]);
|
|
__m128i u;
|
|
STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a));
|
|
STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u)));
|
|
}
|
|
if (i < size) {
|
|
const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]);
|
|
__m128i u;
|
|
STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a));
|
|
STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u)));
|
|
STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u)));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__)
|
|
#define bitshuf_trans_bit bitshuf_trans_bit_avx512vbmi_gfni
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_trans_bit_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") &&
|
|
__builtin_cpu_supports("gfni") && !__builtin_cpu_is("intel"))
|
|
{
|
|
return bitshuf_trans_bit_avx512vbmi_gfni;
|
|
}
|
|
#if defined(__AVX512BW__) && defined(__AVX512VL__)
|
|
return bitshuf_trans_bit_avx512bw;
|
|
#else
|
|
if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl"))
|
|
return bitshuf_trans_bit_avx512bw;
|
|
#if defined(__AVX2__)
|
|
return bitshuf_trans_bit_avx2;
|
|
#else
|
|
if (__builtin_cpu_supports("avx2"))
|
|
return bitshuf_trans_bit_avx2;
|
|
#if defined(__SSE2__)
|
|
return bitshuf_trans_bit_sse2;
|
|
#else
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_trans_bit_sse2;
|
|
|
|
return bitshuf_trans_bit;
|
|
#endif
|
|
#endif
|
|
#endif
|
|
}
|
|
#define bitshuf_trans_bit bitshuf_trans_bit_ifunc
|
|
#elif defined(__AVX512BW__) && defined(__AVX512VL__)
|
|
#define bitshuf_trans_bit bitshuf_trans_bit_avx512bw
|
|
#elif defined(__AVX2__)
|
|
#define bitshuf_trans_bit bitshuf_trans_bit_avx2
|
|
#elif defined(__SSE2__)
|
|
#define bitshuf_trans_bit bitshuf_trans_bit_sse2
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_trans_byte_sse2(char* restrict out,
|
|
const char* restrict in,
|
|
size_t size,
|
|
size_t elem_size) {
|
|
assert(size % 8 == 0);
|
|
|
|
size_t j = 0;
|
|
for (; j + 8 <= elem_size; j += 8) {
|
|
for (size_t i = 0; i < size; i += 8) {
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u2 = _mm_unpacklo_epi8(a4, a5);
|
|
__m128i u3 = _mm_unpacklo_epi8(a6, a7);
|
|
const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
|
|
const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
|
|
const __m128i v2 = _mm_unpacklo_epi16(u2, u3);
|
|
const __m128i v3 = _mm_unpackhi_epi16(u2, u3);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
_mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0);
|
|
_mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1);
|
|
_mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2);
|
|
_mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3);
|
|
_mm_storel_epi64((__m128i*)&out[(j + 7) * size + i], _mm_srli_si128(u3, 8));
|
|
}
|
|
}
|
|
if (j < elem_size) {
|
|
for (size_t i = 0; i + 8 < size; i += 8) {
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u2 = _mm_unpacklo_epi8(a4, a5);
|
|
__m128i u3 = _mm_unpacklo_epi8(a6, a7);
|
|
const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
|
|
const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
|
|
const __m128i v2 = _mm_unpacklo_epi16(u2, u3);
|
|
const __m128i v3 = _mm_unpackhi_epi16(u2, u3);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
switch (elem_size - j) {
|
|
case 7:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3);
|
|
FALLTHROUGH;
|
|
case 6:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8));
|
|
FALLTHROUGH;
|
|
case 5:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2);
|
|
FALLTHROUGH;
|
|
case 4:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8));
|
|
FALLTHROUGH;
|
|
case 3:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1);
|
|
FALLTHROUGH;
|
|
case 2:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8));
|
|
FALLTHROUGH;
|
|
default:
|
|
_mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0);
|
|
}
|
|
}
|
|
for (; j < elem_size; j++) {
|
|
for (size_t i = size - 8; i < size; i++)
|
|
out[j * size + i] = in[i * elem_size + j];
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__)
|
|
#define bitshuf_trans_byte bitshuf_trans_byte_sse2
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_trans_byte_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size, size_t elem_size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_trans_byte_sse2;
|
|
|
|
return bitshuf_trans_byte;
|
|
}
|
|
#define bitshuf_trans_byte bitshuf_trans_byte_ifunc
|
|
#endif
|
|
|
|
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx2")
|
|
static void bitshuf_trans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
|
|
const __m256i MASK = _mm256_set1_epi16(0x00ff);
|
|
size_t i = 0;
|
|
for (; i + 32 <= size; i += 32) {
|
|
const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 2]);
|
|
const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 2 + 32]);
|
|
__m256i u0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a1), 1);
|
|
__m256i u1 = _mm256_permute2x128_si256(a0, a1, 0x31);
|
|
const __m256i v0 = _mm256_and_si256(u0, MASK);
|
|
const __m256i v1 = _mm256_and_si256(u1, MASK);
|
|
const __m256i v2 = _mm256_srli_epi16(u0, 8);
|
|
const __m256i v3 = _mm256_srli_epi16(u1, 8);
|
|
u0 = _mm256_packus_epi16(v0, v1);
|
|
u1 = _mm256_packus_epi16(v2, v3);
|
|
_mm256_storeu_si256((__m256i*)&out[0 * size + i], u0);
|
|
_mm256_storeu_si256((__m256i*)&out[1 * size + i], u1);
|
|
}
|
|
if (i + 16 <= size) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]);
|
|
const __m128i u0 = _mm_and_si128(a0, _mm256_castsi256_si128(MASK));
|
|
const __m128i u1 = _mm_and_si128(a1, _mm256_castsi256_si128(MASK));
|
|
const __m128i u2 = _mm_srli_epi16(a0, 8);
|
|
const __m128i u3 = _mm_srli_epi16(a1, 8);
|
|
const __m128i v0 = _mm_packus_epi16(u0, u1);
|
|
const __m128i v1 = _mm_packus_epi16(u2, u3);
|
|
_mm_storeu_si128((__m128i*)&out[0 * size + i], v0);
|
|
_mm_storeu_si128((__m128i*)&out[1 * size + i], v1);
|
|
i += 16;
|
|
}
|
|
if (i < size) {
|
|
const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]);
|
|
const __m128i u0 = _mm_and_si128(a, _mm256_castsi256_si128(MASK));
|
|
const __m128i u1 = _mm_srli_epi16(a, 8);
|
|
const __m128i u = _mm_packus_epi16(u0, u1);
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], u);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_trans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
|
|
const __m128i MASK = _mm_set1_epi16(0x00ff);
|
|
size_t i = 0;
|
|
for (; i + 16 <= size; i += 16) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]);
|
|
const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK));
|
|
const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8));
|
|
_mm_storeu_si128((__m128i*)&out[0 * size + i], u0);
|
|
_mm_storeu_si128((__m128i*)&out[1 * size + i], u1);
|
|
}
|
|
if (i < size) {
|
|
const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]);
|
|
const __m128i u = _mm_packus_epi16(_mm_and_si128(a, MASK), _mm_srli_epi16(a, 8));
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], u);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX2__)
|
|
#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_avx2
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_trans_byte_2_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("avx2"))
|
|
return bitshuf_trans_byte_2_avx2;
|
|
#if defined(__SSE2__)
|
|
return bitshuf_trans_byte_2_sse2;
|
|
#else
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_trans_byte_2_sse2;
|
|
|
|
return bitshuf_trans_byte_2;
|
|
#endif
|
|
}
|
|
#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_ifunc
|
|
#elif defined(__SSE2__)
|
|
#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_sse2
|
|
#endif
|
|
|
|
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx2")
|
|
static void bitshuf_trans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
|
|
const __m256i SHUF = _mm256_set_epi64x(0x0f0b07030e0a0602, 0x0d0905010c080400,
|
|
0x0f0b07030e0a0602, 0x0d0905010c080400);
|
|
const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
|
|
size_t i = 0;
|
|
for (; i + 16 <= size; i += 16) {
|
|
const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 4]);
|
|
const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 4 + 32]);
|
|
const __m256i u0 = _mm256_shuffle_epi8(a0, SHUF);
|
|
const __m256i u1 = _mm256_shuffle_epi8(a1, SHUF);
|
|
const __m256i v0 = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi32(u0, u1), PERM);
|
|
const __m256i v1 = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi32(u0, u1), PERM);
|
|
_mm_storeu_si128((__m128i*)&out[0 * size + i], _mm256_castsi256_si128(v0));
|
|
_mm_storeu_si128((__m128i*)&out[1 * size + i], _mm256_extracti128_si256(v0, 1));
|
|
_mm_storeu_si128((__m128i*)&out[2 * size + i], _mm256_castsi256_si128(v1));
|
|
_mm_storeu_si128((__m128i*)&out[3 * size + i], _mm256_extracti128_si256(v1, 1));
|
|
}
|
|
if (i + 8 <= size) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]);
|
|
const __m128i u0 = _mm_shuffle_epi8(a0, _mm256_castsi256_si128(SHUF));
|
|
const __m128i u1 = _mm_shuffle_epi8(a1, _mm256_castsi256_si128(SHUF));
|
|
const __m128i v0 = _mm_unpacklo_epi32(u0, u1);
|
|
const __m128i v1 = _mm_unpackhi_epi32(u0, u1);
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], v0);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[2 * size + i], v1);
|
|
_mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_trans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
|
|
const __m128i MASK = _mm_set1_epi16(0x00ff);
|
|
size_t i = 0;
|
|
for (; i + 16 <= size; i += 16) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 0]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 1]);
|
|
const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 2]);
|
|
const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 3]);
|
|
const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK));
|
|
const __m128i u1 = _mm_packus_epi16(_mm_and_si128(a2, MASK), _mm_and_si128(a3, MASK));
|
|
const __m128i u2 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8));
|
|
const __m128i u3 = _mm_packus_epi16(_mm_srli_epi16(a2, 8), _mm_srli_epi16(a3, 8));
|
|
const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK));
|
|
const __m128i v1 = _mm_packus_epi16(_mm_and_si128(u2, MASK), _mm_and_si128(u3, MASK));
|
|
const __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8));
|
|
const __m128i v3 = _mm_packus_epi16(_mm_srli_epi16(u2, 8), _mm_srli_epi16(u3, 8));
|
|
_mm_storeu_si128((__m128i*)&out[0 * size + i], v0);
|
|
_mm_storeu_si128((__m128i*)&out[1 * size + i], v1);
|
|
_mm_storeu_si128((__m128i*)&out[2 * size + i], v2);
|
|
_mm_storeu_si128((__m128i*)&out[3 * size + i], v3);
|
|
}
|
|
if (i + 8 <= size) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]);
|
|
const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK));
|
|
const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8));
|
|
const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK));
|
|
const __m128i v1 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8));
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], v0);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[2 * size + i], v1);
|
|
_mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX2__)
|
|
#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_avx2
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_trans_byte_4_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("avx2"))
|
|
return bitshuf_trans_byte_4_avx2;
|
|
#if defined(__SSE2__)
|
|
return bitshuf_trans_byte_4_sse2;
|
|
#else
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_trans_byte_4_sse2;
|
|
|
|
return bitshuf_trans_byte_4;
|
|
#endif
|
|
}
|
|
#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_ifunc
|
|
#elif defined(__SSE2__)
|
|
#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_sse2
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_trans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
|
|
size_t i = 0;
|
|
for (; i + 16 <= size; i += 16) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 0]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]);
|
|
const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]);
|
|
const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]);
|
|
const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 4]);
|
|
const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 5]);
|
|
const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 6]);
|
|
const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 7]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpackhi_epi8(a0, a1);
|
|
__m128i u2 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u3 = _mm_unpackhi_epi8(a2, a3);
|
|
__m128i u4 = _mm_unpacklo_epi8(a4, a5);
|
|
__m128i u5 = _mm_unpackhi_epi8(a4, a5);
|
|
__m128i u6 = _mm_unpacklo_epi8(a6, a7);
|
|
__m128i u7 = _mm_unpackhi_epi8(a6, a7);
|
|
__m128i v0 = _mm_unpacklo_epi8(u0, u1);
|
|
__m128i v1 = _mm_unpackhi_epi8(u0, u1);
|
|
__m128i v2 = _mm_unpacklo_epi8(u2, u3);
|
|
__m128i v3 = _mm_unpackhi_epi8(u2, u3);
|
|
__m128i v4 = _mm_unpacklo_epi8(u4, u5);
|
|
__m128i v5 = _mm_unpackhi_epi8(u4, u5);
|
|
__m128i v6 = _mm_unpacklo_epi8(u6, u7);
|
|
__m128i v7 = _mm_unpackhi_epi8(u6, u7);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
u4 = _mm_unpacklo_epi32(v4, v6);
|
|
u5 = _mm_unpackhi_epi32(v4, v6);
|
|
u6 = _mm_unpacklo_epi32(v5, v7);
|
|
u7 = _mm_unpackhi_epi32(v5, v7);
|
|
v0 = _mm_unpacklo_epi64(u0, u4);
|
|
v1 = _mm_unpackhi_epi64(u0, u4);
|
|
v2 = _mm_unpacklo_epi64(u1, u5);
|
|
v3 = _mm_unpackhi_epi64(u1, u5);
|
|
v4 = _mm_unpacklo_epi64(u2, u6);
|
|
v5 = _mm_unpackhi_epi64(u2, u6);
|
|
v6 = _mm_unpacklo_epi64(u3, u7);
|
|
v7 = _mm_unpackhi_epi64(u3, u7);
|
|
_mm_storeu_si128((__m128i*)&out[0 * size + i], v0);
|
|
_mm_storeu_si128((__m128i*)&out[1 * size + i], v1);
|
|
_mm_storeu_si128((__m128i*)&out[2 * size + i], v2);
|
|
_mm_storeu_si128((__m128i*)&out[3 * size + i], v3);
|
|
_mm_storeu_si128((__m128i*)&out[4 * size + i], v4);
|
|
_mm_storeu_si128((__m128i*)&out[5 * size + i], v5);
|
|
_mm_storeu_si128((__m128i*)&out[6 * size + i], v6);
|
|
_mm_storeu_si128((__m128i*)&out[7 * size + i], v7);
|
|
}
|
|
if (i + 8 <= size) {
|
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 0]);
|
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]);
|
|
const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]);
|
|
const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpackhi_epi8(a0, a1);
|
|
__m128i u2 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u3 = _mm_unpackhi_epi8(a2, a3);
|
|
const __m128i v0 = _mm_unpacklo_epi8(u0, u1);
|
|
const __m128i v1 = _mm_unpackhi_epi8(u0, u1);
|
|
const __m128i v2 = _mm_unpacklo_epi8(u2, u3);
|
|
const __m128i v3 = _mm_unpackhi_epi8(u2, u3);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
_mm_storel_epi64((__m128i*)&out[0 * size + i], u0);
|
|
_mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[2 * size + i], u1);
|
|
_mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8));
|
|
_mm_storel_epi64((__m128i*)&out[4 * size + i], u2);
|
|
_mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8));
|
|
_mm_storel_epi64((__m128i*)&out[6 * size + i], u3);
|
|
_mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__)
|
|
#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_sse2
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_trans_byte_8_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_trans_byte_8_sse2;
|
|
|
|
return bitshuf_trans_byte_8;
|
|
}
|
|
#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_ifunc
|
|
#endif
|
|
|
|
#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni")
|
|
static void bitshuf_untrans_bit_avx512vbmi_gfni(char* restrict out,
|
|
const char* restrict in,
|
|
size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m512i C = _mm512_set_epi64(0x070f171f474f575f, 0x060e161e464e565e, 0x050d151d454d555d,
|
|
0x040c141c444c545c, 0x030b131b434b535b, 0x020a121a424a525a,
|
|
0x0109111941495159, 0x0008101840485058);
|
|
const __m512i I8 = _mm512_set1_epi64(0x8040201008040201);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
|
|
const int64_t a1 = LOAD_I64(&in[1 * size + i]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
|
|
const int64_t a3 = LOAD_I64(&in[3 * size + i]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]);
|
|
const int64_t a5 = LOAD_I64(&in[5 * size + i]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]);
|
|
const int64_t a7 = LOAD_I64(&in[7 * size + i]);
|
|
const __m128i u0 = _mm_insert_epi64(a0, a1, 1);
|
|
const __m128i u1 = _mm_insert_epi64(a2, a3, 1);
|
|
const __m128i u2 = _mm_insert_epi64(a4, a5, 1);
|
|
const __m128i u3 = _mm_insert_epi64(a6, a7, 1);
|
|
#else
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]);
|
|
const __m128i u0 = _mm_unpacklo_epi64(a0, a1);
|
|
const __m128i u1 = _mm_unpacklo_epi64(a2, a3);
|
|
const __m128i u2 = _mm_unpacklo_epi64(a4, a5);
|
|
const __m128i u3 = _mm_unpacklo_epi64(a6, a7);
|
|
#endif
|
|
const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1);
|
|
const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1);
|
|
__m512i u;
|
|
u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1));
|
|
u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00);
|
|
_mm512_storeu_si512(&out[i * 8], u);
|
|
}
|
|
if (i < size) {
|
|
const __mmask8 k = (1U << (size - i)) - 1;
|
|
const __m128i a0 = _mm_maskz_loadu_epi8(k, &in[0 * size + i]);
|
|
const __m128i a1 = _mm_maskz_loadu_epi8(k, &in[1 * size + i]);
|
|
const __m128i a2 = _mm_maskz_loadu_epi8(k, &in[2 * size + i]);
|
|
const __m128i a3 = _mm_maskz_loadu_epi8(k, &in[3 * size + i]);
|
|
const __m128i a4 = _mm_maskz_loadu_epi8(k, &in[4 * size + i]);
|
|
const __m128i a5 = _mm_maskz_loadu_epi8(k, &in[5 * size + i]);
|
|
const __m128i a6 = _mm_maskz_loadu_epi8(k, &in[6 * size + i]);
|
|
const __m128i a7 = _mm_maskz_loadu_epi8(k, &in[7 * size + i]);
|
|
const __m128i u0 = _mm_unpacklo_epi64(a0, a1);
|
|
const __m128i u1 = _mm_unpacklo_epi64(a2, a3);
|
|
const __m128i u2 = _mm_unpacklo_epi64(a4, a5);
|
|
const __m128i u3 = _mm_unpacklo_epi64(a6, a7);
|
|
const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1);
|
|
const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1);
|
|
__m512i u;
|
|
u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1));
|
|
u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00);
|
|
_mm512_mask_storeu_epi64(&out[i * 8], k, u);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC
|
|
IMPLEMENT_LOAD_FUNCTION(LOAD_MASK64, __mmask64)
|
|
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static ALWAYS_INLINE __mmask64 MM_CVTSI128_MASK64(__m128i a) {
|
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
return _mm_cvtsi128_si64(a);
|
|
#else
|
|
__mmask64 k;
|
|
_mm_storel_epi64((__m128i*)&k, a);
|
|
return k;
|
|
#endif
|
|
}
|
|
|
|
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
|
|
// https://github.com/llvm/llvm-project/issues/65205
|
|
ATTRIBUTE_TARGET("avx512bw,avx512vl")
|
|
static ALWAYS_INLINE __m512i MM512_MASK_ADD_EPI8(__m512i src, __mmask64 k, __m512i a, __m512i b) {
|
|
__asm__("vpaddb\t{%3, %2, %0 %{%1%}|%0 %{%1%}, %2, %3}" : "+v"(src) : "Yk"(k), "v"(a), "v"(b));
|
|
return src;
|
|
}
|
|
#else
|
|
#define MM512_MASK_ADD_EPI8(SRC, K, A, B) _mm512_mask_add_epi8(SRC, K, A, B)
|
|
#endif
|
|
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx512bw,avx512vl")
|
|
static void bitshuf_untrans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m512i C0 = _mm512_set1_epi8(0x01);
|
|
const __m512i C1 = _mm512_set1_epi8(0x02);
|
|
const __m512i C2 = _mm512_set1_epi8(0x04);
|
|
const __m512i C3 = _mm512_set1_epi8(0x08);
|
|
const __m512i C4 = _mm512_set1_epi8(0x10);
|
|
const __m512i C5 = _mm512_set1_epi8(0x20);
|
|
const __m512i C6 = _mm512_set1_epi8(0x40);
|
|
const __m512i C7 = _mm512_set1_epi8(-128);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
const __mmask64 a0 = LOAD_MASK64(&in[0 * size + i]);
|
|
__m512i u = _mm512_maskz_mov_epi8(a0, C0);
|
|
const __mmask64 a1 = LOAD_MASK64(&in[1 * size + i]);
|
|
const __mmask64 a2 = LOAD_MASK64(&in[2 * size + i]);
|
|
const __mmask64 a3 = LOAD_MASK64(&in[3 * size + i]);
|
|
const __mmask64 a4 = LOAD_MASK64(&in[4 * size + i]);
|
|
const __mmask64 a5 = LOAD_MASK64(&in[5 * size + i]);
|
|
const __mmask64 a6 = LOAD_MASK64(&in[6 * size + i]);
|
|
const __mmask64 a7 = LOAD_MASK64(&in[7 * size + i]);
|
|
u = MM512_MASK_ADD_EPI8(u, a1, u, C1);
|
|
u = MM512_MASK_ADD_EPI8(u, a2, u, C2);
|
|
u = MM512_MASK_ADD_EPI8(u, a3, u, C3);
|
|
u = MM512_MASK_ADD_EPI8(u, a4, u, C4);
|
|
u = MM512_MASK_ADD_EPI8(u, a5, u, C5);
|
|
u = MM512_MASK_ADD_EPI8(u, a6, u, C6);
|
|
u = MM512_MASK_ADD_EPI8(u, a7, u, C7);
|
|
_mm512_storeu_si512(&out[i * 8], u);
|
|
}
|
|
if (i < size) {
|
|
const __mmask8 k = (1U << (size - i)) - 1;
|
|
const __mmask64 a0 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[0 * size + i]));
|
|
const __mmask64 a1 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[1 * size + i]));
|
|
const __mmask64 a2 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[2 * size + i]));
|
|
const __mmask64 a3 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[3 * size + i]));
|
|
const __mmask64 a4 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[4 * size + i]));
|
|
const __mmask64 a5 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[5 * size + i]));
|
|
const __mmask64 a6 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[6 * size + i]));
|
|
const __mmask64 a7 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[7 * size + i]));
|
|
__m512i u = _mm512_maskz_mov_epi8(a0, C0);
|
|
u = MM512_MASK_ADD_EPI8(u, a1, u, C1);
|
|
u = MM512_MASK_ADD_EPI8(u, a2, u, C2);
|
|
u = MM512_MASK_ADD_EPI8(u, a3, u, C3);
|
|
u = MM512_MASK_ADD_EPI8(u, a4, u, C4);
|
|
u = MM512_MASK_ADD_EPI8(u, a5, u, C5);
|
|
u = MM512_MASK_ADD_EPI8(u, a6, u, C6);
|
|
u = MM512_MASK_ADD_EPI8(u, a7, u, C7);
|
|
_mm512_mask_storeu_epi64(&out[i * 8], k, u);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("avx2")
|
|
static void bitshuf_untrans_bit_avx2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
|
|
const __m256i MASK0 = _mm256_set1_epi64x(0x00aa00aa00aa00aa);
|
|
const __m256i MASK1 = _mm256_set1_epi64x(0x0000cccc0000cccc);
|
|
const __m256i MASK2 = _mm256_set1_epi64x(0x00000000f0f0f0f0);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]);
|
|
__m256i u0 = MM256_SETR_M128I(_mm_unpacklo_epi8(a0, a1), _mm_unpacklo_epi8(a4, a5));
|
|
__m256i u1 = MM256_SETR_M128I(_mm_unpacklo_epi8(a2, a3), _mm_unpacklo_epi8(a6, a7));
|
|
__m256i v0 = _mm256_unpacklo_epi16(u0, u1);
|
|
__m256i v1 = _mm256_unpackhi_epi16(u0, u1);
|
|
u0 = _mm256_permutevar8x32_epi32(v0, PERM);
|
|
u1 = _mm256_permutevar8x32_epi32(v1, PERM);
|
|
v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 07)), MASK0);
|
|
v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 07)), MASK0);
|
|
u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 07)), v0);
|
|
u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 07)), v1);
|
|
v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 14)), MASK1);
|
|
v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 14)), MASK1);
|
|
u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 14)), v0);
|
|
u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 14)), v1);
|
|
v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 28)), MASK2);
|
|
v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 28)), MASK2);
|
|
u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 28)), v0);
|
|
u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 28)), v1);
|
|
_mm256_storeu_si256((__m256i*)&out[i * 8], u0);
|
|
_mm256_storeu_si256((__m256i*)&out[i * 8 + 32], u1);
|
|
}
|
|
if (i < size)
|
|
bitshuf_untrans_bit_tail(out, in, size * 8, i);
|
|
}
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_untrans_bit_sse2(char* restrict out, const char* restrict in, size_t size) {
|
|
assert(size % 8 == 0);
|
|
size /= 8;
|
|
|
|
const __m128i MASK0 = _mm_set1_epi64x(0x00aa00aa00aa00aa);
|
|
const __m128i MASK1 = _mm_set1_epi64x(0x0000cccc0000cccc);
|
|
const __m128i MASK2 = _mm_set1_epi64x(0x00000000f0f0f0f0);
|
|
size_t i = 0;
|
|
for (; i + 8 <= size; i += 8) {
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u2 = _mm_unpacklo_epi8(a4, a5);
|
|
__m128i u3 = _mm_unpacklo_epi8(a6, a7);
|
|
__m128i v0 = _mm_unpacklo_epi16(u0, u1);
|
|
__m128i v1 = _mm_unpackhi_epi16(u0, u1);
|
|
__m128i v2 = _mm_unpacklo_epi16(u2, u3);
|
|
__m128i v3 = _mm_unpackhi_epi16(u2, u3);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 07)), MASK0);
|
|
v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 07)), MASK0);
|
|
v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 07)), MASK0);
|
|
v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 07)), MASK0);
|
|
u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 07)), v0);
|
|
u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 07)), v1);
|
|
u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 07)), v2);
|
|
u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 07)), v3);
|
|
v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 14)), MASK1);
|
|
v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 14)), MASK1);
|
|
v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 14)), MASK1);
|
|
v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 14)), MASK1);
|
|
u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 14)), v0);
|
|
u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 14)), v1);
|
|
u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 14)), v2);
|
|
u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 14)), v3);
|
|
v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 28)), MASK2);
|
|
v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 28)), MASK2);
|
|
v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 28)), MASK2);
|
|
v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 28)), MASK2);
|
|
u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 28)), v0);
|
|
u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 28)), v1);
|
|
u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 28)), v2);
|
|
u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 28)), v3);
|
|
_mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0);
|
|
_mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1);
|
|
_mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2);
|
|
_mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3);
|
|
}
|
|
if (i < size)
|
|
bitshuf_untrans_bit_tail(out, in, size * 8, i);
|
|
}
|
|
#endif
|
|
|
|
#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__)
|
|
#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512vbmi_gfni
|
|
#elif BITSHUF_USE_IFUNC
|
|
IMPLEMENT_IFUNC(bitshuf_untrans_bit_ifunc,
|
|
(char* restrict out, const char* restrict in, size_t size)) {
|
|
__builtin_cpu_init();
|
|
|
|
if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") &&
|
|
__builtin_cpu_supports("gfni"))
|
|
{
|
|
return bitshuf_untrans_bit_avx512vbmi_gfni;
|
|
}
|
|
#if defined(__AVX512BW__) && defined(__AVX512VL__)
|
|
return bitshuf_untrans_bit_avx512bw;
|
|
#else
|
|
if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl"))
|
|
return bitshuf_untrans_bit_avx512bw;
|
|
#if defined(__AVX2__)
|
|
return bitshuf_untrans_bit_avx2;
|
|
#else
|
|
if (__builtin_cpu_supports("avx2"))
|
|
return bitshuf_untrans_bit_avx2;
|
|
#if defined(__SSE2__)
|
|
return bitshuf_untrans_bit_sse2;
|
|
#else
|
|
if (__builtin_cpu_supports("sse2"))
|
|
return bitshuf_untrans_bit_sse2;
|
|
|
|
return bitshuf_untrans_bit;
|
|
#endif
|
|
#endif
|
|
#endif
|
|
}
|
|
#define bitshuf_untrans_bit bitshuf_untrans_bit_ifunc
|
|
#elif defined(__AVX512BW__) && defined(__AVX512VL__)
|
|
#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512bw
|
|
#elif defined(__AVX2__)
|
|
#define bitshuf_untrans_bit bitshuf_untrans_bit_avx2
|
|
#elif defined(__SSE2__)
|
|
#define bitshuf_untrans_bit bitshuf_untrans_bit_sse2
|
|
#endif
|
|
|
|
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
|
|
NO_INLINE
|
|
ATTRIBUTE_TARGET("sse2")
|
|
static void bitshuf_untrans_byte_sse2(char* restrict out,
|
|
const char* restrict in,
|
|
size_t size,
|
|
size_t elem_size) {
|
|
assert(size % 8 == 0);
|
|
|
|
size_t j = 0;
|
|
for (; j + 8 <= elem_size; j += 8) {
|
|
for (size_t i = 0; i < size; i += 8) {
|
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(j + 0) * size + i]);
|
|
const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(j + 1) * size + i]);
|
|
const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(j + 2) * size + i]);
|
|
const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(j + 3) * size + i]);
|
|
const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(j + 4) * size + i]);
|
|
const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(j + 5) * size + i]);
|
|
const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(j + 6) * size + i]);
|
|
const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(j + 7) * size + i]);
|
|
__m128i u0 = _mm_unpacklo_epi8(a0, a1);
|
|
__m128i u1 = _mm_unpacklo_epi8(a2, a3);
|
|
__m128i u2 = _mm_unpacklo_epi8(a4, a5);
|
|
__m128i u3 = _mm_unpacklo_epi8(a6, a7);
|
|
const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
|
|
const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
|
|
const __m128i v2 = _mm_unpacklo_epi16(u2, u3);
|
|
const __m128i v3 = _mm_unpackhi_epi16(u2, u3);
|
|
u0 = _mm_unpacklo_epi32(v0, v2);
|
|
u1 = _mm_unpackhi_epi32(v0, v2);
|
|
u2 = _mm_unpacklo_epi32(v1, v3);
|
|
u3 = _mm_unpackhi_epi32(v1, v3);
|
|
_mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0);
|
|
_mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1);
|
|
_mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2);
|
|
_mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8));
|
|
_mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3);
|
|
_mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8));
|
|
}
|
|
}
    if (j < elem_size) {
        const size_t j0 = (j + 0) * size;
        const size_t j1 = (j + 1) < elem_size ? (j + 1) * size : 1;
        const size_t j2 = (j + 2) % elem_size * size + (j + 2) / elem_size;
        const size_t j3 = (j + 3) % elem_size * size + (j + 3) / elem_size;
        const size_t j4 = (j + 4) % elem_size * size + (j + 4) / elem_size;
        const size_t j5 = (j + 5) % elem_size * size + (j + 5) / elem_size;
        const size_t j6 = (j + 6) % elem_size * size + (j + 6) / elem_size;
        const size_t j7 = (j + 7) % elem_size * size + (j + 7) / elem_size;
        for (size_t i = 0; i + 8 < size; i += 8) {
            const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[j0 + i]);
            const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[j1 + i]);
            const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[j2 + i]);
            const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[j3 + i]);
            const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[j4 + i]);
            const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[j5 + i]);
            const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[j6 + i]);
            const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[j7 + i]);
            __m128i u0 = _mm_unpacklo_epi8(a0, a1);
            __m128i u1 = _mm_unpacklo_epi8(a2, a3);
            __m128i u2 = _mm_unpacklo_epi8(a4, a5);
            __m128i u3 = _mm_unpacklo_epi8(a6, a7);
            const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
            const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
            const __m128i v2 = _mm_unpacklo_epi16(u2, u3);
            const __m128i v3 = _mm_unpackhi_epi16(u2, u3);
            u0 = _mm_unpacklo_epi32(v0, v2);
            u1 = _mm_unpackhi_epi32(v0, v2);
            u2 = _mm_unpacklo_epi32(v1, v3);
            u3 = _mm_unpackhi_epi32(v1, v3);
            _mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0);
            _mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8));
            _mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1);
            _mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8));
            _mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2);
            _mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8));
            _mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3);
            _mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8));
        }
        for (; j < elem_size; j++) {
            for (size_t i = size - 8; i < size; i++)
                out[i * elem_size + j] = in[j * size + i];
        }
    }
}
#endif

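// Select the bitshuf_untrans_byte implementation: bound at compile time when
// SSE2 is guaranteed, otherwise resolved at load time through an ifunc.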
#if defined(__SSE2__)
#define bitshuf_untrans_byte bitshuf_untrans_byte_sse2
#elif BITSHUF_USE_IFUNC
IMPLEMENT_IFUNC(bitshuf_untrans_byte_ifunc,
                (char* restrict out, const char* restrict in, size_t size, size_t elem_size)) {
    __builtin_cpu_init();

    if (__builtin_cpu_supports("sse2"))
        return bitshuf_untrans_byte_sse2;

    return bitshuf_untrans_byte;
}
#define bitshuf_untrans_byte bitshuf_untrans_byte_ifunc
#endif

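// Inverse byte transpose for elem_size == 2 (AVX2): interleave the two byte
// planes back into 16-bit elements, 32 at a time; the permute4x64 compensates
// for the in-lane behavior of the 256-bit unpacks.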
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
NO_INLINE
ATTRIBUTE_TARGET("avx2")
static void bitshuf_untrans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);

    size_t i = 0;
    for (; i + 32 <= size; i += 32) {
        const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]);
        const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]);
        const __m256i u0 = _mm256_permute4x64_epi64(a0, 0xd8);
        const __m256i u1 = _mm256_permute4x64_epi64(a1, 0xd8);
        const __m256i v0 = _mm256_unpacklo_epi8(u0, u1);
        const __m256i v1 = _mm256_unpackhi_epi8(u0, u1);
        _mm256_storeu_si256((__m256i*)&out[i * 2], v0);
        _mm256_storeu_si256((__m256i*)&out[i * 2 + 32], v1);
    }
    if (i + 16 <= size) {
        const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpackhi_epi8(a0, a1);
        _mm_storeu_si128((__m128i*)&out[i * 2], u0);
        _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1);
        i += 16;
    }
    if (i + 8 <= size) {
        const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
        const __m128i u = _mm_unpacklo_epi8(a0, a1);
        _mm_storeu_si128((__m128i*)&out[i * 2], u);
    }
}
#endif

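// SSE2 variant of the elem_size == 2 interleave, 16 elements per iteration.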
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
NO_INLINE
ATTRIBUTE_TARGET("sse2")
static void bitshuf_untrans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);

    size_t i = 0;
    for (; i + 16 <= size; i += 16) {
        const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpackhi_epi8(a0, a1);
        _mm_storeu_si128((__m128i*)&out[i * 2], u0);
        _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1);
    }
    if (i + 8 <= size) {
        const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
        const __m128i u = _mm_unpacklo_epi8(a0, a1);
        _mm_storeu_si128((__m128i*)&out[i * 2], u);
    }
}
#endif

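// Dispatch for bitshuf_untrans_byte_2: prefer AVX2, then SSE2, falling back
// to the portable implementation when neither is available at run time.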
#if defined(__AVX2__)
#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_avx2
#elif BITSHUF_USE_IFUNC
IMPLEMENT_IFUNC(bitshuf_untrans_byte_2_ifunc,
                (char* restrict out, const char* restrict in, size_t size)) {
    __builtin_cpu_init();

    if (__builtin_cpu_supports("avx2"))
        return bitshuf_untrans_byte_2_avx2;
#if defined(__SSE2__)
    return bitshuf_untrans_byte_2_sse2;
#else
    if (__builtin_cpu_supports("sse2"))
        return bitshuf_untrans_byte_2_sse2;

    return bitshuf_untrans_byte_2;
#endif
}
#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_ifunc
#elif defined(__SSE2__)
#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_sse2
#endif

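// Inverse byte transpose for elem_size == 4 (AVX2): interleave the four byte
// planes into 32-bit elements; the inserti128/permute2x128 step restores the
// element order that the in-lane unpacks leave split across 128-bit lanes.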
#if defined(__AVX2__) || BITSHUF_USE_IFUNC
NO_INLINE
ATTRIBUTE_TARGET("avx2")
static void bitshuf_untrans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);

    size_t i = 0;
    for (; i + 32 <= size; i += 32) {
        const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]);
        const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]);
        const __m256i a2 = _mm256_loadu_si256((const __m256i*)&in[2 * size + i]);
        const __m256i a3 = _mm256_loadu_si256((const __m256i*)&in[3 * size + i]);
        __m256i u0 = _mm256_unpacklo_epi8(a0, a1);
        __m256i u1 = _mm256_unpackhi_epi8(a0, a1);
        __m256i u2 = _mm256_unpacklo_epi8(a2, a3);
        __m256i u3 = _mm256_unpackhi_epi8(a2, a3);
        const __m256i v0 = _mm256_unpacklo_epi16(u0, u2);
        const __m256i v1 = _mm256_unpackhi_epi16(u0, u2);
        const __m256i v2 = _mm256_unpacklo_epi16(u1, u3);
        const __m256i v3 = _mm256_unpackhi_epi16(u1, u3);
        u0 = _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1);
        u1 = _mm256_inserti128_si256(v2, _mm256_castsi256_si128(v3), 1);
        u2 = _mm256_permute2x128_si256(v0, v1, 0x31);
        u3 = _mm256_permute2x128_si256(v2, v3, 0x31);
        _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 0], u0);
        _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 1], u1);
        _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 2], u2);
        _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 3], u3);
    }
    if (i + 16 <= size) {
        const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpackhi_epi8(a0, a1);
        const __m128i u2 = _mm_unpacklo_epi8(a2, a3);
        const __m128i u3 = _mm_unpackhi_epi8(a2, a3);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u2);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u2);
        const __m128i v2 = _mm_unpacklo_epi16(u1, u3);
        const __m128i v3 = _mm_unpackhi_epi16(u1, u3);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3);
        i += 16;
    }
    if (i + 8 <= size) {
        const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpacklo_epi8(a2, a3);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
        _mm_storeu_si128((__m128i*)&out[i * 4], v0);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1);
    }
}
#endif

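// SSE2 variant of the elem_size == 4 interleave; no cross-lane fixup is
// needed at 128-bit width.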
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
NO_INLINE
ATTRIBUTE_TARGET("sse2")
static void bitshuf_untrans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);

    size_t i = 0;
    for (; i + 16 <= size; i += 16) {
        const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpackhi_epi8(a0, a1);
        const __m128i u2 = _mm_unpacklo_epi8(a2, a3);
        const __m128i u3 = _mm_unpackhi_epi8(a2, a3);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u2);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u2);
        const __m128i v2 = _mm_unpacklo_epi16(u1, u3);
        const __m128i v3 = _mm_unpackhi_epi16(u1, u3);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3);
    }
    if (i + 8 <= size) {
        const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
        const __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        const __m128i u1 = _mm_unpacklo_epi8(a2, a3);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
        _mm_storeu_si128((__m128i*)&out[i * 4], v0);
        _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1);
    }
}
#endif

#if defined(__AVX2__)
#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_avx2
#elif BITSHUF_USE_IFUNC
IMPLEMENT_IFUNC(bitshuf_untrans_byte_4_ifunc,
                (char* restrict out, const char* restrict in, size_t size)) {
    __builtin_cpu_init();

    if (__builtin_cpu_supports("avx2"))
        return bitshuf_untrans_byte_4_avx2;
#if defined(__SSE2__)
    return bitshuf_untrans_byte_4_sse2;
#else
    if (__builtin_cpu_supports("sse2"))
        return bitshuf_untrans_byte_4_sse2;

    return bitshuf_untrans_byte_4;
#endif
}
#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_ifunc
#elif defined(__SSE2__)
#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_sse2
#endif

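// Inverse byte transpose for elem_size == 8 (SSE2): transpose 8x16 byte tiles
// with three rounds of unpacks (8-, 16-, then 32-bit) so each store writes two
// complete 8-byte elements.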
#if defined(__SSE2__) || BITSHUF_USE_IFUNC
NO_INLINE
ATTRIBUTE_TARGET("sse2")
static void bitshuf_untrans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) {
    assert(size % 8 == 0);

    size_t i = 0;
    for (; i + 16 <= size; i += 16) {
        const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]);
        const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[4 * size + i]);
        const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[5 * size + i]);
        const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[6 * size + i]);
        const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[7 * size + i]);
        __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        __m128i u1 = _mm_unpackhi_epi8(a0, a1);
        __m128i u2 = _mm_unpacklo_epi8(a2, a3);
        __m128i u3 = _mm_unpackhi_epi8(a2, a3);
        __m128i u4 = _mm_unpacklo_epi8(a4, a5);
        __m128i u5 = _mm_unpackhi_epi8(a4, a5);
        __m128i u6 = _mm_unpacklo_epi8(a6, a7);
        __m128i u7 = _mm_unpackhi_epi8(a6, a7);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u2);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u2);
        const __m128i v2 = _mm_unpacklo_epi16(u1, u3);
        const __m128i v3 = _mm_unpackhi_epi16(u1, u3);
        const __m128i v4 = _mm_unpacklo_epi16(u4, u6);
        const __m128i v5 = _mm_unpackhi_epi16(u4, u6);
        const __m128i v6 = _mm_unpacklo_epi16(u5, u7);
        const __m128i v7 = _mm_unpackhi_epi16(u5, u7);
        u0 = _mm_unpacklo_epi32(v0, v4);
        u1 = _mm_unpackhi_epi32(v0, v4);
        u2 = _mm_unpacklo_epi32(v1, v5);
        u3 = _mm_unpackhi_epi32(v1, v5);
        u4 = _mm_unpacklo_epi32(v2, v6);
        u5 = _mm_unpackhi_epi32(v2, v6);
        u6 = _mm_unpacklo_epi32(v3, v7);
        u7 = _mm_unpackhi_epi32(v3, v7);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 4], u4);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 5], u5);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 6], u6);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 7], u7);
    }
    if (i + 8 <= size) {
        const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]);
        const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]);
        const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]);
        const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]);
        const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]);
        const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]);
        const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]);
        const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]);
        __m128i u0 = _mm_unpacklo_epi8(a0, a1);
        __m128i u1 = _mm_unpacklo_epi8(a2, a3);
        __m128i u2 = _mm_unpacklo_epi8(a4, a5);
        __m128i u3 = _mm_unpacklo_epi8(a6, a7);
        const __m128i v0 = _mm_unpacklo_epi16(u0, u1);
        const __m128i v1 = _mm_unpackhi_epi16(u0, u1);
        const __m128i v2 = _mm_unpacklo_epi16(u2, u3);
        const __m128i v3 = _mm_unpackhi_epi16(u2, u3);
        u0 = _mm_unpacklo_epi32(v0, v2);
        u1 = _mm_unpackhi_epi32(v0, v2);
        u2 = _mm_unpacklo_epi32(v1, v3);
        u3 = _mm_unpackhi_epi32(v1, v3);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2);
        _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3);
    }
}
#endif

#if defined(__SSE2__)
#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_sse2
#elif BITSHUF_USE_IFUNC
IMPLEMENT_IFUNC(bitshuf_untrans_byte_8_ifunc,
                (char* restrict out, const char* restrict in, size_t size)) {
    __builtin_cpu_init();

    if (__builtin_cpu_supports("sse2"))
        return bitshuf_untrans_byte_8_sse2;

    return bitshuf_untrans_byte_8;
}
#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_ifunc
#endif

#endif

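// Bitshuffle one block: transpose element bytes into byte planes (through
// scratch when elem_size > 1), then bit-transpose every plane into out.
// size must be a multiple of 8; returns 0 on success, -1 on invalid input.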
int bitshuf_encode_block(char* restrict out,
                         const char* restrict in,
                         char* restrict scratch,
                         size_t size,
                         size_t elem_size) {
    if (UNLIKELY(size & 7))
        return -1;

    if (elem_size == 1) {
        bitshuf_trans_bit(out, in, size);
    } else {
        if (UNLIKELY(!scratch && elem_size > 1))
            return -1;

        switch (elem_size) {
        case 2:
            bitshuf_trans_byte_2(scratch, in, size);
            break;
        case 4:
            bitshuf_trans_byte_4(scratch, in, size);
            break;
        case 8:
            bitshuf_trans_byte_8(scratch, in, size);
            break;
        default:
            bitshuf_trans_byte(scratch, in, size, elem_size);
            break;
        }
        for (size_t i = 0; i < elem_size; i++)
            bitshuf_trans_bit(&out[i * size], &scratch[i * size], size);
    }
    return 0;
}

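// Inverse of bitshuf_encode_block: bit-untranspose every byte plane (through
// scratch when elem_size > 1), then gather the planes back into whole
// elements. size must be a multiple of 8; returns 0 on success, -1 on invalid
// input.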
int bitshuf_decode_block(char* restrict out,
                         const char* restrict in,
                         char* restrict scratch,
                         size_t size,
                         size_t elem_size) {
    if (UNLIKELY(size & 7))
        return -1;

    if (elem_size == 1) {
        bitshuf_untrans_bit(out, in, size);
    } else {
        if (UNLIKELY(!scratch && elem_size > 1))
            return -1;

        for (size_t i = 0; i < elem_size; i++)
            bitshuf_untrans_bit(&scratch[i * size], &in[i * size], size);

        switch (elem_size) {
        case 2:
            bitshuf_untrans_byte_2(out, scratch, size);
            break;
        case 4:
            bitshuf_untrans_byte_4(out, scratch, size);
            break;
        case 8:
            bitshuf_untrans_byte_8(out, scratch, size);
            break;
        default:
            bitshuf_untrans_byte(out, scratch, size, elem_size);
            break;
        }
    }
    return 0;
}