From b5d0b34fc83ab8627b828eb035861fa8d9294b70 Mon Sep 17 00:00:00 2001
From: Filip Leonarski <filip.leonarski@psi.ch>
Date: Wed, 7 Jun 2023 20:49:35 +0200
Subject: [PATCH] JFConversion: Simplify (processing per module, not per
 packet) + add GPU conversion procedure - currently only synchronous

---
 common/FrameTransformation.cpp         |   2 +-
 jungfrau/CMakeLists.txt                |  14 ++-
 jungfrau/JFConversion.h                |   7 +-
 jungfrau/JFConversionFixedPoint.cpp    |  10 +-
 jungfrau/JFConversionFixedPoint.h      |   2 +-
 jungfrau/JFConversionFloatingPoint.cpp |  85 +++++++------
 jungfrau/JFConversionFloatingPoint.h   |   3 +-
 jungfrau/JFConversionGPU.cu            | 136 +++++++++++++++++++++
 jungfrau/JFConversionGPU.h             |  49 ++++++++
 jungfrau/JFConversionGPU_Alt.cpp       |  23 ++++
 jungfrau/ProcessJFPacket.cpp           |  28 +----
 jungfrau/ProcessJFPacket.h             |   2 -
 tests/JFConversionTest.cpp             | 129 ++++++++++++++++++--
 tests/ProcessRawPacketTest.cpp         |  38 ------
 tools/JFCalibrationPerfTest.cpp        | 157 ++-----------------------
 15 files changed, 404 insertions(+), 281 deletions(-)
 create mode 100644 jungfrau/JFConversionGPU.cu
 create mode 100644 jungfrau/JFConversionGPU.h
 create mode 100644 jungfrau/JFConversionGPU_Alt.cpp

diff --git a/common/FrameTransformation.cpp b/common/FrameTransformation.cpp
index ea24c2a4..14db8989 100644
--- a/common/FrameTransformation.cpp
+++ b/common/FrameTransformation.cpp
@@ -164,7 +164,7 @@ void FrameTransformation::ProcessModule(JFConversion &conv, const int16_t *input
         if (experiment.GetDetectorMode() != DetectorMode::Conversion)
             memcpy(output + RAW_MODULE_SIZE * module_number_abs, input, RAW_MODULE_SIZE * experiment.GetPixelDepth());
         else {
-            conv.Convert(conversion_buffer.data(), (uint16_t *) input);
+            conv.ConvertModule(conversion_buffer.data(), (uint16_t *) input);
             TransferModuleAdjustMultipixels(output, conversion_buffer.data(),
                                             experiment.GetModuleSlowDirectionStep(module_number_abs),
                                             static_cast<int16_t>(INT16_MIN),
diff --git a/jungfrau/CMakeLists.txt b/jungfrau/CMakeLists.txt
index 10e032ac..ee02541a 100644
--- a/jungfrau/CMakeLists.txt
+++ b/jungfrau/CMakeLists.txt
@@ -6,8 +6,18 @@ ADD_LIBRARY(JFCalibration STATIC
         JFModulePedestal.cpp JFModulePedestal.h
         JFModuleGainCalibration.cpp JFModuleGainCalibration.h
         JFPedestalCalc.cpp JFPedestalCalc.h
-        ProcessJFPacket.cpp ProcessJFPacket.h)
+        ProcessJFPacket.cpp ProcessJFPacket.h
+        JFConversionGPU.h JFConversionGPU_Alt.cpp)
 
 SET_SOURCE_FILES_PROPERTIES(JFPedestalCalc.cpp JFConversionFloatingPoint.cpp JFConversionFixedPoint.cpp PROPERTIES COMPILE_FLAGS -Ofast)
 
-TARGET_LINK_LIBRARIES(JFCalibration JFJochProtoBuf Compression)
\ No newline at end of file
+TARGET_LINK_LIBRARIES(JFCalibration JFJochProtoBuf Compression)
+
+IF (CMAKE_CUDA_COMPILER)
+    TARGET_SOURCES(JFCalibration PRIVATE JFConversionGPU.cu )
+
+    TARGET_COMPILE_DEFINITIONS(JFCalibration PUBLIC -DJFJOCH_USE_CUDA)
+
+    FIND_LIBRARY(CUDART_LIBRARY cudart_static PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED)
+    TARGET_LINK_LIBRARIES(JFCalibration ${CUDART_LIBRARY} ${CMAKE_DL_LIBS} rt)
+ENDIF()
diff --git a/jungfrau/JFConversion.h b/jungfrau/JFConversion.h
index c92ee854..65ced286 100644
--- a/jungfrau/JFConversion.h
+++ b/jungfrau/JFConversion.h
@@ -16,11 +16,8 @@ public:
                        const JFModulePedestal &pedestal_g2,
                        double energy) = 0;
 
-    virtual void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) = 0;
-    void Convert(int16_t* dest, const uint16_t* source) {
-        for (int i = 0; i < 128; i++)
-            ConvertPacket(dest + i * 4 * RAW_MODULE_COLS, source + i * 4 * RAW_MODULE_COLS, i);
-    }
+    virtual void ConvertModule(int16_t* dest, const uint16_t* source) = 0;
+    virtual void Sync() {};
 };
 
 #endif //JUNGFRAUJOCH_JFCONVERSION_H
diff --git a/jungfrau/JFConversionFixedPoint.cpp b/jungfrau/JFConversionFixedPoint.cpp
index 2078aed4..a81c59c9 100644
--- a/jungfrau/JFConversionFixedPoint.cpp
+++ b/jungfrau/JFConversionFixedPoint.cpp
@@ -100,7 +100,7 @@ void JFConversionFixedPoint::ConvertLine(int16_t *dest, const uint16_t *source,
     }
 }
 
-void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source, uint16_t packet_number) {
+void JFConversionFixedPoint::ConvertModule(int16_t *dest, const uint16_t *source) {
     auto gain_g0_aligned = std::assume_aligned<64>(gain_g0);
     auto gain_g1_aligned = std::assume_aligned<64>(gain_g1);
     auto gain_g2_aligned = std::assume_aligned<64>(gain_g2);
@@ -108,7 +108,7 @@ void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source
     auto pedestal_g1_aligned = std::assume_aligned<64>(pedestal_g1);
     auto pedestal_g2_aligned = std::assume_aligned<64>(pedestal_g2);
 
-    for (int i = 0; i < 4 * RAW_MODULE_COLS; i++) {
+    for (int i = 0; i < RAW_MODULE_SIZE; i++) {
         uint16_t gainbits = source[i] & 0xc000;
         int32_t adc = source[i] & 0x3fff;
         int32_t val = INT32_MIN;
@@ -116,17 +116,17 @@ void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source
         switch (gainbits) {
             case 0:
                 [[likely]]
-                        val = (adc - pedestal_g0_aligned[i + 4 * packet_number * RAW_MODULE_COLS]) * gain_g0_aligned[i + 4 * packet_number * RAW_MODULE_COLS];
+                        val = (adc - pedestal_g0_aligned[i]) * gain_g0_aligned[i];
                 break;
             case 0x4000:
                 if (source[i] != 0x4000)
-                    val = (adc - pedestal_g1_aligned[i + 4 * packet_number * RAW_MODULE_COLS]) * gain_g1_aligned[i + 4 * packet_number *  RAW_MODULE_COLS];
+                    val = (adc - pedestal_g1_aligned[i]) * gain_g1_aligned[i];
                 break;
             case 0xc000:
                 if (source[i] == 0xc000)
                     val = INT32_MAX;
                 else if (source[i] != 0xffff)
-                    val = (adc - pedestal_g2_aligned[i + 4 * packet_number *  RAW_MODULE_COLS]) * gain_g2_aligned[i + 4 * packet_number * RAW_MODULE_COLS];
+                    val = (adc - pedestal_g2_aligned[i]) * gain_g2_aligned[i];
                 break;
             default:
                 break;
diff --git a/jungfrau/JFConversionFixedPoint.h b/jungfrau/JFConversionFixedPoint.h
index 2a8cb713..b0cff7bf 100644
--- a/jungfrau/JFConversionFixedPoint.h
+++ b/jungfrau/JFConversionFixedPoint.h
@@ -26,7 +26,7 @@ public:
                const JFModulePedestal &pedestal_g1,
                const JFModulePedestal &pedestal_g2,
                double energy) override;
-    void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) override;
+    void ConvertModule(int16_t *dest, const uint16_t *source) override;
 };
 
 
diff --git a/jungfrau/JFConversionFloatingPoint.cpp b/jungfrau/JFConversionFloatingPoint.cpp
index f3d14868..af68c1dd 100644
--- a/jungfrau/JFConversionFloatingPoint.cpp
+++ b/jungfrau/JFConversionFloatingPoint.cpp
@@ -33,9 +33,46 @@ void JFConversionFloatingPoint::Setup(const JFModuleGainCalibration &gain_calibr
     }
 }
 
-void JFConversionFloatingPoint::ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number)  {
-    for (int i = 0; i < 4; i++)
-        ConvertLine(dest + i * RAW_MODULE_COLS, source  + i * RAW_MODULE_COLS, 4 * packet_number + i);
+void JFConversionFloatingPoint::ConvertModule(int16_t *dest, const uint16_t *source) {
+    for (size_t i = 0; i < RAW_MODULE_SIZE; i++) {
+        uint16_t gainbits = source[i] & 0xc000;
+        uint16_t adc = source[i] & 0x3fff;
+        int16_t pedestal_subtracted_adu;
+
+        float expected = PIXEL_OUT_LOST;
+
+        switch (gainbits) {
+            case 0:
+                [[likely]]
+                        pedestal_subtracted_adu = adc - pedestal_g0[i];
+                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g0[i];
+                break;
+            case 0x4000:
+                pedestal_subtracted_adu = adc - pedestal_g1[i];
+                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g1[i];
+                if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION;
+                break;
+            case 0xc000:
+                pedestal_subtracted_adu = adc - pedestal_g2[i];
+                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g2[i];
+                if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION;
+                else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF;
+                break;
+            default:
+                expected = PIXEL_OUT_GAINBIT_2;
+                break;
+        }
+
+        if (std::isinf(expected) || (expected == INT16_MIN))
+            dest[i] = PIXEL_OUT_LOST;
+        else if (expected > INT16_MAX)
+            dest[i] = INT16_MAX;
+        else if (expected >= 0.0)
+            [[likely]]
+                    dest[i] = static_cast<uint16_t>(expected + 0.5f);
+        else
+            dest[i] = static_cast<uint16_t>(expected - 0.5f);
+    }
 }
 
 void JFConversionFloatingPoint::ConvertFP(float *dest, const uint16_t *source) {
@@ -75,45 +112,3 @@ void JFConversionFloatingPoint::ConvertFP(float *dest, const uint16_t *source) {
         dest[i] = expected;
     }
 }
-
-void JFConversionFloatingPoint::ConvertLine(int16_t *dest, const uint16_t *source, int line) {
-    for (size_t i = 0; i < RAW_MODULE_COLS; i++) {
-        uint16_t gainbits = source[i] & 0xc000;
-        uint16_t adc = source[i] & 0x3fff;
-        int16_t pedestal_subtracted_adu;
-
-        float expected = PIXEL_OUT_LOST;
-
-        switch (gainbits) {
-            case 0:
-                [[likely]]
-                        pedestal_subtracted_adu = adc - pedestal_g0[i + line * RAW_MODULE_COLS];
-                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g0[i + line * RAW_MODULE_COLS];
-                break;
-            case 0x4000:
-                pedestal_subtracted_adu = adc - pedestal_g1[i + line * RAW_MODULE_COLS];
-                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g1[i + line * RAW_MODULE_COLS];
-                if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION;
-                break;
-            case 0xc000:
-                pedestal_subtracted_adu = adc - pedestal_g2[i + line * RAW_MODULE_COLS];
-                expected = static_cast<float>(pedestal_subtracted_adu) * gain_g2[i + line * RAW_MODULE_COLS];
-                if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION;
-                else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF;
-                break;
-            default:
-                expected = PIXEL_OUT_GAINBIT_2;
-                break;
-        }
-
-        if (std::isinf(expected) || (expected == INT16_MIN))
-            dest[i] = PIXEL_OUT_LOST;
-        else if (expected > INT16_MAX)
-            dest[i] = INT16_MAX;
-        else if (expected >= 0.0)
-            [[likely]]
-                    dest[i] = static_cast<uint16_t>(expected + 0.5f);
-        else
-            dest[i] = static_cast<uint16_t>(expected - 0.5f);
-    }
-}
\ No newline at end of file
diff --git a/jungfrau/JFConversionFloatingPoint.h b/jungfrau/JFConversionFloatingPoint.h
index e515591d..6f798919 100644
--- a/jungfrau/JFConversionFloatingPoint.h
+++ b/jungfrau/JFConversionFloatingPoint.h
@@ -14,7 +14,6 @@ class JFConversionFloatingPoint : public JFConversion {
     std::vector<float> gain_g0;
     std::vector<float> gain_g1;
     std::vector<float> gain_g2;
-    void ConvertLine(int16_t *dest, const uint16_t *source, int line);
 public:
     JFConversionFloatingPoint();
 
@@ -23,7 +22,7 @@ public:
                const JFModulePedestal &pedestal_g1,
                const JFModulePedestal &pedestal_g2,
                double energy) override;
-    void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) override;
+    void ConvertModule(int16_t *dest, const uint16_t *source) override;
     void ConvertFP(float *dest, const uint16_t *source);
 };
 
diff --git a/jungfrau/JFConversionGPU.cu b/jungfrau/JFConversionGPU.cu
new file mode 100644
index 00000000..c8980fb9
--- /dev/null
+++ b/jungfrau/JFConversionGPU.cu
@@ -0,0 +1,136 @@
+// Copyright (2019-2023) Paul Scherrer Institute
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "JFConversionGPU.h"
+#include "../common/JFJochException.h"
+
+inline void cuda_err(cudaError_t val) {
+    if (val != cudaSuccess)
+        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
+}
+
+struct CudaStreamWrapper {
+    cudaStream_t v;
+};
+
+inline float one_over_gain_energy(double gain_factor, double energy) {
+    double tmp = gain_factor * energy;
+    if (!std::isfinite(tmp) || (tmp == 0.0))
+        return std::numeric_limits<float>::infinity();
+    else
+        return static_cast<float>(1.0 / (gain_factor * energy));
+}
+
+__global__ void gpu_jf_convert(int16_t *output, const uint16_t* input,
+                               const uint16_t *pedestal_g0,
+                               const uint16_t *pedestal_g1,
+                               const uint16_t *pedestal_g2,
+                               const float *gain_g0,
+                               const float *gain_g1,
+                               const float *gain_g2) {
+    uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
+
+    uint16_t gainbits = input[idx] & 0xc000;
+    uint16_t adc = input[idx] & 0x3fff;
+    int16_t pedestal_subtracted_adu;
+
+    float expected = PIXEL_OUT_LOST;
+
+    switch (gainbits) {
+        case 0:
+            pedestal_subtracted_adu = adc - pedestal_g0[idx];
+            expected = static_cast<float>(pedestal_subtracted_adu) * gain_g0[idx];
+            break;
+        case 0x4000:
+            pedestal_subtracted_adu = adc - pedestal_g1[idx];
+            expected = static_cast<float>(pedestal_subtracted_adu) * gain_g1[idx];
+            if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION;
+            break;
+        case 0xc000:
+            pedestal_subtracted_adu = adc - pedestal_g2[idx];
+            expected = static_cast<float>(pedestal_subtracted_adu) * gain_g2[idx];
+            if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION;
+            else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF;
+            break;
+        default:
+            expected = PIXEL_OUT_GAINBIT_2;
+            break;
+    }
+
+    output[idx] = std::round(expected);
+
+    if (expected <= INT16_MIN)
+        output[idx] = PIXEL_OUT_LOST;
+    else if (expected >= INT16_MAX)
+        output[idx] = INT16_MAX;
+}
+
+JFConversionGPU::JFConversionGPU() {
+    cudastream = new(CudaStreamWrapper);
+
+    cuda_err(cudaStreamCreate(&cudastream->v));
+    cuda_err(cudaMalloc(&gpu_pedestal_g0, RAW_MODULE_SIZE * sizeof(uint16_t)));
+    cuda_err(cudaMalloc(&gpu_pedestal_g1, RAW_MODULE_SIZE * sizeof(uint16_t)));
+    cuda_err(cudaMalloc(&gpu_pedestal_g2, RAW_MODULE_SIZE * sizeof(uint16_t)));
+
+    cuda_err(cudaMalloc(&gpu_gain_g0, RAW_MODULE_SIZE * sizeof(float)));
+    cuda_err(cudaMalloc(&gpu_gain_g1, RAW_MODULE_SIZE * sizeof(float)));
+    cuda_err(cudaMalloc(&gpu_gain_g2, RAW_MODULE_SIZE * sizeof(float)));
+
+    cuda_err(cudaMallocHost(&host_gain_g0, RAW_MODULE_SIZE * sizeof(float)));
+    cuda_err(cudaMallocHost(&host_gain_g1, RAW_MODULE_SIZE * sizeof(float)));
+    cuda_err(cudaMallocHost(&host_gain_g2, RAW_MODULE_SIZE * sizeof(float)));
+
+    cuda_err(cudaMalloc(&gpu_input, RAW_MODULE_SIZE * sizeof(uint16_t)));
+    cuda_err(cudaMalloc(&gpu_output, RAW_MODULE_SIZE * sizeof(int16_t)));
+}
+
+JFConversionGPU::~JFConversionGPU() {
+    cudaStreamSynchronize(cudastream->v);
+    cudaStreamDestroy(cudastream->v);
+    delete cudastream;
+
+    cudaFree(gpu_pedestal_g0);
+    cudaFree(gpu_pedestal_g1);
+    cudaFree(gpu_pedestal_g2);
+
+    cudaFree(gpu_gain_g0);
+    cudaFree(gpu_gain_g1);
+    cudaFree(gpu_gain_g2);
+
+    cudaFreeHost(host_gain_g0);
+    cudaFreeHost(host_gain_g1);
+    cudaFreeHost(host_gain_g2);
+
+    cudaFree(gpu_input);
+    cudaFree(gpu_output);
+}
+
+void JFConversionGPU::Setup(const JFModuleGainCalibration &gain_calibration, const JFModulePedestal &pedestal_g0,
+                            const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) {
+    auto &gain_arr = gain_calibration.GetGainCalibration();
+
+    for (int i = 0; i < RAW_MODULE_SIZE; i++) {
+        host_gain_g0[i] = one_over_gain_energy(gain_arr[i], energy);
+        host_gain_g1[i] = one_over_gain_energy(gain_arr[i + RAW_MODULE_SIZE], energy);
+        host_gain_g2[i] = one_over_gain_energy(gain_arr[i + 2 * RAW_MODULE_SIZE], energy);
+    }
+    cudaMemcpy(gpu_pedestal_g0, pedestal_g0.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_pedestal_g1, pedestal_g1.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_pedestal_g2, pedestal_g2.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_gain_g0, host_gain_g0, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_gain_g1, host_gain_g1, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_gain_g2, host_gain_g2, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
+}
+
+void JFConversionGPU::ConvertModule(int16_t *dest, const uint16_t *source) {
+    cudaMemcpy(gpu_input, source, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
+    gpu_jf_convert<<<RAW_MODULE_SIZE/128, 128>>>(gpu_output, gpu_input,
+                                                         gpu_pedestal_g0, gpu_pedestal_g1, gpu_pedestal_g2,
+                                                         gpu_gain_g0, gpu_gain_g1, gpu_gain_g2);
+    cudaMemcpy(dest, gpu_output, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyDeviceToHost);
+}
+
+void JFConversionGPU::Sync() {
+    cudaStreamSynchronize(cudastream->v);
+}
\ No newline at end of file
diff --git a/jungfrau/JFConversionGPU.h b/jungfrau/JFConversionGPU.h
new file mode 100644
index 00000000..9e5d4eb1
--- /dev/null
+++ b/jungfrau/JFConversionGPU.h
@@ -0,0 +1,49 @@
+// Copyright (2019-2023) Paul Scherrer Institute
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef JUNGFRAUJOCH_JFCONVERSIONGPU_CUH
+#define JUNGFRAUJOCH_JFCONVERSIONGPU_CUH
+
+#include "JFConversion.h"
+#include "JFConversionFixedPoint.h"
+
+struct CudaStreamWrapper;
+
+class JFConversionGPU : public JFConversion {
+#ifdef JFJOCH_USE_CUDA
+    CudaStreamWrapper *cudastream = nullptr;
+
+    uint16_t *gpu_pedestal_g0 = nullptr;
+    uint16_t *gpu_pedestal_g1 = nullptr;
+    uint16_t *gpu_pedestal_g2 = nullptr;
+    float *gpu_gain_g0 = nullptr;
+    float *gpu_gain_g1 = nullptr;
+    float *gpu_gain_g2 = nullptr;
+
+    float *host_gain_g0 = nullptr;
+    float *host_gain_g1 = nullptr;
+    float *host_gain_g2 = nullptr;
+
+
+    uint16_t *gpu_input = nullptr;
+    int16_t *gpu_output = nullptr;
+#else
+    JFConversionFixedPoint alt_conv;
+#endif
+public:
+    JFConversionGPU();
+    JFConversionGPU(JFConversionGPU& other) = delete;
+    JFConversionGPU& operator=(JFConversionGPU& other) = delete;
+    ~JFConversionGPU();
+    void Setup(const JFModuleGainCalibration &gain_calibration,
+                       const JFModulePedestal &pedestal_g0,
+                       const JFModulePedestal &pedestal_g1,
+                       const JFModulePedestal &pedestal_g2,
+                       double energy) override;
+
+    void ConvertModule(int16_t *dest, const uint16_t *source) override;
+    void Sync() override;
+};
+
+
+#endif //JUNGFRAUJOCH_JFCONVERSIONGPU_CUH
diff --git a/jungfrau/JFConversionGPU_Alt.cpp b/jungfrau/JFConversionGPU_Alt.cpp
new file mode 100644
index 00000000..0a0af1bb
--- /dev/null
+++ b/jungfrau/JFConversionGPU_Alt.cpp
@@ -0,0 +1,23 @@
+// Copyright (2019-2023) Paul Scherrer Institute
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "JFConversionGPU.h"
+
+#ifndef JFJOCH_USE_CUDA
+
+JFConversionGPU::JFConversionGPU() {}
+
+JFConversionGPU::~JFConversionGPU() {}
+
+void JFConversionGPU::Setup(const JFModuleGainCalibration &gain_calibration, const JFModulePedestal &pedestal_g0,
+                            const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) {
+    alt_conv.Setup(gain_calibration, pedestal_g0, pedestal_g1, pedestal_g2, energy);
+}
+
+void JFConversionGPU::ConvertPacket(int16_t *dest, const uint16_t *source, uint16_t packet_number) {
+    alt_conv.ConvertPacket(dest, source, packet_number);
+}
+
+JFConversionGPU::Sync() {}
+
+#endif
diff --git a/jungfrau/ProcessJFPacket.cpp b/jungfrau/ProcessJFPacket.cpp
index 16c24d49..21655801 100644
--- a/jungfrau/ProcessJFPacket.cpp
+++ b/jungfrau/ProcessJFPacket.cpp
@@ -13,8 +13,7 @@ ProcessJFPacket::ProcessJFPacket(ThreadSafeFIFO<Completion> &in_c, ThreadSafeFIF
         : m(2 * nmodules),
           c_fifo(in_c),
           wr_fifo(in_wr),
-          module_info(2 * nmodules),
-          conv(nmodules)
+          module_info(2 * nmodules)
 {
     for (auto &i: module_info)
         i.c.frame_number = UINT64_MAX;
@@ -68,10 +67,7 @@ void ProcessJFPacket::ProcessPacket(jf_udp_payload *datagram) {
         module_info[module_info_location].c.packet_mask[packetnum >= 64 ? 1 : 0] |= (1LU << (packetnum % 64));
 
         uint16_t* dst = module_info[module_info_location].ptr + 4096 * packetnum;
-        if (conv[module_number])
-            conv[module_number]->ConvertPacket((int16_t *) dst, datagram->data, packetnum);
-        else
-            memcpy(dst, datagram->data, 4096 * sizeof(uint16_t));
+        memcpy(dst, datagram->data, 4096 * sizeof(uint16_t));
     }
     packet_counter++;
 }
@@ -79,23 +75,3 @@ void ProcessJFPacket::ProcessPacket(jf_udp_payload *datagram) {
 uint64_t ProcessJFPacket::GetCounter() {
     return packet_counter;
 }
-
-void ProcessJFPacket::RegisterConversion(const DiffractionExperiment &experiment,
-                                         const JFCalibration &calib,
-                                         uint16_t data_stream) {
-    if (data_stream >= experiment.GetDataStreamsNum())
-        throw JFJochException(JFJochExceptionCategory::ArrayOutOfBounds, "Data stream not found");
-
-    if (conv.size() != experiment.GetModulesNum(data_stream))
-        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Wrong module count");
-
-    auto module0 = experiment.GetFirstModuleOfDataStream(data_stream);
-    for (int i = 0; i < experiment.GetModulesNum(data_stream); i++) {
-        conv[i] = std::make_unique<JFConversionFixedPoint>();
-        conv[i]->Setup(calib.GainCalibration(module0 + i),
-                       calib.Pedestal(module0 + i, 0, 0),
-                       calib.Pedestal(module0 + i, 1, 0),
-                       calib.Pedestal(module0 + i, 2, 0),
-                       experiment.GetPhotonEnergy_keV());
-    }
-}
diff --git a/jungfrau/ProcessJFPacket.h b/jungfrau/ProcessJFPacket.h
index 7f428f23..8d4021d7 100644
--- a/jungfrau/ProcessJFPacket.h
+++ b/jungfrau/ProcessJFPacket.h
@@ -24,11 +24,9 @@ class ProcessJFPacket {
     ThreadSafeFIFO<WorkRequest> &wr_fifo;
     std::vector<ModuleInfo> module_info;
     std::atomic<uint64_t> packet_counter = 0;
-    std::vector<std::unique_ptr<JFConversion> > conv;
 public:
     ProcessJFPacket(ThreadSafeFIFO<Completion> &c, ThreadSafeFIFO<WorkRequest> &wr, uint32_t nmodules);
     ~ProcessJFPacket();
-    void RegisterConversion(const DiffractionExperiment& experiment, const JFCalibration& calib, uint16_t data_stream);
     void ProcessPacket(jf_udp_payload *datagram);
     uint64_t GetCounter();
 };
diff --git a/tests/JFConversionTest.cpp b/tests/JFConversionTest.cpp
index 72da2ab1..6b448578 100644
--- a/tests/JFConversionTest.cpp
+++ b/tests/JFConversionTest.cpp
@@ -5,6 +5,7 @@
 
 #include "../tests/FPGAUnitTest.h"
 #include "../jungfrau/JFConversionFloatingPoint.h"
+#include "../jungfrau/JFConversionGPU.h"
 #include "../jungfrau/JFConversionFixedPoint.h"
 
 void SetupPedestal( JFModulePedestal &pedestal_g0, JFModulePedestal &pedestal_g1, JFModulePedestal &pedestal_g2) {
@@ -41,7 +42,7 @@ TEST_CASE("JFConversionFloatingPoint_G0","[JFConversion]") {
         input[i] = i % 16384;
 
     for (int i = 0; i < 128; i++)
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
     conv.ConvertFP(output_fp.data(), input.data());
 
     auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -76,7 +77,7 @@ TEST_CASE("JFConversionFixedPoint_G0","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = i % 16384;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -88,6 +89,42 @@ TEST_CASE("JFConversionFixedPoint_G0","[JFConversion]") {
     }
 }
 
+TEST_CASE("JFConversionGPU_G0","[JFConversion]") {
+    JFConversionGPU conv;
+    JFConversionFloatingPoint conv_fp;
+
+    JFModulePedestal pedestal_g0;
+    JFModulePedestal pedestal_g1;
+    JFModulePedestal pedestal_g2;
+
+    SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2);
+
+    JFModuleGainCalibration gain;
+
+    std::vector<double> energy{4.0, 6.0, 12.4, 25.0};
+    for (auto &e: energy) {
+        conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+        conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+
+        std::vector<uint16_t> input(RAW_MODULE_SIZE);
+        std::vector<float> output_fp(RAW_MODULE_SIZE);
+        std::vector<int16_t> output_16bit(RAW_MODULE_SIZE);
+
+        for (int i = 0; i < RAW_MODULE_SIZE; i++)
+            input[i] = i % 16384;
+
+        conv.ConvertModule(output_16bit.data(), input.data());
+        conv_fp.ConvertFP(output_fp.data(), input.data());
+        conv.Sync();
+
+        auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+        auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+        std::cout << "Error on conversion " << err << " max error " << max_err << std::endl;
+        REQUIRE(err < 0.5);
+        REQUIRE(max_err <= 1.0);
+    }
+}
+
 TEST_CASE("JFConversionFixedPoint_G0_TestFile","[JFConversion]") {
     JFConversionFixedPoint conv;
     JFConversionFloatingPoint conv_fp;
@@ -112,7 +149,7 @@ TEST_CASE("JFConversionFixedPoint_G0_TestFile","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = i % 16384;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -144,7 +181,7 @@ TEST_CASE("JFConversionFloatingPoint_G1","[JFConversion]") {
     for (int i = 0; i < RAW_MODULE_SIZE; i++)
         input[i] = (i % 16384) | 0x4000;
 
-    conv.Convert(output_16bit.data(), input.data());
+    conv.ConvertModule(output_16bit.data(), input.data());
     conv.ConvertFP(output_fp.data(), input.data());
 
     auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -179,7 +216,7 @@ TEST_CASE("JFConversionFixedPoint_G1","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = (i % 16384) | 0x4000;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -191,6 +228,43 @@ TEST_CASE("JFConversionFixedPoint_G1","[JFConversion]") {
     }
 }
 
+TEST_CASE("JFConversionGPU_G1","[JFConversion]") {
+    JFConversionGPU conv;
+    JFConversionFloatingPoint conv_fp;
+
+    JFModulePedestal pedestal_g0;
+    JFModulePedestal pedestal_g1;
+    JFModulePedestal pedestal_g2;
+
+    SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2);
+
+    JFModuleGainCalibration gain;
+
+    std::vector<double> energy{4.0, 6.0, 12.4, 25.0};
+    for (auto &e: energy) {
+        conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+        conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+
+        std::vector<uint16_t> input(RAW_MODULE_SIZE);
+        std::vector<float> output_fp(RAW_MODULE_SIZE);
+        std::vector<int16_t> output_16bit(RAW_MODULE_SIZE);
+
+        for (int i = 0; i < RAW_MODULE_SIZE; i++)
+            input[i] = (i % 16384) | 0x4000;
+
+        conv.ConvertModule(output_16bit.data(), input.data());
+        conv_fp.ConvertFP(output_fp.data(), input.data());
+        conv.Sync();
+
+        auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+        auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+
+        std::cout << "Error on conversion " << err << " max error " << max_err << std::endl;
+        REQUIRE(err < 0.5);
+        REQUIRE(max_err <= 1.0);
+    }
+}
+
 TEST_CASE("JFConversionFloatingPoint_G2","[JFConversion]") {
     JFConversionFloatingPoint conv;
 
@@ -211,7 +285,7 @@ TEST_CASE("JFConversionFloatingPoint_G2","[JFConversion]") {
     for (int i = 0; i < RAW_MODULE_SIZE; i++)
         input[i] = (i % 16384) | 0xC000;
 
-    conv.Convert(output_16bit.data(), input.data());
+    conv.ConvertModule(output_16bit.data(), input.data());
     conv.ConvertFP(output_fp.data(), input.data());
 
     auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -246,7 +320,7 @@ TEST_CASE("JFConversionFixedPoint_G2","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = (i % 16384) | 0xC000;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -258,6 +332,43 @@ TEST_CASE("JFConversionFixedPoint_G2","[JFConversion]") {
     }
 }
 
+TEST_CASE("JFConversionGPU_G2","[JFConversion]") {
+    JFConversionGPU conv;
+    JFConversionFloatingPoint conv_fp;
+
+    JFModulePedestal pedestal_g0;
+    JFModulePedestal pedestal_g1;
+    JFModulePedestal pedestal_g2;
+
+    SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2);
+
+    JFModuleGainCalibration gain;
+
+    std::vector<double> energy{4.0, 6.0, 12.4, 25.0};
+    for (auto &e: energy) {
+        conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+        conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e);
+
+        std::vector<uint16_t> input(RAW_MODULE_SIZE);
+        std::vector<float> output_fp(RAW_MODULE_SIZE);
+        std::vector<int16_t> output_16bit(RAW_MODULE_SIZE);
+
+        for (int i = 0; i < RAW_MODULE_SIZE; i++)
+            input[i] = (i % 16384) | 0xC000;
+
+        conv.ConvertModule(output_16bit.data(), input.data());
+        conv_fp.ConvertFP(output_fp.data(), input.data());
+        conv.Sync();
+
+        auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+        auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
+
+        std::cout << "Error on conversion " << err << " max error " << max_err << std::endl;
+        REQUIRE(err < 0.5);
+        REQUIRE(max_err <= 1.0);
+    }
+}
+
 TEST_CASE("JFConversionFixedPoint_G1_TestFile","[JFConversion]") {
     JFConversionFixedPoint conv;
     JFConversionFloatingPoint conv_fp;
@@ -282,7 +393,7 @@ TEST_CASE("JFConversionFixedPoint_G1_TestFile","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = (i % 16384) | 0x4000;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
@@ -318,7 +429,7 @@ TEST_CASE("JFConversionFixedPoint_G2_TestFile","[JFConversion]") {
         for (int i = 0; i < RAW_MODULE_SIZE; i++)
             input[i] = (i % 16384) | 0xC000;
 
-        conv.Convert(output_16bit.data(), input.data());
+        conv.ConvertModule(output_16bit.data(), input.data());
         conv_fp.ConvertFP(output_fp.data(), input.data());
 
         auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE);
diff --git a/tests/ProcessRawPacketTest.cpp b/tests/ProcessRawPacketTest.cpp
index e23593aa..16f415c2 100644
--- a/tests/ProcessRawPacketTest.cpp
+++ b/tests/ProcessRawPacketTest.cpp
@@ -92,41 +92,3 @@ TEST_CASE("ProcessRawPacketTest") {
     CHECK(array_1[4096*(36+64)] == 6345);
     CHECK(array_2[4096*(16+64)] == 6346);
 }
-
-TEST_CASE("ProcessRawPacketTest_Conversion") {
-    ThreadSafeFIFO<Completion> c_fifo;
-    ThreadSafeFIFO<WorkRequest> wr_fifo;
-
-    std::vector<uint16_t> array_0(RAW_MODULE_SIZE);
-
-    wr_fifo.Put(WorkRequest{.ptr = array_0.data(), .handle = 0});
-
-    DiffractionExperiment experiment(DetectorGeometry(8));
-    experiment.DataStreams(2).PhotonEnergy_keV(2);
-    JFCalibration calibration(experiment);
-    calibration.Pedestal(6, 0, 0).GetPedestal()[4096*36] = 300;
-    calibration.Pedestal(6, 1, 0).GetPedestal()[4096*36+1] = 15000;
-    calibration.Pedestal(6, 2, 0).GetPedestal()[4096*36+2] = 14000;
-
-    {
-        ProcessJFPacket process(c_fifo, wr_fifo, 4);
-        REQUIRE_NOTHROW(process.RegisterConversion(experiment, calibration, 1));
-        jf_udp_payload datagram;
-
-        datagram.packetnum = 36;
-        datagram.framenum = 2;
-        datagram.bunchid = 84;
-        datagram.data[0] = 600;
-        datagram.data[1] = 16384 | 4000;
-        datagram.data[2] = 32768 | 16384 | 9000;
-
-        datagram.xCoord = 4;
-        process.ProcessPacket(&datagram);
-
-        REQUIRE(process.GetCounter() == 1);
-    }
-
-    CHECK(array_0[4096*36] == std::round((600 - 300) / (DEFAULT_G0_FACTOR*2)));
-    CHECK(array_0[4096*36+1] == std::round((4000 - 15000) / (DEFAULT_G1_FACTOR*2)));
-    CHECK(array_0[4096*36+2] == std::round((9000 - 14000) / (DEFAULT_G2_FACTOR*2)));
-}
\ No newline at end of file
diff --git a/tools/JFCalibrationPerfTest.cpp b/tools/JFCalibrationPerfTest.cpp
index 09614f06..6ea3fe7c 100644
--- a/tools/JFCalibrationPerfTest.cpp
+++ b/tools/JFCalibrationPerfTest.cpp
@@ -6,11 +6,14 @@
 #include <iomanip>
 #include <future>
 
+#include <numa.h>
+
 #include "../jungfrau/JFPedestalCalc.h"
 #include "../common/Logger.h"
 #include "../jungfrau/JFCalibration.h"
 #include "../jungfrau/JFConversionFloatingPoint.h"
 #include "../jungfrau/JFConversionFixedPoint.h"
+#include "../jungfrau/JFConversionGPU.h"
 #include "../tests/FPGAUnitTest.h"
 #include "../jungfrau/jf_packet.h"
 #include "../jungfrau/ProcessJFPacket.h"
@@ -82,16 +85,19 @@ template <class T> void test_conversion(Logger &logger) {
 
     x.Mode(DetectorMode::Conversion);
 
-    logger.Info("JF FP conversion input prepared");
     auto start_time = std::chrono::system_clock::now();
     for (int z = 0; z < ntries; z++) {
         for (int i = 0; i < nframes; i++) {
             for (int m = 0; m < nmodules; m++) {
-                v[m].Convert(output.data() + (i * nmodules + m) * RAW_MODULE_SIZE,
+                v[m].ConvertModule(output.data() + (i * nmodules + m) * RAW_MODULE_SIZE,
                              input.data() + (i * nmodules + m) * RAW_MODULE_SIZE);
             }
         }
     }
+
+    for (int m = 0; m < nmodules; m++)
+        v[m].Sync();
+
     auto end_time = std::chrono::system_clock::now();
     auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
 
@@ -143,133 +149,6 @@ void test_packet_processing(Logger &logger) {
                 ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024));
 }
 
-void test_packet_processing_with_conversion(Logger &logger) {
-    size_t nframes = 128;
-    int64_t nmodules = 8;
-    int64_t ntries = 8;
-
-    std::vector<jf_raw_packet> packets(nframes * nmodules * 128);
-    std::vector<uint16_t> output(nframes * nmodules * CONVERTED_MODULE_SIZE);
-
-    std::vector<uint16_t> input(RAW_MODULE_SIZE);
-    std::string image_path = "../../tests/test_data/mod5_raw0.bin";
-    LoadBinaryFile(image_path, input.data(), RAW_MODULE_SIZE);
-
-    for (int frame = 0; frame < nframes; frame++) {
-        for (int m = 0; m < nmodules; m++) {
-            for (int p = 0; p < 128; p++) {
-                packets.at((frame * 128 + p) * nmodules + m).jf.xCoord = (m * 2);
-                packets.at((frame * 128 + p) * nmodules + m).jf.packetnum = p;
-                packets.at((frame * 128 + p) * nmodules + m).jf.framenum = frame + 1;
-                memcpy(packets.at((frame * 128 + p) * nmodules + m).jf.data, input.data() + 4096 * p, 4096 * sizeof(uint16_t));
-            }
-        }
-    }
-
-    DiffractionExperiment experiment((DetectorGeometry(nmodules)));
-    JFCalibration calib(experiment);
-    experiment.PhotonEnergy_keV(12.4).DataStreams(1);
-
-    for (int m = 0; m < nmodules; m++) {
-        calib.GainCalibration(m) = GainCalibrationFromTestFile();
-
-        for (int i = 0; i < RAW_MODULE_SIZE; i++) {
-            calib.Pedestal(m, 0, 0).GetPedestal()[i] = 3000 + i % 50 + m * 135;
-            calib.Pedestal(m, 1, 0).GetPedestal()[i] = 15000 + i % 50 - m * 135;
-            calib.Pedestal(m, 2, 0).GetPedestal()[i] = 14000 + i % 50 - m * 135;
-        }
-    }
-
-    auto start_time = std::chrono::system_clock::now();
-    for (int z = 0; z < ntries; z++) {
-        ThreadSafeFIFO<Completion> c;
-        ThreadSafeFIFO<WorkRequest> wr;
-        ProcessJFPacket process(c, wr, nmodules);
-        process.RegisterConversion(experiment, calib, 0);
-
-        for (uint32_t i = 0; i < nmodules * nframes; i++)
-            wr.Put(WorkRequest{
-                    .ptr = output.data() + i * RAW_MODULE_SIZE,
-                    .handle = i
-            });
-        for (auto &packet: packets)
-            process.ProcessPacket(&packet.jf);
-    }
-    auto end_time = std::chrono::system_clock::now();
-    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
-
-    logger.Info("Packet analysis with conversion performance: {:5d} us/module {:5.2f} GB/s", std::lround(elapsed.count() / ((double) (ntries * nframes * nmodules))),
-                ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024));
-}
-
-void test_packet_processing_with_conversion_parallel(Logger &logger, int64_t nmodules) {
-    size_t nframes = 128;
-    int64_t ntries = 8;
-
-    std::vector<jf_raw_packet> packets(nframes * nmodules * 128);
-    std::vector<uint16_t> output(nframes * nmodules * CONVERTED_MODULE_SIZE);
-
-    std::vector<uint16_t> input(RAW_MODULE_SIZE);
-    std::string image_path = "../../tests/test_data/mod5_raw0.bin";
-    LoadBinaryFile(image_path, input.data(), RAW_MODULE_SIZE);
-
-    for (int frame = 0; frame < nframes; frame++) {
-        for (int m = 0; m < nmodules; m++) {
-            for (int p = 0; p < 128; p++) {
-                packets.at((frame * 128 + p) * nmodules + m).jf.xCoord = (m * 2);
-                packets.at((frame * 128 + p) * nmodules + m).jf.packetnum = p;
-                packets.at((frame * 128 + p) * nmodules + m).jf.framenum = frame + 1;
-                memcpy(packets.at((frame * 128 + p) * nmodules + m).jf.data, input.data() + 4096 * p, 4096 * sizeof(uint16_t));
-            }
-        }
-    }
-
-    DiffractionExperiment experiment((DetectorGeometry(nmodules)));
-    JFCalibration calib(experiment);
-    experiment.PhotonEnergy_keV(12.4).DataStreams(1);
-
-    for (int m = 0; m < nmodules; m++) {
-        calib.GainCalibration(m) = GainCalibrationFromTestFile();
-
-        for (int i = 0; i < RAW_MODULE_SIZE; i++) {
-            calib.Pedestal(m, 0, 0).GetPedestal()[i] = 3000 + i % 50 + m * 135;
-            calib.Pedestal(m, 1, 0).GetPedestal()[i] = 15000 + i % 50 - m * 135;
-            calib.Pedestal(m, 2, 0).GetPedestal()[i] = 14000 + i % 50 - m * 135;
-        }
-    }
-
-    auto start_time = std::chrono::system_clock::now();
-    for (int z = 0; z < ntries; z++) {
-        ThreadSafeFIFO<Completion> c;
-        ThreadSafeFIFO<WorkRequest> wr;
-        ProcessJFPacket process(c, wr, nmodules);
-        process.RegisterConversion(experiment, calib, 0);
-
-        for (uint32_t i = 0; i < nmodules * nframes; i++)
-            wr.Put(WorkRequest{
-                    .ptr = output.data() + i * RAW_MODULE_SIZE,
-                    .handle = i
-            });
-
-        std::vector<std::future<void>> f;
-        for (int m = 0; m < nmodules; m++) {
-            f.emplace_back(std::async(std::launch::async, [&] (int x) {
-                for (int frame = 0; frame < nframes; frame++) {
-                    for (int i = 0; i < 128; i++)
-                        process.ProcessPacket(&packets[(frame * 128 + i) * nmodules + x].jf);
-                }
-            }, m));
-        }
-        for (auto &i: f)
-            i.get();
-    }
-    auto end_time = std::chrono::system_clock::now();
-    auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
-
-    logger.Info("Packet analysis with conversion performance: {:5d} us/module {:5.2f} GB/s", std::lround(elapsed.count() / ((double) (ntries * nframes * nmodules))),
-                ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024));
-}
-
 int main () {
     Logger logger("JFCalibrationPerfTest");
     test_pedestal(logger);
@@ -280,21 +159,9 @@ int main () {
     logger.Info("Fixed point conversion");
     test_conversion<JFConversionFixedPoint>(logger);
 
-    logger.Info("Packet processing without conversion");
+    logger.Info("Floating point conversion (GPU)");
+    test_conversion<JFConversionGPU>(logger);
+
+    logger.Info("Packet processing");
     test_packet_processing(logger);
-
-    logger.Info("Packet processing with conversion");
-    test_packet_processing_with_conversion(logger);
-
-    logger.Info("Packet processing with conversion (4 threads)");
-    test_packet_processing_with_conversion_parallel(logger, 4);
-
-    logger.Info("Packet processing with conversion (8 threads)");
-    test_packet_processing_with_conversion_parallel(logger, 8);
-
-    logger.Info("Packet processing with conversion (16 threads)");
-    test_packet_processing_with_conversion_parallel(logger, 16);
-
-    logger.Info("Packet processing with conversion (32 threads)");
-    test_packet_processing_with_conversion_parallel(logger, 32);
 }