From b5d0b34fc83ab8627b828eb035861fa8d9294b70 Mon Sep 17 00:00:00 2001 From: Filip Leonarski Date: Wed, 7 Jun 2023 20:49:35 +0200 Subject: [PATCH] JFConversion: Simplify (processing per module, not per packet) + add GPU conversion procedure - currently only synchronous --- common/FrameTransformation.cpp | 2 +- jungfrau/CMakeLists.txt | 14 ++- jungfrau/JFConversion.h | 7 +- jungfrau/JFConversionFixedPoint.cpp | 10 +- jungfrau/JFConversionFixedPoint.h | 2 +- jungfrau/JFConversionFloatingPoint.cpp | 85 +++++++------ jungfrau/JFConversionFloatingPoint.h | 3 +- jungfrau/JFConversionGPU.cu | 136 +++++++++++++++++++++ jungfrau/JFConversionGPU.h | 49 ++++++++ jungfrau/JFConversionGPU_Alt.cpp | 23 ++++ jungfrau/ProcessJFPacket.cpp | 28 +---- jungfrau/ProcessJFPacket.h | 2 - tests/JFConversionTest.cpp | 129 ++++++++++++++++++-- tests/ProcessRawPacketTest.cpp | 38 ------ tools/JFCalibrationPerfTest.cpp | 157 ++----------------------- 15 files changed, 404 insertions(+), 281 deletions(-) create mode 100644 jungfrau/JFConversionGPU.cu create mode 100644 jungfrau/JFConversionGPU.h create mode 100644 jungfrau/JFConversionGPU_Alt.cpp diff --git a/common/FrameTransformation.cpp b/common/FrameTransformation.cpp index ea24c2a4..14db8989 100644 --- a/common/FrameTransformation.cpp +++ b/common/FrameTransformation.cpp @@ -164,7 +164,7 @@ void FrameTransformation::ProcessModule(JFConversion &conv, const int16_t *input if (experiment.GetDetectorMode() != DetectorMode::Conversion) memcpy(output + RAW_MODULE_SIZE * module_number_abs, input, RAW_MODULE_SIZE * experiment.GetPixelDepth()); else { - conv.Convert(conversion_buffer.data(), (uint16_t *) input); + conv.ConvertModule(conversion_buffer.data(), (uint16_t *) input); TransferModuleAdjustMultipixels(output, conversion_buffer.data(), experiment.GetModuleSlowDirectionStep(module_number_abs), static_cast(INT16_MIN), diff --git a/jungfrau/CMakeLists.txt b/jungfrau/CMakeLists.txt index 10e032ac..ee02541a 100644 --- a/jungfrau/CMakeLists.txt +++ b/jungfrau/CMakeLists.txt @@ -6,8 +6,18 @@ ADD_LIBRARY(JFCalibration STATIC JFModulePedestal.cpp JFModulePedestal.h JFModuleGainCalibration.cpp JFModuleGainCalibration.h JFPedestalCalc.cpp JFPedestalCalc.h - ProcessJFPacket.cpp ProcessJFPacket.h) + ProcessJFPacket.cpp ProcessJFPacket.h + JFConversionGPU.h JFConversionGPU_Alt.cpp) SET_SOURCE_FILES_PROPERTIES(JFPedestalCalc.cpp JFConversionFloatingPoint.cpp JFConversionFixedPoint.cpp PROPERTIES COMPILE_FLAGS -Ofast) -TARGET_LINK_LIBRARIES(JFCalibration JFJochProtoBuf Compression) \ No newline at end of file +TARGET_LINK_LIBRARIES(JFCalibration JFJochProtoBuf Compression) + +IF (CMAKE_CUDA_COMPILER) + TARGET_SOURCES(JFCalibration PRIVATE JFConversionGPU.cu ) + + TARGET_COMPILE_DEFINITIONS(JFCalibration PUBLIC -DJFJOCH_USE_CUDA) + + FIND_LIBRARY(CUDART_LIBRARY cudart_static PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) + TARGET_LINK_LIBRARIES(JFCalibration ${CUDART_LIBRARY} ${CMAKE_DL_LIBS} rt) +ENDIF() diff --git a/jungfrau/JFConversion.h b/jungfrau/JFConversion.h index c92ee854..65ced286 100644 --- a/jungfrau/JFConversion.h +++ b/jungfrau/JFConversion.h @@ -16,11 +16,8 @@ public: const JFModulePedestal &pedestal_g2, double energy) = 0; - virtual void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) = 0; - void Convert(int16_t* dest, const uint16_t* source) { - for (int i = 0; i < 128; i++) - ConvertPacket(dest + i * 4 * RAW_MODULE_COLS, source + i * 4 * RAW_MODULE_COLS, i); - } + virtual void ConvertModule(int16_t* dest, const uint16_t* source) = 0; + virtual void Sync() {}; }; #endif //JUNGFRAUJOCH_JFCONVERSION_H diff --git a/jungfrau/JFConversionFixedPoint.cpp b/jungfrau/JFConversionFixedPoint.cpp index 2078aed4..a81c59c9 100644 --- a/jungfrau/JFConversionFixedPoint.cpp +++ b/jungfrau/JFConversionFixedPoint.cpp @@ -100,7 +100,7 @@ void JFConversionFixedPoint::ConvertLine(int16_t *dest, const uint16_t *source, } } -void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source, uint16_t packet_number) { +void JFConversionFixedPoint::ConvertModule(int16_t *dest, const uint16_t *source) { auto gain_g0_aligned = std::assume_aligned<64>(gain_g0); auto gain_g1_aligned = std::assume_aligned<64>(gain_g1); auto gain_g2_aligned = std::assume_aligned<64>(gain_g2); @@ -108,7 +108,7 @@ void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source auto pedestal_g1_aligned = std::assume_aligned<64>(pedestal_g1); auto pedestal_g2_aligned = std::assume_aligned<64>(pedestal_g2); - for (int i = 0; i < 4 * RAW_MODULE_COLS; i++) { + for (int i = 0; i < RAW_MODULE_SIZE; i++) { uint16_t gainbits = source[i] & 0xc000; int32_t adc = source[i] & 0x3fff; int32_t val = INT32_MIN; @@ -116,17 +116,17 @@ void JFConversionFixedPoint::ConvertPacket(int16_t *dest, const uint16_t *source switch (gainbits) { case 0: [[likely]] - val = (adc - pedestal_g0_aligned[i + 4 * packet_number * RAW_MODULE_COLS]) * gain_g0_aligned[i + 4 * packet_number * RAW_MODULE_COLS]; + val = (adc - pedestal_g0_aligned[i]) * gain_g0_aligned[i]; break; case 0x4000: if (source[i] != 0x4000) - val = (adc - pedestal_g1_aligned[i + 4 * packet_number * RAW_MODULE_COLS]) * gain_g1_aligned[i + 4 * packet_number * RAW_MODULE_COLS]; + val = (adc - pedestal_g1_aligned[i]) * gain_g1_aligned[i]; break; case 0xc000: if (source[i] == 0xc000) val = INT32_MAX; else if (source[i] != 0xffff) - val = (adc - pedestal_g2_aligned[i + 4 * packet_number * RAW_MODULE_COLS]) * gain_g2_aligned[i + 4 * packet_number * RAW_MODULE_COLS]; + val = (adc - pedestal_g2_aligned[i]) * gain_g2_aligned[i]; break; default: break; diff --git a/jungfrau/JFConversionFixedPoint.h b/jungfrau/JFConversionFixedPoint.h index 2a8cb713..b0cff7bf 100644 --- a/jungfrau/JFConversionFixedPoint.h +++ b/jungfrau/JFConversionFixedPoint.h @@ -26,7 +26,7 @@ public: const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) override; - void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) override; + void ConvertModule(int16_t *dest, const uint16_t *source) override; }; diff --git a/jungfrau/JFConversionFloatingPoint.cpp b/jungfrau/JFConversionFloatingPoint.cpp index f3d14868..af68c1dd 100644 --- a/jungfrau/JFConversionFloatingPoint.cpp +++ b/jungfrau/JFConversionFloatingPoint.cpp @@ -33,9 +33,46 @@ void JFConversionFloatingPoint::Setup(const JFModuleGainCalibration &gain_calibr } } -void JFConversionFloatingPoint::ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) { - for (int i = 0; i < 4; i++) - ConvertLine(dest + i * RAW_MODULE_COLS, source + i * RAW_MODULE_COLS, 4 * packet_number + i); +void JFConversionFloatingPoint::ConvertModule(int16_t *dest, const uint16_t *source) { + for (size_t i = 0; i < RAW_MODULE_SIZE; i++) { + uint16_t gainbits = source[i] & 0xc000; + uint16_t adc = source[i] & 0x3fff; + int16_t pedestal_subtracted_adu; + + float expected = PIXEL_OUT_LOST; + + switch (gainbits) { + case 0: + [[likely]] + pedestal_subtracted_adu = adc - pedestal_g0[i]; + expected = static_cast(pedestal_subtracted_adu) * gain_g0[i]; + break; + case 0x4000: + pedestal_subtracted_adu = adc - pedestal_g1[i]; + expected = static_cast(pedestal_subtracted_adu) * gain_g1[i]; + if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION; + break; + case 0xc000: + pedestal_subtracted_adu = adc - pedestal_g2[i]; + expected = static_cast(pedestal_subtracted_adu) * gain_g2[i]; + if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION; + else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF; + break; + default: + expected = PIXEL_OUT_GAINBIT_2; + break; + } + + if (std::isinf(expected) || (expected == INT16_MIN)) + dest[i] = PIXEL_OUT_LOST; + else if (expected > INT16_MAX) + dest[i] = INT16_MAX; + else if (expected >= 0.0) + [[likely]] + dest[i] = static_cast(expected + 0.5f); + else + dest[i] = static_cast(expected - 0.5f); + } } void JFConversionFloatingPoint::ConvertFP(float *dest, const uint16_t *source) { @@ -75,45 +112,3 @@ void JFConversionFloatingPoint::ConvertFP(float *dest, const uint16_t *source) { dest[i] = expected; } } - -void JFConversionFloatingPoint::ConvertLine(int16_t *dest, const uint16_t *source, int line) { - for (size_t i = 0; i < RAW_MODULE_COLS; i++) { - uint16_t gainbits = source[i] & 0xc000; - uint16_t adc = source[i] & 0x3fff; - int16_t pedestal_subtracted_adu; - - float expected = PIXEL_OUT_LOST; - - switch (gainbits) { - case 0: - [[likely]] - pedestal_subtracted_adu = adc - pedestal_g0[i + line * RAW_MODULE_COLS]; - expected = static_cast(pedestal_subtracted_adu) * gain_g0[i + line * RAW_MODULE_COLS]; - break; - case 0x4000: - pedestal_subtracted_adu = adc - pedestal_g1[i + line * RAW_MODULE_COLS]; - expected = static_cast(pedestal_subtracted_adu) * gain_g1[i + line * RAW_MODULE_COLS]; - if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION; - break; - case 0xc000: - pedestal_subtracted_adu = adc - pedestal_g2[i + line * RAW_MODULE_COLS]; - expected = static_cast(pedestal_subtracted_adu) * gain_g2[i + line * RAW_MODULE_COLS]; - if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION; - else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF; - break; - default: - expected = PIXEL_OUT_GAINBIT_2; - break; - } - - if (std::isinf(expected) || (expected == INT16_MIN)) - dest[i] = PIXEL_OUT_LOST; - else if (expected > INT16_MAX) - dest[i] = INT16_MAX; - else if (expected >= 0.0) - [[likely]] - dest[i] = static_cast(expected + 0.5f); - else - dest[i] = static_cast(expected - 0.5f); - } -} \ No newline at end of file diff --git a/jungfrau/JFConversionFloatingPoint.h b/jungfrau/JFConversionFloatingPoint.h index e515591d..6f798919 100644 --- a/jungfrau/JFConversionFloatingPoint.h +++ b/jungfrau/JFConversionFloatingPoint.h @@ -14,7 +14,6 @@ class JFConversionFloatingPoint : public JFConversion { std::vector gain_g0; std::vector gain_g1; std::vector gain_g2; - void ConvertLine(int16_t *dest, const uint16_t *source, int line); public: JFConversionFloatingPoint(); @@ -23,7 +22,7 @@ public: const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) override; - void ConvertPacket(int16_t* dest, const uint16_t* source, uint16_t packet_number) override; + void ConvertModule(int16_t *dest, const uint16_t *source) override; void ConvertFP(float *dest, const uint16_t *source); }; diff --git a/jungfrau/JFConversionGPU.cu b/jungfrau/JFConversionGPU.cu new file mode 100644 index 00000000..c8980fb9 --- /dev/null +++ b/jungfrau/JFConversionGPU.cu @@ -0,0 +1,136 @@ +// Copyright (2019-2023) Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "JFConversionGPU.h" +#include "../common/JFJochException.h" + +inline void cuda_err(cudaError_t val) { + if (val != cudaSuccess) + throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val)); +} + +struct CudaStreamWrapper { + cudaStream_t v; +}; + +inline float one_over_gain_energy(double gain_factor, double energy) { + double tmp = gain_factor * energy; + if (!std::isfinite(tmp) || (tmp == 0.0)) + return std::numeric_limits::infinity(); + else + return static_cast(1.0 / (gain_factor * energy)); +} + +__global__ void gpu_jf_convert(int16_t *output, const uint16_t* input, + const uint16_t *pedestal_g0, + const uint16_t *pedestal_g1, + const uint16_t *pedestal_g2, + const float *gain_g0, + const float *gain_g1, + const float *gain_g2) { + uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x; + + uint16_t gainbits = input[idx] & 0xc000; + uint16_t adc = input[idx] & 0x3fff; + int16_t pedestal_subtracted_adu; + + float expected = PIXEL_OUT_LOST; + + switch (gainbits) { + case 0: + pedestal_subtracted_adu = adc - pedestal_g0[idx]; + expected = static_cast(pedestal_subtracted_adu) * gain_g0[idx]; + break; + case 0x4000: + pedestal_subtracted_adu = adc - pedestal_g1[idx]; + expected = static_cast(pedestal_subtracted_adu) * gain_g1[idx]; + if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION; + break; + case 0xc000: + pedestal_subtracted_adu = adc - pedestal_g2[idx]; + expected = static_cast(pedestal_subtracted_adu) * gain_g2[idx]; + if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION; + else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF; + break; + default: + expected = PIXEL_OUT_GAINBIT_2; + break; + } + + output[idx] = std::round(expected); + + if (expected <= INT16_MIN) + output[idx] = PIXEL_OUT_LOST; + else if (expected >= INT16_MAX) + output[idx] = INT16_MAX; +} + +JFConversionGPU::JFConversionGPU() { + cudastream = new(CudaStreamWrapper); + + cuda_err(cudaStreamCreate(&cudastream->v)); + cuda_err(cudaMalloc(&gpu_pedestal_g0, RAW_MODULE_SIZE * sizeof(uint16_t))); + cuda_err(cudaMalloc(&gpu_pedestal_g1, RAW_MODULE_SIZE * sizeof(uint16_t))); + cuda_err(cudaMalloc(&gpu_pedestal_g2, RAW_MODULE_SIZE * sizeof(uint16_t))); + + cuda_err(cudaMalloc(&gpu_gain_g0, RAW_MODULE_SIZE * sizeof(float))); + cuda_err(cudaMalloc(&gpu_gain_g1, RAW_MODULE_SIZE * sizeof(float))); + cuda_err(cudaMalloc(&gpu_gain_g2, RAW_MODULE_SIZE * sizeof(float))); + + cuda_err(cudaMallocHost(&host_gain_g0, RAW_MODULE_SIZE * sizeof(float))); + cuda_err(cudaMallocHost(&host_gain_g1, RAW_MODULE_SIZE * sizeof(float))); + cuda_err(cudaMallocHost(&host_gain_g2, RAW_MODULE_SIZE * sizeof(float))); + + cuda_err(cudaMalloc(&gpu_input, RAW_MODULE_SIZE * sizeof(uint16_t))); + cuda_err(cudaMalloc(&gpu_output, RAW_MODULE_SIZE * sizeof(int16_t))); +} + +JFConversionGPU::~JFConversionGPU() { + cudaStreamSynchronize(cudastream->v); + cudaStreamDestroy(cudastream->v); + delete cudastream; + + cudaFree(gpu_pedestal_g0); + cudaFree(gpu_pedestal_g1); + cudaFree(gpu_pedestal_g2); + + cudaFree(gpu_gain_g0); + cudaFree(gpu_gain_g1); + cudaFree(gpu_gain_g2); + + cudaFreeHost(host_gain_g0); + cudaFreeHost(host_gain_g1); + cudaFreeHost(host_gain_g2); + + cudaFree(gpu_input); + cudaFree(gpu_output); +} + +void JFConversionGPU::Setup(const JFModuleGainCalibration &gain_calibration, const JFModulePedestal &pedestal_g0, + const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) { + auto &gain_arr = gain_calibration.GetGainCalibration(); + + for (int i = 0; i < RAW_MODULE_SIZE; i++) { + host_gain_g0[i] = one_over_gain_energy(gain_arr[i], energy); + host_gain_g1[i] = one_over_gain_energy(gain_arr[i + RAW_MODULE_SIZE], energy); + host_gain_g2[i] = one_over_gain_energy(gain_arr[i + 2 * RAW_MODULE_SIZE], energy); + } + cudaMemcpy(gpu_pedestal_g0, pedestal_g0.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_pedestal_g1, pedestal_g1.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_pedestal_g2, pedestal_g2.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_gain_g0, host_gain_g0, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_gain_g1, host_gain_g1, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_gain_g2, host_gain_g2, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice); +} + +void JFConversionGPU::ConvertModule(int16_t *dest, const uint16_t *source) { + cudaMemcpy(gpu_input, source, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice); + gpu_jf_convert<<>>(gpu_output, gpu_input, + gpu_pedestal_g0, gpu_pedestal_g1, gpu_pedestal_g2, + gpu_gain_g0, gpu_gain_g1, gpu_gain_g2); + cudaMemcpy(dest, gpu_output, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyDeviceToHost); +} + +void JFConversionGPU::Sync() { + cudaStreamSynchronize(cudastream->v); +} \ No newline at end of file diff --git a/jungfrau/JFConversionGPU.h b/jungfrau/JFConversionGPU.h new file mode 100644 index 00000000..9e5d4eb1 --- /dev/null +++ b/jungfrau/JFConversionGPU.h @@ -0,0 +1,49 @@ +// Copyright (2019-2023) Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef JUNGFRAUJOCH_JFCONVERSIONGPU_CUH +#define JUNGFRAUJOCH_JFCONVERSIONGPU_CUH + +#include "JFConversion.h" +#include "JFConversionFixedPoint.h" + +struct CudaStreamWrapper; + +class JFConversionGPU : public JFConversion { +#ifdef JFJOCH_USE_CUDA + CudaStreamWrapper *cudastream = nullptr; + + uint16_t *gpu_pedestal_g0 = nullptr; + uint16_t *gpu_pedestal_g1 = nullptr; + uint16_t *gpu_pedestal_g2 = nullptr; + float *gpu_gain_g0 = nullptr; + float *gpu_gain_g1 = nullptr; + float *gpu_gain_g2 = nullptr; + + float *host_gain_g0 = nullptr; + float *host_gain_g1 = nullptr; + float *host_gain_g2 = nullptr; + + + uint16_t *gpu_input = nullptr; + int16_t *gpu_output = nullptr; +#else + JFConversionFixedPoint alt_conv; +#endif +public: + JFConversionGPU(); + JFConversionGPU(JFConversionGPU& other) = delete; + JFConversionGPU& operator=(JFConversionGPU& other) = delete; + ~JFConversionGPU(); + void Setup(const JFModuleGainCalibration &gain_calibration, + const JFModulePedestal &pedestal_g0, + const JFModulePedestal &pedestal_g1, + const JFModulePedestal &pedestal_g2, + double energy) override; + + void ConvertModule(int16_t *dest, const uint16_t *source) override; + void Sync() override; +}; + + +#endif //JUNGFRAUJOCH_JFCONVERSIONGPU_CUH diff --git a/jungfrau/JFConversionGPU_Alt.cpp b/jungfrau/JFConversionGPU_Alt.cpp new file mode 100644 index 00000000..0a0af1bb --- /dev/null +++ b/jungfrau/JFConversionGPU_Alt.cpp @@ -0,0 +1,23 @@ +// Copyright (2019-2023) Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "JFConversionGPU.h" + +#ifndef JFJOCH_USE_CUDA + +JFConversionGPU::JFConversionGPU() {} + +JFConversionGPU::~JFConversionGPU() {} + +void JFConversionGPU::Setup(const JFModuleGainCalibration &gain_calibration, const JFModulePedestal &pedestal_g0, + const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) { + alt_conv.Setup(gain_calibration, pedestal_g0, pedestal_g1, pedestal_g2, energy); +} + +void JFConversionGPU::ConvertPacket(int16_t *dest, const uint16_t *source, uint16_t packet_number) { + alt_conv.ConvertPacket(dest, source, packet_number); +} + +JFConversionGPU::Sync() {} + +#endif diff --git a/jungfrau/ProcessJFPacket.cpp b/jungfrau/ProcessJFPacket.cpp index 16c24d49..21655801 100644 --- a/jungfrau/ProcessJFPacket.cpp +++ b/jungfrau/ProcessJFPacket.cpp @@ -13,8 +13,7 @@ ProcessJFPacket::ProcessJFPacket(ThreadSafeFIFO &in_c, ThreadSafeFIF : m(2 * nmodules), c_fifo(in_c), wr_fifo(in_wr), - module_info(2 * nmodules), - conv(nmodules) + module_info(2 * nmodules) { for (auto &i: module_info) i.c.frame_number = UINT64_MAX; @@ -68,10 +67,7 @@ void ProcessJFPacket::ProcessPacket(jf_udp_payload *datagram) { module_info[module_info_location].c.packet_mask[packetnum >= 64 ? 1 : 0] |= (1LU << (packetnum % 64)); uint16_t* dst = module_info[module_info_location].ptr + 4096 * packetnum; - if (conv[module_number]) - conv[module_number]->ConvertPacket((int16_t *) dst, datagram->data, packetnum); - else - memcpy(dst, datagram->data, 4096 * sizeof(uint16_t)); + memcpy(dst, datagram->data, 4096 * sizeof(uint16_t)); } packet_counter++; } @@ -79,23 +75,3 @@ void ProcessJFPacket::ProcessPacket(jf_udp_payload *datagram) { uint64_t ProcessJFPacket::GetCounter() { return packet_counter; } - -void ProcessJFPacket::RegisterConversion(const DiffractionExperiment &experiment, - const JFCalibration &calib, - uint16_t data_stream) { - if (data_stream >= experiment.GetDataStreamsNum()) - throw JFJochException(JFJochExceptionCategory::ArrayOutOfBounds, "Data stream not found"); - - if (conv.size() != experiment.GetModulesNum(data_stream)) - throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Wrong module count"); - - auto module0 = experiment.GetFirstModuleOfDataStream(data_stream); - for (int i = 0; i < experiment.GetModulesNum(data_stream); i++) { - conv[i] = std::make_unique(); - conv[i]->Setup(calib.GainCalibration(module0 + i), - calib.Pedestal(module0 + i, 0, 0), - calib.Pedestal(module0 + i, 1, 0), - calib.Pedestal(module0 + i, 2, 0), - experiment.GetPhotonEnergy_keV()); - } -} diff --git a/jungfrau/ProcessJFPacket.h b/jungfrau/ProcessJFPacket.h index 7f428f23..8d4021d7 100644 --- a/jungfrau/ProcessJFPacket.h +++ b/jungfrau/ProcessJFPacket.h @@ -24,11 +24,9 @@ class ProcessJFPacket { ThreadSafeFIFO &wr_fifo; std::vector module_info; std::atomic packet_counter = 0; - std::vector > conv; public: ProcessJFPacket(ThreadSafeFIFO &c, ThreadSafeFIFO &wr, uint32_t nmodules); ~ProcessJFPacket(); - void RegisterConversion(const DiffractionExperiment& experiment, const JFCalibration& calib, uint16_t data_stream); void ProcessPacket(jf_udp_payload *datagram); uint64_t GetCounter(); }; diff --git a/tests/JFConversionTest.cpp b/tests/JFConversionTest.cpp index 72da2ab1..6b448578 100644 --- a/tests/JFConversionTest.cpp +++ b/tests/JFConversionTest.cpp @@ -5,6 +5,7 @@ #include "../tests/FPGAUnitTest.h" #include "../jungfrau/JFConversionFloatingPoint.h" +#include "../jungfrau/JFConversionGPU.h" #include "../jungfrau/JFConversionFixedPoint.h" void SetupPedestal( JFModulePedestal &pedestal_g0, JFModulePedestal &pedestal_g1, JFModulePedestal &pedestal_g2) { @@ -41,7 +42,7 @@ TEST_CASE("JFConversionFloatingPoint_G0","[JFConversion]") { input[i] = i % 16384; for (int i = 0; i < 128; i++) - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -76,7 +77,7 @@ TEST_CASE("JFConversionFixedPoint_G0","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = i % 16384; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -88,6 +89,42 @@ TEST_CASE("JFConversionFixedPoint_G0","[JFConversion]") { } } +TEST_CASE("JFConversionGPU_G0","[JFConversion]") { + JFConversionGPU conv; + JFConversionFloatingPoint conv_fp; + + JFModulePedestal pedestal_g0; + JFModulePedestal pedestal_g1; + JFModulePedestal pedestal_g2; + + SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2); + + JFModuleGainCalibration gain; + + std::vector energy{4.0, 6.0, 12.4, 25.0}; + for (auto &e: energy) { + conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + + std::vector input(RAW_MODULE_SIZE); + std::vector output_fp(RAW_MODULE_SIZE); + std::vector output_16bit(RAW_MODULE_SIZE); + + for (int i = 0; i < RAW_MODULE_SIZE; i++) + input[i] = i % 16384; + + conv.ConvertModule(output_16bit.data(), input.data()); + conv_fp.ConvertFP(output_fp.data(), input.data()); + conv.Sync(); + + auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + std::cout << "Error on conversion " << err << " max error " << max_err << std::endl; + REQUIRE(err < 0.5); + REQUIRE(max_err <= 1.0); + } +} + TEST_CASE("JFConversionFixedPoint_G0_TestFile","[JFConversion]") { JFConversionFixedPoint conv; JFConversionFloatingPoint conv_fp; @@ -112,7 +149,7 @@ TEST_CASE("JFConversionFixedPoint_G0_TestFile","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = i % 16384; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -144,7 +181,7 @@ TEST_CASE("JFConversionFloatingPoint_G1","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0x4000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -179,7 +216,7 @@ TEST_CASE("JFConversionFixedPoint_G1","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0x4000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -191,6 +228,43 @@ TEST_CASE("JFConversionFixedPoint_G1","[JFConversion]") { } } +TEST_CASE("JFConversionGPU_G1","[JFConversion]") { + JFConversionGPU conv; + JFConversionFloatingPoint conv_fp; + + JFModulePedestal pedestal_g0; + JFModulePedestal pedestal_g1; + JFModulePedestal pedestal_g2; + + SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2); + + JFModuleGainCalibration gain; + + std::vector energy{4.0, 6.0, 12.4, 25.0}; + for (auto &e: energy) { + conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + + std::vector input(RAW_MODULE_SIZE); + std::vector output_fp(RAW_MODULE_SIZE); + std::vector output_16bit(RAW_MODULE_SIZE); + + for (int i = 0; i < RAW_MODULE_SIZE; i++) + input[i] = (i % 16384) | 0x4000; + + conv.ConvertModule(output_16bit.data(), input.data()); + conv_fp.ConvertFP(output_fp.data(), input.data()); + conv.Sync(); + + auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + + std::cout << "Error on conversion " << err << " max error " << max_err << std::endl; + REQUIRE(err < 0.5); + REQUIRE(max_err <= 1.0); + } +} + TEST_CASE("JFConversionFloatingPoint_G2","[JFConversion]") { JFConversionFloatingPoint conv; @@ -211,7 +285,7 @@ TEST_CASE("JFConversionFloatingPoint_G2","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0xC000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -246,7 +320,7 @@ TEST_CASE("JFConversionFixedPoint_G2","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0xC000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -258,6 +332,43 @@ TEST_CASE("JFConversionFixedPoint_G2","[JFConversion]") { } } +TEST_CASE("JFConversionGPU_G2","[JFConversion]") { + JFConversionGPU conv; + JFConversionFloatingPoint conv_fp; + + JFModulePedestal pedestal_g0; + JFModulePedestal pedestal_g1; + JFModulePedestal pedestal_g2; + + SetupPedestal(pedestal_g0, pedestal_g1, pedestal_g2); + + JFModuleGainCalibration gain; + + std::vector energy{4.0, 6.0, 12.4, 25.0}; + for (auto &e: energy) { + conv.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + conv_fp.Setup(gain, pedestal_g0, pedestal_g1, pedestal_g2, e); + + std::vector input(RAW_MODULE_SIZE); + std::vector output_fp(RAW_MODULE_SIZE); + std::vector output_16bit(RAW_MODULE_SIZE); + + for (int i = 0; i < RAW_MODULE_SIZE; i++) + input[i] = (i % 16384) | 0xC000; + + conv.ConvertModule(output_16bit.data(), input.data()); + conv_fp.ConvertFP(output_fp.data(), input.data()); + conv.Sync(); + + auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + auto max_err = MaxErrorOnConversion(output_16bit.data(), output_fp, RAW_MODULE_SIZE); + + std::cout << "Error on conversion " << err << " max error " << max_err << std::endl; + REQUIRE(err < 0.5); + REQUIRE(max_err <= 1.0); + } +} + TEST_CASE("JFConversionFixedPoint_G1_TestFile","[JFConversion]") { JFConversionFixedPoint conv; JFConversionFloatingPoint conv_fp; @@ -282,7 +393,7 @@ TEST_CASE("JFConversionFixedPoint_G1_TestFile","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0x4000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); @@ -318,7 +429,7 @@ TEST_CASE("JFConversionFixedPoint_G2_TestFile","[JFConversion]") { for (int i = 0; i < RAW_MODULE_SIZE; i++) input[i] = (i % 16384) | 0xC000; - conv.Convert(output_16bit.data(), input.data()); + conv.ConvertModule(output_16bit.data(), input.data()); conv_fp.ConvertFP(output_fp.data(), input.data()); auto err = Compare(output_16bit.data(), output_fp, RAW_MODULE_SIZE); diff --git a/tests/ProcessRawPacketTest.cpp b/tests/ProcessRawPacketTest.cpp index e23593aa..16f415c2 100644 --- a/tests/ProcessRawPacketTest.cpp +++ b/tests/ProcessRawPacketTest.cpp @@ -92,41 +92,3 @@ TEST_CASE("ProcessRawPacketTest") { CHECK(array_1[4096*(36+64)] == 6345); CHECK(array_2[4096*(16+64)] == 6346); } - -TEST_CASE("ProcessRawPacketTest_Conversion") { - ThreadSafeFIFO c_fifo; - ThreadSafeFIFO wr_fifo; - - std::vector array_0(RAW_MODULE_SIZE); - - wr_fifo.Put(WorkRequest{.ptr = array_0.data(), .handle = 0}); - - DiffractionExperiment experiment(DetectorGeometry(8)); - experiment.DataStreams(2).PhotonEnergy_keV(2); - JFCalibration calibration(experiment); - calibration.Pedestal(6, 0, 0).GetPedestal()[4096*36] = 300; - calibration.Pedestal(6, 1, 0).GetPedestal()[4096*36+1] = 15000; - calibration.Pedestal(6, 2, 0).GetPedestal()[4096*36+2] = 14000; - - { - ProcessJFPacket process(c_fifo, wr_fifo, 4); - REQUIRE_NOTHROW(process.RegisterConversion(experiment, calibration, 1)); - jf_udp_payload datagram; - - datagram.packetnum = 36; - datagram.framenum = 2; - datagram.bunchid = 84; - datagram.data[0] = 600; - datagram.data[1] = 16384 | 4000; - datagram.data[2] = 32768 | 16384 | 9000; - - datagram.xCoord = 4; - process.ProcessPacket(&datagram); - - REQUIRE(process.GetCounter() == 1); - } - - CHECK(array_0[4096*36] == std::round((600 - 300) / (DEFAULT_G0_FACTOR*2))); - CHECK(array_0[4096*36+1] == std::round((4000 - 15000) / (DEFAULT_G1_FACTOR*2))); - CHECK(array_0[4096*36+2] == std::round((9000 - 14000) / (DEFAULT_G2_FACTOR*2))); -} \ No newline at end of file diff --git a/tools/JFCalibrationPerfTest.cpp b/tools/JFCalibrationPerfTest.cpp index 09614f06..6ea3fe7c 100644 --- a/tools/JFCalibrationPerfTest.cpp +++ b/tools/JFCalibrationPerfTest.cpp @@ -6,11 +6,14 @@ #include #include +#include + #include "../jungfrau/JFPedestalCalc.h" #include "../common/Logger.h" #include "../jungfrau/JFCalibration.h" #include "../jungfrau/JFConversionFloatingPoint.h" #include "../jungfrau/JFConversionFixedPoint.h" +#include "../jungfrau/JFConversionGPU.h" #include "../tests/FPGAUnitTest.h" #include "../jungfrau/jf_packet.h" #include "../jungfrau/ProcessJFPacket.h" @@ -82,16 +85,19 @@ template void test_conversion(Logger &logger) { x.Mode(DetectorMode::Conversion); - logger.Info("JF FP conversion input prepared"); auto start_time = std::chrono::system_clock::now(); for (int z = 0; z < ntries; z++) { for (int i = 0; i < nframes; i++) { for (int m = 0; m < nmodules; m++) { - v[m].Convert(output.data() + (i * nmodules + m) * RAW_MODULE_SIZE, + v[m].ConvertModule(output.data() + (i * nmodules + m) * RAW_MODULE_SIZE, input.data() + (i * nmodules + m) * RAW_MODULE_SIZE); } } } + + for (int m = 0; m < nmodules; m++) + v[m].Sync(); + auto end_time = std::chrono::system_clock::now(); auto elapsed = std::chrono::duration_cast(end_time - start_time); @@ -143,133 +149,6 @@ void test_packet_processing(Logger &logger) { ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024)); } -void test_packet_processing_with_conversion(Logger &logger) { - size_t nframes = 128; - int64_t nmodules = 8; - int64_t ntries = 8; - - std::vector packets(nframes * nmodules * 128); - std::vector output(nframes * nmodules * CONVERTED_MODULE_SIZE); - - std::vector input(RAW_MODULE_SIZE); - std::string image_path = "../../tests/test_data/mod5_raw0.bin"; - LoadBinaryFile(image_path, input.data(), RAW_MODULE_SIZE); - - for (int frame = 0; frame < nframes; frame++) { - for (int m = 0; m < nmodules; m++) { - for (int p = 0; p < 128; p++) { - packets.at((frame * 128 + p) * nmodules + m).jf.xCoord = (m * 2); - packets.at((frame * 128 + p) * nmodules + m).jf.packetnum = p; - packets.at((frame * 128 + p) * nmodules + m).jf.framenum = frame + 1; - memcpy(packets.at((frame * 128 + p) * nmodules + m).jf.data, input.data() + 4096 * p, 4096 * sizeof(uint16_t)); - } - } - } - - DiffractionExperiment experiment((DetectorGeometry(nmodules))); - JFCalibration calib(experiment); - experiment.PhotonEnergy_keV(12.4).DataStreams(1); - - for (int m = 0; m < nmodules; m++) { - calib.GainCalibration(m) = GainCalibrationFromTestFile(); - - for (int i = 0; i < RAW_MODULE_SIZE; i++) { - calib.Pedestal(m, 0, 0).GetPedestal()[i] = 3000 + i % 50 + m * 135; - calib.Pedestal(m, 1, 0).GetPedestal()[i] = 15000 + i % 50 - m * 135; - calib.Pedestal(m, 2, 0).GetPedestal()[i] = 14000 + i % 50 - m * 135; - } - } - - auto start_time = std::chrono::system_clock::now(); - for (int z = 0; z < ntries; z++) { - ThreadSafeFIFO c; - ThreadSafeFIFO wr; - ProcessJFPacket process(c, wr, nmodules); - process.RegisterConversion(experiment, calib, 0); - - for (uint32_t i = 0; i < nmodules * nframes; i++) - wr.Put(WorkRequest{ - .ptr = output.data() + i * RAW_MODULE_SIZE, - .handle = i - }); - for (auto &packet: packets) - process.ProcessPacket(&packet.jf); - } - auto end_time = std::chrono::system_clock::now(); - auto elapsed = std::chrono::duration_cast(end_time - start_time); - - logger.Info("Packet analysis with conversion performance: {:5d} us/module {:5.2f} GB/s", std::lround(elapsed.count() / ((double) (ntries * nframes * nmodules))), - ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024)); -} - -void test_packet_processing_with_conversion_parallel(Logger &logger, int64_t nmodules) { - size_t nframes = 128; - int64_t ntries = 8; - - std::vector packets(nframes * nmodules * 128); - std::vector output(nframes * nmodules * CONVERTED_MODULE_SIZE); - - std::vector input(RAW_MODULE_SIZE); - std::string image_path = "../../tests/test_data/mod5_raw0.bin"; - LoadBinaryFile(image_path, input.data(), RAW_MODULE_SIZE); - - for (int frame = 0; frame < nframes; frame++) { - for (int m = 0; m < nmodules; m++) { - for (int p = 0; p < 128; p++) { - packets.at((frame * 128 + p) * nmodules + m).jf.xCoord = (m * 2); - packets.at((frame * 128 + p) * nmodules + m).jf.packetnum = p; - packets.at((frame * 128 + p) * nmodules + m).jf.framenum = frame + 1; - memcpy(packets.at((frame * 128 + p) * nmodules + m).jf.data, input.data() + 4096 * p, 4096 * sizeof(uint16_t)); - } - } - } - - DiffractionExperiment experiment((DetectorGeometry(nmodules))); - JFCalibration calib(experiment); - experiment.PhotonEnergy_keV(12.4).DataStreams(1); - - for (int m = 0; m < nmodules; m++) { - calib.GainCalibration(m) = GainCalibrationFromTestFile(); - - for (int i = 0; i < RAW_MODULE_SIZE; i++) { - calib.Pedestal(m, 0, 0).GetPedestal()[i] = 3000 + i % 50 + m * 135; - calib.Pedestal(m, 1, 0).GetPedestal()[i] = 15000 + i % 50 - m * 135; - calib.Pedestal(m, 2, 0).GetPedestal()[i] = 14000 + i % 50 - m * 135; - } - } - - auto start_time = std::chrono::system_clock::now(); - for (int z = 0; z < ntries; z++) { - ThreadSafeFIFO c; - ThreadSafeFIFO wr; - ProcessJFPacket process(c, wr, nmodules); - process.RegisterConversion(experiment, calib, 0); - - for (uint32_t i = 0; i < nmodules * nframes; i++) - wr.Put(WorkRequest{ - .ptr = output.data() + i * RAW_MODULE_SIZE, - .handle = i - }); - - std::vector> f; - for (int m = 0; m < nmodules; m++) { - f.emplace_back(std::async(std::launch::async, [&] (int x) { - for (int frame = 0; frame < nframes; frame++) { - for (int i = 0; i < 128; i++) - process.ProcessPacket(&packets[(frame * 128 + i) * nmodules + x].jf); - } - }, m)); - } - for (auto &i: f) - i.get(); - } - auto end_time = std::chrono::system_clock::now(); - auto elapsed = std::chrono::duration_cast(end_time - start_time); - - logger.Info("Packet analysis with conversion performance: {:5d} us/module {:5.2f} GB/s", std::lround(elapsed.count() / ((double) (ntries * nframes * nmodules))), - ntries * nframes * nmodules * RAW_MODULE_SIZE * sizeof(uint16_t) * 1000 * 1000/ ((double) elapsed.count() * 1024 * 1024 * 1024)); -} - int main () { Logger logger("JFCalibrationPerfTest"); test_pedestal(logger); @@ -280,21 +159,9 @@ int main () { logger.Info("Fixed point conversion"); test_conversion(logger); - logger.Info("Packet processing without conversion"); + logger.Info("Floating point conversion (GPU)"); + test_conversion(logger); + + logger.Info("Packet processing"); test_packet_processing(logger); - - logger.Info("Packet processing with conversion"); - test_packet_processing_with_conversion(logger); - - logger.Info("Packet processing with conversion (4 threads)"); - test_packet_processing_with_conversion_parallel(logger, 4); - - logger.Info("Packet processing with conversion (8 threads)"); - test_packet_processing_with_conversion_parallel(logger, 8); - - logger.Info("Packet processing with conversion (16 threads)"); - test_packet_processing_with_conversion_parallel(logger, 16); - - logger.Info("Packet processing with conversion (32 threads)"); - test_packet_processing_with_conversion_parallel(logger, 32); }