136 lines
5.6 KiB
Plaintext
136 lines
5.6 KiB
Plaintext
// Copyright (2019-2023) Paul Scherrer Institute
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "JFConversionGPU.h"
|
|
#include "../common/JFJochException.h"
|
|
|
|
inline void cuda_err(cudaError_t val) {
|
|
if (val != cudaSuccess)
|
|
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
|
|
}
|
|
|
|
struct CudaStreamWrapper {
|
|
cudaStream_t v;
|
|
};
|
|
|
|
inline float one_over_gain_energy(double gain_factor, double energy) {
|
|
double tmp = gain_factor * energy;
|
|
if (!std::isfinite(tmp) || (tmp == 0.0))
|
|
return std::numeric_limits<float>::infinity();
|
|
else
|
|
return static_cast<float>(1.0 / (gain_factor * energy));
|
|
}
|
|
|
|
__global__ void gpu_jf_convert(int16_t *output, const uint16_t* input,
|
|
const uint16_t *pedestal_g0,
|
|
const uint16_t *pedestal_g1,
|
|
const uint16_t *pedestal_g2,
|
|
const float *gain_g0,
|
|
const float *gain_g1,
|
|
const float *gain_g2) {
|
|
uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;
|
|
|
|
uint16_t gainbits = input[idx] & 0xc000;
|
|
uint16_t adc = input[idx] & 0x3fff;
|
|
int16_t pedestal_subtracted_adu;
|
|
|
|
float expected = PIXEL_OUT_LOST;
|
|
|
|
switch (gainbits) {
|
|
case 0:
|
|
pedestal_subtracted_adu = adc - pedestal_g0[idx];
|
|
expected = static_cast<float>(pedestal_subtracted_adu) * gain_g0[idx];
|
|
break;
|
|
case 0x4000:
|
|
pedestal_subtracted_adu = adc - pedestal_g1[idx];
|
|
expected = static_cast<float>(pedestal_subtracted_adu) * gain_g1[idx];
|
|
if (adc == 0) [[unlikely]] expected = PIXEL_OUT_G1_SATURATION;
|
|
break;
|
|
case 0xc000:
|
|
pedestal_subtracted_adu = adc - pedestal_g2[idx];
|
|
expected = static_cast<float>(pedestal_subtracted_adu) * gain_g2[idx];
|
|
if (adc == 0) [[unlikely]] expected = PIXEL_OUT_SATURATION;
|
|
else if (adc == 0x3fff) [[unlikely]] expected = PIXEL_OUT_0xFFFF;
|
|
break;
|
|
default:
|
|
expected = PIXEL_OUT_GAINBIT_2;
|
|
break;
|
|
}
|
|
|
|
output[idx] = std::round(expected);
|
|
|
|
if (expected <= INT16_MIN)
|
|
output[idx] = PIXEL_OUT_LOST;
|
|
else if (expected >= INT16_MAX)
|
|
output[idx] = INT16_MAX;
|
|
}
|
|
|
|
JFConversionGPU::JFConversionGPU() {
|
|
cudastream = new(CudaStreamWrapper);
|
|
|
|
cuda_err(cudaStreamCreate(&cudastream->v));
|
|
cuda_err(cudaMalloc(&gpu_pedestal_g0, RAW_MODULE_SIZE * sizeof(uint16_t)));
|
|
cuda_err(cudaMalloc(&gpu_pedestal_g1, RAW_MODULE_SIZE * sizeof(uint16_t)));
|
|
cuda_err(cudaMalloc(&gpu_pedestal_g2, RAW_MODULE_SIZE * sizeof(uint16_t)));
|
|
|
|
cuda_err(cudaMalloc(&gpu_gain_g0, RAW_MODULE_SIZE * sizeof(float)));
|
|
cuda_err(cudaMalloc(&gpu_gain_g1, RAW_MODULE_SIZE * sizeof(float)));
|
|
cuda_err(cudaMalloc(&gpu_gain_g2, RAW_MODULE_SIZE * sizeof(float)));
|
|
|
|
cuda_err(cudaMallocHost(&host_gain_g0, RAW_MODULE_SIZE * sizeof(float)));
|
|
cuda_err(cudaMallocHost(&host_gain_g1, RAW_MODULE_SIZE * sizeof(float)));
|
|
cuda_err(cudaMallocHost(&host_gain_g2, RAW_MODULE_SIZE * sizeof(float)));
|
|
|
|
cuda_err(cudaMalloc(&gpu_input, RAW_MODULE_SIZE * sizeof(uint16_t)));
|
|
cuda_err(cudaMalloc(&gpu_output, RAW_MODULE_SIZE * sizeof(int16_t)));
|
|
}
|
|
|
|
JFConversionGPU::~JFConversionGPU() {
|
|
cudaStreamSynchronize(cudastream->v);
|
|
cudaStreamDestroy(cudastream->v);
|
|
delete cudastream;
|
|
|
|
cudaFree(gpu_pedestal_g0);
|
|
cudaFree(gpu_pedestal_g1);
|
|
cudaFree(gpu_pedestal_g2);
|
|
|
|
cudaFree(gpu_gain_g0);
|
|
cudaFree(gpu_gain_g1);
|
|
cudaFree(gpu_gain_g2);
|
|
|
|
cudaFreeHost(host_gain_g0);
|
|
cudaFreeHost(host_gain_g1);
|
|
cudaFreeHost(host_gain_g2);
|
|
|
|
cudaFree(gpu_input);
|
|
cudaFree(gpu_output);
|
|
}
|
|
|
|
void JFConversionGPU::Setup(const JFModuleGainCalibration &gain_calibration, const JFModulePedestal &pedestal_g0,
|
|
const JFModulePedestal &pedestal_g1, const JFModulePedestal &pedestal_g2, double energy) {
|
|
auto &gain_arr = gain_calibration.GetGainCalibration();
|
|
|
|
for (int i = 0; i < RAW_MODULE_SIZE; i++) {
|
|
host_gain_g0[i] = one_over_gain_energy(gain_arr[i], energy);
|
|
host_gain_g1[i] = one_over_gain_energy(gain_arr[i + RAW_MODULE_SIZE], energy);
|
|
host_gain_g2[i] = one_over_gain_energy(gain_arr[i + 2 * RAW_MODULE_SIZE], energy);
|
|
}
|
|
cudaMemcpy(gpu_pedestal_g0, pedestal_g0.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(gpu_pedestal_g1, pedestal_g1.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(gpu_pedestal_g2, pedestal_g2.GetPedestal(), RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(gpu_gain_g0, host_gain_g0, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(gpu_gain_g1, host_gain_g1, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(gpu_gain_g2, host_gain_g2, RAW_MODULE_SIZE * sizeof(float), cudaMemcpyHostToDevice);
|
|
}
|
|
|
|
void JFConversionGPU::ConvertModule(int16_t *dest, const uint16_t *source) {
|
|
cudaMemcpy(gpu_input, source, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
|
gpu_jf_convert<<<RAW_MODULE_SIZE/128, 128>>>(gpu_output, gpu_input,
|
|
gpu_pedestal_g0, gpu_pedestal_g1, gpu_pedestal_g2,
|
|
gpu_gain_g0, gpu_gain_g1, gpu_gain_g2);
|
|
cudaMemcpy(dest, gpu_output, RAW_MODULE_SIZE * sizeof(uint16_t), cudaMemcpyDeviceToHost);
|
|
}
|
|
|
|
void JFConversionGPU::Sync() {
|
|
cudaStreamSynchronize(cudastream->v);
|
|
} |