Files
Jungfraujoch/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
T

176 lines
6.0 KiB
Plaintext

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "ImagePreprocessorGPU.h"
template<class T>
__global__ void preprocess_kernel(
const T *__restrict__ input,
const uint8_t *__restrict__ mask,
int32_t *__restrict__ output,
ImageStatistics *__restrict__ stats,
T saturation_limit,
T err_value,
int npixels) {
// Shared block accumulators
__shared__ unsigned long long s_masked;
__shared__ unsigned long long s_saturated;
__shared__ unsigned long long s_error;
__shared__ long long s_max;
__shared__ long long s_min;
if (threadIdx.x == 0) {
s_masked = 0;
s_saturated = 0;
s_error = 0;
s_max = INT64_MIN;
s_min = INT64_MAX;
}
__syncthreads();
// Thread-local accumulators
unsigned long long local_masked = 0;
unsigned long long local_saturated = 0;
unsigned long long local_error = 0;
long long local_max = INT64_MIN;
long long local_min = INT64_MAX;
constexpr bool is_signed = std::is_signed<T>::value;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < npixels;
i += blockDim.x * gridDim.x) {
T v = input[i];
bool is_masked = mask[i];
bool is_sat = v >= saturation_limit;
bool is_err = is_signed && (v == err_value);
bool valid = !(is_masked || is_sat || is_err);
// Output
output[i] =
is_masked ? INT32_MIN :
is_sat ? INT32_MAX :
is_err ? INT32_MIN :
(int32_t)v;
// Counters
local_masked += is_masked;
local_saturated+= (!is_masked && is_sat);
local_error += (!is_masked && !is_sat && is_err);
// Min/max only for valid
if (valid) {
int64_t val = (int64_t)v;
if (val > local_max) local_max = val;
if (val < local_min) local_min = val;
}
}
// Reduce to shared memory
atomicAdd(&s_masked, local_masked);
atomicAdd(&s_saturated, local_saturated);
atomicAdd(&s_error, local_error);
if (local_min <= local_max) {
atomicMax((long long*)&s_max, (long long)local_max);
atomicMin((long long*)&s_min, (long long)local_min);
}
__syncthreads();
// One thread writes block result
if (threadIdx.x == 0) {
atomicAdd(&stats->masked_pixel_count, s_masked);
atomicAdd(&stats->saturated_pixel_count, s_saturated);
atomicAdd(&stats->error_pixel_count, s_error);
atomicMax((long long*)&stats->max_value, (long long)s_max);
atomicMin((long long*)&stats->min_value, (long long)s_min);
}
}
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
: ImagePreprocessor(experiment),
gpu_mask(npixels),
gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
gpu_image(npixels),
gpu_stats(1),
cpu_stats(1),
cpu_stats_reg(cpu_stats) {
stream = std::make_shared<CudaStream>();
// Setup mask
std::vector<uint8_t> mask_vec(npixels);
for (int i = 0; i < npixels; i++)
mask_vec[i] = (mask.GetMask().at(i) != 0);
cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);
// Setup GPU settings
cudaDeviceProp prop{};
cudaGetDeviceProperties(&prop, 0);
threads = 128;
blocks = 4 * prop.multiProcessorCount;
}
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr,
CompressedImageMode image_mode) {
switch (image_mode) {
case CompressedImageMode::Int8:
return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
case CompressedImageMode::Int16:
return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
case CompressedImageMode::Int32:
return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
case CompressedImageMode::Uint8:
return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
case CompressedImageMode::Uint16:
return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
case CompressedImageMode::Uint32:
return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
default:
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
}
}
template<class T>
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image,
const uint8_t *input,
T err_value,
T sat_value) {
if (sat_value > saturation_limit)
sat_value = static_cast<T>(saturation_limit);
cudaMemcpy(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice);
cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
reinterpret_cast<const T *>(gpu_decompressed_image.get()),
gpu_mask,
gpu_image,
gpu_stats,
sat_value,
err_value,
npixels);
cudaMemcpyAsync(processed_image.data(), gpu_image, npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);
cudaStreamSynchronize(*stream);
return cpu_stats[0];
}
const int32_t *ImagePreprocessorGPU::GetImageDevicePtr() const {
return gpu_image;
}
std::shared_ptr<CudaStream> ImagePreprocessorGPU::GetStream() const {
return stream;
}