ImageProcessor: Work in progress on GPU code to preprocess images
This commit is contained in:
@@ -0,0 +1,175 @@
|
||||
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
||||
// SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
#include "ImagePreprocessorGPU.h"
|
||||
|
||||
// Preprocessing kernel: converts an image from its native detector type T to a
// common int32 representation while classifying every pixel, and accumulates
// image-wide statistics into *stats.
//
// Per-pixel classification (precedence: masked > saturated > error):
//   masked    -> output INT32_MIN, counted in masked_pixel_count
//   saturated -> output INT32_MAX, counted in saturated_pixel_count
//   error     -> output INT32_MIN, counted in error_pixel_count
//               (error check only for signed T; for unsigned T err_value equals
//                the type maximum and such pixels are caught as saturated)
//   valid     -> output (int32_t)v, participates in min/max
//
// Launch: 1-D grid of 1-D blocks; a grid-stride loop makes any <<<blocks,
// threads>>> configuration cover npixels. Reduction strategy: thread-local
// accumulators -> shared-memory block accumulators (atomics) -> one set of
// global atomics per block, to keep contention on *stats low.
//
// Preconditions: *stats must be host-initialized before launch (counters 0,
// max_value = INT64_MIN, min_value = INT64_MAX). 64-bit atomicMin/atomicMax
// require compute capability 3.5+.
// NOTE(review): the final (int32_t)v cast assumes saturation_limit never
// exceeds INT32_MAX for wider-than-32-bit unsigned inputs — confirm at caller.
template<class T>
__global__ void preprocess_kernel(
        const T *__restrict__ input,          // raw image, npixels elements (device)
        const uint8_t *__restrict__ mask,     // non-zero byte => pixel is masked (device)
        int32_t *__restrict__ output,         // converted image, npixels elements (device)
        ImageStatistics *__restrict__ stats,  // global accumulator shared by all blocks (device)
        T saturation_limit,                   // v >= limit => saturated
        T err_value,                          // signed T only: v == err_value => error pixel
        int npixels) {
    // Shared block accumulators
    __shared__ unsigned long long s_masked;
    __shared__ unsigned long long s_saturated;
    __shared__ unsigned long long s_error;
    __shared__ long long s_max;
    __shared__ long long s_min;

    // One thread zeroes the block accumulators; identity elements for max/min.
    if (threadIdx.x == 0) {
        s_masked = 0;
        s_saturated = 0;
        s_error = 0;
        s_max = INT64_MIN;
        s_min = INT64_MAX;
    }
    // Barrier: all threads must see the initialized shared values before use.
    __syncthreads();

    // Thread-local accumulators
    unsigned long long local_masked = 0;
    unsigned long long local_saturated = 0;
    unsigned long long local_error = 0;
    long long local_max = INT64_MIN;
    long long local_min = INT64_MAX;

    // Compile-time: error-value comparison is only meaningful for signed types.
    constexpr bool is_signed = std::is_signed<T>::value;

    // Grid-stride loop over all pixels.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < npixels;
         i += blockDim.x * gridDim.x) {
        T v = input[i];

        bool is_masked = mask[i];
        bool is_sat = v >= saturation_limit;
        bool is_err = is_signed && (v == err_value);

        // Valid = none of the special conditions apply.
        bool valid = !(is_masked || is_sat || is_err);

        // Output: sentinel values for special pixels, plain cast otherwise.
        output[i] =
                is_masked ? INT32_MIN :
                is_sat ? INT32_MAX :
                is_err ? INT32_MIN :
                (int32_t)v;

        // Counters — each pixel is counted in at most one category,
        // mirroring the precedence of the output selection above.
        local_masked += is_masked;
        local_saturated+= (!is_masked && is_sat);
        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only for valid
        if (valid) {
            int64_t val = (int64_t)v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
    }

    // Reduce to shared memory
    atomicAdd(&s_masked, local_masked);
    atomicAdd(&s_saturated, local_saturated);
    atomicAdd(&s_error, local_error);

    // min <= max only holds if this thread saw at least one valid pixel;
    // threads with no valid pixels skip the min/max update entirely.
    if (local_min <= local_max) {
        atomicMax((long long*)&s_max, (long long)local_max);
        atomicMin((long long*)&s_min, (long long)local_min);
    }

    // Barrier: block totals must be complete before thread 0 publishes them.
    __syncthreads();

    // One thread writes block result
    // NOTE(review): assumes ImageStatistics counters are unsigned long long and
    // max_value/min_value are int64_t so the atomics/casts are well-defined —
    // confirm against ImagePreprocessorGPU.h.
    if (threadIdx.x == 0) {
        atomicAdd(&stats->masked_pixel_count, s_masked);
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);

        // Publishing INT64_MIN/INT64_MAX identities here is harmless: they can
        // never win against any real value in the global max/min.
        atomicMax((long long*)&stats->max_value, (long long)s_max);
        atomicMin((long long*)&stats->min_value, (long long)s_min);
    }
}
|
||||
|
||||
// Constructor: allocates GPU buffers sized for the experiment, uploads the
// pixel mask (compacted to 0/1 bytes), and derives the kernel launch
// configuration from the properties of the current CUDA device.
//
// Fix vs. previous revision: the device properties were queried for hard-coded
// device 0, which is wrong when the caller selected another GPU via
// cudaSetDevice; the current device is now queried first. The mask copy size
// is also spelled out as npixels * sizeof(uint8_t).
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
        : ImagePreprocessor(experiment),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
          gpu_image(npixels),
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {

    stream = std::make_shared<CudaStream>();

    // Setup mask: compact the project mask to one 0/1 byte per pixel, as
    // expected by preprocess_kernel.
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);

    // NOTE(review): CUDA return codes are not checked anywhere in this
    // constructor — consider wrapping with the project's error-check helper.
    cudaMemcpy(gpu_mask, mask_vec.data(), npixels * sizeof(uint8_t), cudaMemcpyHostToDevice);

    // Setup GPU settings: query the device that is actually current for this
    // thread (not device 0 unconditionally).
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);

    threads = 128;
    // Oversubscribe each SM ~4x; the kernel's grid-stride loop makes the exact
    // block count a performance knob, not a correctness requirement.
    blocks = 4 * prop.multiProcessorCount;
}
|
||||
|
||||
// Dispatch on the compressed-image pixel type: forwards to the templated
// Analyze<T> with the type-appropriate error and saturation sentinels.
// Signed types use their minimum as the error value; unsigned types have no
// distinct error value, so error and saturation sentinels coincide at the
// type maximum. RGB/float modes are rejected.
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image, const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    if (image_mode == CompressedImageMode::Int8)
        return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
    if (image_mode == CompressedImageMode::Int16)
        return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
    if (image_mode == CompressedImageMode::Int32)
        return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
    if (image_mode == CompressedImageMode::Uint8)
        return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
    if (image_mode == CompressedImageMode::Uint16)
        return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
    if (image_mode == CompressedImageMode::Uint32)
        return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
    throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
}
|
||||
|
||||
// Templated implementation: uploads the raw image, resets the statistics
// accumulator, runs preprocess_kernel<T>, and downloads both the converted
// int32 image and the statistics. Blocks until all work has completed, so the
// caller may reuse `input` and read `processed_image` immediately on return.
//
// Fix vs. previous revision: the input upload used a blocking cudaMemcpy on
// the legacy default stream, which serializes against the dedicated stream;
// it is now enqueued on the same stream as the rest of the pipeline, keeping
// the whole sequence stream-ordered. The final cudaStreamSynchronize still
// guarantees completion before return, so caller-visible semantics are
// unchanged.
//
// NOTE(review): assumes processed_image.size() >= npixels and that `input`
// holds npixels elements of T — confirm at call sites. CUDA return codes and
// cudaGetLastError() after the launch are still unchecked.
template<class T>
ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_image,
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {

    // Effective saturation threshold: the tighter of the data-type limit and
    // the experiment-level saturation_limit.
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Upload raw image (stream-ordered; pageable source degrades gracefully
    // to a staged copy but keeps its position in the stream).
    cudaMemcpyAsync(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice, *stream);

    // Reset statistics to reduction identities (counters zero-initialized).
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            gpu_image,
            gpu_stats,
            sat_value,
            err_value,
            npixels);
    // Download converted image and statistics; ordered after the kernel.
    cudaMemcpyAsync(processed_image.data(), gpu_image, npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);

    // Block until upload, kernel and downloads have all finished.
    cudaStreamSynchronize(*stream);

    return cpu_stats[0];
}
|
||||
|
||||
// Device-side pointer to the preprocessed int32 image buffer; contents are
// meaningful only after a completed Analyze() call.
const int32_t *ImagePreprocessorGPU::GetImageDevicePtr() const {
    const int32_t *device_ptr = gpu_image;
    return device_ptr;
}
|
||||
|
||||
// Shared handle to the CUDA stream on which this preprocessor enqueues its
// work; callers may use it to order their own operations after Analyze().
std::shared_ptr<CudaStream> ImagePreprocessorGPU::GetStream() const {
    auto stream_handle = stream; // copy shares ownership with this object
    return stream_handle;
}
|
||||
Reference in New Issue
Block a user