Files
Jungfraujoch/image_analysis/image_preprocessing/ImagePreprocessorGPU.cu
T
leonarski_f 17619bd19a
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 15m13s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m49s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 16m45s
Build Packages / build:rpm (rocky8) (push) Successful in 17m36s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m27s
Build Packages / build:rpm (rocky9) (push) Successful in 18m28s
Build Packages / Generate python client (push) Successful in 1m31s
Build Packages / Unit tests (push) Successful in 1h0m56s
Build Packages / build:rpm (ubuntu2204) (push) Failing after 9m13s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 1m52s
Build Packages / XDS test (durin plugin) (push) Successful in 10m18s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m22s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m24s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 13m6s
Build Packages / DIALS test (push) Successful in 13m14s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 11m15s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Failing after 14m5s
ImagePreprocessorBuffer: Dedicated class to handle buffer used by spot finding.
2026-04-23 20:20:24 +02:00

162 lines
5.8 KiB
Plaintext

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "ImagePreprocessorGPU.h"
// Grid-stride preprocessing kernel: converts raw pixel values of type T into an
// int32 image while accumulating per-image statistics into *stats.
//
// Per-pixel mapping (mask wins over saturation, saturation wins over error):
//   masked pixel                   -> output INT32_MIN, counted as masked
//   value >= saturation_limit      -> output INT32_MAX, counted as saturated
//   value == err_value (signed T)  -> output INT32_MIN, counted as error
//   otherwise                      -> (int32_t) value, included in min/max
//
// Counters are first reduced into shared-memory block accumulators and then
// folded into *stats with one global atomic per field per block. The caller
// must pre-initialize *stats (counters = 0, max = INT64_MIN, min = INT64_MAX).
// NOTE: atomicMin/atomicMax on long long require compute capability 3.5+.
// NOTE(review): for uint32 input, an in-range value above INT32_MAX that is
// below saturation_limit would overflow the int32 cast — presumably
// saturation_limit is always <= INT32_MAX; confirm with callers.
template<class T>
__global__ void preprocess_kernel(
const T *__restrict__ input,
const uint8_t *__restrict__ mask,
int32_t *__restrict__ output,
ImageStatistics *__restrict__ stats,
T saturation_limit,
T err_value,
int npixels) {
// Shared block accumulators (one set per thread block)
__shared__ unsigned long long s_masked;
__shared__ unsigned long long s_saturated;
__shared__ unsigned long long s_error;
__shared__ long long s_max;
__shared__ long long s_min;
// Thread 0 seeds the accumulators; the barrier below publishes them block-wide
if (threadIdx.x == 0) {
s_masked = 0;
s_saturated = 0;
s_error = 0;
s_max = INT64_MIN;
s_min = INT64_MAX;
}
__syncthreads();
// Thread-local accumulators — avoids shared-memory atomics inside the hot loop
unsigned long long local_masked = 0;
unsigned long long local_saturated = 0;
unsigned long long local_error = 0;
long long local_max = INT64_MIN;
long long local_min = INT64_MAX;
// Compile-time constant: for unsigned T the error-value comparison is elided
constexpr bool is_signed = std::is_signed<T>::value;
// Grid-stride loop: correct for any launch configuration relative to npixels
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < npixels;
i += blockDim.x * gridDim.x) {
T v = input[i];
bool is_masked = mask[i];
bool is_sat = v >= saturation_limit;
bool is_err = is_signed && (v == err_value);
bool valid = !(is_masked || is_sat || is_err);
// Output: special pixels map to int32 sentinels, in priority order
// masked > saturated > error; ordinary pixels pass through as int32
output[i] =
is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;
// Counters are mutually exclusive: each pixel increments at most one
local_masked += is_masked;
local_saturated += (!is_masked && is_sat);
local_error += (!is_masked && !is_sat && is_err);
// Min/max are taken only over valid (unmasked, unsaturated, non-error) pixels
if (valid) {
int64_t val = (int64_t) v;
if (val > local_max) local_max = val;
if (val < local_min) local_min = val;
}
}
// Reduce thread-local values into the block's shared accumulators
atomicAdd(&s_masked, local_masked);
atomicAdd(&s_saturated, local_saturated);
atomicAdd(&s_error, local_error);
// local_min <= local_max holds only if this thread saw at least one valid pixel
if (local_min <= local_max) {
atomicMax((long long *) &s_max, (long long) local_max);
atomicMin((long long *) &s_min, (long long) local_min);
}
__syncthreads();
// One thread per block folds the block totals into the global statistics;
// if the block saw no valid pixel, the INT64_MIN/INT64_MAX sentinels make
// the global atomicMax/atomicMin no-ops
if (threadIdx.x == 0) {
atomicAdd(&stats->masked_pixel_count, s_masked);
atomicAdd(&stats->saturated_pixel_count, s_saturated);
atomicAdd(&stats->error_pixel_count, s_error);
atomicMax((long long *) &stats->max_value, (long long) s_max);
atomicMin((long long *) &stats->min_value, (long long) s_min);
}
}
// Constructs the GPU-based image preprocessor.
//
// Uploads a one-byte-per-pixel mask to the GPU and derives a grid-stride
// launch configuration (128 threads per block, 4 blocks per SM) from the
// properties of the currently active CUDA device.
//
// experiment: provides geometry (npixels) via the ImagePreprocessor base class
// mask:       per-pixel mask; any non-zero entry marks the pixel as masked
// stream:     CUDA stream used for all asynchronous work in Analyze()
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
                                           std::shared_ptr<CudaStream> stream)
        : ImagePreprocessor(experiment),
          stream(stream),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {
    // Collapse the mask to 0/1 bytes (kernel treats any non-zero byte as masked)
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);
    cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);

    // Query the device that is actually current instead of hard-coding device 0,
    // so the launch configuration matches the GPU that will run the kernel
    // in multi-GPU hosts
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);
    threads = 128;
    blocks = 4 * prop.multiProcessorCount;
}
// Dispatches to the templated Analyze<T>() according to the pixel data type
// of the compressed image.
//
// Signed formats use the type minimum as the detector error marker; unsigned
// formats have no dedicated error value, so the type maximum is passed for
// both the error and saturation arguments (the kernel ignores the error
// comparison for unsigned types anyway).
//
// Throws JFJochException for RGB/float images, which this path does not handle.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image, const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    switch (image_mode) {
        // Unsigned formats: err_value == sat_value == type maximum
        case CompressedImageMode::Uint8:
            return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
        case CompressedImageMode::Uint16:
            return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
        case CompressedImageMode::Uint32:
            return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
        // Signed formats: err_value == type minimum, sat_value == type maximum
        case CompressedImageMode::Int8:
            return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
        case CompressedImageMode::Int16:
            return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
        case CompressedImageMode::Int32:
            return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
        default:
            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
    }
}
// Uploads the decompressed image to the GPU, runs the preprocessing kernel
// (masking, saturation/error detection, min/max statistics), and downloads
// both the preprocessed int32 image and the accumulated statistics.
//
// processed_image: destination buffers (GPU scratch + host result)
// input:           host pointer to npixels values of type T
// err_value:       pixel value marking a detector error (meaningful for signed T)
// sat_value:       type upper limit; clamped to the configured saturation_limit
//
// All work is ordered on the object's CUDA stream; the call blocks until the
// results are available on the host, so the caller's input buffer only needs
// to stay valid for the duration of the call.
template<class T>
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image,
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Upload asynchronously on the same stream as the kernel: stream ordering
    // guarantees the kernel sees the data, and the host is not stalled here.
    // (Previously a synchronous cudaMemcpy on the legacy default stream, which
    // blocked the host and serialized against all other streams.)
    cudaMemcpyAsync(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice, *stream);

    // Seed statistics so the kernel's atomicMax/atomicMin start from neutral
    // sentinels (counters default to zero via aggregate initialization)
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);

    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            processed_image.getGPUBuffer(),
            gpu_stats,
            sat_value,
            err_value,
            npixels);

    // Download the preprocessed image and the statistics, then block until
    // every operation queued on the stream has completed
    cudaMemcpyAsync(processed_image.data(), processed_image.getGPUBuffer(), npixels * sizeof(int32_t),
                    cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);
    cudaStreamSynchronize(*stream);
    return cpu_stats[0];
}