17619bd19a
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 15m13s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m49s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 16m45s
Build Packages / build:rpm (rocky8) (push) Successful in 17m36s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m27s
Build Packages / build:rpm (rocky9) (push) Successful in 18m28s
Build Packages / Generate python client (push) Successful in 1m31s
Build Packages / Unit tests (push) Successful in 1h0m56s
Build Packages / build:rpm (ubuntu2204) (push) Failing after 9m13s
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 1m52s
Build Packages / XDS test (durin plugin) (push) Successful in 10m18s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m22s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m24s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 13m6s
Build Packages / DIALS test (push) Successful in 13m14s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 11m15s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Failing after 14m5s
162 lines
5.8 KiB
Plaintext
162 lines
5.8 KiB
Plaintext
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "ImagePreprocessorGPU.h"

template<class T>
// Grid-stride kernel: copies one image into an int32 output buffer while
// classifying every pixel as masked / saturated / error, and merges per-image
// statistics (category counts + min/max of valid pixels) into *stats.
//
// Output sentinel encoding: masked and error pixels -> INT32_MIN, saturated
// pixels -> INT32_MAX, valid pixels are narrowed to int32_t.
//
// *stats must be pre-seeded by the caller (counts zeroed, max_value =
// INT64_MIN, min_value = INT64_MAX) — this kernel only accumulates into it
// with global atomics, one batch per block.
//
// NOTE(review): atomicMax/atomicMin on signed long long require compute
// capability 3.5+ — confirm against the project's minimum supported GPU.
__global__ void preprocess_kernel(
    const T *__restrict__ input,
    const uint8_t *__restrict__ mask,     // nonzero byte -> pixel is masked out
    int32_t *__restrict__ output,
    ImageStatistics *__restrict__ stats,  // global accumulator (atomics)
    T saturation_limit,                   // v >= limit counts as saturated
    T err_value,                          // error sentinel (used for signed T only)
    int npixels) {
    // Shared block accumulators — each block reduces locally first so only
    // one thread per block touches the global statistics.
    __shared__ unsigned long long s_masked;
    __shared__ unsigned long long s_saturated;
    __shared__ unsigned long long s_error;
    __shared__ long long s_max;
    __shared__ long long s_min;

    if (threadIdx.x == 0) {
        s_masked = 0;
        s_saturated = 0;
        s_error = 0;
        s_max = INT64_MIN;  // identity element for max
        s_min = INT64_MAX;  // identity element for min
    }
    __syncthreads();

    // Thread-local accumulators (no atomics needed inside the pixel loop)
    unsigned long long local_masked = 0;
    unsigned long long local_saturated = 0;
    unsigned long long local_error = 0;
    long long local_max = INT64_MIN;
    long long local_min = INT64_MAX;

    // The error sentinel only applies to signed pixel types; for unsigned T
    // this folds the err_value comparison away at compile time.
    constexpr bool is_signed = std::is_signed<T>::value;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < npixels;
         i += blockDim.x * gridDim.x) {
        T v = input[i];

        bool is_masked = mask[i];
        bool is_sat = v >= saturation_limit;
        bool is_err = is_signed && (v == err_value);

        bool valid = !(is_masked || is_sat || is_err);

        // Output: precedence is masked > saturated > error > valid
        output[i] =
            is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;

        // Counters — the guards ensure each pixel lands in at most one
        // category, using the same precedence as the output encoding.
        local_masked += is_masked;
        local_saturated += (!is_masked && is_sat);
        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only for valid pixels
        if (valid) {
            int64_t val = (int64_t) v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
    }

    // Reduce to shared memory
    atomicAdd(&s_masked, local_masked);
    atomicAdd(&s_saturated, local_saturated);
    atomicAdd(&s_error, local_error);

    // local_min <= local_max holds iff this thread saw at least one valid
    // pixel; skipping the atomics otherwise keeps the identities intact.
    if (local_min <= local_max) {
        atomicMax((long long *) &s_max, (long long) local_max);
        atomicMin((long long *) &s_min, (long long) local_min);
    }

    __syncthreads();

    // One thread writes the block result into the global statistics
    if (threadIdx.x == 0) {
        atomicAdd(&stats->masked_pixel_count, s_masked);
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);

        atomicMax((long long *) &stats->max_value, (long long) s_max);
        atomicMin((long long *) &stats->min_value, (long long) s_min);
    }
}
|
|
|
|
// Sets up GPU resources for image preprocessing:
//  - uploads the pixel mask to the GPU as 0/1 bytes,
//  - sizes the kernel launch from the properties of the CURRENT CUDA device
//    (previously device 0 was hard-coded, which mis-sizes the grid on
//    multi-GPU hosts where another device is current for this thread).
//
// gpu_decompressed_image is sized for the widest supported pixel (4 bytes),
// so 1- and 2-byte images reuse the same buffer with minimal waste.
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
                                           std::shared_ptr<CudaStream> stream)
        : ImagePreprocessor(experiment),
          stream(stream),
          gpu_mask(npixels),
          gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
          gpu_stats(1),
          cpu_stats(1),
          cpu_stats_reg(cpu_stats) {
    // Setup mask: collapse arbitrary nonzero mask values to 0/1 bytes,
    // matching what preprocess_kernel reads
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
        mask_vec[i] = (mask.GetMask().at(i) != 0);

    cudaMemcpy(gpu_mask, mask_vec.data(), npixels, cudaMemcpyHostToDevice);

    // Setup GPU settings — query the device that is actually current for
    // this thread instead of assuming device 0. If cudaGetDevice fails,
    // `device` keeps its initializer and we fall back to device 0 as before.
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);

    // Grid-stride kernel: a few blocks per SM is enough to saturate the GPU
    threads = 128;
    blocks = 4 * prop.multiProcessorCount;
}
|
|
|
|
// Dispatches preprocessing to the template instantiation matching the pixel
// type of the compressed image.
//
// For signed integer modes the type minimum serves as the detector error
// sentinel. Unsigned modes have no error value, so the type maximum is
// passed for both the error and saturation arguments (the kernel ignores
// err_value for unsigned types).
//
// Throws JFJochException for modes without an integer pixel representation.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image, const uint8_t *image_ptr,
                                              CompressedImageMode image_mode) {
    if (image_mode == CompressedImageMode::Int8)
        return Analyze<int8_t>(processed_image, image_ptr, INT8_MIN, INT8_MAX);
    if (image_mode == CompressedImageMode::Int16)
        return Analyze<int16_t>(processed_image, image_ptr, INT16_MIN, INT16_MAX);
    if (image_mode == CompressedImageMode::Int32)
        return Analyze<int32_t>(processed_image, image_ptr, INT32_MIN, INT32_MAX);
    if (image_mode == CompressedImageMode::Uint8)
        return Analyze<uint8_t>(processed_image, image_ptr, UINT8_MAX, UINT8_MAX);
    if (image_mode == CompressedImageMode::Uint16)
        return Analyze<uint16_t>(processed_image, image_ptr, UINT16_MAX, UINT16_MAX);
    if (image_mode == CompressedImageMode::Uint32)
        return Analyze<uint32_t>(processed_image, image_ptr, UINT32_MAX, UINT32_MAX);
    throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "RGB/float mode not supported");
}
|
|
|
|
template<class T>
// Uploads one image, runs preprocess_kernel and downloads the int32 output
// plus the accumulated statistics.
//
// err_value - pixel value marking a detector error (meaningful for signed T
//             only; the kernel ignores it for unsigned types)
// sat_value - type maximum, clamped here to the configured saturation limit
//
// Blocking: synchronizes the stream before returning, so processed_image and
// the returned ImageStatistics are valid on the host when this returns.
ImageStatistics ImagePreprocessorGPU::Analyze(ImagePreprocessorBuffer &processed_image,
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

    // Issue the upload on the same stream as everything else instead of a
    // blocking cudaMemcpy on the legacy default stream (which serializes with
    // all other streams on the device). Ordering is unchanged: the kernel is
    // enqueued on the same stream, and cudaStreamSynchronize below guarantees
    // completion before this function returns.
    cudaMemcpyAsync(gpu_decompressed_image, input, npixels * sizeof(T), cudaMemcpyHostToDevice, *stream);

    // Seed the statistics with the identity values the kernel's atomic
    // min/max merging expects; counts are zeroed by the designated initializer.
    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);

    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
            reinterpret_cast<const T *>(gpu_decompressed_image.get()),
            gpu_mask,
            processed_image.getGPUBuffer(),
            gpu_stats,
            sat_value,
            err_value,
            npixels);

    cudaMemcpyAsync(processed_image.data(), processed_image.getGPUBuffer(), npixels * sizeof(int32_t), cudaMemcpyDeviceToHost, *stream);
    cudaMemcpyAsync(cpu_stats.data(), gpu_stats, sizeof(ImageStatistics), cudaMemcpyDeviceToHost, *stream);

    cudaStreamSynchronize(*stream);

    return cpu_stats[0];
}