MXAnalysisWithoutFPGA: Better synchronize preprocessing, azint and spot finding on GPU
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m40s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 14m26s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m21s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m54s
Build Packages / build:rpm (rocky8) (push) Successful in 17m54s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m58s
Build Packages / build:rpm (rocky9) (push) Successful in 18m35s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m44s
Build Packages / Generate python client (push) Successful in 1m28s
Build Packages / Build documentation (push) Successful in 1m59s
Build Packages / Create release (push) Has been skipped
Build Packages / build:rpm (ubuntu2204) (push) Successful in 10m9s
Build Packages / XDS test (durin plugin) (push) Successful in 9m19s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m0s
Build Packages / XDS test (neggia plugin) (push) Successful in 8m47s
Build Packages / DIALS test (push) Successful in 12m36s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 9m57s
Build Packages / Unit tests (push) Successful in 1h1m21s

This commit is contained in:
2026-04-22 21:33:44 +02:00
parent f208109717
commit e51a45a5ea
11 changed files with 48 additions and 82 deletions
@@ -50,19 +50,16 @@ __global__ void preprocess_kernel(
// Output
output[i] =
is_masked ? INT32_MIN :
is_sat ? INT32_MAX :
is_err ? INT32_MIN :
(int32_t)v;
is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;
// Counters
local_masked += is_masked;
local_saturated+= (!is_masked && is_sat);
local_error += (!is_masked && !is_sat && is_err);
local_masked += is_masked;
local_saturated += (!is_masked && is_sat);
local_error += (!is_masked && !is_sat && is_err);
// Min/max only for valid
if (valid) {
int64_t val = (int64_t)v;
int64_t val = (int64_t) v;
if (val > local_max) local_max = val;
if (val < local_min) local_min = val;
}
@@ -74,8 +71,8 @@ __global__ void preprocess_kernel(
atomicAdd(&s_error, local_error);
if (local_min <= local_max) {
atomicMax((long long*)&s_max, (long long)local_max);
atomicMin((long long*)&s_min, (long long)local_min);
atomicMax((long long *) &s_max, (long long) local_max);
atomicMin((long long *) &s_min, (long long) local_min);
}
__syncthreads();
@@ -86,22 +83,21 @@ __global__ void preprocess_kernel(
atomicAdd(&stats->saturated_pixel_count, s_saturated);
atomicAdd(&stats->error_pixel_count, s_error);
atomicMax((long long*)&stats->max_value, (long long)s_max);
atomicMin((long long*)&stats->min_value, (long long)s_min);
atomicMax((long long *) &stats->max_value, (long long) s_max);
atomicMin((long long *) &stats->min_value, (long long) s_min);
}
}
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
std::shared_ptr<CudaStream> stream)
: ImagePreprocessor(experiment),
stream(stream),
gpu_mask(npixels),
gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
gpu_image(npixels),
gpu_stats(1),
cpu_stats(1),
cpu_stats_reg(cpu_stats) {
stream = std::make_shared<CudaStream>();
// Setup mask
std::vector<uint8_t> mask_vec(npixels);
for (int i = 0; i < npixels; i++)
@@ -142,7 +138,6 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
const uint8_t *input,
T err_value,
T sat_value) {
if (sat_value > saturation_limit)
sat_value = static_cast<T>(saturation_limit);
@@ -150,7 +145,7 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
reinterpret_cast<const T *>(gpu_decompressed_image.get()),
gpu_mask,
gpu_image,