MXAnalysisWithoutFPGA: Better synchronize preprocessing, azint and spot finding on GPU

2026-04-22 21:33:44 +02:00
parent f208109717
commit e51a45a5ea
11 changed files with 48 additions and 82 deletions
@@ -50,19 +50,16 @@ __global__ void preprocess_kernel(

        // Output
        output[i] =
-            is_masked ? INT32_MIN :
-            is_sat    ? INT32_MAX :
-            is_err    ? INT32_MIN :
-                        (int32_t)v;
+                is_masked ? INT32_MIN : is_sat ? INT32_MAX : is_err ? INT32_MIN : (int32_t) v;

        // Counters
-        local_masked   += is_masked;
-        local_saturated+= (!is_masked && is_sat);
-        local_error    += (!is_masked && !is_sat && is_err);
+        local_masked += is_masked;
+        local_saturated += (!is_masked && is_sat);
+        local_error += (!is_masked && !is_sat && is_err);

        // Min/max only for valid
        if (valid) {
-            int64_t val = (int64_t)v;
+            int64_t val = (int64_t) v;
            if (val > local_max) local_max = val;
            if (val < local_min) local_min = val;
        }
@@ -74,8 +71,8 @@ __global__ void preprocess_kernel(
    atomicAdd(&s_error, local_error);

    if (local_min <= local_max) {
-        atomicMax((long long*)&s_max, (long long)local_max);
-        atomicMin((long long*)&s_min, (long long)local_min);
+        atomicMax((long long *) &s_max, (long long) local_max);
+        atomicMin((long long *) &s_min, (long long) local_min);
    }

    __syncthreads();
@@ -86,22 +83,21 @@ __global__ void preprocess_kernel(
        atomicAdd(&stats->saturated_pixel_count, s_saturated);
        atomicAdd(&stats->error_pixel_count, s_error);

-        atomicMax((long long*)&stats->max_value, (long long)s_max);
-        atomicMin((long long*)&stats->min_value, (long long)s_min);
+        atomicMax((long long *) &stats->max_value, (long long) s_max);
+        atomicMin((long long *) &stats->min_value, (long long) s_min);
    }
 }

-ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask)
+ImagePreprocessorGPU::ImagePreprocessorGPU(const DiffractionExperiment &experiment, const PixelMask &mask,
+                                           std::shared_ptr<CudaStream> stream)
    : ImagePreprocessor(experiment),
+      stream(stream),
      gpu_mask(npixels),
-      gpu_decompressed_image(npixels * sizeof(uint32_t)), // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
+      gpu_decompressed_image(npixels * sizeof(uint32_t)),  // Overshoot - if input image is 1- or 2-byte, then it is still fine, while memory loss is minimal
      gpu_image(npixels),
      gpu_stats(1),
      cpu_stats(1),
      cpu_stats_reg(cpu_stats) {
-
-    stream = std::make_shared<CudaStream>();
-
    // Setup mask
    std::vector<uint8_t> mask_vec(npixels);
    for (int i = 0; i < npixels; i++)
@@ -142,7 +138,6 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im
                                              const uint8_t *input,
                                              T err_value,
                                              T sat_value) {
-
    if (sat_value > saturation_limit)
        sat_value = static_cast<T>(saturation_limit);

@@ -150,7 +145,7 @@ ImageStatistics ImagePreprocessorGPU::Analyze(std::vector<int32_t> &processed_im

    cpu_stats[0] = ImageStatistics{.max_value = INT64_MIN, .min_value = INT64_MAX};
    cudaMemcpyAsync(gpu_stats, cpu_stats.data(), sizeof(ImageStatistics), cudaMemcpyHostToDevice, *stream);
-    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>> (
+    preprocess_kernel<T> <<< blocks, threads, 0, *stream >>>(
        reinterpret_cast<const T *>(gpu_decompressed_image.get()),
        gpu_mask,
        gpu_image,