Add CUDA cluster finder kernel with host launcher

Implements a GPU version of the sequential ClusterFinder for
single-frame cluster reconstruction.

Kernel (ClusterFinderCUDA.cuh):
- Shared memory tiling with generalized halo loading for arbitrary
  cluster sizes (3x3, 5x5, ...)
- Zero-initialization of shared memory to handle image boundary
  and partial edge-block cases.
- Pedestal subtraction during shared memory loading.
- Compile-time cluster geometry enabling full loop unrolling
  of the stencil reduction
- Atomic global counter for lock-free cluster output across blocks.
- RAII host wrapper struct `ClusterFinderCUDA` that owns the device buffers and CUDA stream (usage sketch below).
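
A minimal host-side usage sketch (illustrative: `frame`, `pd_mean`, `pd_sum2`,
`nrows`/`ncols`, `frame_number` and `n_pd_samples` are placeholders supplied by
the surrounding application):

    using Cl = aare::Cluster<int32_t, 3, 3>;
    aare::device::ClusterFinderCUDA<Cl> finder(nrows, ncols);
    finder.upload_pedestal(pd_mean.data(), pd_sum2.data());

    std::vector<Cl> clusters(finder.max_clusters);
    uint32_t n_clusters = 0;
    finder.find_clusters(frame.data(), frame_number, /*nSigma=*/5.0,
                         n_pd_samples, clusters.data(), n_clusters);
    clusters.resize(n_clusters);  // keep only the clusters actually found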
Author: Khalil Daniel Ferjaoui
Date:   2026-04-08 16:20:43 +02:00
parent a6afa45b3b
commit a43814801a
2 changed files with 452 additions and 0 deletions
ClusterFinderCUDA.cuh  +438
@@ -0,0 +1,438 @@
// SPDX-License-Identifier: MPL-2.0
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
#include <cmath>
#include <type_traits>
#include <stdexcept>
#include <cstdio>
#include "aare/Cluster.hpp"
#include "aare/utils/cuda_check.cuh"
namespace aare::device {
// Copied from ClusterFinder.hpp;
// TODO: consider including that header instead of duplicating the trait.
template <typename ClusterType,
typename = std::enable_if_t<is_cluster_v<ClusterType>>>
struct no_2x2_cluster {
constexpr static bool value =
ClusterType::cluster_size_x > 2 && ClusterType::cluster_size_y > 2;
};
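// For example, Cluster<int32_t, 3, 3> and Cluster<int32_t, 5, 5> satisfy this
// trait; 2x2 (and 2x3 / 3x2) geometries are rejected at compile time by the
// enable_if constraint on the kernel below.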
// __________________
//
// CUDA kernel
// __________________
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double,
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
__global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d_frame,
const PEDESTAL_TYPE* __restrict__ d_pd_mean,
const PEDESTAL_TYPE* __restrict__ d_pd_sum2,
const uint32_t n_pd_samples,
const PEDESTAL_TYPE m_nSigma,
const size_t nrows,
const size_t ncols,
const uint64_t frame_number,
ClusterType* d_clusters,
uint32_t* d_cluster_count,
const uint32_t max_clusters)
{
using CT = typename ClusterType::value_type;
// Compile-time cluster geometry useful for unrolling loops
constexpr uint8_t CSX = ClusterType::cluster_size_x;
constexpr uint8_t CSY = ClusterType::cluster_size_y;
constexpr int col_radius = CSX / 2;
constexpr int row_radius = CSY / 2;
// Squared threshold constants; avoids sqrt at runtime
// c2^2 is for the 2x2 quadrant test
// c3^2 is for the full-cluster total test
constexpr int pow2_c2 = ((CSY + 1) / 2) * ((CSX + 1) / 2);
constexpr int pow2_c3 = CSX * CSY;
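// For the default 3x3 cluster: pow2_c2 = 2 * 2 = 4 (c2 = 2) and pow2_c3 = 9 (c3 = 3).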
// Thread/pixel mapping
auto col_global = static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
auto row_global = static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
auto global_tid = static_cast<ssize_t>(col_global + ncols * row_global);
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
// ====================
// Shared memory layout
// ====================
// The tile is laid out contiguously in a 1D configuration:
// [0 ... tile_size-1] = pedestal-subtracted frame values
//
// Each tile has (blockDim.x + 2*col_radius) x (blockDim.y + 2*row_radius)
// elements, with a halo of col_radius/row_radius pixels on each side.
// CUDA needs a raw byte buffer plus an aligned cast here: declaring the tile
// as `extern __shared__ T sh[];` with a template-dependent T fails to compile.
extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[];
PEDESTAL_TYPE* shmem = reinterpret_cast<PEDESTAL_TYPE*>(smem);
// Stride includes halo on both sides
auto shmem_stride = static_cast<int>(blockDim.x) + 2 * col_radius;
auto tile_size = shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
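// Example: a 16x16 block with a 3x3 cluster (radius 1) gives
// shmem_stride = 16 + 2 = 18 and tile_size = 18 * 18 = 324 elements.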
// Offset so that thread (0,0) maps to shared-memory position
// (row_radius, col_radius) i.e. past the top-left halo.
auto shmem_tid = (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride
+ (static_cast<int>(threadIdx.x) + col_radius);
// Cooperative zero-fill
for (int idx = static_cast<int>(local_tid); idx < tile_size; idx += static_cast<int>(blockDim.x * blockDim.y)) {
shmem[idx] = PEDESTAL_TYPE{0};
}
__syncthreads();
// OOB flag
bool valid_pixel = col_global < static_cast<ssize_t>(ncols) &&
row_global < static_cast<ssize_t>(nrows);
// ======================================================
// Load pedestal-subtracted frame data into shared memory
// ======================================================
// Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB.
// gr, gc are the global row/col of the pixel to load.
// Returns the pedestal-subtracted value.
auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
auto gid = gc + static_cast<ssize_t>(ncols) * gr;
return static_cast<PEDESTAL_TYPE>(d_frame[gid]) - d_pd_mean[gid];
};
// A. Interior: every valid thread loads its own pixel
if (valid_pixel) {
shmem[shmem_tid] = load_pixel(row_global, col_global);
}
// B. Halo regions (Boundaries)
// B.1 Top rows of the halo
if (threadIdx.y == 0 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global - i >= 0)
shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global);
}
// Top-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global - j >= 0)
shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j);
}
// Top-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j);
}
}
// B.2 Left column of the halo
if (threadIdx.x == 0 && valid_pixel) {
for (int j = 1; j <= col_radius; ++j)
if (col_global - j >= 0)
shmem[shmem_tid - j] = load_pixel(row_global, col_global - j);
}
// B.3 Right column of the halo
if (threadIdx.x == blockDim.x - 1 && valid_pixel) {
for (int j = 1; j <= col_radius; ++j)
if (col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + j] = load_pixel(row_global, col_global + j);
}
// B.4 Bottom rows of the halo
if (threadIdx.y == blockDim.y - 1 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global + i < static_cast<ssize_t>(nrows))
shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global);
}
// Bottom-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global - j >= 0)
shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j);
}
// Bottom-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j);
}
}
__syncthreads();
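// At this point every in-bounds tile position holds its pedestal-subtracted
// value; out-of-bounds halo positions keep the zero from the cooperative fill.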
// =====================
// Cluster-finding logic
// =====================
if (!valid_pixel) return;
// Per-pixel RMS from global pedestal arrays
// rms = sqrt( E[X^2] - E[X]^2 )
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
PEDESTAL_TYPE var_px = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
// Pedestal-subtracted value of the center pixel (already in shmem)
PEDESTAL_TYPE val_pixel = shmem[shmem_tid];
// Negative pedestal early exit
if (val_pixel < -m_nSigma * rms_px)
return; // NOTE: pedestal update for this pixel is skipped (same as sequential)
// Stencil reduction: total, max, quadrant sums
PEDESTAL_TYPE total = PEDESTAL_TYPE{0};
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits<double>::lowest()`
PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, ic<=0)
PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, ic>=0)
PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, ic<=0)
PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, ic>=0)
CT clusterData[CSX * CSY];
int idx = 0; // tracks the pixels in the cluster
#pragma unroll
for (int ir = -row_radius; ir <= row_radius; ++ir) {
#pragma unroll
for (int ic = -col_radius; ic <= col_radius; ++ic) {
PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic];
total += val;
max_val = max(max_val, val);
// Quadrant accumulation (pixels on the axes contribute to two quadrants)
if (ir <= 0 && ic <= 0) tl += val;
if (ir <= 0 && ic >= 0) tr += val;
if (ir >= 0 && ic <= 0) bl += val;
if (ir >= 0 && ic >= 0) br += val;
// Store pedestal-subtracted value in register array for later cluster output
if constexpr (std::is_integral_v<CT> && std::is_floating_point_v<PEDESTAL_TYPE>)
clusterData[idx] = static_cast<CT>(lround(val));
else
clusterData[idx] = static_cast<CT>(val);
idx++;
}
}
// Three-way classification (mirrors ClusterFinder's logic)
//
// 1. Single-pixel significance: max_val > nSigma * rms
// -> only the pixel that IS the max gets recorded (local-max suppression)
//
// 2. Quadrant significance: max(tl,tr,bl,br) > c2 * nSigma * rms
// -> charge-sharing events where a 2x2 sub-region is significant (incomplete in ClusterFinder)
//
// 3. Total significance: total > c3 * nSigma * rms
// -> distributed events where the full cluster sum is significant
//
// NOTE: squared comparisons are used wherever possible to avoid computing
// the slower `sqrt()` for c2 and c3 at runtime:
// (x > c * nSigma * rms) <--> (x > 0 && x² > c² * nSigma² * rms²)
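// Worked example (illustrative numbers): for a 3x3 cluster with nSigma = 5 and
// rms = 10, a single pixel fires above 50, a 2x2 quadrant sum above
// c2 * 50 = 100, and the full 3x3 total above c3 * 50 = 150.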
PEDESTAL_TYPE nSig_sq_rms_sq = m_nSigma * m_nSigma * rms_sq;
bool is_photon = false;
// Test 1: single-pixel significance
if (max_val > m_nSigma * rms_px) {
// Local-max suppression: only the center-pixel thread records the cluster
if (val_pixel < max_val)
return; // some other pixel in the neighborhood is brighter
is_photon = true;
}
// Test 2: quadrant significance (only if test 1 didn't fire)
if (!is_photon) {
PEDESTAL_TYPE max_quad = max(max(tl, tr), max(bl, br));
if (max_quad > 0 && max_quad * max_quad > pow2_c2 * nSig_sq_rms_sq) {
is_photon = true;
}
}
// Test 3: total significance (only if tests 1 & 2 didn't fire)
if (!is_photon) {
if (total > 0 && total * total > pow2_c3 * nSig_sq_rms_sq) {
is_photon = true;
}
}
// Pedestal update (if not a photon)
// TODO: Pedestal update is omitted here. In the sequential code,
// non-photon pixels feed back into the running pedestal via push_fast().
// On GPU this requires atomic updates to d_pd_sum / d_pd_sum2 which
// possibly degrade performance...
if (!is_photon)
return;
// Write cluster to global output buffer using atomic index
// for coordination across all blocks
uint32_t write_idx = atomicAdd(d_cluster_count, 1u);
// Guard against overflowing the pre-allocated cluster buffer
if (write_idx >= max_clusters)
return;
ClusterType cluster{};
cluster.x = static_cast<decltype(cluster.x)>(col_global);
cluster.y = static_cast<decltype(cluster.y)>(row_global);
// CT* cdata = cluster.data();
// #pragma unroll
// for (int i = 0; i < CSX * CSY; ++i) {
// cdata[i] = clusterData[i];
// }
memcpy(reinterpret_cast<CT*>(&cluster.data), clusterData, sizeof(CT) * CSX * CSY);
d_clusters[write_idx] = cluster;
}
// __________________
//
// Host-side launcher
// __________________
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double>
struct ClusterFinderCUDA {
static constexpr int BLOCK_X = 16;
static constexpr int BLOCK_Y = 16;
static constexpr int col_radius = ClusterType::cluster_size_x / 2;
static constexpr int row_radius = ClusterType::cluster_size_y / 2;
size_t nrows;
size_t ncols;
uint32_t max_clusters;
// Device pointers
FRAME_TYPE* d_frame = nullptr;
PEDESTAL_TYPE* d_pd_mean = nullptr;
PEDESTAL_TYPE* d_pd_sum2 = nullptr;
ClusterType* d_clusters = nullptr;
uint32_t* d_cluster_count = nullptr;
cudaStream_t stream = 0;
ClusterFinderCUDA(size_t nrows_, size_t ncols_, uint32_t max_clusters_ = 1000000)
: nrows(nrows_), ncols(ncols_), max_clusters(max_clusters_)
{
size_t image_size = nrows * ncols;
CUDA_CHECK(cudaStreamCreate(&stream));
CUDA_CHECK(cudaMalloc(&d_frame, image_size * sizeof(FRAME_TYPE)));
CUDA_CHECK(cudaMalloc(&d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&d_clusters, max_clusters * sizeof(ClusterType)));
CUDA_CHECK(cudaMalloc(&d_cluster_count, sizeof(uint32_t)));
}
~ClusterFinderCUDA() {
if (d_frame) cudaFree(d_frame);
if (d_pd_mean) cudaFree(d_pd_mean);
if (d_pd_sum2) cudaFree(d_pd_sum2);
if (d_clusters) cudaFree(d_clusters);
if (d_cluster_count) cudaFree(d_cluster_count);
if (stream) cudaStreamDestroy(stream);
}
// Non-copyable and non-movable: a defaulted move would copy the raw device
// pointers, and the moved-from object's destructor would then free them a
// second time. A pointer-stealing move can be added later if needed.
ClusterFinderCUDA(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA(ClusterFinderCUDA&&) = delete;
ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete;
/**
* Upload pedestal statistics (mean and sum-of-squares) to device.
* These are typically computed on the host by the Pedestal class.
*/
void upload_pedestal(const PEDESTAL_TYPE* h_mean, const PEDESTAL_TYPE* h_sum2) {
size_t bytes = nrows * ncols * sizeof(PEDESTAL_TYPE);
CUDA_CHECK(cudaMemcpyAsync(d_pd_mean, h_mean, bytes, cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_pd_sum2, h_sum2, bytes, cudaMemcpyHostToDevice, stream));
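// The copies are asynchronous on `stream`; the kernel later launched by
// find_clusters() on the same stream is ordered after them, so no extra
// synchronization is needed in between.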
}
/**
* Find clusters in a single frame.
*
* @param h_frame Host pointer to the raw frame data (nrows x ncols)
* @param frame_number Frame sequence number
* @param nSigma Threshold in units of standard deviations
* @param n_pd_samples Number of samples used to compute the pedestal
* @param h_clusters Output: host buffer to receive found clusters
* @param h_n_clusters Output: number of clusters found
*/
void find_clusters(const FRAME_TYPE* h_frame,
uint64_t frame_number,
PEDESTAL_TYPE nSigma,
uint32_t n_pd_samples,
ClusterType* h_clusters,
uint32_t& h_n_clusters)
{
size_t image_bytes = nrows * ncols * sizeof(FRAME_TYPE);
// Reset cluster counter
CUDA_CHECK(cudaMemsetAsync(d_cluster_count, 0, sizeof(uint32_t), stream));
// Upload frame
CUDA_CHECK(cudaMemcpyAsync(d_frame, h_frame, image_bytes, cudaMemcpyHostToDevice, stream));
// Grid/Block dimensions
dim3 block(BLOCK_X, BLOCK_Y);
dim3 grid((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X, // grid/block dimensions are unsigned integers
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y
);
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles
size_t shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE);
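// Example: 16x16 blocks with a 3x3 cluster use 18 * 18 * sizeof(double) = 2592
// bytes of shared memory per block; a 512 x 1024 frame launches a 64 x 32 grid.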
// Kernel launch
find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
<<<grid, block, shmem_bytes, stream>>>(d_frame,
d_pd_mean,
d_pd_sum2,
n_pd_samples,
nSigma,
nrows,
ncols,
frame_number,
d_clusters,
d_cluster_count,
max_clusters
);
CUDA_CHECK(cudaGetLastError());
// Read back cluster count
CUDA_CHECK(cudaMemcpyAsync(&h_n_clusters, d_cluster_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
// Synchronize to ensure count is available before the CPU reads clusters
CUDA_CHECK(cudaStreamSynchronize(stream));
// Clamp to max in case of overflow
if (h_n_clusters > max_clusters) h_n_clusters = max_clusters;
// Read back clusters
if (h_n_clusters > 0) {
CUDA_CHECK(cudaMemcpyAsync(h_clusters, d_clusters, h_n_clusters * sizeof(ClusterType), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
};
} // namespace aare::device
aare/utils/cuda_check.cuh  +14
@@ -0,0 +1,14 @@
#pragma once
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
inline void cuda_check_impl(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n",
file, line, cudaGetErrorString(err));
std::exit(1);
}
}
#define CUDA_CHECK(stmt) cuda_check_impl((stmt), __FILE__, __LINE__)