Add CUDA cluster finder kernel with host launcher

Implements a GPU version of the sequential ClusterFinder for
single-frame cluster reconstruction.

Kernel (ClusterFinderCUDA.cuh):
- Shared memory tiling with generalized halo loading for arbitrary
  cluster sizes (3x3, 5x5, ...)
- Zero-initialization of shared memory to handle image boundary
  and partial edge-block cases.
- Pedestal subtraction during shared memory loading.
- Compile-time cluster geometry enabling full loop unrolling
  of the stencil reduction
- Atomic global counter for lock-free cluster output across blocks.
- RAII host wrapper struct `ClusterFinderCUDA` that owns the device buffers and CUDA stream (usage sketch below).
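
A minimal host-side usage sketch (illustrative: `frame`, `pd_mean`, `pd_sum2`,
`nrows`/`ncols`, `frame_number` and `n_pd_samples` are placeholders supplied by
the surrounding application):

    using Cl = aare::Cluster<int32_t, 3, 3>;
    aare::device::ClusterFinderCUDA<Cl> finder(nrows, ncols);
    finder.upload_pedestal(pd_mean.data(), pd_sum2.data());

    std::vector<Cl> clusters(finder.max_clusters);
    uint32_t n_clusters = 0;
    finder.find_clusters(frame.data(), frame_number, /*nSigma=*/5.0,
                         n_pd_samples, clusters.data(), n_clusters);
    clusters.resize(n_clusters);  // keep only the clusters actually found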
Author: Khalil Daniel Ferjaoui
Date:   2026-04-08 16:20:43 +02:00
parent a6afa45b3b
commit a43814801a
2 changed files with 452 additions and 0 deletions
ClusterFinderCUDA.cuh  +438
@@ -0,0 +1,438 @@
// SPDX-License-Identifier: MPL-2.0
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
#include <cmath>
#include <type_traits>
#include <stdexcept>
#include <cstdio>
#include "aare/Cluster.hpp"
#include "aare/utils/cuda_check.cuh"
namespace aare::device {
// Copied from ClusterFinder.hpp;
// TODO: consider including that header instead of duplicating the trait.
template <typename ClusterType,
typename = std::enable_if_t<is_cluster_v<ClusterType>>>
struct no_2x2_cluster {
constexpr static bool value =
ClusterType::cluster_size_x > 2 && ClusterType::cluster_size_y > 2;
};
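// For example, Cluster<int32_t, 3, 3> and Cluster<int32_t, 5, 5> satisfy this
// trait; 2x2 (and 2x3 / 3x2) geometries are rejected at compile time by the
// enable_if constraint on the kernel below.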
// __________________
//
// CUDA kernel
// __________________
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double,
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
__global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d_frame,
const PEDESTAL_TYPE* __restrict__ d_pd_mean,
const PEDESTAL_TYPE* __restrict__ d_pd_sum2,
const uint32_t n_pd_samples,
const PEDESTAL_TYPE m_nSigma,
const size_t nrows,
const size_t ncols,
const uint64_t frame_number,
ClusterType* d_clusters,
uint32_t* d_cluster_count,
const uint32_t max_clusters)
{
using CT = typename ClusterType::value_type;
// Compile-time cluster geometry useful for unrolling loops
constexpr uint8_t CSX = ClusterType::cluster_size_x;
constexpr uint8_t CSY = ClusterType::cluster_size_y;
constexpr int col_radius = CSX / 2;
constexpr int row_radius = CSY / 2;
// Squared threshold constants; avoids sqrt at runtime
// c2^2 is for the 2x2 quadrant test
// c3^2 is for the full-cluster total test
constexpr int pow2_c2 = ((CSY + 1) / 2) * ((CSX + 1) / 2);
constexpr int pow2_c3 = CSX * CSY;
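// For the default 3x3 cluster: pow2_c2 = 2 * 2 = 4 (c2 = 2) and pow2_c3 = 9 (c3 = 3).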
// Thread/pixel mapping
auto col_global = static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
auto row_global = static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
auto global_tid = static_cast<ssize_t>(col_global + ncols * row_global);
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
// ====================
// Shared memory layout
// ====================
// The tile is laid out contiguously in a 1D configuration:
// [0 ... tile_size-1] = pedestal-subtracted frame values
//
// Each tile has (blockDim.x + 2*col_radius) x (blockDim.y + 2*row_radius)
// elements, with a halo of col_radius/row_radius pixels on each side.
// CUDA needs a raw byte buffer plus an aligned cast here: declaring the tile
// as `extern __shared__ T sh[];` with a template-dependent T fails to compile.
extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[];
PEDESTAL_TYPE* shmem = reinterpret_cast<PEDESTAL_TYPE*>(smem);
// Stride includes halo on both sides
auto shmem_stride = static_cast<int>(blockDim.x) + 2 * col_radius;
auto tile_size = shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
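// Example: a 16x16 block with a 3x3 cluster (radius 1) gives
// shmem_stride = 16 + 2 = 18 and tile_size = 18 * 18 = 324 elements.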
// Offset so that thread (0,0) maps to shared-memory position
// (row_radius, col_radius) i.e. past the top-left halo.
auto shmem_tid = (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride
+ (static_cast<int>(threadIdx.x) + col_radius);
// Cooperative zero-fill
for (int idx = static_cast<int>(local_tid); idx < tile_size; idx += static_cast<int>(blockDim.x * blockDim.y)) {
shmem[idx] = PEDESTAL_TYPE{0};
}
__syncthreads();
// OOB flag
bool valid_pixel = col_global < static_cast<ssize_t>(ncols) &&
row_global < static_cast<ssize_t>(nrows);
// ======================================================
// Load pedestal-subtracted frame data into shared memory
// ======================================================
// Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB.
// gr, gc are the global row/col of the pixel to load.
// Returns the pedestal-subtracted value.
auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
auto gid = gc + static_cast<ssize_t>(ncols) * gr;
return static_cast<PEDESTAL_TYPE>(d_frame[gid]) - d_pd_mean[gid];
};
// A. Interior: every valid thread loads its own pixel
if (valid_pixel) {
shmem[shmem_tid] = load_pixel(row_global, col_global);
}
// B. Halo regions (Boundaries)
// B.1 Top rows of the halo
if (threadIdx.y == 0 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global - i >= 0)
shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global);
}
// Top-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global - j >= 0)
shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j);
}
// Top-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j);
}
}
// B.2 Left column of the halo
if (threadIdx.x == 0 && valid_pixel) {
for (int j = 1; j <= col_radius; ++j)
if (col_global - j >= 0)
shmem[shmem_tid - j] = load_pixel(row_global, col_global - j);
}
// B.3 Right column of the halo
if (threadIdx.x == blockDim.x - 1 && valid_pixel) {
for (int j = 1; j <= col_radius; ++j)
if (col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + j] = load_pixel(row_global, col_global + j);
}
// B.4 Bottom rows of the halo
if (threadIdx.y == blockDim.y - 1 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global + i < static_cast<ssize_t>(nrows))
shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global);
}
// Bottom-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global - j >= 0)
shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j);
}
// Bottom-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j);
}
}
__syncthreads();
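// At this point every in-bounds tile position holds its pedestal-subtracted
// value; out-of-bounds halo positions keep the zero from the cooperative fill.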
// =====================
// Cluster-finding logic
// =====================
if (!valid_pixel) return;
// Per-pixel RMS from global pedestal arrays
// rms = sqrt( E[X^2] - E[X]^2 )
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
PEDESTAL_TYPE var_px = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
// Pedestal-subtracted value of the center pixel (already in shmem)
PEDESTAL_TYPE val_pixel = shmem[shmem_tid];
// Negative pedestal early exit
if (val_pixel < -m_nSigma * rms_px)
return; // NOTE: pedestal update for this pixel is skipped (same as sequential)
// Stencil reduction: total, max, quadrant sums
PEDESTAL_TYPE total = PEDESTAL_TYPE{0};
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits<double>::lowest()`
PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, ic<=0)
PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, ic>=0)
PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, ic<=0)
PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, ic>=0)
CT clusterData[CSX * CSY];
int idx = 0; // tracks the pixels in the cluster
#pragma unroll
for (int ir = -row_radius; ir <= row_radius; ++ir) {
#pragma unroll
for (int ic = -col_radius; ic <= col_radius; ++ic) {
PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic];
total += val;
max_val = max(max_val, val);
// Quadrant accumulation (pixels on the axes contribute to two quadrants)
if (ir <= 0 && ic <= 0) tl += val;
if (ir <= 0 && ic >= 0) tr += val;
if (ir >= 0 && ic <= 0) bl += val;
if (ir >= 0 && ic >= 0) br += val;
// Store pedestal-subtracted value in register array for later cluster output
if constexpr (std::is_integral_v<CT> && std::is_floating_point_v<PEDESTAL_TYPE>)
clusterData[idx] = static_cast<CT>(lround(val));
else
clusterData[idx] = static_cast<CT>(val);
idx++;
}
}
// Three-way classification (mirrors ClusterFinder's logic)
//
// 1. Single-pixel significance: max_val > nSigma * rms
// -> only the pixel that IS the max gets recorded (local-max suppression)
//
// 2. Quadrant significance: max(tl,tr,bl,br) > c2 * nSigma * rms
// -> charge-sharing events where a 2x2 sub-region is significant (incomplete in ClusterFinder)
//
// 3. Total significance: total > c3 * nSigma * rms
// -> distributed events where the full cluster sum is significant
//
// NOTE: squared comparisons are used wherever possible to avoid computing
// the slower `sqrt()` for c2 and c3 at runtime:
// (x > c * nSigma * rms) <--> (x > 0 && x² > c² * nSigma² * rms²)
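// Worked example (illustrative numbers): for a 3x3 cluster with nSigma = 5 and
// rms = 10, a single pixel fires above 50, a 2x2 quadrant sum above
// c2 * 50 = 100, and the full 3x3 total above c3 * 50 = 150.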
PEDESTAL_TYPE nSig_sq_rms_sq = m_nSigma * m_nSigma * rms_sq;
bool is_photon = false;
// Test 1: single-pixel significance
if (max_val > m_nSigma * rms_px) {
// Local-max suppression: only the center-pixel thread records the cluster
if (val_pixel < max_val)
return; // some other pixel in the neighborhood is brighter
is_photon = true;
}
// Test 2: quadrant significance (only if test 1 didn't fire)
if (!is_photon) {
PEDESTAL_TYPE max_quad = max(max(tl, tr), max(bl, br));
if (max_quad > 0 && max_quad * max_quad > pow2_c2 * nSig_sq_rms_sq) {
is_photon = true;
}
}
// Test 3: total significance (only if tests 1 & 2 didn't fire)
if (!is_photon) {
if (total > 0 && total * total > pow2_c3 * nSig_sq_rms_sq) {
is_photon = true;
}
}
// Pedestal update (if not a photon)
// TODO: Pedestal update is omitted here. In the sequential code,
// non-photon pixels feed back into the running pedestal via push_fast().
// On GPU this requires atomic updates to d_pd_sum / d_pd_sum2 which
// possibly degrade performance...
if (!is_photon)
return;
// Write cluster to global output buffer using atomic index
// for coordination across all blocks
uint32_t write_idx = atomicAdd(d_cluster_count, 1u);
// Guard against overflowing the pre-allocated cluster buffer
if (write_idx >= max_clusters)
return;
ClusterType cluster{};
cluster.x = static_cast<decltype(cluster.x)>(col_global);
cluster.y = static_cast<decltype(cluster.y)>(row_global);
// CT* cdata = cluster.data();
// #pragma unroll
// for (int i = 0; i < CSX * CSY; ++i) {
// cdata[i] = clusterData[i];
// }
memcpy(reinterpret_cast<CT*>(&cluster.data), clusterData, sizeof(CT) * CSX * CSY);
d_clusters[write_idx] = cluster;
}
// __________________
//
// Host-side launcher
// __________________
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double>
struct ClusterFinderCUDA {
static constexpr int BLOCK_X = 16;
static constexpr int BLOCK_Y = 16;
static constexpr int col_radius = ClusterType::cluster_size_x / 2;
static constexpr int row_radius = ClusterType::cluster_size_y / 2;
size_t nrows;
size_t ncols;
uint32_t max_clusters;
// Device pointers
FRAME_TYPE* d_frame = nullptr;
PEDESTAL_TYPE* d_pd_mean = nullptr;
PEDESTAL_TYPE* d_pd_sum2 = nullptr;
ClusterType* d_clusters = nullptr;
uint32_t* d_cluster_count = nullptr;
cudaStream_t stream = 0;
ClusterFinderCUDA(size_t nrows_, size_t ncols_, uint32_t max_clusters_ = 1000000)
: nrows(nrows_), ncols(ncols_), max_clusters(max_clusters_)
{
size_t image_size = nrows * ncols;
CUDA_CHECK(cudaStreamCreate(&stream));
CUDA_CHECK(cudaMalloc(&d_frame, image_size * sizeof(FRAME_TYPE)));
CUDA_CHECK(cudaMalloc(&d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&d_clusters, max_clusters * sizeof(ClusterType)));
CUDA_CHECK(cudaMalloc(&d_cluster_count, sizeof(uint32_t)));
}
~ClusterFinderCUDA() {
if (d_frame) cudaFree(d_frame);
if (d_pd_mean) cudaFree(d_pd_mean);
if (d_pd_sum2) cudaFree(d_pd_sum2);
if (d_clusters) cudaFree(d_clusters);
if (d_cluster_count) cudaFree(d_cluster_count);
if (stream) cudaStreamDestroy(stream);
}
// Non-copyable and non-movable: a defaulted move would copy the raw device
// pointers, and the moved-from object's destructor would then free them a
// second time. A pointer-stealing move can be added later if needed.
ClusterFinderCUDA(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA(ClusterFinderCUDA&&) = delete;
ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete;
/**
* Upload pedestal statistics (mean and sum-of-squares) to device.
* These are typically computed on the host by the Pedestal class.
*/
void upload_pedestal(const PEDESTAL_TYPE* h_mean, const PEDESTAL_TYPE* h_sum2) {
size_t bytes = nrows * ncols * sizeof(PEDESTAL_TYPE);
CUDA_CHECK(cudaMemcpyAsync(d_pd_mean, h_mean, bytes, cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_pd_sum2, h_sum2, bytes, cudaMemcpyHostToDevice, stream));
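// The copies are asynchronous on `stream`; the kernel later launched by
// find_clusters() on the same stream is ordered after them, so no extra
// synchronization is needed in between.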
}
/**
* Find clusters in a single frame.
*
* @param h_frame Host pointer to the raw frame data (nrows x ncols)
* @param frame_number Frame sequence number
* @param nSigma Threshold in units of standard deviations
* @param n_pd_samples Number of samples used to compute the pedestal
* @param h_clusters Output: host buffer to receive found clusters
* @param h_n_clusters Output: number of clusters found
*/
void find_clusters(const FRAME_TYPE* h_frame,
uint64_t frame_number,
PEDESTAL_TYPE nSigma,
uint32_t n_pd_samples,
ClusterType* h_clusters,
uint32_t& h_n_clusters)
{
size_t image_bytes = nrows * ncols * sizeof(FRAME_TYPE);
// Reset cluster counter
CUDA_CHECK(cudaMemsetAsync(d_cluster_count, 0, sizeof(uint32_t), stream));
// Upload frame
CUDA_CHECK(cudaMemcpyAsync(d_frame, h_frame, image_bytes, cudaMemcpyHostToDevice, stream));
// Grid/Block dimensions
dim3 block(BLOCK_X, BLOCK_Y);
dim3 grid((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X, // grid/block dimensions are unsigned integers
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y
);
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles
size_t shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE);
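// Example: 16x16 blocks with a 3x3 cluster use 18 * 18 * sizeof(double) = 2592
// bytes of shared memory per block; a 512 x 1024 frame launches a 64 x 32 grid.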
// Kernel launch
find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
<<<grid, block, shmem_bytes, stream>>>(d_frame,
d_pd_mean,
d_pd_sum2,
n_pd_samples,
nSigma,
nrows,
ncols,
frame_number,
d_clusters,
d_cluster_count,
max_clusters
);
CUDA_CHECK(cudaGetLastError());
// Read back cluster count
CUDA_CHECK(cudaMemcpyAsync(&h_n_clusters, d_cluster_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
// Synchronize to ensure count is available before the CPU reads clusters
CUDA_CHECK(cudaStreamSynchronize(stream));
// Clamp to max in case of overflow
if (h_n_clusters > max_clusters) h_n_clusters = max_clusters;
// Read back clusters
if (h_n_clusters > 0) {
CUDA_CHECK(cudaMemcpyAsync(h_clusters, d_clusters, h_n_clusters * sizeof(ClusterType), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
};
} // namespace aare::device
aare/utils/cuda_check.cuh  +14
@@ -0,0 +1,14 @@
#pragma once
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
inline void cuda_check_impl(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n",
file, line, cudaGetErrorString(err));
std::exit(1);
}
}
#define CUDA_CHECK(stmt) cuda_check_impl((stmt), __FILE__, __LINE__)