Format CUDA cluster finder files

2026-07-21 01:12:51 +02:00 · 2026-04-27 11:27:47 +02:00
parent 133cedf755
commit a086cbb897
8 changed files with 508 additions and 440 deletions
@@ -1,31 +1,29 @@
 // SPDX-License-Identifier: MPL-2.0
 #pragma once
-#include <cstdint>
-#include <cmath>
-#include <stdexcept>
-#include <cstdio>
 #include "aare/ClusterFinder.hpp"
 #include "aare/clusterfinder_kernel.cuh"
 #include "aare/utils/cuda_check.cuh"
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>

 namespace aare {
-    
+
 // Per-stream device resources
 template <typename ClusterType, typename FRAME_TYPE, typename PEDESTAL_TYPE>
-struct StreamContext{
-    cudaStream_t   stream           = nullptr;      // handle to the stream
-    FRAME_TYPE*    d_frame          = nullptr;
-    PEDESTAL_TYPE* d_pd_mean        = nullptr;
-    PEDESTAL_TYPE* d_pd_sum         = nullptr;
-    PEDESTAL_TYPE* d_pd_sum2        = nullptr;
-    ClusterType*   d_clusters       = nullptr;
-    uint32_t*      d_cluster_count  = nullptr;
+struct StreamContext {
+    cudaStream_t stream = nullptr; // handle to the stream
+    FRAME_TYPE *d_frame = nullptr;
+    PEDESTAL_TYPE *d_pd_mean = nullptr;
+    PEDESTAL_TYPE *d_pd_sum = nullptr;
+    PEDESTAL_TYPE *d_pd_sum2 = nullptr;
+    ClusterType *d_clusters = nullptr;
+    uint32_t *d_cluster_count = nullptr;
 };

-
 template <typename ClusterType = Cluster<int32_t, 3, 3>,
-          typename FRAME_TYPE = uint16_t,
-          typename PEDESTAL_TYPE = double,
+          typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
          typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
 class ClusterFinderCUDA {

@@ -33,25 +31,25 @@ class ClusterFinderCUDA {
    static constexpr int BLOCK_Y = 16;
    static constexpr int col_radius = ClusterType::cluster_size_x / 2;
    static constexpr int row_radius = ClusterType::cluster_size_y / 2;
-    
-    Shape<2> m_image_size;
-    size_t   nrows;
-    size_t   ncols;
-    size_t   image_size; // nrows * ncols
-    int      n_streams;
-    size_t   m_capacity;

-    PEDESTAL_TYPE               m_nSigma;
-    Pedestal<PEDESTAL_TYPE>     m_pedestal;
-    ClusterVector<ClusterType>  m_clusters;
-    bool                        m_pedestal_dirty = true;
-    
+    Shape<2> m_image_size;
+    size_t nrows;
+    size_t ncols;
+    size_t image_size; // nrows * ncols
+    int n_streams;
+    size_t m_capacity;
+
+    PEDESTAL_TYPE m_nSigma;
+    Pedestal<PEDESTAL_TYPE> m_pedestal;
+    ClusterVector<ClusterType> m_clusters;
+    bool m_pedestal_dirty = true;
+
    using SC = StreamContext<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>;
    std::vector<SC> v_sc;

    // Kernel parameters
-    dim3   grid;
-    dim3   block;
+    dim3 grid;
+    dim3 block;
    size_t shmem_bytes;

  public:
@@ -59,79 +57,87 @@ class ClusterFinderCUDA {
     * @brief Construct a ClusterFinderCUDA
     *
     * @param image_size              shape of the detector frame (rows, cols)
-     * @param nSigma                  threshold in units of per-pixel pedestal std
+     * @param nSigma                  threshold in units of per-pixel pedestal
+     * std
     * @param capacity                device-side cluster buffer size per stream
-     * @param n_streams               number of CUDA streams for multi-frame overlap
+     * @param n_streams               number of CUDA streams for multi-frame
+     * overlap
     */
-    ClusterFinderCUDA(Shape<2> image_size_,
-                      PEDESTAL_TYPE nSigma = 5.0,
-                      size_t capacity = 1000000,
-                      int n_streams_ = 1)
-        : m_image_size(image_size_),
-        nrows(image_size_[0]),
-        ncols(image_size_[1]),
-        image_size(nrows * ncols),
-        n_streams(n_streams_),
-        m_capacity(capacity),
-        m_nSigma(nSigma),
-        m_pedestal(image_size_[0], image_size_[1]),
-        m_clusters(capacity)
-    {
+    ClusterFinderCUDA(Shape<2> image_size_, PEDESTAL_TYPE nSigma = 5.0,
+                      size_t capacity = 1000000, int n_streams_ = 1)
+        : m_image_size(image_size_), nrows(image_size_[0]),
+          ncols(image_size_[1]), image_size(nrows * ncols),
+          n_streams(n_streams_), m_capacity(capacity), m_nSigma(nSigma),
+          m_pedestal(image_size_[0], image_size_[1]), m_clusters(capacity) {
        // Grid/Block dimensions
        block = dim3(BLOCK_X, BLOCK_Y);
-        grid  = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
-                     (static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
+        grid = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
+                    (static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
+
+        // Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y +
+        // 2*row_radius) doubles
+        shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) *
+                      sizeof(PEDESTAL_TYPE);

-        // Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles
-        shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE);
-        
        v_sc.resize(n_streams);
-        for(int k=0; k < n_streams; ++k) {
-            auto& sc = v_sc[k];
-            CUDA_CHECK(cudaStreamCreate(&sc.stream)); 
-            CUDA_CHECK(cudaMalloc(&sc.d_frame,         image_size * sizeof(FRAME_TYPE)));
-            CUDA_CHECK(cudaMalloc(&sc.d_pd_mean,       image_size * sizeof(PEDESTAL_TYPE)));
-            CUDA_CHECK(cudaMalloc(&sc.d_pd_sum,        image_size * sizeof(PEDESTAL_TYPE)));
-            CUDA_CHECK(cudaMalloc(&sc.d_pd_sum2,       image_size * sizeof(PEDESTAL_TYPE)));
-            CUDA_CHECK(cudaMalloc(&sc.d_clusters,      capacity * sizeof(ClusterType)));
+        for (int k = 0; k < n_streams; ++k) {
+            auto &sc = v_sc[k];
+            CUDA_CHECK(cudaStreamCreate(&sc.stream));
+            CUDA_CHECK(
+                cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE)));
+            CUDA_CHECK(
+                cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
+            CUDA_CHECK(
+                cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE)));
+            CUDA_CHECK(
+                cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
+            CUDA_CHECK(
+                cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType)));
            CUDA_CHECK(cudaMalloc(&sc.d_cluster_count, sizeof(uint32_t)));
-        }  
+        }
    }

    ~ClusterFinderCUDA() {
-        for(auto& sc : v_sc) {
-            if (sc.d_frame)         cudaFree(sc.d_frame);
-            if (sc.d_pd_mean)       cudaFree(sc.d_pd_mean);
-            if (sc.d_pd_sum)        cudaFree(sc.d_pd_sum);
-            if (sc.d_pd_sum2)       cudaFree(sc.d_pd_sum2);
-            if (sc.d_clusters)      cudaFree(sc.d_clusters);
-            if (sc.d_cluster_count) cudaFree(sc.d_cluster_count);
-            if (sc.stream)          cudaStreamDestroy(sc.stream);
+        for (auto &sc : v_sc) {
+            if (sc.d_frame)
+                cudaFree(sc.d_frame);
+            if (sc.d_pd_mean)
+                cudaFree(sc.d_pd_mean);
+            if (sc.d_pd_sum)
+                cudaFree(sc.d_pd_sum);
+            if (sc.d_pd_sum2)
+                cudaFree(sc.d_pd_sum2);
+            if (sc.d_clusters)
+                cudaFree(sc.d_clusters);
+            if (sc.d_cluster_count)
+                cudaFree(sc.d_cluster_count);
+            if (sc.stream)
+                cudaStreamDestroy(sc.stream);
        }
    }

    // Non-copyable, non-movable
-    ClusterFinderCUDA(const ClusterFinderCUDA&) = delete;
-    ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete;
-    ClusterFinderCUDA(ClusterFinderCUDA&&) = delete;
-    ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete;
+    ClusterFinderCUDA(const ClusterFinderCUDA &) = delete;
+    ClusterFinderCUDA &operator=(const ClusterFinderCUDA &) = delete;
+    ClusterFinderCUDA(ClusterFinderCUDA &&) = delete;
+    ClusterFinderCUDA &operator=(ClusterFinderCUDA &&) = delete;
+
+    void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
+    PEDESTAL_TYPE get_nSigma() const { return m_nSigma; }

-    void          set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
-    PEDESTAL_TYPE get_nSigma() const               { return m_nSigma; }
- 
    void push_pedestal_frame(NDView<FRAME_TYPE, 2> frame) {
        m_pedestal.push(frame);
        m_pedestal_dirty = true;
    }
- 
+
    void clear_pedestal() {
        m_pedestal.clear();
        m_pedestal_dirty = true;
    }
- 
+
    NDArray<PEDESTAL_TYPE, 2> pedestal() { return m_pedestal.mean(); }
-    NDArray<PEDESTAL_TYPE, 2> noise()    { return m_pedestal.std(); }
- 
+    NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
+
    /**
     * @brief Move clusters out of the internal ClusterVector, optionally
     *        reallocating the internal one with the same capacity.
@@ -150,52 +156,47 @@ class ClusterFinderCUDA {
     * @brief Find clusters in a single frame, appending them to the internal
     *        ClusterVector.
     */
-    void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) 
-    {    
+    void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) {
        if (m_pedestal_dirty) { // need to update the pedestal on the gpu
            sync_pedestal_to_device();
            m_pedestal_dirty = false;
        }

-        auto& sc = v_sc[0];
+        auto &sc = v_sc[0];
        const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
-        const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
+        const uint32_t n_pd_samples =
+            static_cast<uint32_t>(m_pedestal.n_samples());

        // Reset cluster counter
-        CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t), sc.stream));
-
-        // Upload frame
-        CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes, cudaMemcpyHostToDevice, sc.stream));
-
-        // Kernel launch
-        device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
-            <<<grid, block, shmem_bytes, sc.stream>>>(sc.d_frame,
-                                                   sc.d_pd_mean,
-                                                   sc.d_pd_sum,
-                                                   sc.d_pd_sum2,
-                                                   n_pd_samples,
-                                                   m_nSigma,
-                                                   nrows,
-                                                   ncols,
-                                                   sc.d_clusters,
-                                                   sc.d_cluster_count,
-                                                   static_cast<uint32_t>(m_capacity)
-            );
-        CUDA_CHECK(cudaGetLastError());
-        
-        // Read back cluster count
-        uint32_t n_found = 0;
-        CUDA_CHECK(cudaMemcpyAsync(&n_found,
-                                   sc.d_cluster_count, 
-                                   sizeof(uint32_t), 
-                                   cudaMemcpyDeviceToHost, 
+        CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t),
                                   sc.stream));

-        // Synchronize to ensure count is available before the CPU reads clusters
+        // Upload frame
+        CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes,
+                                   cudaMemcpyHostToDevice, sc.stream));
+
+        // Kernel launch
+        device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
+                                              PEDESTAL_TYPE>
+            <<<grid, block, shmem_bytes, sc.stream>>>(
+                sc.d_frame, sc.d_pd_mean, sc.d_pd_sum, sc.d_pd_sum2,
+                n_pd_samples, m_nSigma, nrows, ncols, sc.d_clusters,
+                sc.d_cluster_count, static_cast<uint32_t>(m_capacity));
+        CUDA_CHECK(cudaGetLastError());
+
+        // Read back cluster count
+        uint32_t n_found = 0;
+        CUDA_CHECK(cudaMemcpyAsync(&n_found, sc.d_cluster_count,
+                                   sizeof(uint32_t), cudaMemcpyDeviceToHost,
+                                   sc.stream));
+
+        // Synchronize to ensure count is available before the CPU reads
+        // clusters
        CUDA_CHECK(cudaStreamSynchronize(sc.stream));

        // Clamp to max in case of overflow
-        if (n_found > m_capacity) n_found = m_capacity;
+        if (n_found > m_capacity)
+            n_found = m_capacity;

        // Read back clusters
        m_clusters.set_frame_number(frame_number);
@@ -213,74 +214,79 @@ class ClusterFinderCUDA {
     */
    std::vector<ClusterVector<ClusterType>>
    find_clusters_batched(NDView<FRAME_TYPE, 3> frames,
-                          uint64_t first_frame = 0)
-    {
+                          uint64_t first_frame = 0) {
        if (m_pedestal_dirty) {
            sync_pedestal_to_device();
            m_pedestal_dirty = false;
        }
- 
-        const size_t n_frames     = frames.shape(0);
-        const size_t image_bytes  = image_size * sizeof(FRAME_TYPE);
-        const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
- 
+
+        const size_t n_frames = frames.shape(0);
+        const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
+        const uint32_t n_pd_samples =
+            static_cast<uint32_t>(m_pedestal.n_samples());
+
        std::vector<ClusterVector<ClusterType>> results;
        results.reserve(n_frames);
        for (size_t i = 0; i < n_frames; ++i) {
            results.emplace_back(m_capacity);
            results.back().set_frame_number(first_frame + i);
        }
- 
+
        std::vector<uint32_t> host_counts(n_streams, 0);
- 
+
        const size_t n_rounds = (n_frames + n_streams - 1) / n_streams;
        for (size_t round = 0; round < n_rounds; ++round) {
- 
+
            // Launch phase: fan out kernels on all streams for this round
            for (int k = 0; k < n_streams; ++k) {
                // OOB guard
                const size_t frame_idx = round * n_streams + k;
-                if (frame_idx >= n_frames) continue;
- 
-                auto& sc_k = v_sc[k];
-                const FRAME_TYPE* h_src = frames.data() + frame_idx * image_size;
- 
+                if (frame_idx >= n_frames)
+                    continue;
+
+                auto &sc_k = v_sc[k];
+                const FRAME_TYPE *h_src =
+                    frames.data() + frame_idx * image_size;
+
                CUDA_CHECK(cudaMemsetAsync(sc_k.d_cluster_count, 0,
                                           sizeof(uint32_t), sc_k.stream));
                CUDA_CHECK(cudaMemcpyAsync(sc_k.d_frame, h_src, image_bytes,
-                                           cudaMemcpyHostToDevice, sc_k.stream));
- 
-                device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
+                                           cudaMemcpyHostToDevice,
+                                           sc_k.stream));
+
+                device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
+                                                      PEDESTAL_TYPE>
                    <<<grid, block, shmem_bytes, sc_k.stream>>>(
-                        sc_k.d_frame,
-                        sc_k.d_pd_mean, sc_k.d_pd_sum, sc_k.d_pd_sum2,
-                        n_pd_samples, m_nSigma,
-                        nrows, ncols,
+                        sc_k.d_frame, sc_k.d_pd_mean, sc_k.d_pd_sum,
+                        sc_k.d_pd_sum2, n_pd_samples, m_nSigma, nrows, ncols,
                        sc_k.d_clusters, sc_k.d_cluster_count, m_capacity);
                CUDA_CHECK(cudaGetLastError());
            }
- 
+
            // Drain phase: fan in results from all streams
            for (int k = 0; k < n_streams; ++k) {
                const size_t frame_idx = round * n_streams + k;
-                if (frame_idx >= n_frames) continue;
- 
-                auto& sc_k = v_sc[k];
- 
-                CUDA_CHECK(cudaMemcpyAsync(&host_counts[k], sc_k.d_cluster_count,
-                                           sizeof(uint32_t),
-                                           cudaMemcpyDeviceToHost, sc_k.stream));
+                if (frame_idx >= n_frames)
+                    continue;
+
+                auto &sc_k = v_sc[k];
+
+                CUDA_CHECK(cudaMemcpyAsync(
+                    &host_counts[k], sc_k.d_cluster_count, sizeof(uint32_t),
+                    cudaMemcpyDeviceToHost, sc_k.stream));
                CUDA_CHECK(cudaStreamSynchronize(sc_k.stream));
- 
+
                uint32_t n_found = host_counts[k];
-                if (n_found > m_capacity) n_found = m_capacity;
- 
+                if (n_found > m_capacity)
+                    n_found = m_capacity;
+
                if (n_found > 0) {
-                    append_device_clusters_to(results[frame_idx], sc_k, n_found);
+                    append_device_clusters_to(results[frame_idx], sc_k,
+                                              n_found);
                }
            }
        }
- 
+
        return results;
    }

@@ -295,34 +301,35 @@ class ClusterFinderCUDA {
        // copies complete, so we synchronise at the end before they go out
        // of scope.
        NDArray<PEDESTAL_TYPE, 2> h_mean = m_pedestal.mean();
-        NDArray<PEDESTAL_TYPE, 2> h_sum  = m_pedestal.get_sum();
+        NDArray<PEDESTAL_TYPE, 2> h_sum = m_pedestal.get_sum();
        NDArray<PEDESTAL_TYPE, 2> h_sum2 = m_pedestal.get_sum2();
- 
+
        const size_t bytes = image_size * sizeof(PEDESTAL_TYPE);
-        for (auto& sc : v_sc) {
+        for (auto &sc : v_sc) {
            CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_mean, h_mean.data(), bytes,
                                       cudaMemcpyHostToDevice, sc.stream));
-            CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum,  h_sum.data(),  bytes,
+            CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes,
                                       cudaMemcpyHostToDevice, sc.stream));
            CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum2, h_sum2.data(), bytes,
                                       cudaMemcpyHostToDevice, sc.stream));
        }
-        for (auto& sc : v_sc) CUDA_CHECK(cudaStreamSynchronize(sc.stream));
+        for (auto &sc : v_sc)
+            CUDA_CHECK(cudaStreamSynchronize(sc.stream));
    }
- 
+
    /**
     * Copy n_found clusters from sc.d_clusters into the given ClusterVector
-     * and block on the transfer. 
+     * and block on the transfer.
     */
-    void append_device_clusters_to(ClusterVector<ClusterType>& cv,
-                                   SC& sc, uint32_t n_found)
-    {
+    void append_device_clusters_to(ClusterVector<ClusterType> &cv, SC &sc,
+                                   uint32_t n_found) {
        std::vector<ClusterType> staging(n_found);
        CUDA_CHECK(cudaMemcpyAsync(staging.data(), sc.d_clusters,
                                   n_found * sizeof(ClusterType),
                                   cudaMemcpyDeviceToHost, sc.stream));
        CUDA_CHECK(cudaStreamSynchronize(sc.stream));
-        for (const auto& c : staging) cv.push_back(c);
+        for (const auto &c : staging)
+            cv.push_back(c);
    }
 };

@@ -1,28 +1,22 @@
 #pragma once
-#include <cuda_runtime.h>
-#include <type_traits>
 #include "aare/Cluster.hpp"
 #include "aare/ClusterFinder.hpp"
+#include <cuda_runtime.h>
+#include <type_traits>

 namespace aare::device {

 template <typename ClusterType = Cluster<int32_t, 3, 3>,
-          typename FRAME_TYPE = uint16_t,
-          typename PEDESTAL_TYPE = double,
+          typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
          typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
-__global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d_frame,
-                                              PEDESTAL_TYPE* __restrict__ d_pd_mean,
-                                              PEDESTAL_TYPE* __restrict__ d_pd_sum,
-                                              PEDESTAL_TYPE* __restrict__ d_pd_sum2,
-                                              const uint32_t       n_pd_samples,
-                                              const PEDESTAL_TYPE  m_nSigma,
-                                              const size_t         nrows,
-                                              const size_t         ncols,
-                                            //   const uint64_t       frame_number,
-                                              ClusterType*         d_clusters,
-                                              uint32_t*            d_cluster_count,
-                                             const uint32_t       max_clusters)
-{
+__global__ void find_clusters_in_single_frame(
+    const FRAME_TYPE *__restrict__ d_frame,
+    PEDESTAL_TYPE *__restrict__ d_pd_mean, PEDESTAL_TYPE *__restrict__ d_pd_sum,
+    PEDESTAL_TYPE *__restrict__ d_pd_sum2, const uint32_t n_pd_samples,
+    const PEDESTAL_TYPE m_nSigma, const size_t nrows, const size_t ncols,
+    //   const uint64_t       frame_number,
+    ClusterType *d_clusters, uint32_t *d_cluster_count,
+    const uint32_t max_clusters) {
    using CT = typename ClusterType::value_type;

    // Compile-time cluster geometry useful for unrolling loops
@@ -38,10 +32,12 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    constexpr int pow2_c3 = CSX * CSY;

    // Thread/pixel mapping
-    auto col_global = static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
-    auto row_global = static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
+    auto col_global =
+        static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
+    auto row_global =
+        static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
    auto global_tid = static_cast<ssize_t>(col_global + ncols * row_global);
-    auto local_tid  = threadIdx.x + blockDim.x * threadIdx.y;
+    auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;

    // ====================
    // Shared memory layout
@@ -55,19 +51,22 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    // CUDA prefers raw bytes + aligned cast
    // Compile error happens when using: `extern __shared__ T sh[];`
    extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[];
-    PEDESTAL_TYPE* shmem = reinterpret_cast<PEDESTAL_TYPE*>(smem);
+    PEDESTAL_TYPE *shmem = reinterpret_cast<PEDESTAL_TYPE *>(smem);

    // Stride includes halo on both sides
    auto shmem_stride = static_cast<int>(blockDim.x) + 2 * col_radius;
-    auto tile_size    = shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
+    auto tile_size =
+        shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);

    // Offset so that thread (0,0) maps to shared-memory position
    // (row_radius, col_radius) i.e. past the top-left halo.
-    auto shmem_tid = (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride
-                   + (static_cast<int>(threadIdx.x) + col_radius);
+    auto shmem_tid =
+        (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride +
+        (static_cast<int>(threadIdx.x) + col_radius);

    // Cooperative zero-fill
-    for (int idx = static_cast<int>(local_tid); idx < tile_size; idx += static_cast<int>(blockDim.x * blockDim.y)) {
+    for (int idx = static_cast<int>(local_tid); idx < tile_size;
+         idx += static_cast<int>(blockDim.x * blockDim.y)) {
        shmem[idx] = PEDESTAL_TYPE{0};
    }
    __syncthreads();
@@ -83,7 +82,7 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    // Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB.
    // gr, gc are the global row/col of the pixel to load.
    // Returns the pedestal-subtracted value.
-    auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
+    auto load_pixel = [&] __device__(ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
        auto gid = gc + static_cast<ssize_t>(ncols) * gr;
        return static_cast<PEDESTAL_TYPE>(d_frame[gid]) - d_pd_mean[gid];
    };
@@ -98,21 +97,25 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    if (threadIdx.y == 0 && valid_pixel) {
        for (int i = 1; i <= row_radius; ++i) {
            if (row_global - i >= 0)
-                shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global);
+                shmem[shmem_tid - i * shmem_stride] =
+                    load_pixel(row_global - i, col_global);
        }
        // Top-left corner rectangle
        if (threadIdx.x == 0) {
            for (int i = 1; i <= row_radius; ++i)
                for (int j = 1; j <= col_radius; ++j)
                    if (row_global - i >= 0 && col_global - j >= 0)
-                        shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j);
+                        shmem[shmem_tid - i * shmem_stride - j] =
+                            load_pixel(row_global - i, col_global - j);
        }
        // Top-right corner rectangle
        if (threadIdx.x == blockDim.x - 1) {
            for (int i = 1; i <= row_radius; ++i)
                for (int j = 1; j <= col_radius; ++j)
-                    if (row_global - i >= 0 && col_global + j < static_cast<ssize_t>(ncols))
-                        shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j);
+                    if (row_global - i >= 0 &&
+                        col_global + j < static_cast<ssize_t>(ncols))
+                        shmem[shmem_tid - i * shmem_stride + j] =
+                            load_pixel(row_global - i, col_global + j);
        }
    }

@@ -134,21 +137,26 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    if (threadIdx.y == blockDim.y - 1 && valid_pixel) {
        for (int i = 1; i <= row_radius; ++i) {
            if (row_global + i < static_cast<ssize_t>(nrows))
-                shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global);
+                shmem[shmem_tid + i * shmem_stride] =
+                    load_pixel(row_global + i, col_global);
        }
        // Bottom-left corner rectangle
        if (threadIdx.x == 0) {
            for (int i = 1; i <= row_radius; ++i)
                for (int j = 1; j <= col_radius; ++j)
-                    if (row_global + i < static_cast<ssize_t>(nrows) && col_global - j >= 0)
-                        shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j);
+                    if (row_global + i < static_cast<ssize_t>(nrows) &&
+                        col_global - j >= 0)
+                        shmem[shmem_tid + i * shmem_stride - j] =
+                            load_pixel(row_global + i, col_global - j);
        }
        // Bottom-right corner rectangle
        if (threadIdx.x == blockDim.x - 1) {
            for (int i = 1; i <= row_radius; ++i)
                for (int j = 1; j <= col_radius; ++j)
-                    if (row_global + i < static_cast<ssize_t>(nrows) && col_global + j < static_cast<ssize_t>(ncols))
-                        shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j);
+                    if (row_global + i < static_cast<ssize_t>(nrows) &&
+                        col_global + j < static_cast<ssize_t>(ncols))
+                        shmem[shmem_tid + i * shmem_stride + j] =
+                            load_pixel(row_global + i, col_global + j);
        }
    }

@@ -157,51 +165,57 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    // =====================
    // Cluster-finding logic
    // =====================
-    if (!valid_pixel) return;
+    if (!valid_pixel)
+        return;

    // Per-pixel RMS from global pedestal arrays
    // rms = sqrt( E[X^2] - E[X]^2 )
-    PEDESTAL_TYPE mean_px  = d_pd_mean[global_tid];
-    PEDESTAL_TYPE var_px   = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
-    PEDESTAL_TYPE rms_sq  = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
-    PEDESTAL_TYPE rms_px  = sqrt(rms_sq);
+    PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
+    PEDESTAL_TYPE var_px =
+        d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
+    PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
+    PEDESTAL_TYPE rms_px = sqrt(rms_sq);

    // Pedestal-subtracted value of the center pixel (already in shmem)
    PEDESTAL_TYPE val_pixel = shmem[shmem_tid];

-    // Negative pedestal early exit 
+    // Negative pedestal early exit
    if (val_pixel < -m_nSigma * rms_px)
-        return;  // NOTE: pedestal update for this pixel is skipped (same as sequential)
+        return; // NOTE: pedestal update for this pixel is skipped (same as
+                // sequential)

    // Stencil reduction: total, max, quadrant sums
    PEDESTAL_TYPE total = PEDESTAL_TYPE{0};
-    PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits<double>::lowest()`
+    PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of
+                                       // `numeric_limits<double>::lowest()`

-    /* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0};   // top-left quadrant  (ir<=0, ic<=0)
-    PEDESTAL_TYPE tr = PEDESTAL_TYPE{0};   // top-right quadrant (ir<=0, ic>=0)
-    PEDESTAL_TYPE bl = PEDESTAL_TYPE{0};   // bottom-left        (ir>=0, ic<=0)
-    PEDESTAL_TYPE br = PEDESTAL_TYPE{0};   // bottom-right       (ir>=0, ic>=0) */
+    /* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0};   // top-left quadrant  (ir<=0,
+    ic<=0) PEDESTAL_TYPE tr = PEDESTAL_TYPE{0};   // top-right quadrant (ir<=0,
+    ic>=0) PEDESTAL_TYPE bl = PEDESTAL_TYPE{0};   // bottom-left        (ir>=0,
+    ic<=0) PEDESTAL_TYPE br = PEDESTAL_TYPE{0};   // bottom-right       (ir>=0,
+    ic>=0) */

    CT clusterData[CSX * CSY];
    int idx = 0; // tracks the pixels in the cluster

-    #pragma unroll
+#pragma unroll
    for (int ir = -row_radius; ir <= row_radius; ++ir) {
-        #pragma unroll
+#pragma unroll
        for (int ic = -col_radius; ic <= col_radius; ++ic) {
            PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic];

            total += val;
            max_val = max(max_val, val);

-            /* // Quadrant accumulation (pixels on the axes contribute to two quadrants)
-            if (ir <= 0 && ic <= 0) tl += val;
-            if (ir <= 0 && ic >= 0) tr += val;
-            if (ir >= 0 && ic <= 0) bl += val;
-            if (ir >= 0 && ic >= 0) br += val; */
+            /* // Quadrant accumulation (pixels on the axes contribute to two
+            quadrants) if (ir <= 0 && ic <= 0) tl += val; if (ir <= 0 && ic >=
+            0) tr += val; if (ir >= 0 && ic <= 0) bl += val; if (ir >= 0 && ic
+            >= 0) br += val; */

-            // Store pedestal-subtracted value in register array for later cluster output
-            if constexpr (std::is_integral_v<CT> && std::is_floating_point_v<PEDESTAL_TYPE>)
+            // Store pedestal-subtracted value in register array for later
+            // cluster output
+            if constexpr (std::is_integral_v<CT> &&
+                          std::is_floating_point_v<PEDESTAL_TYPE>)
                clusterData[idx] = static_cast<CT>(lround(val));
            else
                clusterData[idx] = static_cast<CT>(val);
@@ -213,10 +227,11 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    // Three-way classification (mirrors ClusterFinder's logic)
    //
    //  1. Single-pixel significance:  max_val > nSigma * rms
-    //     -> only the pixel that IS the max gets recorded (local-max suppression)
+    //     -> only the pixel that IS the max gets recorded (local-max
+    //     suppression)
    //
    //  2. Quadrant significance:      max(tl,tr,bl,br) > c2 * nSigma * rms
-    //     -> charge-sharing events where a 2x2 sub-region is significant 
+    //     -> charge-sharing events where a 2x2 sub-region is significant
    //  NOTE: This test is absent in the serial ClusterFinder!
    //
    //  3. Total significance:         total > c3 * nSigma * rms
@@ -228,9 +243,10 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d

    // Test 1: single-pixel significance
    if (max_val > m_nSigma * rms_px) {
-        // Local-max suppression: only the center-pixel thread records the cluster
+        // Local-max suppression: only the center-pixel thread records the
+        // cluster
        if (val_pixel < max_val)
-            return;  // some other pixel in the neighborhood is brighter
+            return; // some other pixel in the neighborhood is brighter
        is_photon = true;
    }

@@ -249,29 +265,30 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
        }
    }

-    // Pedestal update (if not a photon) 
-    // In the sequential code, non-photon pixels feed back into the running pedestal via push_fast().
-    // In this kernel, the GPU updates all pixels in a frame simultaneously. So the updated pedestal 
-    // will only be used starting from the next frame. -> This avoids a/serialization and b/global mem I/O.
+    // Pedestal update (if not a photon)
+    // In the sequential code, non-photon pixels feed back into the running
+    // pedestal via push_fast(). In this kernel, the GPU updates all pixels in a
+    // frame simultaneously. So the updated pedestal will only be used starting
+    // from the next frame. -> This avoids a/serialization and b/global mem I/O.
    if (!is_photon && valid_pixel) {
        PEDESTAL_TYPE raw_val = static_cast<PEDESTAL_TYPE>(d_frame[global_tid]);
-        PEDESTAL_TYPE sum  = d_pd_sum[global_tid];
+        PEDESTAL_TYPE sum = d_pd_sum[global_tid];
        PEDESTAL_TYPE sum2 = d_pd_sum2[global_tid];

-        sum  += raw_val           - sum  / n_pd_samples;
+        sum += raw_val - sum / n_pd_samples;
        sum2 += raw_val * raw_val - sum2 / n_pd_samples;

-        d_pd_sum[global_tid]  = sum;
+        d_pd_sum[global_tid] = sum;
        d_pd_sum2[global_tid] = sum2;
        d_pd_mean[global_tid] = sum / n_pd_samples;
        return;
    }

-    /* 
+    /*
    if (!is_photon) return; // Debugging
    */

-    // Write cluster to global output buffer using atomic index 
+    // Write cluster to global output buffer using atomic index
    // for coordination across all blocks
    uint32_t write_idx = atomicAdd(d_cluster_count, 1u);

@@ -283,7 +300,8 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE*   __restrict__ d
    cluster.x = static_cast<decltype(cluster.x)>(col_global);
    cluster.y = static_cast<decltype(cluster.y)>(row_global);

-    memcpy(reinterpret_cast<CT*>(&cluster.data), clusterData, sizeof(CT) * CSX * CSY);    
+    memcpy(reinterpret_cast<CT *>(&cluster.data), clusterData,
+           sizeof(CT) * CSX * CSY);

    d_clusters[write_idx] = cluster;
 }
@@ -1,17 +1,15 @@
 // SPDX-License-Identifier: MPL-2.0
 #pragma once
-#include <vector>
-#include <cstring>
 #include "aare/NDArray.hpp"
-
+#include <cstring>
+#include <vector>

 template <typename FRAME_TYPE>
-void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
-                      size_t first_frame,
-                      size_t n_frames,
-                      std::vector<FRAME_TYPE>& batch)
-{
-    if (n_frames == 0) return;
+void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>> &frames,
+                      size_t first_frame, size_t n_frames,
+                      std::vector<FRAME_TYPE> &batch) {
+    if (n_frames == 0)
+        return;

    const size_t rows = frames[first_frame].shape(0);
    const size_t cols = frames[first_frame].shape(1);
@@ -23,8 +21,8 @@ void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
    }

    for (size_t k = 0; k < n_frames; ++k) {
-        const FRAME_TYPE* src = frames[first_frame + k].data();
-        FRAME_TYPE* dst = batch.data() + k * image_size;
+        const FRAME_TYPE *src = frames[first_frame + k].data();
+        FRAME_TYPE *dst = batch.data() + k * image_size;
        std::memcpy(dst, src, image_size * sizeof(FRAME_TYPE));
    }
 }
@@ -3,12 +3,12 @@
 #include <cstdlib>
 #include <cuda_runtime.h>

-inline void __cuda_check(cudaError_t err, const char* file, int line) {
-  if (err != cudaSuccess) {
-    std::fprintf(stderr, "[%s:%d] CUDA error: %s\n",
-                 file, line, cudaGetErrorString(err));
-    std::exit(1);
-  }
+inline void __cuda_check(cudaError_t err, const char *file, int line) {
+    if (err != cudaSuccess) {
+        std::fprintf(stderr, "[%s:%d] CUDA error: %s\n", file, line,
+                     cudaGetErrorString(err));
+        std::exit(1);
+    }
 }

 #define CUDA_CHECK(stmt) __cuda_check((stmt), __FILE__, __LINE__)
@@ -27,19 +27,15 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
    auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);

    using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
-    using CF          = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
+    using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;

    py::class_<CF>(m, class_name.c_str())
-        .def(py::init<Shape<2>, pd_type, size_t, int>(),
-             py::arg("image_size"),
-             py::arg("n_sigma")    = 5.0,
-             py::arg("capacity")   = 1'000'000,
-             py::arg("n_streams")  = 1)
+        .def(py::init<Shape<2>, pd_type, size_t, int>(), py::arg("image_size"),
+             py::arg("n_sigma") = 5.0, py::arg("capacity") = 1'000'000,
+             py::arg("n_streams") = 1)

        .def_property(
-            "nSigma",
-            &CF::get_nSigma,
-            &CF::set_nSigma,
+            "nSigma", &CF::get_nSigma, &CF::set_nSigma,
            R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")

        .def("push_pedestal_frame",
@@ -50,21 +46,19 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {

        .def("clear_pedestal", &CF::clear_pedestal)

-        .def_property_readonly(
-            "pedestal",
-            [](CF &self) {
-                auto pd = new NDArray<pd_type, 2>{};
-                *pd = self.pedestal();
-                return return_image_data(pd);
-            })
+        .def_property_readonly("pedestal",
+                               [](CF &self) {
+                                   auto pd = new NDArray<pd_type, 2>{};
+                                   *pd = self.pedestal();
+                                   return return_image_data(pd);
+                               })

-        .def_property_readonly(
-            "noise",
-            [](CF &self) {
-                auto arr = new NDArray<pd_type, 2>{};
-                *arr = self.noise();
-                return return_image_data(arr);
-            })
+        .def_property_readonly("noise",
+                               [](CF &self) {
+                                   auto arr = new NDArray<pd_type, 2>{};
+                                   *arr = self.noise();
+                                   return return_image_data(arr);
+                               })

        .def(
            "steal_clusters",
@@ -86,7 +80,8 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
        .def(
            "find_clusters_batched",
            [](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
-                // frames is expected as a 3D numpy array (n_frames, nrows, ncols)
+                // frames is expected as a 3D numpy array (n_frames, nrows,
+                // ncols)
                auto view = make_view_3d(frames);
                return self.find_clusters_batched(view, first_frame);
            },
@@ -30,8 +30,7 @@ void define_ClusterVector(py::module &m, const std::string &typestr) {

    py::class_<ClusterVector<
        Cluster<Type, ClusterSizeX, ClusterSizeY, CoordType>, void>>(
-        m, class_name.c_str(),
-        py::buffer_protocol(),
+        m, class_name.c_str(), py::buffer_protocol(),
        py::module_local())

        .def(py::init()) // TODO change!!!
@@ -6,8 +6,8 @@
 // back a usable ClusterVector without _aare needing to be imported first.

 #include "bind_Cluster.hpp"
-#include "bind_ClusterVector.hpp"
 #include "bind_ClusterFinderCUDA.hpp"
+#include "bind_ClusterVector.hpp"

 #include <pybind11/pybind11.h>

@@ -22,45 +22,45 @@ namespace py = pybind11;
    define_Cluster<T, N, M, U>(m, #N "x" #M #TYPE_CODE);

 #define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE)              \
-    aare::define_ClusterFinderCUDA<T, N, M, U>(                                \
-        m, "Cluster" #N "x" #M #TYPE_CODE);
+    aare::define_ClusterFinderCUDA<T, N, M, U>(m, "Cluster" #N                 \
+                                                  "x" #M #TYPE_CODE);

 PYBIND11_MODULE(_aare_cuda, m) {

    // Types first — finders reference them in their signatures.
    // SFINAE excludes 2x2 on ClusterFinderCUDA, so we skip it here too.
-    DEFINE_CUDA_CLUSTER_TYPES(int,    3, 3, uint16_t, i);
+    DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i);
    DEFINE_CUDA_CLUSTER_TYPES(double, 3, 3, uint16_t, d);
-    DEFINE_CUDA_CLUSTER_TYPES(float,  3, 3, uint16_t, f);
+    DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f);

-    DEFINE_CUDA_CLUSTER_TYPES(int,    5, 5, uint16_t, i);
+    DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i);
    DEFINE_CUDA_CLUSTER_TYPES(double, 5, 5, uint16_t, d);
-    DEFINE_CUDA_CLUSTER_TYPES(float,  5, 5, uint16_t, f);
+    DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f);

-    DEFINE_CUDA_CLUSTER_TYPES(int,    7, 7, uint16_t, i);
+    DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i);
    DEFINE_CUDA_CLUSTER_TYPES(double, 7, 7, uint16_t, d);
-    DEFINE_CUDA_CLUSTER_TYPES(float,  7, 7, uint16_t, f);
+    DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f);

-    DEFINE_CUDA_CLUSTER_TYPES(int,    9, 9, uint16_t, i);
+    DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i);
    DEFINE_CUDA_CLUSTER_TYPES(double, 9, 9, uint16_t, d);
-    DEFINE_CUDA_CLUSTER_TYPES(float,  9, 9, uint16_t, f);
+    DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f);

    // Finders
-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int,    3, 3, uint16_t, i);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i);
    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 3, 3, uint16_t, d);
-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float,  3, 3, uint16_t, f);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f);

-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int,    5, 5, uint16_t, i);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i);
    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 5, 5, uint16_t, d);
-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float,  5, 5, uint16_t, f);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f);

-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int,    7, 7, uint16_t, i);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i);
    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 7, 7, uint16_t, d);
-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float,  7, 7, uint16_t, f);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f);

-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int,    9, 9, uint16_t, i);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i);
    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 9, 9, uint16_t, d);
-    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float,  9, 9, uint16_t, f);
+    DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f);
 }

 #undef DEFINE_CUDA_CLUSTER_TYPES
@@ -9,14 +9,14 @@
 #include <numeric>
 #include <vector>

-#include "aare/defs.hpp"
-#include "aare/utils/batch.hpp"
+#include "aare/ClusterFinder.hpp"
+#include "aare/ClusterFinderCUDA.hpp"
 #include "aare/File.hpp"
 #include "aare/Frame.hpp"
 #include "aare/NDArray.hpp"
 #include "aare/Pedestal.hpp"
-#include "aare/ClusterFinder.hpp"
-#include "aare/ClusterFinderCUDA.hpp"
+#include "aare/defs.hpp"
+#include "aare/utils/batch.hpp"

 // _____________
 //
@@ -29,19 +29,19 @@ struct Timer {
    void start() { t0 = clock::now(); }

    double elapsed_ms() const {
-        return std::chrono::duration<double, std::milli>(clock::now() - t0).count();
+        return std::chrono::duration<double, std::milli>(clock::now() - t0)
+            .count();
    }
 };

 // __________________
-// 
+//
 // Cluster comparison
 // __________________
-template <typename ClusterType>
-struct ClusterComparison {
+template <typename ClusterType> struct ClusterComparison {
    size_t cpu_count = 0;
    size_t gpu_count = 0;
-    size_t matched  = 0;
+    size_t matched = 0;
    size_t position_mismatch = 0;
    size_t data_mismatch = 0;
    size_t cpu_only = 0;
@@ -50,19 +50,20 @@ struct ClusterComparison {

 // Sort clusters by (y, x) for deterministic comparison
 template <typename ClusterType>
-void sort_clusters(std::vector<ClusterType>& clusters) {
+void sort_clusters(std::vector<ClusterType> &clusters) {
    std::sort(clusters.begin(), clusters.end(),
-              [](const ClusterType& a, const ClusterType& b) {
-                  if (a.y != b.y) return a.y < b.y;
+              [](const ClusterType &a, const ClusterType &b) {
+                  if (a.y != b.y)
+                      return a.y < b.y;
                  return a.x < b.x;
              });
 }

 // Compare two sorted cluster lists
 template <typename ClusterType>
-ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_clusters,
-                                                std::vector<ClusterType>& gpu_clusters)
-{
+ClusterComparison<ClusterType>
+compare_clusters(std::vector<ClusterType> &cpu_clusters,
+                 std::vector<ClusterType> &gpu_clusters) {
    sort_clusters(cpu_clusters);
    sort_clusters(gpu_clusters);

@@ -72,15 +73,17 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl

    size_t ci = 0, gi = 0;
    while (ci < cpu_clusters.size() && gi < gpu_clusters.size()) {
-        const auto& cc = cpu_clusters[ci];
-        const auto& gc = gpu_clusters[gi];
+        const auto &cc = cpu_clusters[ci];
+        const auto &gc = gpu_clusters[gi];

        if (cc.y == gc.y && cc.x == gc.x) {
            // Same position/check data
            bool data_ok = true;
-            constexpr int N = ClusterType::cluster_size_x * ClusterType::cluster_size_y;
+            constexpr int N =
+                ClusterType::cluster_size_x * ClusterType::cluster_size_y;
            for (int k = 0; k < N; ++k) {
-                // if (cc.data[k] != gc.data[k]) { // a bit too strict espacially that pedestal update is slightly different
+                // if (cc.data[k] != gc.data[k]) { // a bit too strict
+                // espacially that pedestal update is slightly different
                if (std::abs(cc.data[k] - gc.data[k]) > 5) {
                    data_ok = false;
                    break;
@@ -111,80 +114,109 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl
 // Cluster printing
 // ________________
 template <typename ClusterType>
-void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_results,
-                              const std::vector<std::vector<ClusterType>>& gpu_results,
-                              size_t max_per_frame = 10,
-                              size_t max_frames = 100)
-{
+void print_cluster_comparison(
+    const std::vector<std::vector<ClusterType>> &cpu_results,
+    const std::vector<std::vector<ClusterType>> &gpu_results,
+    size_t max_per_frame = 10, size_t max_frames = 100) {
    constexpr int NX = ClusterType::cluster_size_x;
    constexpr int NY = ClusterType::cluster_size_y;
-    constexpr int N  = NX * NY;
- 
+    constexpr int N = NX * NY;
+
    size_t frames_shown = 0;
-    for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames; ++fi) {
+    for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames;
+         ++fi) {
        if (cpu_results[fi].empty() && gpu_results[fi].empty())
            continue;
- 
+
        size_t n_cpu = cpu_results[fi].size();
        size_t n_gpu = gpu_results[fi].size();
-        printf("\n  Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu, n_gpu);
- 
+        printf("\n  Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu,
+               n_gpu);
+
        // Merge-walk over sorted lists (assumes already sorted by y,x)
        size_t ci = 0, gi = 0;
        size_t shown = 0;
        while ((ci < n_cpu || gi < n_gpu) && shown < max_per_frame) {
            bool have_cpu = ci < n_cpu;
            bool have_gpu = gi < n_gpu;
- 
+
            // Determine if current entries match in position
            bool same_pos = have_cpu && have_gpu &&
                            cpu_results[fi][ci].x == gpu_results[fi][gi].x &&
                            cpu_results[fi][ci].y == gpu_results[fi][gi].y;
- 
+
            if (same_pos) {
-                const auto& cc = cpu_results[fi][ci];
-                const auto& gc = gpu_results[fi][gi];
- 
+                const auto &cc = cpu_results[fi][ci];
+                const auto &gc = gpu_results[fi][gi];
+
                // Check if data differs
                bool differs = false;
                for (int k = 0; k < N; ++k) {
-                    if (cc.data[k] != gc.data[k]) { differs = true; break; }
+                    if (cc.data[k] != gc.data[k]) {
+                        differs = true;
+                        break;
+                    }
                }

-                printf("    CPU and GPU clusters found at SAME position (col=%3d, row=%3d)\n %s\n", cc.x, cc.y,
-                       differs ? "DATA MISMATCH" : "MATCH");
+                printf("    CPU and GPU clusters found at SAME position "
+                       "(col=%3d, row=%3d)\n %s\n",
+                       cc.x, cc.y, differs ? "DATA MISMATCH" : "MATCH");
                printf("      CPU: [");
-                for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
+                for (int k = 0; k < N; ++k) {
+                    if (k)
+                        printf(", ");
+                    printf("%6d", static_cast<int>(cc.data[k]));
+                }
                printf("]\n");
                if (differs) {
                    printf("      GPU: [");
-                    for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
+                    for (int k = 0; k < N; ++k) {
+                        if (k)
+                            printf(", ");
+                        printf("%6d", static_cast<int>(gc.data[k]));
+                    }
                    printf("]\n");
                    printf("     diff: [");
                    for (int k = 0; k < N; ++k) {
-                        if (k) printf(", ");
-                        int d = static_cast<int>(gc.data[k]) - static_cast<int>(cc.data[k]);
+                        if (k)
+                            printf(", ");
+                        int d = static_cast<int>(gc.data[k]) -
+                                static_cast<int>(cc.data[k]);
                        printf("%+6d", d);
                    }
                    printf("]\n");
                }
-                ci++; gi++; shown++;
-            } else if (!have_gpu || (have_cpu && (cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
-                       (cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
-                        cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
-                const auto& cc = cpu_results[fi][ci];
+                ci++;
+                gi++;
+                shown++;
+            } else if (!have_gpu ||
+                       (have_cpu &&
+                        (cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
+                         (cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
+                          cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
+                const auto &cc = cpu_results[fi][ci];
                printf("    (%3d, %3d) CPU ONLY\n", cc.x, cc.y);
                printf("      CPU: [");
-                for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
+                for (int k = 0; k < N; ++k) {
+                    if (k)
+                        printf(", ");
+                    printf("%6d", static_cast<int>(cc.data[k]));
+                }
                printf("]\n");
-                ci++; shown++;
+                ci++;
+                shown++;
            } else {
-                const auto& gc = gpu_results[fi][gi];
+                const auto &gc = gpu_results[fi][gi];
                printf("    (%3d, %3d) GPU ONLY\n", gc.x, gc.y);
                printf("      GPU: [");
-                for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
+                for (int k = 0; k < N; ++k) {
+                    if (k)
+                        printf(", ");
+                    printf("%6d", static_cast<int>(gc.data[k]));
+                }
                printf("]\n");
-                gi++; shown++;
+                gi++;
+                shown++;
            }
        }
        frames_shown++;
@@ -195,24 +227,25 @@ void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_r
 //
 // Helpers for the updated (CPU-parity) API
 // _________________________________________
- 
+
 // Copy a ClusterVector into a std::vector<ClusterType> for downstream
 // comparison code that expects the latter.
 template <typename Finder, typename ClusterType>
-void drain_into(Finder& f, std::vector<ClusterType>& out) {
+void drain_into(Finder &f, std::vector<ClusterType> &out) {
    auto cv = f.steal_clusters(true);
    out.clear();
    out.reserve(cv.size());
-    for (size_t j = 0; j < cv.size(); ++j) out.push_back(cv[j]);
+    for (size_t j = 0; j < cv.size(); ++j)
+        out.push_back(cv[j]);
 }
- 
+
 // Feed a set of cached pedestal frames through any finder exposing the
 // CPU-style push_pedestal_frame(NDView) method. Works for both
 // ClusterFinder and ClusterFinderCUDA.
 template <typename Finder, typename FRAME_TYPE>
-void feed_pedestal(Finder& f,
-                   const std::vector<aare::NDArray<FRAME_TYPE, 2>>& ped_frames) {
-    for (const auto& frame : ped_frames) {
+void feed_pedestal(
+    Finder &f, const std::vector<aare::NDArray<FRAME_TYPE, 2>> &ped_frames) {
+    for (const auto &frame : ped_frames) {
        f.push_pedestal_frame(frame.view());
    }
 }
@@ -221,16 +254,20 @@ void feed_pedestal(Finder& f,
 //
 //     Main
 // ____________
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[]) {

    // Parse arguments
-    const char* default_pedestal = 
-                "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
-    // const char* default_pedestal = 
+    const char *default_pedestal =
+        "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
+        "2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
+        "250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
+    // const char* default_pedestal =
    //             "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_pedestal_200keV_DYNAMIC_tint_20us_master_0.json";
-    const char* default_data     = 
-                "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
-    // const char* default_data     = 
+    const char *default_data =
+        "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
+        "2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
+        "250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
+    // const char* default_data     =
    //             "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_data_200keV_DYNAMIC_tint_20us_master_0.json";

    std::filesystem::path pedestal_path(argc > 1 ? argv[1] : default_pedestal);
@@ -247,18 +284,21 @@ int main(int argc, char* argv[]) {

    // Defaults: Adjust depending on the test dataset used
    size_t n_pedestal_frames = 6000;
-    size_t n_data_frames     = 10000;
-    double nSigma            = 5.0;
+    size_t n_data_frames = 10000;
+    double nSigma = 5.0;

-    if (argc > 3) n_pedestal_frames = std::atol(argv[3]);
-    if (argc > 4) n_data_frames     = std::atol(argv[4]);
-    if (argc > 5) nSigma            = std::atof(argv[5]);
+    if (argc > 3)
+        n_pedestal_frames = std::atol(argv[3]);
+    if (argc > 4)
+        n_data_frames = std::atol(argv[4]);
+    if (argc > 5)
+        nSigma = std::atof(argv[5]);

    // Detector geometry from master file
    constexpr uint8_t cs_x = 3;
    constexpr uint8_t cs_y = 3;
    using ClusterType = aare::Cluster<int32_t, cs_x, cs_y>;
-    using FRAME_TYPE  = uint16_t;
+    using FRAME_TYPE = uint16_t;
    using PEDESTAL_TYPE = double;

    // Read actual frame dimensions from the pedestal file
@@ -272,40 +312,44 @@ int main(int argc, char* argv[]) {

    printf("=== Cluster Finder: CPU vs CUDA ===\n");
    printf("Detector:  %zu x %zu\n", ROWS, COLS);
-    printf("Cluster:   %d x %d\n", ClusterType::cluster_size_x, ClusterType::cluster_size_y);
+    printf("Cluster:   %d x %d\n", ClusterType::cluster_size_x,
+           ClusterType::cluster_size_y);
    printf("nSigma:    %.1f\n", nSigma);

    // =========================================================================
-    // Phase 1: Build pedestal from dark frames (sanity check only + frame cache)
+    // Phase 1: Build pedestal from dark frames (sanity check only + frame
+    // cache)
    // =========================================================================
    //
-    // Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal object;
-    // both build their own via push_pedestal_frame.
-    // We still read the pedestal file once, but cache the frames in memory so
-    // every subsequent finder can be fed without re-hitting disk. We also
-    // build a standalone Pedestal purely to print a sanity check.
+    // Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal
+    // object; both build their own via push_pedestal_frame. We still read the
+    // pedestal file once, but cache the frames in memory so every subsequent
+    // finder can be fed without re-hitting disk. We also build a standalone
+    // Pedestal purely to print a sanity check.
    // =========================================================================
    printf("\n--- Phase 1: Pedestal accumulation (sanity check + cache) ---\n");
- 
+
    std::vector<aare::NDArray<FRAME_TYPE, 2>> pedestal_frames;
    aare::Pedestal<PEDESTAL_TYPE> pedestal(ROWS, COLS, 1000);
- 
+
    {
        aare::File ped_file(pedestal_path, "r");
        size_t total_ped = ped_file.total_frames();
-        size_t use_ped   = (n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
-                               ? total_ped : n_pedestal_frames;
+        size_t use_ped =
+            (n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
+                ? total_ped
+                : n_pedestal_frames;
        printf("Pedestal frames: %zu / %zu\n", use_ped, total_ped);
- 
+
        pedestal_frames.reserve(use_ped);
- 
+
        Timer t;
        t.start();
- 
+
        for (size_t i = 0; i < use_ped; ++i) {
            auto frame = ped_file.read_frame();
-            auto view  = frame.view<FRAME_TYPE>();
- 
+            auto view = frame.view<FRAME_TYPE>();
+
            // Copy into a standalone NDArray that we can reuse as many times
            // as we have finders to feed.
            aare::NDArray<FRAME_TYPE, 2> arr({ROWS, COLS});
@@ -313,16 +357,16 @@ int main(int argc, char* argv[]) {
                for (ssize_t c = 0; c < COLS; ++c)
                    arr(r, c) = view(r, c);
            pedestal_frames.push_back(std::move(arr));
- 
+
            pedestal.push_no_update(view);
        }
        pedestal.update_mean();
- 
+
        printf("Pedestal read+cached+built in %.1f ms\n", t.elapsed_ms());
    }
- 
-    printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n",
-           pedestal.mean(0, 0), pedestal.std(0, 0));
+
+    printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n", pedestal.mean(0, 0),
+           pedestal.std(0, 0));

    // =========================================================================
    // Phase 2: Read data frames
@@ -331,7 +375,7 @@ int main(int argc, char* argv[]) {

    aare::File data_file(data_path, "r");
    size_t total_data = data_file.total_frames();
-    size_t use_data   = std::min(n_data_frames, total_data);
+    size_t use_data = std::min(n_data_frames, total_data);
    printf("Data frames: %zu / %zu\n", use_data, total_data);

    // Pre-read all frames into memory to remove I/O from timing
@@ -391,32 +435,32 @@ int main(int argc, char* argv[]) {
    // time makes sense.
    // =========================================================================
    printf("\n--- Phase 4: CUDA ClusterFinder ---\n");
- 
+
    std::vector<std::vector<ClusterType>> gpu_results(use_data);
    size_t gpu_total_clusters = 0;
- 
+
    // --- Single frame on a single CUDA stream ---
    {
        aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
            {ROWS, COLS}, nSigma);
- 
+
        feed_pedestal(cuda_cf, pedestal_frames);
- 
+
        // Warmup: first CUDA call pays driver/context init overhead. The
        // pedestal drifts slightly during this single frame, which is
        // acceptable for timing purposes.
        cuda_cf.find_clusters(frames[0].view(), 0);
        cuda_cf.steal_clusters(true);
- 
+
        Timer t;
        t.start();
- 
+
        for (size_t i = 0; i < use_data; ++i) {
            cuda_cf.find_clusters(frames[i].view(), static_cast<uint64_t>(i));
            drain_into(cuda_cf, gpu_results[i]);
            gpu_total_clusters += gpu_results[i].size();
        }
- 
+
        double gpu_time = t.elapsed_ms();
        printf("GPU: %zu clusters in %.1f ms (%.2f ms/frame)\n",
               gpu_total_clusters, gpu_time, gpu_time / use_data);
@@ -428,32 +472,33 @@ int main(int argc, char* argv[]) {
    {
        constexpr size_t BATCH_SIZE = 100;
        constexpr int    N_STREAMS  = 2;
- 
+
        aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
            {ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
- 
+
        feed_pedestal(cuda_cf, pedestal_frames);
- 
+
        // Contiguous staging buffer reused across batches
        std::vector<FRAME_TYPE> batch_buffer(BATCH_SIZE * ROWS * COLS);
- 
+
        const size_t n_batches = (use_data + BATCH_SIZE - 1) / BATCH_SIZE;
- 
+
        Timer t;
        t.start();
- 
+
        for (size_t bi = 0; bi < n_batches; ++bi) {
            const size_t offset       = bi * BATCH_SIZE;
            const size_t actual_batch = std::min(BATCH_SIZE, use_data - offset);

            pack_frame_batch(frames, offset, actual_batch, batch_buffer);
- 
+
            aare::NDView<FRAME_TYPE, 3> batch_view(
                batch_buffer.data(),
                {static_cast<ssize_t>(actual_batch), ROWS, COLS});
- 
-            auto batch_results = cuda_cf.find_clusters_batched(batch_view, offset);
- 
+
+            auto batch_results = cuda_cf.find_clusters_batched(batch_view,
+    offset);
+
            for (size_t f = 0; f < actual_batch; ++f) {
                auto& cv = batch_results[f];
                auto& out = gpu_results[offset + f];
@@ -463,10 +508,11 @@ int main(int argc, char* argv[]) {
                gpu_total_clusters += out.size();
            }
        }
- 
+
        double gpu_time = t.elapsed_ms();
-        printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu, streams=%d)\n",
-               gpu_total_clusters, gpu_time, gpu_time / use_data, BATCH_SIZE, N_STREAMS);
+        printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu,
+    streams=%d)\n", gpu_total_clusters, gpu_time, gpu_time / use_data,
+    BATCH_SIZE, N_STREAMS);
    }
    */

@@ -484,21 +530,21 @@ int main(int argc, char* argv[]) {
    for (size_t i = 0; i < use_data; ++i) {
        auto result = compare_clusters(cpu_results[i], gpu_results[i]);

-        total_matched        += result.matched;
-        total_data_mismatch  += result.data_mismatch;
-        total_cpu_only       += result.cpu_only;
-        total_gpu_only       += result.gpu_only;
+        total_matched += result.matched;
+        total_data_mismatch += result.data_mismatch;
+        total_cpu_only += result.cpu_only;
+        total_gpu_only += result.gpu_only;

-        bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 || result.data_mismatch > 0);
+        bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 ||
+                         result.data_mismatch > 0);
        if (has_diff) {
            frames_with_differences++;
            // Print details for first few mismatching frames
            if (frames_with_differences <= 5) {
                printf("  Frame %zu: CPU=%zu GPU=%zu matched=%zu "
                       "data_mismatch=%zu cpu_only=%zu gpu_only=%zu\n",
-                       i, result.cpu_count, result.gpu_count,
-                       result.matched, result.data_mismatch,
-                       result.cpu_only, result.gpu_only);
+                       i, result.cpu_count, result.gpu_count, result.matched,
+                       result.data_mismatch, result.cpu_only, result.gpu_only);
            }
        }
    }
@@ -510,9 +556,11 @@ int main(int argc, char* argv[]) {
    printf("  Data mismatch:         %zu\n", total_data_mismatch);
    printf("  CPU only:              %zu\n", total_cpu_only);
    printf("  GPU only:              %zu\n", total_gpu_only);
-    printf("  Frames with diffs:     %zu / %zu\n", frames_with_differences, use_data);
+    printf("  Frames with diffs:     %zu / %zu\n", frames_with_differences,
+           use_data);

-    if (total_cpu_only == 0 && total_gpu_only == 0 && total_data_mismatch == 0) {
+    if (total_cpu_only == 0 && total_gpu_only == 0 &&
+        total_data_mismatch == 0) {
        printf("\n*** PASS: CPU and GPU results match exactly ***\n");
    } else {
        printf("\n*** DIFFERENCES DETECTED ***\n");
@@ -522,8 +570,10 @@ int main(int argc, char* argv[]) {
    // if (cpu_total_clusters > 0 || gpu_total_clusters > 0) {
    //     size_t max_clusters_per_frame = 10;
    //     size_t max_frames = 100;
-    //     printf("\n--- Cluster details (up to %zu frames, %zu clusters each) ---\n", max_frames, max_clusters_per_frame);
-    //     print_cluster_comparison(cpu_results, gpu_results, max_clusters_per_frame, max_frames);
+    //     printf("\n--- Cluster details (up to %zu frames, %zu clusters each)
+    //     ---\n", max_frames, max_clusters_per_frame);
+    //     print_cluster_comparison(cpu_results, gpu_results,
+    //     max_clusters_per_frame, max_frames);
    // }

    // =========================================================================
@@ -533,7 +583,7 @@ int main(int argc, char* argv[]) {

    if (use_data > 0) {
        const int N_ITER = 1000;
-        const auto& bench_frame = frames[0];
+        const auto &bench_frame = frames[0];

        // CPU benchmark
        {
@@ -558,44 +608,43 @@ int main(int argc, char* argv[]) {
        }
        // --- GPU benchmark (single frame, single stream) ---
        {
-            aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
-                {ROWS, COLS}, nSigma);
+            aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
+                cuda_cf({ROWS, COLS}, nSigma);
            feed_pedestal(cuda_cf, pedestal_frames);
- 
+
            // Warmup
            cuda_cf.find_clusters(bench_frame.view(), 0);
            cuda_cf.steal_clusters(true);
- 
+
            Timer t;
            t.start();
            for (int iter = 0; iter < N_ITER; ++iter) {
                cuda_cf.find_clusters(bench_frame.view(), 0);
                cuda_cf.steal_clusters(true);
            }
-            printf("GPU:          %.3f ms/frame (H2D + kernel + D2H, single frame, single stream)\n",
+            printf("GPU:          %.3f ms/frame (H2D + kernel + D2H, single "
+                   "frame, single stream)\n",
                   t.elapsed_ms() / N_ITER);
        }
- 
+
        // --- GPU benchmark (batched + multi-streamed) ---
        {
            constexpr size_t BATCH_SIZE = 500;
-            constexpr int    N_STREAMS  = 10;
- 
-            aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
-                {ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
+            constexpr int N_STREAMS = 10;
+
+            aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
+                cuda_cf({ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
            feed_pedestal(cuda_cf, pedestal_frames);
- 
+
            // Build one contiguous batch of BATCH_SIZE copies of bench_frame
            std::vector<FRAME_TYPE> batch(BATCH_SIZE * ROWS * COLS);
            for (size_t k = 0; k < BATCH_SIZE; ++k)
-                std::memcpy(batch.data() + k * ROWS * COLS,
-                            bench_frame.data(),
+                std::memcpy(batch.data() + k * ROWS * COLS, bench_frame.data(),
                            ROWS * COLS * sizeof(FRAME_TYPE));
- 
+
            aare::NDView<FRAME_TYPE, 3> batch_view(
-                batch.data(),
-                {static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
- 
+                batch.data(), {static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
+
            // Warmup. The kernel mutates the device-side pedestal for every
            // non-photon pixel, so after a 500-frame warmup the pedestal
            // state has drifted. Reset to a clean baseline by clearing the
@@ -604,17 +653,19 @@ int main(int argc, char* argv[]) {
            (void)cuda_cf.find_clusters_batched(batch_view, 0);
            cuda_cf.clear_pedestal();
            feed_pedestal(cuda_cf, pedestal_frames);
- 
-            const size_t n_iter_batches = (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
- 
+
+            const size_t n_iter_batches =
+                (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
+
            Timer t;
            t.start();
            for (size_t b = 0; b < n_iter_batches; ++b) {
                (void)cuda_cf.find_clusters_batched(batch_view, b * BATCH_SIZE);
            }
-            printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, batch=%zu, streams=%d)\n",
-                   t.elapsed_ms() / (n_iter_batches * BATCH_SIZE),
-                   BATCH_SIZE, N_STREAMS);
+            printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, "
+                   "batch=%zu, streams=%d)\n",
+                   t.elapsed_ms() / (n_iter_batches * BATCH_SIZE), BATCH_SIZE,
+                   N_STREAMS);
        }
    }