From a086cbb897ac45514bbaca41f8d2c9a28ccf1112 Mon Sep 17 00:00:00 2001 From: kferjaoui Date: Mon, 27 Apr 2026 11:27:47 +0200 Subject: [PATCH] Format CUDA cluster finder files --- include/aare/ClusterFinderCUDA.hpp | 323 ++++++++++++------------ include/aare/clusterfinder_kernel.cuh | 158 +++++++----- include/aare/utils/batch.hpp | 20 +- include/aare/utils/cuda_check.cuh | 12 +- python/src/bind_ClusterFinderCUDA.hpp | 43 ++-- python/src/bind_ClusterVector.hpp | 3 +- python/src/cuda_bindings.cu | 38 +-- src/ClusterFinderCUDA.test.cu | 351 +++++++++++++++----------- 8 files changed, 508 insertions(+), 440 deletions(-) diff --git a/include/aare/ClusterFinderCUDA.hpp b/include/aare/ClusterFinderCUDA.hpp index 3c95acd..abf8616 100644 --- a/include/aare/ClusterFinderCUDA.hpp +++ b/include/aare/ClusterFinderCUDA.hpp @@ -1,31 +1,29 @@ // SPDX-License-Identifier: MPL-2.0 #pragma once -#include -#include -#include -#include #include "aare/ClusterFinder.hpp" #include "aare/clusterfinder_kernel.cuh" #include "aare/utils/cuda_check.cuh" +#include +#include +#include +#include namespace aare { - + // Per-stream device resources template -struct StreamContext{ - cudaStream_t stream = nullptr; // handle to the stream - FRAME_TYPE* d_frame = nullptr; - PEDESTAL_TYPE* d_pd_mean = nullptr; - PEDESTAL_TYPE* d_pd_sum = nullptr; - PEDESTAL_TYPE* d_pd_sum2 = nullptr; - ClusterType* d_clusters = nullptr; - uint32_t* d_cluster_count = nullptr; +struct StreamContext { + cudaStream_t stream = nullptr; // handle to the stream + FRAME_TYPE *d_frame = nullptr; + PEDESTAL_TYPE *d_pd_mean = nullptr; + PEDESTAL_TYPE *d_pd_sum = nullptr; + PEDESTAL_TYPE *d_pd_sum2 = nullptr; + ClusterType *d_clusters = nullptr; + uint32_t *d_cluster_count = nullptr; }; - template , - typename FRAME_TYPE = uint16_t, - typename PEDESTAL_TYPE = double, + typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double, typename = std::enable_if_t::value>> class ClusterFinderCUDA { @@ -33,25 +31,25 @@ class ClusterFinderCUDA { static constexpr int BLOCK_Y = 16; static constexpr int col_radius = ClusterType::cluster_size_x / 2; static constexpr int row_radius = ClusterType::cluster_size_y / 2; - - Shape<2> m_image_size; - size_t nrows; - size_t ncols; - size_t image_size; // nrows * ncols - int n_streams; - size_t m_capacity; - PEDESTAL_TYPE m_nSigma; - Pedestal m_pedestal; - ClusterVector m_clusters; - bool m_pedestal_dirty = true; - + Shape<2> m_image_size; + size_t nrows; + size_t ncols; + size_t image_size; // nrows * ncols + int n_streams; + size_t m_capacity; + + PEDESTAL_TYPE m_nSigma; + Pedestal m_pedestal; + ClusterVector m_clusters; + bool m_pedestal_dirty = true; + using SC = StreamContext; std::vector v_sc; // Kernel parameters - dim3 grid; - dim3 block; + dim3 grid; + dim3 block; size_t shmem_bytes; public: @@ -59,79 +57,87 @@ class ClusterFinderCUDA { * @brief Construct a ClusterFinderCUDA * * @param image_size shape of the detector frame (rows, cols) - * @param nSigma threshold in units of per-pixel pedestal std + * @param nSigma threshold in units of per-pixel pedestal + * std * @param capacity device-side cluster buffer size per stream - * @param n_streams number of CUDA streams for multi-frame overlap + * @param n_streams number of CUDA streams for multi-frame + * overlap */ - ClusterFinderCUDA(Shape<2> image_size_, - PEDESTAL_TYPE nSigma = 5.0, - size_t capacity = 1000000, - int n_streams_ = 1) - : m_image_size(image_size_), - nrows(image_size_[0]), - ncols(image_size_[1]), - image_size(nrows * ncols), - n_streams(n_streams_), - m_capacity(capacity), - m_nSigma(nSigma), - m_pedestal(image_size_[0], image_size_[1]), - m_clusters(capacity) - { + ClusterFinderCUDA(Shape<2> image_size_, PEDESTAL_TYPE nSigma = 5.0, + size_t capacity = 1000000, int n_streams_ = 1) + : m_image_size(image_size_), nrows(image_size_[0]), + ncols(image_size_[1]), image_size(nrows * ncols), + n_streams(n_streams_), m_capacity(capacity), m_nSigma(nSigma), + m_pedestal(image_size_[0], image_size_[1]), m_clusters(capacity) { // Grid/Block dimensions block = dim3(BLOCK_X, BLOCK_Y); - grid = dim3((static_cast(ncols) + BLOCK_X - 1) / BLOCK_X, - (static_cast(nrows) + BLOCK_Y - 1) / BLOCK_Y); + grid = dim3((static_cast(ncols) + BLOCK_X - 1) / BLOCK_X, + (static_cast(nrows) + BLOCK_Y - 1) / BLOCK_Y); + + // Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + + // 2*row_radius) doubles + shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * + sizeof(PEDESTAL_TYPE); - // Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles - shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE); - v_sc.resize(n_streams); - for(int k=0; k < n_streams; ++k) { - auto& sc = v_sc[k]; - CUDA_CHECK(cudaStreamCreate(&sc.stream)); - CUDA_CHECK(cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE))); - CUDA_CHECK(cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE))); - CUDA_CHECK(cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE))); - CUDA_CHECK(cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE))); - CUDA_CHECK(cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType))); + for (int k = 0; k < n_streams; ++k) { + auto &sc = v_sc[k]; + CUDA_CHECK(cudaStreamCreate(&sc.stream)); + CUDA_CHECK( + cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE))); + CUDA_CHECK( + cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE))); + CUDA_CHECK( + cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE))); + CUDA_CHECK( + cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE))); + CUDA_CHECK( + cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType))); CUDA_CHECK(cudaMalloc(&sc.d_cluster_count, sizeof(uint32_t))); - } + } } ~ClusterFinderCUDA() { - for(auto& sc : v_sc) { - if (sc.d_frame) cudaFree(sc.d_frame); - if (sc.d_pd_mean) cudaFree(sc.d_pd_mean); - if (sc.d_pd_sum) cudaFree(sc.d_pd_sum); - if (sc.d_pd_sum2) cudaFree(sc.d_pd_sum2); - if (sc.d_clusters) cudaFree(sc.d_clusters); - if (sc.d_cluster_count) cudaFree(sc.d_cluster_count); - if (sc.stream) cudaStreamDestroy(sc.stream); + for (auto &sc : v_sc) { + if (sc.d_frame) + cudaFree(sc.d_frame); + if (sc.d_pd_mean) + cudaFree(sc.d_pd_mean); + if (sc.d_pd_sum) + cudaFree(sc.d_pd_sum); + if (sc.d_pd_sum2) + cudaFree(sc.d_pd_sum2); + if (sc.d_clusters) + cudaFree(sc.d_clusters); + if (sc.d_cluster_count) + cudaFree(sc.d_cluster_count); + if (sc.stream) + cudaStreamDestroy(sc.stream); } } // Non-copyable, non-movable - ClusterFinderCUDA(const ClusterFinderCUDA&) = delete; - ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete; - ClusterFinderCUDA(ClusterFinderCUDA&&) = delete; - ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete; + ClusterFinderCUDA(const ClusterFinderCUDA &) = delete; + ClusterFinderCUDA &operator=(const ClusterFinderCUDA &) = delete; + ClusterFinderCUDA(ClusterFinderCUDA &&) = delete; + ClusterFinderCUDA &operator=(ClusterFinderCUDA &&) = delete; + + void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; } + PEDESTAL_TYPE get_nSigma() const { return m_nSigma; } - void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; } - PEDESTAL_TYPE get_nSigma() const { return m_nSigma; } - void push_pedestal_frame(NDView frame) { m_pedestal.push(frame); m_pedestal_dirty = true; } - + void clear_pedestal() { m_pedestal.clear(); m_pedestal_dirty = true; } - + NDArray pedestal() { return m_pedestal.mean(); } - NDArray noise() { return m_pedestal.std(); } - + NDArray noise() { return m_pedestal.std(); } + /** * @brief Move clusters out of the internal ClusterVector, optionally * reallocating the internal one with the same capacity. @@ -150,52 +156,47 @@ class ClusterFinderCUDA { * @brief Find clusters in a single frame, appending them to the internal * ClusterVector. */ - void find_clusters(NDView frame, uint64_t frame_number = 0) - { + void find_clusters(NDView frame, uint64_t frame_number = 0) { if (m_pedestal_dirty) { // need to update the pedestal on the gpu sync_pedestal_to_device(); m_pedestal_dirty = false; } - auto& sc = v_sc[0]; + auto &sc = v_sc[0]; const size_t image_bytes = image_size * sizeof(FRAME_TYPE); - const uint32_t n_pd_samples = static_cast(m_pedestal.n_samples()); + const uint32_t n_pd_samples = + static_cast(m_pedestal.n_samples()); // Reset cluster counter - CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t), sc.stream)); - - // Upload frame - CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes, cudaMemcpyHostToDevice, sc.stream)); - - // Kernel launch - device::find_clusters_in_single_frame - <<>>(sc.d_frame, - sc.d_pd_mean, - sc.d_pd_sum, - sc.d_pd_sum2, - n_pd_samples, - m_nSigma, - nrows, - ncols, - sc.d_clusters, - sc.d_cluster_count, - static_cast(m_capacity) - ); - CUDA_CHECK(cudaGetLastError()); - - // Read back cluster count - uint32_t n_found = 0; - CUDA_CHECK(cudaMemcpyAsync(&n_found, - sc.d_cluster_count, - sizeof(uint32_t), - cudaMemcpyDeviceToHost, + CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t), sc.stream)); - // Synchronize to ensure count is available before the CPU reads clusters + // Upload frame + CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes, + cudaMemcpyHostToDevice, sc.stream)); + + // Kernel launch + device::find_clusters_in_single_frame + <<>>( + sc.d_frame, sc.d_pd_mean, sc.d_pd_sum, sc.d_pd_sum2, + n_pd_samples, m_nSigma, nrows, ncols, sc.d_clusters, + sc.d_cluster_count, static_cast(m_capacity)); + CUDA_CHECK(cudaGetLastError()); + + // Read back cluster count + uint32_t n_found = 0; + CUDA_CHECK(cudaMemcpyAsync(&n_found, sc.d_cluster_count, + sizeof(uint32_t), cudaMemcpyDeviceToHost, + sc.stream)); + + // Synchronize to ensure count is available before the CPU reads + // clusters CUDA_CHECK(cudaStreamSynchronize(sc.stream)); // Clamp to max in case of overflow - if (n_found > m_capacity) n_found = m_capacity; + if (n_found > m_capacity) + n_found = m_capacity; // Read back clusters m_clusters.set_frame_number(frame_number); @@ -213,74 +214,79 @@ class ClusterFinderCUDA { */ std::vector> find_clusters_batched(NDView frames, - uint64_t first_frame = 0) - { + uint64_t first_frame = 0) { if (m_pedestal_dirty) { sync_pedestal_to_device(); m_pedestal_dirty = false; } - - const size_t n_frames = frames.shape(0); - const size_t image_bytes = image_size * sizeof(FRAME_TYPE); - const uint32_t n_pd_samples = static_cast(m_pedestal.n_samples()); - + + const size_t n_frames = frames.shape(0); + const size_t image_bytes = image_size * sizeof(FRAME_TYPE); + const uint32_t n_pd_samples = + static_cast(m_pedestal.n_samples()); + std::vector> results; results.reserve(n_frames); for (size_t i = 0; i < n_frames; ++i) { results.emplace_back(m_capacity); results.back().set_frame_number(first_frame + i); } - + std::vector host_counts(n_streams, 0); - + const size_t n_rounds = (n_frames + n_streams - 1) / n_streams; for (size_t round = 0; round < n_rounds; ++round) { - + // Launch phase: fan out kernels on all streams for this round for (int k = 0; k < n_streams; ++k) { // OOB guard const size_t frame_idx = round * n_streams + k; - if (frame_idx >= n_frames) continue; - - auto& sc_k = v_sc[k]; - const FRAME_TYPE* h_src = frames.data() + frame_idx * image_size; - + if (frame_idx >= n_frames) + continue; + + auto &sc_k = v_sc[k]; + const FRAME_TYPE *h_src = + frames.data() + frame_idx * image_size; + CUDA_CHECK(cudaMemsetAsync(sc_k.d_cluster_count, 0, sizeof(uint32_t), sc_k.stream)); CUDA_CHECK(cudaMemcpyAsync(sc_k.d_frame, h_src, image_bytes, - cudaMemcpyHostToDevice, sc_k.stream)); - - device::find_clusters_in_single_frame + cudaMemcpyHostToDevice, + sc_k.stream)); + + device::find_clusters_in_single_frame <<>>( - sc_k.d_frame, - sc_k.d_pd_mean, sc_k.d_pd_sum, sc_k.d_pd_sum2, - n_pd_samples, m_nSigma, - nrows, ncols, + sc_k.d_frame, sc_k.d_pd_mean, sc_k.d_pd_sum, + sc_k.d_pd_sum2, n_pd_samples, m_nSigma, nrows, ncols, sc_k.d_clusters, sc_k.d_cluster_count, m_capacity); CUDA_CHECK(cudaGetLastError()); } - + // Drain phase: fan in results from all streams for (int k = 0; k < n_streams; ++k) { const size_t frame_idx = round * n_streams + k; - if (frame_idx >= n_frames) continue; - - auto& sc_k = v_sc[k]; - - CUDA_CHECK(cudaMemcpyAsync(&host_counts[k], sc_k.d_cluster_count, - sizeof(uint32_t), - cudaMemcpyDeviceToHost, sc_k.stream)); + if (frame_idx >= n_frames) + continue; + + auto &sc_k = v_sc[k]; + + CUDA_CHECK(cudaMemcpyAsync( + &host_counts[k], sc_k.d_cluster_count, sizeof(uint32_t), + cudaMemcpyDeviceToHost, sc_k.stream)); CUDA_CHECK(cudaStreamSynchronize(sc_k.stream)); - + uint32_t n_found = host_counts[k]; - if (n_found > m_capacity) n_found = m_capacity; - + if (n_found > m_capacity) + n_found = m_capacity; + if (n_found > 0) { - append_device_clusters_to(results[frame_idx], sc_k, n_found); + append_device_clusters_to(results[frame_idx], sc_k, + n_found); } } } - + return results; } @@ -295,34 +301,35 @@ class ClusterFinderCUDA { // copies complete, so we synchronise at the end before they go out // of scope. NDArray h_mean = m_pedestal.mean(); - NDArray h_sum = m_pedestal.get_sum(); + NDArray h_sum = m_pedestal.get_sum(); NDArray h_sum2 = m_pedestal.get_sum2(); - + const size_t bytes = image_size * sizeof(PEDESTAL_TYPE); - for (auto& sc : v_sc) { + for (auto &sc : v_sc) { CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_mean, h_mean.data(), bytes, cudaMemcpyHostToDevice, sc.stream)); - CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes, + CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes, cudaMemcpyHostToDevice, sc.stream)); CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum2, h_sum2.data(), bytes, cudaMemcpyHostToDevice, sc.stream)); } - for (auto& sc : v_sc) CUDA_CHECK(cudaStreamSynchronize(sc.stream)); + for (auto &sc : v_sc) + CUDA_CHECK(cudaStreamSynchronize(sc.stream)); } - + /** * Copy n_found clusters from sc.d_clusters into the given ClusterVector - * and block on the transfer. + * and block on the transfer. */ - void append_device_clusters_to(ClusterVector& cv, - SC& sc, uint32_t n_found) - { + void append_device_clusters_to(ClusterVector &cv, SC &sc, + uint32_t n_found) { std::vector staging(n_found); CUDA_CHECK(cudaMemcpyAsync(staging.data(), sc.d_clusters, n_found * sizeof(ClusterType), cudaMemcpyDeviceToHost, sc.stream)); CUDA_CHECK(cudaStreamSynchronize(sc.stream)); - for (const auto& c : staging) cv.push_back(c); + for (const auto &c : staging) + cv.push_back(c); } }; diff --git a/include/aare/clusterfinder_kernel.cuh b/include/aare/clusterfinder_kernel.cuh index 92e1b07..f6002db 100644 --- a/include/aare/clusterfinder_kernel.cuh +++ b/include/aare/clusterfinder_kernel.cuh @@ -1,28 +1,22 @@ #pragma once -#include -#include #include "aare/Cluster.hpp" #include "aare/ClusterFinder.hpp" +#include +#include namespace aare::device { template , - typename FRAME_TYPE = uint16_t, - typename PEDESTAL_TYPE = double, + typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double, typename = std::enable_if_t::value>> -__global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d_frame, - PEDESTAL_TYPE* __restrict__ d_pd_mean, - PEDESTAL_TYPE* __restrict__ d_pd_sum, - PEDESTAL_TYPE* __restrict__ d_pd_sum2, - const uint32_t n_pd_samples, - const PEDESTAL_TYPE m_nSigma, - const size_t nrows, - const size_t ncols, - // const uint64_t frame_number, - ClusterType* d_clusters, - uint32_t* d_cluster_count, - const uint32_t max_clusters) -{ +__global__ void find_clusters_in_single_frame( + const FRAME_TYPE *__restrict__ d_frame, + PEDESTAL_TYPE *__restrict__ d_pd_mean, PEDESTAL_TYPE *__restrict__ d_pd_sum, + PEDESTAL_TYPE *__restrict__ d_pd_sum2, const uint32_t n_pd_samples, + const PEDESTAL_TYPE m_nSigma, const size_t nrows, const size_t ncols, + // const uint64_t frame_number, + ClusterType *d_clusters, uint32_t *d_cluster_count, + const uint32_t max_clusters) { using CT = typename ClusterType::value_type; // Compile-time cluster geometry useful for unrolling loops @@ -38,10 +32,12 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d constexpr int pow2_c3 = CSX * CSY; // Thread/pixel mapping - auto col_global = static_cast(threadIdx.x + blockDim.x * blockIdx.x); - auto row_global = static_cast(threadIdx.y + blockDim.y * blockIdx.y); + auto col_global = + static_cast(threadIdx.x + blockDim.x * blockIdx.x); + auto row_global = + static_cast(threadIdx.y + blockDim.y * blockIdx.y); auto global_tid = static_cast(col_global + ncols * row_global); - auto local_tid = threadIdx.x + blockDim.x * threadIdx.y; + auto local_tid = threadIdx.x + blockDim.x * threadIdx.y; // ==================== // Shared memory layout @@ -55,19 +51,22 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d // CUDA prefers raw bytes + aligned cast // Compile error happens when using: `extern __shared__ T sh[];` extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[]; - PEDESTAL_TYPE* shmem = reinterpret_cast(smem); + PEDESTAL_TYPE *shmem = reinterpret_cast(smem); // Stride includes halo on both sides auto shmem_stride = static_cast(blockDim.x) + 2 * col_radius; - auto tile_size = shmem_stride * (static_cast(blockDim.y) + 2 * row_radius); + auto tile_size = + shmem_stride * (static_cast(blockDim.y) + 2 * row_radius); // Offset so that thread (0,0) maps to shared-memory position // (row_radius, col_radius) i.e. past the top-left halo. - auto shmem_tid = (static_cast(threadIdx.y) + row_radius) * shmem_stride - + (static_cast(threadIdx.x) + col_radius); + auto shmem_tid = + (static_cast(threadIdx.y) + row_radius) * shmem_stride + + (static_cast(threadIdx.x) + col_radius); // Cooperative zero-fill - for (int idx = static_cast(local_tid); idx < tile_size; idx += static_cast(blockDim.x * blockDim.y)) { + for (int idx = static_cast(local_tid); idx < tile_size; + idx += static_cast(blockDim.x * blockDim.y)) { shmem[idx] = PEDESTAL_TYPE{0}; } __syncthreads(); @@ -83,7 +82,7 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d // Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB. // gr, gc are the global row/col of the pixel to load. // Returns the pedestal-subtracted value. - auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE { + auto load_pixel = [&] __device__(ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE { auto gid = gc + static_cast(ncols) * gr; return static_cast(d_frame[gid]) - d_pd_mean[gid]; }; @@ -98,21 +97,25 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d if (threadIdx.y == 0 && valid_pixel) { for (int i = 1; i <= row_radius; ++i) { if (row_global - i >= 0) - shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global); + shmem[shmem_tid - i * shmem_stride] = + load_pixel(row_global - i, col_global); } // Top-left corner rectangle if (threadIdx.x == 0) { for (int i = 1; i <= row_radius; ++i) for (int j = 1; j <= col_radius; ++j) if (row_global - i >= 0 && col_global - j >= 0) - shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j); + shmem[shmem_tid - i * shmem_stride - j] = + load_pixel(row_global - i, col_global - j); } // Top-right corner rectangle if (threadIdx.x == blockDim.x - 1) { for (int i = 1; i <= row_radius; ++i) for (int j = 1; j <= col_radius; ++j) - if (row_global - i >= 0 && col_global + j < static_cast(ncols)) - shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j); + if (row_global - i >= 0 && + col_global + j < static_cast(ncols)) + shmem[shmem_tid - i * shmem_stride + j] = + load_pixel(row_global - i, col_global + j); } } @@ -134,21 +137,26 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d if (threadIdx.y == blockDim.y - 1 && valid_pixel) { for (int i = 1; i <= row_radius; ++i) { if (row_global + i < static_cast(nrows)) - shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global); + shmem[shmem_tid + i * shmem_stride] = + load_pixel(row_global + i, col_global); } // Bottom-left corner rectangle if (threadIdx.x == 0) { for (int i = 1; i <= row_radius; ++i) for (int j = 1; j <= col_radius; ++j) - if (row_global + i < static_cast(nrows) && col_global - j >= 0) - shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j); + if (row_global + i < static_cast(nrows) && + col_global - j >= 0) + shmem[shmem_tid + i * shmem_stride - j] = + load_pixel(row_global + i, col_global - j); } // Bottom-right corner rectangle if (threadIdx.x == blockDim.x - 1) { for (int i = 1; i <= row_radius; ++i) for (int j = 1; j <= col_radius; ++j) - if (row_global + i < static_cast(nrows) && col_global + j < static_cast(ncols)) - shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j); + if (row_global + i < static_cast(nrows) && + col_global + j < static_cast(ncols)) + shmem[shmem_tid + i * shmem_stride + j] = + load_pixel(row_global + i, col_global + j); } } @@ -157,51 +165,57 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d // ===================== // Cluster-finding logic // ===================== - if (!valid_pixel) return; + if (!valid_pixel) + return; // Per-pixel RMS from global pedestal arrays // rms = sqrt( E[X^2] - E[X]^2 ) - PEDESTAL_TYPE mean_px = d_pd_mean[global_tid]; - PEDESTAL_TYPE var_px = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px; - PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2 - PEDESTAL_TYPE rms_px = sqrt(rms_sq); + PEDESTAL_TYPE mean_px = d_pd_mean[global_tid]; + PEDESTAL_TYPE var_px = + d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px; + PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2 + PEDESTAL_TYPE rms_px = sqrt(rms_sq); // Pedestal-subtracted value of the center pixel (already in shmem) PEDESTAL_TYPE val_pixel = shmem[shmem_tid]; - // Negative pedestal early exit + // Negative pedestal early exit if (val_pixel < -m_nSigma * rms_px) - return; // NOTE: pedestal update for this pixel is skipped (same as sequential) + return; // NOTE: pedestal update for this pixel is skipped (same as + // sequential) // Stencil reduction: total, max, quadrant sums PEDESTAL_TYPE total = PEDESTAL_TYPE{0}; - PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits::lowest()` + PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of + // `numeric_limits::lowest()` - /* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, ic<=0) - PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, ic>=0) - PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, ic<=0) - PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, ic>=0) */ + /* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, + ic<=0) PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, + ic>=0) PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, + ic<=0) PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, + ic>=0) */ CT clusterData[CSX * CSY]; int idx = 0; // tracks the pixels in the cluster - #pragma unroll +#pragma unroll for (int ir = -row_radius; ir <= row_radius; ++ir) { - #pragma unroll +#pragma unroll for (int ic = -col_radius; ic <= col_radius; ++ic) { PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic]; total += val; max_val = max(max_val, val); - /* // Quadrant accumulation (pixels on the axes contribute to two quadrants) - if (ir <= 0 && ic <= 0) tl += val; - if (ir <= 0 && ic >= 0) tr += val; - if (ir >= 0 && ic <= 0) bl += val; - if (ir >= 0 && ic >= 0) br += val; */ + /* // Quadrant accumulation (pixels on the axes contribute to two + quadrants) if (ir <= 0 && ic <= 0) tl += val; if (ir <= 0 && ic >= + 0) tr += val; if (ir >= 0 && ic <= 0) bl += val; if (ir >= 0 && ic + >= 0) br += val; */ - // Store pedestal-subtracted value in register array for later cluster output - if constexpr (std::is_integral_v && std::is_floating_point_v) + // Store pedestal-subtracted value in register array for later + // cluster output + if constexpr (std::is_integral_v && + std::is_floating_point_v) clusterData[idx] = static_cast(lround(val)); else clusterData[idx] = static_cast(val); @@ -213,10 +227,11 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d // Three-way classification (mirrors ClusterFinder's logic) // // 1. Single-pixel significance: max_val > nSigma * rms - // -> only the pixel that IS the max gets recorded (local-max suppression) + // -> only the pixel that IS the max gets recorded (local-max + // suppression) // // 2. Quadrant significance: max(tl,tr,bl,br) > c2 * nSigma * rms - // -> charge-sharing events where a 2x2 sub-region is significant + // -> charge-sharing events where a 2x2 sub-region is significant // NOTE: This test is absent in the serial ClusterFinder! // // 3. Total significance: total > c3 * nSigma * rms @@ -228,9 +243,10 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d // Test 1: single-pixel significance if (max_val > m_nSigma * rms_px) { - // Local-max suppression: only the center-pixel thread records the cluster + // Local-max suppression: only the center-pixel thread records the + // cluster if (val_pixel < max_val) - return; // some other pixel in the neighborhood is brighter + return; // some other pixel in the neighborhood is brighter is_photon = true; } @@ -249,29 +265,30 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d } } - // Pedestal update (if not a photon) - // In the sequential code, non-photon pixels feed back into the running pedestal via push_fast(). - // In this kernel, the GPU updates all pixels in a frame simultaneously. So the updated pedestal - // will only be used starting from the next frame. -> This avoids a/serialization and b/global mem I/O. + // Pedestal update (if not a photon) + // In the sequential code, non-photon pixels feed back into the running + // pedestal via push_fast(). In this kernel, the GPU updates all pixels in a + // frame simultaneously. So the updated pedestal will only be used starting + // from the next frame. -> This avoids a/serialization and b/global mem I/O. if (!is_photon && valid_pixel) { PEDESTAL_TYPE raw_val = static_cast(d_frame[global_tid]); - PEDESTAL_TYPE sum = d_pd_sum[global_tid]; + PEDESTAL_TYPE sum = d_pd_sum[global_tid]; PEDESTAL_TYPE sum2 = d_pd_sum2[global_tid]; - sum += raw_val - sum / n_pd_samples; + sum += raw_val - sum / n_pd_samples; sum2 += raw_val * raw_val - sum2 / n_pd_samples; - d_pd_sum[global_tid] = sum; + d_pd_sum[global_tid] = sum; d_pd_sum2[global_tid] = sum2; d_pd_mean[global_tid] = sum / n_pd_samples; return; } - /* + /* if (!is_photon) return; // Debugging */ - // Write cluster to global output buffer using atomic index + // Write cluster to global output buffer using atomic index // for coordination across all blocks uint32_t write_idx = atomicAdd(d_cluster_count, 1u); @@ -283,7 +300,8 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d cluster.x = static_cast(col_global); cluster.y = static_cast(row_global); - memcpy(reinterpret_cast(&cluster.data), clusterData, sizeof(CT) * CSX * CSY); + memcpy(reinterpret_cast(&cluster.data), clusterData, + sizeof(CT) * CSX * CSY); d_clusters[write_idx] = cluster; } diff --git a/include/aare/utils/batch.hpp b/include/aare/utils/batch.hpp index c3122f6..4584a67 100644 --- a/include/aare/utils/batch.hpp +++ b/include/aare/utils/batch.hpp @@ -1,17 +1,15 @@ // SPDX-License-Identifier: MPL-2.0 #pragma once -#include -#include #include "aare/NDArray.hpp" - +#include +#include template -void pack_frame_batch(const std::vector>& frames, - size_t first_frame, - size_t n_frames, - std::vector& batch) -{ - if (n_frames == 0) return; +void pack_frame_batch(const std::vector> &frames, + size_t first_frame, size_t n_frames, + std::vector &batch) { + if (n_frames == 0) + return; const size_t rows = frames[first_frame].shape(0); const size_t cols = frames[first_frame].shape(1); @@ -23,8 +21,8 @@ void pack_frame_batch(const std::vector>& frames, } for (size_t k = 0; k < n_frames; ++k) { - const FRAME_TYPE* src = frames[first_frame + k].data(); - FRAME_TYPE* dst = batch.data() + k * image_size; + const FRAME_TYPE *src = frames[first_frame + k].data(); + FRAME_TYPE *dst = batch.data() + k * image_size; std::memcpy(dst, src, image_size * sizeof(FRAME_TYPE)); } } \ No newline at end of file diff --git a/include/aare/utils/cuda_check.cuh b/include/aare/utils/cuda_check.cuh index f091da3..6918756 100644 --- a/include/aare/utils/cuda_check.cuh +++ b/include/aare/utils/cuda_check.cuh @@ -3,12 +3,12 @@ #include #include -inline void __cuda_check(cudaError_t err, const char* file, int line) { - if (err != cudaSuccess) { - std::fprintf(stderr, "[%s:%d] CUDA error: %s\n", - file, line, cudaGetErrorString(err)); - std::exit(1); - } +inline void __cuda_check(cudaError_t err, const char *file, int line) { + if (err != cudaSuccess) { + std::fprintf(stderr, "[%s:%d] CUDA error: %s\n", file, line, + cudaGetErrorString(err)); + std::exit(1); + } } #define CUDA_CHECK(stmt) __cuda_check((stmt), __FILE__, __LINE__) \ No newline at end of file diff --git a/python/src/bind_ClusterFinderCUDA.hpp b/python/src/bind_ClusterFinderCUDA.hpp index cb4afbe..5ef017e 100644 --- a/python/src/bind_ClusterFinderCUDA.hpp +++ b/python/src/bind_ClusterFinderCUDA.hpp @@ -27,19 +27,15 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) { auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr); using ClusterType = Cluster; - using CF = ClusterFinderCUDA; + using CF = ClusterFinderCUDA; py::class_(m, class_name.c_str()) - .def(py::init, pd_type, size_t, int>(), - py::arg("image_size"), - py::arg("n_sigma") = 5.0, - py::arg("capacity") = 1'000'000, - py::arg("n_streams") = 1) + .def(py::init, pd_type, size_t, int>(), py::arg("image_size"), + py::arg("n_sigma") = 5.0, py::arg("capacity") = 1'000'000, + py::arg("n_streams") = 1) .def_property( - "nSigma", - &CF::get_nSigma, - &CF::set_nSigma, + "nSigma", &CF::get_nSigma, &CF::set_nSigma, R"(Number of sigma above the pedestal to consider a photon during cluster finding.)") .def("push_pedestal_frame", @@ -50,21 +46,19 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) { .def("clear_pedestal", &CF::clear_pedestal) - .def_property_readonly( - "pedestal", - [](CF &self) { - auto pd = new NDArray{}; - *pd = self.pedestal(); - return return_image_data(pd); - }) + .def_property_readonly("pedestal", + [](CF &self) { + auto pd = new NDArray{}; + *pd = self.pedestal(); + return return_image_data(pd); + }) - .def_property_readonly( - "noise", - [](CF &self) { - auto arr = new NDArray{}; - *arr = self.noise(); - return return_image_data(arr); - }) + .def_property_readonly("noise", + [](CF &self) { + auto arr = new NDArray{}; + *arr = self.noise(); + return return_image_data(arr); + }) .def( "steal_clusters", @@ -86,7 +80,8 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) { .def( "find_clusters_batched", [](CF &self, py::array_t frames, uint64_t first_frame) { - // frames is expected as a 3D numpy array (n_frames, nrows, ncols) + // frames is expected as a 3D numpy array (n_frames, nrows, + // ncols) auto view = make_view_3d(frames); return self.find_clusters_batched(view, first_frame); }, diff --git a/python/src/bind_ClusterVector.hpp b/python/src/bind_ClusterVector.hpp index d4cbc3f..eb3670f 100644 --- a/python/src/bind_ClusterVector.hpp +++ b/python/src/bind_ClusterVector.hpp @@ -30,8 +30,7 @@ void define_ClusterVector(py::module &m, const std::string &typestr) { py::class_, void>>( - m, class_name.c_str(), - py::buffer_protocol(), + m, class_name.c_str(), py::buffer_protocol(), py::module_local()) .def(py::init()) // TODO change!!! diff --git a/python/src/cuda_bindings.cu b/python/src/cuda_bindings.cu index a30cd95..1d003d9 100644 --- a/python/src/cuda_bindings.cu +++ b/python/src/cuda_bindings.cu @@ -6,8 +6,8 @@ // back a usable ClusterVector without _aare needing to be imported first. #include "bind_Cluster.hpp" -#include "bind_ClusterVector.hpp" #include "bind_ClusterFinderCUDA.hpp" +#include "bind_ClusterVector.hpp" #include @@ -22,45 +22,45 @@ namespace py = pybind11; define_Cluster(m, #N "x" #M #TYPE_CODE); #define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE) \ - aare::define_ClusterFinderCUDA( \ - m, "Cluster" #N "x" #M #TYPE_CODE); + aare::define_ClusterFinderCUDA(m, "Cluster" #N \ + "x" #M #TYPE_CODE); PYBIND11_MODULE(_aare_cuda, m) { // Types first — finders reference them in their signatures. // SFINAE excludes 2x2 on ClusterFinderCUDA, so we skip it here too. - DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i); + DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i); DEFINE_CUDA_CLUSTER_TYPES(double, 3, 3, uint16_t, d); - DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f); + DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f); - DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i); + DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i); DEFINE_CUDA_CLUSTER_TYPES(double, 5, 5, uint16_t, d); - DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f); + DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f); - DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i); + DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i); DEFINE_CUDA_CLUSTER_TYPES(double, 7, 7, uint16_t, d); - DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f); + DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f); - DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i); + DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i); DEFINE_CUDA_CLUSTER_TYPES(double, 9, 9, uint16_t, d); - DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f); + DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f); // Finders - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i); DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 3, 3, uint16_t, d); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i); DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 5, 5, uint16_t, d); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i); DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 7, 7, uint16_t, d); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i); DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 9, 9, uint16_t, d); - DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f); + DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f); } #undef DEFINE_CUDA_CLUSTER_TYPES diff --git a/src/ClusterFinderCUDA.test.cu b/src/ClusterFinderCUDA.test.cu index cf725e2..caba093 100644 --- a/src/ClusterFinderCUDA.test.cu +++ b/src/ClusterFinderCUDA.test.cu @@ -9,14 +9,14 @@ #include #include -#include "aare/defs.hpp" -#include "aare/utils/batch.hpp" +#include "aare/ClusterFinder.hpp" +#include "aare/ClusterFinderCUDA.hpp" #include "aare/File.hpp" #include "aare/Frame.hpp" #include "aare/NDArray.hpp" #include "aare/Pedestal.hpp" -#include "aare/ClusterFinder.hpp" -#include "aare/ClusterFinderCUDA.hpp" +#include "aare/defs.hpp" +#include "aare/utils/batch.hpp" // _____________ // @@ -29,19 +29,19 @@ struct Timer { void start() { t0 = clock::now(); } double elapsed_ms() const { - return std::chrono::duration(clock::now() - t0).count(); + return std::chrono::duration(clock::now() - t0) + .count(); } }; // __________________ -// +// // Cluster comparison // __________________ -template -struct ClusterComparison { +template struct ClusterComparison { size_t cpu_count = 0; size_t gpu_count = 0; - size_t matched = 0; + size_t matched = 0; size_t position_mismatch = 0; size_t data_mismatch = 0; size_t cpu_only = 0; @@ -50,19 +50,20 @@ struct ClusterComparison { // Sort clusters by (y, x) for deterministic comparison template -void sort_clusters(std::vector& clusters) { +void sort_clusters(std::vector &clusters) { std::sort(clusters.begin(), clusters.end(), - [](const ClusterType& a, const ClusterType& b) { - if (a.y != b.y) return a.y < b.y; + [](const ClusterType &a, const ClusterType &b) { + if (a.y != b.y) + return a.y < b.y; return a.x < b.x; }); } // Compare two sorted cluster lists template -ClusterComparison compare_clusters(std::vector& cpu_clusters, - std::vector& gpu_clusters) -{ +ClusterComparison +compare_clusters(std::vector &cpu_clusters, + std::vector &gpu_clusters) { sort_clusters(cpu_clusters); sort_clusters(gpu_clusters); @@ -72,15 +73,17 @@ ClusterComparison compare_clusters(std::vector& cpu_cl size_t ci = 0, gi = 0; while (ci < cpu_clusters.size() && gi < gpu_clusters.size()) { - const auto& cc = cpu_clusters[ci]; - const auto& gc = gpu_clusters[gi]; + const auto &cc = cpu_clusters[ci]; + const auto &gc = gpu_clusters[gi]; if (cc.y == gc.y && cc.x == gc.x) { // Same position/check data bool data_ok = true; - constexpr int N = ClusterType::cluster_size_x * ClusterType::cluster_size_y; + constexpr int N = + ClusterType::cluster_size_x * ClusterType::cluster_size_y; for (int k = 0; k < N; ++k) { - // if (cc.data[k] != gc.data[k]) { // a bit too strict espacially that pedestal update is slightly different + // if (cc.data[k] != gc.data[k]) { // a bit too strict + // espacially that pedestal update is slightly different if (std::abs(cc.data[k] - gc.data[k]) > 5) { data_ok = false; break; @@ -111,80 +114,109 @@ ClusterComparison compare_clusters(std::vector& cpu_cl // Cluster printing // ________________ template -void print_cluster_comparison(const std::vector>& cpu_results, - const std::vector>& gpu_results, - size_t max_per_frame = 10, - size_t max_frames = 100) -{ +void print_cluster_comparison( + const std::vector> &cpu_results, + const std::vector> &gpu_results, + size_t max_per_frame = 10, size_t max_frames = 100) { constexpr int NX = ClusterType::cluster_size_x; constexpr int NY = ClusterType::cluster_size_y; - constexpr int N = NX * NY; - + constexpr int N = NX * NY; + size_t frames_shown = 0; - for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames; ++fi) { + for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames; + ++fi) { if (cpu_results[fi].empty() && gpu_results[fi].empty()) continue; - + size_t n_cpu = cpu_results[fi].size(); size_t n_gpu = gpu_results[fi].size(); - printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu, n_gpu); - + printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu, + n_gpu); + // Merge-walk over sorted lists (assumes already sorted by y,x) size_t ci = 0, gi = 0; size_t shown = 0; while ((ci < n_cpu || gi < n_gpu) && shown < max_per_frame) { bool have_cpu = ci < n_cpu; bool have_gpu = gi < n_gpu; - + // Determine if current entries match in position bool same_pos = have_cpu && have_gpu && cpu_results[fi][ci].x == gpu_results[fi][gi].x && cpu_results[fi][ci].y == gpu_results[fi][gi].y; - + if (same_pos) { - const auto& cc = cpu_results[fi][ci]; - const auto& gc = gpu_results[fi][gi]; - + const auto &cc = cpu_results[fi][ci]; + const auto &gc = gpu_results[fi][gi]; + // Check if data differs bool differs = false; for (int k = 0; k < N; ++k) { - if (cc.data[k] != gc.data[k]) { differs = true; break; } + if (cc.data[k] != gc.data[k]) { + differs = true; + break; + } } - printf(" CPU and GPU clusters found at SAME position (col=%3d, row=%3d)\n %s\n", cc.x, cc.y, - differs ? "DATA MISMATCH" : "MATCH"); + printf(" CPU and GPU clusters found at SAME position " + "(col=%3d, row=%3d)\n %s\n", + cc.x, cc.y, differs ? "DATA MISMATCH" : "MATCH"); printf(" CPU: ["); - for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast(cc.data[k])); } + for (int k = 0; k < N; ++k) { + if (k) + printf(", "); + printf("%6d", static_cast(cc.data[k])); + } printf("]\n"); if (differs) { printf(" GPU: ["); - for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast(gc.data[k])); } + for (int k = 0; k < N; ++k) { + if (k) + printf(", "); + printf("%6d", static_cast(gc.data[k])); + } printf("]\n"); printf(" diff: ["); for (int k = 0; k < N; ++k) { - if (k) printf(", "); - int d = static_cast(gc.data[k]) - static_cast(cc.data[k]); + if (k) + printf(", "); + int d = static_cast(gc.data[k]) - + static_cast(cc.data[k]); printf("%+6d", d); } printf("]\n"); } - ci++; gi++; shown++; - } else if (!have_gpu || (have_cpu && (cpu_results[fi][ci].y < gpu_results[fi][gi].y || - (cpu_results[fi][ci].y == gpu_results[fi][gi].y && - cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) { - const auto& cc = cpu_results[fi][ci]; + ci++; + gi++; + shown++; + } else if (!have_gpu || + (have_cpu && + (cpu_results[fi][ci].y < gpu_results[fi][gi].y || + (cpu_results[fi][ci].y == gpu_results[fi][gi].y && + cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) { + const auto &cc = cpu_results[fi][ci]; printf(" (%3d, %3d) CPU ONLY\n", cc.x, cc.y); printf(" CPU: ["); - for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast(cc.data[k])); } + for (int k = 0; k < N; ++k) { + if (k) + printf(", "); + printf("%6d", static_cast(cc.data[k])); + } printf("]\n"); - ci++; shown++; + ci++; + shown++; } else { - const auto& gc = gpu_results[fi][gi]; + const auto &gc = gpu_results[fi][gi]; printf(" (%3d, %3d) GPU ONLY\n", gc.x, gc.y); printf(" GPU: ["); - for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast(gc.data[k])); } + for (int k = 0; k < N; ++k) { + if (k) + printf(", "); + printf("%6d", static_cast(gc.data[k])); + } printf("]\n"); - gi++; shown++; + gi++; + shown++; } } frames_shown++; @@ -195,24 +227,25 @@ void print_cluster_comparison(const std::vector>& cpu_r // // Helpers for the updated (CPU-parity) API // _________________________________________ - + // Copy a ClusterVector into a std::vector for downstream // comparison code that expects the latter. template -void drain_into(Finder& f, std::vector& out) { +void drain_into(Finder &f, std::vector &out) { auto cv = f.steal_clusters(true); out.clear(); out.reserve(cv.size()); - for (size_t j = 0; j < cv.size(); ++j) out.push_back(cv[j]); + for (size_t j = 0; j < cv.size(); ++j) + out.push_back(cv[j]); } - + // Feed a set of cached pedestal frames through any finder exposing the // CPU-style push_pedestal_frame(NDView) method. Works for both // ClusterFinder and ClusterFinderCUDA. template -void feed_pedestal(Finder& f, - const std::vector>& ped_frames) { - for (const auto& frame : ped_frames) { +void feed_pedestal( + Finder &f, const std::vector> &ped_frames) { + for (const auto &frame : ped_frames) { f.push_pedestal_frame(frame.view()); } } @@ -221,16 +254,20 @@ void feed_pedestal(Finder& f, // // Main // ____________ -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // Parse arguments - const char* default_pedestal = - "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json"; - // const char* default_pedestal = + const char *default_pedestal = + "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/" + "2025Sept_m694/Sn25300eV/500_us_voltage_40kV/" + "250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json"; + // const char* default_pedestal = // "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_pedestal_200keV_DYNAMIC_tint_20us_master_0.json"; - const char* default_data = - "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Xray_Tp15C_tint_500_master_0.json"; - // const char* default_data = + const char *default_data = + "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/" + "2025Sept_m694/Sn25300eV/500_us_voltage_40kV/" + "250922_CZTonly_Xray_Tp15C_tint_500_master_0.json"; + // const char* default_data = // "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_data_200keV_DYNAMIC_tint_20us_master_0.json"; std::filesystem::path pedestal_path(argc > 1 ? argv[1] : default_pedestal); @@ -247,18 +284,21 @@ int main(int argc, char* argv[]) { // Defaults: Adjust depending on the test dataset used size_t n_pedestal_frames = 6000; - size_t n_data_frames = 10000; - double nSigma = 5.0; + size_t n_data_frames = 10000; + double nSigma = 5.0; - if (argc > 3) n_pedestal_frames = std::atol(argv[3]); - if (argc > 4) n_data_frames = std::atol(argv[4]); - if (argc > 5) nSigma = std::atof(argv[5]); + if (argc > 3) + n_pedestal_frames = std::atol(argv[3]); + if (argc > 4) + n_data_frames = std::atol(argv[4]); + if (argc > 5) + nSigma = std::atof(argv[5]); // Detector geometry from master file constexpr uint8_t cs_x = 3; constexpr uint8_t cs_y = 3; using ClusterType = aare::Cluster; - using FRAME_TYPE = uint16_t; + using FRAME_TYPE = uint16_t; using PEDESTAL_TYPE = double; // Read actual frame dimensions from the pedestal file @@ -272,40 +312,44 @@ int main(int argc, char* argv[]) { printf("=== Cluster Finder: CPU vs CUDA ===\n"); printf("Detector: %zu x %zu\n", ROWS, COLS); - printf("Cluster: %d x %d\n", ClusterType::cluster_size_x, ClusterType::cluster_size_y); + printf("Cluster: %d x %d\n", ClusterType::cluster_size_x, + ClusterType::cluster_size_y); printf("nSigma: %.1f\n", nSigma); // ========================================================================= - // Phase 1: Build pedestal from dark frames (sanity check only + frame cache) + // Phase 1: Build pedestal from dark frames (sanity check only + frame + // cache) // ========================================================================= // - // Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal object; - // both build their own via push_pedestal_frame. - // We still read the pedestal file once, but cache the frames in memory so - // every subsequent finder can be fed without re-hitting disk. We also - // build a standalone Pedestal purely to print a sanity check. + // Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal + // object; both build their own via push_pedestal_frame. We still read the + // pedestal file once, but cache the frames in memory so every subsequent + // finder can be fed without re-hitting disk. We also build a standalone + // Pedestal purely to print a sanity check. // ========================================================================= printf("\n--- Phase 1: Pedestal accumulation (sanity check + cache) ---\n"); - + std::vector> pedestal_frames; aare::Pedestal pedestal(ROWS, COLS, 1000); - + { aare::File ped_file(pedestal_path, "r"); size_t total_ped = ped_file.total_frames(); - size_t use_ped = (n_pedestal_frames == 0 || n_pedestal_frames > total_ped) - ? total_ped : n_pedestal_frames; + size_t use_ped = + (n_pedestal_frames == 0 || n_pedestal_frames > total_ped) + ? total_ped + : n_pedestal_frames; printf("Pedestal frames: %zu / %zu\n", use_ped, total_ped); - + pedestal_frames.reserve(use_ped); - + Timer t; t.start(); - + for (size_t i = 0; i < use_ped; ++i) { auto frame = ped_file.read_frame(); - auto view = frame.view(); - + auto view = frame.view(); + // Copy into a standalone NDArray that we can reuse as many times // as we have finders to feed. aare::NDArray arr({ROWS, COLS}); @@ -313,16 +357,16 @@ int main(int argc, char* argv[]) { for (ssize_t c = 0; c < COLS; ++c) arr(r, c) = view(r, c); pedestal_frames.push_back(std::move(arr)); - + pedestal.push_no_update(view); } pedestal.update_mean(); - + printf("Pedestal read+cached+built in %.1f ms\n", t.elapsed_ms()); } - - printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n", - pedestal.mean(0, 0), pedestal.std(0, 0)); + + printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n", pedestal.mean(0, 0), + pedestal.std(0, 0)); // ========================================================================= // Phase 2: Read data frames @@ -331,7 +375,7 @@ int main(int argc, char* argv[]) { aare::File data_file(data_path, "r"); size_t total_data = data_file.total_frames(); - size_t use_data = std::min(n_data_frames, total_data); + size_t use_data = std::min(n_data_frames, total_data); printf("Data frames: %zu / %zu\n", use_data, total_data); // Pre-read all frames into memory to remove I/O from timing @@ -391,32 +435,32 @@ int main(int argc, char* argv[]) { // time makes sense. // ========================================================================= printf("\n--- Phase 4: CUDA ClusterFinder ---\n"); - + std::vector> gpu_results(use_data); size_t gpu_total_clusters = 0; - + // --- Single frame on a single CUDA stream --- { aare::ClusterFinderCUDA cuda_cf( {ROWS, COLS}, nSigma); - + feed_pedestal(cuda_cf, pedestal_frames); - + // Warmup: first CUDA call pays driver/context init overhead. The // pedestal drifts slightly during this single frame, which is // acceptable for timing purposes. cuda_cf.find_clusters(frames[0].view(), 0); cuda_cf.steal_clusters(true); - + Timer t; t.start(); - + for (size_t i = 0; i < use_data; ++i) { cuda_cf.find_clusters(frames[i].view(), static_cast(i)); drain_into(cuda_cf, gpu_results[i]); gpu_total_clusters += gpu_results[i].size(); } - + double gpu_time = t.elapsed_ms(); printf("GPU: %zu clusters in %.1f ms (%.2f ms/frame)\n", gpu_total_clusters, gpu_time, gpu_time / use_data); @@ -428,32 +472,33 @@ int main(int argc, char* argv[]) { { constexpr size_t BATCH_SIZE = 100; constexpr int N_STREAMS = 2; - + aare::ClusterFinderCUDA cuda_cf( {ROWS, COLS}, nSigma, 1'000'000, N_STREAMS); - + feed_pedestal(cuda_cf, pedestal_frames); - + // Contiguous staging buffer reused across batches std::vector batch_buffer(BATCH_SIZE * ROWS * COLS); - + const size_t n_batches = (use_data + BATCH_SIZE - 1) / BATCH_SIZE; - + Timer t; t.start(); - + for (size_t bi = 0; bi < n_batches; ++bi) { const size_t offset = bi * BATCH_SIZE; const size_t actual_batch = std::min(BATCH_SIZE, use_data - offset); pack_frame_batch(frames, offset, actual_batch, batch_buffer); - + aare::NDView batch_view( batch_buffer.data(), {static_cast(actual_batch), ROWS, COLS}); - - auto batch_results = cuda_cf.find_clusters_batched(batch_view, offset); - + + auto batch_results = cuda_cf.find_clusters_batched(batch_view, + offset); + for (size_t f = 0; f < actual_batch; ++f) { auto& cv = batch_results[f]; auto& out = gpu_results[offset + f]; @@ -463,10 +508,11 @@ int main(int argc, char* argv[]) { gpu_total_clusters += out.size(); } } - + double gpu_time = t.elapsed_ms(); - printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu, streams=%d)\n", - gpu_total_clusters, gpu_time, gpu_time / use_data, BATCH_SIZE, N_STREAMS); + printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu, + streams=%d)\n", gpu_total_clusters, gpu_time, gpu_time / use_data, + BATCH_SIZE, N_STREAMS); } */ @@ -484,21 +530,21 @@ int main(int argc, char* argv[]) { for (size_t i = 0; i < use_data; ++i) { auto result = compare_clusters(cpu_results[i], gpu_results[i]); - total_matched += result.matched; - total_data_mismatch += result.data_mismatch; - total_cpu_only += result.cpu_only; - total_gpu_only += result.gpu_only; + total_matched += result.matched; + total_data_mismatch += result.data_mismatch; + total_cpu_only += result.cpu_only; + total_gpu_only += result.gpu_only; - bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 || result.data_mismatch > 0); + bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 || + result.data_mismatch > 0); if (has_diff) { frames_with_differences++; // Print details for first few mismatching frames if (frames_with_differences <= 5) { printf(" Frame %zu: CPU=%zu GPU=%zu matched=%zu " "data_mismatch=%zu cpu_only=%zu gpu_only=%zu\n", - i, result.cpu_count, result.gpu_count, - result.matched, result.data_mismatch, - result.cpu_only, result.gpu_only); + i, result.cpu_count, result.gpu_count, result.matched, + result.data_mismatch, result.cpu_only, result.gpu_only); } } } @@ -510,9 +556,11 @@ int main(int argc, char* argv[]) { printf(" Data mismatch: %zu\n", total_data_mismatch); printf(" CPU only: %zu\n", total_cpu_only); printf(" GPU only: %zu\n", total_gpu_only); - printf(" Frames with diffs: %zu / %zu\n", frames_with_differences, use_data); + printf(" Frames with diffs: %zu / %zu\n", frames_with_differences, + use_data); - if (total_cpu_only == 0 && total_gpu_only == 0 && total_data_mismatch == 0) { + if (total_cpu_only == 0 && total_gpu_only == 0 && + total_data_mismatch == 0) { printf("\n*** PASS: CPU and GPU results match exactly ***\n"); } else { printf("\n*** DIFFERENCES DETECTED ***\n"); @@ -522,8 +570,10 @@ int main(int argc, char* argv[]) { // if (cpu_total_clusters > 0 || gpu_total_clusters > 0) { // size_t max_clusters_per_frame = 10; // size_t max_frames = 100; - // printf("\n--- Cluster details (up to %zu frames, %zu clusters each) ---\n", max_frames, max_clusters_per_frame); - // print_cluster_comparison(cpu_results, gpu_results, max_clusters_per_frame, max_frames); + // printf("\n--- Cluster details (up to %zu frames, %zu clusters each) + // ---\n", max_frames, max_clusters_per_frame); + // print_cluster_comparison(cpu_results, gpu_results, + // max_clusters_per_frame, max_frames); // } // ========================================================================= @@ -533,7 +583,7 @@ int main(int argc, char* argv[]) { if (use_data > 0) { const int N_ITER = 1000; - const auto& bench_frame = frames[0]; + const auto &bench_frame = frames[0]; // CPU benchmark { @@ -558,44 +608,43 @@ int main(int argc, char* argv[]) { } // --- GPU benchmark (single frame, single stream) --- { - aare::ClusterFinderCUDA cuda_cf( - {ROWS, COLS}, nSigma); + aare::ClusterFinderCUDA + cuda_cf({ROWS, COLS}, nSigma); feed_pedestal(cuda_cf, pedestal_frames); - + // Warmup cuda_cf.find_clusters(bench_frame.view(), 0); cuda_cf.steal_clusters(true); - + Timer t; t.start(); for (int iter = 0; iter < N_ITER; ++iter) { cuda_cf.find_clusters(bench_frame.view(), 0); cuda_cf.steal_clusters(true); } - printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single frame, single stream)\n", + printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single " + "frame, single stream)\n", t.elapsed_ms() / N_ITER); } - + // --- GPU benchmark (batched + multi-streamed) --- { constexpr size_t BATCH_SIZE = 500; - constexpr int N_STREAMS = 10; - - aare::ClusterFinderCUDA cuda_cf( - {ROWS, COLS}, nSigma, 1'000'000, N_STREAMS); + constexpr int N_STREAMS = 10; + + aare::ClusterFinderCUDA + cuda_cf({ROWS, COLS}, nSigma, 1'000'000, N_STREAMS); feed_pedestal(cuda_cf, pedestal_frames); - + // Build one contiguous batch of BATCH_SIZE copies of bench_frame std::vector batch(BATCH_SIZE * ROWS * COLS); for (size_t k = 0; k < BATCH_SIZE; ++k) - std::memcpy(batch.data() + k * ROWS * COLS, - bench_frame.data(), + std::memcpy(batch.data() + k * ROWS * COLS, bench_frame.data(), ROWS * COLS * sizeof(FRAME_TYPE)); - + aare::NDView batch_view( - batch.data(), - {static_cast(BATCH_SIZE), ROWS, COLS}); - + batch.data(), {static_cast(BATCH_SIZE), ROWS, COLS}); + // Warmup. The kernel mutates the device-side pedestal for every // non-photon pixel, so after a 500-frame warmup the pedestal // state has drifted. Reset to a clean baseline by clearing the @@ -604,17 +653,19 @@ int main(int argc, char* argv[]) { (void)cuda_cf.find_clusters_batched(batch_view, 0); cuda_cf.clear_pedestal(); feed_pedestal(cuda_cf, pedestal_frames); - - const size_t n_iter_batches = (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE; - + + const size_t n_iter_batches = + (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE; + Timer t; t.start(); for (size_t b = 0; b < n_iter_batches; ++b) { (void)cuda_cf.find_clusters_batched(batch_view, b * BATCH_SIZE); } - printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, batch=%zu, streams=%d)\n", - t.elapsed_ms() / (n_iter_batches * BATCH_SIZE), - BATCH_SIZE, N_STREAMS); + printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, " + "batch=%zu, streams=%d)\n", + t.elapsed_ms() / (n_iter_batches * BATCH_SIZE), BATCH_SIZE, + N_STREAMS); } }