Format CUDA cluster finder files
Build on RHEL8 / build (push) Successful in 3m13s
Build on RHEL9 / build (push) Successful in 3m34s
Run tests using data on local RHEL8 / build (push) Successful in 3m48s

This commit is contained in:
kferjaoui
2026-04-27 11:27:47 +02:00
parent 133cedf755
commit a086cbb897
8 changed files with 508 additions and 440 deletions
+165 -158
View File
@@ -1,31 +1,29 @@
// SPDX-License-Identifier: MPL-2.0
#pragma once
#include <cstdint>
#include <cmath>
#include <stdexcept>
#include <cstdio>
#include "aare/ClusterFinder.hpp"
#include "aare/clusterfinder_kernel.cuh"
#include "aare/utils/cuda_check.cuh"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
namespace aare {
// Per-stream device resources
template <typename ClusterType, typename FRAME_TYPE, typename PEDESTAL_TYPE>
struct StreamContext{
cudaStream_t stream = nullptr; // handle to the stream
FRAME_TYPE* d_frame = nullptr;
PEDESTAL_TYPE* d_pd_mean = nullptr;
PEDESTAL_TYPE* d_pd_sum = nullptr;
PEDESTAL_TYPE* d_pd_sum2 = nullptr;
ClusterType* d_clusters = nullptr;
uint32_t* d_cluster_count = nullptr;
struct StreamContext {
cudaStream_t stream = nullptr; // handle to the stream
FRAME_TYPE *d_frame = nullptr;
PEDESTAL_TYPE *d_pd_mean = nullptr;
PEDESTAL_TYPE *d_pd_sum = nullptr;
PEDESTAL_TYPE *d_pd_sum2 = nullptr;
ClusterType *d_clusters = nullptr;
uint32_t *d_cluster_count = nullptr;
};
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double,
typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
class ClusterFinderCUDA {
@@ -33,25 +31,25 @@ class ClusterFinderCUDA {
static constexpr int BLOCK_Y = 16;
static constexpr int col_radius = ClusterType::cluster_size_x / 2;
static constexpr int row_radius = ClusterType::cluster_size_y / 2;
Shape<2> m_image_size;
size_t nrows;
size_t ncols;
size_t image_size; // nrows * ncols
int n_streams;
size_t m_capacity;
PEDESTAL_TYPE m_nSigma;
Pedestal<PEDESTAL_TYPE> m_pedestal;
ClusterVector<ClusterType> m_clusters;
bool m_pedestal_dirty = true;
Shape<2> m_image_size;
size_t nrows;
size_t ncols;
size_t image_size; // nrows * ncols
int n_streams;
size_t m_capacity;
PEDESTAL_TYPE m_nSigma;
Pedestal<PEDESTAL_TYPE> m_pedestal;
ClusterVector<ClusterType> m_clusters;
bool m_pedestal_dirty = true;
using SC = StreamContext<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>;
std::vector<SC> v_sc;
// Kernel parameters
dim3 grid;
dim3 block;
dim3 grid;
dim3 block;
size_t shmem_bytes;
public:
@@ -59,79 +57,87 @@ class ClusterFinderCUDA {
* @brief Construct a ClusterFinderCUDA
*
* @param image_size shape of the detector frame (rows, cols)
* @param nSigma threshold in units of per-pixel pedestal std
* @param nSigma threshold in units of per-pixel pedestal
* std
* @param capacity device-side cluster buffer size per stream
* @param n_streams number of CUDA streams for multi-frame overlap
* @param n_streams number of CUDA streams for multi-frame
* overlap
*/
ClusterFinderCUDA(Shape<2> image_size_,
PEDESTAL_TYPE nSigma = 5.0,
size_t capacity = 1000000,
int n_streams_ = 1)
: m_image_size(image_size_),
nrows(image_size_[0]),
ncols(image_size_[1]),
image_size(nrows * ncols),
n_streams(n_streams_),
m_capacity(capacity),
m_nSigma(nSigma),
m_pedestal(image_size_[0], image_size_[1]),
m_clusters(capacity)
{
ClusterFinderCUDA(Shape<2> image_size_, PEDESTAL_TYPE nSigma = 5.0,
size_t capacity = 1000000, int n_streams_ = 1)
: m_image_size(image_size_), nrows(image_size_[0]),
ncols(image_size_[1]), image_size(nrows * ncols),
n_streams(n_streams_), m_capacity(capacity), m_nSigma(nSigma),
m_pedestal(image_size_[0], image_size_[1]), m_clusters(capacity) {
// Grid/Block dimensions
block = dim3(BLOCK_X, BLOCK_Y);
grid = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
grid = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y +
// 2*row_radius) doubles
shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) *
sizeof(PEDESTAL_TYPE);
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles
shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE);
v_sc.resize(n_streams);
for(int k=0; k < n_streams; ++k) {
auto& sc = v_sc[k];
CUDA_CHECK(cudaStreamCreate(&sc.stream));
CUDA_CHECK(cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE)));
CUDA_CHECK(cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType)));
for (int k = 0; k < n_streams; ++k) {
auto &sc = v_sc[k];
CUDA_CHECK(cudaStreamCreate(&sc.stream));
CUDA_CHECK(
cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE)));
CUDA_CHECK(
cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(
cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(
cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
CUDA_CHECK(
cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType)));
CUDA_CHECK(cudaMalloc(&sc.d_cluster_count, sizeof(uint32_t)));
}
}
}
~ClusterFinderCUDA() {
for(auto& sc : v_sc) {
if (sc.d_frame) cudaFree(sc.d_frame);
if (sc.d_pd_mean) cudaFree(sc.d_pd_mean);
if (sc.d_pd_sum) cudaFree(sc.d_pd_sum);
if (sc.d_pd_sum2) cudaFree(sc.d_pd_sum2);
if (sc.d_clusters) cudaFree(sc.d_clusters);
if (sc.d_cluster_count) cudaFree(sc.d_cluster_count);
if (sc.stream) cudaStreamDestroy(sc.stream);
for (auto &sc : v_sc) {
if (sc.d_frame)
cudaFree(sc.d_frame);
if (sc.d_pd_mean)
cudaFree(sc.d_pd_mean);
if (sc.d_pd_sum)
cudaFree(sc.d_pd_sum);
if (sc.d_pd_sum2)
cudaFree(sc.d_pd_sum2);
if (sc.d_clusters)
cudaFree(sc.d_clusters);
if (sc.d_cluster_count)
cudaFree(sc.d_cluster_count);
if (sc.stream)
cudaStreamDestroy(sc.stream);
}
}
// Non-copyable, non-movable
ClusterFinderCUDA(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete;
ClusterFinderCUDA(ClusterFinderCUDA&&) = delete;
ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete;
ClusterFinderCUDA(const ClusterFinderCUDA &) = delete;
ClusterFinderCUDA &operator=(const ClusterFinderCUDA &) = delete;
ClusterFinderCUDA(ClusterFinderCUDA &&) = delete;
ClusterFinderCUDA &operator=(ClusterFinderCUDA &&) = delete;
void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
PEDESTAL_TYPE get_nSigma() const { return m_nSigma; }
void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
PEDESTAL_TYPE get_nSigma() const { return m_nSigma; }
void push_pedestal_frame(NDView<FRAME_TYPE, 2> frame) {
m_pedestal.push(frame);
m_pedestal_dirty = true;
}
void clear_pedestal() {
m_pedestal.clear();
m_pedestal_dirty = true;
}
NDArray<PEDESTAL_TYPE, 2> pedestal() { return m_pedestal.mean(); }
NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
/**
* @brief Move clusters out of the internal ClusterVector, optionally
* reallocating the internal one with the same capacity.
@@ -150,52 +156,47 @@ class ClusterFinderCUDA {
* @brief Find clusters in a single frame, appending them to the internal
* ClusterVector.
*/
void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0)
{
void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) {
if (m_pedestal_dirty) { // need to update the pedestal on the gpu
sync_pedestal_to_device();
m_pedestal_dirty = false;
}
auto& sc = v_sc[0];
auto &sc = v_sc[0];
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
const uint32_t n_pd_samples =
static_cast<uint32_t>(m_pedestal.n_samples());
// Reset cluster counter
CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t), sc.stream));
// Upload frame
CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes, cudaMemcpyHostToDevice, sc.stream));
// Kernel launch
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
<<<grid, block, shmem_bytes, sc.stream>>>(sc.d_frame,
sc.d_pd_mean,
sc.d_pd_sum,
sc.d_pd_sum2,
n_pd_samples,
m_nSigma,
nrows,
ncols,
sc.d_clusters,
sc.d_cluster_count,
static_cast<uint32_t>(m_capacity)
);
CUDA_CHECK(cudaGetLastError());
// Read back cluster count
uint32_t n_found = 0;
CUDA_CHECK(cudaMemcpyAsync(&n_found,
sc.d_cluster_count,
sizeof(uint32_t),
cudaMemcpyDeviceToHost,
CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t),
sc.stream));
// Synchronize to ensure count is available before the CPU reads clusters
// Upload frame
CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes,
cudaMemcpyHostToDevice, sc.stream));
// Kernel launch
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
PEDESTAL_TYPE>
<<<grid, block, shmem_bytes, sc.stream>>>(
sc.d_frame, sc.d_pd_mean, sc.d_pd_sum, sc.d_pd_sum2,
n_pd_samples, m_nSigma, nrows, ncols, sc.d_clusters,
sc.d_cluster_count, static_cast<uint32_t>(m_capacity));
CUDA_CHECK(cudaGetLastError());
// Read back cluster count
uint32_t n_found = 0;
CUDA_CHECK(cudaMemcpyAsync(&n_found, sc.d_cluster_count,
sizeof(uint32_t), cudaMemcpyDeviceToHost,
sc.stream));
// Synchronize to ensure count is available before the CPU reads
// clusters
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
// Clamp to max in case of overflow
if (n_found > m_capacity) n_found = m_capacity;
if (n_found > m_capacity)
n_found = m_capacity;
// Read back clusters
m_clusters.set_frame_number(frame_number);
@@ -213,74 +214,79 @@ class ClusterFinderCUDA {
*/
std::vector<ClusterVector<ClusterType>>
find_clusters_batched(NDView<FRAME_TYPE, 3> frames,
uint64_t first_frame = 0)
{
uint64_t first_frame = 0) {
if (m_pedestal_dirty) {
sync_pedestal_to_device();
m_pedestal_dirty = false;
}
const size_t n_frames = frames.shape(0);
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
const size_t n_frames = frames.shape(0);
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
const uint32_t n_pd_samples =
static_cast<uint32_t>(m_pedestal.n_samples());
std::vector<ClusterVector<ClusterType>> results;
results.reserve(n_frames);
for (size_t i = 0; i < n_frames; ++i) {
results.emplace_back(m_capacity);
results.back().set_frame_number(first_frame + i);
}
std::vector<uint32_t> host_counts(n_streams, 0);
const size_t n_rounds = (n_frames + n_streams - 1) / n_streams;
for (size_t round = 0; round < n_rounds; ++round) {
// Launch phase: fan out kernels on all streams for this round
for (int k = 0; k < n_streams; ++k) {
// OOB guard
const size_t frame_idx = round * n_streams + k;
if (frame_idx >= n_frames) continue;
auto& sc_k = v_sc[k];
const FRAME_TYPE* h_src = frames.data() + frame_idx * image_size;
if (frame_idx >= n_frames)
continue;
auto &sc_k = v_sc[k];
const FRAME_TYPE *h_src =
frames.data() + frame_idx * image_size;
CUDA_CHECK(cudaMemsetAsync(sc_k.d_cluster_count, 0,
sizeof(uint32_t), sc_k.stream));
CUDA_CHECK(cudaMemcpyAsync(sc_k.d_frame, h_src, image_bytes,
cudaMemcpyHostToDevice, sc_k.stream));
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
cudaMemcpyHostToDevice,
sc_k.stream));
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
PEDESTAL_TYPE>
<<<grid, block, shmem_bytes, sc_k.stream>>>(
sc_k.d_frame,
sc_k.d_pd_mean, sc_k.d_pd_sum, sc_k.d_pd_sum2,
n_pd_samples, m_nSigma,
nrows, ncols,
sc_k.d_frame, sc_k.d_pd_mean, sc_k.d_pd_sum,
sc_k.d_pd_sum2, n_pd_samples, m_nSigma, nrows, ncols,
sc_k.d_clusters, sc_k.d_cluster_count, m_capacity);
CUDA_CHECK(cudaGetLastError());
}
// Drain phase: fan in results from all streams
for (int k = 0; k < n_streams; ++k) {
const size_t frame_idx = round * n_streams + k;
if (frame_idx >= n_frames) continue;
auto& sc_k = v_sc[k];
CUDA_CHECK(cudaMemcpyAsync(&host_counts[k], sc_k.d_cluster_count,
sizeof(uint32_t),
cudaMemcpyDeviceToHost, sc_k.stream));
if (frame_idx >= n_frames)
continue;
auto &sc_k = v_sc[k];
CUDA_CHECK(cudaMemcpyAsync(
&host_counts[k], sc_k.d_cluster_count, sizeof(uint32_t),
cudaMemcpyDeviceToHost, sc_k.stream));
CUDA_CHECK(cudaStreamSynchronize(sc_k.stream));
uint32_t n_found = host_counts[k];
if (n_found > m_capacity) n_found = m_capacity;
if (n_found > m_capacity)
n_found = m_capacity;
if (n_found > 0) {
append_device_clusters_to(results[frame_idx], sc_k, n_found);
append_device_clusters_to(results[frame_idx], sc_k,
n_found);
}
}
}
return results;
}
@@ -295,34 +301,35 @@ class ClusterFinderCUDA {
// copies complete, so we synchronise at the end before they go out
// of scope.
NDArray<PEDESTAL_TYPE, 2> h_mean = m_pedestal.mean();
NDArray<PEDESTAL_TYPE, 2> h_sum = m_pedestal.get_sum();
NDArray<PEDESTAL_TYPE, 2> h_sum = m_pedestal.get_sum();
NDArray<PEDESTAL_TYPE, 2> h_sum2 = m_pedestal.get_sum2();
const size_t bytes = image_size * sizeof(PEDESTAL_TYPE);
for (auto& sc : v_sc) {
for (auto &sc : v_sc) {
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_mean, h_mean.data(), bytes,
cudaMemcpyHostToDevice, sc.stream));
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes,
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes,
cudaMemcpyHostToDevice, sc.stream));
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum2, h_sum2.data(), bytes,
cudaMemcpyHostToDevice, sc.stream));
}
for (auto& sc : v_sc) CUDA_CHECK(cudaStreamSynchronize(sc.stream));
for (auto &sc : v_sc)
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
}
/**
* Copy n_found clusters from sc.d_clusters into the given ClusterVector
* and block on the transfer.
* and block on the transfer.
*/
void append_device_clusters_to(ClusterVector<ClusterType>& cv,
SC& sc, uint32_t n_found)
{
void append_device_clusters_to(ClusterVector<ClusterType> &cv, SC &sc,
uint32_t n_found) {
std::vector<ClusterType> staging(n_found);
CUDA_CHECK(cudaMemcpyAsync(staging.data(), sc.d_clusters,
n_found * sizeof(ClusterType),
cudaMemcpyDeviceToHost, sc.stream));
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
for (const auto& c : staging) cv.push_back(c);
for (const auto &c : staging)
cv.push_back(c);
}
};
+88 -70
View File
@@ -1,28 +1,22 @@
#pragma once
#include <cuda_runtime.h>
#include <type_traits>
#include "aare/Cluster.hpp"
#include "aare/ClusterFinder.hpp"
#include <cuda_runtime.h>
#include <type_traits>
namespace aare::device {
template <typename ClusterType = Cluster<int32_t, 3, 3>,
typename FRAME_TYPE = uint16_t,
typename PEDESTAL_TYPE = double,
typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
__global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d_frame,
PEDESTAL_TYPE* __restrict__ d_pd_mean,
PEDESTAL_TYPE* __restrict__ d_pd_sum,
PEDESTAL_TYPE* __restrict__ d_pd_sum2,
const uint32_t n_pd_samples,
const PEDESTAL_TYPE m_nSigma,
const size_t nrows,
const size_t ncols,
// const uint64_t frame_number,
ClusterType* d_clusters,
uint32_t* d_cluster_count,
const uint32_t max_clusters)
{
__global__ void find_clusters_in_single_frame(
const FRAME_TYPE *__restrict__ d_frame,
PEDESTAL_TYPE *__restrict__ d_pd_mean, PEDESTAL_TYPE *__restrict__ d_pd_sum,
PEDESTAL_TYPE *__restrict__ d_pd_sum2, const uint32_t n_pd_samples,
const PEDESTAL_TYPE m_nSigma, const size_t nrows, const size_t ncols,
// const uint64_t frame_number,
ClusterType *d_clusters, uint32_t *d_cluster_count,
const uint32_t max_clusters) {
using CT = typename ClusterType::value_type;
// Compile-time cluster geometry useful for unrolling loops
@@ -38,10 +32,12 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
constexpr int pow2_c3 = CSX * CSY;
// Thread/pixel mapping
auto col_global = static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
auto row_global = static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
auto col_global =
static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
auto row_global =
static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
auto global_tid = static_cast<ssize_t>(col_global + ncols * row_global);
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
// ====================
// Shared memory layout
@@ -55,19 +51,22 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
// CUDA prefers raw bytes + aligned cast
// Compile error happens when using: `extern __shared__ T sh[];`
extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[];
PEDESTAL_TYPE* shmem = reinterpret_cast<PEDESTAL_TYPE*>(smem);
PEDESTAL_TYPE *shmem = reinterpret_cast<PEDESTAL_TYPE *>(smem);
// Stride includes halo on both sides
auto shmem_stride = static_cast<int>(blockDim.x) + 2 * col_radius;
auto tile_size = shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
auto tile_size =
shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
// Offset so that thread (0,0) maps to shared-memory position
// (row_radius, col_radius) i.e. past the top-left halo.
auto shmem_tid = (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride
+ (static_cast<int>(threadIdx.x) + col_radius);
auto shmem_tid =
(static_cast<int>(threadIdx.y) + row_radius) * shmem_stride +
(static_cast<int>(threadIdx.x) + col_radius);
// Cooperative zero-fill
for (int idx = static_cast<int>(local_tid); idx < tile_size; idx += static_cast<int>(blockDim.x * blockDim.y)) {
for (int idx = static_cast<int>(local_tid); idx < tile_size;
idx += static_cast<int>(blockDim.x * blockDim.y)) {
shmem[idx] = PEDESTAL_TYPE{0};
}
__syncthreads();
@@ -83,7 +82,7 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
// Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB.
// gr, gc are the global row/col of the pixel to load.
// Returns the pedestal-subtracted value.
auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
auto load_pixel = [&] __device__(ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
auto gid = gc + static_cast<ssize_t>(ncols) * gr;
return static_cast<PEDESTAL_TYPE>(d_frame[gid]) - d_pd_mean[gid];
};
@@ -98,21 +97,25 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
if (threadIdx.y == 0 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global - i >= 0)
shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global);
shmem[shmem_tid - i * shmem_stride] =
load_pixel(row_global - i, col_global);
}
// Top-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global - j >= 0)
shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j);
shmem[shmem_tid - i * shmem_stride - j] =
load_pixel(row_global - i, col_global - j);
}
// Top-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global - i >= 0 && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j);
if (row_global - i >= 0 &&
col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid - i * shmem_stride + j] =
load_pixel(row_global - i, col_global + j);
}
}
@@ -134,21 +137,26 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
if (threadIdx.y == blockDim.y - 1 && valid_pixel) {
for (int i = 1; i <= row_radius; ++i) {
if (row_global + i < static_cast<ssize_t>(nrows))
shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global);
shmem[shmem_tid + i * shmem_stride] =
load_pixel(row_global + i, col_global);
}
// Bottom-left corner rectangle
if (threadIdx.x == 0) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global - j >= 0)
shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j);
if (row_global + i < static_cast<ssize_t>(nrows) &&
col_global - j >= 0)
shmem[shmem_tid + i * shmem_stride - j] =
load_pixel(row_global + i, col_global - j);
}
// Bottom-right corner rectangle
if (threadIdx.x == blockDim.x - 1) {
for (int i = 1; i <= row_radius; ++i)
for (int j = 1; j <= col_radius; ++j)
if (row_global + i < static_cast<ssize_t>(nrows) && col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j);
if (row_global + i < static_cast<ssize_t>(nrows) &&
col_global + j < static_cast<ssize_t>(ncols))
shmem[shmem_tid + i * shmem_stride + j] =
load_pixel(row_global + i, col_global + j);
}
}
@@ -157,51 +165,57 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
// =====================
// Cluster-finding logic
// =====================
if (!valid_pixel) return;
if (!valid_pixel)
return;
// Per-pixel RMS from global pedestal arrays
// rms = sqrt( E[X^2] - E[X]^2 )
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
PEDESTAL_TYPE var_px = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
PEDESTAL_TYPE var_px =
d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
// Pedestal-subtracted value of the center pixel (already in shmem)
PEDESTAL_TYPE val_pixel = shmem[shmem_tid];
// Negative pedestal early exit
// Negative pedestal early exit
if (val_pixel < -m_nSigma * rms_px)
return; // NOTE: pedestal update for this pixel is skipped (same as sequential)
return; // NOTE: pedestal update for this pixel is skipped (same as
// sequential)
// Stencil reduction: total, max, quadrant sums
PEDESTAL_TYPE total = PEDESTAL_TYPE{0};
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits<double>::lowest()`
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of
// `numeric_limits<double>::lowest()`
/* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, ic<=0)
PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, ic>=0)
PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, ic<=0)
PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, ic>=0) */
/* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0,
ic<=0) PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0,
ic>=0) PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0,
ic<=0) PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0,
ic>=0) */
CT clusterData[CSX * CSY];
int idx = 0; // tracks the pixels in the cluster
#pragma unroll
#pragma unroll
for (int ir = -row_radius; ir <= row_radius; ++ir) {
#pragma unroll
#pragma unroll
for (int ic = -col_radius; ic <= col_radius; ++ic) {
PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic];
total += val;
max_val = max(max_val, val);
/* // Quadrant accumulation (pixels on the axes contribute to two quadrants)
if (ir <= 0 && ic <= 0) tl += val;
if (ir <= 0 && ic >= 0) tr += val;
if (ir >= 0 && ic <= 0) bl += val;
if (ir >= 0 && ic >= 0) br += val; */
/* // Quadrant accumulation (pixels on the axes contribute to two
quadrants) if (ir <= 0 && ic <= 0) tl += val; if (ir <= 0 && ic >=
0) tr += val; if (ir >= 0 && ic <= 0) bl += val; if (ir >= 0 && ic
>= 0) br += val; */
// Store pedestal-subtracted value in register array for later cluster output
if constexpr (std::is_integral_v<CT> && std::is_floating_point_v<PEDESTAL_TYPE>)
// Store pedestal-subtracted value in register array for later
// cluster output
if constexpr (std::is_integral_v<CT> &&
std::is_floating_point_v<PEDESTAL_TYPE>)
clusterData[idx] = static_cast<CT>(lround(val));
else
clusterData[idx] = static_cast<CT>(val);
@@ -213,10 +227,11 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
// Three-way classification (mirrors ClusterFinder's logic)
//
// 1. Single-pixel significance: max_val > nSigma * rms
// -> only the pixel that IS the max gets recorded (local-max suppression)
// -> only the pixel that IS the max gets recorded (local-max
// suppression)
//
// 2. Quadrant significance: max(tl,tr,bl,br) > c2 * nSigma * rms
// -> charge-sharing events where a 2x2 sub-region is significant
// -> charge-sharing events where a 2x2 sub-region is significant
// NOTE: This test is absent in the serial ClusterFinder!
//
// 3. Total significance: total > c3 * nSigma * rms
@@ -228,9 +243,10 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
// Test 1: single-pixel significance
if (max_val > m_nSigma * rms_px) {
// Local-max suppression: only the center-pixel thread records the cluster
// Local-max suppression: only the center-pixel thread records the
// cluster
if (val_pixel < max_val)
return; // some other pixel in the neighborhood is brighter
return; // some other pixel in the neighborhood is brighter
is_photon = true;
}
@@ -249,29 +265,30 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
}
}
// Pedestal update (if not a photon)
// In the sequential code, non-photon pixels feed back into the running pedestal via push_fast().
// In this kernel, the GPU updates all pixels in a frame simultaneously. So the updated pedestal
// will only be used starting from the next frame. -> This avoids a/serialization and b/global mem I/O.
// Pedestal update (if not a photon)
// In the sequential code, non-photon pixels feed back into the running
// pedestal via push_fast(). In this kernel, the GPU updates all pixels in a
// frame simultaneously. So the updated pedestal will only be used starting
// from the next frame. -> This avoids a/serialization and b/global mem I/O.
if (!is_photon && valid_pixel) {
PEDESTAL_TYPE raw_val = static_cast<PEDESTAL_TYPE>(d_frame[global_tid]);
PEDESTAL_TYPE sum = d_pd_sum[global_tid];
PEDESTAL_TYPE sum = d_pd_sum[global_tid];
PEDESTAL_TYPE sum2 = d_pd_sum2[global_tid];
sum += raw_val - sum / n_pd_samples;
sum += raw_val - sum / n_pd_samples;
sum2 += raw_val * raw_val - sum2 / n_pd_samples;
d_pd_sum[global_tid] = sum;
d_pd_sum[global_tid] = sum;
d_pd_sum2[global_tid] = sum2;
d_pd_mean[global_tid] = sum / n_pd_samples;
return;
}
/*
/*
if (!is_photon) return; // Debugging
*/
// Write cluster to global output buffer using atomic index
// Write cluster to global output buffer using atomic index
// for coordination across all blocks
uint32_t write_idx = atomicAdd(d_cluster_count, 1u);
@@ -283,7 +300,8 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
cluster.x = static_cast<decltype(cluster.x)>(col_global);
cluster.y = static_cast<decltype(cluster.y)>(row_global);
memcpy(reinterpret_cast<CT*>(&cluster.data), clusterData, sizeof(CT) * CSX * CSY);
memcpy(reinterpret_cast<CT *>(&cluster.data), clusterData,
sizeof(CT) * CSX * CSY);
d_clusters[write_idx] = cluster;
}
+9 -11
View File
@@ -1,17 +1,15 @@
// SPDX-License-Identifier: MPL-2.0
#pragma once
#include <vector>
#include <cstring>
#include "aare/NDArray.hpp"
#include <cstring>
#include <vector>
template <typename FRAME_TYPE>
void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
size_t first_frame,
size_t n_frames,
std::vector<FRAME_TYPE>& batch)
{
if (n_frames == 0) return;
void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>> &frames,
size_t first_frame, size_t n_frames,
std::vector<FRAME_TYPE> &batch) {
if (n_frames == 0)
return;
const size_t rows = frames[first_frame].shape(0);
const size_t cols = frames[first_frame].shape(1);
@@ -23,8 +21,8 @@ void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
}
for (size_t k = 0; k < n_frames; ++k) {
const FRAME_TYPE* src = frames[first_frame + k].data();
FRAME_TYPE* dst = batch.data() + k * image_size;
const FRAME_TYPE *src = frames[first_frame + k].data();
FRAME_TYPE *dst = batch.data() + k * image_size;
std::memcpy(dst, src, image_size * sizeof(FRAME_TYPE));
}
}
+6 -6
View File
@@ -3,12 +3,12 @@
#include <cstdlib>
#include <cuda_runtime.h>
inline void __cuda_check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n",
file, line, cudaGetErrorString(err));
std::exit(1);
}
inline void __cuda_check(cudaError_t err, const char *file, int line) {
if (err != cudaSuccess) {
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n", file, line,
cudaGetErrorString(err));
std::exit(1);
}
}
#define CUDA_CHECK(stmt) __cuda_check((stmt), __FILE__, __LINE__)
+19 -24
View File
@@ -27,19 +27,15 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);
using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
py::class_<CF>(m, class_name.c_str())
.def(py::init<Shape<2>, pd_type, size_t, int>(),
py::arg("image_size"),
py::arg("n_sigma") = 5.0,
py::arg("capacity") = 1'000'000,
py::arg("n_streams") = 1)
.def(py::init<Shape<2>, pd_type, size_t, int>(), py::arg("image_size"),
py::arg("n_sigma") = 5.0, py::arg("capacity") = 1'000'000,
py::arg("n_streams") = 1)
.def_property(
"nSigma",
&CF::get_nSigma,
&CF::set_nSigma,
"nSigma", &CF::get_nSigma, &CF::set_nSigma,
R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")
.def("push_pedestal_frame",
@@ -50,21 +46,19 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
.def("clear_pedestal", &CF::clear_pedestal)
.def_property_readonly(
"pedestal",
[](CF &self) {
auto pd = new NDArray<pd_type, 2>{};
*pd = self.pedestal();
return return_image_data(pd);
})
.def_property_readonly("pedestal",
[](CF &self) {
auto pd = new NDArray<pd_type, 2>{};
*pd = self.pedestal();
return return_image_data(pd);
})
.def_property_readonly(
"noise",
[](CF &self) {
auto arr = new NDArray<pd_type, 2>{};
*arr = self.noise();
return return_image_data(arr);
})
.def_property_readonly("noise",
[](CF &self) {
auto arr = new NDArray<pd_type, 2>{};
*arr = self.noise();
return return_image_data(arr);
})
.def(
"steal_clusters",
@@ -86,7 +80,8 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
.def(
"find_clusters_batched",
[](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
// frames is expected as a 3D numpy array (n_frames, nrows, ncols)
// frames is expected as a 3D numpy array (n_frames, nrows,
// ncols)
auto view = make_view_3d(frames);
return self.find_clusters_batched(view, first_frame);
},
+1 -2
View File
@@ -30,8 +30,7 @@ void define_ClusterVector(py::module &m, const std::string &typestr) {
py::class_<ClusterVector<
Cluster<Type, ClusterSizeX, ClusterSizeY, CoordType>, void>>(
m, class_name.c_str(),
py::buffer_protocol(),
m, class_name.c_str(), py::buffer_protocol(),
py::module_local())
.def(py::init()) // TODO change!!!
+19 -19
View File
@@ -6,8 +6,8 @@
// back a usable ClusterVector without _aare needing to be imported first.
#include "bind_Cluster.hpp"
#include "bind_ClusterVector.hpp"
#include "bind_ClusterFinderCUDA.hpp"
#include "bind_ClusterVector.hpp"
#include <pybind11/pybind11.h>
@@ -22,45 +22,45 @@ namespace py = pybind11;
define_Cluster<T, N, M, U>(m, #N "x" #M #TYPE_CODE);
#define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE) \
aare::define_ClusterFinderCUDA<T, N, M, U>( \
m, "Cluster" #N "x" #M #TYPE_CODE);
aare::define_ClusterFinderCUDA<T, N, M, U>(m, "Cluster" #N \
"x" #M #TYPE_CODE);
PYBIND11_MODULE(_aare_cuda, m) {
// Types first — finders reference them in their signatures.
// SFINAE excludes 2x2 on ClusterFinderCUDA, so we skip it here too.
DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(double, 3, 3, uint16_t, d);
DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(double, 5, 5, uint16_t, d);
DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(double, 7, 7, uint16_t, d);
DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i);
DEFINE_CUDA_CLUSTER_TYPES(double, 9, 9, uint16_t, d);
DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f);
DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f);
// Finders
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 3, 3, uint16_t, d);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 5, 5, uint16_t, d);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 7, 7, uint16_t, d);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 9, 9, uint16_t, d);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f);
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f);
}
#undef DEFINE_CUDA_CLUSTER_TYPES
+201 -150
View File
@@ -9,14 +9,14 @@
#include <numeric>
#include <vector>
#include "aare/defs.hpp"
#include "aare/utils/batch.hpp"
#include "aare/ClusterFinder.hpp"
#include "aare/ClusterFinderCUDA.hpp"
#include "aare/File.hpp"
#include "aare/Frame.hpp"
#include "aare/NDArray.hpp"
#include "aare/Pedestal.hpp"
#include "aare/ClusterFinder.hpp"
#include "aare/ClusterFinderCUDA.hpp"
#include "aare/defs.hpp"
#include "aare/utils/batch.hpp"
// _____________
//
@@ -29,19 +29,19 @@ struct Timer {
void start() { t0 = clock::now(); }
double elapsed_ms() const {
return std::chrono::duration<double, std::milli>(clock::now() - t0).count();
return std::chrono::duration<double, std::milli>(clock::now() - t0)
.count();
}
};
// __________________
//
//
// Cluster comparison
// __________________
template <typename ClusterType>
struct ClusterComparison {
template <typename ClusterType> struct ClusterComparison {
size_t cpu_count = 0;
size_t gpu_count = 0;
size_t matched = 0;
size_t matched = 0;
size_t position_mismatch = 0;
size_t data_mismatch = 0;
size_t cpu_only = 0;
@@ -50,19 +50,20 @@ struct ClusterComparison {
// Sort clusters by (y, x) for deterministic comparison
template <typename ClusterType>
void sort_clusters(std::vector<ClusterType>& clusters) {
void sort_clusters(std::vector<ClusterType> &clusters) {
std::sort(clusters.begin(), clusters.end(),
[](const ClusterType& a, const ClusterType& b) {
if (a.y != b.y) return a.y < b.y;
[](const ClusterType &a, const ClusterType &b) {
if (a.y != b.y)
return a.y < b.y;
return a.x < b.x;
});
}
// Compare two sorted cluster lists
template <typename ClusterType>
ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_clusters,
std::vector<ClusterType>& gpu_clusters)
{
ClusterComparison<ClusterType>
compare_clusters(std::vector<ClusterType> &cpu_clusters,
std::vector<ClusterType> &gpu_clusters) {
sort_clusters(cpu_clusters);
sort_clusters(gpu_clusters);
@@ -72,15 +73,17 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl
size_t ci = 0, gi = 0;
while (ci < cpu_clusters.size() && gi < gpu_clusters.size()) {
const auto& cc = cpu_clusters[ci];
const auto& gc = gpu_clusters[gi];
const auto &cc = cpu_clusters[ci];
const auto &gc = gpu_clusters[gi];
if (cc.y == gc.y && cc.x == gc.x) {
// Same position/check data
bool data_ok = true;
constexpr int N = ClusterType::cluster_size_x * ClusterType::cluster_size_y;
constexpr int N =
ClusterType::cluster_size_x * ClusterType::cluster_size_y;
for (int k = 0; k < N; ++k) {
// if (cc.data[k] != gc.data[k]) { // a bit too strict espacially that pedestal update is slightly different
// if (cc.data[k] != gc.data[k]) { // a bit too strict
// espacially that pedestal update is slightly different
if (std::abs(cc.data[k] - gc.data[k]) > 5) {
data_ok = false;
break;
@@ -111,80 +114,109 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl
// Cluster printing
// ________________
template <typename ClusterType>
void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_results,
const std::vector<std::vector<ClusterType>>& gpu_results,
size_t max_per_frame = 10,
size_t max_frames = 100)
{
void print_cluster_comparison(
const std::vector<std::vector<ClusterType>> &cpu_results,
const std::vector<std::vector<ClusterType>> &gpu_results,
size_t max_per_frame = 10, size_t max_frames = 100) {
constexpr int NX = ClusterType::cluster_size_x;
constexpr int NY = ClusterType::cluster_size_y;
constexpr int N = NX * NY;
constexpr int N = NX * NY;
size_t frames_shown = 0;
for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames; ++fi) {
for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames;
++fi) {
if (cpu_results[fi].empty() && gpu_results[fi].empty())
continue;
size_t n_cpu = cpu_results[fi].size();
size_t n_gpu = gpu_results[fi].size();
printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu, n_gpu);
printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu,
n_gpu);
// Merge-walk over sorted lists (assumes already sorted by y,x)
size_t ci = 0, gi = 0;
size_t shown = 0;
while ((ci < n_cpu || gi < n_gpu) && shown < max_per_frame) {
bool have_cpu = ci < n_cpu;
bool have_gpu = gi < n_gpu;
// Determine if current entries match in position
bool same_pos = have_cpu && have_gpu &&
cpu_results[fi][ci].x == gpu_results[fi][gi].x &&
cpu_results[fi][ci].y == gpu_results[fi][gi].y;
if (same_pos) {
const auto& cc = cpu_results[fi][ci];
const auto& gc = gpu_results[fi][gi];
const auto &cc = cpu_results[fi][ci];
const auto &gc = gpu_results[fi][gi];
// Check if data differs
bool differs = false;
for (int k = 0; k < N; ++k) {
if (cc.data[k] != gc.data[k]) { differs = true; break; }
if (cc.data[k] != gc.data[k]) {
differs = true;
break;
}
}
printf(" CPU and GPU clusters found at SAME position (col=%3d, row=%3d)\n %s\n", cc.x, cc.y,
differs ? "DATA MISMATCH" : "MATCH");
printf(" CPU and GPU clusters found at SAME position "
"(col=%3d, row=%3d)\n %s\n",
cc.x, cc.y, differs ? "DATA MISMATCH" : "MATCH");
printf(" CPU: [");
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
for (int k = 0; k < N; ++k) {
if (k)
printf(", ");
printf("%6d", static_cast<int>(cc.data[k]));
}
printf("]\n");
if (differs) {
printf(" GPU: [");
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
for (int k = 0; k < N; ++k) {
if (k)
printf(", ");
printf("%6d", static_cast<int>(gc.data[k]));
}
printf("]\n");
printf(" diff: [");
for (int k = 0; k < N; ++k) {
if (k) printf(", ");
int d = static_cast<int>(gc.data[k]) - static_cast<int>(cc.data[k]);
if (k)
printf(", ");
int d = static_cast<int>(gc.data[k]) -
static_cast<int>(cc.data[k]);
printf("%+6d", d);
}
printf("]\n");
}
ci++; gi++; shown++;
} else if (!have_gpu || (have_cpu && (cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
(cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
const auto& cc = cpu_results[fi][ci];
ci++;
gi++;
shown++;
} else if (!have_gpu ||
(have_cpu &&
(cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
(cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
const auto &cc = cpu_results[fi][ci];
printf(" (%3d, %3d) CPU ONLY\n", cc.x, cc.y);
printf(" CPU: [");
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
for (int k = 0; k < N; ++k) {
if (k)
printf(", ");
printf("%6d", static_cast<int>(cc.data[k]));
}
printf("]\n");
ci++; shown++;
ci++;
shown++;
} else {
const auto& gc = gpu_results[fi][gi];
const auto &gc = gpu_results[fi][gi];
printf(" (%3d, %3d) GPU ONLY\n", gc.x, gc.y);
printf(" GPU: [");
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
for (int k = 0; k < N; ++k) {
if (k)
printf(", ");
printf("%6d", static_cast<int>(gc.data[k]));
}
printf("]\n");
gi++; shown++;
gi++;
shown++;
}
}
frames_shown++;
@@ -195,24 +227,25 @@ void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_r
//
// Helpers for the updated (CPU-parity) API
// _________________________________________
// Copy a ClusterVector into a std::vector<ClusterType> for downstream
// comparison code that expects the latter.
template <typename Finder, typename ClusterType>
void drain_into(Finder& f, std::vector<ClusterType>& out) {
void drain_into(Finder &f, std::vector<ClusterType> &out) {
auto cv = f.steal_clusters(true);
out.clear();
out.reserve(cv.size());
for (size_t j = 0; j < cv.size(); ++j) out.push_back(cv[j]);
for (size_t j = 0; j < cv.size(); ++j)
out.push_back(cv[j]);
}
// Feed a set of cached pedestal frames through any finder exposing the
// CPU-style push_pedestal_frame(NDView) method. Works for both
// ClusterFinder and ClusterFinderCUDA.
template <typename Finder, typename FRAME_TYPE>
void feed_pedestal(Finder& f,
const std::vector<aare::NDArray<FRAME_TYPE, 2>>& ped_frames) {
for (const auto& frame : ped_frames) {
void feed_pedestal(
Finder &f, const std::vector<aare::NDArray<FRAME_TYPE, 2>> &ped_frames) {
for (const auto &frame : ped_frames) {
f.push_pedestal_frame(frame.view());
}
}
@@ -221,16 +254,20 @@ void feed_pedestal(Finder& f,
//
// Main
// ____________
int main(int argc, char* argv[]) {
int main(int argc, char *argv[]) {
// Parse arguments
const char* default_pedestal =
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
// const char* default_pedestal =
const char *default_pedestal =
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
"2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
"250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
// const char* default_pedestal =
// "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_pedestal_200keV_DYNAMIC_tint_20us_master_0.json";
const char* default_data =
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
// const char* default_data =
const char *default_data =
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
"2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
"250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
// const char* default_data =
// "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_data_200keV_DYNAMIC_tint_20us_master_0.json";
std::filesystem::path pedestal_path(argc > 1 ? argv[1] : default_pedestal);
@@ -247,18 +284,21 @@ int main(int argc, char* argv[]) {
// Defaults: Adjust depending on the test dataset used
size_t n_pedestal_frames = 6000;
size_t n_data_frames = 10000;
double nSigma = 5.0;
size_t n_data_frames = 10000;
double nSigma = 5.0;
if (argc > 3) n_pedestal_frames = std::atol(argv[3]);
if (argc > 4) n_data_frames = std::atol(argv[4]);
if (argc > 5) nSigma = std::atof(argv[5]);
if (argc > 3)
n_pedestal_frames = std::atol(argv[3]);
if (argc > 4)
n_data_frames = std::atol(argv[4]);
if (argc > 5)
nSigma = std::atof(argv[5]);
// Detector geometry from master file
constexpr uint8_t cs_x = 3;
constexpr uint8_t cs_y = 3;
using ClusterType = aare::Cluster<int32_t, cs_x, cs_y>;
using FRAME_TYPE = uint16_t;
using FRAME_TYPE = uint16_t;
using PEDESTAL_TYPE = double;
// Read actual frame dimensions from the pedestal file
@@ -272,40 +312,44 @@ int main(int argc, char* argv[]) {
printf("=== Cluster Finder: CPU vs CUDA ===\n");
printf("Detector: %zu x %zu\n", ROWS, COLS);
printf("Cluster: %d x %d\n", ClusterType::cluster_size_x, ClusterType::cluster_size_y);
printf("Cluster: %d x %d\n", ClusterType::cluster_size_x,
ClusterType::cluster_size_y);
printf("nSigma: %.1f\n", nSigma);
// =========================================================================
// Phase 1: Build pedestal from dark frames (sanity check only + frame cache)
// Phase 1: Build pedestal from dark frames (sanity check only + frame
// cache)
// =========================================================================
//
// Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal object;
// both build their own via push_pedestal_frame.
// We still read the pedestal file once, but cache the frames in memory so
// every subsequent finder can be fed without re-hitting disk. We also
// build a standalone Pedestal purely to print a sanity check.
// Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal
// object; both build their own via push_pedestal_frame. We still read the
// pedestal file once, but cache the frames in memory so every subsequent
// finder can be fed without re-hitting disk. We also build a standalone
// Pedestal purely to print a sanity check.
// =========================================================================
printf("\n--- Phase 1: Pedestal accumulation (sanity check + cache) ---\n");
std::vector<aare::NDArray<FRAME_TYPE, 2>> pedestal_frames;
aare::Pedestal<PEDESTAL_TYPE> pedestal(ROWS, COLS, 1000);
{
aare::File ped_file(pedestal_path, "r");
size_t total_ped = ped_file.total_frames();
size_t use_ped = (n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
? total_ped : n_pedestal_frames;
size_t use_ped =
(n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
? total_ped
: n_pedestal_frames;
printf("Pedestal frames: %zu / %zu\n", use_ped, total_ped);
pedestal_frames.reserve(use_ped);
Timer t;
t.start();
for (size_t i = 0; i < use_ped; ++i) {
auto frame = ped_file.read_frame();
auto view = frame.view<FRAME_TYPE>();
auto view = frame.view<FRAME_TYPE>();
// Copy into a standalone NDArray that we can reuse as many times
// as we have finders to feed.
aare::NDArray<FRAME_TYPE, 2> arr({ROWS, COLS});
@@ -313,16 +357,16 @@ int main(int argc, char* argv[]) {
for (ssize_t c = 0; c < COLS; ++c)
arr(r, c) = view(r, c);
pedestal_frames.push_back(std::move(arr));
pedestal.push_no_update(view);
}
pedestal.update_mean();
printf("Pedestal read+cached+built in %.1f ms\n", t.elapsed_ms());
}
printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n",
pedestal.mean(0, 0), pedestal.std(0, 0));
printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n", pedestal.mean(0, 0),
pedestal.std(0, 0));
// =========================================================================
// Phase 2: Read data frames
@@ -331,7 +375,7 @@ int main(int argc, char* argv[]) {
aare::File data_file(data_path, "r");
size_t total_data = data_file.total_frames();
size_t use_data = std::min(n_data_frames, total_data);
size_t use_data = std::min(n_data_frames, total_data);
printf("Data frames: %zu / %zu\n", use_data, total_data);
// Pre-read all frames into memory to remove I/O from timing
@@ -391,32 +435,32 @@ int main(int argc, char* argv[]) {
// time makes sense.
// =========================================================================
printf("\n--- Phase 4: CUDA ClusterFinder ---\n");
std::vector<std::vector<ClusterType>> gpu_results(use_data);
size_t gpu_total_clusters = 0;
// --- Single frame on a single CUDA stream ---
{
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
{ROWS, COLS}, nSigma);
feed_pedestal(cuda_cf, pedestal_frames);
// Warmup: first CUDA call pays driver/context init overhead. The
// pedestal drifts slightly during this single frame, which is
// acceptable for timing purposes.
cuda_cf.find_clusters(frames[0].view(), 0);
cuda_cf.steal_clusters(true);
Timer t;
t.start();
for (size_t i = 0; i < use_data; ++i) {
cuda_cf.find_clusters(frames[i].view(), static_cast<uint64_t>(i));
drain_into(cuda_cf, gpu_results[i]);
gpu_total_clusters += gpu_results[i].size();
}
double gpu_time = t.elapsed_ms();
printf("GPU: %zu clusters in %.1f ms (%.2f ms/frame)\n",
gpu_total_clusters, gpu_time, gpu_time / use_data);
@@ -428,32 +472,33 @@ int main(int argc, char* argv[]) {
{
constexpr size_t BATCH_SIZE = 100;
constexpr int N_STREAMS = 2;
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
{ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
feed_pedestal(cuda_cf, pedestal_frames);
// Contiguous staging buffer reused across batches
std::vector<FRAME_TYPE> batch_buffer(BATCH_SIZE * ROWS * COLS);
const size_t n_batches = (use_data + BATCH_SIZE - 1) / BATCH_SIZE;
Timer t;
t.start();
for (size_t bi = 0; bi < n_batches; ++bi) {
const size_t offset = bi * BATCH_SIZE;
const size_t actual_batch = std::min(BATCH_SIZE, use_data - offset);
pack_frame_batch(frames, offset, actual_batch, batch_buffer);
aare::NDView<FRAME_TYPE, 3> batch_view(
batch_buffer.data(),
{static_cast<ssize_t>(actual_batch), ROWS, COLS});
auto batch_results = cuda_cf.find_clusters_batched(batch_view, offset);
auto batch_results = cuda_cf.find_clusters_batched(batch_view,
offset);
for (size_t f = 0; f < actual_batch; ++f) {
auto& cv = batch_results[f];
auto& out = gpu_results[offset + f];
@@ -463,10 +508,11 @@ int main(int argc, char* argv[]) {
gpu_total_clusters += out.size();
}
}
double gpu_time = t.elapsed_ms();
printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu, streams=%d)\n",
gpu_total_clusters, gpu_time, gpu_time / use_data, BATCH_SIZE, N_STREAMS);
printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu,
streams=%d)\n", gpu_total_clusters, gpu_time, gpu_time / use_data,
BATCH_SIZE, N_STREAMS);
}
*/
@@ -484,21 +530,21 @@ int main(int argc, char* argv[]) {
for (size_t i = 0; i < use_data; ++i) {
auto result = compare_clusters(cpu_results[i], gpu_results[i]);
total_matched += result.matched;
total_data_mismatch += result.data_mismatch;
total_cpu_only += result.cpu_only;
total_gpu_only += result.gpu_only;
total_matched += result.matched;
total_data_mismatch += result.data_mismatch;
total_cpu_only += result.cpu_only;
total_gpu_only += result.gpu_only;
bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 || result.data_mismatch > 0);
bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 ||
result.data_mismatch > 0);
if (has_diff) {
frames_with_differences++;
// Print details for first few mismatching frames
if (frames_with_differences <= 5) {
printf(" Frame %zu: CPU=%zu GPU=%zu matched=%zu "
"data_mismatch=%zu cpu_only=%zu gpu_only=%zu\n",
i, result.cpu_count, result.gpu_count,
result.matched, result.data_mismatch,
result.cpu_only, result.gpu_only);
i, result.cpu_count, result.gpu_count, result.matched,
result.data_mismatch, result.cpu_only, result.gpu_only);
}
}
}
@@ -510,9 +556,11 @@ int main(int argc, char* argv[]) {
printf(" Data mismatch: %zu\n", total_data_mismatch);
printf(" CPU only: %zu\n", total_cpu_only);
printf(" GPU only: %zu\n", total_gpu_only);
printf(" Frames with diffs: %zu / %zu\n", frames_with_differences, use_data);
printf(" Frames with diffs: %zu / %zu\n", frames_with_differences,
use_data);
if (total_cpu_only == 0 && total_gpu_only == 0 && total_data_mismatch == 0) {
if (total_cpu_only == 0 && total_gpu_only == 0 &&
total_data_mismatch == 0) {
printf("\n*** PASS: CPU and GPU results match exactly ***\n");
} else {
printf("\n*** DIFFERENCES DETECTED ***\n");
@@ -522,8 +570,10 @@ int main(int argc, char* argv[]) {
// if (cpu_total_clusters > 0 || gpu_total_clusters > 0) {
// size_t max_clusters_per_frame = 10;
// size_t max_frames = 100;
// printf("\n--- Cluster details (up to %zu frames, %zu clusters each) ---\n", max_frames, max_clusters_per_frame);
// print_cluster_comparison(cpu_results, gpu_results, max_clusters_per_frame, max_frames);
// printf("\n--- Cluster details (up to %zu frames, %zu clusters each)
// ---\n", max_frames, max_clusters_per_frame);
// print_cluster_comparison(cpu_results, gpu_results,
// max_clusters_per_frame, max_frames);
// }
// =========================================================================
@@ -533,7 +583,7 @@ int main(int argc, char* argv[]) {
if (use_data > 0) {
const int N_ITER = 1000;
const auto& bench_frame = frames[0];
const auto &bench_frame = frames[0];
// CPU benchmark
{
@@ -558,44 +608,43 @@ int main(int argc, char* argv[]) {
}
// --- GPU benchmark (single frame, single stream) ---
{
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
{ROWS, COLS}, nSigma);
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
cuda_cf({ROWS, COLS}, nSigma);
feed_pedestal(cuda_cf, pedestal_frames);
// Warmup
cuda_cf.find_clusters(bench_frame.view(), 0);
cuda_cf.steal_clusters(true);
Timer t;
t.start();
for (int iter = 0; iter < N_ITER; ++iter) {
cuda_cf.find_clusters(bench_frame.view(), 0);
cuda_cf.steal_clusters(true);
}
printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single frame, single stream)\n",
printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single "
"frame, single stream)\n",
t.elapsed_ms() / N_ITER);
}
// --- GPU benchmark (batched + multi-streamed) ---
{
constexpr size_t BATCH_SIZE = 500;
constexpr int N_STREAMS = 10;
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
{ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
constexpr int N_STREAMS = 10;
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
cuda_cf({ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
feed_pedestal(cuda_cf, pedestal_frames);
// Build one contiguous batch of BATCH_SIZE copies of bench_frame
std::vector<FRAME_TYPE> batch(BATCH_SIZE * ROWS * COLS);
for (size_t k = 0; k < BATCH_SIZE; ++k)
std::memcpy(batch.data() + k * ROWS * COLS,
bench_frame.data(),
std::memcpy(batch.data() + k * ROWS * COLS, bench_frame.data(),
ROWS * COLS * sizeof(FRAME_TYPE));
aare::NDView<FRAME_TYPE, 3> batch_view(
batch.data(),
{static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
batch.data(), {static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
// Warmup. The kernel mutates the device-side pedestal for every
// non-photon pixel, so after a 500-frame warmup the pedestal
// state has drifted. Reset to a clean baseline by clearing the
@@ -604,17 +653,19 @@ int main(int argc, char* argv[]) {
(void)cuda_cf.find_clusters_batched(batch_view, 0);
cuda_cf.clear_pedestal();
feed_pedestal(cuda_cf, pedestal_frames);
const size_t n_iter_batches = (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
const size_t n_iter_batches =
(N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
Timer t;
t.start();
for (size_t b = 0; b < n_iter_batches; ++b) {
(void)cuda_cf.find_clusters_batched(batch_view, b * BATCH_SIZE);
}
printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, batch=%zu, streams=%d)\n",
t.elapsed_ms() / (n_iter_batches * BATCH_SIZE),
BATCH_SIZE, N_STREAMS);
printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, "
"batch=%zu, streams=%d)\n",
t.elapsed_ms() / (n_iter_batches * BATCH_SIZE), BATCH_SIZE,
N_STREAMS);
}
}