mirror of
https://github.com/slsdetectorgroup/aare.git
synced 2026-06-04 19:58:41 +02:00
Format CUDA cluster finder files
This commit is contained in:
+165
-158
@@ -1,31 +1,29 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
#include <stdexcept>
|
||||
#include <cstdio>
|
||||
#include "aare/ClusterFinder.hpp"
|
||||
#include "aare/clusterfinder_kernel.cuh"
|
||||
#include "aare/utils/cuda_check.cuh"
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace aare {
|
||||
|
||||
|
||||
// Per-stream device resources
|
||||
template <typename ClusterType, typename FRAME_TYPE, typename PEDESTAL_TYPE>
|
||||
struct StreamContext{
|
||||
cudaStream_t stream = nullptr; // handle to the stream
|
||||
FRAME_TYPE* d_frame = nullptr;
|
||||
PEDESTAL_TYPE* d_pd_mean = nullptr;
|
||||
PEDESTAL_TYPE* d_pd_sum = nullptr;
|
||||
PEDESTAL_TYPE* d_pd_sum2 = nullptr;
|
||||
ClusterType* d_clusters = nullptr;
|
||||
uint32_t* d_cluster_count = nullptr;
|
||||
struct StreamContext {
|
||||
cudaStream_t stream = nullptr; // handle to the stream
|
||||
FRAME_TYPE *d_frame = nullptr;
|
||||
PEDESTAL_TYPE *d_pd_mean = nullptr;
|
||||
PEDESTAL_TYPE *d_pd_sum = nullptr;
|
||||
PEDESTAL_TYPE *d_pd_sum2 = nullptr;
|
||||
ClusterType *d_clusters = nullptr;
|
||||
uint32_t *d_cluster_count = nullptr;
|
||||
};
|
||||
|
||||
|
||||
template <typename ClusterType = Cluster<int32_t, 3, 3>,
|
||||
typename FRAME_TYPE = uint16_t,
|
||||
typename PEDESTAL_TYPE = double,
|
||||
typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
|
||||
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
|
||||
class ClusterFinderCUDA {
|
||||
|
||||
@@ -33,25 +31,25 @@ class ClusterFinderCUDA {
|
||||
static constexpr int BLOCK_Y = 16;
|
||||
static constexpr int col_radius = ClusterType::cluster_size_x / 2;
|
||||
static constexpr int row_radius = ClusterType::cluster_size_y / 2;
|
||||
|
||||
Shape<2> m_image_size;
|
||||
size_t nrows;
|
||||
size_t ncols;
|
||||
size_t image_size; // nrows * ncols
|
||||
int n_streams;
|
||||
size_t m_capacity;
|
||||
|
||||
PEDESTAL_TYPE m_nSigma;
|
||||
Pedestal<PEDESTAL_TYPE> m_pedestal;
|
||||
ClusterVector<ClusterType> m_clusters;
|
||||
bool m_pedestal_dirty = true;
|
||||
|
||||
Shape<2> m_image_size;
|
||||
size_t nrows;
|
||||
size_t ncols;
|
||||
size_t image_size; // nrows * ncols
|
||||
int n_streams;
|
||||
size_t m_capacity;
|
||||
|
||||
PEDESTAL_TYPE m_nSigma;
|
||||
Pedestal<PEDESTAL_TYPE> m_pedestal;
|
||||
ClusterVector<ClusterType> m_clusters;
|
||||
bool m_pedestal_dirty = true;
|
||||
|
||||
using SC = StreamContext<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>;
|
||||
std::vector<SC> v_sc;
|
||||
|
||||
// Kernel parameters
|
||||
dim3 grid;
|
||||
dim3 block;
|
||||
dim3 grid;
|
||||
dim3 block;
|
||||
size_t shmem_bytes;
|
||||
|
||||
public:
|
||||
@@ -59,79 +57,87 @@ class ClusterFinderCUDA {
|
||||
* @brief Construct a ClusterFinderCUDA
|
||||
*
|
||||
* @param image_size shape of the detector frame (rows, cols)
|
||||
* @param nSigma threshold in units of per-pixel pedestal std
|
||||
* @param nSigma threshold in units of per-pixel pedestal
|
||||
* std
|
||||
* @param capacity device-side cluster buffer size per stream
|
||||
* @param n_streams number of CUDA streams for multi-frame overlap
|
||||
* @param n_streams number of CUDA streams for multi-frame
|
||||
* overlap
|
||||
*/
|
||||
ClusterFinderCUDA(Shape<2> image_size_,
|
||||
PEDESTAL_TYPE nSigma = 5.0,
|
||||
size_t capacity = 1000000,
|
||||
int n_streams_ = 1)
|
||||
: m_image_size(image_size_),
|
||||
nrows(image_size_[0]),
|
||||
ncols(image_size_[1]),
|
||||
image_size(nrows * ncols),
|
||||
n_streams(n_streams_),
|
||||
m_capacity(capacity),
|
||||
m_nSigma(nSigma),
|
||||
m_pedestal(image_size_[0], image_size_[1]),
|
||||
m_clusters(capacity)
|
||||
{
|
||||
ClusterFinderCUDA(Shape<2> image_size_, PEDESTAL_TYPE nSigma = 5.0,
|
||||
size_t capacity = 1000000, int n_streams_ = 1)
|
||||
: m_image_size(image_size_), nrows(image_size_[0]),
|
||||
ncols(image_size_[1]), image_size(nrows * ncols),
|
||||
n_streams(n_streams_), m_capacity(capacity), m_nSigma(nSigma),
|
||||
m_pedestal(image_size_[0], image_size_[1]), m_clusters(capacity) {
|
||||
// Grid/Block dimensions
|
||||
block = dim3(BLOCK_X, BLOCK_Y);
|
||||
grid = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
|
||||
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
|
||||
grid = dim3((static_cast<unsigned int>(ncols) + BLOCK_X - 1) / BLOCK_X,
|
||||
(static_cast<unsigned int>(nrows) + BLOCK_Y - 1) / BLOCK_Y);
|
||||
|
||||
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y +
|
||||
// 2*row_radius) doubles
|
||||
shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) *
|
||||
sizeof(PEDESTAL_TYPE);
|
||||
|
||||
// Shared memory: one tile of (BLOCK_X + 2*col_radius) x (BLOCK_Y + 2*row_radius) doubles
|
||||
shmem_bytes = (BLOCK_X + 2 * col_radius) * (BLOCK_Y + 2 * row_radius) * sizeof(PEDESTAL_TYPE);
|
||||
|
||||
v_sc.resize(n_streams);
|
||||
for(int k=0; k < n_streams; ++k) {
|
||||
auto& sc = v_sc[k];
|
||||
CUDA_CHECK(cudaStreamCreate(&sc.stream));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE)));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType)));
|
||||
for (int k = 0; k < n_streams; ++k) {
|
||||
auto &sc = v_sc[k];
|
||||
CUDA_CHECK(cudaStreamCreate(&sc.stream));
|
||||
CUDA_CHECK(
|
||||
cudaMalloc(&sc.d_frame, image_size * sizeof(FRAME_TYPE)));
|
||||
CUDA_CHECK(
|
||||
cudaMalloc(&sc.d_pd_mean, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(
|
||||
cudaMalloc(&sc.d_pd_sum, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(
|
||||
cudaMalloc(&sc.d_pd_sum2, image_size * sizeof(PEDESTAL_TYPE)));
|
||||
CUDA_CHECK(
|
||||
cudaMalloc(&sc.d_clusters, capacity * sizeof(ClusterType)));
|
||||
CUDA_CHECK(cudaMalloc(&sc.d_cluster_count, sizeof(uint32_t)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~ClusterFinderCUDA() {
|
||||
for(auto& sc : v_sc) {
|
||||
if (sc.d_frame) cudaFree(sc.d_frame);
|
||||
if (sc.d_pd_mean) cudaFree(sc.d_pd_mean);
|
||||
if (sc.d_pd_sum) cudaFree(sc.d_pd_sum);
|
||||
if (sc.d_pd_sum2) cudaFree(sc.d_pd_sum2);
|
||||
if (sc.d_clusters) cudaFree(sc.d_clusters);
|
||||
if (sc.d_cluster_count) cudaFree(sc.d_cluster_count);
|
||||
if (sc.stream) cudaStreamDestroy(sc.stream);
|
||||
for (auto &sc : v_sc) {
|
||||
if (sc.d_frame)
|
||||
cudaFree(sc.d_frame);
|
||||
if (sc.d_pd_mean)
|
||||
cudaFree(sc.d_pd_mean);
|
||||
if (sc.d_pd_sum)
|
||||
cudaFree(sc.d_pd_sum);
|
||||
if (sc.d_pd_sum2)
|
||||
cudaFree(sc.d_pd_sum2);
|
||||
if (sc.d_clusters)
|
||||
cudaFree(sc.d_clusters);
|
||||
if (sc.d_cluster_count)
|
||||
cudaFree(sc.d_cluster_count);
|
||||
if (sc.stream)
|
||||
cudaStreamDestroy(sc.stream);
|
||||
}
|
||||
}
|
||||
|
||||
// Non-copyable, non-movable
|
||||
ClusterFinderCUDA(const ClusterFinderCUDA&) = delete;
|
||||
ClusterFinderCUDA& operator=(const ClusterFinderCUDA&) = delete;
|
||||
ClusterFinderCUDA(ClusterFinderCUDA&&) = delete;
|
||||
ClusterFinderCUDA& operator=(ClusterFinderCUDA&&) = delete;
|
||||
ClusterFinderCUDA(const ClusterFinderCUDA &) = delete;
|
||||
ClusterFinderCUDA &operator=(const ClusterFinderCUDA &) = delete;
|
||||
ClusterFinderCUDA(ClusterFinderCUDA &&) = delete;
|
||||
ClusterFinderCUDA &operator=(ClusterFinderCUDA &&) = delete;
|
||||
|
||||
void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
|
||||
PEDESTAL_TYPE get_nSigma() const { return m_nSigma; }
|
||||
|
||||
void set_nSigma(PEDESTAL_TYPE nSigma) { m_nSigma = nSigma; }
|
||||
PEDESTAL_TYPE get_nSigma() const { return m_nSigma; }
|
||||
|
||||
void push_pedestal_frame(NDView<FRAME_TYPE, 2> frame) {
|
||||
m_pedestal.push(frame);
|
||||
m_pedestal_dirty = true;
|
||||
}
|
||||
|
||||
|
||||
void clear_pedestal() {
|
||||
m_pedestal.clear();
|
||||
m_pedestal_dirty = true;
|
||||
}
|
||||
|
||||
|
||||
NDArray<PEDESTAL_TYPE, 2> pedestal() { return m_pedestal.mean(); }
|
||||
NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
|
||||
|
||||
NDArray<PEDESTAL_TYPE, 2> noise() { return m_pedestal.std(); }
|
||||
|
||||
/**
|
||||
* @brief Move clusters out of the internal ClusterVector, optionally
|
||||
* reallocating the internal one with the same capacity.
|
||||
@@ -150,52 +156,47 @@ class ClusterFinderCUDA {
|
||||
* @brief Find clusters in a single frame, appending them to the internal
|
||||
* ClusterVector.
|
||||
*/
|
||||
void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0)
|
||||
{
|
||||
void find_clusters(NDView<FRAME_TYPE, 2> frame, uint64_t frame_number = 0) {
|
||||
if (m_pedestal_dirty) { // need to update the pedestal on the gpu
|
||||
sync_pedestal_to_device();
|
||||
m_pedestal_dirty = false;
|
||||
}
|
||||
|
||||
auto& sc = v_sc[0];
|
||||
auto &sc = v_sc[0];
|
||||
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
|
||||
const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
|
||||
const uint32_t n_pd_samples =
|
||||
static_cast<uint32_t>(m_pedestal.n_samples());
|
||||
|
||||
// Reset cluster counter
|
||||
CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t), sc.stream));
|
||||
|
||||
// Upload frame
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes, cudaMemcpyHostToDevice, sc.stream));
|
||||
|
||||
// Kernel launch
|
||||
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
|
||||
<<<grid, block, shmem_bytes, sc.stream>>>(sc.d_frame,
|
||||
sc.d_pd_mean,
|
||||
sc.d_pd_sum,
|
||||
sc.d_pd_sum2,
|
||||
n_pd_samples,
|
||||
m_nSigma,
|
||||
nrows,
|
||||
ncols,
|
||||
sc.d_clusters,
|
||||
sc.d_cluster_count,
|
||||
static_cast<uint32_t>(m_capacity)
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Read back cluster count
|
||||
uint32_t n_found = 0;
|
||||
CUDA_CHECK(cudaMemcpyAsync(&n_found,
|
||||
sc.d_cluster_count,
|
||||
sizeof(uint32_t),
|
||||
cudaMemcpyDeviceToHost,
|
||||
CUDA_CHECK(cudaMemsetAsync(sc.d_cluster_count, 0, sizeof(uint32_t),
|
||||
sc.stream));
|
||||
|
||||
// Synchronize to ensure count is available before the CPU reads clusters
|
||||
// Upload frame
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_frame, frame.data(), image_bytes,
|
||||
cudaMemcpyHostToDevice, sc.stream));
|
||||
|
||||
// Kernel launch
|
||||
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
|
||||
PEDESTAL_TYPE>
|
||||
<<<grid, block, shmem_bytes, sc.stream>>>(
|
||||
sc.d_frame, sc.d_pd_mean, sc.d_pd_sum, sc.d_pd_sum2,
|
||||
n_pd_samples, m_nSigma, nrows, ncols, sc.d_clusters,
|
||||
sc.d_cluster_count, static_cast<uint32_t>(m_capacity));
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Read back cluster count
|
||||
uint32_t n_found = 0;
|
||||
CUDA_CHECK(cudaMemcpyAsync(&n_found, sc.d_cluster_count,
|
||||
sizeof(uint32_t), cudaMemcpyDeviceToHost,
|
||||
sc.stream));
|
||||
|
||||
// Synchronize to ensure count is available before the CPU reads
|
||||
// clusters
|
||||
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
|
||||
|
||||
// Clamp to max in case of overflow
|
||||
if (n_found > m_capacity) n_found = m_capacity;
|
||||
if (n_found > m_capacity)
|
||||
n_found = m_capacity;
|
||||
|
||||
// Read back clusters
|
||||
m_clusters.set_frame_number(frame_number);
|
||||
@@ -213,74 +214,79 @@ class ClusterFinderCUDA {
|
||||
*/
|
||||
std::vector<ClusterVector<ClusterType>>
|
||||
find_clusters_batched(NDView<FRAME_TYPE, 3> frames,
|
||||
uint64_t first_frame = 0)
|
||||
{
|
||||
uint64_t first_frame = 0) {
|
||||
if (m_pedestal_dirty) {
|
||||
sync_pedestal_to_device();
|
||||
m_pedestal_dirty = false;
|
||||
}
|
||||
|
||||
const size_t n_frames = frames.shape(0);
|
||||
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
|
||||
const uint32_t n_pd_samples = static_cast<uint32_t>(m_pedestal.n_samples());
|
||||
|
||||
|
||||
const size_t n_frames = frames.shape(0);
|
||||
const size_t image_bytes = image_size * sizeof(FRAME_TYPE);
|
||||
const uint32_t n_pd_samples =
|
||||
static_cast<uint32_t>(m_pedestal.n_samples());
|
||||
|
||||
std::vector<ClusterVector<ClusterType>> results;
|
||||
results.reserve(n_frames);
|
||||
for (size_t i = 0; i < n_frames; ++i) {
|
||||
results.emplace_back(m_capacity);
|
||||
results.back().set_frame_number(first_frame + i);
|
||||
}
|
||||
|
||||
|
||||
std::vector<uint32_t> host_counts(n_streams, 0);
|
||||
|
||||
|
||||
const size_t n_rounds = (n_frames + n_streams - 1) / n_streams;
|
||||
for (size_t round = 0; round < n_rounds; ++round) {
|
||||
|
||||
|
||||
// Launch phase: fan out kernels on all streams for this round
|
||||
for (int k = 0; k < n_streams; ++k) {
|
||||
// OOB guard
|
||||
const size_t frame_idx = round * n_streams + k;
|
||||
if (frame_idx >= n_frames) continue;
|
||||
|
||||
auto& sc_k = v_sc[k];
|
||||
const FRAME_TYPE* h_src = frames.data() + frame_idx * image_size;
|
||||
|
||||
if (frame_idx >= n_frames)
|
||||
continue;
|
||||
|
||||
auto &sc_k = v_sc[k];
|
||||
const FRAME_TYPE *h_src =
|
||||
frames.data() + frame_idx * image_size;
|
||||
|
||||
CUDA_CHECK(cudaMemsetAsync(sc_k.d_cluster_count, 0,
|
||||
sizeof(uint32_t), sc_k.stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc_k.d_frame, h_src, image_bytes,
|
||||
cudaMemcpyHostToDevice, sc_k.stream));
|
||||
|
||||
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
|
||||
cudaMemcpyHostToDevice,
|
||||
sc_k.stream));
|
||||
|
||||
device::find_clusters_in_single_frame<ClusterType, FRAME_TYPE,
|
||||
PEDESTAL_TYPE>
|
||||
<<<grid, block, shmem_bytes, sc_k.stream>>>(
|
||||
sc_k.d_frame,
|
||||
sc_k.d_pd_mean, sc_k.d_pd_sum, sc_k.d_pd_sum2,
|
||||
n_pd_samples, m_nSigma,
|
||||
nrows, ncols,
|
||||
sc_k.d_frame, sc_k.d_pd_mean, sc_k.d_pd_sum,
|
||||
sc_k.d_pd_sum2, n_pd_samples, m_nSigma, nrows, ncols,
|
||||
sc_k.d_clusters, sc_k.d_cluster_count, m_capacity);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
// Drain phase: fan in results from all streams
|
||||
for (int k = 0; k < n_streams; ++k) {
|
||||
const size_t frame_idx = round * n_streams + k;
|
||||
if (frame_idx >= n_frames) continue;
|
||||
|
||||
auto& sc_k = v_sc[k];
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(&host_counts[k], sc_k.d_cluster_count,
|
||||
sizeof(uint32_t),
|
||||
cudaMemcpyDeviceToHost, sc_k.stream));
|
||||
if (frame_idx >= n_frames)
|
||||
continue;
|
||||
|
||||
auto &sc_k = v_sc[k];
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
&host_counts[k], sc_k.d_cluster_count, sizeof(uint32_t),
|
||||
cudaMemcpyDeviceToHost, sc_k.stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(sc_k.stream));
|
||||
|
||||
|
||||
uint32_t n_found = host_counts[k];
|
||||
if (n_found > m_capacity) n_found = m_capacity;
|
||||
|
||||
if (n_found > m_capacity)
|
||||
n_found = m_capacity;
|
||||
|
||||
if (n_found > 0) {
|
||||
append_device_clusters_to(results[frame_idx], sc_k, n_found);
|
||||
append_device_clusters_to(results[frame_idx], sc_k,
|
||||
n_found);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
@@ -295,34 +301,35 @@ class ClusterFinderCUDA {
|
||||
// copies complete, so we synchronise at the end before they go out
|
||||
// of scope.
|
||||
NDArray<PEDESTAL_TYPE, 2> h_mean = m_pedestal.mean();
|
||||
NDArray<PEDESTAL_TYPE, 2> h_sum = m_pedestal.get_sum();
|
||||
NDArray<PEDESTAL_TYPE, 2> h_sum = m_pedestal.get_sum();
|
||||
NDArray<PEDESTAL_TYPE, 2> h_sum2 = m_pedestal.get_sum2();
|
||||
|
||||
|
||||
const size_t bytes = image_size * sizeof(PEDESTAL_TYPE);
|
||||
for (auto& sc : v_sc) {
|
||||
for (auto &sc : v_sc) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_mean, h_mean.data(), bytes,
|
||||
cudaMemcpyHostToDevice, sc.stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes,
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum, h_sum.data(), bytes,
|
||||
cudaMemcpyHostToDevice, sc.stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(sc.d_pd_sum2, h_sum2.data(), bytes,
|
||||
cudaMemcpyHostToDevice, sc.stream));
|
||||
}
|
||||
for (auto& sc : v_sc) CUDA_CHECK(cudaStreamSynchronize(sc.stream));
|
||||
for (auto &sc : v_sc)
|
||||
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Copy n_found clusters from sc.d_clusters into the given ClusterVector
|
||||
* and block on the transfer.
|
||||
* and block on the transfer.
|
||||
*/
|
||||
void append_device_clusters_to(ClusterVector<ClusterType>& cv,
|
||||
SC& sc, uint32_t n_found)
|
||||
{
|
||||
void append_device_clusters_to(ClusterVector<ClusterType> &cv, SC &sc,
|
||||
uint32_t n_found) {
|
||||
std::vector<ClusterType> staging(n_found);
|
||||
CUDA_CHECK(cudaMemcpyAsync(staging.data(), sc.d_clusters,
|
||||
n_found * sizeof(ClusterType),
|
||||
cudaMemcpyDeviceToHost, sc.stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(sc.stream));
|
||||
for (const auto& c : staging) cv.push_back(c);
|
||||
for (const auto &c : staging)
|
||||
cv.push_back(c);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,28 +1,22 @@
|
||||
#pragma once
|
||||
#include <cuda_runtime.h>
|
||||
#include <type_traits>
|
||||
#include "aare/Cluster.hpp"
|
||||
#include "aare/ClusterFinder.hpp"
|
||||
#include <cuda_runtime.h>
|
||||
#include <type_traits>
|
||||
|
||||
namespace aare::device {
|
||||
|
||||
template <typename ClusterType = Cluster<int32_t, 3, 3>,
|
||||
typename FRAME_TYPE = uint16_t,
|
||||
typename PEDESTAL_TYPE = double,
|
||||
typename FRAME_TYPE = uint16_t, typename PEDESTAL_TYPE = double,
|
||||
typename = std::enable_if_t<no_2x2_cluster<ClusterType>::value>>
|
||||
__global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d_frame,
|
||||
PEDESTAL_TYPE* __restrict__ d_pd_mean,
|
||||
PEDESTAL_TYPE* __restrict__ d_pd_sum,
|
||||
PEDESTAL_TYPE* __restrict__ d_pd_sum2,
|
||||
const uint32_t n_pd_samples,
|
||||
const PEDESTAL_TYPE m_nSigma,
|
||||
const size_t nrows,
|
||||
const size_t ncols,
|
||||
// const uint64_t frame_number,
|
||||
ClusterType* d_clusters,
|
||||
uint32_t* d_cluster_count,
|
||||
const uint32_t max_clusters)
|
||||
{
|
||||
__global__ void find_clusters_in_single_frame(
|
||||
const FRAME_TYPE *__restrict__ d_frame,
|
||||
PEDESTAL_TYPE *__restrict__ d_pd_mean, PEDESTAL_TYPE *__restrict__ d_pd_sum,
|
||||
PEDESTAL_TYPE *__restrict__ d_pd_sum2, const uint32_t n_pd_samples,
|
||||
const PEDESTAL_TYPE m_nSigma, const size_t nrows, const size_t ncols,
|
||||
// const uint64_t frame_number,
|
||||
ClusterType *d_clusters, uint32_t *d_cluster_count,
|
||||
const uint32_t max_clusters) {
|
||||
using CT = typename ClusterType::value_type;
|
||||
|
||||
// Compile-time cluster geometry useful for unrolling loops
|
||||
@@ -38,10 +32,12 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
constexpr int pow2_c3 = CSX * CSY;
|
||||
|
||||
// Thread/pixel mapping
|
||||
auto col_global = static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
|
||||
auto row_global = static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
|
||||
auto col_global =
|
||||
static_cast<ssize_t>(threadIdx.x + blockDim.x * blockIdx.x);
|
||||
auto row_global =
|
||||
static_cast<ssize_t>(threadIdx.y + blockDim.y * blockIdx.y);
|
||||
auto global_tid = static_cast<ssize_t>(col_global + ncols * row_global);
|
||||
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
|
||||
auto local_tid = threadIdx.x + blockDim.x * threadIdx.y;
|
||||
|
||||
// ====================
|
||||
// Shared memory layout
|
||||
@@ -55,19 +51,22 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
// CUDA prefers raw bytes + aligned cast
|
||||
// Compile error happens when using: `extern __shared__ T sh[];`
|
||||
extern __shared__ __align__(sizeof(PEDESTAL_TYPE)) unsigned char smem[];
|
||||
PEDESTAL_TYPE* shmem = reinterpret_cast<PEDESTAL_TYPE*>(smem);
|
||||
PEDESTAL_TYPE *shmem = reinterpret_cast<PEDESTAL_TYPE *>(smem);
|
||||
|
||||
// Stride includes halo on both sides
|
||||
auto shmem_stride = static_cast<int>(blockDim.x) + 2 * col_radius;
|
||||
auto tile_size = shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
|
||||
auto tile_size =
|
||||
shmem_stride * (static_cast<int>(blockDim.y) + 2 * row_radius);
|
||||
|
||||
// Offset so that thread (0,0) maps to shared-memory position
|
||||
// (row_radius, col_radius) i.e. past the top-left halo.
|
||||
auto shmem_tid = (static_cast<int>(threadIdx.y) + row_radius) * shmem_stride
|
||||
+ (static_cast<int>(threadIdx.x) + col_radius);
|
||||
auto shmem_tid =
|
||||
(static_cast<int>(threadIdx.y) + row_radius) * shmem_stride +
|
||||
(static_cast<int>(threadIdx.x) + col_radius);
|
||||
|
||||
// Cooperative zero-fill
|
||||
for (int idx = static_cast<int>(local_tid); idx < tile_size; idx += static_cast<int>(blockDim.x * blockDim.y)) {
|
||||
for (int idx = static_cast<int>(local_tid); idx < tile_size;
|
||||
idx += static_cast<int>(blockDim.x * blockDim.y)) {
|
||||
shmem[idx] = PEDESTAL_TYPE{0};
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -83,7 +82,7 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
// Helper: read (frame - pedestal_mean) from global memory, or 0 if OOB.
|
||||
// gr, gc are the global row/col of the pixel to load.
|
||||
// Returns the pedestal-subtracted value.
|
||||
auto load_pixel = [&] __device__ (ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
|
||||
auto load_pixel = [&] __device__(ssize_t gr, ssize_t gc) -> PEDESTAL_TYPE {
|
||||
auto gid = gc + static_cast<ssize_t>(ncols) * gr;
|
||||
return static_cast<PEDESTAL_TYPE>(d_frame[gid]) - d_pd_mean[gid];
|
||||
};
|
||||
@@ -98,21 +97,25 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
if (threadIdx.y == 0 && valid_pixel) {
|
||||
for (int i = 1; i <= row_radius; ++i) {
|
||||
if (row_global - i >= 0)
|
||||
shmem[shmem_tid - i * shmem_stride] = load_pixel(row_global - i, col_global);
|
||||
shmem[shmem_tid - i * shmem_stride] =
|
||||
load_pixel(row_global - i, col_global);
|
||||
}
|
||||
// Top-left corner rectangle
|
||||
if (threadIdx.x == 0) {
|
||||
for (int i = 1; i <= row_radius; ++i)
|
||||
for (int j = 1; j <= col_radius; ++j)
|
||||
if (row_global - i >= 0 && col_global - j >= 0)
|
||||
shmem[shmem_tid - i * shmem_stride - j] = load_pixel(row_global - i, col_global - j);
|
||||
shmem[shmem_tid - i * shmem_stride - j] =
|
||||
load_pixel(row_global - i, col_global - j);
|
||||
}
|
||||
// Top-right corner rectangle
|
||||
if (threadIdx.x == blockDim.x - 1) {
|
||||
for (int i = 1; i <= row_radius; ++i)
|
||||
for (int j = 1; j <= col_radius; ++j)
|
||||
if (row_global - i >= 0 && col_global + j < static_cast<ssize_t>(ncols))
|
||||
shmem[shmem_tid - i * shmem_stride + j] = load_pixel(row_global - i, col_global + j);
|
||||
if (row_global - i >= 0 &&
|
||||
col_global + j < static_cast<ssize_t>(ncols))
|
||||
shmem[shmem_tid - i * shmem_stride + j] =
|
||||
load_pixel(row_global - i, col_global + j);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,21 +137,26 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
if (threadIdx.y == blockDim.y - 1 && valid_pixel) {
|
||||
for (int i = 1; i <= row_radius; ++i) {
|
||||
if (row_global + i < static_cast<ssize_t>(nrows))
|
||||
shmem[shmem_tid + i * shmem_stride] = load_pixel(row_global + i, col_global);
|
||||
shmem[shmem_tid + i * shmem_stride] =
|
||||
load_pixel(row_global + i, col_global);
|
||||
}
|
||||
// Bottom-left corner rectangle
|
||||
if (threadIdx.x == 0) {
|
||||
for (int i = 1; i <= row_radius; ++i)
|
||||
for (int j = 1; j <= col_radius; ++j)
|
||||
if (row_global + i < static_cast<ssize_t>(nrows) && col_global - j >= 0)
|
||||
shmem[shmem_tid + i * shmem_stride - j] = load_pixel(row_global + i, col_global - j);
|
||||
if (row_global + i < static_cast<ssize_t>(nrows) &&
|
||||
col_global - j >= 0)
|
||||
shmem[shmem_tid + i * shmem_stride - j] =
|
||||
load_pixel(row_global + i, col_global - j);
|
||||
}
|
||||
// Bottom-right corner rectangle
|
||||
if (threadIdx.x == blockDim.x - 1) {
|
||||
for (int i = 1; i <= row_radius; ++i)
|
||||
for (int j = 1; j <= col_radius; ++j)
|
||||
if (row_global + i < static_cast<ssize_t>(nrows) && col_global + j < static_cast<ssize_t>(ncols))
|
||||
shmem[shmem_tid + i * shmem_stride + j] = load_pixel(row_global + i, col_global + j);
|
||||
if (row_global + i < static_cast<ssize_t>(nrows) &&
|
||||
col_global + j < static_cast<ssize_t>(ncols))
|
||||
shmem[shmem_tid + i * shmem_stride + j] =
|
||||
load_pixel(row_global + i, col_global + j);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,51 +165,57 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
// =====================
|
||||
// Cluster-finding logic
|
||||
// =====================
|
||||
if (!valid_pixel) return;
|
||||
if (!valid_pixel)
|
||||
return;
|
||||
|
||||
// Per-pixel RMS from global pedestal arrays
|
||||
// rms = sqrt( E[X^2] - E[X]^2 )
|
||||
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
|
||||
PEDESTAL_TYPE var_px = d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
|
||||
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
|
||||
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
|
||||
PEDESTAL_TYPE mean_px = d_pd_mean[global_tid];
|
||||
PEDESTAL_TYPE var_px =
|
||||
d_pd_sum2[global_tid] / n_pd_samples - mean_px * mean_px;
|
||||
PEDESTAL_TYPE rms_sq = max(var_px, PEDESTAL_TYPE{0}); // variance = rms^2
|
||||
PEDESTAL_TYPE rms_px = sqrt(rms_sq);
|
||||
|
||||
// Pedestal-subtracted value of the center pixel (already in shmem)
|
||||
PEDESTAL_TYPE val_pixel = shmem[shmem_tid];
|
||||
|
||||
// Negative pedestal early exit
|
||||
// Negative pedestal early exit
|
||||
if (val_pixel < -m_nSigma * rms_px)
|
||||
return; // NOTE: pedestal update for this pixel is skipped (same as sequential)
|
||||
return; // NOTE: pedestal update for this pixel is skipped (same as
|
||||
// sequential)
|
||||
|
||||
// Stencil reduction: total, max, quadrant sums
|
||||
PEDESTAL_TYPE total = PEDESTAL_TYPE{0};
|
||||
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of `numeric_limits<double>::lowest()`
|
||||
PEDESTAL_TYPE max_val = -HUGE_VAL; // CUDA-compatible equivalent of
|
||||
// `numeric_limits<double>::lowest()`
|
||||
|
||||
/* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0, ic<=0)
|
||||
PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0, ic>=0)
|
||||
PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0, ic<=0)
|
||||
PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0, ic>=0) */
|
||||
/* PEDESTAL_TYPE tl = PEDESTAL_TYPE{0}; // top-left quadrant (ir<=0,
|
||||
ic<=0) PEDESTAL_TYPE tr = PEDESTAL_TYPE{0}; // top-right quadrant (ir<=0,
|
||||
ic>=0) PEDESTAL_TYPE bl = PEDESTAL_TYPE{0}; // bottom-left (ir>=0,
|
||||
ic<=0) PEDESTAL_TYPE br = PEDESTAL_TYPE{0}; // bottom-right (ir>=0,
|
||||
ic>=0) */
|
||||
|
||||
CT clusterData[CSX * CSY];
|
||||
int idx = 0; // tracks the pixels in the cluster
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int ir = -row_radius; ir <= row_radius; ++ir) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int ic = -col_radius; ic <= col_radius; ++ic) {
|
||||
PEDESTAL_TYPE val = shmem[shmem_tid + ir * shmem_stride + ic];
|
||||
|
||||
total += val;
|
||||
max_val = max(max_val, val);
|
||||
|
||||
/* // Quadrant accumulation (pixels on the axes contribute to two quadrants)
|
||||
if (ir <= 0 && ic <= 0) tl += val;
|
||||
if (ir <= 0 && ic >= 0) tr += val;
|
||||
if (ir >= 0 && ic <= 0) bl += val;
|
||||
if (ir >= 0 && ic >= 0) br += val; */
|
||||
/* // Quadrant accumulation (pixels on the axes contribute to two
|
||||
quadrants) if (ir <= 0 && ic <= 0) tl += val; if (ir <= 0 && ic >=
|
||||
0) tr += val; if (ir >= 0 && ic <= 0) bl += val; if (ir >= 0 && ic
|
||||
>= 0) br += val; */
|
||||
|
||||
// Store pedestal-subtracted value in register array for later cluster output
|
||||
if constexpr (std::is_integral_v<CT> && std::is_floating_point_v<PEDESTAL_TYPE>)
|
||||
// Store pedestal-subtracted value in register array for later
|
||||
// cluster output
|
||||
if constexpr (std::is_integral_v<CT> &&
|
||||
std::is_floating_point_v<PEDESTAL_TYPE>)
|
||||
clusterData[idx] = static_cast<CT>(lround(val));
|
||||
else
|
||||
clusterData[idx] = static_cast<CT>(val);
|
||||
@@ -213,10 +227,11 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
// Three-way classification (mirrors ClusterFinder's logic)
|
||||
//
|
||||
// 1. Single-pixel significance: max_val > nSigma * rms
|
||||
// -> only the pixel that IS the max gets recorded (local-max suppression)
|
||||
// -> only the pixel that IS the max gets recorded (local-max
|
||||
// suppression)
|
||||
//
|
||||
// 2. Quadrant significance: max(tl,tr,bl,br) > c2 * nSigma * rms
|
||||
// -> charge-sharing events where a 2x2 sub-region is significant
|
||||
// -> charge-sharing events where a 2x2 sub-region is significant
|
||||
// NOTE: This test is absent in the serial ClusterFinder!
|
||||
//
|
||||
// 3. Total significance: total > c3 * nSigma * rms
|
||||
@@ -228,9 +243,10 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
|
||||
// Test 1: single-pixel significance
|
||||
if (max_val > m_nSigma * rms_px) {
|
||||
// Local-max suppression: only the center-pixel thread records the cluster
|
||||
// Local-max suppression: only the center-pixel thread records the
|
||||
// cluster
|
||||
if (val_pixel < max_val)
|
||||
return; // some other pixel in the neighborhood is brighter
|
||||
return; // some other pixel in the neighborhood is brighter
|
||||
is_photon = true;
|
||||
}
|
||||
|
||||
@@ -249,29 +265,30 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
}
|
||||
}
|
||||
|
||||
// Pedestal update (if not a photon)
|
||||
// In the sequential code, non-photon pixels feed back into the running pedestal via push_fast().
|
||||
// In this kernel, the GPU updates all pixels in a frame simultaneously. So the updated pedestal
|
||||
// will only be used starting from the next frame. -> This avoids a/serialization and b/global mem I/O.
|
||||
// Pedestal update (if not a photon)
|
||||
// In the sequential code, non-photon pixels feed back into the running
|
||||
// pedestal via push_fast(). In this kernel, the GPU updates all pixels in a
|
||||
// frame simultaneously. So the updated pedestal will only be used starting
|
||||
// from the next frame. -> This avoids a/serialization and b/global mem I/O.
|
||||
if (!is_photon && valid_pixel) {
|
||||
PEDESTAL_TYPE raw_val = static_cast<PEDESTAL_TYPE>(d_frame[global_tid]);
|
||||
PEDESTAL_TYPE sum = d_pd_sum[global_tid];
|
||||
PEDESTAL_TYPE sum = d_pd_sum[global_tid];
|
||||
PEDESTAL_TYPE sum2 = d_pd_sum2[global_tid];
|
||||
|
||||
sum += raw_val - sum / n_pd_samples;
|
||||
sum += raw_val - sum / n_pd_samples;
|
||||
sum2 += raw_val * raw_val - sum2 / n_pd_samples;
|
||||
|
||||
d_pd_sum[global_tid] = sum;
|
||||
d_pd_sum[global_tid] = sum;
|
||||
d_pd_sum2[global_tid] = sum2;
|
||||
d_pd_mean[global_tid] = sum / n_pd_samples;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
if (!is_photon) return; // Debugging
|
||||
*/
|
||||
|
||||
// Write cluster to global output buffer using atomic index
|
||||
// Write cluster to global output buffer using atomic index
|
||||
// for coordination across all blocks
|
||||
uint32_t write_idx = atomicAdd(d_cluster_count, 1u);
|
||||
|
||||
@@ -283,7 +300,8 @@ __global__ void find_clusters_in_single_frame(const FRAME_TYPE* __restrict__ d
|
||||
cluster.x = static_cast<decltype(cluster.x)>(col_global);
|
||||
cluster.y = static_cast<decltype(cluster.y)>(row_global);
|
||||
|
||||
memcpy(reinterpret_cast<CT*>(&cluster.data), clusterData, sizeof(CT) * CSX * CSY);
|
||||
memcpy(reinterpret_cast<CT *>(&cluster.data), clusterData,
|
||||
sizeof(CT) * CSX * CSY);
|
||||
|
||||
d_clusters[write_idx] = cluster;
|
||||
}
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include "aare/NDArray.hpp"
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
template <typename FRAME_TYPE>
|
||||
void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
|
||||
size_t first_frame,
|
||||
size_t n_frames,
|
||||
std::vector<FRAME_TYPE>& batch)
|
||||
{
|
||||
if (n_frames == 0) return;
|
||||
void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>> &frames,
|
||||
size_t first_frame, size_t n_frames,
|
||||
std::vector<FRAME_TYPE> &batch) {
|
||||
if (n_frames == 0)
|
||||
return;
|
||||
|
||||
const size_t rows = frames[first_frame].shape(0);
|
||||
const size_t cols = frames[first_frame].shape(1);
|
||||
@@ -23,8 +21,8 @@ void pack_frame_batch(const std::vector<aare::NDArray<FRAME_TYPE, 2>>& frames,
|
||||
}
|
||||
|
||||
for (size_t k = 0; k < n_frames; ++k) {
|
||||
const FRAME_TYPE* src = frames[first_frame + k].data();
|
||||
FRAME_TYPE* dst = batch.data() + k * image_size;
|
||||
const FRAME_TYPE *src = frames[first_frame + k].data();
|
||||
FRAME_TYPE *dst = batch.data() + k * image_size;
|
||||
std::memcpy(dst, src, image_size * sizeof(FRAME_TYPE));
|
||||
}
|
||||
}
|
||||
@@ -3,12 +3,12 @@
|
||||
#include <cstdlib>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
inline void __cuda_check(cudaError_t err, const char* file, int line) {
|
||||
if (err != cudaSuccess) {
|
||||
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n",
|
||||
file, line, cudaGetErrorString(err));
|
||||
std::exit(1);
|
||||
}
|
||||
inline void __cuda_check(cudaError_t err, const char *file, int line) {
|
||||
if (err != cudaSuccess) {
|
||||
std::fprintf(stderr, "[%s:%d] CUDA error: %s\n", file, line,
|
||||
cudaGetErrorString(err));
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#define CUDA_CHECK(stmt) __cuda_check((stmt), __FILE__, __LINE__)
|
||||
@@ -27,19 +27,15 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
|
||||
auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);
|
||||
|
||||
using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
|
||||
using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
|
||||
using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
|
||||
|
||||
py::class_<CF>(m, class_name.c_str())
|
||||
.def(py::init<Shape<2>, pd_type, size_t, int>(),
|
||||
py::arg("image_size"),
|
||||
py::arg("n_sigma") = 5.0,
|
||||
py::arg("capacity") = 1'000'000,
|
||||
py::arg("n_streams") = 1)
|
||||
.def(py::init<Shape<2>, pd_type, size_t, int>(), py::arg("image_size"),
|
||||
py::arg("n_sigma") = 5.0, py::arg("capacity") = 1'000'000,
|
||||
py::arg("n_streams") = 1)
|
||||
|
||||
.def_property(
|
||||
"nSigma",
|
||||
&CF::get_nSigma,
|
||||
&CF::set_nSigma,
|
||||
"nSigma", &CF::get_nSigma, &CF::set_nSigma,
|
||||
R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")
|
||||
|
||||
.def("push_pedestal_frame",
|
||||
@@ -50,21 +46,19 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
|
||||
|
||||
.def("clear_pedestal", &CF::clear_pedestal)
|
||||
|
||||
.def_property_readonly(
|
||||
"pedestal",
|
||||
[](CF &self) {
|
||||
auto pd = new NDArray<pd_type, 2>{};
|
||||
*pd = self.pedestal();
|
||||
return return_image_data(pd);
|
||||
})
|
||||
.def_property_readonly("pedestal",
|
||||
[](CF &self) {
|
||||
auto pd = new NDArray<pd_type, 2>{};
|
||||
*pd = self.pedestal();
|
||||
return return_image_data(pd);
|
||||
})
|
||||
|
||||
.def_property_readonly(
|
||||
"noise",
|
||||
[](CF &self) {
|
||||
auto arr = new NDArray<pd_type, 2>{};
|
||||
*arr = self.noise();
|
||||
return return_image_data(arr);
|
||||
})
|
||||
.def_property_readonly("noise",
|
||||
[](CF &self) {
|
||||
auto arr = new NDArray<pd_type, 2>{};
|
||||
*arr = self.noise();
|
||||
return return_image_data(arr);
|
||||
})
|
||||
|
||||
.def(
|
||||
"steal_clusters",
|
||||
@@ -86,7 +80,8 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
|
||||
.def(
|
||||
"find_clusters_batched",
|
||||
[](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
|
||||
// frames is expected as a 3D numpy array (n_frames, nrows, ncols)
|
||||
// frames is expected as a 3D numpy array (n_frames, nrows,
|
||||
// ncols)
|
||||
auto view = make_view_3d(frames);
|
||||
return self.find_clusters_batched(view, first_frame);
|
||||
},
|
||||
|
||||
@@ -30,8 +30,7 @@ void define_ClusterVector(py::module &m, const std::string &typestr) {
|
||||
|
||||
py::class_<ClusterVector<
|
||||
Cluster<Type, ClusterSizeX, ClusterSizeY, CoordType>, void>>(
|
||||
m, class_name.c_str(),
|
||||
py::buffer_protocol(),
|
||||
m, class_name.c_str(), py::buffer_protocol(),
|
||||
py::module_local())
|
||||
|
||||
.def(py::init()) // TODO change!!!
|
||||
|
||||
+19
-19
@@ -6,8 +6,8 @@
|
||||
// back a usable ClusterVector without _aare needing to be imported first.
|
||||
|
||||
#include "bind_Cluster.hpp"
|
||||
#include "bind_ClusterVector.hpp"
|
||||
#include "bind_ClusterFinderCUDA.hpp"
|
||||
#include "bind_ClusterVector.hpp"
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
@@ -22,45 +22,45 @@ namespace py = pybind11;
|
||||
define_Cluster<T, N, M, U>(m, #N "x" #M #TYPE_CODE);
|
||||
|
||||
#define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE) \
|
||||
aare::define_ClusterFinderCUDA<T, N, M, U>( \
|
||||
m, "Cluster" #N "x" #M #TYPE_CODE);
|
||||
aare::define_ClusterFinderCUDA<T, N, M, U>(m, "Cluster" #N \
|
||||
"x" #M #TYPE_CODE);
|
||||
|
||||
PYBIND11_MODULE(_aare_cuda, m) {
|
||||
|
||||
// Types first — finders reference them in their signatures.
|
||||
// SFINAE excludes 2x2 on ClusterFinderCUDA, so we skip it here too.
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 3, 3, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(double, 3, 3, uint16_t, d);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 3, 3, uint16_t, f);
|
||||
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 5, 5, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(double, 5, 5, uint16_t, d);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 5, 5, uint16_t, f);
|
||||
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 7, 7, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(double, 7, 7, uint16_t, d);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 7, 7, uint16_t, f);
|
||||
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(int, 9, 9, uint16_t, i);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(double, 9, 9, uint16_t, d);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f);
|
||||
DEFINE_CUDA_CLUSTER_TYPES(float, 9, 9, uint16_t, f);
|
||||
|
||||
// Finders
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 3, 3, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 3, 3, uint16_t, d);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 3, 3, uint16_t, f);
|
||||
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 5, 5, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 5, 5, uint16_t, d);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 5, 5, uint16_t, f);
|
||||
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 7, 7, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 7, 7, uint16_t, d);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 7, 7, uint16_t, f);
|
||||
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(int, 9, 9, uint16_t, i);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(double, 9, 9, uint16_t, d);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f);
|
||||
DEFINE_BINDINGS_CLUSTERFINDER_CUDA(float, 9, 9, uint16_t, f);
|
||||
}
|
||||
|
||||
#undef DEFINE_CUDA_CLUSTER_TYPES
|
||||
|
||||
+201
-150
@@ -9,14 +9,14 @@
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "aare/defs.hpp"
|
||||
#include "aare/utils/batch.hpp"
|
||||
#include "aare/ClusterFinder.hpp"
|
||||
#include "aare/ClusterFinderCUDA.hpp"
|
||||
#include "aare/File.hpp"
|
||||
#include "aare/Frame.hpp"
|
||||
#include "aare/NDArray.hpp"
|
||||
#include "aare/Pedestal.hpp"
|
||||
#include "aare/ClusterFinder.hpp"
|
||||
#include "aare/ClusterFinderCUDA.hpp"
|
||||
#include "aare/defs.hpp"
|
||||
#include "aare/utils/batch.hpp"
|
||||
|
||||
// _____________
|
||||
//
|
||||
@@ -29,19 +29,19 @@ struct Timer {
|
||||
void start() { t0 = clock::now(); }
|
||||
|
||||
double elapsed_ms() const {
|
||||
return std::chrono::duration<double, std::milli>(clock::now() - t0).count();
|
||||
return std::chrono::duration<double, std::milli>(clock::now() - t0)
|
||||
.count();
|
||||
}
|
||||
};
|
||||
|
||||
// __________________
|
||||
//
|
||||
//
|
||||
// Cluster comparison
|
||||
// __________________
|
||||
template <typename ClusterType>
|
||||
struct ClusterComparison {
|
||||
template <typename ClusterType> struct ClusterComparison {
|
||||
size_t cpu_count = 0;
|
||||
size_t gpu_count = 0;
|
||||
size_t matched = 0;
|
||||
size_t matched = 0;
|
||||
size_t position_mismatch = 0;
|
||||
size_t data_mismatch = 0;
|
||||
size_t cpu_only = 0;
|
||||
@@ -50,19 +50,20 @@ struct ClusterComparison {
|
||||
|
||||
// Sort clusters by (y, x) for deterministic comparison
|
||||
template <typename ClusterType>
|
||||
void sort_clusters(std::vector<ClusterType>& clusters) {
|
||||
void sort_clusters(std::vector<ClusterType> &clusters) {
|
||||
std::sort(clusters.begin(), clusters.end(),
|
||||
[](const ClusterType& a, const ClusterType& b) {
|
||||
if (a.y != b.y) return a.y < b.y;
|
||||
[](const ClusterType &a, const ClusterType &b) {
|
||||
if (a.y != b.y)
|
||||
return a.y < b.y;
|
||||
return a.x < b.x;
|
||||
});
|
||||
}
|
||||
|
||||
// Compare two sorted cluster lists
|
||||
template <typename ClusterType>
|
||||
ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_clusters,
|
||||
std::vector<ClusterType>& gpu_clusters)
|
||||
{
|
||||
ClusterComparison<ClusterType>
|
||||
compare_clusters(std::vector<ClusterType> &cpu_clusters,
|
||||
std::vector<ClusterType> &gpu_clusters) {
|
||||
sort_clusters(cpu_clusters);
|
||||
sort_clusters(gpu_clusters);
|
||||
|
||||
@@ -72,15 +73,17 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl
|
||||
|
||||
size_t ci = 0, gi = 0;
|
||||
while (ci < cpu_clusters.size() && gi < gpu_clusters.size()) {
|
||||
const auto& cc = cpu_clusters[ci];
|
||||
const auto& gc = gpu_clusters[gi];
|
||||
const auto &cc = cpu_clusters[ci];
|
||||
const auto &gc = gpu_clusters[gi];
|
||||
|
||||
if (cc.y == gc.y && cc.x == gc.x) {
|
||||
// Same position/check data
|
||||
bool data_ok = true;
|
||||
constexpr int N = ClusterType::cluster_size_x * ClusterType::cluster_size_y;
|
||||
constexpr int N =
|
||||
ClusterType::cluster_size_x * ClusterType::cluster_size_y;
|
||||
for (int k = 0; k < N; ++k) {
|
||||
// if (cc.data[k] != gc.data[k]) { // a bit too strict espacially that pedestal update is slightly different
|
||||
// if (cc.data[k] != gc.data[k]) { // a bit too strict
|
||||
// espacially that pedestal update is slightly different
|
||||
if (std::abs(cc.data[k] - gc.data[k]) > 5) {
|
||||
data_ok = false;
|
||||
break;
|
||||
@@ -111,80 +114,109 @@ ClusterComparison<ClusterType> compare_clusters(std::vector<ClusterType>& cpu_cl
|
||||
// Cluster printing
|
||||
// ________________
|
||||
template <typename ClusterType>
|
||||
void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_results,
|
||||
const std::vector<std::vector<ClusterType>>& gpu_results,
|
||||
size_t max_per_frame = 10,
|
||||
size_t max_frames = 100)
|
||||
{
|
||||
void print_cluster_comparison(
|
||||
const std::vector<std::vector<ClusterType>> &cpu_results,
|
||||
const std::vector<std::vector<ClusterType>> &gpu_results,
|
||||
size_t max_per_frame = 10, size_t max_frames = 100) {
|
||||
constexpr int NX = ClusterType::cluster_size_x;
|
||||
constexpr int NY = ClusterType::cluster_size_y;
|
||||
constexpr int N = NX * NY;
|
||||
|
||||
constexpr int N = NX * NY;
|
||||
|
||||
size_t frames_shown = 0;
|
||||
for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames; ++fi) {
|
||||
for (size_t fi = 0; fi < cpu_results.size() && frames_shown < max_frames;
|
||||
++fi) {
|
||||
if (cpu_results[fi].empty() && gpu_results[fi].empty())
|
||||
continue;
|
||||
|
||||
|
||||
size_t n_cpu = cpu_results[fi].size();
|
||||
size_t n_gpu = gpu_results[fi].size();
|
||||
printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu, n_gpu);
|
||||
|
||||
printf("\n Frame %zu: CPU=%zu clusters, GPU=%zu clusters\n", fi, n_cpu,
|
||||
n_gpu);
|
||||
|
||||
// Merge-walk over sorted lists (assumes already sorted by y,x)
|
||||
size_t ci = 0, gi = 0;
|
||||
size_t shown = 0;
|
||||
while ((ci < n_cpu || gi < n_gpu) && shown < max_per_frame) {
|
||||
bool have_cpu = ci < n_cpu;
|
||||
bool have_gpu = gi < n_gpu;
|
||||
|
||||
|
||||
// Determine if current entries match in position
|
||||
bool same_pos = have_cpu && have_gpu &&
|
||||
cpu_results[fi][ci].x == gpu_results[fi][gi].x &&
|
||||
cpu_results[fi][ci].y == gpu_results[fi][gi].y;
|
||||
|
||||
|
||||
if (same_pos) {
|
||||
const auto& cc = cpu_results[fi][ci];
|
||||
const auto& gc = gpu_results[fi][gi];
|
||||
|
||||
const auto &cc = cpu_results[fi][ci];
|
||||
const auto &gc = gpu_results[fi][gi];
|
||||
|
||||
// Check if data differs
|
||||
bool differs = false;
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (cc.data[k] != gc.data[k]) { differs = true; break; }
|
||||
if (cc.data[k] != gc.data[k]) {
|
||||
differs = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf(" CPU and GPU clusters found at SAME position (col=%3d, row=%3d)\n %s\n", cc.x, cc.y,
|
||||
differs ? "DATA MISMATCH" : "MATCH");
|
||||
printf(" CPU and GPU clusters found at SAME position "
|
||||
"(col=%3d, row=%3d)\n %s\n",
|
||||
cc.x, cc.y, differs ? "DATA MISMATCH" : "MATCH");
|
||||
printf(" CPU: [");
|
||||
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (k)
|
||||
printf(", ");
|
||||
printf("%6d", static_cast<int>(cc.data[k]));
|
||||
}
|
||||
printf("]\n");
|
||||
if (differs) {
|
||||
printf(" GPU: [");
|
||||
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (k)
|
||||
printf(", ");
|
||||
printf("%6d", static_cast<int>(gc.data[k]));
|
||||
}
|
||||
printf("]\n");
|
||||
printf(" diff: [");
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (k) printf(", ");
|
||||
int d = static_cast<int>(gc.data[k]) - static_cast<int>(cc.data[k]);
|
||||
if (k)
|
||||
printf(", ");
|
||||
int d = static_cast<int>(gc.data[k]) -
|
||||
static_cast<int>(cc.data[k]);
|
||||
printf("%+6d", d);
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
ci++; gi++; shown++;
|
||||
} else if (!have_gpu || (have_cpu && (cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
|
||||
(cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
|
||||
cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
|
||||
const auto& cc = cpu_results[fi][ci];
|
||||
ci++;
|
||||
gi++;
|
||||
shown++;
|
||||
} else if (!have_gpu ||
|
||||
(have_cpu &&
|
||||
(cpu_results[fi][ci].y < gpu_results[fi][gi].y ||
|
||||
(cpu_results[fi][ci].y == gpu_results[fi][gi].y &&
|
||||
cpu_results[fi][ci].x < gpu_results[fi][gi].x)))) {
|
||||
const auto &cc = cpu_results[fi][ci];
|
||||
printf(" (%3d, %3d) CPU ONLY\n", cc.x, cc.y);
|
||||
printf(" CPU: [");
|
||||
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(cc.data[k])); }
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (k)
|
||||
printf(", ");
|
||||
printf("%6d", static_cast<int>(cc.data[k]));
|
||||
}
|
||||
printf("]\n");
|
||||
ci++; shown++;
|
||||
ci++;
|
||||
shown++;
|
||||
} else {
|
||||
const auto& gc = gpu_results[fi][gi];
|
||||
const auto &gc = gpu_results[fi][gi];
|
||||
printf(" (%3d, %3d) GPU ONLY\n", gc.x, gc.y);
|
||||
printf(" GPU: [");
|
||||
for (int k = 0; k < N; ++k) { if (k) printf(", "); printf("%6d", static_cast<int>(gc.data[k])); }
|
||||
for (int k = 0; k < N; ++k) {
|
||||
if (k)
|
||||
printf(", ");
|
||||
printf("%6d", static_cast<int>(gc.data[k]));
|
||||
}
|
||||
printf("]\n");
|
||||
gi++; shown++;
|
||||
gi++;
|
||||
shown++;
|
||||
}
|
||||
}
|
||||
frames_shown++;
|
||||
@@ -195,24 +227,25 @@ void print_cluster_comparison(const std::vector<std::vector<ClusterType>>& cpu_r
|
||||
//
|
||||
// Helpers for the updated (CPU-parity) API
|
||||
// _________________________________________
|
||||
|
||||
|
||||
// Copy a ClusterVector into a std::vector<ClusterType> for downstream
|
||||
// comparison code that expects the latter.
|
||||
template <typename Finder, typename ClusterType>
|
||||
void drain_into(Finder& f, std::vector<ClusterType>& out) {
|
||||
void drain_into(Finder &f, std::vector<ClusterType> &out) {
|
||||
auto cv = f.steal_clusters(true);
|
||||
out.clear();
|
||||
out.reserve(cv.size());
|
||||
for (size_t j = 0; j < cv.size(); ++j) out.push_back(cv[j]);
|
||||
for (size_t j = 0; j < cv.size(); ++j)
|
||||
out.push_back(cv[j]);
|
||||
}
|
||||
|
||||
|
||||
// Feed a set of cached pedestal frames through any finder exposing the
|
||||
// CPU-style push_pedestal_frame(NDView) method. Works for both
|
||||
// ClusterFinder and ClusterFinderCUDA.
|
||||
template <typename Finder, typename FRAME_TYPE>
|
||||
void feed_pedestal(Finder& f,
|
||||
const std::vector<aare::NDArray<FRAME_TYPE, 2>>& ped_frames) {
|
||||
for (const auto& frame : ped_frames) {
|
||||
void feed_pedestal(
|
||||
Finder &f, const std::vector<aare::NDArray<FRAME_TYPE, 2>> &ped_frames) {
|
||||
for (const auto &frame : ped_frames) {
|
||||
f.push_pedestal_frame(frame.view());
|
||||
}
|
||||
}
|
||||
@@ -221,16 +254,20 @@ void feed_pedestal(Finder& f,
|
||||
//
|
||||
// Main
|
||||
// ____________
|
||||
int main(int argc, char* argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
// Parse arguments
|
||||
const char* default_pedestal =
|
||||
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
|
||||
// const char* default_pedestal =
|
||||
const char *default_pedestal =
|
||||
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
|
||||
"2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
|
||||
"250922_CZTonly_Pedestal_Tp15C_tint_500_master_0.json";
|
||||
// const char* default_pedestal =
|
||||
// "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_pedestal_200keV_DYNAMIC_tint_20us_master_0.json";
|
||||
const char* default_data =
|
||||
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/2025Sept_m694/Sn25300eV/500_us_voltage_40kV/250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
|
||||
// const char* default_data =
|
||||
const char *default_data =
|
||||
"/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/Calibration_CZT/"
|
||||
"2025Sept_m694/Sn25300eV/500_us_voltage_40kV/"
|
||||
"250922_CZTonly_Xray_Tp15C_tint_500_master_0.json";
|
||||
// const char* default_data =
|
||||
// "/mnt/sls_det_storage/highZ_data/CZT_Vienna/Khalil/November2025/sparse/dynamic/si_data_200keV_DYNAMIC_tint_20us_master_0.json";
|
||||
|
||||
std::filesystem::path pedestal_path(argc > 1 ? argv[1] : default_pedestal);
|
||||
@@ -247,18 +284,21 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
// Defaults: Adjust depending on the test dataset used
|
||||
size_t n_pedestal_frames = 6000;
|
||||
size_t n_data_frames = 10000;
|
||||
double nSigma = 5.0;
|
||||
size_t n_data_frames = 10000;
|
||||
double nSigma = 5.0;
|
||||
|
||||
if (argc > 3) n_pedestal_frames = std::atol(argv[3]);
|
||||
if (argc > 4) n_data_frames = std::atol(argv[4]);
|
||||
if (argc > 5) nSigma = std::atof(argv[5]);
|
||||
if (argc > 3)
|
||||
n_pedestal_frames = std::atol(argv[3]);
|
||||
if (argc > 4)
|
||||
n_data_frames = std::atol(argv[4]);
|
||||
if (argc > 5)
|
||||
nSigma = std::atof(argv[5]);
|
||||
|
||||
// Detector geometry from master file
|
||||
constexpr uint8_t cs_x = 3;
|
||||
constexpr uint8_t cs_y = 3;
|
||||
using ClusterType = aare::Cluster<int32_t, cs_x, cs_y>;
|
||||
using FRAME_TYPE = uint16_t;
|
||||
using FRAME_TYPE = uint16_t;
|
||||
using PEDESTAL_TYPE = double;
|
||||
|
||||
// Read actual frame dimensions from the pedestal file
|
||||
@@ -272,40 +312,44 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
printf("=== Cluster Finder: CPU vs CUDA ===\n");
|
||||
printf("Detector: %zu x %zu\n", ROWS, COLS);
|
||||
printf("Cluster: %d x %d\n", ClusterType::cluster_size_x, ClusterType::cluster_size_y);
|
||||
printf("Cluster: %d x %d\n", ClusterType::cluster_size_x,
|
||||
ClusterType::cluster_size_y);
|
||||
printf("nSigma: %.1f\n", nSigma);
|
||||
|
||||
// =========================================================================
|
||||
// Phase 1: Build pedestal from dark frames (sanity check only + frame cache)
|
||||
// Phase 1: Build pedestal from dark frames (sanity check only + frame
|
||||
// cache)
|
||||
// =========================================================================
|
||||
//
|
||||
// Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal object;
|
||||
// both build their own via push_pedestal_frame.
|
||||
// We still read the pedestal file once, but cache the frames in memory so
|
||||
// every subsequent finder can be fed without re-hitting disk. We also
|
||||
// build a standalone Pedestal purely to print a sanity check.
|
||||
// Neither ClusterFinder nor ClusterFinderCUDA needs an external Pedestal
|
||||
// object; both build their own via push_pedestal_frame. We still read the
|
||||
// pedestal file once, but cache the frames in memory so every subsequent
|
||||
// finder can be fed without re-hitting disk. We also build a standalone
|
||||
// Pedestal purely to print a sanity check.
|
||||
// =========================================================================
|
||||
printf("\n--- Phase 1: Pedestal accumulation (sanity check + cache) ---\n");
|
||||
|
||||
|
||||
std::vector<aare::NDArray<FRAME_TYPE, 2>> pedestal_frames;
|
||||
aare::Pedestal<PEDESTAL_TYPE> pedestal(ROWS, COLS, 1000);
|
||||
|
||||
|
||||
{
|
||||
aare::File ped_file(pedestal_path, "r");
|
||||
size_t total_ped = ped_file.total_frames();
|
||||
size_t use_ped = (n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
|
||||
? total_ped : n_pedestal_frames;
|
||||
size_t use_ped =
|
||||
(n_pedestal_frames == 0 || n_pedestal_frames > total_ped)
|
||||
? total_ped
|
||||
: n_pedestal_frames;
|
||||
printf("Pedestal frames: %zu / %zu\n", use_ped, total_ped);
|
||||
|
||||
|
||||
pedestal_frames.reserve(use_ped);
|
||||
|
||||
|
||||
Timer t;
|
||||
t.start();
|
||||
|
||||
|
||||
for (size_t i = 0; i < use_ped; ++i) {
|
||||
auto frame = ped_file.read_frame();
|
||||
auto view = frame.view<FRAME_TYPE>();
|
||||
|
||||
auto view = frame.view<FRAME_TYPE>();
|
||||
|
||||
// Copy into a standalone NDArray that we can reuse as many times
|
||||
// as we have finders to feed.
|
||||
aare::NDArray<FRAME_TYPE, 2> arr({ROWS, COLS});
|
||||
@@ -313,16 +357,16 @@ int main(int argc, char* argv[]) {
|
||||
for (ssize_t c = 0; c < COLS; ++c)
|
||||
arr(r, c) = view(r, c);
|
||||
pedestal_frames.push_back(std::move(arr));
|
||||
|
||||
|
||||
pedestal.push_no_update(view);
|
||||
}
|
||||
pedestal.update_mean();
|
||||
|
||||
|
||||
printf("Pedestal read+cached+built in %.1f ms\n", t.elapsed_ms());
|
||||
}
|
||||
|
||||
printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n",
|
||||
pedestal.mean(0, 0), pedestal.std(0, 0));
|
||||
|
||||
printf("Pedestal mean[0,0] = %.2f, std[0,0] = %.4f\n", pedestal.mean(0, 0),
|
||||
pedestal.std(0, 0));
|
||||
|
||||
// =========================================================================
|
||||
// Phase 2: Read data frames
|
||||
@@ -331,7 +375,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
aare::File data_file(data_path, "r");
|
||||
size_t total_data = data_file.total_frames();
|
||||
size_t use_data = std::min(n_data_frames, total_data);
|
||||
size_t use_data = std::min(n_data_frames, total_data);
|
||||
printf("Data frames: %zu / %zu\n", use_data, total_data);
|
||||
|
||||
// Pre-read all frames into memory to remove I/O from timing
|
||||
@@ -391,32 +435,32 @@ int main(int argc, char* argv[]) {
|
||||
// time makes sense.
|
||||
// =========================================================================
|
||||
printf("\n--- Phase 4: CUDA ClusterFinder ---\n");
|
||||
|
||||
|
||||
std::vector<std::vector<ClusterType>> gpu_results(use_data);
|
||||
size_t gpu_total_clusters = 0;
|
||||
|
||||
|
||||
// --- Single frame on a single CUDA stream ---
|
||||
{
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
|
||||
{ROWS, COLS}, nSigma);
|
||||
|
||||
|
||||
feed_pedestal(cuda_cf, pedestal_frames);
|
||||
|
||||
|
||||
// Warmup: first CUDA call pays driver/context init overhead. The
|
||||
// pedestal drifts slightly during this single frame, which is
|
||||
// acceptable for timing purposes.
|
||||
cuda_cf.find_clusters(frames[0].view(), 0);
|
||||
cuda_cf.steal_clusters(true);
|
||||
|
||||
|
||||
Timer t;
|
||||
t.start();
|
||||
|
||||
|
||||
for (size_t i = 0; i < use_data; ++i) {
|
||||
cuda_cf.find_clusters(frames[i].view(), static_cast<uint64_t>(i));
|
||||
drain_into(cuda_cf, gpu_results[i]);
|
||||
gpu_total_clusters += gpu_results[i].size();
|
||||
}
|
||||
|
||||
|
||||
double gpu_time = t.elapsed_ms();
|
||||
printf("GPU: %zu clusters in %.1f ms (%.2f ms/frame)\n",
|
||||
gpu_total_clusters, gpu_time, gpu_time / use_data);
|
||||
@@ -428,32 +472,33 @@ int main(int argc, char* argv[]) {
|
||||
{
|
||||
constexpr size_t BATCH_SIZE = 100;
|
||||
constexpr int N_STREAMS = 2;
|
||||
|
||||
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
|
||||
{ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
|
||||
|
||||
|
||||
feed_pedestal(cuda_cf, pedestal_frames);
|
||||
|
||||
|
||||
// Contiguous staging buffer reused across batches
|
||||
std::vector<FRAME_TYPE> batch_buffer(BATCH_SIZE * ROWS * COLS);
|
||||
|
||||
|
||||
const size_t n_batches = (use_data + BATCH_SIZE - 1) / BATCH_SIZE;
|
||||
|
||||
|
||||
Timer t;
|
||||
t.start();
|
||||
|
||||
|
||||
for (size_t bi = 0; bi < n_batches; ++bi) {
|
||||
const size_t offset = bi * BATCH_SIZE;
|
||||
const size_t actual_batch = std::min(BATCH_SIZE, use_data - offset);
|
||||
|
||||
pack_frame_batch(frames, offset, actual_batch, batch_buffer);
|
||||
|
||||
|
||||
aare::NDView<FRAME_TYPE, 3> batch_view(
|
||||
batch_buffer.data(),
|
||||
{static_cast<ssize_t>(actual_batch), ROWS, COLS});
|
||||
|
||||
auto batch_results = cuda_cf.find_clusters_batched(batch_view, offset);
|
||||
|
||||
|
||||
auto batch_results = cuda_cf.find_clusters_batched(batch_view,
|
||||
offset);
|
||||
|
||||
for (size_t f = 0; f < actual_batch; ++f) {
|
||||
auto& cv = batch_results[f];
|
||||
auto& out = gpu_results[offset + f];
|
||||
@@ -463,10 +508,11 @@ int main(int argc, char* argv[]) {
|
||||
gpu_total_clusters += out.size();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
double gpu_time = t.elapsed_ms();
|
||||
printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu, streams=%d)\n",
|
||||
gpu_total_clusters, gpu_time, gpu_time / use_data, BATCH_SIZE, N_STREAMS);
|
||||
printf("GPU(batched): %zu clusters in %.1f ms (%.2f ms/frame, batch=%zu,
|
||||
streams=%d)\n", gpu_total_clusters, gpu_time, gpu_time / use_data,
|
||||
BATCH_SIZE, N_STREAMS);
|
||||
}
|
||||
*/
|
||||
|
||||
@@ -484,21 +530,21 @@ int main(int argc, char* argv[]) {
|
||||
for (size_t i = 0; i < use_data; ++i) {
|
||||
auto result = compare_clusters(cpu_results[i], gpu_results[i]);
|
||||
|
||||
total_matched += result.matched;
|
||||
total_data_mismatch += result.data_mismatch;
|
||||
total_cpu_only += result.cpu_only;
|
||||
total_gpu_only += result.gpu_only;
|
||||
total_matched += result.matched;
|
||||
total_data_mismatch += result.data_mismatch;
|
||||
total_cpu_only += result.cpu_only;
|
||||
total_gpu_only += result.gpu_only;
|
||||
|
||||
bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 || result.data_mismatch > 0);
|
||||
bool has_diff = (result.cpu_only > 0 || result.gpu_only > 0 ||
|
||||
result.data_mismatch > 0);
|
||||
if (has_diff) {
|
||||
frames_with_differences++;
|
||||
// Print details for first few mismatching frames
|
||||
if (frames_with_differences <= 5) {
|
||||
printf(" Frame %zu: CPU=%zu GPU=%zu matched=%zu "
|
||||
"data_mismatch=%zu cpu_only=%zu gpu_only=%zu\n",
|
||||
i, result.cpu_count, result.gpu_count,
|
||||
result.matched, result.data_mismatch,
|
||||
result.cpu_only, result.gpu_only);
|
||||
i, result.cpu_count, result.gpu_count, result.matched,
|
||||
result.data_mismatch, result.cpu_only, result.gpu_only);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -510,9 +556,11 @@ int main(int argc, char* argv[]) {
|
||||
printf(" Data mismatch: %zu\n", total_data_mismatch);
|
||||
printf(" CPU only: %zu\n", total_cpu_only);
|
||||
printf(" GPU only: %zu\n", total_gpu_only);
|
||||
printf(" Frames with diffs: %zu / %zu\n", frames_with_differences, use_data);
|
||||
printf(" Frames with diffs: %zu / %zu\n", frames_with_differences,
|
||||
use_data);
|
||||
|
||||
if (total_cpu_only == 0 && total_gpu_only == 0 && total_data_mismatch == 0) {
|
||||
if (total_cpu_only == 0 && total_gpu_only == 0 &&
|
||||
total_data_mismatch == 0) {
|
||||
printf("\n*** PASS: CPU and GPU results match exactly ***\n");
|
||||
} else {
|
||||
printf("\n*** DIFFERENCES DETECTED ***\n");
|
||||
@@ -522,8 +570,10 @@ int main(int argc, char* argv[]) {
|
||||
// if (cpu_total_clusters > 0 || gpu_total_clusters > 0) {
|
||||
// size_t max_clusters_per_frame = 10;
|
||||
// size_t max_frames = 100;
|
||||
// printf("\n--- Cluster details (up to %zu frames, %zu clusters each) ---\n", max_frames, max_clusters_per_frame);
|
||||
// print_cluster_comparison(cpu_results, gpu_results, max_clusters_per_frame, max_frames);
|
||||
// printf("\n--- Cluster details (up to %zu frames, %zu clusters each)
|
||||
// ---\n", max_frames, max_clusters_per_frame);
|
||||
// print_cluster_comparison(cpu_results, gpu_results,
|
||||
// max_clusters_per_frame, max_frames);
|
||||
// }
|
||||
|
||||
// =========================================================================
|
||||
@@ -533,7 +583,7 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
if (use_data > 0) {
|
||||
const int N_ITER = 1000;
|
||||
const auto& bench_frame = frames[0];
|
||||
const auto &bench_frame = frames[0];
|
||||
|
||||
// CPU benchmark
|
||||
{
|
||||
@@ -558,44 +608,43 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
// --- GPU benchmark (single frame, single stream) ---
|
||||
{
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
|
||||
{ROWS, COLS}, nSigma);
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
|
||||
cuda_cf({ROWS, COLS}, nSigma);
|
||||
feed_pedestal(cuda_cf, pedestal_frames);
|
||||
|
||||
|
||||
// Warmup
|
||||
cuda_cf.find_clusters(bench_frame.view(), 0);
|
||||
cuda_cf.steal_clusters(true);
|
||||
|
||||
|
||||
Timer t;
|
||||
t.start();
|
||||
for (int iter = 0; iter < N_ITER; ++iter) {
|
||||
cuda_cf.find_clusters(bench_frame.view(), 0);
|
||||
cuda_cf.steal_clusters(true);
|
||||
}
|
||||
printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single frame, single stream)\n",
|
||||
printf("GPU: %.3f ms/frame (H2D + kernel + D2H, single "
|
||||
"frame, single stream)\n",
|
||||
t.elapsed_ms() / N_ITER);
|
||||
}
|
||||
|
||||
|
||||
// --- GPU benchmark (batched + multi-streamed) ---
|
||||
{
|
||||
constexpr size_t BATCH_SIZE = 500;
|
||||
constexpr int N_STREAMS = 10;
|
||||
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE> cuda_cf(
|
||||
{ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
|
||||
constexpr int N_STREAMS = 10;
|
||||
|
||||
aare::ClusterFinderCUDA<ClusterType, FRAME_TYPE, PEDESTAL_TYPE>
|
||||
cuda_cf({ROWS, COLS}, nSigma, 1'000'000, N_STREAMS);
|
||||
feed_pedestal(cuda_cf, pedestal_frames);
|
||||
|
||||
|
||||
// Build one contiguous batch of BATCH_SIZE copies of bench_frame
|
||||
std::vector<FRAME_TYPE> batch(BATCH_SIZE * ROWS * COLS);
|
||||
for (size_t k = 0; k < BATCH_SIZE; ++k)
|
||||
std::memcpy(batch.data() + k * ROWS * COLS,
|
||||
bench_frame.data(),
|
||||
std::memcpy(batch.data() + k * ROWS * COLS, bench_frame.data(),
|
||||
ROWS * COLS * sizeof(FRAME_TYPE));
|
||||
|
||||
|
||||
aare::NDView<FRAME_TYPE, 3> batch_view(
|
||||
batch.data(),
|
||||
{static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
|
||||
|
||||
batch.data(), {static_cast<ssize_t>(BATCH_SIZE), ROWS, COLS});
|
||||
|
||||
// Warmup. The kernel mutates the device-side pedestal for every
|
||||
// non-photon pixel, so after a 500-frame warmup the pedestal
|
||||
// state has drifted. Reset to a clean baseline by clearing the
|
||||
@@ -604,17 +653,19 @@ int main(int argc, char* argv[]) {
|
||||
(void)cuda_cf.find_clusters_batched(batch_view, 0);
|
||||
cuda_cf.clear_pedestal();
|
||||
feed_pedestal(cuda_cf, pedestal_frames);
|
||||
|
||||
const size_t n_iter_batches = (N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
|
||||
|
||||
|
||||
const size_t n_iter_batches =
|
||||
(N_ITER + BATCH_SIZE - 1) / BATCH_SIZE;
|
||||
|
||||
Timer t;
|
||||
t.start();
|
||||
for (size_t b = 0; b < n_iter_batches; ++b) {
|
||||
(void)cuda_cf.find_clusters_batched(batch_view, b * BATCH_SIZE);
|
||||
}
|
||||
printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, batch=%zu, streams=%d)\n",
|
||||
t.elapsed_ms() / (n_iter_batches * BATCH_SIZE),
|
||||
BATCH_SIZE, N_STREAMS);
|
||||
printf("GPU(batched): %.3f ms/frame (H2D + kernel + D2H, "
|
||||
"batch=%zu, streams=%d)\n",
|
||||
t.elapsed_ms() / (n_iter_batches * BATCH_SIZE), BATCH_SIZE,
|
||||
N_STREAMS);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user