mirror of
https://github.com/slsdetectorgroup/aare.git
synced 2026-06-05 06:58:42 +02:00
Add Python bindings for CUDA cluster finder
- Add bind_ClusterFinderCUDA.hpp with pybind11 bindings for ClusterFinderCUDA
- Build CUDA bindings as a separate _aare_cuda.so to avoid segfaults from mixing nvcc- and gcc-compiled code in the same shared object
- Re-export CUDA classes onto _aare in __init__.py so user code uses `from aare import ClusterFinderCUDA` regardless of which .so hosts the class
- Factory in ClusterFinder.py selects the backend; raises RuntimeError if the GPU is requested on a CPU-only build
- Update python/CMakeLists.txt: _aare_cuda module gated behind AARE_CUDA and AARE_PYTHON_BINDINGS
- Add validation notebook: ~20x speedup vs. sequential ClusterFinder
This commit is contained in:
@@ -0,0 +1,101 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
#pragma once
|
||||
#include "aare/ClusterFinderCUDA.hpp"
|
||||
#include "aare/ClusterVector.hpp"
|
||||
#include "aare/NDView.hpp"
|
||||
#include "aare/Pedestal.hpp"
|
||||
#include "np_helper.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <pybind11/pybind11.h>
|
||||
// #include <pybind11/stl.h>
|
||||
#include <pybind11/stl_bind.h>
|
||||
|
||||
// Short alias for the pybind11 namespace, used by every binding below.
namespace py = pybind11;
|
||||
// Pedestal value type: passed as the last template argument of
// ClusterFinderCUDA and used as the element type of the arrays returned by
// the "pedestal" and "noise" properties.
using pd_type = double;
|
||||
|
||||
// NOTE(review): using-directive at header scope leaks into every translation
// unit that includes this header — confirm this is intended.
using namespace aare;
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
|
||||
namespace aare {
|
||||
|
||||
template <typename T, uint8_t ClusterSizeX, uint8_t ClusterSizeY,
|
||||
typename CoordType = uint16_t>
|
||||
void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
|
||||
auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);
|
||||
|
||||
using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
|
||||
using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
|
||||
|
||||
py::class_<CF>(m, class_name.c_str())
|
||||
.def(py::init<Shape<2>, pd_type, size_t, int>(),
|
||||
py::arg("image_size"),
|
||||
py::arg("n_sigma") = 5.0,
|
||||
py::arg("capacity") = 1'000'000,
|
||||
py::arg("n_streams") = 1)
|
||||
|
||||
.def_property(
|
||||
"nSigma",
|
||||
&CF::get_nSigma,
|
||||
&CF::set_nSigma,
|
||||
R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")
|
||||
|
||||
.def("push_pedestal_frame",
|
||||
[](CF &self, py::array_t<uint16_t> frame) {
|
||||
auto view = make_view_2d(frame);
|
||||
self.push_pedestal_frame(view);
|
||||
})
|
||||
|
||||
.def("clear_pedestal", &CF::clear_pedestal)
|
||||
|
||||
.def_property_readonly(
|
||||
"pedestal",
|
||||
[](CF &self) {
|
||||
auto pd = new NDArray<pd_type, 2>{};
|
||||
*pd = self.pedestal();
|
||||
return return_image_data(pd);
|
||||
})
|
||||
|
||||
.def_property_readonly(
|
||||
"noise",
|
||||
[](CF &self) {
|
||||
auto arr = new NDArray<pd_type, 2>{};
|
||||
*arr = self.noise();
|
||||
return return_image_data(arr);
|
||||
})
|
||||
|
||||
.def(
|
||||
"steal_clusters",
|
||||
[](CF &self, bool realloc_same_capacity) {
|
||||
ClusterVector<ClusterType> clusters =
|
||||
self.steal_clusters(realloc_same_capacity);
|
||||
return clusters;
|
||||
},
|
||||
py::arg("realloc_same_capacity") = false)
|
||||
|
||||
.def(
|
||||
"find_clusters",
|
||||
[](CF &self, py::array_t<uint16_t> frame, uint64_t frame_number) {
|
||||
auto view = make_view_2d(frame);
|
||||
self.find_clusters(view, frame_number);
|
||||
},
|
||||
py::arg("frame"), py::arg("frame_number") = 0)
|
||||
|
||||
.def(
|
||||
"find_clusters_batched",
|
||||
[](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
|
||||
// frames is expected as a 3D numpy array (n_frames, nrows, ncols)
|
||||
auto view = make_view_3d(frames);
|
||||
return self.find_clusters_batched(view, first_frame);
|
||||
},
|
||||
py::arg("frames"), py::arg("first_frame") = 0,
|
||||
R"(Process a 3D array of frames (n_frames, nrows, ncols) in parallel
|
||||
across the configured CUDA streams. Returns a list of ClusterVector, one per
|
||||
input frame.)");
|
||||
}
|
||||
|
||||
} // namespace aare
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
@@ -0,0 +1,67 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
//
|
||||
// CUDA-only Python extension module. Registers ClusterFinderCUDA along with
|
||||
// the ClusterVector and Cluster types it exposes in its return values, so
|
||||
// the module is self-contained — users can call steal_clusters() and get
|
||||
// back a usable ClusterVector without _aare needing to be imported first.
|
||||
|
||||
#include "bind_Cluster.hpp"
|
||||
#include "bind_ClusterVector.hpp"
|
||||
#include "bind_ClusterFinderCUDA.hpp"
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
// Registers the ClusterVector and Cluster bindings for a single
// (value type, size, size, coordinate type) combination on the module object
// `m`, which must be in scope where the macro is expanded. The ClusterVector
// is registered with the string "Cluster<DIM1>x<DIM2><SUFFIX>" and the
// Cluster with "<DIM1>x<DIM2><SUFFIX>".
//
// This is a subset of DEFINE_CLUSTER_BINDINGS from module.cpp: only what
// ClusterFinderCUDA actually returns is registered. File I/O, eta and
// reduce_to_2x2 stay on the CPU side.
//
// The expansion ends with its own ';', so a trailing ';' at the call site is
// just an empty statement.
#define DEFINE_CUDA_CLUSTER_TYPES(VALUE_T, DIM1, DIM2, COORD_T, SUFFIX)        \
    define_ClusterVector<VALUE_T, DIM1, DIM2, COORD_T>(                        \
        m, "Cluster" #DIM1 "x" #DIM2 #SUFFIX);                                 \
    define_Cluster<VALUE_T, DIM1, DIM2, COORD_T>(m, #DIM1 "x" #DIM2 #SUFFIX);
|
||||
|
||||
// Registers one ClusterFinderCUDA binding on the module object `m`, which
// must be in scope where the macro is expanded. The string
// "Cluster<DIM1>x<DIM2><SUFFIX>" is passed as the type string to
// aare::define_ClusterFinderCUDA.
//
// The expansion ends with its own ';', so a trailing ';' at the call site is
// just an empty statement.
#define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(VALUE_T, DIM1, DIM2, COORD_T,       \
                                           SUFFIX)                             \
    aare::define_ClusterFinderCUDA<VALUE_T, DIM1, DIM2, COORD_T>(              \
        m, "Cluster" #DIM1 "x" #DIM2 #SUFFIX);
|
||||
|
||||
// Every (value type, size, size, coordinate type, type code) combination that
// this module exposes, written down once. REGISTER is invoked for each entry
// in the order listed, so the cluster types and the finders always cover the
// same set. 2x2 is left out, matching the original note that SFINAE excludes
// it on ClusterFinderCUDA.
#define AARE_CUDA_FOR_EACH_CLUSTER(REGISTER)                                   \
    REGISTER(int, 3, 3, uint16_t, i);                                          \
    REGISTER(double, 3, 3, uint16_t, d);                                       \
    REGISTER(float, 3, 3, uint16_t, f);                                        \
    REGISTER(int, 5, 5, uint16_t, i);                                          \
    REGISTER(double, 5, 5, uint16_t, d);                                       \
    REGISTER(float, 5, 5, uint16_t, f);                                        \
    REGISTER(int, 7, 7, uint16_t, i);                                          \
    REGISTER(double, 7, 7, uint16_t, d);                                       \
    REGISTER(float, 7, 7, uint16_t, f);                                        \
    REGISTER(int, 9, 9, uint16_t, i);                                          \
    REGISTER(double, 9, 9, uint16_t, d);                                       \
    REGISTER(float, 9, 9, uint16_t, f);

// Entry point of the _aare_cuda extension module.
PYBIND11_MODULE(_aare_cuda, m) {
    // Types go first: the finders reference them in their signatures.
    AARE_CUDA_FOR_EACH_CLUSTER(DEFINE_CUDA_CLUSTER_TYPES)

    // Then one finder per registered cluster type.
    AARE_CUDA_FOR_EACH_CLUSTER(DEFINE_BINDINGS_CLUSTERFINDER_CUDA)
}

#undef AARE_CUDA_FOR_EACH_CLUSTER
#undef DEFINE_CUDA_CLUSTER_TYPES
#undef DEFINE_BINDINGS_CLUSTERFINDER_CUDA
|
||||
Reference in New Issue
Block a user