Add Python bindings for CUDA cluster finder

- Add bind_ClusterFinderCUDA.hpp with pybind11 bindings for ClusterFinderCUDA
- Build CUDA bindings as separate _aare_cuda.so to avoid segfaults from mixing nvcc and gcc compiled code in the same shared object
- Re-export CUDA classes onto _aare in __init__.py so user code uses `from aare import ClusterFinderCUDA` regardless of which .so hosts the class
- Factory in ClusterFinder.py selects backend; RuntimeError if GPU requested on CPU-only build
- Update python/CMakeLists.txt: _aare_cuda module gated behind AARE_CUDA and AARE_PYTHON_BINDINGS
- Add validation notebook: ~20x speedup vs sequential ClusterFinder
2026-07-24 11:12:52 +02:00 · 2026-04-23 11:43:40 +02:00
parent 3ed773e520
commit e894bdac9b
7 changed files with 766 additions and 29 deletions
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: MPL-2.0
+#pragma once
+#include "aare/ClusterFinderCUDA.hpp"
+#include "aare/ClusterVector.hpp"
+#include "aare/NDView.hpp"
+#include "aare/Pedestal.hpp"
+#include "np_helper.hpp"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <pybind11/pybind11.h>
+// #include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+
+namespace py = pybind11; // short alias used by every binding below
+using pd_type = double; // floating-point type used for n_sigma and for the pedestal/noise arrays
+
+using namespace aare; // NOTE(review): file-scope using-directive in a header; it is visible to every file that includes this one
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+namespace aare {
+
+// Registers the Python class `ClusterFinderCUDA_<typestr>` on module `m`.
+//
+// The bound C++ type is
+//   ClusterFinderCUDA<Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>,
+//                     uint16_t, pd_type>
+// Frames are taken as numpy arrays of uint16_t and wrapped with
+// make_view_2d / make_view_3d before being forwarded to the finder.
+//
+// m        module that receives the class
+// typestr  suffix appended to "ClusterFinderCUDA_" to form the class name
+template <typename T, uint8_t ClusterSizeX, uint8_t ClusterSizeY,
+          typename CoordType = uint16_t>
+void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
+    auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);
+
+    using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
+    using CF          = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;
+
+    py::class_<CF>(m, class_name.c_str())
+        .def(py::init<Shape<2>, pd_type, size_t, int>(),
+             py::arg("image_size"),
+             py::arg("n_sigma")    = 5.0,
+             py::arg("capacity")   = 1'000'000,
+             py::arg("n_streams")  = 1)
+
+        .def_property(
+            "nSigma",
+            &CF::get_nSigma,
+            &CF::set_nSigma,
+            R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")
+
+        .def("push_pedestal_frame",
+             [](CF &self, py::array_t<uint16_t> frame) {
+                 auto view = make_view_2d(frame);
+                 self.push_pedestal_frame(view);
+             })
+
+        .def("clear_pedestal", &CF::clear_pedestal)
+
+        .def_property_readonly(
+            "pedestal",
+            [](CF &self) {
+                // Keep the heap buffer in a unique_ptr while it is filled so
+                // it is freed if pedestal() or the assignment throws. The raw
+                // pointer is handed to return_image_data only afterwards
+                // (presumably it takes ownership — confirm in np_helper.hpp).
+                auto pd = std::make_unique<NDArray<pd_type, 2>>();
+                *pd = self.pedestal();
+                return return_image_data(pd.release());
+            })
+
+        .def_property_readonly(
+            "noise",
+            [](CF &self) {
+                // Same exception-safe hand-over as for "pedestal".
+                auto arr = std::make_unique<NDArray<pd_type, 2>>();
+                *arr = self.noise();
+                return return_image_data(arr.release());
+            })
+
+        .def(
+            "steal_clusters",
+            [](CF &self, bool realloc_same_capacity) {
+                ClusterVector<ClusterType> clusters =
+                    self.steal_clusters(realloc_same_capacity);
+                return clusters;
+            },
+            py::arg("realloc_same_capacity") = false)
+
+        .def(
+            "find_clusters",
+            [](CF &self, py::array_t<uint16_t> frame, uint64_t frame_number) {
+                auto view = make_view_2d(frame);
+                self.find_clusters(view, frame_number);
+            },
+            py::arg("frame"), py::arg("frame_number") = 0)
+
+        .def(
+            "find_clusters_batched",
+            [](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
+                // frames is expected as a 3D numpy array (n_frames, nrows, ncols)
+                auto view = make_view_3d(frames);
+                // NOTE(review): the return value is converted by pybind11; if
+                // it is a std::vector, a caster (pybind11/stl.h) or an
+                // explicit bind_vector must be visible in this TU — confirm.
+                return self.find_clusters_batched(view, first_frame);
+            },
+            py::arg("frames"), py::arg("first_frame") = 0,
+            R"(Process a 3D array of frames (n_frames, nrows, ncols) in parallel
+across the configured CUDA streams. Returns a list of ClusterVector, one per
+input frame.)");
+}
+
+} // namespace aare
+
+#pragma GCC diagnostic pop
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MPL-2.0
+//
+// CUDA-only Python extension module. Registers ClusterFinderCUDA along with
+// the ClusterVector and Cluster types it exposes in its return values, so
+// the module is self-contained — users can call steal_clusters() and get
+// back a usable ClusterVector without _aare needing to be imported first.
+
+#include "bind_Cluster.hpp"
+#include "bind_ClusterVector.hpp"
+#include "bind_ClusterFinderCUDA.hpp"
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+// Register the Cluster + ClusterVector pair for one (T, N, M) combination.
+// Subset of DEFINE_CLUSTER_BINDINGS from module.cpp: we register what
+// ClusterFinderCUDA actually returns, nothing more. File I/O, eta and
+// reduce_to_2x2 stay on the CPU side.
+#define DEFINE_CUDA_CLUSTER_TYPES(T, N, M, U, TYPE_CODE)                       \
+    define_ClusterVector<T, N, M, U>(m, "Cluster" #N "x" #M #TYPE_CODE);       \
+    define_Cluster<T, N, M, U>(m, #N "x" #M #TYPE_CODE);
+
+#define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE)              \
+    aare::define_ClusterFinderCUDA<T, N, M, U>(                                \
+        m, "Cluster" #N "x" #M #TYPE_CODE);
+
+// Expands MACRO once per supported value type — int ("i"), double ("d"),
+// float ("f"), in that order — for an N x N cluster with uint16_t coordinates.
+#define AARE_CUDA_FOR_EACH_VALUE_TYPE(MACRO, N)                                \
+    MACRO(int, N, N, uint16_t, i);                                             \
+    MACRO(double, N, N, uint16_t, d);                                          \
+    MACRO(float, N, N, uint16_t, f)
+
+PYBIND11_MODULE(_aare_cuda, m) {
+
+    // Cluster / ClusterVector types are registered before the finders,
+    // because the finder signatures refer to them. 2x2 is left out: SFINAE
+    // excludes it on ClusterFinderCUDA.
+    // NOTE(review): if _aare binds the same C++ types without
+    // py::module_local(), importing both modules in one interpreter may raise
+    // an "already registered" error — confirm against bind_ClusterVector.hpp.
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_CUDA_CLUSTER_TYPES, 3);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_CUDA_CLUSTER_TYPES, 5);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_CUDA_CLUSTER_TYPES, 7);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_CUDA_CLUSTER_TYPES, 9);
+
+    // Finders, same sizes and value types as above.
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_BINDINGS_CLUSTERFINDER_CUDA, 3);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_BINDINGS_CLUSTERFINDER_CUDA, 5);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_BINDINGS_CLUSTERFINDER_CUDA, 7);
+    AARE_CUDA_FOR_EACH_VALUE_TYPE(DEFINE_BINDINGS_CLUSTERFINDER_CUDA, 9);
+}
+
+#undef AARE_CUDA_FOR_EACH_VALUE_TYPE
+#undef DEFINE_CUDA_CLUSTER_TYPES
+#undef DEFINE_BINDINGS_CLUSTERFINDER_CUDA