Add Python bindings for CUDA cluster finder
Build on RHEL8 / build (push) Successful in 2m50s
Build on RHEL9 / build (push) Successful in 2m57s
Run tests using data on local RHEL8 / build (push) Successful in 3m38s

- Add bind_ClusterFinderCUDA.hpp with pybind11 bindings for
  ClusterFinderCUDA
- Build CUDA bindings as separate _aare_cuda.so to avoid
  segfaults from mixing nvcc and gcc compiled code in the
  same shared object
- Re-export CUDA classes onto _aare in __init__.py so user
  code uses `from aare import ClusterFinderCUDA` regardless
  of which .so hosts the class
- Factory in ClusterFinder.py selects backend; RuntimeError
  if GPU requested on CPU-only build
- Update python/CMakeLists.txt: _aare_cuda module gated
  behind AARE_CUDA and AARE_PYTHON_BINDINGS
- Add validation notebook: ~20x speedup vs sequential ClusterFinder
This commit is contained in:
kferjaoui
2026-04-23 11:43:40 +02:00
parent 3ed773e520
commit e894bdac9b
7 changed files with 766 additions and 29 deletions
+50 -15
View File
@@ -15,20 +15,50 @@ else()
find_package(pybind11 2.13 REQUIRED)
endif()
# Add the compiled python extension
pybind11_add_module(
_aare # name of the module
src/module.cpp # source file
)
set_target_properties(_aare PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
)
# ---- Main CPU module --------------------------------------------------------
# The _aare extension is built from src/module.cpp alone. With AARE_CUDA=ON the
# CUDA bindings are NOT compiled into this target: they are built as a second
# Python extension (_aare_cuda) that is loaded on its own at runtime. Keeping
# the nvcc-compiled translation unit in a separate ELF image is the project's
# guard against weak-symbol collisions between gcc-emitted and nvcc-emitted
# template instantiations disturbing pybind11's type registry.
pybind11_add_module(
  _aare
  NO_EXTRAS
  src/module.cpp
)

target_link_libraries(
  _aare
  PRIVATE
    aare_core
    aare_compiler_flags
)

# Minuit2 headers are added as SYSTEM includes for this target.
target_include_directories(
  _aare
  SYSTEM PRIVATE
    $<TARGET_PROPERTY:Minuit2::Minuit2,INTERFACE_INCLUDE_DIRECTORIES>
)

# Place the built module in <build>/aare and keep IPO switched off for it.
set_property(TARGET _aare PROPERTY LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/aare)
set_property(TARGET _aare PROPERTY INTERPROCEDURAL_OPTIMIZATION FALSE)
# ---- CUDA module (separate .so) --------------------------------------------
# Built only when AARE_CUDA is enabled. src/cuda_bindings.cu is compiled into
# its own Python extension, _aare_cuda, which is written to <build>/aare next
# to _aare.
if(AARE_CUDA)
  pybind11_add_module(_aare_cuda NO_EXTRAS src/cuda_bindings.cu)

  target_link_libraries(_aare_cuda PRIVATE aare_cuda aare_compiler_flags)

  # Minuit2 headers are added as SYSTEM includes for this target.
  target_include_directories(_aare_cuda SYSTEM PRIVATE
    $<TARGET_PROPERTY:Minuit2::Minuit2,INTERFACE_INCLUDE_DIRECTORIES>
  )

  set_target_properties(_aare_cuda PROPERTIES
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/aare"
    INTERPROCEDURAL_OPTIMIZATION FALSE
    # Relocatable device code plus a device-link step for this module.
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
    CUDA_SEPARABLE_COMPILATION ON
    # Position-independent code and hidden symbol visibility are requested
    # through target properties instead of hand-written -Xcompiler flags, so
    # CMake emits the spelling that matches the active host compiler.
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET hidden
  )

  # nvcc-only front-end switches; the generator expressions keep them away
  # from any non-CUDA source that might be added to this target later.
  target_compile_options(_aare_cuda PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
    $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
  )
endif()
# List of python files to be copied to the build directory
set( PYTHON_FILES
@@ -51,9 +81,9 @@ foreach(FILE ${PYTHON_FILES})
configure_file(${FILE} ${CMAKE_BINARY_DIR}/${FILE} )
endforeach(FILE ${PYTHON_FILES})
set_target_properties(_aare PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/aare
)
# The output directory of _aare (${CMAKE_BINARY_DIR}/aare) is configured next
# to its pybind11_add_module() call, so it is not set again at this point.
set(PYTHON_EXAMPLES
examples/play.py
@@ -69,8 +99,13 @@ endforeach(FILE ${PYTHON_EXAMPLES})
if(AARE_INSTALL_PYTHONEXT)
set(AARE_PY_INSTALL_TARGETS _aare)
if(AARE_CUDA)
list(APPEND AARE_PY_INSTALL_TARGETS _aare_cuda)
endif()
install(
TARGETS _aare
TARGETS ${AARE_PY_INSTALL_TARGETS}
EXPORT "${TARGETS_EXPORT_NAME}"
LIBRARY DESTINATION aare
COMPONENT python
@@ -80,5 +115,5 @@ if(AARE_INSTALL_PYTHONEXT)
FILES ${PYTHON_FILES}
DESTINATION aare
COMPONENT python
)
)
endif()
+38
View File
@@ -49,6 +49,44 @@ def ClusterFinderMT(image_size, cluster_size = (3,3), dtype=np.int32, n_sigma=5,
return cls(image_size, n_sigma=n_sigma, capacity=capacity, n_threads=n_threads)
def _cuda_available():
    """Tell whether the CUDA cluster finder classes are reachable through ``_aare``.

    Returns
    -------
    bool
        ``True`` when ``_aare`` exposes the attribute
        ``ClusterFinderCUDA_Cluster3x3i`` (used as the probe for a build made
        with -DAARE_CUDA=ON), ``False`` otherwise.
    """
    try:
        _aare.ClusterFinderCUDA_Cluster3x3i
    except AttributeError:
        return False
    return True
def ClusterFinderCUDA(image_size, cluster_size=(3,3), n_sigma=5, dtype=np.int32,
                      capacity=1024, n_streams=1):
    """
    Build a ClusterFinderCUDA instance for the requested cluster size and dtype.

    This factory hides the templated C++ ClusterFinderCUDA behind a single
    Python call. Its arguments follow ClusterFinder(), with ``n_streams`` as
    the additional CUDA-specific setting.

    The concrete class is looked up with ``_get_class("ClusterFinderCUDA",
    cluster_size, dtype)`` and constructed with ``image_size`` plus the
    keyword arguments ``n_sigma``, ``capacity`` and ``n_streams``.

    Raises
    ------
    RuntimeError
        If ``_cuda_available()`` reports that the CUDA classes are missing.

    .. code-block:: python

        from aare import ClusterFinderCUDA

        cf = ClusterFinderCUDA(image_size=(512, 1024),
                               cluster_size=(3, 3),
                               n_sigma=5,
                               n_streams=4)

        for frame in pedestal_frames:
            cf.push_pedestal_frame(frame)

        for i, frame in enumerate(data_frames):
            cf.find_clusters(frame, frame_number=i)

        clusters = cf.steal_clusters()
    """
    if _cuda_available():
        # Resolve the concrete bound class, then forward the tuning knobs.
        finder_type = _get_class("ClusterFinderCUDA", cluster_size, dtype)
        options = {
            "n_sigma": n_sigma,
            "capacity": capacity,
            "n_streams": n_streams,
        }
        return finder_type(image_size, **options)

    raise RuntimeError(
        "ClusterFinderCUDA is not available in this build of aare. "
        "Rebuild with -DAARE_CUDA=ON (and -DAARE_PYTHON_BINDINGS=ON)."
    )
def ClusterCollector(clusterfindermt, dtype=np.int32):
"""
Factory function to create a ClusterCollector object. Provides a cleaner syntax for
+18
View File
@@ -2,6 +2,23 @@
# Make the compiled classes that live in _aare available from aare.
from . import _aare
# ---- CUDA module (optional) ------------------------------------------------
# When the package was built with AARE_CUDA=ON, a sibling extension
# _aare_cuda contains the ClusterFinderCUDA_* classes. They are re-exported
# onto _aare so user code can do `from aare import ClusterFinderCUDA_*`
# regardless of which .so physically hosts the class.
#
# Two failure modes are kept apart on purpose:
#   * The extension does not exist (CPU-only build): stay silent. The
#     ClusterFinderCUDA_* classes simply aren't present and the factory in
#     ClusterFinder.py reports that with a RuntimeError.
#   * The extension exists but cannot be imported (the import system raised
#     an ImportError for some other reason, e.g. a shared library it depends
#     on could not be loaded): emit a RuntimeWarning carrying the original
#     message, so the problem is not mistaken for a CPU-only build.
import importlib as _importlib
import warnings as _warnings

_aare_cuda_mod = None
try:
    _aare_cuda_mod = _importlib.import_module("._aare_cuda", __package__)
except ImportError as _exc:
    # ModuleNotFoundError whose .name is the extension itself means "not
    # built"; every other ImportError means "built but unusable".
    _cuda_ext_name = "{}._aare_cuda".format(__package__)
    _cuda_ext_absent = (
        isinstance(_exc, ModuleNotFoundError) and _exc.name == _cuda_ext_name
    )
    if not _cuda_ext_absent:
        _warnings.warn(
            "aare: the CUDA extension _aare_cuda is present but could not be "
            "imported; ClusterFinderCUDA is disabled. Reason: {}".format(_exc),
            RuntimeWarning,
        )

if _aare_cuda_mod is not None:
    for _name in dir(_aare_cuda_mod):
        if _name.startswith("ClusterFinderCUDA"):
            setattr(_aare, _name, getattr(_aare_cuda_mod, _name))
    del _name
from . import transform
from ._aare import File, RawMasterFile, RawSubFile, JungfrauDataFile
@@ -14,6 +31,7 @@ from ._aare import corner
# from ._aare import ClusterFinderMT, ClusterCollector, ClusterFileSink, ClusterVector_i
from .ClusterFinder import ClusterFinder, ClusterCollector, ClusterFinderMT, ClusterFileSink, ClusterFile
from .ClusterFinder import ClusterFinderCUDA, _cuda_available
from .ClusterVector import ClusterVector
from .Cluster import Cluster
+101
View File
@@ -0,0 +1,101 @@
// SPDX-License-Identifier: MPL-2.0
#pragma once
#include "aare/ClusterFinderCUDA.hpp"
#include "aare/ClusterVector.hpp"
#include "aare/NDView.hpp"
#include "aare/Pedestal.hpp"
#include "np_helper.hpp"
#include <cstdint>
#include <pybind11/pybind11.h>
// #include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
namespace py = pybind11;
using pd_type = double;
using namespace aare;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
namespace aare {
/**
 * @brief Register the Python class `ClusterFinderCUDA_<typestr>` on module @p m.
 *
 * The bound C++ type is
 * `ClusterFinderCUDA<Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>, uint16_t, pd_type>`.
 * The Python class exposes:
 *  - a constructor `(image_size, n_sigma=5.0, capacity=1000000, n_streams=1)`
 *  - the read/write property `nSigma`
 *  - `push_pedestal_frame(frame)` and `clear_pedestal()`
 *  - the read-only properties `pedestal` and `noise`
 *  - `steal_clusters(realloc_same_capacity=False)`
 *  - `find_clusters(frame, frame_number=0)`
 *  - `find_clusters_batched(frames, first_frame=0)`
 *
 * @tparam T            value type stored in each cluster
 * @tparam ClusterSizeX cluster extent in x
 * @tparam ClusterSizeY cluster extent in y
 * @tparam CoordType    coordinate type of the cluster (defaults to uint16_t)
 * @param m             pybind11 module that receives the class
 * @param typestr       suffix appended to "ClusterFinderCUDA_" to form the
 *                      Python class name
 */
template <typename T, uint8_t ClusterSizeX, uint8_t ClusterSizeY,
          typename CoordType = uint16_t>
void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
    auto class_name = fmt::format("ClusterFinderCUDA_{}", typestr);

    using ClusterType = Cluster<T, ClusterSizeX, ClusterSizeY, CoordType>;
    using CF = ClusterFinderCUDA<ClusterType, uint16_t, pd_type>;

    py::class_<CF>(m, class_name.c_str())
        .def(py::init<Shape<2>, pd_type, size_t, int>(),
             py::arg("image_size"),
             py::arg("n_sigma") = 5.0,
             py::arg("capacity") = 1'000'000,
             py::arg("n_streams") = 1)

        .def_property(
            "nSigma",
            &CF::get_nSigma,
            &CF::set_nSigma,
            R"(Number of sigma above the pedestal to consider a photon during cluster finding.)")

        .def("push_pedestal_frame",
             [](CF &self, py::array_t<uint16_t> frame) {
                 auto view = make_view_2d(frame);
                 self.push_pedestal_frame(view);
             })

        .def("clear_pedestal", &CF::clear_pedestal)

        .def_property_readonly(
            "pedestal",
            [](CF &self) {
                // Build the heap copy straight from the returned value. If
                // pedestal() or the constructor throws, the new-expression
                // releases the storage itself, so nothing leaks.
                // NOTE(review): return_image_data presumably takes ownership
                // of the pointer -- confirm in np_helper.hpp.
                auto pd = new NDArray<pd_type, 2>(self.pedestal());
                return return_image_data(pd);
            })

        .def_property_readonly(
            "noise",
            [](CF &self) {
                // Same construction pattern as the "pedestal" getter.
                auto arr = new NDArray<pd_type, 2>(self.noise());
                return return_image_data(arr);
            })

        .def(
            "steal_clusters",
            [](CF &self, bool realloc_same_capacity) {
                ClusterVector<ClusterType> clusters =
                    self.steal_clusters(realloc_same_capacity);
                return clusters;
            },
            py::arg("realloc_same_capacity") = false)

        .def(
            "find_clusters",
            [](CF &self, py::array_t<uint16_t> frame, uint64_t frame_number) {
                auto view = make_view_2d(frame);
                self.find_clusters(view, frame_number);
            },
            py::arg("frame"), py::arg("frame_number") = 0)

        .def(
            "find_clusters_batched",
            [](CF &self, py::array_t<uint16_t> frames, uint64_t first_frame) {
                // frames is expected as a 3D numpy array (n_frames, nrows, ncols)
                // NOTE(review): if find_clusters_batched returns a standard
                // container, its conversion to a Python list needs
                // pybind11/stl.h (commented out in this header) or a bound
                // container type from the including translation unit --
                // confirm.
                auto view = make_view_3d(frames);
                return self.find_clusters_batched(view, first_frame);
            },
            py::arg("frames"), py::arg("first_frame") = 0,
            R"(Process a 3D array of frames (n_frames, nrows, ncols) in parallel
across the configured CUDA streams. Returns a list of ClusterVector, one per
input frame.)");
}
} // namespace aare
#pragma GCC diagnostic pop
+67
View File
@@ -0,0 +1,67 @@
// SPDX-License-Identifier: MPL-2.0
//
// CUDA-only Python extension module. Registers ClusterFinderCUDA along with
// the ClusterVector and Cluster types it exposes in its return values, so
// the module is self-contained — users can call steal_clusters() and get
// back a usable ClusterVector without _aare needing to be imported first.
#include "bind_Cluster.hpp"
#include "bind_ClusterVector.hpp"
#include "bind_ClusterFinderCUDA.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11;
// Single list of every (T, N, M, CoordType, type code) combination this module
// exposes. X is invoked once per entry, in the order written here. 2x2 is
// left out because SFINAE excludes it on ClusterFinderCUDA.
#define AARE_CUDA_CLUSTER_COMBINATIONS(X)                                      \
    X(int, 3, 3, uint16_t, i)                                                  \
    X(double, 3, 3, uint16_t, d)                                               \
    X(float, 3, 3, uint16_t, f)                                                \
    X(int, 5, 5, uint16_t, i)                                                  \
    X(double, 5, 5, uint16_t, d)                                               \
    X(float, 5, 5, uint16_t, f)                                                \
    X(int, 7, 7, uint16_t, i)                                                  \
    X(double, 7, 7, uint16_t, d)                                               \
    X(float, 7, 7, uint16_t, f)                                                \
    X(int, 9, 9, uint16_t, i)                                                  \
    X(double, 9, 9, uint16_t, d)                                               \
    X(float, 9, 9, uint16_t, f)

// Registers the ClusterVector and Cluster bindings for one combination.
// This is a subset of DEFINE_CLUSTER_BINDINGS from module.cpp: only what
// ClusterFinderCUDA hands back is registered. File I/O, eta and
// reduce_to_2x2 stay on the CPU side.
#define DEFINE_CUDA_CLUSTER_TYPES(T, N, M, U, TYPE_CODE)                       \
    define_ClusterVector<T, N, M, U>(m, "Cluster" #N "x" #M #TYPE_CODE);       \
    define_Cluster<T, N, M, U>(m, #N "x" #M #TYPE_CODE);

// Registers the ClusterFinderCUDA binding for one combination.
#define DEFINE_BINDINGS_CLUSTERFINDER_CUDA(T, N, M, U, TYPE_CODE)              \
    aare::define_ClusterFinderCUDA<T, N, M, U>(                                \
        m, "Cluster" #N "x" #M #TYPE_CODE);

PYBIND11_MODULE(_aare_cuda, m) {
    // Value types go first: the finders mention them in their signatures.
    AARE_CUDA_CLUSTER_COMBINATIONS(DEFINE_CUDA_CLUSTER_TYPES)

    // Then one finder per combination, in the same order.
    AARE_CUDA_CLUSTER_COMBINATIONS(DEFINE_BINDINGS_CLUSTERFINDER_CUDA)
}

#undef DEFINE_CUDA_CLUSTER_TYPES
#undef DEFINE_BINDINGS_CLUSTERFINDER_CUDA
#undef AARE_CUDA_CLUSTER_COMBINATIONS
File diff suppressed because one or more lines are too long