Add per-frame kernel timing via CUDA events
Build on RHEL8 / build (push) Successful in 3m13s
Build on RHEL9 / build (push) Successful in 3m37s
Run tests using data on local RHEL8 / build (push) Successful in 3m51s

This commit is contained in:
kferjaoui
2026-04-28 13:09:25 +02:00
parent ac96d1f688
commit 34e69a8065
3 changed files with 147 additions and 73 deletions
+9 -2
View File
@@ -87,8 +87,15 @@ void define_ClusterFinderCUDA(py::module &m, const std::string &typestr) {
},
py::arg("frames"), py::arg("first_frame") = 0,
R"(Process a 3D array of frames (n_frames, nrows, ncols) in parallel
across the configured CUDA streams. Returns a list of ClusterVector, one per
input frame.)");
across the configured CUDA streams. Returns a list of ClusterVector, one per
input frame.)")
.def("avg_kernel_time_ms", &CF::avg_kernel_time_ms,
R"(Average kernel execution time per frame in milliseconds,
excluding PCIe transfers. Use wall_time - avg_kernel_time to estimate transfer overhead.)")
.def("reset_timers", &CF::reset_timers,
R"(Reset the internal kernel timing counters.)");
}
} // namespace aare
File diff suppressed because one or more lines are too long