mirror of
https://github.com/slsdetectorgroup/aare.git
synced 2026-06-09 13:08:41 +02:00
6a12e3de24
Rework the multi-stream pipeline to eliminate per-frame sync barriers and fix the D2H staging architecture.

Sync reduction:
- Replace one cudaStreamSynchronize per frame with one per stream per batch, cutting synchronisation calls from O(n_frames x n_streams) to O(n_streams)
- Introduce a unified per-frame D2H output layout [uint32_t count | clusters[max]] stored in a single class-level lazy-allocated pinned pool (h_output_pinned), replacing the per-stream separate cluster/count device buffers
- Move CUDA event pool from per-stream fixed-size to per-frame-slot lazy-allocated, enabling correct kernel timing across any batch size

Pinned H2D without CPU-side copy:
- Add register_input_buffer(ptr, bytes) / unregister_input_buffer() wrapping cudaHostRegister so callers can pin their existing batch buffer once; all find_clusters_batched() slices then transfer at DMA speed (~22 GB/s) instead of ~15 GB/s for pageable, with no extra memcpy or WC-memory penalty

Result (RTX 4090, 400x400 uint16, 3x3 clusters, batch=2000, 5 streams):
Before: ~34 µs/frame -> After: ~28 µs/frame (−18 %)
31 lines
921 B
Makefile
31 lines
921 B
Makefile
# ---- Toolchain and flags ---------------------------------------------------
# All values can be overridden on the command line, e.g. `make ARCH=-arch=sm_80`.

# Host C++ compiler; handed to nvcc through -ccbin in CXXFLAGS below.
CXX := /usr/bin/c++

# CUDA compiler driver used by every build rule in this file.
NVCC := nvcc

# GPU architecture flag passed to nvcc (sm_89 = compute capability 8.9).
ARCH := -arch=sm_89

# Despite the name, these flags are consumed by nvcc, not by $(CXX) directly:
# --extended-lambda and -ccbin are nvcc options.
CXXFLAGS := -std=c++17 -O3 --extended-lambda -ccbin $(CXX)

# Header search paths: the project headers plus the fmt headers found under
# ../build/_deps (NOTE(review): presumably populated by the main project
# build -- confirm that ../build exists before running this Makefile).
INCLUDES := -I../include -I../build/_deps/fmt-src/include

# Library search paths matching the include layout above.
LDFLAGS := -L../build -L../build/_deps/fmt-build

# Libraries linked into both test binaries.
LIBS := -laare_core -lfmt -lstdc++fs

# Preprocessor definitions applied to every compile.
DEFINES := -DAARE_LOG_LEVEL=logERROR

# ---- Build products and their sources --------------------------------------

# Output executables: the "old" variant and the current one.
TARGET_OLD := test_cf_cuda_old
TARGET := test_cf_cuda

# One CUDA translation unit per executable.
SRC_OLD := ClusterFinderCUDA_old.test.cu
SRC := ClusterFinderCUDA.test.cu

# Per-source dependency-file names (.cu -> .d). They are pulled in by the
# `-include` at the end of this file and removed by `clean`.
DEP := $(SRC:.cu=.d) $(SRC_OLD:.cu=.d)

# Default goal (first rule in the file): build both test executables.
all: $(TARGET) $(TARGET_OLD)

# Compile and link the current cluster-finder test in a single nvcc invocation.
#   -Xptxas=-v  forwards -v to ptxas (verbose per-kernel output).
#   -MMD -MP    make nvcc also write a dependency file listing the non-system
#               headers it read, with a phony target per header so a deleted
#               header does not break the next build.
#   -MF / -MT   write that file next to the source (.cu -> .d, the name $(DEP)
#               expects) and name this executable as the rule target inside it.
# The kernel header stays listed explicitly so the very first build, before
# any .d file exists, still tracks it.
$(TARGET): $(SRC) ../include/aare/clusterfinder_kernel.cuh
	$(NVCC) -Xptxas=-v $(ARCH) $(CXXFLAGS) $(DEFINES) $(INCLUDES) \
		-MMD -MP -MF $(<:.cu=.d) -MT $@ \
		$(LDFLAGS) $< -o $@ $(LIBS)

# Compile and link the "old" cluster-finder test in a single nvcc invocation.
#   -Xptxas=-v  forwards -v to ptxas (verbose per-kernel output).
#   -MMD -MP    make nvcc also write a dependency file listing the non-system
#               headers it read, with a phony target per header so a deleted
#               header does not break the next build.
#   -MF / -MT   write that file next to the source (.cu -> .d, the name $(DEP)
#               expects) and name this executable as the rule target inside it.
# The kernel header stays listed explicitly so the very first build, before
# any .d file exists, still tracks it.
$(TARGET_OLD): $(SRC_OLD) ../include/aare/clusterfinder_kernel.cuh
	$(NVCC) -Xptxas=-v $(ARCH) $(CXXFLAGS) $(DEFINES) $(INCLUDES) \
		-MMD -MP -MF $(<:.cu=.d) -MT $@ \
		$(LDFLAGS) $< -o $@ $(LIBS)

# Remove both executables and any dependency files.
# $(RM) is make's built-in `rm -f`, so missing files are not an error.
clean:
	$(RM) $(TARGET) $(TARGET_OLD) $(DEP)

# Pull in the dependency files when they exist; the leading '-' keeps make
# quiet when they do not (e.g. on a fresh checkout or after `make clean`).
-include $(DEP)

# `all` and `clean` are commands, not files: always run them when requested.
.PHONY: all clean