mirror of
https://github.com/slsdetectorgroup/aare.git
synced 2026-07-01 19:50:51 +02:00
- Use per-stream pinned host staging buffers for truly async CUDA transfers. - Avoid reserving full device capacity per result frame. - Reduce kernel work by delaying cluster payload construction. - Use squared comparisons and removing per-pixel sqrtf() ops.
79 KiB
79 KiB
In [1]:
import sys; sys.path.append('/home/ferjao_k/aare/build')
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
import boost_histogram as bh
import time
from aare import File, ClusterFinder, ClusterFinderMT, ClusterCollector, ClusterFinderCUDAIn [2]:
# Helpers
N_BINS = 200
def make_hist(clusters):
h = bh.Histogram(bh.axis.Regular(N_BINS, -2, 4000))
h.fill(clusters.sum())
return h
def make_hist_from_batch(result_list):
h = bh.Histogram(bh.axis.Regular(N_BINS, -2, 4000))
energies = [np.asarray(cv.sum()).ravel() for cv in result_list if cv.size > 0]
if energies:
h.fill(np.concatenate(energies))
return hIn [3]:
base = Path('/mnt/sls_det_storage/matterhorn_data/aare_test_data/')
f = File(base / 'Moench03new/cu_half_speed_master_4.json')
n_frames_pd = 1000
N = 88999
cluster_size = (9, 9)
image_size = (f.rows, f.cols)
capacity = 100_000 #3_000_000
print(f'Image size: {image_size}')
print(f'Pedestal frames: {n_frames_pd}')
print(f'Data frames: {N}')Image size: (400, 400) Pedestal frames: 1000 Data frames: 88999
In [4]:
f.total_framesOut [4]:
500000
In [5]:
SERIAL = TrueIn [6]:
if(SERIAL):
cf_cpu = ClusterFinder(image_size, cluster_size, capacity=capacity)
else:
cf_cpu = ClusterFinderMT(image_size, cluster_size, capacity=capacity, n_threads=48)
sink = ClusterCollector(cf_cpu)In [7]:
N_STREAMS = 5
cf_cuda = ClusterFinderCUDA(image_size, cluster_size, n_sigma=7, capacity=capacity, n_streams=N_STREAMS)In [8]:
t0 = time.perf_counter()
for _ in range(n_frames_pd):
img = f.read_frame()
cf_cpu.push_pedestal_frame(img.copy())
cf_cuda.push_pedestal_frame(img.copy())
print(f'Pedestal ({n_frames_pd} frames): {time.perf_counter() - t0:.3f}s')Pedestal (1000 frames): 0.501s
In [9]:
f.seek(n_frames_pd)
t0 = time.perf_counter()
data = f.read_n(N)
t_io = time.perf_counter() - t0
print(f'Reading {N} frames: {t_io:.3f}s ({N/t_io:.0f} FPS, '
f'{f.bytes_per_frame * N / 1024**2 / t_io:.3f} GB/s)')Reading 88999 frames: 30.101s (2957 FPS, 902.311 GB/s)
In [10]:
t0 = time.perf_counter()
for frame in data:
cf_cpu.find_clusters(frame)
t_cpu = time.perf_counter() - t0
if(SERIAL):
clusters_cpu = cf_cpu.steal_clusters(realloc_same_capacity=False)
n_clusters_cpu = clusters_cpu.size
hist_cpu = make_hist(clusters_cpu)
else:
cf_cpu.stop()
sink.stop()
clusters_cpu = sink.steal_clusters() #cf_cpu.steal_clusters(realloc_same_capacity=False)
hist_cpu = bh.Histogram(bh.axis.Regular(N_BINS, -2, 4000))
n_clusters_cpu = 0
for cv in clusters_cpu:
hist_cpu.fill(cv.sum())
n_clusters_cpu += cv.size
print(f'CPU clustering: {t_cpu:.3f}s ({N/t_cpu:.0f} FPS, '
f'{n_clusters_cpu} clusters, {n_clusters_cpu/N:.2f}/frame)')CPU clustering: 835.664s (107 FPS, 90799856 clusters, 1020.23/frame)
In [11]:
BATCHED = TrueIn [18]:
if(BATCHED):
BATCH_SIZE = 2000
# Warmup: first kernel launch pays CUDA context + pedestal H2D upload cost
_ = cf_cuda.find_clusters_batched(data[0:BATCH_SIZE], first_frame=0)
clusters_cuda_per_frame = []
cf_cuda.reset_timers()
t0 = time.perf_counter()
for start in range(0, N, BATCH_SIZE):
stop = min(start + BATCH_SIZE, N)
clusters_cuda_per_frame.extend(
cf_cuda.find_clusters_batched(data[start:stop], first_frame=start)
)
t_cuda = time.perf_counter() - t0
kernel_ms = cf_cuda.avg_kernel_time_ms()
n_clusters_cuda = sum(cv.size for cv in clusters_cuda_per_frame)
hist_cuda = make_hist_from_batch(clusters_cuda_per_frame)
else:
# Warmup
cf_cuda.find_clusters(data[0])
_ = cf_cuda.steal_clusters(realloc_same_capacity=False)
cf_cuda.reset_timers()
t0 = time.perf_counter()
n_clusters_cuda = 0
hist_cuda = None
# steal the clusters as we go rather than at the end of the dataset
# which might trigger an std::bad_alloc...
for idx, frame in enumerate(data):
cf_cuda.find_clusters(frame)
clusters_frame = cf_cuda.steal_clusters(realloc_same_capacity=True)
n_clusters_cuda += clusters_frame.size
h = make_hist(clusters_frame)
hist_cuda = h if hist_cuda is None else hist_cuda + h
t_cuda = time.perf_counter() - t0
kernel_ms = cf_cuda.avg_kernel_time_ms()In [19]:
cluster_sizeOut [19]:
(9, 9)
In [25]:
print(f'CPU clustering: {t_cpu:.3f}s ({N/t_cpu:.0f} FPS, '
f'{n_clusters_cpu} clusters, {n_clusters_cpu/N:.2f}/frame)')CPU clustering: 835.664s (107 FPS, 90799856 clusters, 1020.23/frame)
In [24]:
print(f'CUDA clustering: {t_cuda:.3f}s ({N/t_cuda:.0f} FPS, '
f'{n_clusters_cuda} clusters, {n_clusters_cuda/N:.2f}/frame)')
print(f' Kernel only: {kernel_ms:.3f} ms/frame')
print(f' PCIe + overhead: {t_cuda*1000/N - kernel_ms:.3f} ms/frame')
print(f'Speedup (CPU / CUDA): {t_cpu / t_cuda:.2f}×')CUDA clustering: 8.812s (10100 FPS, 89995010 clusters, 1011.19/frame)
Kernel only: 0.039 ms/frame
PCIe + overhead: 0.060 ms/frame
Speedup (CPU / CUDA): 94.83×
In [21]:
diff = abs(n_clusters_cpu - n_clusters_cuda)
rel = diff / max(n_clusters_cpu, 1)
print(f'Cluster count diff: {diff} ({rel:.2%})')Cluster count diff: 804846 (0.89%)
In [22]:
print(len(hist_cpu.values()), len(hist_cpu.axes[0].edges))
print(len(hist_cuda.values()), len(hist_cuda.axes[0].edges))200 201 200 201
In [23]:
fig, (ax_spec, ax_ratio) = plt.subplots(
2, 1, figsize=(8, 6), sharex=True,
gridspec_kw={'height_ratios': [3, 1]}
)
edges = hist_cpu.axes[0].edges
cpu_vals = hist_cpu.values()
cuda_vals = hist_cuda.values()
ax_spec.stairs(cpu_vals, edges, label=f'CPU ({n_clusters_cpu} clusters)')
ax_spec.stairs(cuda_vals, edges, label=f'CUDA ({n_clusters_cuda} clusters)', linestyle='--')
ax_spec.set_ylabel('Counts')
ax_spec.set_title('Cluster energy spectrum: CPU vs CUDA')
ax_spec.legend()
ax_spec.grid(alpha=0.2)
with np.errstate(divide='ignore', invalid='ignore'):
ratio = np.where(cpu_vals > 0, cuda_vals / cpu_vals, np.nan)
ax_ratio.stairs(ratio, edges, color='k')
ax_ratio.axhline(1.0, color='gray', linewidth=0.5)
ax_ratio.set_ylabel('CUDA / CPU')
ax_ratio.set_xlabel('Energy [ADU]')
ax_ratio.set_ylim(0.5, 2.0)
ax_ratio.grid(alpha=0.3)
plt.tight_layout()
plt.show()In [ ]: