MXAnalysisWithoutFPGA never filled DataMessage.roi, so ROI integrals were only available on the FPGA path. Add a software ROI engine that mirrors the FPGA roi_calc kernel: per-ROI sum, sum of squares, good-pixel count, max and intensity-weighted centre of mass, with each pixel carrying a 16-bit mask so it can contribute to any subset of up to 16 ROIs. New image_analysis/roi/ library (JFJochROIIntegration), structured like azint: a base that precomputes the per-pixel mask and names, a templated CPU engine (generic over pixel type for a future 16-bit path), and a GPU kernel using per-block shared-memory atomics for the STXM case (half-detector ROIs). Masked pixels are excluded entirely; saturated pixels are excluded from the sums but still count towards the max, matching roi_calc exactly. The engine is only constructed when at least one ROI is defined. Downstream CBOR/HDF5 already consume message.roi, so no further changes are needed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
38 lines
1.4 KiB
C++
38 lines
1.4 KiB
C++
// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#pragma once
|
|
|
|
#include <memory>
|
|
|
|
#include "ROIIntegration.h"
|
|
#include "../indexing/CUDAMemHelpers.h"
|
|
|
|
class ROIIntegrationGPU : public ROIIntegration {
|
|
std::shared_ptr<CudaStream> stream;
|
|
int threads;
|
|
int blocks;
|
|
size_t shared_needed;
|
|
|
|
CudaDevicePtr<uint16_t> gpu_roi_map;
|
|
// 64-bit sums are accumulated as unsigned long long (two's-complement bit
|
|
// pattern) because CUDA atomicAdd has no signed 64-bit overload.
|
|
CudaDevicePtr<unsigned long long> gpu_sum;
|
|
CudaDevicePtr<unsigned long long> gpu_sum2;
|
|
CudaDevicePtr<unsigned long long> gpu_pixels;
|
|
CudaDevicePtr<unsigned long long> gpu_x_weighted;
|
|
CudaDevicePtr<unsigned long long> gpu_y_weighted;
|
|
CudaDevicePtr<int> gpu_max;
|
|
|
|
std::vector<unsigned long long> host_sum;
|
|
std::vector<unsigned long long> host_sum2;
|
|
std::vector<unsigned long long> host_pixels;
|
|
std::vector<unsigned long long> host_x_weighted;
|
|
std::vector<unsigned long long> host_y_weighted;
|
|
std::vector<int> host_max;
|
|
std::vector<int> max_init; // INT_MIN seed copied into gpu_max each frame
|
|
public:
|
|
ROIIntegrationGPU(const DiffractionExperiment &experiment, std::shared_ptr<CudaStream> stream);
|
|
void Run(const ImagePreprocessorBuffer &image, std::map<std::string, ROIMessage> &out) override;
|
|
};
|