// Jungfraujoch/image_analysis/roi/ROIIntegrationGPU.h

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#pragma once

#include <memory>

#include "ROIIntegration.h"
#include "../indexing/CUDAMemHelpers.h"

/// ROIIntegration implementation backed by CUDA device buffers.
///
/// Each per-ROI quantity has a device-side buffer (gpu_*) and a host-side
/// vector (host_*) with the same element type. The implementation lives in
/// the corresponding source file and is not visible from this header.
class ROIIntegrationGPU : public ROIIntegration {
    // Stream handle shared with the caller (shared ownership).
    std::shared_ptr<CudaStream> stream;

    // Scalars are zero-initialized in-class so they never hold indeterminate
    // values, even on a constructor path that does not assign them.
    // NOTE(review): presumably the kernel launch configuration (threads per
    // block / number of blocks) - confirm against the constructor.
    int threads = 0;
    int blocks = 0;
    // NOTE(review): presumably the dynamic shared-memory size in bytes
    // requested at kernel launch - confirm against the implementation.
    size_t shared_needed = 0;

    // NOTE(review): presumably one 16-bit ROI entry per pixel - confirm.
    CudaDevicePtr<uint16_t> gpu_roi_map;
    // 64-bit sums are accumulated as unsigned long long (two's-complement bit
    // pattern) because CUDA atomicAdd has no signed 64-bit overload.
    CudaDevicePtr<unsigned long long> gpu_sum;
    CudaDevicePtr<unsigned long long> gpu_sum2;
    CudaDevicePtr<unsigned long long> gpu_pixels;
    CudaDevicePtr<unsigned long long> gpu_x_weighted;
    CudaDevicePtr<unsigned long long> gpu_y_weighted;
    CudaDevicePtr<int> gpu_max;

    // Host-side counterparts of the device buffers above (same element types).
    std::vector<unsigned long long> host_sum;
    std::vector<unsigned long long> host_sum2;
    std::vector<unsigned long long> host_pixels;
    std::vector<unsigned long long> host_x_weighted;
    std::vector<unsigned long long> host_y_weighted;
    std::vector<int> host_max;
    std::vector<int> max_init;   // INT_MIN seed copied into gpu_max each frame
public:
    /// @param experiment  Experiment description used to configure the integrator.
    /// @param stream      CUDA stream wrapper; ownership is shared with the caller.
    ROIIntegrationGPU(const DiffractionExperiment &experiment, std::shared_ptr<CudaStream> stream);

    /// Overrides ROIIntegration::Run().
    ///
    /// @param image  Input image buffer to integrate.
    /// @param out    Output map of ROIMessage values keyed by string
    ///               (presumably the ROI name - confirm against ROIIntegration).
    void Run(const ImagePreprocessorBuffer &image, std::map<std::string, ROIMessage> &out) override;
};