// Jungfraujoch/image_analysis/bragg_integration/BraggIntegrationEngineGPU.h

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "BraggIntegrationEngine.h"
#include "../indexing/CUDAMemHelpers.h"

// CUDA engine: reproduces BraggIntegrationEngineCPU up to floating-point precision. Each stage is a
// kernel with one CUDA block per reflection cooperating over the small window via shared-memory
// reductions (the natural mapping for thousands of independent, tiny per-spot integrations).
//
// Pipeline (profile modes): reset -> mark_mask -> boxsum -> learn_profile -> build_profiles -> fit
// (the resolution shell is computed inline, so there is no separate shell pass). BoxSum mode stops
// after boxsum (that pass is the BraggIntegrate2D box integrator and the seed of the profile fit).
// The preprocessed image already lives on the device (ImagePreprocessorBufferGPU::getGPUBuffer());
// only the per-frame predicted centres are uploaded.
class BraggIntegrationEngineGPU : public BraggIntegrationEngine {
private:
    // NOTE(review): presumably the CUDA stream the kernels and copies are issued on, the launch
    // block size, and the shared-memory budget of the fit kernel -- all three are set in the
    // implementation file; confirm there.
    std::shared_ptr<CudaStream> stream;
    int threads;
    size_t fit_shared_bytes;

    // Lower bound on how many reflections the per-reflection device/host arrays can hold.
    size_t capacity{0};

    // ---- Device arrays with one entry per reflection (EnsureCapacity grows them) ----
    CudaDevicePtr<float> d_px_x;
    CudaDevicePtr<float> d_px_y;
    CudaDevicePtr<float> d_d;
    CudaDevicePtr<int> d_cx;
    CudaDevicePtr<int> d_cy;
    CudaDevicePtr<float> d_I;
    CudaDevicePtr<float> d_sigma;
    CudaDevicePtr<float> d_bkg;
    CudaDevicePtr<float> d_obs_x;
    CudaDevicePtr<float> d_obs_y;
    CudaDevicePtr<uint8_t> d_ok;
    CudaDevicePtr<uint8_t> d_strong;
    CudaDevicePtr<uint8_t> d_has_obs;

    // ---- Device arrays of fixed size ----
    // Learning and fitting run in single precision: consumer GPUs throttle FP64 heavily and the
    // extraction is limited by Poisson noise, so float matches the double CPU path to ~1e-4.

    // Per-pixel r2-disk reflection mask.
    CudaDevicePtr<uint8_t> d_mask;

    // Accumulators for the learned profiles (sizes N_SHELL*GG and GG respectively).
    CudaDevicePtr<float> d_shell_grid;
    CudaDevicePtr<float> d_global_grid;

    // Normalised profiles (empirical mode).
    CudaDevicePtr<float> d_shell_P;
    CudaDevicePtr<float> d_global_P;

    CudaDevicePtr<float> d_shell_sigma2;
    CudaDevicePtr<float> d_global_sigma2;
    CudaDevicePtr<int> d_shell_n;
    CudaDevicePtr<int> d_global_n;

    // [min,max] of inv-d^2, stored as monotonic bit patterns.
    CudaDevicePtr<unsigned long long> d_invd2;

    // ---- Host staging buffers (copied back once per frame) ----
    std::vector<float> h_px_x;
    std::vector<float> h_px_y;
    std::vector<float> h_d;
    std::vector<float> h_I;
    std::vector<float> h_sigma;
    std::vector<float> h_bkg;
    std::vector<float> h_obs_x;
    std::vector<float> h_obs_y;
    std::vector<uint8_t> h_ok;
    std::vector<uint8_t> h_has_obs;

    // Grows the per-reflection arrays; per the `capacity` contract they must afterwards hold at
    // least n reflections.
    void EnsureCapacity(size_t n);

public:
    // Builds the engine for the given experiment; `stream` is a shared handle to a CudaStream.
    BraggIntegrationEngineGPU(const DiffractionExperiment &experiment,
                              std::shared_ptr<CudaStream> stream);

    // Overrides BraggIntegrationEngine::Run and returns a vector of Reflection for one image.
    // NOTE(review): npredicted is presumably the number of leading entries of `predicted` to
    // process -- confirm against the implementation.
    std::vector<Reflection> Run(const ImagePreprocessorBuffer &image,
                                const std::vector<Reflection> &predicted,
                                size_t npredicted,
                                int64_t image_number) override;
};