Jungfraujoch/image_analysis/bragg_integration/BraggIntegrationEngineGPU.cu

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "BraggIntegrationEngineGPU.h"

using namespace bragg_engine;

namespace {

// Turn a failed CUDA runtime status into a JFJochException (category GPUCUDAError) whose message
// is the runtime's own error string; cudaSuccess passes through without any effect.
inline void cuda_err(cudaError_t val) {
    if (val == cudaSuccess)
        return;
    throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}

// Scalar configuration handed by value to every kernel in this file. Keep the member order
// stable: the host side fills this struct with designated initialisers, which have to follow
// declaration order.
struct BraggGpuParams {
    int   W;                // image width in pixels (row stride of the image and mask buffers)
    int   H;                // image height in pixels
    float r1_sq;            // squared radius of the inner (signal) disk
    float r2;               // radius of the disk that is excluded from background estimates
    float r2_sq;            // r2 squared; inner edge of the background annulus
    float r3;               // outer radius of the background annulus
    float r3_sq;            // r3 squared
    float min_sigma_ratio;  // sigma is never allowed below I * min_sigma_ratio
    int   R;                // profile grid half-width: grid offsets run from -R upwards
    int   G;                // profile grid side length (cell k maps to column k % G, row k / G)
    int   GG;               // number of profile grid cells -- presumably G * G; confirm at the caller
    int   do_clip;          // non-zero: sigma-clip the background (stills, profile modes)
    int   empirical;        // non-zero: ProfileEmpirical, zero: ProfileGaussian
    int   broadband;        // non-zero: profile width uses the whole grid, not only the r1 disk
    int   use_ellipse;      // non-zero: Gaussian profile may be elongated along the radial direction
    float bw_sigma;         // multiplies the distance from the beam centre in the radial broadening term
    float c_radial;         // multiplies (distance from beam centre / F_px)^2 in the radial broadening term
    float F_px;             // divisor turning the distance from the beam centre into tan(2theta)
    float beam_x;           // beam centre, x [pixels]
    float beam_y;           // beam centre, y [pixels]
};

// A pixel is usable unless it holds one of the two reserved values INT32_MIN / INT32_MAX.
__device__ inline bool valid(int32_t v) {
    return !(v == INT32_MIN || v == INT32_MAX);
}

// --- Flag, in a byte-per-pixel mask, every pixel strictly inside the r2 disk of each predicted
//     reflection. One block per reflection (blockIdx.x is the reflection index); the block's
//     threads stride over the disk's bounding box. Every writer stores the value 1, so
//     overlapping disks need no atomics. The mask is only ever set here, never cleared. ---
__global__ void mark_mask(const float *px_x, const float *px_y, uint8_t *mask, BraggGpuParams p, int n) {
    const int refl = blockIdx.x;
    if (refl >= n)
        return;

    const float centre_x = px_x[refl];
    const float centre_y = px_y[refl];

    // Bounding box of the disk with one pixel of slack, clamped to the image.
    const int left   = max(0, (int) floorf(centre_x - p.r2 - 1.0f));
    const int right  = min(p.W - 1, (int) ceilf(centre_x + p.r2 + 1.0f));
    const int top    = max(0, (int) floorf(centre_y - p.r2 - 1.0f));
    const int bottom = min(p.H - 1, (int) ceilf(centre_y + p.r2 + 1.0f));

    const int box_w = right - left + 1;
    const int box_h = bottom - top + 1;
    if (box_w <= 0 || box_h <= 0)
        return;   // the box lies entirely outside the image

    for (int cell = threadIdx.x; cell < box_w * box_h; cell += blockDim.x) {
        const int col = left + cell % box_w;
        const int row = top + cell / box_w;
        const float off_x = (float) col - centre_x;
        const float off_y = (float) row - centre_y;
        if (off_x * off_x + off_y * off_y < p.r2_sq)
            mask[row * p.W + col] = 1;
    }
}

// --- Pass A box-sum: rough I / background / centroid / strong flag, one block per reflection. ---
//
// Launch layout: blockIdx.x is the reflection index (a block with blockIdx.x >= n exits at once);
// the threads of a block stride over the reflection's bounding box. Pixels are classified by
// their squared distance d2 from the predicted position (px_x[i], px_y[i]):
//   d2 <  r1_sq            inner disk -> raw intensity sum and intensity-weighted coordinates
//   r2_sq <= d2 < r3_sq    annulus    -> background, skipping pixels that are set in `mask`
// Pixels that fail valid() are left out of every sum.
//
// A reflection is accepted only if every inner-disk pixel inside the image is valid and more
// than 5 background pixels were collected.
//
// Outputs (all written by thread 0 of the block):
//   ok_o, strong_o, hasobs_o                          written for every reflection
//   cx_o, cy_o, I_o, sigma_o, bkg_o, obsx_o, obsy_o   written only for accepted reflections
//   invd2mm[0] / invd2mm[1]                           atomic min / max of 1/d^2 (as double bit
//                                                     patterns) over accepted reflections, d > 0
//
// Uses atomicAdd on double in shared memory (per the CUDA documentation this needs SM60+).
__global__ void boxsum(const float *px_x, const float *px_y, const float *dd,
                       const int32_t *img, const uint8_t *mask, BraggGpuParams p, int n,
                       int *cx_o, int *cy_o, float *I_o, float *sigma_o, float *bkg_o,
                       float *obsx_o, float *obsy_o, uint8_t *ok_o, uint8_t *strong_o,
                       uint8_t *hasobs_o, unsigned long long *invd2mm) {
    const int i = blockIdx.x;
    // The exit condition is the same for every thread of the block, so the barriers below are
    // still reached by all threads of any block that continues.
    if (i >= n) return;

    // Block-wide accumulators. The intensity sums are signed quantities kept in unsigned storage:
    // they are added through their bit pattern and cast back to long long when read.
    __shared__ unsigned long long s_Isum, s_Ix, s_Iy;
    __shared__ int s_ninner, s_ninner_valid, s_nbkg;
    __shared__ double s_bkgsum;
    __shared__ int s_accept;
    __shared__ double s_bkg, s_thr, s_clipsum;
    __shared__ int s_clipn;
    if (threadIdx.x == 0) {
        s_Isum = 0; s_Ix = 0; s_Iy = 0;
        s_ninner = 0; s_ninner_valid = 0; s_nbkg = 0; s_bkgsum = 0.0;
    }
    __syncthreads();

    // Bounding box of the r3 disk with one pixel of slack, clamped to the image.
    const float cx = px_x[i], cy = px_y[i];
    const int x0 = max(0, (int) floorf(cx - p.r3 - 1.0f));
    const int x1 = min(p.W - 1, (int) ceilf(cx + p.r3 + 1.0f));
    const int y0 = max(0, (int) floorf(cy - p.r3 - 1.0f));
    const int y1 = min(p.H - 1, (int) ceilf(cy + p.r3 + 1.0f));
    const int bw = x1 - x0 + 1, bh = y1 - y0 + 1;
    // area == 0 (box entirely off the image) makes both pixel loops no-ops; the reflection is
    // then rejected below because no background pixels were counted.
    const int area = (bw > 0 && bh > 0) ? bw * bh : 0;

    // First pass: per-thread partial sums, merged into shared memory with one atomic each.
    long long l_Isum = 0, l_Ix = 0, l_Iy = 0;
    int l_ni = 0, l_niv = 0, l_nb = 0;
    double l_bkg = 0.0;
    for (int t = threadIdx.x; t < area; t += blockDim.x) {
        const int x = x0 + t % bw, y = y0 + t / bw;
        const float ddx = (float) x - cx, ddy = (float) y - cy;
        const float d2 = ddx * ddx + ddy * ddy;
        const int32_t px = img[y * p.W + x];
        if (d2 < p.r1_sq) {
            // Inner disk: count every pixel, but sum only the valid ones.
            ++l_ni;
            if (valid(px)) { l_Isum += px; l_Ix += (long long) x * px; l_Iy += (long long) y * px; ++l_niv; }
        } else if (d2 >= p.r2_sq && d2 < p.r3_sq) {
            // Annulus: masked pixels (inside some reflection's r2 disk) and invalid pixels do
            // not contribute to the background.
            if (mask[y * p.W + x]) continue;
            if (!valid(px)) continue;
            l_bkg += (double) px; ++l_nb;
        }
    }
    atomicAdd(&s_Isum, (unsigned long long) l_Isum);
    atomicAdd(&s_Ix, (unsigned long long) l_Ix);
    atomicAdd(&s_Iy, (unsigned long long) l_Iy);
    atomicAdd(&s_ninner, l_ni);
    atomicAdd(&s_ninner_valid, l_niv);
    atomicAdd(&s_nbkg, l_nb);
    atomicAdd(&s_bkgsum, l_bkg);
    __syncthreads();

    // Thread 0 decides acceptance, the mean background and the clip threshold
    // (mean + 3 * sqrt(max(mean, 1))), and zeroes the clip accumulators.
    if (threadIdx.x == 0) {
        s_accept = (s_ninner_valid == s_ninner && s_nbkg > 5) ? 1 : 0;
        s_bkg = s_accept ? (s_bkgsum / (double) s_nbkg) : 0.0;
        s_thr = s_bkg + 3.0 * sqrt(fmax(s_bkg, 1.0));
        s_clipsum = 0.0; s_clipn = 0;
    }
    __syncthreads();

    // Second ring pass for the stills sigma-clip (re-reads the annulus; avoids storing bkg values).
    // The condition is block-uniform (shared flag and kernel parameter).
    if (s_accept && p.do_clip) {
        double c_l = 0.0; int cn_l = 0;
        for (int t = threadIdx.x; t < area; t += blockDim.x) {
            const int x = x0 + t % bw, y = y0 + t / bw;
            const float ddx = (float) x - cx, ddy = (float) y - cy;
            const float d2 = ddx * ddx + ddy * ddy;
            if (!(d2 >= p.r2_sq && d2 < p.r3_sq)) continue;
            if (mask[y * p.W + x]) continue;
            const int32_t px = img[y * p.W + x];
            if (!valid(px)) continue;
            // Keep only background pixels at or below the clip threshold.
            if ((double) px <= s_thr) { c_l += px; ++cn_l; }
        }
        atomicAdd(&s_clipsum, c_l); atomicAdd(&s_clipn, cn_l);
    }
    __syncthreads();

    // From here on only thread 0 works: it turns the block totals into the per-reflection outputs.
    if (threadIdx.x != 0) return;
    if (!s_accept) { ok_o[i] = 0; strong_o[i] = 0; hasobs_o[i] = 0; return; }

    // The clipped mean replaces the plain mean only if more than 5 pixels survived the clip.
    double bkg = s_bkg;
    if (p.do_clip && s_clipn > 5) bkg = s_clipsum / (double) s_clipn;
    const long long Isum = (long long) s_Isum;
    // Background-subtracted intensity over the inner disk.
    const double I = (double) Isum - (double) s_ninner * bkg;
    // sigma is at least 1, at least I * min_sigma_ratio and, for a positive raw sum, at least
    // sqrt(raw sum).
    double sigma = fmax(1.0, I * (double) p.min_sigma_ratio);
    uint8_t hasobs = 0; double ox = 0.0, oy = 0.0;
    if (Isum > 0) {
        sigma = fmax(sigma, sqrt((double) Isum));
        // Intensity-weighted centroid from the raw (not background-subtracted) counts.
        ox = (double) (long long) s_Ix / (double) Isum;
        oy = (double) (long long) s_Iy / (double) Isum;
        hasobs = 1;
    }
    // Rounded predicted position: the centre of the profile grid used by the later kernels.
    cx_o[i] = (int) lroundf(cx);
    cy_o[i] = (int) lroundf(cy);
    I_o[i] = (float) I; sigma_o[i] = (float) sigma; bkg_o[i] = (float) bkg;
    obsx_o[i] = (float) ox; obsy_o[i] = (float) oy; hasobs_o[i] = hasobs;
    ok_o[i] = 1;
    strong_o[i] = (sigma > 0.0 && I / sigma >= STRONG_I_OVER_SIGMA) ? 1 : 0;

    // Fold this reflection's 1/d^2 into the global range consumed by compute_shell().
    const float d = dd[i];
    if (d > 0.0f) {
        // Positive doubles keep IEEE bit-pattern ordering, so atomicMin/Max on the ull view works.
        const unsigned long long b = (unsigned long long) __double_as_longlong(1.0 / ((double) d * d));
        atomicMin(&invd2mm[0], b);
        atomicMax(&invd2mm[1], b);
    }
}

// Map a reflection's d value to a shell index in [0, N_SHELL) by placing 1/d^2 linearly inside
// the global range stored in invd2mm (element 0: minimum, element 1: maximum, both kept as the
// bit patterns of doubles). Returns 0 when d is not positive (NaN included) or when the stored
// maximum does not exceed the stored minimum. Evaluated inline by the kernels that need a shell,
// so no per-reflection shell array is kept.
__device__ inline int compute_shell(float d, const unsigned long long *invd2mm) {
    const unsigned long long lo_bits = invd2mm[0];
    const unsigned long long hi_bits = invd2mm[1];

    const bool usable = (d > 0.0f) && (hi_bits > lo_bits);
    if (!usable)
        return 0;

    const double lo = __longlong_as_double((long long) lo_bits);
    const double hi = __longlong_as_double((long long) hi_bits);
    const double inv_d2 = 1.0 / ((double) d * d);
    const int shell = (int) ((inv_d2 - lo) / (hi - lo) * N_SHELL);

    // Clamp: inv_d2 == hi would otherwise land exactly on N_SHELL.
    if (shell < 0)
        return 0;
    if (shell >= N_SHELL)
        return N_SHELL - 1;
    return shell;
}

// --- Zero the profile accumulators and seed the inv-d^2 range, in one launch (replaces a handful
//     of small cudaMemsetAsync calls, which matter when kernel-launch latency is high). ---
//
// Works for any 1D launch configuration:
//   shell_grid   N_SHELL * GG floats, zeroed with a grid-wide strided loop
//   global_grid  GG floats, zeroed the same way
//   shell_n      N_SHELL counters, zeroed by block 0 with a block-strided loop, so the clear is
//                complete even when blockDim.x is smaller than N_SHELL
//   global_n     single counter, zeroed by thread 0 of block 0
//   invd2mm      [0] seeded with all ones (identity for atomicMin), [1] with 0 (identity for
//                atomicMax)
__global__ void reset(float *shell_grid, float *global_grid, int *shell_n, int *global_n,
                      unsigned long long *invd2mm, int GG) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    for (int k = first; k < N_SHELL * GG; k += stride)
        shell_grid[k] = 0.0f;
    for (int k = first; k < GG; k += stride)
        global_grid[k] = 0.0f;

    if (blockIdx.x != 0)
        return;

    // Block-strided so that no shell counter is skipped, whatever the block size.
    for (int s = threadIdx.x; s < N_SHELL; s += blockDim.x)
        shell_n[s] = 0;

    if (threadIdx.x == 0) {
        *global_n = 0;
        invd2mm[0] = ~0ull;   // min seed
        invd2mm[1] = 0ull;    // max seed
    }
}

// --- Accumulate the learned profile. One block per reflection (blockIdx.x is the reflection
//     index). A reflection contributes only if it is flagged ok and strong and its rough
//     intensity is positive; each valid in-image pixel of its profile grid then adds
//     (pixel - background) / I to the matching cell of its shell's grid and of the global grid.
//     Thread 0 counts the contributing reflection once per shell and once globally. ---
__global__ void learn_profile(const int32_t *img, const int *cx_a, const int *cy_a, const float *dd,
                              const unsigned long long *invd2mm,
                              const float *I_a, const float *bkg_a, const uint8_t *ok_a, const uint8_t *strong_a,
                              float *shell_grid, float *global_grid, int *shell_n, int *global_n,
                              BraggGpuParams p, int n) {
    const int refl = blockIdx.x;
    if (refl >= n)
        return;
    if (!ok_a[refl] || !strong_a[refl])
        return;

    const float rough_I = I_a[refl];
    if (!(rough_I > 0.0f))   // also rejects NaN
        return;

    const int centre_x = cx_a[refl];
    const int centre_y = cy_a[refl];
    const int shell = compute_shell(dd[refl], invd2mm);
    const float background = bkg_a[refl];
    float *shell_acc = shell_grid + (size_t) shell * p.GG;

    for (int cell = threadIdx.x; cell < p.GG; cell += blockDim.x) {
        const int col = centre_x + (cell % p.G - p.R);
        const int row = centre_y + (cell / p.G - p.R);
        const bool inside = col >= 0 && row >= 0 && col < p.W && row < p.H;
        if (inside) {
            const int32_t count = img[row * p.W + col];
            if (valid(count)) {
                const float contribution = ((float) count - background) / rough_I;
                atomicAdd(&shell_acc[cell], contribution);
                atomicAdd(&global_grid[cell], contribution);
            }
        }
    }

    if (threadIdx.x == 0) {
        atomicAdd(&shell_n[shell], 1);
        atomicAdd(global_n, 1);
    }
}

// --- Turn every accumulated grid into a width and, in empirical mode, a unit-sum profile.
//     One block per grid: blocks [0, N_SHELL) handle the shell grids, any other block the global
//     grid. Negative cells are clamped to zero first. The width written is
//     max(0.25, <r^2> / 2), with <r^2> the clamped-grid-weighted mean squared distance from the
//     grid centre (restricted to r^2 < r1_sq unless p.broadband is set); it falls back to 1 when
//     nothing was learned or the weight is zero. In empirical mode the clamped grid is divided
//     by its sum, or zeroed when nothing was learned or the sum is not positive. ---
__global__ void build_profiles(const float *shell_grid, const float *global_grid,
                               const int *shell_n, const int *global_n,
                               float *shell_P, float *global_P, float *shell_sigma2, float *global_sigma2,
                               BraggGpuParams p) {
    const int grid_id = blockIdx.x;

    // Select input grid, output profile, output width and the number of learned spots.
    const float *accum = global_grid;
    float *profile = global_P;
    float *width_out = global_sigma2;
    int n_learned;
    if (grid_id < N_SHELL) {
        const size_t offset = (size_t) grid_id * p.GG;
        accum = shell_grid + offset;
        profile = shell_P + offset;
        width_out = &shell_sigma2[grid_id];
        n_learned = shell_n[grid_id];
    } else {
        n_learned = *global_n;
    }

    __shared__ float sh_moment, sh_weight, sh_total;
    if (threadIdx.x == 0) {
        sh_moment = 0.0f;
        sh_weight = 0.0f;
        sh_total = 0.0f;
    }
    __syncthreads();

    float my_moment = 0.0f, my_weight = 0.0f, my_total = 0.0f;
    for (int cell = threadIdx.x; cell < p.GG; cell += blockDim.x) {
        const int off_x = cell % p.G - p.R;
        const int off_y = cell / p.G - p.R;
        const float clamped = fmaxf(0.0f, accum[cell]);
        const int dist_sq = off_x * off_x + off_y * off_y;
        if (p.broadband || (float) dist_sq < p.r1_sq) {
            my_moment += clamped * (float) dist_sq;
            my_weight += clamped;
        }
        my_total += clamped;
        // Stash the clamped value; the same thread revisits this cell to normalise it below.
        if (p.empirical)
            profile[cell] = clamped;
    }
    atomicAdd(&sh_moment, my_moment);
    atomicAdd(&sh_weight, my_weight);
    atomicAdd(&sh_total, my_total);
    __syncthreads();

    if (threadIdx.x == 0) {
        float width = 1.0f;
        if (n_learned > 0 && sh_weight > 0.0f)
            width = fmaxf(0.25f, (sh_moment / sh_weight) / 2.0f);
        *width_out = width;
    }

    if (!p.empirical)
        return;

    const float total = sh_total;
    if (n_learned > 0 && total > 0.0f) {
        for (int cell = threadIdx.x; cell < p.GG; cell += blockDim.x)
            profile[cell] = profile[cell] / total;
    } else {
        for (int cell = threadIdx.x; cell < p.GG; cell += blockDim.x)
            profile[cell] = 0.0f;
    }
}

// --- Pass B Kabsch profile fit: I = sum P(c-B)/v over sum P^2/v, v = B + max(I,0)P (iterate).
//     One block per reflection; the (possibly elongated) profile is built in shared memory. ---
//
// Launch layout: blockIdx.x is the reflection index. Dynamic shared memory (Pbuf) must hold the
// whole profile: p.GG floats in empirical mode, Gf * Gf floats with Gf = 2 * Rf + 1 and
// Rf <= 3 * p.R in Gaussian mode.
//
// In the code the variance term uses B = max(bkg, 1), while the numerator subtracts the
// unclamped bkg from the pixel value.
//
// Outputs, written by thread 0: ok_o[i] always; I_o[i] and sigma_o[i] only when the final
// denominator is positive. The host launch in this file passes the same buffer for I_seed and
// I_o and the same buffer for ok_a and ok_o; the kernel reads each input before the
// corresponding output is written.
__global__ void fit(const int32_t *img, const float *px_x, const float *px_y,
                    const int *cx_a, const int *cy_a, const float *dd, const unsigned long long *invd2mm,
                    const float *I_seed, const float *bkg_a, const uint8_t *ok_a,
                    const float *shell_P, const float *global_P,
                    const float *shell_sigma2, const float *global_sigma2, const int *shell_n,
                    float *I_o, float *sigma_o, uint8_t *ok_o, BraggGpuParams p, int n) {
    const int i = blockIdx.x;
    if (i >= n) return;
    extern __shared__ float Pbuf[];
    __shared__ float s_gs, s_num, s_den, s_I;
    __shared__ int s_Rf, s_Gf;

    // Reflections rejected in pass A stay rejected. Every thread of the block reads the same
    // flag, so the whole block leaves before any barrier.
    if (!ok_a[i]) { if (threadIdx.x == 0) ok_o[i] = 0; return; }

    const int cx = cx_a[i], cy = cy_a[i];
    const int sh = compute_shell(dd[i], invd2mm);
    const bool use_shell = shell_n[sh] >= MIN_STRONG_PER_SHELL;   // else fall back to the global profile
    const float bkg = bkg_a[i];

    if (p.empirical) {
        // Empirical mode: copy the learned profile into shared memory; the fit grid is the
        // learning grid (half-width R, side G).
        const float *Psrc = use_shell ? (shell_P + (size_t) sh * p.GG) : global_P;
        if (threadIdx.x == 0) { s_Rf = p.R; s_Gf = p.G; }
        __syncthreads();
        for (int k = threadIdx.x; k < p.GG; k += blockDim.x) Pbuf[k] = Psrc[k];
        __syncthreads();
    } else {
        // Gaussian mode: s2t is the learned variance, used across the radial direction.
        const float s2t = use_shell ? shell_sigma2[sh] : *global_sigma2;
        // Vector from the beam centre to the predicted position, and its length.
        const float rx = px_x[i] - p.beam_x, ry = px_y[i] - p.beam_y;
        const float Rpx = sqrtf(rx * rx + ry * ry);
        const float tan2t = Rpx / p.F_px;
        // Default: isotropic Gaussian (radial variance equals s2t, axes along x / y).
        float s2r = s2t, ux = 1.0f, uy = 0.0f;
        bool elong = false;
        if (p.use_ellipse) {
            const float sbw = p.bw_sigma * Rpx;
            const float radial_extra = sbw * sbw + p.c_radial * tan2t * tan2t;
            // Elongate only away from the beam centre and when the extra radial variance
            // exceeds 0.25; (ux, uy) is then the unit vector along the radial direction.
            if (Rpx > 1e-6f && radial_extra > 0.25f) { ux = rx / Rpx; uy = ry / Rpx; s2r = s2t + radial_extra; elong = true; }
        }
        // Elongated profiles get a larger grid, capped at half-width 3R.
        const int Rf = elong ? min(3 * p.R, (int) ceilf(p.r2 + 2.0f * sqrtf(s2r))) : p.R;
        const int Gf = 2 * Rf + 1;
        if (threadIdx.x == 0) { s_Rf = Rf; s_Gf = Gf; s_gs = 0.0f; }
        __syncthreads();
        // Sub-pixel offset between the predicted position and the integer grid centre.
        const float fx = px_x[i] - cx, fy = px_y[i] - cy;
        float l_gs = 0.0f;
        for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) {
            const float ex = (k % Gf - Rf) - fx, ey = (k / Gf - Rf) - fy;
            // Rotate the offset into (radial, tangential) components.
            const float rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
            const float g = expf(-rad * rad / (2.0f * s2r) - tn * tn / (2.0f * s2t));
            Pbuf[k] = g; l_gs += g;
        }
        atomicAdd(&s_gs, l_gs);
        __syncthreads();
        // Normalise the profile to unit sum; each thread rescales the cells it wrote itself.
        const float gs = s_gs;
        for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) Pbuf[k] /= gs;
        __syncthreads();
    }

    const int Rf = s_Rf, Gf = s_Gf, GfGf = Gf * Gf;
    const float B = fmaxf(bkg, 1.0f);
    // Seed the iteration with the pass A intensity.
    if (threadIdx.x == 0) s_I = I_seed[i];
    __syncthreads();

    // Four fixed iterations: the variance model depends on the current intensity estimate.
    for (int iter = 0; iter < 4; ++iter) {
        if (threadIdx.x == 0) { s_num = 0.0f; s_den = 0.0f; }
        __syncthreads();
        const float Ihere = s_I;
        float l_num = 0.0f, l_den = 0.0f;
        for (int k = threadIdx.x; k < GfGf; k += blockDim.x) {
            const float Pp = Pbuf[k];
            if (Pp <= 0.0f) continue;
            const int x = cx + (k % Gf - Rf), y = cy + (k / Gf - Rf);
            // Cells off the image or on invalid pixels do not take part in the fit.
            if (x < 0 || y < 0 || x >= p.W || y >= p.H) continue;
            const int32_t px = img[y * p.W + x];
            if (!valid(px)) continue;
            const float v = B + fmaxf(0.0f, Ihere) * Pp;
            l_num += Pp * ((float) px - bkg) / v;
            l_den += Pp * Pp / v;
        }
        atomicAdd(&s_num, l_num); atomicAdd(&s_den, l_den);
        __syncthreads();
        // Keep the previous estimate when no pixel contributed.
        if (threadIdx.x == 0 && s_den > 0.0f) s_I = s_num / s_den;
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        // sigma = sqrt(1 / sum(P^2 / v)) from the last iteration.
        if (s_den > 0.0f) { I_o[i] = s_I; sigma_o[i] = sqrtf(1.0f / s_den); ok_o[i] = 1; }
        else ok_o[i] = 0;
    }
}

}   // namespace

// Allocate the image-sized mask and the fixed-size profile buffers, choose the block size, and
// check that the largest profile grid the fit kernel can request fits into the per-block shared
// memory of the GPU this engine runs on.
//
// The limit is queried from the *current* CUDA device, i.e. the device on which the member
// buffers above are allocated, rather than from device 0.
//
// Throws JFJochException (GPUCUDAError) if a CUDA call fails or the profile grid is too large.
BraggIntegrationEngineGPU::BraggIntegrationEngineGPU(const DiffractionExperiment &experiment,
                                                     std::shared_ptr<CudaStream> stream)
    : BraggIntegrationEngine(experiment),
      stream(std::move(stream)),
      d_mask(npixel),
      d_shell_grid(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
      d_global_grid(GG),
      d_shell_P(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
      d_global_P(GG),
      d_shell_sigma2(bragg_engine::N_SHELL),
      d_global_sigma2(1),
      d_shell_n(bragg_engine::N_SHELL),
      d_global_n(1),
      d_invd2(2) {
    threads = 128;

    // Fit profile grid: R for empirical / box, up to 3R (radially elongated) for the Gaussian.
    const int max_Rf = empirical ? R : 3 * R;
    const int max_Gf = 2 * max_Rf + 1;
    fit_shared_bytes = static_cast<size_t>(max_Gf) * max_Gf * sizeof(float);

    // Validate against the device that will actually execute the kernels.
    int device = 0;
    cuda_err(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    cuda_err(cudaGetDeviceProperties(&prop, device));
    if (fit_shared_bytes > prop.sharedMemPerBlock)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError,
                              "BraggIntegrationEngineGPU: profile grid exceeds shared memory (r2 too large)");
}

// Make sure every per-reflection buffer, on the device and on the host, can hold n entries.
// Nothing happens when the current capacity already suffices; otherwise the device buffers are
// replaced by fresh allocations of exactly n elements (previous contents are not carried over)
// and the host staging vectors are resized to n.
void BraggIntegrationEngineGPU::EnsureCapacity(size_t n) {
    if (capacity >= n)
        return;

    // Inputs uploaded for every image.
    d_px_x = CudaDevicePtr<float>(n);
    d_px_y = CudaDevicePtr<float>(n);
    d_d = CudaDevicePtr<float>(n);

    // Per-reflection kernel outputs.
    d_cx = CudaDevicePtr<int>(n);
    d_cy = CudaDevicePtr<int>(n);
    d_I = CudaDevicePtr<float>(n);
    d_sigma = CudaDevicePtr<float>(n);
    d_bkg = CudaDevicePtr<float>(n);
    d_obs_x = CudaDevicePtr<float>(n);
    d_obs_y = CudaDevicePtr<float>(n);
    d_ok = CudaDevicePtr<uint8_t>(n);
    d_strong = CudaDevicePtr<uint8_t>(n);
    d_has_obs = CudaDevicePtr<uint8_t>(n);

    // Host-side staging vectors.
    h_px_x.resize(n);
    h_px_y.resize(n);
    h_d.resize(n);
    h_I.resize(n);
    h_sigma.resize(n);
    h_bkg.resize(n);
    h_obs_x.resize(n);
    h_obs_y.resize(n);
    h_ok.resize(n);
    h_has_obs.resize(n);

    capacity = n;
}

// Integrate the first `npredicted` entries of `predicted` on the GPU for one image.
//
// Steps, all queued on the engine's stream:
//   1. upload predicted positions and d values;
//   2. pass A: clear the mask, reset the accumulators, mark the r2 disks, box-sum;
//   3. unless the mode is BoxSum, pass B: learn profiles, build them, profile-fit
//      (the fit overwrites I, sigma and the ok flag in place);
//   4. download the results, wait for the stream, and hand everything to Finalize().
//
// If the image size does not match the detector or there is nothing to integrate, Finalize() is
// called with default-constructed results. Observed centroids are only reported in BoxSum mode.
//
// Every kernel launch is followed by cudaGetLastError(): a launch-configuration failure is
// otherwise not reported by the launch itself, so it is checked here and raised immediately,
// naming the failing step.
//
// Throws JFJochException if the image is not resident on the GPU or any CUDA call / launch fails.
std::vector<Reflection> BraggIntegrationEngineGPU::Run(const ImagePreprocessorBuffer &image,
                                                       const std::vector<Reflection> &predicted,
                                                       size_t npredicted, int64_t image_number) {
    std::vector<BraggFitResult> results(npredicted);
    if (image.size() != npixel || npredicted == 0)
        return Finalize(predicted, npredicted, results, image_number);

    const int32_t *img = image.getGPUBuffer();
    if (img == nullptr)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "BraggIntegrationEngineGPU: image buffer is not on the GPU");

    EnsureCapacity(npredicted);
    const int n = static_cast<int>(npredicted);
    for (size_t i = 0; i < npredicted; ++i) {
        h_px_x[i] = predicted[i].predicted_x;
        h_px_y[i] = predicted[i].predicted_y;
        h_d[i] = predicted[i].d;
    }
    cuda_err(cudaMemcpyAsync(d_px_x, h_px_x.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
    cuda_err(cudaMemcpyAsync(d_px_y, h_px_y.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
    cuda_err(cudaMemcpyAsync(d_d, h_d.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));

    BraggGpuParams p{
        .W = static_cast<int>(xpixel), .H = static_cast<int>(ypixel),
        .r1_sq = r1_sq, .r2 = r2, .r2_sq = r2_sq, .r3 = r3, .r3_sq = r3_sq,
        .min_sigma_ratio = min_sigma_ratio,
        .R = R, .G = G, .GG = GG,
        .do_clip = (apply_bkg_clip && mode != IntegratorMode::BoxSum) ? 1 : 0,
        .empirical = empirical ? 1 : 0,
        .broadband = broadband ? 1 : 0,
        .use_ellipse = use_ellipse ? 1 : 0,
        .bw_sigma = static_cast<float>(bw_sigma), .c_radial = static_cast<float>(c_radial),
        .F_px = static_cast<float>(F_px),
        .beam_x = beam_x, .beam_y = beam_y,
    };

    // Pass A: reset accumulators, mask, then box-sum.
    cuda_err(cudaMemsetAsync(d_mask, 0, npixel, *stream));
    reset<<<32, 256, 0, *stream>>>(d_shell_grid, d_global_grid, d_shell_n, d_global_n, d_invd2, GG);
    cuda_err(cudaGetLastError());
    mark_mask<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_mask, p, n);
    cuda_err(cudaGetLastError());
    boxsum<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_d, img, d_mask, p, n,
                                       d_cx, d_cy, d_I, d_sigma, d_bkg, d_obs_x, d_obs_y,
                                       d_ok, d_strong, d_has_obs, d_invd2);
    cuda_err(cudaGetLastError());

    if (mode != IntegratorMode::BoxSum) {
        // Pass B: learn (shell computed inline) -> build -> fit.
        learn_profile<<<n, threads, 0, *stream>>>(img, d_cx, d_cy, d_d, d_invd2, d_I, d_bkg, d_ok, d_strong,
                                                  d_shell_grid, d_global_grid, d_shell_n, d_global_n, p, n);
        cuda_err(cudaGetLastError());
        build_profiles<<<bragg_engine::N_SHELL + 1, threads, 0, *stream>>>(
            d_shell_grid, d_global_grid, d_shell_n, d_global_n,
            d_shell_P, d_global_P, d_shell_sigma2, d_global_sigma2, p);
        cuda_err(cudaGetLastError());
        // d_I / d_ok are both input (seed, pass A flag) and output of the fit.
        fit<<<n, threads, fit_shared_bytes, *stream>>>(img, d_px_x, d_px_y, d_cx, d_cy, d_d, d_invd2,
                                                       d_I, d_bkg, d_ok, d_shell_P, d_global_P,
                                                       d_shell_sigma2, d_global_sigma2, d_shell_n,
                                                       d_I, d_sigma, d_ok, p, n);
        cuda_err(cudaGetLastError());
    }

    cuda_err(cudaMemcpyAsync(h_I.data(), d_I, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_sigma.data(), d_sigma, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_bkg.data(), d_bkg, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_ok.data(), d_ok, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));
    const bool boxsum_mode = mode == IntegratorMode::BoxSum;
    if (boxsum_mode) {
        cuda_err(cudaMemcpyAsync(h_obs_x.data(), d_obs_x, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
        cuda_err(cudaMemcpyAsync(h_obs_y.data(), d_obs_y, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
        cuda_err(cudaMemcpyAsync(h_has_obs.data(), d_has_obs, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));
    }
    // The host vectors may only be read after the stream has drained.
    cuda_err(cudaStreamSynchronize(*stream));

    for (size_t i = 0; i < npredicted; ++i) {
        if (!h_ok[i]) continue;
        results[i].I = h_I[i];
        results[i].sigma = h_sigma[i];
        results[i].bkg = h_bkg[i];
        results[i].ok = true;
        if (boxsum_mode && h_has_obs[i]) {
            results[i].observed_x = h_obs_x[i];
            results[i].observed_y = h_obs_y[i];
            results[i].has_observed = true;
        }
    }
    return Finalize(predicted, npredicted, results, image_number);
}